From 8d10c3627999f462b2a8464db164c13e6b15d6f2 Mon Sep 17 00:00:00 2001 From: sananda Date: Sun, 29 Mar 2026 04:15:56 +0000 Subject: [PATCH 1/3] Adding-Classification-On-Decision-Tree --- .vscode/settings.json | 4 +- machine_learning/decision_tree.py | 157 ++++++++++++++++++++++-------- 2 files changed, 122 insertions(+), 39 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index ef16fa1aa7ac..d85baa1a5f51 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,7 @@ { "githubPullRequests.ignoredPullRequestBranches": [ "master" - ] + ], + "python-envs.defaultEnvManager": "ms-python.python:system", + "python-envs.defaultPackageManager": "ms-python.python:pip" } diff --git a/machine_learning/decision_tree.py b/machine_learning/decision_tree.py index b4df64796bb1..870097448e37 100644 --- a/machine_learning/decision_tree.py +++ b/machine_learning/decision_tree.py @@ -5,17 +5,20 @@ """ import numpy as np +from collections import Counter class DecisionTree: - def __init__(self, depth=5, min_leaf_size=5): + def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini"): self.depth = depth self.decision_boundary = 0 self.left = None self.right = None self.min_leaf_size = min_leaf_size self.prediction = None - + self.task = task + self.criterion = criterion + def mean_squared_error(self, labels, prediction): """ mean_squared_error: @@ -38,10 +41,62 @@ def mean_squared_error(self, labels, prediction): True """ if labels.ndim != 1: - print("Error: Input labels must be one dimensional") - + raise ValueError("Input labels must be one dimensional") return np.mean((labels - prediction) ** 2) + def gini(self, y): + """ + Computes the Gini impurity for a set of labels. + Gini impurity measures how often a randomly chosen element + would be incorrectly classified. + Formula: Gini = 1 - sum(p_i^2) + where p_i is the probability of class i. + + Lower Gini value indicates better purity (best split). 
+ """ + classes, counts = np.unique(y, return_counts=True) + prob = counts / counts.sum() + return 1 - np.sum(prob ** 2) + + def entropy(self, y): + """ + Computes the entropy (impurity) of a set of labels. + Entropy measures the randomness or disorder in the data. + Formula: Entropy = - sum(p_i * log2(p_i)) + where p_i is the probability of class i. + + Lower entropy means higher purity. + """ + classes, counts = np.unique(y, return_counts=True) + prob = counts / counts.sum() + return -np.sum(prob * np.log2(prob + 1e-9)) + + def information_gain(self, parent, left, right): + """ + Computes the information gain from splitting a dataset. + Information gain represents the reduction in impurity + after a dataset is split into left and right subsets. + Formula: IG = Impurity(parent) - [weighted impurity(left) + weighted impurity(right)] + + Higher information gain indicates a better split. + """ + if self.criterion == "gini": + func = self.gini + elif self.criterion == "entropy": + func = self.entropy + else: + raise ValueError("Invalid criterion") + + weight_l = len(left) / len(parent) + weight_r = len(right) / len(parent) + + return func(parent) - ( + weight_l * func(left) + weight_r * func(right) + ) + + def most_common_label(self, y): + return Counter(y).most_common(1)[0][0] + def train(self, x, y): """ train: @@ -87,35 +142,50 @@ def train(self, x, y): if y.ndim != 1: raise ValueError("Data set labels must be one-dimensional") - if len(x) < 2 * self.min_leaf_size: - self.prediction = np.mean(y) - return - - if self.depth == 1: - self.prediction = np.mean(y) + if len(x) < 2 * self.min_leaf_size or self.depth == 1: + if self.task == "regression": + self.prediction = np.mean(y) + else: + self.prediction = self.most_common_label(y) return best_split = 0 - min_error = self.mean_squared_error(x, np.mean(y)) * 2 - + """ loop over all possible splits for the decision tree. find the best split. 
if no split exists that is less than 2 * error for the entire array then the data set is not split and the average for the entire array is used as the predictor """ + if self.task == "regression": + best_score = float("inf") + else: + best_score = -float("inf") + for i in range(len(x)): - if len(x[:i]) < self.min_leaf_size: # noqa: SIM114 + if len(x[:i]) < self.min_leaf_size: continue - elif len(x[i:]) < self.min_leaf_size: + if len(x[i:]) < self.min_leaf_size: continue - else: - error_left = self.mean_squared_error(x[:i], np.mean(y[:i])) - error_right = self.mean_squared_error(x[i:], np.mean(y[i:])) - error = error_left + error_right - if error < min_error: + + left_y = y[:i] + right_y = y[i:] + + if self.task == "regression": + error_left = self.mean_squared_error(left_y, np.mean(left_y)) + error_right = self.mean_squared_error(right_y, np.mean(right_y)) + score = error_left + error_right + + if score < best_score: + best_score = score + best_split = i + + else: + gain = self.information_gain(y, left_y, right_y) + + if gain > best_score: + best_score = gain best_split = i - min_error = error if best_split != 0: left_x = x[:best_split] @@ -124,18 +194,28 @@ def train(self, x, y): right_y = y[best_split:] self.decision_boundary = x[best_split] + self.left = DecisionTree( - depth=self.depth - 1, min_leaf_size=self.min_leaf_size + depth=self.depth - 1, + min_leaf_size=self.min_leaf_size, + task=self.task, + criterion=self.criterion, ) self.right = DecisionTree( - depth=self.depth - 1, min_leaf_size=self.min_leaf_size + depth=self.depth - 1, + min_leaf_size=self.min_leaf_size, + task=self.task, + criterion=self.criterion, ) + self.left.train(left_x, left_y) self.right.train(right_x, right_y) - else: - self.prediction = np.mean(y) - return + else: + if self.task == "regression": + self.prediction = np.mean(y) + else: + self.prediction = self.most_common_label(y) def predict(self, x): """ @@ -146,15 +226,15 @@ def predict(self, x): """ if self.prediction is not None: 
return self.prediction - elif self.left is not None and self.right is not None: + if self.left is not None and self.right is not None: if x >= self.decision_boundary: return self.right.predict(x) else: return self.left.predict(x) - else: - raise ValueError("Decision tree not yet trained") + raise ValueError("Decision tree not yet trained") + class TestDecisionTree: """Decision Tres test class""" @@ -172,7 +252,7 @@ def helper_mean_squared_error_test(labels, prediction): return float(squared_error_sum / labels.size) - + def main(): """ In this demonstration we're generating a sample data set from the sin function in @@ -183,21 +263,22 @@ def main(): x = np.arange(-1.0, 1.0, 0.005) y = np.sin(x) - tree = DecisionTree(depth=10, min_leaf_size=10) + tree = DecisionTree(depth=10, min_leaf_size=10, task="regression") tree.train(x, y) - rng = np.random.default_rng() - test_cases = (rng.random(10) * 2) - 1 - predictions = np.array([tree.predict(x) for x in test_cases]) - avg_error = np.mean((predictions - test_cases) ** 2) + print("Regression prediction:", tree.predict(0.5)) + x_cls = np.array([1, 2, 3, 4, 5, 6]) + y_cls = np.array([0, 0, 0, 1, 1, 1]) + + clf = DecisionTree(depth=3, min_leaf_size=1, task="classification", criterion="gini") + clf.train(x_cls, y_cls) - print("Test values: " + str(test_cases)) - print("Predictions: " + str(predictions)) - print("Average error: " + str(avg_error)) + print("Classification prediction (2):", clf.predict(2)) + print("Classification prediction (5):", clf.predict(5)) if __name__ == "__main__": main() import doctest - doctest.testmod(name="mean_squared_error", verbose=True) + doctest.testmod(name="mean_squared_error", verbose=True) \ No newline at end of file From 0776097bc5481227a79a9d06f4c7458d5149c147 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Mar 2026 04:48:39 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- machine_learning/decision_tree.py | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/machine_learning/decision_tree.py b/machine_learning/decision_tree.py index 870097448e37..cc5e39cc905a 100644 --- a/machine_learning/decision_tree.py +++ b/machine_learning/decision_tree.py @@ -18,7 +18,7 @@ def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini" self.prediction = None self.task = task self.criterion = criterion - + def mean_squared_error(self, labels, prediction): """ mean_squared_error: @@ -51,12 +51,12 @@ def gini(self, y): would be incorrectly classified. Formula: Gini = 1 - sum(p_i^2) where p_i is the probability of class i. - + Lower Gini value indicates better purity (best split). """ classes, counts = np.unique(y, return_counts=True) prob = counts / counts.sum() - return 1 - np.sum(prob ** 2) + return 1 - np.sum(prob**2) def entropy(self, y): """ @@ -64,7 +64,7 @@ def entropy(self, y): Entropy measures the randomness or disorder in the data. Formula: Entropy = - sum(p_i * log2(p_i)) where p_i is the probability of class i. - + Lower entropy means higher purity. """ classes, counts = np.unique(y, return_counts=True) @@ -77,7 +77,7 @@ def information_gain(self, parent, left, right): Information gain represents the reduction in impurity after a dataset is split into left and right subsets. Formula: IG = Impurity(parent) - [weighted impurity(left) + weighted impurity(right)] - + Higher information gain indicates a better split. 
""" if self.criterion == "gini": @@ -90,9 +90,7 @@ def information_gain(self, parent, left, right): weight_l = len(left) / len(parent) weight_r = len(right) / len(parent) - return func(parent) - ( - weight_l * func(left) + weight_r * func(right) - ) + return func(parent) - (weight_l * func(left) + weight_r * func(right)) def most_common_label(self, y): return Counter(y).most_common(1)[0][0] @@ -150,7 +148,7 @@ def train(self, x, y): return best_split = 0 - + """ loop over all possible splits for the decision tree. find the best split. if no split exists that is less than 2 * error for the entire array @@ -180,7 +178,7 @@ def train(self, x, y): best_score = score best_split = i - else: + else: gain = self.information_gain(y, left_y, right_y) if gain > best_score: @@ -234,7 +232,7 @@ def predict(self, x): raise ValueError("Decision tree not yet trained") - + class TestDecisionTree: """Decision Tres test class""" @@ -252,7 +250,7 @@ def helper_mean_squared_error_test(labels, prediction): return float(squared_error_sum / labels.size) - + def main(): """ In this demonstration we're generating a sample data set from the sin function in @@ -270,15 +268,17 @@ def main(): x_cls = np.array([1, 2, 3, 4, 5, 6]) y_cls = np.array([0, 0, 0, 1, 1, 1]) - clf = DecisionTree(depth=3, min_leaf_size=1, task="classification", criterion="gini") + clf = DecisionTree( + depth=3, min_leaf_size=1, task="classification", criterion="gini" + ) clf.train(x_cls, y_cls) - print("Classification prediction (2):", clf.predict(2)) - print("Classification prediction (5):", clf.predict(5)) + print("Classification prediction (2):", clf.predict(2)) + print("Classification prediction (5):", clf.predict(5)) if __name__ == "__main__": main() import doctest - doctest.testmod(name="mean_squared_error", verbose=True) \ No newline at end of file + doctest.testmod(name="mean_squared_error", verbose=True) From 299acbc64a1ddbf5747241b6f102a61ab0c6a4f9 Mon Sep 17 00:00:00 2001 From: sananda Date: Sun, 29 Mar 2026 
05:27:13 +0000 Subject: [PATCH 3/3] Fix ruff lint issues --- .../filters/local_binary_pattern.py | 2 +- divide_and_conquer/convex_hull.py | 2 +- dynamic_programming/catalan_numbers.py | 2 +- machine_learning/decision_tree.py | 22 +++++++++---------- maths/greatest_common_divisor.py | 2 +- project_euler/problem_002/sol4.py | 2 +- project_euler/problem_003/sol1.py | 2 +- project_euler/problem_003/sol2.py | 2 +- project_euler/problem_003/sol3.py | 2 +- project_euler/problem_005/sol1.py | 2 +- project_euler/problem_007/sol2.py | 2 +- web_programming/fetch_well_rx_price.py | 2 +- web_programming/instagram_crawler.py | 2 +- 13 files changed, 22 insertions(+), 24 deletions(-) diff --git a/digital_image_processing/filters/local_binary_pattern.py b/digital_image_processing/filters/local_binary_pattern.py index 861369ba6a32..ac54ecce755c 100644 --- a/digital_image_processing/filters/local_binary_pattern.py +++ b/digital_image_processing/filters/local_binary_pattern.py @@ -19,7 +19,7 @@ def get_neighbors_pixel( try: return int(image[x_coordinate][y_coordinate] >= center) - except (IndexError, TypeError): + except (IndexError, TypeError): return 0 diff --git a/divide_and_conquer/convex_hull.py b/divide_and_conquer/convex_hull.py index 93f6daf1f88c..b1ab33cc9415 100644 --- a/divide_and_conquer/convex_hull.py +++ b/divide_and_conquer/convex_hull.py @@ -124,7 +124,7 @@ def _construct_points( else: try: points.append(Point(p[0], p[1])) - except (IndexError, TypeError): + except (IndexError, TypeError): print( f"Ignoring deformed point {p}. All points" " must have at least 2 coordinates."
diff --git a/dynamic_programming/catalan_numbers.py b/dynamic_programming/catalan_numbers.py index 7b74f2763d43..a62abe36d670 100644 --- a/dynamic_programming/catalan_numbers.py +++ b/dynamic_programming/catalan_numbers.py @@ -71,7 +71,7 @@ def catalan_numbers(upper_limit: int) -> "list[int]": print(f"The Catalan numbers from 0 through {N} are:") print(catalan_numbers(N)) print("Try another upper limit for the sequence: ", end="") - except (NameError, ValueError): + except (NameError, ValueError): print("\n********* Invalid input, goodbye! ************\n") import doctest diff --git a/machine_learning/decision_tree.py b/machine_learning/decision_tree.py index cc5e39cc905a..5c490316c53c 100644 --- a/machine_learning/decision_tree.py +++ b/machine_learning/decision_tree.py @@ -4,9 +4,10 @@ Output: The decision tree maps a real number input to a real number output. """ -import numpy as np from collections import Counter +import numpy as np + class DecisionTree: def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini"): @@ -54,7 +55,7 @@ def gini(self, y): Lower Gini value indicates better purity (best split). """ - classes, counts = np.unique(y, return_counts=True) + _, counts = np.unique(y, return_counts=True) prob = counts / counts.sum() return 1 - np.sum(prob**2) @@ -67,7 +68,7 @@ def entropy(self, y): Lower entropy means higher purity. """ - classes, counts = np.unique(y, return_counts=True) + _, counts = np.unique(y, return_counts=True) prob = counts / counts.sum() return -np.sum(prob * np.log2(prob + 1e-9)) @@ -76,7 +77,8 @@ def information_gain(self, parent, left, right): Computes the information gain from splitting a dataset. Information gain represents the reduction in impurity after a dataset is split into left and right subsets.
- Formula: IG = Impurity(parent) - [weighted impurity(left) + weighted impurity(right)] + Formula: IG = Impurity(parent) - [ + weighted impurity(left) + weighted impurity(right)] Higher information gain indicates a better split. """ @@ -155,10 +157,7 @@ def train(self, x, y): then the data set is not split and the average for the entire array is used as the predictor """ - if self.task == "regression": - best_score = float("inf") - else: - best_score = -float("inf") + best_score = float("inf") if self.task == "regression" else -float("inf") for i in range(len(x)): if len(x[:i]) < self.min_leaf_size: @@ -209,11 +208,10 @@ def train(self, x, y): self.left.train(left_x, left_y) self.right.train(right_x, right_y) + elif self.task == "regression": + self.prediction = np.mean(y) else: - if self.task == "regression": - self.prediction = np.mean(y) - else: - self.prediction = self.most_common_label(y) + self.prediction = self.most_common_label(y) def predict(self, x): """ diff --git a/maths/greatest_common_divisor.py b/maths/greatest_common_divisor.py index 1fc123fc2b14..ce0abc664cf9 100644 --- a/maths/greatest_common_divisor.py +++ b/maths/greatest_common_divisor.py @@ -73,7 +73,7 @@ def main(): f"{greatest_common_divisor(num_1, num_2)}" ) print(f"By iterative gcd({num_1}, {num_2}) = {gcd_by_iterative(num_1, num_2)}") - except (IndexError, UnboundLocalError, ValueError): + except (IndexError, UnboundLocalError, ValueError): print("Wrong input") diff --git a/project_euler/problem_002/sol4.py b/project_euler/problem_002/sol4.py index a13d34fd760e..3341aa1d4569 100644 --- a/project_euler/problem_002/sol4.py +++ b/project_euler/problem_002/sol4.py @@ -56,7 +56,7 @@ def solution(n: int = 4000000) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol1.py
b/project_euler/problem_003/sol1.py index d1c0e61cf1a6..dbf9a84f68bb 100644 --- a/project_euler/problem_003/sol1.py +++ b/project_euler/problem_003/sol1.py @@ -80,7 +80,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol2.py b/project_euler/problem_003/sol2.py index 0af0daceed06..4c4f88220514 100644 --- a/project_euler/problem_003/sol2.py +++ b/project_euler/problem_003/sol2.py @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol3.py b/project_euler/problem_003/sol3.py index e13a0eb74ec1..1a454b618f75 100644 --- a/project_euler/problem_003/sol3.py +++ b/project_euler/problem_003/sol3.py @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_005/sol1.py b/project_euler/problem_005/sol1.py index 01cbd0e15ff7..f889c420c61d 100644 --- a/project_euler/problem_005/sol1.py +++ b/project_euler/problem_005/sol1.py @@ -47,7 +47,7 @@ def solution(n: int = 20) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_007/sol2.py b/project_euler/problem_007/sol2.py index
fd99453c1100..d63b2f2d86ec 100644 --- a/project_euler/problem_007/sol2.py +++ b/project_euler/problem_007/sol2.py @@ -87,7 +87,7 @@ def solution(nth: int = 10001) -> int: try: nth = int(nth) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter nth must be int or castable to int.") from None if nth <= 0: raise ValueError("Parameter nth must be greater than or equal to one.") diff --git a/web_programming/fetch_well_rx_price.py b/web_programming/fetch_well_rx_price.py index e34a89c19cc8..680d7444bd1c 100644 --- a/web_programming/fetch_well_rx_price.py +++ b/web_programming/fetch_well_rx_price.py @@ -67,7 +67,7 @@ def fetch_pharmacy_and_price_list(drug_name: str, zip_code: str) -> list | None: return pharmacy_price_list - except (httpx.HTTPError, ValueError): + except (httpx.HTTPError, ValueError): return None diff --git a/web_programming/instagram_crawler.py b/web_programming/instagram_crawler.py index 68271c1c4643..0b91db01ca09 100644 --- a/web_programming/instagram_crawler.py +++ b/web_programming/instagram_crawler.py @@ -53,7 +53,7 @@ def get_json(self) -> dict: scripts = BeautifulSoup(html, "html.parser").find_all("script") try: return extract_user_profile(scripts[4]) - except (json.decoder.JSONDecodeError, KeyError): + except (json.decoder.JSONDecodeError, KeyError): return extract_user_profile(scripts[3]) def __repr__(self) -> str: