From 8d10c3627999f462b2a8464db164c13e6b15d6f2 Mon Sep 17 00:00:00 2001 From: sananda Date: Sun, 29 Mar 2026 04:15:56 +0000 Subject: [PATCH 1/3] Adding-Classification-On-Decision-Tree --- .vscode/settings.json | 4 +- machine_learning/decision_tree.py | 157 ++++++++++++++++++++++-------- 2 files changed, 122 insertions(+), 39 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index ef16fa1aa7ac..d85baa1a5f51 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,7 @@ { "githubPullRequests.ignoredPullRequestBranches": [ "master" - ] + ], + "python-envs.defaultEnvManager": "ms-python.python:system", + "python-envs.defaultPackageManager": "ms-python.python:pip" } diff --git a/machine_learning/decision_tree.py b/machine_learning/decision_tree.py index b4df64796bb1..870097448e37 100644 --- a/machine_learning/decision_tree.py +++ b/machine_learning/decision_tree.py @@ -5,17 +5,20 @@ """ import numpy as np +from collections import Counter class DecisionTree: - def __init__(self, depth=5, min_leaf_size=5): + def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini"): self.depth = depth self.decision_boundary = 0 self.left = None self.right = None self.min_leaf_size = min_leaf_size self.prediction = None - + self.task = task + self.criterion = criterion + def mean_squared_error(self, labels, prediction): """ mean_squared_error: @@ -38,10 +41,62 @@ def mean_squared_error(self, labels, prediction): True """ if labels.ndim != 1: - print("Error: Input labels must be one dimensional") - + raise ValueError("Input labels must be one dimensional") return np.mean((labels - prediction) ** 2) + def gini(self, y): + """ + Computes the Gini impurity for a set of labels. + Gini impurity measures how often a randomly chosen element + would be incorrectly classified. + Formula: Gini = 1 - sum(p_i^2) + where p_i is the probability of class i. + + Lower Gini value indicates better purity (best split). 
+ """ + classes, counts = np.unique(y, return_counts=True) + prob = counts / counts.sum() + return 1 - np.sum(prob ** 2) + + def entropy(self, y): + """ + Computes the entropy (impurity) of a set of labels. + Entropy measures the randomness or disorder in the data. + Formula: Entropy = - sum(p_i * log2(p_i)) + where p_i is the probability of class i. + + Lower entropy means higher purity. + """ + classes, counts = np.unique(y, return_counts=True) + prob = counts / counts.sum() + return -np.sum(prob * np.log2(prob + 1e-9)) + + def information_gain(self, parent, left, right): + """ + Computes the information gain from splitting a dataset. + Information gain represents the reduction in impurity + after a dataset is split into left and right subsets. + Formula: IG = Impurity(parent) - [weighted impurity(left) + weighted impurity(right)] + + Higher information gain indicates a better split. + """ + if self.criterion == "gini": + func = self.gini + elif self.criterion == "entropy": + func = self.entropy + else: + raise ValueError("Invalid criterion") + + weight_l = len(left) / len(parent) + weight_r = len(right) / len(parent) + + return func(parent) - ( + weight_l * func(left) + weight_r * func(right) + ) + + def most_common_label(self, y): + return Counter(y).most_common(1)[0][0] + def train(self, x, y): """ train: @@ -87,35 +142,50 @@ def train(self, x, y): if y.ndim != 1: raise ValueError("Data set labels must be one-dimensional") - if len(x) < 2 * self.min_leaf_size: - self.prediction = np.mean(y) - return - - if self.depth == 1: - self.prediction = np.mean(y) + if len(x) < 2 * self.min_leaf_size or self.depth == 1: + if self.task == "regression": + self.prediction = np.mean(y) + else: + self.prediction = self.most_common_label(y) return best_split = 0 - min_error = self.mean_squared_error(x, np.mean(y)) * 2 - + """ loop over all possible splits for the decision tree. find the best split. 
if no split exists that is less than 2 * error for the entire array then the data set is not split and the average for the entire array is used as the predictor """ + if self.task == "regression": + best_score = float("inf") + else: + best_score = -float("inf") + for i in range(len(x)): - if len(x[:i]) < self.min_leaf_size: # noqa: SIM114 + if len(x[:i]) < self.min_leaf_size: continue - elif len(x[i:]) < self.min_leaf_size: + if len(x[i:]) < self.min_leaf_size: continue - else: - error_left = self.mean_squared_error(x[:i], np.mean(y[:i])) - error_right = self.mean_squared_error(x[i:], np.mean(y[i:])) - error = error_left + error_right - if error < min_error: + + left_y = y[:i] + right_y = y[i:] + + if self.task == "regression": + error_left = self.mean_squared_error(left_y, np.mean(left_y)) + error_right = self.mean_squared_error(right_y, np.mean(right_y)) + score = error_left + error_right + + if score < best_score: + best_score = score + best_split = i + + else: + gain = self.information_gain(y, left_y, right_y) + + if gain > best_score: + best_score = gain best_split = i - min_error = error if best_split != 0: left_x = x[:best_split] @@ -124,18 +194,28 @@ def train(self, x, y): right_y = y[best_split:] self.decision_boundary = x[best_split] + self.left = DecisionTree( - depth=self.depth - 1, min_leaf_size=self.min_leaf_size + depth=self.depth - 1, + min_leaf_size=self.min_leaf_size, + task=self.task, + criterion=self.criterion, ) self.right = DecisionTree( - depth=self.depth - 1, min_leaf_size=self.min_leaf_size + depth=self.depth - 1, + min_leaf_size=self.min_leaf_size, + task=self.task, + criterion=self.criterion, ) + self.left.train(left_x, left_y) self.right.train(right_x, right_y) - else: - self.prediction = np.mean(y) - return + else: + if self.task == "regression": + self.prediction = np.mean(y) + else: + self.prediction = self.most_common_label(y) def predict(self, x): """ @@ -146,15 +226,15 @@ def predict(self, x): """ if self.prediction is not None: 
return self.prediction - elif self.left is not None and self.right is not None: + if self.left is not None and self.right is not None: if x >= self.decision_boundary: return self.right.predict(x) else: return self.left.predict(x) - else: - raise ValueError("Decision tree not yet trained") + raise ValueError("Decision tree not yet trained") + class TestDecisionTree: """Decision Tres test class""" @@ -172,7 +252,7 @@ def helper_mean_squared_error_test(labels, prediction): return float(squared_error_sum / labels.size) - + def main(): """ In this demonstration we're generating a sample data set from the sin function in @@ -183,21 +263,22 @@ def main(): x = np.arange(-1.0, 1.0, 0.005) y = np.sin(x) - tree = DecisionTree(depth=10, min_leaf_size=10) + tree = DecisionTree(depth=10, min_leaf_size=10, task="regression") tree.train(x, y) - rng = np.random.default_rng() - test_cases = (rng.random(10) * 2) - 1 - predictions = np.array([tree.predict(x) for x in test_cases]) - avg_error = np.mean((predictions - test_cases) ** 2) + print("Regression prediction:", tree.predict(0.5)) + x_cls = np.array([1, 2, 3, 4, 5, 6]) + y_cls = np.array([0, 0, 0, 1, 1, 1]) + + clf = DecisionTree(depth=3, min_leaf_size=1, task="classification", criterion="gini") + clf.train(x_cls, y_cls) - print("Test values: " + str(test_cases)) - print("Predictions: " + str(predictions)) - print("Average error: " + str(avg_error)) + print("Classification prediction (2):", clf.predict(2)) + print("Classification prediction (5):", clf.predict(5)) if __name__ == "__main__": main() import doctest - doctest.testmod(name="mean_squared_error", verbose=True) + doctest.testmod(name="mean_squared_error", verbose=True) \ No newline at end of file From 0776097bc5481227a79a9d06f4c7458d5149c147 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 Mar 2026 04:48:39 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- machine_learning/decision_tree.py | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/machine_learning/decision_tree.py b/machine_learning/decision_tree.py index 870097448e37..cc5e39cc905a 100644 --- a/machine_learning/decision_tree.py +++ b/machine_learning/decision_tree.py @@ -18,7 +18,7 @@ def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini" self.prediction = None self.task = task self.criterion = criterion - + def mean_squared_error(self, labels, prediction): """ mean_squared_error: @@ -51,12 +51,12 @@ def gini(self, y): would be incorrectly classified. Formula: Gini = 1 - sum(p_i^2) where p_i is the probability of class i. - + Lower Gini value indicates better purity (best split). """ classes, counts = np.unique(y, return_counts=True) prob = counts / counts.sum() - return 1 - np.sum(prob ** 2) + return 1 - np.sum(prob**2) def entropy(self, y): """ @@ -64,7 +64,7 @@ def entropy(self, y): Entropy measures the randomness or disorder in the data. Formula: Entropy = - sum(p_i * log2(p_i)) where p_i is the probability of class i. - + Lower entropy means higher purity. """ classes, counts = np.unique(y, return_counts=True) @@ -77,7 +77,7 @@ def information_gain(self, parent, left, right): Information gain represents the reduction in impurity after a dataset is split into left and right subsets. Formula: IG = Impurity(parent) - [weighted impurity(left) + weighted impurity(right)] - + Higher information gain indicates a better split. 
""" if self.criterion == "gini": @@ -90,9 +90,7 @@ def information_gain(self, parent, left, right): weight_l = len(left) / len(parent) weight_r = len(right) / len(parent) - return func(parent) - ( - weight_l * func(left) + weight_r * func(right) - ) + return func(parent) - (weight_l * func(left) + weight_r * func(right)) def most_common_label(self, y): return Counter(y).most_common(1)[0][0] @@ -150,7 +148,7 @@ def train(self, x, y): return best_split = 0 - + """ loop over all possible splits for the decision tree. find the best split. if no split exists that is less than 2 * error for the entire array @@ -180,7 +178,7 @@ def train(self, x, y): best_score = score best_split = i - else: + else: gain = self.information_gain(y, left_y, right_y) if gain > best_score: @@ -234,7 +232,7 @@ def predict(self, x): raise ValueError("Decision tree not yet trained") - + class TestDecisionTree: """Decision Tres test class""" @@ -252,7 +250,7 @@ def helper_mean_squared_error_test(labels, prediction): return float(squared_error_sum / labels.size) - + def main(): """ In this demonstration we're generating a sample data set from the sin function in @@ -270,15 +268,17 @@ def main(): x_cls = np.array([1, 2, 3, 4, 5, 6]) y_cls = np.array([0, 0, 0, 1, 1, 1]) - clf = DecisionTree(depth=3, min_leaf_size=1, task="classification", criterion="gini") + clf = DecisionTree( + depth=3, min_leaf_size=1, task="classification", criterion="gini" + ) clf.train(x_cls, y_cls) - print("Classification prediction (2):", clf.predict(2)) - print("Classification prediction (5):", clf.predict(5)) + print("Classification prediction (2):", clf.predict(2)) + print("Classification prediction (5):", clf.predict(5)) if __name__ == "__main__": main() import doctest - doctest.testmod(name="mean_squared_error", verbose=True) \ No newline at end of file + doctest.testmod(name="mean_squared_error", verbose=True) From 299acbc64a1ddbf5747241b6f102a61ab0c6a4f9 Mon Sep 17 00:00:00 2001 From: sananda Date: Sun, 29 Mar 2026 
05:27:13 +0000 Subject: [PATCH 3/3] Fix ruff lint issues --- .../filters/local_binary_pattern.py | 2 +- divide_and_conquer/convex_hull.py | 2 +- dynamic_programming/catalan_numbers.py | 2 +- machine_learning/decision_tree.py | 22 +++++++++---------- maths/greatest_common_divisor.py | 2 +- project_euler/problem_002/sol4.py | 2 +- project_euler/problem_003/sol1.py | 2 +- project_euler/problem_003/sol2.py | 2 +- project_euler/problem_003/sol3.py | 2 +- project_euler/problem_005/sol1.py | 2 +- project_euler/problem_007/sol2.py | 2 +- web_programming/fetch_well_rx_price.py | 2 +- web_programming/instagram_crawler.py | 2 +- 13 files changed, 22 insertions(+), 24 deletions(-) diff --git a/digital_image_processing/filters/local_binary_pattern.py b/digital_image_processing/filters/local_binary_pattern.py index 861369ba6a32..ac54ecce755c 100644 --- a/digital_image_processing/filters/local_binary_pattern.py +++ b/digital_image_processing/filters/local_binary_pattern.py @@ -19,7 +19,7 @@ def get_neighbors_pixel( try: return int(image[x_coordinate][y_coordinate] >= center) - except (IndexError, TypeError): + except (IndexError, TypeError): return 0 diff --git a/divide_and_conquer/convex_hull.py b/divide_and_conquer/convex_hull.py index 93f6daf1f88c..b1ab33cc9415 100644 --- a/divide_and_conquer/convex_hull.py +++ b/divide_and_conquer/convex_hull.py @@ -124,7 +124,7 @@ def _construct_points( else: try: points.append(Point(p[0], p[1])) - except (IndexError, TypeError): + except (IndexError, TypeError): print( f"Ignoring deformed point {p}. All points" " must have at least 2 coordinates."
diff --git a/dynamic_programming/catalan_numbers.py b/dynamic_programming/catalan_numbers.py index 7b74f2763d43..a62abe36d670 100644 --- a/dynamic_programming/catalan_numbers.py +++ b/dynamic_programming/catalan_numbers.py @@ -71,7 +71,7 @@ def catalan_numbers(upper_limit: int) -> "list[int]": print(f"The Catalan numbers from 0 through {N} are:") print(catalan_numbers(N)) print("Try another upper limit for the sequence: ", end="") - except (NameError, ValueError): + except (NameError, ValueError): print("\n********* Invalid input, goodbye! ************\n") import doctest diff --git a/machine_learning/decision_tree.py b/machine_learning/decision_tree.py index cc5e39cc905a..5c490316c53c 100644 --- a/machine_learning/decision_tree.py +++ b/machine_learning/decision_tree.py @@ -4,9 +4,10 @@ Output: The decision tree maps a real number input to a real number output. """ -import numpy as np from collections import Counter +import numpy as np + class DecisionTree: def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini"): @@ -54,7 +55,7 @@ def gini(self, y): Lower Gini value indicates better purity (best split). """ - classes, counts = np.unique(y, return_counts=True) + _, counts = np.unique(y, return_counts=True) prob = counts / counts.sum() return 1 - np.sum(prob**2) @@ -67,7 +68,7 @@ def entropy(self, y): Lower entropy means higher purity. """ - classes, counts = np.unique(y, return_counts=True) + _, counts = np.unique(y, return_counts=True) prob = counts / counts.sum() return -np.sum(prob * np.log2(prob + 1e-9)) @@ -76,7 +77,8 @@ def information_gain(self, parent, left, right): Computes the information gain from splitting a dataset. Information gain represents the reduction in impurity after a dataset is split into left and right subsets.
- Formula: IG = Impurity(parent) - [weighted impurity(left) + weighted impurity(right)] + Formula: IG = Impurity(parent) - [ + weighted impurity(left) + weighted impurity(right)] Higher information gain indicates a better split. """ @@ -155,10 +157,7 @@ def train(self, x, y): then the data set is not split and the average for the entire array is used as the predictor """ - if self.task == "regression": - best_score = float("inf") - else: - best_score = -float("inf") + best_score = float("inf") if self.task == "regression" else -float("inf") for i in range(len(x)): if len(x[:i]) < self.min_leaf_size: @@ -209,11 +208,10 @@ def train(self, x, y): self.left.train(left_x, left_y) self.right.train(right_x, right_y) + elif self.task == "regression": + self.prediction = np.mean(y) else: - if self.task == "regression": - self.prediction = np.mean(y) - else: - self.prediction = self.most_common_label(y) + self.prediction = self.most_common_label(y) def predict(self, x): """ diff --git a/maths/greatest_common_divisor.py b/maths/greatest_common_divisor.py index 1fc123fc2b14..ce0abc664cf9 100644 --- a/maths/greatest_common_divisor.py +++ b/maths/greatest_common_divisor.py @@ -73,7 +73,7 @@ def main(): f"{greatest_common_divisor(num_1, num_2)}" ) print(f"By iterative gcd({num_1}, {num_2}) = {gcd_by_iterative(num_1, num_2)}") - except (IndexError, UnboundLocalError, ValueError): + except (IndexError, UnboundLocalError, ValueError): print("Wrong input") diff --git a/project_euler/problem_002/sol4.py b/project_euler/problem_002/sol4.py index a13d34fd760e..3341aa1d4569 100644 --- a/project_euler/problem_002/sol4.py +++ b/project_euler/problem_002/sol4.py @@ -56,7 +56,7 @@ def solution(n: int = 4000000) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol1.py
b/project_euler/problem_003/sol1.py index d1c0e61cf1a6..dbf9a84f68bb 100644 --- a/project_euler/problem_003/sol1.py +++ b/project_euler/problem_003/sol1.py @@ -80,7 +80,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol2.py b/project_euler/problem_003/sol2.py index 0af0daceed06..4c4f88220514 100644 --- a/project_euler/problem_003/sol2.py +++ b/project_euler/problem_003/sol2.py @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol3.py b/project_euler/problem_003/sol3.py index e13a0eb74ec1..1a454b618f75 100644 --- a/project_euler/problem_003/sol3.py +++ b/project_euler/problem_003/sol3.py @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_005/sol1.py b/project_euler/problem_005/sol1.py index 01cbd0e15ff7..f889c420c61d 100644 --- a/project_euler/problem_005/sol1.py +++ b/project_euler/problem_005/sol1.py @@ -47,7 +47,7 @@ def solution(n: int = 20) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_007/sol2.py b/project_euler/problem_007/sol2.py index
fd99453c1100..d63b2f2d86ec 100644 --- a/project_euler/problem_007/sol2.py +++ b/project_euler/problem_007/sol2.py @@ -87,7 +87,7 @@ def solution(nth: int = 10001) -> int: try: nth = int(nth) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter nth must be int or castable to int.") from None if nth <= 0: raise ValueError("Parameter nth must be greater than or equal to one.") diff --git a/web_programming/fetch_well_rx_price.py b/web_programming/fetch_well_rx_price.py index e34a89c19cc8..680d7444bd1c 100644 --- a/web_programming/fetch_well_rx_price.py +++ b/web_programming/fetch_well_rx_price.py @@ -67,7 +67,7 @@ def fetch_pharmacy_and_price_list(drug_name: str, zip_code: str) -> list | None: return pharmacy_price_list - except (httpx.HTTPError, ValueError): + except (httpx.HTTPError, ValueError): return None diff --git a/web_programming/instagram_crawler.py b/web_programming/instagram_crawler.py index 68271c1c4643..0b91db01ca09 100644 --- a/web_programming/instagram_crawler.py +++ b/web_programming/instagram_crawler.py @@ -53,7 +53,7 @@ def get_json(self) -> dict: scripts = BeautifulSoup(html, "html.parser").find_all("script") try: return extract_user_profile(scripts[4]) - except (json.decoder.JSONDecodeError, KeyError): + except (json.decoder.JSONDecodeError, KeyError): return extract_user_profile(scripts[3]) def __repr__(self) -> str: