diff --git a/.vscode/settings.json b/.vscode/settings.json index ef16fa1aa7ac..d85baa1a5f51 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,7 @@ { "githubPullRequests.ignoredPullRequestBranches": [ "master" - ] + ], + "python-envs.defaultEnvManager": "ms-python.python:system", + "python-envs.defaultPackageManager": "ms-python.python:pip" } diff --git a/digital_image_processing/filters/local_binary_pattern.py b/digital_image_processing/filters/local_binary_pattern.py index 861369ba6a32..ac54ecce755c 100644 --- a/digital_image_processing/filters/local_binary_pattern.py +++ b/digital_image_processing/filters/local_binary_pattern.py @@ -19,7 +19,7 @@ def get_neighbors_pixel( try: return int(image[x_coordinate][y_coordinate] >= center) - except (IndexError, TypeError): + except (IndexError, TypeError): return 0 diff --git a/divide_and_conquer/convex_hull.py b/divide_and_conquer/convex_hull.py index 93f6daf1f88c..b1ab33cc9415 100644 --- a/divide_and_conquer/convex_hull.py +++ b/divide_and_conquer/convex_hull.py @@ -124,7 +124,7 @@ def _construct_points( else: try: points.append(Point(p[0], p[1])) - except (IndexError, TypeError): + except (IndexError, TypeError): print( f"Ignoring deformed point {p}. All points" " must have at least 2 coordinates." diff --git a/dynamic_programming/catalan_numbers.py b/dynamic_programming/catalan_numbers.py index 7b74f2763d43..a62abe36d670 100644 --- a/dynamic_programming/catalan_numbers.py +++ b/dynamic_programming/catalan_numbers.py @@ -71,7 +71,7 @@ def catalan_numbers(upper_limit: int) -> "list[int]": print(f"The Catalan numbers from 0 through {N} are:") print(catalan_numbers(N)) print("Try another upper limit for the sequence: ", end="") - except (NameError, ValueError): + except (NameError, ValueError): print("\n********* Invalid input, goodbye! 
************\n") import doctest diff --git a/machine_learning/decision_tree.py b/machine_learning/decision_tree.py index b4df64796bb1..5c490316c53c 100644 --- a/machine_learning/decision_tree.py +++ b/machine_learning/decision_tree.py @@ -4,17 +4,21 @@ Output: The decision tree maps a real number input to a real number output. """ +from collections import Counter + import numpy as np class DecisionTree: - def __init__(self, depth=5, min_leaf_size=5): + def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini"): self.depth = depth self.decision_boundary = 0 self.left = None self.right = None self.min_leaf_size = min_leaf_size self.prediction = None + self.task = task + self.criterion = criterion def mean_squared_error(self, labels, prediction): """ @@ -38,10 +42,61 @@ def mean_squared_error(self, labels, prediction): True """ if labels.ndim != 1: - print("Error: Input labels must be one dimensional") - + raise ValueError("Input labels must be one dimensional") return np.mean((labels - prediction) ** 2) + def gini(self, y): + """ + Computes the Gini impurity for a set of labels. + Gini impurity measures how often a randomly chosen element + would be incorrectly classified. + Formula: Gini = 1 - sum(p_i^2) + where p_i is the probability of class i. + + Lower Gini value indicates better purity (best split). + """ + _, counts = np.unique(y, return_counts=True) + prob = counts / counts.sum() + return 1 - np.sum(prob**2) + + def entropy(self, y): + """ + Computes the entropy (impurity) of a set of labels. + Entropy measures the randomness or disorder in the data. + Formula: Entropy = - sum(p_i * log2(p_i)) + where p_i is the probability of class i. + + Lower entropy means higher purity. + """ + _, counts = np.unique(y, return_counts=True) + prob = counts / counts.sum() + return -np.sum(prob * np.log2(prob + 1e-9)) + + def information_gain(self, parent, left, right): + """ + Computes the information gain from splitting a dataset. 
+ Information gain represents the reduction in impurity + after a dataset is split into left and right subsets. + Formula: IG = Impurity(parent) - [ + weighted impurity(left) + weighted impurity(right)] + + Higher information gain indicates a better split. + """ + if self.criterion == "gini": + func = self.gini + elif self.criterion == "entropy": + func = self.entropy + else: + raise ValueError("Invalid criterion") + + weight_l = len(left) / len(parent) + weight_r = len(right) / len(parent) + + return func(parent) - (weight_l * func(left) + weight_r * func(right)) + + def most_common_label(self, y): + return Counter(y).most_common(1)[0][0] + def train(self, x, y): """ train: @@ -87,16 +142,14 @@ def train(self, x, y): if y.ndim != 1: raise ValueError("Data set labels must be one-dimensional") - if len(x) < 2 * self.min_leaf_size: - self.prediction = np.mean(y) - return - - if self.depth == 1: - self.prediction = np.mean(y) + if len(x) < 2 * self.min_leaf_size or self.depth == 1: + if self.task == "regression": + self.prediction = np.mean(y) + else: + self.prediction = self.most_common_label(y) return best_split = 0 - min_error = self.mean_squared_error(x, np.mean(y)) * 2 """ loop over all possible splits for the decision tree. find the best split. 
@@ -104,18 +157,32 @@ def train(self, x, y): then the data set is not split and the average for the entire array is used as the predictor """ + best_score = float("inf") if self.task == "regression" else -float("inf") + for i in range(len(x)): - if len(x[:i]) < self.min_leaf_size: # noqa: SIM114 + if len(x[:i]) < self.min_leaf_size: continue - elif len(x[i:]) < self.min_leaf_size: + if len(x[i:]) < self.min_leaf_size: continue + + left_y = y[:i] + right_y = y[i:] + + if self.task == "regression": + error_left = self.mean_squared_error(left_y, np.mean(left_y)) + error_right = self.mean_squared_error(right_y, np.mean(right_y)) + score = error_left + error_right + + if score < best_score: + best_score = score + best_split = i + else: - error_left = self.mean_squared_error(x[:i], np.mean(y[:i])) - error_right = self.mean_squared_error(x[i:], np.mean(y[i:])) - error = error_left + error_right - if error < min_error: + gain = self.information_gain(y, left_y, right_y) + + if gain > best_score: + best_score = gain best_split = i - min_error = error if best_split != 0: left_x = x[:best_split] @@ -124,18 +191,27 @@ def train(self, x, y): right_y = y[best_split:] self.decision_boundary = x[best_split] + self.left = DecisionTree( - depth=self.depth - 1, min_leaf_size=self.min_leaf_size + depth=self.depth - 1, + min_leaf_size=self.min_leaf_size, + task=self.task, + criterion=self.criterion, ) self.right = DecisionTree( - depth=self.depth - 1, min_leaf_size=self.min_leaf_size + depth=self.depth - 1, + min_leaf_size=self.min_leaf_size, + task=self.task, + criterion=self.criterion, ) + self.left.train(left_x, left_y) self.right.train(right_x, right_y) - else: - self.prediction = np.mean(y) - return + elif self.task == "regression": + self.prediction = np.mean(y) + else: + self.prediction = self.most_common_label(y) def predict(self, x): """ @@ -146,13 +222,13 @@ def predict(self, x): """ if self.prediction is not None: return self.prediction - elif self.left is not None and 
self.right is not None: + if self.left is not None and self.right is not None: if x >= self.decision_boundary: return self.right.predict(x) else: return self.left.predict(x) - else: - raise ValueError("Decision tree not yet trained") + + raise ValueError("Decision tree not yet trained") class TestDecisionTree: @@ -183,17 +259,20 @@ def main(): x = np.arange(-1.0, 1.0, 0.005) y = np.sin(x) - tree = DecisionTree(depth=10, min_leaf_size=10) + tree = DecisionTree(depth=10, min_leaf_size=10, task="regression") tree.train(x, y) - rng = np.random.default_rng() - test_cases = (rng.random(10) * 2) - 1 - predictions = np.array([tree.predict(x) for x in test_cases]) - avg_error = np.mean((predictions - test_cases) ** 2) + print("Regression prediction:", tree.predict(0.5)) + x_cls = np.array([1, 2, 3, 4, 5, 6]) + y_cls = np.array([0, 0, 0, 1, 1, 1]) + + clf = DecisionTree( + depth=3, min_leaf_size=1, task="classification", criterion="gini" + ) + clf.train(x_cls, y_cls) - print("Test values: " + str(test_cases)) - print("Predictions: " + str(predictions)) - print("Average error: " + str(avg_error)) + print("Classification prediction (2):", clf.predict(2)) + print("Classification prediction (5):", clf.predict(5)) if __name__ == "__main__": diff --git a/maths/greatest_common_divisor.py b/maths/greatest_common_divisor.py index 1fc123fc2b14..ce0abc664cf9 100644 --- a/maths/greatest_common_divisor.py +++ b/maths/greatest_common_divisor.py @@ -73,7 +73,7 @@ def main(): f"{greatest_common_divisor(num_1, num_2)}" ) print(f"By iterative gcd({num_1}, {num_2}) = {gcd_by_iterative(num_1, num_2)}") - except (IndexError, UnboundLocalError, ValueError): + except (IndexError, UnboundLocalError, ValueError): print("Wrong input") diff --git a/project_euler/problem_002/sol4.py b/project_euler/problem_002/sol4.py index a13d34fd760e..3341aa1d4569 100644 --- a/project_euler/problem_002/sol4.py +++ b/project_euler/problem_002/sol4.py @@ -56,7 +56,7 @@ def solution(n: int = 4000000) -> int: try: n = 
int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol1.py b/project_euler/problem_003/sol1.py index d1c0e61cf1a6..dbf9a84f68bb 100644 --- a/project_euler/problem_003/sol1.py +++ b/project_euler/problem_003/sol1.py @@ -80,7 +80,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol2.py b/project_euler/problem_003/sol2.py index 0af0daceed06..4c4f88220514 100644 --- a/project_euler/problem_003/sol2.py +++ b/project_euler/problem_003/sol2.py @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_003/sol3.py b/project_euler/problem_003/sol3.py index e13a0eb74ec1..1a454b618f75 100644 --- a/project_euler/problem_003/sol3.py +++ b/project_euler/problem_003/sol3.py @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int: try: n = int(n) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_005/sol1.py b/project_euler/problem_005/sol1.py index 01cbd0e15ff7..f889c420c61d 100644 --- a/project_euler/problem_005/sol1.py +++ b/project_euler/problem_005/sol1.py @@ -47,7 +47,7 @@ def solution(n: int = 20) -> int: try: n = int(n) - except (TypeError, ValueError): + 
except (TypeError, ValueError): raise TypeError("Parameter n must be int or castable to int.") if n <= 0: raise ValueError("Parameter n must be greater than or equal to one.") diff --git a/project_euler/problem_007/sol2.py b/project_euler/problem_007/sol2.py index fd99453c1100..d63b2f2d86ec 100644 --- a/project_euler/problem_007/sol2.py +++ b/project_euler/problem_007/sol2.py @@ -87,7 +87,7 @@ def solution(nth: int = 10001) -> int: try: nth = int(nth) - except (TypeError, ValueError): + except (TypeError, ValueError): raise TypeError("Parameter nth must be int or castable to int.") from None if nth <= 0: raise ValueError("Parameter nth must be greater than or equal to one.") diff --git a/web_programming/fetch_well_rx_price.py b/web_programming/fetch_well_rx_price.py index e34a89c19cc8..680d7444bd1c 100644 --- a/web_programming/fetch_well_rx_price.py +++ b/web_programming/fetch_well_rx_price.py @@ -67,7 +67,7 @@ def fetch_pharmacy_and_price_list(drug_name: str, zip_code: str) -> list | None: return pharmacy_price_list - except (httpx.HTTPError, ValueError): + except (httpx.HTTPError, ValueError): return None diff --git a/web_programming/instagram_crawler.py b/web_programming/instagram_crawler.py index 68271c1c4643..0b91db01ca09 100644 --- a/web_programming/instagram_crawler.py +++ b/web_programming/instagram_crawler.py @@ -53,7 +53,7 @@ def get_json(self) -> dict: scripts = BeautifulSoup(html, "html.parser").find_all("script") try: return extract_user_profile(scripts[4]) - except (json.decoder.JSONDecodeError, KeyError): + except (json.decoder.JSONDecodeError, KeyError): return extract_user_profile(scripts[3]) def __repr__(self) -> str: