Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{
"githubPullRequests.ignoredPullRequestBranches": [
"master"
]
],
"python-envs.defaultEnvManager": "ms-python.python:system",
"python-envs.defaultPackageManager": "ms-python.python:pip"
}
2 changes: 1 addition & 1 deletion digital_image_processing/filters/local_binary_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get_neighbors_pixel(

try:
return int(image[x_coordinate][y_coordinate] >= center)
except (IndexError, TypeError):
except (IndexError, TypeError):
return 0


Expand Down
2 changes: 1 addition & 1 deletion divide_and_conquer/convex_hull.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def _construct_points(
else:
try:
points.append(Point(p[0], p[1]))
except (IndexError, TypeError):
except (IndexError, TypeError):
print(
f"Ignoring deformed point {p}. All points"
" must have at least 2 coordinates."
Expand Down
2 changes: 1 addition & 1 deletion dynamic_programming/catalan_numbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def catalan_numbers(upper_limit: int) -> "list[int]":
print(f"The Catalan numbers from 0 through {N} are:")
print(catalan_numbers(N))
print("Try another upper limit for the sequence: ", end="")
except (NameError, ValueError):
except (NameError, ValueError):
print("\n********* Invalid input, goodbye! ************\n")

import doctest
Expand Down
145 changes: 112 additions & 33 deletions machine_learning/decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,21 @@
Output: The decision tree maps a real number input to a real number output.
"""

from collections import Counter

import numpy as np


class DecisionTree:
def __init__(self, depth=5, min_leaf_size=5):
def __init__(self, depth=5, min_leaf_size=5, task="regression", criterion="gini"):
self.depth = depth
self.decision_boundary = 0
self.left = None
self.right = None
self.min_leaf_size = min_leaf_size
self.prediction = None
self.task = task
self.criterion = criterion

def mean_squared_error(self, labels, prediction):
"""
Expand All @@ -38,10 +42,61 @@ def mean_squared_error(self, labels, prediction):
True
"""
if labels.ndim != 1:
print("Error: Input labels must be one dimensional")

raise ValueError("Input labels must be one dimensional")
return np.mean((labels - prediction) ** 2)

def gini(self, y):
"""
Computes the Gini impurity for a set of labels.
Gini impurity measures how often a randomly chosen element
would be incorrectly classified.
Formula: Gini = 1 - sum(p_i^2)
where p_i is the probability of class i.

Lower Gini value indicates better purity (best split).
"""
_, counts = np.unique(y, return_counts=True)
prob = counts / counts.sum()
return 1 - np.sum(prob**2)

def entropy(self, y):
"""
Computes the entropy (impurity) of a set of labels.
Entropy measures the randomness or disorder in the data.
Formula: Entropy = - sum(p_i * log2(p_i))
where p_i is the probability of class i.

Lower entropy means higher purity.
"""
_, counts = np.unique(y, return_counts=True)
prob = counts / counts.sum()
return -np.sum(prob * np.log2(prob + 1e-9))

def information_gain(self, parent, left, right):
"""
Computes the information gain from splitting a dataset.
Information gain represents the reduction in impurity
after a dataset is split into left and right subsets.
Formula: IG = Impurity(parent) - [
weighted impurity(left) + weighted impurity(right)]

Higher information gain indicates a better split.
"""
if self.criterion == "gini":
func = self.gini
elif self.criterion == "entropy":
func = self.entropy
else:
raise ValueError("Invalid criterion")

weight_l = len(left) / len(parent)
weight_r = len(right) / len(parent)

return func(parent) - (weight_l * func(left) + weight_r * func(right))

def most_common_label(self, y):
return Counter(y).most_common(1)[0][0]

def train(self, x, y):
"""
train:
Expand Down Expand Up @@ -87,35 +142,47 @@ def train(self, x, y):
if y.ndim != 1:
raise ValueError("Data set labels must be one-dimensional")

if len(x) < 2 * self.min_leaf_size:
self.prediction = np.mean(y)
return

if self.depth == 1:
self.prediction = np.mean(y)
if len(x) < 2 * self.min_leaf_size or self.depth == 1:
if self.task == "regression":
self.prediction = np.mean(y)
else:
self.prediction = self.most_common_label(y)
return

best_split = 0
min_error = self.mean_squared_error(x, np.mean(y)) * 2

"""
loop over all possible splits for the decision tree. find the best split.
if no split exists that is less than 2 * error for the entire array
then the data set is not split and the average for the entire array is used as
the predictor
"""
best_score = float("inf") if self.task == "regression" else -float("inf")

for i in range(len(x)):
if len(x[:i]) < self.min_leaf_size: # noqa: SIM114
if len(x[:i]) < self.min_leaf_size:
continue
elif len(x[i:]) < self.min_leaf_size:
if len(x[i:]) < self.min_leaf_size:
continue

left_y = y[:i]
right_y = y[i:]

if self.task == "regression":
error_left = self.mean_squared_error(left_y, np.mean(left_y))
error_right = self.mean_squared_error(right_y, np.mean(right_y))
score = error_left + error_right

if score < best_score:
best_score = score
best_split = i

else:
error_left = self.mean_squared_error(x[:i], np.mean(y[:i]))
error_right = self.mean_squared_error(x[i:], np.mean(y[i:]))
error = error_left + error_right
if error < min_error:
gain = self.information_gain(y, left_y, right_y)

if gain > best_score:
best_score = gain
best_split = i
min_error = error

if best_split != 0:
left_x = x[:best_split]
Expand All @@ -124,18 +191,27 @@ def train(self, x, y):
right_y = y[best_split:]

self.decision_boundary = x[best_split]

self.left = DecisionTree(
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
depth=self.depth - 1,
min_leaf_size=self.min_leaf_size,
task=self.task,
criterion=self.criterion,
)
self.right = DecisionTree(
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
depth=self.depth - 1,
min_leaf_size=self.min_leaf_size,
task=self.task,
criterion=self.criterion,
)

self.left.train(left_x, left_y)
self.right.train(right_x, right_y)
else:
self.prediction = np.mean(y)

return
elif self.task == "regression":
self.prediction = np.mean(y)
else:
self.prediction = self.most_common_label(y)

def predict(self, x):
"""
Expand All @@ -146,13 +222,13 @@ def predict(self, x):
"""
if self.prediction is not None:
return self.prediction
elif self.left is not None and self.right is not None:
if self.left is not None and self.right is not None:
if x >= self.decision_boundary:
return self.right.predict(x)
else:
return self.left.predict(x)
else:
raise ValueError("Decision tree not yet trained")

raise ValueError("Decision tree not yet trained")


class TestDecisionTree:
Expand Down Expand Up @@ -183,17 +259,20 @@ def main():
x = np.arange(-1.0, 1.0, 0.005)
y = np.sin(x)

tree = DecisionTree(depth=10, min_leaf_size=10)
tree = DecisionTree(depth=10, min_leaf_size=10, task="regression")
tree.train(x, y)

rng = np.random.default_rng()
test_cases = (rng.random(10) * 2) - 1
predictions = np.array([tree.predict(x) for x in test_cases])
avg_error = np.mean((predictions - test_cases) ** 2)
print("Regression prediction:", tree.predict(0.5))
x_cls = np.array([1, 2, 3, 4, 5, 6])
y_cls = np.array([0, 0, 0, 1, 1, 1])

clf = DecisionTree(
depth=3, min_leaf_size=1, task="classification", criterion="gini"
)
clf.train(x_cls, y_cls)

print("Test values: " + str(test_cases))
print("Predictions: " + str(predictions))
print("Average error: " + str(avg_error))
print("Classification prediction (2):", clf.predict(2))
print("Classification prediction (5):", clf.predict(5))


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion maths/greatest_common_divisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def main():
f"{greatest_common_divisor(num_1, num_2)}"
)
print(f"By iterative gcd({num_1}, {num_2}) = {gcd_by_iterative(num_1, num_2)}")
except (IndexError, UnboundLocalError, ValueError):
except (IndexError, UnboundLocalError, ValueError):
print("Wrong input")


Expand Down
2 changes: 1 addition & 1 deletion project_euler/problem_002/sol4.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def solution(n: int = 4000000) -> int:

try:
n = int(n)
except (TypeError, ValueError):
except (TypeError, ValueError):
raise TypeError("Parameter n must be int or castable to int.")
if n <= 0:
raise ValueError("Parameter n must be greater than or equal to one.")
Expand Down
2 changes: 1 addition & 1 deletion project_euler/problem_003/sol1.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def solution(n: int = 600851475143) -> int:

try:
n = int(n)
except (TypeError, ValueError):
except (TypeError, ValueError):
raise TypeError("Parameter n must be int or castable to int.")
if n <= 0:
raise ValueError("Parameter n must be greater than or equal to one.")
Expand Down
2 changes: 1 addition & 1 deletion project_euler/problem_003/sol2.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int:

try:
n = int(n)
except (TypeError, ValueError):
except (TypeError, ValueError):
raise TypeError("Parameter n must be int or castable to int.")
if n <= 0:
raise ValueError("Parameter n must be greater than or equal to one.")
Expand Down
2 changes: 1 addition & 1 deletion project_euler/problem_003/sol3.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def solution(n: int = 600851475143) -> int:

try:
n = int(n)
except (TypeError, ValueError):
except (TypeError, ValueError):
raise TypeError("Parameter n must be int or castable to int.")
if n <= 0:
raise ValueError("Parameter n must be greater than or equal to one.")
Expand Down
2 changes: 1 addition & 1 deletion project_euler/problem_005/sol1.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def solution(n: int = 20) -> int:

try:
n = int(n)
except (TypeError, ValueError):
except (TypeError, ValueError):
raise TypeError("Parameter n must be int or castable to int.")
if n <= 0:
raise ValueError("Parameter n must be greater than or equal to one.")
Expand Down
2 changes: 1 addition & 1 deletion project_euler/problem_007/sol2.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def solution(nth: int = 10001) -> int:

try:
nth = int(nth)
except (TypeError, ValueError):
except (TypeError, ValueError):
raise TypeError("Parameter nth must be int or castable to int.") from None
if nth <= 0:
raise ValueError("Parameter nth must be greater than or equal to one.")
Expand Down
2 changes: 1 addition & 1 deletion web_programming/fetch_well_rx_price.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def fetch_pharmacy_and_price_list(drug_name: str, zip_code: str) -> list | None:

return pharmacy_price_list

except (httpx.HTTPError, ValueError):
except (httpx.HTTPError, ValueError):
return None


Expand Down
2 changes: 1 addition & 1 deletion web_programming/instagram_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def get_json(self) -> dict:
scripts = BeautifulSoup(html, "html.parser").find_all("script")
try:
return extract_user_profile(scripts[4])
except (json.decoder.JSONDecodeError, KeyError):
except (json.decoder.JSONDecodeError, KeyError):
return extract_user_profile(scripts[3])

def __repr__(self) -> str:
Expand Down
Loading