From b2ff54f2d69589903892296a00f022d4c032998a Mon Sep 17 00:00:00 2001
From: Michal Taraszewski
Date: Sat, 7 Mar 2020 01:02:47 +0100
Subject: [PATCH 1/3] [model] (feat) Initial commit of Lin Reg model

---
 .../LinearRegressionModel-Copy1.py            | 25 ++++++
 .../LinearRegression/LinearRegressionModel.py | 27 ++++++
 .../LinearRegression/Loss.py                  | 17 ++++
 .../LinearRegression/Optimizer-Copy1.py       | 66 +++++++++++++++
 .../LinearRegression/Optimizer.py             | 82 +++++++++++++++++++
 .../LinearRegression/Validator.py             | 41 ++++++++++
 .../LinearRegression/__init__.py              |  0
 .../LinearRegression/untitled-Copy3.txt       |  0
 8 files changed, 258 insertions(+)
 create mode 100644 src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py
 create mode 100644 src/SupervisedLearning/LinearRegression/LinearRegressionModel.py
 create mode 100644 src/SupervisedLearning/LinearRegression/Loss.py
 create mode 100644 src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py
 create mode 100644 src/SupervisedLearning/LinearRegression/Optimizer.py
 create mode 100644 src/SupervisedLearning/LinearRegression/Validator.py
 create mode 100644 src/SupervisedLearning/LinearRegression/__init__.py
 create mode 100644 src/SupervisedLearning/LinearRegression/untitled-Copy3.txt

diff --git a/src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py b/src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py
new file mode 100644
index 0000000..7484b97
--- /dev/null
+++ b/src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py
@@ -0,0 +1,25 @@
+import numpy as np
+#from src.SupervisedLearning.LinearRegression.Validator import LinRegValidator
+#from src.SupervisedLearning.LinearRegression.utils import *
+
+
+class LinearRegressionModel:
+    def __init__(self, n_features, weights=None, optimizer="GD", measure="MSE"):
+        self.n_features = n_features
+        if weights:
+            self.weights = weights
+        else:
+            self.weights = np.random.rand(self.n_features, 1)
+        self.optimizer = optimizer
+        self.measure = measure
+        #self.validator = LinRegValidator(n_features=n_features)
+
+    def fit(self, data, target):
+        #self.validator.validate_training(data, target)
+        self.weights = self.optimizer.optimize(data, target, self.measure)
+
+    def predict(self, data):
+        data = np.c_[np.ones((data.shape[0], 1)), data]
+        return np.dot(data, self.weights)
+
+
diff --git a/src/SupervisedLearning/LinearRegression/LinearRegressionModel.py b/src/SupervisedLearning/LinearRegression/LinearRegressionModel.py
new file mode 100644
index 0000000..cf9b255
--- /dev/null
+++ b/src/SupervisedLearning/LinearRegression/LinearRegressionModel.py
@@ -0,0 +1,27 @@
+import numpy as np
+from src.SupervisedLearning.LinearRegression.Optimizer import BatchGradientDecent
+from src.SupervisedLearning.LinearRegression.Loss import MSE
+from src.SupervisedLearning.LinearRegression.Validator import LinRegValidator
+
+
+class LinearRegressionModel:
+    def __init__(self, n_features, optimizer=None, loss=None):
+        self.n_features = n_features
+        if optimizer:
+            self.optimizer = optimizer
+        else:
+            self.optimizer = BatchGradientDecent(learning_rate=1, n_steps=100, save_history=True)
+        if loss:
+            self.loss = loss
+        else:
+            self.loss = MSE()
+        self.validator = LinRegValidator(n_features=n_features)
+
+    def fit(self, data, target, weights=None):
+        self.validator.validate_training(data, target)
+        self.weights = self.optimizer.optimize(data, target, loss=self.loss, weights=weights)
+
+    def predict(self, data):
+        data = np.c_[np.ones((data.shape[0], 1)), data]
+        return np.dot(data, self.weights)
+
\ No newline at end of file
diff --git a/src/SupervisedLearning/LinearRegression/Loss.py b/src/SupervisedLearning/LinearRegression/Loss.py
new file mode 100644
index 0000000..93e8949
--- /dev/null
+++ b/src/SupervisedLearning/LinearRegression/Loss.py
@@ -0,0 +1,17 @@
+import numpy as np
+
+class MSE():
+    def __init__(self):
+        pass
+
+    def _grad(self, forward, weights, data, target):
+        m = data.shape[0]
+        return 2 / m * np.dot(data.T, forward - target)
+
+
+    def _loss(self, forward, target):
+        return np.square(np.subtract(forward, target)).mean()
+
+
+    def _forward(self, weights, data):
+        return np.dot(data, weights)
\ No newline at end of file
diff --git a/src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py b/src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py
new file mode 100644
index 0000000..07ebf64
--- /dev/null
+++ b/src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py
@@ -0,0 +1,66 @@
+import numpy as np
+
+class BatchGradientDecent:
+    def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
+#    def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
+        """
+        This is the default implementation of the Gradient Decent algorithm.
+
+        :param learning_rate: step size
+        :param weights: Initial theta/weights parameters
+        :param n_steps: number of optimization steps
+        :param save_history: flag whether to save gradients and weights, that can be use to
+        debug/analyze the learning progress
+        """
+        self.learning_rate = learning_rate
+
+        self.n_steps = n_steps
+        self.save_history = save_history
+
+        if save_history:
+            self.history = {}  # dictionary that keeps track of the previously calculated gradients
+
+#    def optimize(self, weights, data, target):
+    def optimize(self, data, target, measure="MSE", weights=None):
+        if not weights:
+            weights = np.random.rand(data.shape[1] + 1, 1)
+        data = np.c_[np.ones((data.shape[0], 1)), data]  # add bias term (x0 = 1) to each instance
+        for step in range(self.n_steps):
+            forward = self.__forward(weights, data)
+            loss = self.__loss(forward, target)
+            gradient = self.__grad(weights, forward, data, target, measure)
+            weights = weights - self.learning_rate * gradient
+            if self.save_history:
+                self.__save_history(step, weights, loss, gradient)
+
+        return weights
+
+    def __grad(self, weights, forward, data, target, measure):
+        """
+        TBD
+        """
+        if measure == "MSE":
+            m = data.shape[0]
+#            print("data.shape ", data.shape)
+#            print("data.T.shape ", data.T.shape)
+#            print("weights.shape ", weights.shape)
+#            print("weights.T.shape ", weights.T.shape)
+#            print("target.shape ", target.shape)
+#            print("target.T.shape ", target.T.shape)
+#            print(weights.shape)
+#            print(target.shape)
+            #return 2 / m * np.dot(data.T, np.dot(data.T, weights) - target)
+            return 2 / m * data.T.dot(data.dot(weights) - target)
+
+
+    def __loss(self, forward, target):
+        return np.square(np.subtract(forward, target)).mean()
+
+    def __forward(self, weights, data):
+        return np.dot(data, weights)
+
+    def __save_history(self, step, weights, loss, gradient):
+        self.history[step] = {"gradient": gradient,
+                              "loss":loss,
+                              "new_weights": weights
+                              }
diff --git a/src/SupervisedLearning/LinearRegression/Optimizer.py b/src/SupervisedLearning/LinearRegression/Optimizer.py
new file mode 100644
index 0000000..568c6f3
--- /dev/null
+++ b/src/SupervisedLearning/LinearRegression/Optimizer.py
@@ -0,0 +1,82 @@
+import numpy as np
+from src.SupervisedLearning.LinearRegression.Loss import MSE
+
+class BatchGradientDecent:
+    def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
+        """
+        This is the default implementation of the Batch Gradient Descent algorithm.
+        :param learning_rate: step size
+        :param n_steps: number of optimization steps
+        :param save_history: flag indicating whether to save gradients and weights, which can be used to
+        debug/analyze the learning progress
+        """
+        self.learning_rate = learning_rate
+        self.n_steps = n_steps
+        self.save_history = save_history
+        if save_history:
+            self.history = {}  # dictionary that keeps track of the previously calculated gradients
+
+
+    def optimize(self, data, target, loss=None, weights=None):
+        if not weights:
+            weights = np.random.rand(data.shape[1] + 1, 1)
+        data = np.c_[np.ones((data.shape[0], 1)), data]  # add bias term (x0 = 1) to each instance
+        if not loss:
+            loss = MSE()
+        for step in range(self.n_steps):
+            forward = loss._forward(weights, data)
+            loss_value = loss._loss(forward, target)
+            gradient = loss._grad(forward, weights, data, target)
+            weights = weights - self.learning_rate * gradient
+            if self.save_history:
+                self.__save_history(step, weights, loss_value, gradient)
+        return weights
+
+
+    def __save_history(self, step, weights, loss_value, gradient):
+        self.history[step] = {"gradient": gradient,
+                              "loss": loss_value,
+                              "weights": weights
+                              }
+
+class StochasticGradientDecent:
+    def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
+        """
+        This is the default implementation of the Stochastic Gradient Descent algorithm.
+        :param learning_rate: step size
+        :param n_steps: number of optimization steps
+        :param save_history: flag indicating whether to save gradients and weights, which can be used to
+        debug/analyze the learning progress
+        """
+        self.learning_rate = learning_rate
+        self.n_steps = n_steps
+        self.save_history = save_history
+        if save_history:
+            self.history = {}  # dictionary that keeps track of the previously calculated gradients
+
+
+    def optimize(self, data, target, loss=None, weights=None):
+        m = data.shape[0]
+        if not weights:
+            weights = np.random.rand(data.shape[1] + 1, 1)
+        data = np.c_[np.ones((data.shape[0], 1)), data]  # add bias term (x0 = 1) to each instance
+        if not loss:
+            loss = MSE()
+        for step in range(self.n_steps):
+            random_index = np.random.randint(0, m)  # sample one training instance (0..m-1)
+            X = data[random_index:random_index+1]
+            y = target[random_index:random_index+1]
+            forward = loss._forward(weights, X)
+            loss_value = loss._loss(forward, y)
+            gradient = loss._grad(forward, weights, X, y)
+            weights = weights - self.learning_rate * gradient
+            if self.save_history:
+                self.__save_history(step, weights, loss_value, gradient)
+        return weights
+
+
+    def __save_history(self, step, weights, loss_value, gradient):
+        self.history[step] = {"gradient": gradient,
+                              "loss": loss_value,
+                              "weights": weights
+                              }
diff --git a/src/SupervisedLearning/LinearRegression/Validator.py b/src/SupervisedLearning/LinearRegression/Validator.py
new file mode 100644
index 0000000..bd02ba9
--- /dev/null
+++ b/src/SupervisedLearning/LinearRegression/Validator.py
@@ -0,0 +1,41 @@
+import numpy as np
+from pandas.api.types import is_numeric_dtype
+
+
+class DataDimError(Exception):
+    def __init__(self, expected, received):
+        message = "".join(map(str, ["Invalid number of features, expected: ", expected, " received: ", received]))
+        super(DataDimError, self).__init__(message)
+
+class DataTargetMissmatch(Exception):
+    def __init__(self, data, target):
+        message = "".join(map(str, ["Number of data examples: ", data, " does not match target: ", target, " examples"]))
+        super(DataTargetMissmatch, self).__init__(message)
+
+class DataTypeError(Exception):
+    def __init__(self):
+        message = "".join(map(str, ["Invalid type of data, expected numerical."]))
+        super(DataTypeError, self).__init__(message)
+
+
+class LinRegValidator:
+    def __init__(self, n_features):
+        self.n_features = n_features
+
+    def validate_training(self, data, target):
+        self.__validate_data(data)
+        self.__validate_data_type(data)
+        self.__validate_data_type(target)
+        self.__check_if_data_and_target_match(data, target)
+
+    def __validate_data(self, data):
+        if data.shape[1] != self.n_features:
+            raise DataDimError(self.n_features, data.shape[1])
+
+    def __validate_data_type(self, data):
+        if not is_numeric_dtype(data):
+            raise DataTypeError()
+
+    def __check_if_data_and_target_match(self, data, target):
+        if data.shape[0] != target.shape[0]:
+            raise DataTargetMissmatch(data.shape[0], target.shape[0])
\ No newline at end of file
diff --git a/src/SupervisedLearning/LinearRegression/__init__.py b/src/SupervisedLearning/LinearRegression/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/SupervisedLearning/LinearRegression/untitled-Copy3.txt b/src/SupervisedLearning/LinearRegression/untitled-Copy3.txt
new file mode 100644
index 0000000..e69de29
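Note (editorial addition, not part of the patch series): the __grad docstring in Optimizer-Copy1.py above is still "TBD", so for reference, both optimizers minimize the MSE loss from Loss.py, whose gradient with respect to the weights is (2 / m) * X^T (X w - y) once the bias column of ones has been prepended to X; each step then applies w <- w - learning_rate * gradient. A minimal, self-contained sketch of one such step in plain numpy (the variable names and toy data are illustrative and do not come from the patches):

import numpy as np

# Toy data for y = 1 + 2x, with the bias column of ones already prepended,
# mirroring what BatchGradientDecent.optimize does with np.c_.
X = np.c_[np.ones((4, 1)), np.arange(4).reshape(-1, 1)]
y = np.array([1.0, 3.0, 5.0, 7.0])
w = np.zeros(2)
learning_rate = 0.1  # illustrative value; the classes above default to 1

gradient = 2 / X.shape[0] * X.T.dot(X.dot(w) - y)  # same formula as Loss.MSE._grad
w = w - learning_rate * gradient                   # one batch gradient descent step
print(w)  # the first step already moves the weights from [0, 0] towards [1, 2]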
From d16c4b71c061ca5e77315a44428d54e7a83c7dd9 Mon Sep 17 00:00:00 2001
From: Michal Taraszewski
Date: Sat, 7 Mar 2020 01:06:20 +0100
Subject: [PATCH 2/3] [model] (chore) removed trash

---
 .../LinearRegressionModel-Copy1.py      | 25 -------
 .../LinearRegression/Optimizer-Copy1.py | 66 -------------------
 .../LinearRegression/untitled-Copy3.txt |  0
 3 files changed, 91 deletions(-)
 delete mode 100644 src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py
 delete mode 100644 src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py
 delete mode 100644 src/SupervisedLearning/LinearRegression/untitled-Copy3.txt

diff --git a/src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py b/src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py
deleted file mode 100644
index 7484b97..0000000
--- a/src/SupervisedLearning/LinearRegression/LinearRegressionModel-Copy1.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import numpy as np
-#from src.SupervisedLearning.LinearRegression.Validator import LinRegValidator
-#from src.SupervisedLearning.LinearRegression.utils import *
-
-
-class LinearRegressionModel:
-    def __init__(self, n_features, weights=None, optimizer="GD", measure="MSE"):
-        self.n_features = n_features
-        if weights:
-            self.weights = weights
-        else:
-            self.weights = np.random.rand(self.n_features, 1)
-        self.optimizer = optimizer
-        self.measure = measure
-        #self.validator = LinRegValidator(n_features=n_features)
-
-    def fit(self, data, target):
-        #self.validator.validate_training(data, target)
-        self.weights = self.optimizer.optimize(data, target, self.measure)
-
-    def predict(self, data):
-        data = np.c_[np.ones((data.shape[0], 1)), data]
-        return np.dot(data, self.weights)
-
-
diff --git a/src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py b/src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py
deleted file mode 100644
index 07ebf64..0000000
--- a/src/SupervisedLearning/LinearRegression/Optimizer-Copy1.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import numpy as np
-
-class BatchGradientDecent:
-    def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
-#    def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
-        """
-        This is the default implementation of the Gradient Decent algorithm.
-
-        :param learning_rate: step size
-        :param weights: Initial theta/weights parameters
-        :param n_steps: number of optimization steps
-        :param save_history: flag whether to save gradients and weights, that can be use to
-        debug/analyze the learning progress
-        """
-        self.learning_rate = learning_rate
-
-        self.n_steps = n_steps
-        self.save_history = save_history
-
-        if save_history:
-            self.history = {}  # dictionary that keeps track of the previously calculated gradients
-
-#    def optimize(self, weights, data, target):
-    def optimize(self, data, target, measure="MSE", weights=None):
-        if not weights:
-            weights = np.random.rand(data.shape[1] + 1, 1)
-        data = np.c_[np.ones((data.shape[0], 1)), data]  # add bias term (x0 = 1) to each instance
-        for step in range(self.n_steps):
-            forward = self.__forward(weights, data)
-            loss = self.__loss(forward, target)
-            gradient = self.__grad(weights, forward, data, target, measure)
-            weights = weights - self.learning_rate * gradient
-            if self.save_history:
-                self.__save_history(step, weights, loss, gradient)
-
-        return weights
-
-    def __grad(self, weights, forward, data, target, measure):
-        """
-        TBD
-        """
-        if measure == "MSE":
-            m = data.shape[0]
-#            print("data.shape ", data.shape)
-#            print("data.T.shape ", data.T.shape)
-#            print("weights.shape ", weights.shape)
-#            print("weights.T.shape ", weights.T.shape)
-#            print("target.shape ", target.shape)
-#            print("target.T.shape ", target.T.shape)
-#            print(weights.shape)
-#            print(target.shape)
-            #return 2 / m * np.dot(data.T, np.dot(data.T, weights) - target)
-            return 2 / m * data.T.dot(data.dot(weights) - target)
-
-
-    def __loss(self, forward, target):
-        return np.square(np.subtract(forward, target)).mean()
-
-    def __forward(self, weights, data):
-        return np.dot(data, weights)
-
-    def __save_history(self, step, weights, loss, gradient):
-        self.history[step] = {"gradient": gradient,
-                              "loss":loss,
-                              "new_weights": weights
-                              }
diff --git a/src/SupervisedLearning/LinearRegression/untitled-Copy3.txt b/src/SupervisedLearning/LinearRegression/untitled-Copy3.txt
deleted file mode 100644
index e69de29..0000000
From efe51dbcb82ee47af0238140e501de4731b5f377 Mon Sep 17 00:00:00 2001
From: Michal Taraszewski
Date: Sat, 14 Mar 2020 21:52:53 +0100
Subject: [PATCH 3/3] [model] (chore & tests) Refactored Model and Optimizer,
 added tests

---
 .../LinearRegression/LinearRegressionModel.py | 16 ++++---
 .../LinearRegression/Optimizer.py             | 32 +++++++++++---
 .../LinearRegression/tests/__init__.py        |  0
 .../tests/test_LinearRegressionModel.py       | 44 +++++++++++++++++++
 4 files changed, 81 insertions(+), 11 deletions(-)
 create mode 100644 src/SupervisedLearning/LinearRegression/tests/__init__.py
 create mode 100644 src/SupervisedLearning/LinearRegression/tests/test_LinearRegressionModel.py

diff --git a/src/SupervisedLearning/LinearRegression/LinearRegressionModel.py b/src/SupervisedLearning/LinearRegression/LinearRegressionModel.py
index cf9b255..36c38f8 100644
--- a/src/SupervisedLearning/LinearRegression/LinearRegressionModel.py
+++ b/src/SupervisedLearning/LinearRegression/LinearRegressionModel.py
@@ -1,12 +1,17 @@
+import sys
+import os
+sys.path.append(os.path.join(os.path.dirname(__file__)))
+
 import numpy as np
-from src.SupervisedLearning.LinearRegression.Optimizer import BatchGradientDecent
-from src.SupervisedLearning.LinearRegression.Loss import MSE
-from src.SupervisedLearning.LinearRegression.Validator import LinRegValidator
+from Optimizer import BatchGradientDecent
+from Loss import MSE
+from Validator import LinRegValidator
 
 
 class LinearRegressionModel:
     def __init__(self, n_features, optimizer=None, loss=None):
         self.n_features = n_features
+        self.weights = np.zeros(n_features + 1)  # add weight for bias term
         if optimizer:
             self.optimizer = optimizer
         else:
@@ -17,9 +22,10 @@ def __init__(self, n_features, optimizer=None, loss=None):
             self.loss = MSE()
         self.validator = LinRegValidator(n_features=n_features)
 
-    def fit(self, data, target, weights=None):
+    def fit(self, data, target):
+        target = target.reshape(-1,)
         self.validator.validate_training(data, target)
-        self.weights = self.optimizer.optimize(data, target, loss=self.loss, weights=weights)
+        self.weights = self.optimizer.optimize(data, target, loss=self.loss, weights=self.weights)
 
     def predict(self, data):
         data = np.c_[np.ones((data.shape[0], 1)), data]
         return np.dot(data, self.weights)
diff --git a/src/SupervisedLearning/LinearRegression/Optimizer.py b/src/SupervisedLearning/LinearRegression/Optimizer.py
index 568c6f3..5025605 100644
--- a/src/SupervisedLearning/LinearRegression/Optimizer.py
+++ b/src/SupervisedLearning/LinearRegression/Optimizer.py
@@ -1,6 +1,18 @@
 import numpy as np
-from src.SupervisedLearning.LinearRegression.Loss import MSE
+from Loss import MSE
 
+class DummyOptim:
+    def __init__(self):
+        """
+        This class is a dummy implementation of the Optimizer interface.
+        It is used to test correctness of the LinearRegressionModel implementation.
+        """
+        pass
+
+    def optimize(self, data, target, loss, weights):
+        return np.arange(weights.shape[0])
+
+
 class BatchGradientDecent:
     def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
         """
@@ -18,15 +30,17 @@ def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
 
 
     def optimize(self, data, target, loss=None, weights=None):
-        if not weights:
-            weights = np.random.rand(data.shape[1] + 1, 1)
+        if isinstance(weights, np.ndarray):
+            pass
+        else:
+            weights = np.random.rand(data.shape[1] + 1, 1)  # add weight for bias term
         data = np.c_[np.ones((data.shape[0], 1)), data]  # add bias term (x0 = 1) to each instance
         if not loss:
             loss = MSE()
         for step in range(self.n_steps):
             forward = loss._forward(weights, data)
             loss_value = loss._loss(forward, target)
-            gradient = loss._grad(forward, weights, data, target) 
+            gradient = loss._grad(forward, weights, data, target)
             weights = weights - self.learning_rate * gradient
             if self.save_history:
                 self.__save_history(step, weights, loss_value, gradient)
@@ -39,6 +53,7 @@ def __save_history(self, step, weights, loss_value, gradient):
                               "loss": loss_value,
                               "weights": weights
                               }
 
+
 class StochasticGradientDecent:
     def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
         """
@@ -57,9 +72,14 @@ def __init__(self, learning_rate = 1, n_steps = 10, save_history=False):
 
     def optimize(self, data, target, loss=None, weights=None):
         m = data.shape[0]
-        if not weights:
-            weights = np.random.rand(data.shape[1] + 1, 1)
+        if isinstance(weights, np.ndarray):
+            pass
+        else:
+            weights = np.random.rand(data.shape[1] + 1, 1)  # add weight for bias term
         data = np.c_[np.ones((data.shape[0], 1)), data]  # add bias term (x0 = 1) to each instance
+#        if not weights:
+#            weights = np.random.rand(data.shape[1] + 1, 1)
+#        data = np.c_[np.ones((data.shape[0], 1)), data]  # add bias term (x0 = 1) to each instance
         if not loss:
             loss = MSE()
         for step in range(self.n_steps):
diff --git a/src/SupervisedLearning/LinearRegression/tests/__init__.py b/src/SupervisedLearning/LinearRegression/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/SupervisedLearning/LinearRegression/tests/test_LinearRegressionModel.py b/src/SupervisedLearning/LinearRegression/tests/test_LinearRegressionModel.py
new file mode 100644
index 0000000..94076c9
--- /dev/null
+++ b/src/SupervisedLearning/LinearRegression/tests/test_LinearRegressionModel.py
@@ -0,0 +1,44 @@
+import sys, os.path
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+from unittest import TestCase
+from LinearRegressionModel import *
+from Validator import *
+from Optimizer import DummyOptim
+from Loss import MSE
+import numpy as np
+
+
+class TestLinearRegressionModel(TestCase):
+    def setUp(self):
+        self.n_features = 4
+        self.linreg = LinearRegressionModel(self.n_features, optimizer=DummyOptim(), loss=MSE())
+
+    def test_fit_with_dummy_optim(self):
+        n_events = 5
+        data = np.random.randint(10, size=(n_events, self.n_features))
+        target = np.zeros(n_events)
+        self.linreg.fit(data, target)
+
+    def test_fit_should_raise_if_wrong_data_dimensionality(self):
+        n_events = 5
+        wrong_dim = self.n_features-2
+        data = np.random.randint(10, size=(n_events, wrong_dim))
+        target = np.zeros(n_events)
+
+        self.assertRaises(DataDimError, self.linreg.fit, data, target)
+
+    def test_fit_should_raise_if_data_target_not_equal_examples(self):
+        n_events = 5
+        data = np.random.randint(10, size=(n_events, self.n_features))
+        target = np.zeros(n_events-1)
+
+        self.assertRaises(DataTargetMissmatch, self.linreg.fit, data, target)
+
+    def test_predict_with_dummy_optim(self):
+        n_events = 5
+        data = np.random.randint(10, size=(n_events, self.n_features))
+        predictions = self.linreg.predict(data)
+
+        self.assertEqual(predictions.shape[0], n_events)
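Note (editorial addition, not part of the patch series): the patches ship no usage example, so the sketch below is a minimal smoke test of the model as it stands after PATCH 3/3. It assumes that src/SupervisedLearning/LinearRegression is on sys.path (the same assumption the refactored modules make for their own imports), and it passes an explicit learning_rate of 0.1 because the class default of 1 can diverge on this data; both values are illustrative choices, not values taken from the patches.

import numpy as np

from LinearRegressionModel import LinearRegressionModel
from Optimizer import BatchGradientDecent

np.random.seed(0)
X = 2 * np.random.rand(100, 1)                     # one feature
y = 4 + 3 * X[:, 0] + 0.1 * np.random.randn(100)   # y = 4 + 3x + noise

# Explicit optimizer instead of the model's default (learning_rate=1, n_steps=100).
optimizer = BatchGradientDecent(learning_rate=0.1, n_steps=1000, save_history=True)
model = LinearRegressionModel(n_features=1, optimizer=optimizer)
model.fit(X, y)

print(model.weights)                            # expected to end up close to [4, 3]
print(model.predict(np.array([[0.0], [1.0]])))  # predictions near 4 and 7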