diff --git a/learning_curve.py b/learning_curve.py index 2baa81b..2215f5a 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -18,28 +18,28 @@ def display_digits(): plt.show() - def train_model(): """Train a model on pictures of digits. - + Read in 8x8 pictures of numbers and evaluate the accuracy of the model when different percentages of the data are used as training data. This function plots the average accuracy of the model as a function of the percent of data used to train it. """ data = load_digits() - num_trials = 10 + num_trials = 70 train_percentages = range(5, 95, 5) test_accuracies = numpy.zeros(len(train_percentages)) - # train models with training percentages between 5 and 90 (see - # train_percentages) and evaluate the resultant accuracy for each. - # You should repeat each training percentage num_trials times to smooth out - # variability. - # For consistency with the previous example use - # model = LogisticRegression(C=10**-10) for your learner - - # TODO: your code here + for i in range(0, len(train_percentages)): + total_model_score = 0 + for n in range(0, num_trials): + x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, + train_size = train_percentages[i]/100) + model = LogisticRegression(C=10**-1) #change to C=10**-10 for question 4 + model.fit(x_train, y_train) + total_model_score += model.score(x_test, y_test) + test_accuracies[i] = total_model_score/n fig = plt.figure() plt.plot(train_percentages, test_accuracies) @@ -49,6 +49,5 @@ def train_model(): if __name__ == "__main__": - # Feel free to comment/uncomment as needed - display_digits() - # train_model() + # display_digits() + train_model() diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..b4fd38b --- /dev/null +++ b/questions.txt @@ -0,0 +1,11 @@ +1. What is the general trend in the curve? + The general trend in the curve is positive and logarithmic. + +2. Are there parts of the curve that appear to be noisier than others? Why? 
+ Yes, the beginning of the curve (the low training percentages) is often noisier than the rest. This is because, with only a limited amount of training data, the model does not have enough examples to make accurate predictions, whereas a larger dataset allows more analysis. + +3. How many trials do you need to get a smooth curve? + It varies. The more trials we have, the smoother the curve is. I started with 10 trials and increased the number by 10 each time. By 70 trials, the curve is pretty smooth. + +4. Try different values for C (by changing LogisticRegression(C=10** -10)). What happens? + When I tried a different value for C (C=10** -10), the curve became more linear, rather than the logarithmic curve seen before. \ No newline at end of file