Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions 01_materials/notebooks/Classification-1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 569 entries, 0 to 568\n",
"Data columns (total 33 columns):\n",
"Data columns (total 32 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 569 non-null int64 \n",
Expand Down Expand Up @@ -505,9 +505,8 @@
" 29 concave points_worst 569 non-null float64\n",
" 30 symmetry_worst 569 non-null float64\n",
" 31 fractal_dimension_worst 569 non-null float64\n",
" 32 Unnamed: 32 0 non-null float64\n",
"dtypes: float64(31), int64(1), object(1)\n",
"memory usage: 146.8+ KB\n"
"dtypes: float64(30), int64(1), object(1)\n",
"memory usage: 142.4+ KB\n"
]
}
],
Expand Down Expand Up @@ -824,7 +823,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -909,13 +908,16 @@
}
],
"source": [
"#defining a new observation we want to predict\n",
"new_obs_Perimeter = 97\n",
"new_obs_Concavity = 0.20\n",
"cancer[\"dist_from_new\"] = (\n",
" (cancer[\"perimeter_mean\"] - new_obs_Perimeter) ** 2\n",
" + (cancer[\"concavity_mean\"] - new_obs_Concavity) ** 2\n",
")**(1/2)\n",
")**(1/2) #this will give us the straight line distance of our observation from all other observations\n",
"# saved into a new column called dist_from_new\n",
"\n",
"#now you are looking at the 5 closest neighbours\n",
"nearest_5 = cancer.nsmallest(5, \"dist_from_new\")[[\n",
" \"perimeter_mean\",\n",
" \"concavity_mean\",\n",
Expand Down Expand Up @@ -2326,7 +2328,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "base",
"display_name": "lcr-env",
"language": "python",
"name": "python3"
},
Expand All @@ -2340,7 +2342,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
"version": "3.11.13"
}
},
"nbformat": 4,
Expand Down
79 changes: 49 additions & 30 deletions 01_materials/notebooks/Classification-2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -473,12 +473,14 @@
"source": [
"#### Scale data \n",
"by standardizing our features in the dataset, to make sure theyre on the same scale. As we've seen, differences in scale can disproportionately affect machine learning models that rely on distance metrics (e.g., K-Nearest Neighbors). \n",
"The `StandardScaler()` function in the sklearn.preprocessing module is a widely used tool for this purpose."
"The `StandardScaler()` function in the sklearn.preprocessing module is a widely used tool for this purpose.\n",
"\n",
"we make a copy so we dont override anything, we don't want to scale the ID column"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -880,7 +882,8 @@
"# This will return a list of the numeric columns we need to scale\n",
"columns_to_scale = standardized_cancer.columns.difference(columns_to_exclude)\n",
"\n",
"# Initialize the StandardScaler to standardize the selected numeric columns\n",
"# Initialize the StandardScaler to standardize the selected numeric columns, this adjusts data so each feature\n",
"# has a mean of 0, and SD of 1\n",
"scaler = StandardScaler()\n",
"\n",
"# Apply the scaler to the selected columns. This transforms the data so that each feature\n",
Expand Down Expand Up @@ -1033,7 +1036,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1070,7 +1073,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -1088,13 +1091,14 @@
}
],
"source": [
"#step 1: initialize our model\n",
"knn = KNeighborsClassifier(n_neighbors=5)\n",
"knn"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -1112,6 +1116,8 @@
}
],
"source": [
"# step 2: define our x and y\n",
"# step 3: fit our model to our data\n",
"knn.fit(X=cancer_train[[\"perimeter_mean\", \"concavity_mean\"]], y=cancer_train[\"diagnosis\"])"
]
},
Expand All @@ -1124,7 +1130,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1248,8 +1254,11 @@
}
],
"source": [
"#step 4: predict on the test set\n",
"cancer_test[\"predicted\"] = knn.predict(cancer_test[[\"perimeter_mean\", \"concavity_mean\"]])\n",
"cancer_test[[\"id\", \"diagnosis\", \"predicted\"]]"
"cancer_test[[\"id\", \"diagnosis\", \"predicted\"]]\n",
"\n",
"#x is in double brackets bc it needs to be provided as a dataframe, y is provided as a vector"
]
},
{
Expand Down Expand Up @@ -1277,7 +1286,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -1295,7 +1304,8 @@
"knn.score(\n",
" cancer_test[[\"perimeter_mean\", \"concavity_mean\"]],\n",
" cancer_test[\"diagnosis\"]\n",
")"
")\n",
"#score is the default function to calculate accuracy"
]
},
{
Expand Down Expand Up @@ -1363,7 +1373,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1424,10 +1434,12 @@
}
],
"source": [
"#this creates a confusion matrix\n",
"pd.crosstab(\n",
" cancer_test[\"diagnosis\"],\n",
" cancer_test[\"predicted\"]\n",
")"
")\n",
"#in the ideal world, we wouldnt have any false positives/wrong classifications"
]
},
{
Expand Down Expand Up @@ -1488,7 +1500,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -1503,11 +1515,13 @@
}
],
"source": [
"#precision: when the model predicted malignant, how often is it actually correct\n",
"precision_score(\n",
" y_true=cancer_test[\"diagnosis\"],\n",
" y_pred=cancer_test[\"predicted\"],\n",
" pos_label=\"Malignant\"\n",
")"
")\n",
"#our precision is really high"
]
},
{
Expand All @@ -1523,7 +1537,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -1538,6 +1552,7 @@
}
],
"source": [
"# recall: of all actual malignant tumors, how many did the model successfuly identify as malignant\n",
"recall_score(\n",
" y_true=cancer_test[\"diagnosis\"],\n",
" y_pred=cancer_test[\"predicted\"],\n",
Expand Down Expand Up @@ -1592,7 +1607,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -1613,17 +1628,19 @@
")\n",
"\n",
"# fit the model on the sub-training data\n",
"knn = KNeighborsClassifier(n_neighbors=3)\n",
"X = cancer_subtrain[[\"perimeter_mean\", \"concavity_mean\"]]\n",
"knn = KNeighborsClassifier(n_neighbors=3) #1. initialize model\n",
"X = cancer_subtrain[[\"perimeter_mean\", \"concavity_mean\"]] #2. define x and y\n",
"y = cancer_subtrain[\"diagnosis\"]\n",
"knn.fit(X, y)\n",
"knn.fit(X, y) #3. fit to subtrain data\n",
"\n",
"# compute the score on validation data\n",
"acc = knn.score(\n",
" cancer_validation[[\"perimeter_mean\", \"concavity_mean\"]],\n",
" cancer_validation[\"diagnosis\"]\n",
")\n",
"acc"
"acc\n",
"\n",
"#acc value will change everytime you re-run the split"
]
},
{
Expand Down Expand Up @@ -1671,7 +1688,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -1755,7 +1772,7 @@
"y = cancer_train[\"diagnosis\"]\n",
"\n",
"returned_dictionary = cross_validate(\n",
" estimator=knn,\n",
" estimator=knn, #estimator is your model\n",
" cv=5, # setting up the cross validation number\n",
" X=X,\n",
" y=y\n",
Expand Down Expand Up @@ -1916,7 +1933,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -2549,9 +2566,9 @@
"cancer_tune_grid.fit(\n",
" cancer_train[[\"perimeter_mean\", \"concavity_mean\"]],\n",
" cancer_train[\"diagnosis\"]\n",
")\n",
") #fitting it to our training data\n",
"\n",
"accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)\n",
"accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_) #creating a dataframe for ease of reading\n",
"accuracies_grid"
]
},
Expand Down Expand Up @@ -2641,7 +2658,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -2656,7 +2673,9 @@
}
],
"source": [
"cancer_tune_grid.best_params_"
"cancer_tune_grid.best_params_\n",
"\n",
"#choose the parameter that is stable"
]
},
{
Expand Down Expand Up @@ -2789,7 +2808,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "base",
"display_name": "lcr-env",
"language": "python",
"name": "python3"
},
Expand All @@ -2803,7 +2822,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.11.13"
}
},
"nbformat": 4,
Expand Down
Loading