dianayangs · dianayangs · Nov 17, 2025 · Nov 17, 2025 · Dec 1, 2025 · Dec 15, 2025
diff --git a/01_materials/notebooks/Classification-1.ipynb b/01_materials/notebooks/Classification-1.ipynb
@@ -470,7 +470,7 @@
      "text": [
       "<class 'pandas.core.frame.DataFrame'>\n",
       "RangeIndex: 569 entries, 0 to 568\n",
-      "Data columns (total 33 columns):\n",
+      "Data columns (total 32 columns):\n",
       " #   Column                   Non-Null Count  Dtype  \n",
       "---  ------                   --------------  -----  \n",
       " 0   id                       569 non-null    int64  \n",
@@ -505,9 +505,8 @@
       " 29  concave points_worst     569 non-null    float64\n",
       " 30  symmetry_worst           569 non-null    float64\n",
       " 31  fractal_dimension_worst  569 non-null    float64\n",
-      " 32  Unnamed: 32              0 non-null      float64\n",
-      "dtypes: float64(31), int64(1), object(1)\n",
-      "memory usage: 146.8+ KB\n"
+      "dtypes: float64(30), int64(1), object(1)\n",
+      "memory usage: 142.4+ KB\n"
      ]
     }
    ],
@@ -824,7 +823,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -909,13 +908,16 @@
     }
    ],
    "source": [
+    "#defining a new observation we want to predict\n",
     "new_obs_Perimeter = 97\n",
     "new_obs_Concavity = 0.20\n",
     "cancer[\"dist_from_new\"] = (\n",
     "       (cancer[\"perimeter_mean\"] - new_obs_Perimeter) ** 2\n",
     "     + (cancer[\"concavity_mean\"] - new_obs_Concavity) ** 2\n",
-    ")**(1/2)\n",
+    ")**(1/2) #this will give us the straight line distance of our observation from all other observations\n",
+    "# saved into a new column called dist_from_new\n",
     "\n",
+    "#now you are looking at the 5 closest neighbours\n",
     "nearest_5 = cancer.nsmallest(5, \"dist_from_new\")[[\n",
     "    \"perimeter_mean\",\n",
     "    \"concavity_mean\",\n",
@@ -2326,7 +2328,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "lcr-env",
    "language": "python",
    "name": "python3"
   },
@@ -2340,7 +2342,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,

diff --git a/01_materials/notebooks/Classification-2.ipynb b/01_materials/notebooks/Classification-2.ipynb
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -473,12 +473,14 @@
    "source": [
     "#### Scale data \n",
     "by standardizing our features in the dataset, to make sure theyre on the same scale. As we've seen, differences in scale can disproportionately affect machine learning models that rely on distance metrics (e.g., K-Nearest Neighbors). \n",
-    "The `StandardScaler()` function in the sklearn.preprocessing module is a widely used tool for this purpose."
+    "The `StandardScaler()` function in the sklearn.preprocessing module is a widely used tool for this purpose.\n",
+    "\n",
+    "We make a copy so we don't overwrite the original data, and we don't want to scale the ID column."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -880,7 +882,8 @@
     "# This will return a list of the numeric columns we need to scale\n",
     "columns_to_scale = standardized_cancer.columns.difference(columns_to_exclude)\n",
     "\n",
-    "# Initialize the StandardScaler to standardize the selected numeric columns\n",
+    "# Initialize the StandardScaler to standardize the selected numeric columns, this adjusts data so each feature\n",
+    "# has a mean of 0, and SD of 1\n",
     "scaler = StandardScaler()\n",
     "\n",
     "# Apply the scaler to the selected columns. This transforms the data so that each feature\n",
@@ -1033,7 +1036,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1070,7 +1073,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1088,13 +1091,14 @@
     }
    ],
    "source": [
+    "#step 1: initialize our model\n",
     "knn = KNeighborsClassifier(n_neighbors=5)\n",
     "knn"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1112,6 +1116,8 @@
     }
    ],
    "source": [
+    "# step 2: define our x and y\n",
+    "# step 3: fit our model to our data\n",
     "knn.fit(X=cancer_train[[\"perimeter_mean\", \"concavity_mean\"]], y=cancer_train[\"diagnosis\"])"
    ]
   },
@@ -1124,7 +1130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1248,8 +1254,11 @@
     }
    ],
    "source": [
+    "#step 4: predict on the test set\n",
     "cancer_test[\"predicted\"] = knn.predict(cancer_test[[\"perimeter_mean\", \"concavity_mean\"]])\n",
-    "cancer_test[[\"id\", \"diagnosis\", \"predicted\"]]"
+    "cancer_test[[\"id\", \"diagnosis\", \"predicted\"]]\n",
+    "\n",
+    "# X is in double brackets because it needs to be provided as a DataFrame; y is provided as a vector"
    ]
   },
   {
@@ -1277,7 +1286,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1295,7 +1304,8 @@
     "knn.score(\n",
     "    cancer_test[[\"perimeter_mean\", \"concavity_mean\"]],\n",
     "    cancer_test[\"diagnosis\"]\n",
-    ")"
+    ")\n",
+    "# for a classifier, score() returns the mean accuracy on the data it is given"
    ]
   },
   {
@@ -1363,7 +1373,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1424,10 +1434,12 @@
     }
    ],
    "source": [
+    "#this creates a confusion matrix\n",
     "pd.crosstab(\n",
     "    cancer_test[\"diagnosis\"],\n",
     "    cancer_test[\"predicted\"]\n",
-    ")"
+    ")\n",
+    "# in an ideal world, we wouldn't have any false positives/wrong classifications"
    ]
   },
   {
@@ -1488,7 +1500,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1503,11 +1515,13 @@
     }
    ],
    "source": [
+    "# precision: when the model predicts malignant, how often is it actually correct\n",
     "precision_score(\n",
     "    y_true=cancer_test[\"diagnosis\"],\n",
     "    y_pred=cancer_test[\"predicted\"],\n",
     "    pos_label=\"Malignant\"\n",
-    ")"
+    ")\n",
+    "#our precision is really high"
    ]
   },
   {
@@ -1523,7 +1537,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1538,6 +1552,7 @@
     }
    ],
    "source": [
+    "# recall: of all actual malignant tumors, how many did the model successfully identify as malignant\n",
     "recall_score(\n",
     "    y_true=cancer_test[\"diagnosis\"],\n",
     "    y_pred=cancer_test[\"predicted\"],\n",
@@ -1592,7 +1607,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1613,17 +1628,19 @@
     ")\n",
     "\n",
     "# fit the model on the sub-training data\n",
-    "knn = KNeighborsClassifier(n_neighbors=3)\n",
-    "X = cancer_subtrain[[\"perimeter_mean\", \"concavity_mean\"]]\n",
+    "knn = KNeighborsClassifier(n_neighbors=3) #1. initialize model\n",
+    "X = cancer_subtrain[[\"perimeter_mean\", \"concavity_mean\"]] #2. define x and y\n",
     "y = cancer_subtrain[\"diagnosis\"]\n",
-    "knn.fit(X, y)\n",
+    "knn.fit(X, y) #3. fit to subtrain data\n",
     "\n",
     "# compute the score on validation data\n",
     "acc = knn.score(\n",
     "    cancer_validation[[\"perimeter_mean\", \"concavity_mean\"]],\n",
     "    cancer_validation[\"diagnosis\"]\n",
     ")\n",
-    "acc"
+    "acc\n",
+    "\n",
+    "# acc value will change every time you re-run the split"
    ]
   },
   {
@@ -1671,7 +1688,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -1755,7 +1772,7 @@
     "y = cancer_train[\"diagnosis\"]\n",
     "\n",
     "returned_dictionary = cross_validate(\n",
-    "    estimator=knn,\n",
+    "    estimator=knn, #estimator is your model\n",
     "    cv=5,    # setting up the cross validation number\n",
     "    X=X,\n",
     "    y=y\n",
@@ -1916,7 +1933,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -2549,9 +2566,9 @@
     "cancer_tune_grid.fit(\n",
     "    cancer_train[[\"perimeter_mean\", \"concavity_mean\"]],\n",
     "    cancer_train[\"diagnosis\"]\n",
-    ")\n",
+    ") #fitting it to our training data\n",
     "\n",
-    "accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)\n",
+    "accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_) #creating a dataframe for ease of reading\n",
     "accuracies_grid"
    ]
   },
@@ -2641,7 +2658,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -2656,7 +2673,9 @@
     }
    ],
    "source": [
-    "cancer_tune_grid.best_params_"
+    "cancer_tune_grid.best_params_\n",
+    "\n",
+    "#choose the parameter that is stable"
    ]
   },
   {
@@ -2789,7 +2808,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "lcr-env",
    "language": "python",
    "name": "python3"
   },
@@ -2803,7 +2822,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,