EBjerrum · EBjerrum · Jun 1, 2025 · May 11, 2025 · May 11, 2025 · May 11, 2025
@@ -103,6 +103,14 @@ If you updated any of the existing py/ipynb files, you can run `make sync-notebo
 
 `make run-notebooks` will sync, run and save the notebooks, expects an ipython kernel with scikit-mol installed.
 
+If you are working on a single notebook and only want to sync and run that one, you can adapt the commands from the Makefile:
+
+```bash
+uv run jupytext --set-formats docs//notebooks//ipynb,docs//notebooks//scripts//py:percent --sync docs/notebooks/XX_YourNotebook.ipynb
+uv run ruff format "docs/notebooks/XX_YourNotebook.ipynb"
+uv run jupytext --execute docs/notebooks/XX_YourNotebook.ipynb
+```
+
 ## Documentation
 
 We use [MkDocs](https://www.mkdocs.org/) to host scikit-mol documentation on ReadTheDocs. If you're making some changes to the documentation or just want to see a live preview of your docstrings, you can take a look at the rendered documentation.

@@ -8,7 +8,7 @@
 #       format_version: '1.3'
 #       jupytext_version: 1.16.6
 #   kernelspec:
-#     display_name: .venv
+#     display_name: vscode
 #     language: python
 #     name: python3
 # ---
@@ -25,6 +25,7 @@
 # First, let's import the necessary libraries and load our dataset:
 
 # %%
+import os
 import numpy as np
 import pandas as pd
 from rdkit import Chem
@@ -45,7 +46,22 @@
 
 # %%
 # Load the dataset
-csv_file = "../../tests/data/SLC6A4_active_excapedb_subset.csv"
+# Results are better with the full set, but it takes longer to run, so for the notebook documentation we use a subset by default.
+# The subset has been filtered to only include nicely predicted compounds, and is thus artificial.
+
+full_set = False
+
+if full_set:
+    csv_file = "SLC6A4_active_excape_export.csv"
+    if not os.path.exists(csv_file):
+        import urllib.request
+
+        url = "https://ndownloader.figshare.com/files/25747817"
+        urllib.request.urlretrieve(url, csv_file)
+        percentile = 95
+else:
+    csv_file = "../../tests/data/SLC6A4_active_excapedb_subset.csv"
+    percentile = 90
 data = pd.read_csv(csv_file)
 
 # Add RDKit mol objects
@@ -72,7 +88,7 @@
 binary_fp_pipe = Pipeline(
     [
         ("fp", MorganFingerprintTransformer(fpSize=2048, radius=2)),
-        ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
+        ("rf", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
     ]
 )
 
@@ -84,7 +100,7 @@
 abs_errors = np.abs(y_test - y_pred_test)
 
 # Create and fit k-NN AD estimator
-knn_ad = KNNApplicabilityDomain(n_neighbors=3, distance_metric="tanimoto")
+knn_ad = KNNApplicabilityDomain(n_neighbors=3, distance_metric="tanimoto", percentile=percentile)
 knn_ad.fit(binary_fp_pipe.named_steps["fp"].transform(X_train))
 
 # Fit threshold using validation set
@@ -140,7 +156,7 @@
 abs_errors = np.abs(y_test - y_pred_test)
 
 # Create and fit leverage AD estimator
-leverage_ad = LeverageApplicabilityDomain()
+leverage_ad = LeverageApplicabilityDomain(percentile=percentile)
 X_train_transformed = count_fp_pipe.named_steps["scaler"].transform(
     count_fp_pipe.named_steps["pca"].transform(
         count_fp_pipe.named_steps["fp"].transform(X_train)
@@ -291,5 +307,5 @@ def check_drug_applicability(smiles, name):
 # The famous drugs we tested showed varying degrees of being within the applicability domain, which makes sense given
 # that our training set is focused on SLC6A4 actives, while these drugs have different primary targets.
 #
-# The error analysis shows that compounds outside the applicability domain tend to have higher prediction errors,
+# The error analysis shows that compounds outside the applicability domain tend to have higher prediction errors (when using the full set),
 # validating the usefulness of these approaches for identifying potentially unreliable predictions.
@@ -100,7 +100,7 @@ def __init__(
             "estimator": self,
             "accept_sparse": False,
             "dtype": None,
-            "force_all_finite": True,
+            "ensure_all_finite": True,
             "ensure_2d": True,
         }
 

@@ -146,6 +146,10 @@ def fit(self, X: ArrayLike, y=None) -> "KNNApplicabilityDomain":
             metric=self.distance_metric,
             n_jobs=self.n_jobs,
         )
+
+        if self.distance_metric == "jaccard":
+            X = X.astype(bool)
+
         self.nn_.fit(X)
 
         # Set initial threshold based on training data
@@ -167,6 +171,10 @@ def _transform(self, X: np.ndarray) -> np.ndarray:
             Mean distance to k nearest neighbors. Higher values indicate samples
             further from the training set.
         """
+
+        if self.distance_metric == "jaccard":
+            X = X.astype(bool)
+
         distances, _ = self.nn_.kneighbors(X)
         mean_distances = distances[:, 1:].mean(axis=1)  # Skip first (self) neighbor
         return mean_distances.reshape(-1, 1)