Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ If you updated any of the existing py/ipynb files, you can run `make sync-notebo

`make run-notebooks` will sync, run and save the notebooks. It expects an IPython kernel with scikit-mol installed.

If you are working on a single notebook and only want to sync and run that one, you can adapt the commands from the Makefile:

```bash
uv run jupytext --set-formats docs//notebooks//ipynb,docs//notebooks//scripts//py:percent --sync docs/notebooks/XX_YourNotebook.ipynb
uv run ruff format "docs/notebooks/XX_YourNotebook.ipynb"
uv run jupytext --execute docs/notebooks/XX_YourNotebook.ipynb
```

## Documentation

We use [MkDocs](https://www.mkdocs.org/) to host scikit-mol documentation on ReadTheDocs. If you're making changes to the documentation, or just want to see a live preview of your docstrings, you can take a look at the rendered documentation.
Expand Down
730 changes: 180 additions & 550 deletions docs/notebooks/13_applicability_domain.ipynb

Large diffs are not rendered by default.

28 changes: 22 additions & 6 deletions docs/notebooks/scripts/13_applicability_domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# format_version: '1.3'
# jupytext_version: 1.16.6
# kernelspec:
# display_name: .venv
# display_name: vscode
# language: python
# name: python3
# ---
Expand All @@ -25,6 +25,7 @@
# First, let's import the necessary libraries and load our dataset:

# %%
import os
import numpy as np
import pandas as pd
from rdkit import Chem
Expand All @@ -45,7 +46,22 @@

# %%
# Load the dataset
csv_file = "../../tests/data/SLC6A4_active_excapedb_subset.csv"
# Results are better with the full set, but it takes longer to run, so for the notebook documentation we use a subset by default.
# The subset has been filtered to only include nicely predicted compounds, and is thus artificial.

full_set = False

if full_set:
csv_file = "SLC6A4_active_excape_export.csv"
if not os.path.exists(csv_file):
import urllib.request

url = "https://ndownloader.figshare.com/files/25747817"
urllib.request.urlretrieve(url, csv_file)
percentile = 95
else:
csv_file = "../../tests/data/SLC6A4_active_excapedb_subset.csv"
percentile = 90
data = pd.read_csv(csv_file)

# Add RDKit mol objects
Expand All @@ -72,7 +88,7 @@
binary_fp_pipe = Pipeline(
[
("fp", MorganFingerprintTransformer(fpSize=2048, radius=2)),
("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
("rf", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
]
)

Expand All @@ -84,7 +100,7 @@
abs_errors = np.abs(y_test - y_pred_test)

# Create and fit k-NN AD estimator
knn_ad = KNNApplicabilityDomain(n_neighbors=3, distance_metric="tanimoto")
knn_ad = KNNApplicabilityDomain(n_neighbors=3, distance_metric="tanimoto", percentile=percentile)
knn_ad.fit(binary_fp_pipe.named_steps["fp"].transform(X_train))

# Fit threshold using validation set
Expand Down Expand Up @@ -140,7 +156,7 @@
abs_errors = np.abs(y_test - y_pred_test)

# Create and fit leverage AD estimator
leverage_ad = LeverageApplicabilityDomain()
leverage_ad = LeverageApplicabilityDomain(percentile=percentile)
X_train_transformed = count_fp_pipe.named_steps["scaler"].transform(
count_fp_pipe.named_steps["pca"].transform(
count_fp_pipe.named_steps["fp"].transform(X_train)
Expand Down Expand Up @@ -291,5 +307,5 @@ def check_drug_applicability(smiles, name):
# The famous drugs we tested showed varying degrees of being within the applicability domain, which makes sense given
# that our training set is focused on SLC6A4 actives, while these drugs have different primary targets.
#
# The error analysis shows that compounds outside the applicability domain tend to have higher prediction errors,
# The error analysis shows that compounds outside the applicability domain tend to have higher prediction errors (when using the full set),
# validating the usefulness of these approaches for identifying potentially unreliable predictions.
2 changes: 1 addition & 1 deletion scikit_mol/applicability/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def __init__(
"estimator": self,
"accept_sparse": False,
"dtype": None,
"force_all_finite": True,
"ensure_all_finite": True,
"ensure_2d": True,
}

Expand Down
8 changes: 8 additions & 0 deletions scikit_mol/applicability/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ def fit(self, X: ArrayLike, y=None) -> "KNNApplicabilityDomain":
metric=self.distance_metric,
n_jobs=self.n_jobs,
)

if self.distance_metric == "jaccard":
X = X.astype(bool)

self.nn_.fit(X)

# Set initial threshold based on training data
Expand All @@ -167,6 +171,10 @@ def _transform(self, X: np.ndarray) -> np.ndarray:
Mean distance to k nearest neighbors. Higher values indicate samples
further from the training set.
"""

if self.distance_metric == "jaccard":
X = X.astype(bool)

distances, _ = self.nn_.kneighbors(X)
mean_distances = distances[:, 1:].mean(axis=1) # Skip first (self) neighbor
return mean_distances.reshape(-1, 1)