From 89be4278b20c40112d63f7391e425fe3fa701d4e Mon Sep 17 00:00:00 2001
From: AKPreto <martinsgomes.jose@gmail.com>
Date: Mon, 4 May 2026 13:51:24 +0000
Subject: [PATCH 1/3] Reviewed data prep notebooks

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index aa0d6b7..254c339 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,7 +56,7 @@ DATABASE_SCHEMA.md
 /.databrickscfg
 /.data
 /.local
-/notebooks/data_prep/
+/notebooks/
 
 # Chache
 /.pytest_cache

From b4c128543d9734b2ba445fce825e87c4103c1071 Mon Sep 17 00:00:00 2001
From: AKPreto <martinsgomes.jose@gmail.com>
Date: Mon, 4 May 2026 13:55:02 +0000
Subject: [PATCH 2/3] unignoring debug

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 254c339..6ba0fe4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,7 +22,6 @@ DATABASE_SCHEMA.md
 *.svg
 *.log
 /notebooks/*.py
-/notebooks/debug/*.py
 
 # File exceptions
 !files/autodock_vina_box.txt

From 45f2b2ba3597c0f7476226b479c1f4e6c4a3da4d Mon Sep 17 00:00:00 2001
From: AKPreto <martinsgomes.jose@gmail.com>
Date: Mon, 4 May 2026 13:57:15 +0000
Subject: [PATCH 3/3] unignoring debug

---
 .gitignore                                    |   1 -
 notebooks/case_study_data/build_decoy.ipynb   | 382 ++++++++++
 .../case_study_data/ligand_preparation.ipynb  | 699 ++++++++++++++++++
 .../case_study_data/protein_preparation.ipynb | 255 +++++++
 4 files changed, 1336 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/case_study_data/build_decoy.ipynb
 create mode 100644 notebooks/case_study_data/ligand_preparation.ipynb
 create mode 100644 notebooks/case_study_data/protein_preparation.ipynb

diff --git a/.gitignore b/.gitignore
index 6ba0fe4..eed5aa6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,7 +55,6 @@ DATABASE_SCHEMA.md
 /.databrickscfg
 /.data
 /.local
-/notebooks/
 
 # Chache
 /.pytest_cache
diff --git a/notebooks/case_study_data/build_decoy.ipynb b/notebooks/case_study_data/build_decoy.ipynb
new file mode 100644
index 0000000..57cc643
--- /dev/null
+++ b/notebooks/case_study_data/build_decoy.ipynb
@@ -0,0 +1,382 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "import os\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "\n",
+    "from guild.tools.ligand_properties import (\n",
+    "    assign_properties,\n",
+    "    compound_filter,\n",
+    "    generate_equal_mw_distributions,\n",
+    ")\n",
+    "from guild.transformers.chembl import get_assays_locally, get_decoys_locally"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Get protein info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "support_dir = \"../../guild/support\"\n",
+    "\n",
+    "os.makedirs(f\"{support_dir}/binders\", exist_ok=True)\n",
+    "os.makedirs(f\"{support_dir}/decoys\", exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpcrdb_mapping = pd.read_csv(f\"{support_dir}/uniprot_gpcrdb_mapping.txt\", sep=\" \")\n",
+    "gpcrdb_mapping.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "uniprot_gene_list = set(gpcrdb_mapping[\"uniprot_id\"].tolist())\n",
+    "len(uniprot_gene_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Update binders to ChEMBL 36"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assay_df = get_assays_locally(uniprot_gene_list, min_mol_wt=250, max_mol_wt=450)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter out compounds with pchembl value <= 0\n",
+    "assay_df = assay_df[assay_df[\"pchembl_value\"] > 0]\n",
+    "\n",
+    "# Assign activity based on pchembl value 6\n",
+    "assay_df[\"activity\"] = assay_df[\"pchembl_value\"].apply(\n",
+    "    lambda x: \"weak-binder\" if x < 6 else \"strong-binder\"\n",
+    ")\n",
+    "chembl_version = assay_df[\"chembl_version\"].values[0]\n",
+    "chembl_version"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BINDER_FILE = f\"{support_dir}/binders/chembl_binders_{chembl_version}.tsv\"\n",
+    "\n",
+    "if not os.path.exists(BINDER_FILE):\n",
+    "    assay_df_filtered = assay_df[\n",
+    "        [\n",
+    "            \"protein_symbol\",\n",
+    "            \"activity\",\n",
+    "            \"chembl_id\",\n",
+    "            \"smiles\",\n",
+    "            \"pchembl_value\",\n",
+    "        ]\n",
+    "    ]\n",
+    "    assay_df_filtered[\"chembl_id\"] = \"CHEMBL\" + assay_df_filtered[\"chembl_id\"].astype(\n",
+    "        str\n",
+    "    )\n",
+    "    assay_df_filtered.rename(columns={\"protein_symbol\": \"uniprot_id\"}, inplace=True)\n",
+    "\n",
+    "    assay_df_filtered = pd.merge(\n",
+    "        assay_df_filtered, gpcrdb_mapping, how=\"outer\", on=\"uniprot_id\"\n",
+    "    )\n",
+    "\n",
+    "    assay_df_filtered.to_csv(\n",
+    "        f\"{support_dir}/binders/chembl_binders_{chembl_version}.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        index=False,\n",
+    "    )\n",
+    "else:\n",
+    "    assay_df_filtered = pd.read_csv(BINDER_FILE, sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Update decoys to ChEMBL 36"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MIN_MW = 250\n",
+    "MAX_MW = 450\n",
+    "MAX_RING_SIZE = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "decoy_df = get_decoys_locally(\n",
+    "    min_mol_wt=MIN_MW, max_mol_wt=MAX_MW, version=\"chembl_36\"\n",
+    ")  # without the version flag, the latest chembl data is pulled\n",
+    "decoy_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add molecular properties to the df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(f\"{support_dir}/decoys/{chembl_version}_with_properties.tsv\"):\n",
+    "    decoy_df_with_properties = assign_properties(\n",
+    "        decoy_df[[\"chembl_id\", \"canonical_smiles\"]]\n",
+    "    )\n",
+    "    decoy_df_with_properties[\"scaffold_smiles\"] = decoy_df_with_properties[\n",
+    "        \"scaffold_smiles\"\n",
+    "    ].replace(\"\", None)\n",
+    "\n",
+    "    decoy_df_with_properties.to_csv(\n",
+    "        f\"{support_dir}/decoys/{chembl_version}_with_properties.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        index=False,\n",
+    "    )\n",
+    "else:\n",
+    "    decoy_df_with_properties = pd.read_csv(\n",
+    "        f\"{support_dir}/decoys/{chembl_version}_with_properties.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        low_memory=False,\n",
+    "    )\n",
+    "\n",
+    "decoy_df_with_properties.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Subset to match the same properties as other ligands"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_decoy_df = pd.merge(\n",
+    "    decoy_df,\n",
+    "    decoy_df_with_properties,\n",
+    "    left_on=\"chembl_id\",\n",
+    "    right_on=\"id\",\n",
+    "    how=\"inner\",\n",
+    ")\n",
+    "all_decoy_df.drop(columns=[\"id\"], inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_decoy_df.to_csv(\n",
+    "    f\"{support_dir}/decoys/{chembl_version}_decoys_with_properties.tsv\",\n",
+    "    sep=\"\\t\",\n",
+    "    index=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_decoy_df[\"molecular_weight\"].plot.hist(bins=100)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Subset the df based on scaffolds and MW in the distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if os.path.exists(f\"{support_dir}/decoys/{chembl_version}_decoys_filtered.tsv\"):\n",
+    "    decoys_filtered = pd.read_csv(\n",
+    "        f\"{support_dir}/decoys/{chembl_version}_decoys_filtered.tsv\", sep=\"\\t\"\n",
+    "    )\n",
+    "else:\n",
+    "    decoys_filtered = compound_filter(\n",
+    "        all_decoy_df,\n",
+    "        min_MW=MIN_MW,\n",
+    "        max_MW=MAX_MW,\n",
+    "        scaffold_col=\"scaffold_smiles\",\n",
+    "        ro5_fulfilled_col=\"ro5_fulfilled\",\n",
+    "        largest_ring_size_col=\"max_ring_size\",\n",
+    "        max_per_scaffold=5,\n",
+    "        seed=42,\n",
+    "        MW_column=\"molecular_weight\",\n",
+    "        max_ring_size=MAX_RING_SIZE,\n",
+    "    )\n",
+    "\n",
+    "    decoys_filtered.to_csv(\n",
+    "        f\"{support_dir}/decoys/{chembl_version}_decoys_filtered.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        index=False,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generate decoys sublists"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Decoys with 1000 samples\n",
+    "decoys_sample_1000 = generate_equal_mw_distributions(\n",
+    "    df_1=decoys_filtered,\n",
+    "    mw_column=\"molecular_weight\",\n",
+    "    lipinski_column=\"ro5_fulfilled\",\n",
+    "    n_bins=100,\n",
+    "    target_size=1000,\n",
+    ")\n",
+    "\n",
+    "decoys_sample_1000.to_csv(\n",
+    "    f\"{support_dir}/decoys/{chembl_version}_decoys_1000.tsv\",\n",
+    "    sep=\"\\t\",\n",
+    "    index=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "decoys_sample_1000[\"molecular_weight\"].plot.hist(bins=100)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Generate 100 decoys list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "decoys_sample_100 = generate_equal_mw_distributions(\n",
+    "    df_1=decoys_filtered,\n",
+    "    mw_column=\"molecular_weight\",\n",
+    "    lipinski_column=\"ro5_fulfilled\",\n",
+    "    n_bins=20,\n",
+    "    target_size=100,\n",
+    ")\n",
+    "\n",
+    "decoys_sample_100.to_csv(\n",
+    "    f\"{support_dir}/decoys/{chembl_version}_decoys_100.tsv\",\n",
+    "    sep=\"\\t\",\n",
+    "    index=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "decoys_sample_100[\"molecular_weight\"].plot.hist(bins=20)\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/case_study_data/ligand_preparation.ipynb b/notebooks/case_study_data/ligand_preparation.ipynb
new file mode 100644
index 0000000..79175e3
--- /dev/null
+++ b/notebooks/case_study_data/ligand_preparation.ipynb
@@ -0,0 +1,699 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import pandas as pd\n",
+    "import plotly.express as px\n",
+    "import plotly.graph_objects as go\n",
+    "from plotly.subplots import make_subplots\n",
+    "from rdkit import RDLogger\n",
+    "from rdkit.Chem import AllChem, CanonSmiles, MolFromSmiles, MolToSmiles, PandasTools\n",
+    "from rdkit.Contrib.NP_Score import npscorer\n",
+    "from sklearn.decomposition import PCA\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "from guild.tools.ligand_properties import (\n",
+    "    assign_properties,\n",
+    "    compound_filter,\n",
+    "    generate_equal_mw_distributions,\n",
+    ")\n",
+    "\n",
+    "RDLogger.DisableLog(\"rdApp.info\")\n",
+    "lg = RDLogger.logger()\n",
+    "lg.setLevel(RDLogger.CRITICAL)\n",
+    "tqdm.pandas()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "publication_data_folder = \"../../guild/support/publication_data\"\n",
+    "os.makedirs(publication_data_folder, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MIN_MW = 250\n",
+    "MAX_MW = 450\n",
+    "MAX_RING_SIZE = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def canonicalize_smiles(smiles):\n",
+    "    try:\n",
+    "        return CanonSmiles(smiles)\n",
+    "    except Exception:\n",
+    "        return smiles"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Preparing the NP dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_nps = pd.read_csv(\n",
+    "    \"../../guild/support/coconut_csv-02-2026.csv\", sep=\",\", header=0\n",
+    ")[[\"identifier\",\"canonical_smiles\"]]\n",
+    "loaded_nps.columns = [\"COCONUT_ID\",\"SMILES\"]\n",
+    "loaded_nps = loaded_nps.drop_duplicates(subset=\"SMILES\").reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_nps[[\"simplified_id\",\"0_scaffold\"]] = loaded_nps[\"COCONUT_ID\"].str.split(\".\", expand=True)\n",
+    "loaded_nps = loaded_nps.loc[loaded_nps[\"0_scaffold\"] == \"0\"].reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_nps[\"can_smiles\"] = loaded_nps[\"SMILES\"].progress_apply(canonicalize_smiles)\n",
+    "\n",
+    "loaded_nps = loaded_nps[[\"COCONUT_ID\", \"can_smiles\"]]\n",
+    "loaded_nps = loaded_nps.reset_index(drop=True)\n",
+    "loaded_nps.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Adding chemical properties"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#if not os.path.exists(\"../../guild/support/coconut_properties.tsv\"):\n",
+    "loaded_nps_properties = assign_properties(loaded_nps)\n",
+    "\n",
+    "# Replace empty strings with None\n",
+    "loaded_nps_properties[\"scaffold_smiles\"] = loaded_nps_properties[\n",
+    "    \"scaffold_smiles\"\n",
+    "].replace(\"\", None)\n",
+    "\n",
+    "loaded_nps_properties.to_csv(\n",
+    "    \"../../guild/support/coconut_properties.tsv\", sep=\"\\t\", index=None\n",
+    ")\n",
+    "#else:\n",
+    "#    loaded_nps_properties = pd.read_csv(\n",
+    "#        \"../../guild/support/coconut_properties.tsv\", sep=\"\\t\"\n",
+    "#    )\n",
+    "\n",
+    "loaded_nps_properties.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_nps_with_properties = pd.merge(\n",
+    "    loaded_nps,\n",
+    "    loaded_nps_properties,\n",
+    "    left_on=\"COCONUT_ID\",\n",
+    "    right_on=\"id\",\n",
+    "    how=\"inner\",\n",
+    ")\n",
+    "loaded_nps_with_properties.drop(columns=[\"id\"], inplace=True)\n",
+    "\n",
+    "#if not os.path.exists(f\"{publication_data_folder}/np_with_properties.tsv\"):\n",
+    "loaded_nps_with_properties.to_csv(\n",
+    "    f\"{publication_data_folder}/np_with_properties.tsv\", sep=\"\\t\", index=None\n",
+    ")\n",
+    "\n",
+    "len(loaded_nps_with_properties)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Subsetting to properties within drug-like space"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_nps_with_properties = loaded_nps_with_properties.dropna(\n",
+    "    subset=[\"molecular_weight\", \"scaffold_smiles\"], how=\"any\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(f\"{publication_data_folder}/np_filtered_with_properties.tsv\"):\n",
+    "    nps_filtered = compound_filter(\n",
+    "        loaded_nps_with_properties,\n",
+    "        min_MW=MIN_MW,\n",
+    "        max_MW=MAX_MW,\n",
+    "        scaffold_col=\"scaffold_smiles\",\n",
+    "        ro5_fulfilled_col=\"ro5_fulfilled\",\n",
+    "        largest_ring_size_col=\"max_ring_size\",\n",
+    "        max_per_scaffold=5,\n",
+    "        seed=42,\n",
+    "        MW_column=\"molecular_weight\",\n",
+    "        max_ring_size=MAX_RING_SIZE,\n",
+    "    )\n",
+    "\n",
+    "    # Add activity column\n",
+    "    nps_filtered[\"activity\"] = \"Natural Product\"\n",
+    "    nps_filtered.to_csv(\n",
+    "        f\"{publication_data_folder}/np_filtered_with_properties.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        index=False,\n",
+    "    )\n",
+    "\n",
+    "else:\n",
+    "    nps_filtered = pd.read_csv(\n",
+    "        f\"{publication_data_folder}/np_filtered_with_properties.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        header=0,\n",
+    "    )\n",
+    "\n",
+    "f\"Filtered the NPs from {len(loaded_nps)} to {len(nps_filtered)}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Preparing the synthetic dataset\n",
+    "\n",
+    "The dataset is from Enamine and cannot be released due to legal issues. Hence, IDs have been masked in the final output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_synthetics = PandasTools.LoadSDF(\n",
+    "    \"../../guild/support/Enamine_Hit_Locator_Library_plated_200000cmpds_20210816.sdf\"\n",
+    ")\n",
+    "loaded_synthetics[\"smiles\"] = loaded_synthetics.ROMol.progress_apply(MolToSmiles)\n",
+    "loaded_synthetics[\"can_smiles\"] = loaded_synthetics[\"smiles\"].progress_apply(\n",
+    "    canonicalize_smiles\n",
+    ")\n",
+    "\n",
+    "loaded_synthetics = loaded_synthetics[[\"Catalog ID\", \"can_smiles\"]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Adding chemical properties"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(\"../../guild/support/enamine_properties.tsv\"):\n",
+    "    loaded_synthetics_properties = assign_properties(loaded_synthetics)\n",
+    "\n",
+    "    # Replace empty strings with None\n",
+    "    loaded_synthetics_properties[\"scaffold_smiles\"] = loaded_synthetics_properties[\n",
+    "        \"scaffold_smiles\"\n",
+    "    ].replace(\"\", None)\n",
+    "\n",
+    "    loaded_synthetics_properties.to_csv(\n",
+    "        \"../../guild/support/enamine_properties.tsv\", sep=\"\\t\", index=None\n",
+    "    )\n",
+    "else:\n",
+    "    loaded_synthetics_properties = pd.read_csv(\n",
+    "        \"../../guild/support/enamine_properties.tsv\", sep=\"\\t\"\n",
+    "    )\n",
+    "\n",
+    "loaded_synthetics_properties.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_synthetics_with_properties = pd.merge(\n",
+    "    loaded_synthetics,\n",
+    "    loaded_synthetics_properties,\n",
+    "    left_on=\"Catalog ID\",\n",
+    "    right_on=\"id\",\n",
+    "    how=\"inner\",\n",
+    ")\n",
+    "loaded_synthetics_with_properties.drop(columns=[\"id\"], inplace=True)\n",
+    "\n",
+    "if not os.path.exists(f\"{publication_data_folder}/synthetics_with_properties.tsv\"):\n",
+    "    loaded_synthetics_with_properties.to_csv(\n",
+    "        f\"{publication_data_folder}/synthetics_with_properties.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        index=None,\n",
+    "    )\n",
+    "\n",
+    "len(loaded_synthetics_with_properties)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Subsetting to properties within drug-like space"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_synthetics_with_properties = loaded_synthetics_with_properties.dropna(\n",
+    "    subset=[\"molecular_weight\", \"scaffold_smiles\"], how=\"any\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists(\n",
+    "    f\"{publication_data_folder}/synthetics_filtered_with_properties.tsv\"\n",
+    "):\n",
+    "    synthetics_filtered = compound_filter(\n",
+    "        loaded_synthetics_with_properties,\n",
+    "        min_MW=MIN_MW,\n",
+    "        max_MW=MAX_MW,\n",
+    "        scaffold_col=\"scaffold_smiles\",\n",
+    "        ro5_fulfilled_col=\"ro5_fulfilled\",\n",
+    "        largest_ring_size_col=\"max_ring_size\",\n",
+    "        max_per_scaffold=5,\n",
+    "        seed=42,\n",
+    "        MW_column=\"molecular_weight\",\n",
+    "        max_ring_size=MAX_RING_SIZE,\n",
+    "    )\n",
+    "\n",
+    "    # Add activity column\n",
+    "    synthetics_filtered[\"activity\"] = \"Synthetic\"\n",
+    "    synthetics_filtered.to_csv(\n",
+    "        f\"{publication_data_folder}/synthetics_filtered_with_properties.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        index=False,\n",
+    "    )\n",
+    "\n",
+    "else:\n",
+    "    synthetics_filtered = pd.read_csv(\n",
+    "        f\"{publication_data_folder}/synthetics_filtered_with_properties.tsv\",\n",
+    "        sep=\"\\t\",\n",
+    "        header=0,\n",
+    "    )\n",
+    "\n",
+    "f\"Filtered the Synthetics from {len(loaded_synthetics)} to {len(synthetics_filtered)}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Exploratory analysis of the final sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filtered_df = pd.concat(\n",
+    "    [synthetics_filtered.reset_index(drop=True), nps_filtered.reset_index(drop=True)],\n",
+    "    axis=0,\n",
+    ")\n",
+    "filtered_df[\"ID\"] = filtered_df[\"Catalog ID\"].fillna(filtered_df[\"COCONUT_ID\"])\n",
+    "filtered_df.drop(columns=[\"Catalog ID\", \"COCONUT_ID\"], inplace=True)\n",
+    "filtered_df.head(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Checking the molecular weight distrbution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = px.histogram(\n",
+    "    filtered_df,\n",
+    "    x=\"molecular_weight\",\n",
+    "    nbins=100,\n",
+    "    color=\"activity\",\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Check the NP score distrbution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np_model = npscorer.readNPModel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filtered_df[\"npscore\"] = filtered_df[\"can_smiles\"].progress_apply(\n",
+    "    lambda x: npscorer.scoreMol(MolFromSmiles(x), np_model)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = px.histogram(filtered_df, x=\"npscore\", nbins=100, color=\"activity\")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To have non-overlapping NP space, a cut-off of 0.05 is used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nps_binned = filtered_df[\n",
+    "    (filtered_df[\"npscore\"] > 0.0) & (filtered_df[\"activity\"] == \"Natural Product\")\n",
+    "]\n",
+    "synthetics_binned = filtered_df[\n",
+    "    (filtered_df[\"npscore\"] < -0.5) & (filtered_df[\"activity\"] == \"Synthetic\")\n",
+    "]\n",
+    "\n",
+    "binned_df = pd.concat(\n",
+    "    [nps_binned.reset_index(drop=True), synthetics_binned.reset_index(drop=True)],\n",
+    "    axis=0,\n",
+    ")\n",
+    "len(binned_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = px.histogram(binned_df, x=\"npscore\", nbins=100, color=\"activity\")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Generating ECFP fingerprint and checking their chemical space"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_fingerprints(df, smiles_col):\n",
+    "    df[\"ROMol\"] = [\n",
+    "        MolFromSmiles(x)\n",
+    "        for x in tqdm(df[smiles_col], total=len(df), desc=\"Generating molecules\")\n",
+    "    ]\n",
+    "    radius = 3\n",
+    "    nBits = 1024\n",
+    "    ECFP6 = [\n",
+    "        list(AllChem.GetMorganFingerprintAsBitVect(x, radius=radius, nBits=nBits))\n",
+    "        for x in tqdm(df[\"ROMol\"], total=len(df), desc=\"Generating fingerprints\")\n",
+    "    ]\n",
+    "    return pd.DataFrame(ECFP6, index=df.index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fingerprints_df = get_fingerprints(binned_df, \"can_smiles\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pca = PCA(n_components=2)\n",
+    "pca.fit(fingerprints_df)\n",
+    "pca_result = pca.transform(fingerprints_df)\n",
+    "pca_result = pd.DataFrame(pca_result, columns=[\"PC1\", \"PC2\"])\n",
+    "pca_result[\"activity\"] = binned_df[\"activity\"].tolist()\n",
+    "\n",
+    "fig = px.scatter(\n",
+    "    data_frame=pca_result, x=\"PC1\", y=\"PC2\", color=\"activity\", opacity=0.25\n",
+    ")\n",
+    "fig.update_layout(\n",
+    "    title=\"PCA of ECFP fingerprints\",\n",
+    "    xaxis_title=\"PC1\",\n",
+    "    yaxis_title=\"PC2\",\n",
+    "    legend_title=\"Compounds type\",\n",
+    "    height=600,\n",
+    "    width=1000,\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generating the final dataset of 1000 samples each\n",
+    "\n",
+    "This dataset requires an equal molecular weight distrbution along with sampling about 1,000 compounds from each."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "matched_np, matched_synthetic = matched_np, matched_synthetic = (\n",
+    "    generate_equal_mw_distributions(\n",
+    "        df_1=nps_binned,\n",
+    "        df_2=synthetics_binned,\n",
+    "        mw_column=\"molecular_weight\",\n",
+    "        lipinski_column=\"ro5_fulfilled\",\n",
+    "        n_bins=10,\n",
+    "        target_size=1000,\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(matched_np), len(matched_synthetic)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sampled_ligands = pd.concat([matched_np, matched_synthetic], ignore_index=True)\n",
+    "col_order = [\"ID\"] + [i for i in sampled_ligands.columns.tolist() if i != \"ID\"]\n",
+    "sampled_ligands = sampled_ligands[col_order]\n",
+    "sampled_ligands.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sampled_ligands.to_csv(\n",
+    "    f\"{publication_data_folder}/ligands_sampled_1000.tsv\",\n",
+    "    sep=\"\\t\",\n",
+    "    index=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Inspecting property of final set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make three histograms, one for each activity, one for each logp, one for each fsp3, one for each topological_surface_area_mapping with subplots\n",
+    "columns = [\"logp\", \"fsp3\", \"topological_surface_area_mapping\", \"max_ring_size\"]\n",
+    "activities = sampled_ligands[\"activity\"].unique()\n",
+    "\n",
+    "fig = make_subplots(rows=4, cols=1, subplot_titles=columns)\n",
+    "\n",
+    "for i, col in enumerate(columns, start=1):\n",
+    "    for activity in activities:\n",
+    "        data = sampled_ligands[sampled_ligands[\"activity\"] == activity][col]\n",
+    "        fig.add_trace(\n",
+    "            go.Histogram(\n",
+    "                x=data,\n",
+    "                nbinsx=100,\n",
+    "                name=activity,\n",
+    "                legendgroup=activity,\n",
+    "                showlegend=True,\n",
+    "            ),\n",
+    "            row=i,\n",
+    "            col=1,\n",
+    "        )\n",
+    "\n",
+    "fig.update_layout(barmode=\"overlay\", height=800)\n",
+    "fig.update_traces(opacity=0.7)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subset_fingerprints_df = get_fingerprints(sampled_ligands, \"can_smiles\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pca = PCA(n_components=2)\n",
+    "pca.fit(fingerprints_df)\n",
+    "pca_result = pca.transform(fingerprints_df)\n",
+    "pca_result = pd.DataFrame(pca_result, columns=[\"PC1\", \"PC2\"])\n",
+    "pca_result[\"activity\"] = binned_df[\"activity\"].tolist()\n",
+    "\n",
+    "fig = px.scatter(\n",
+    "    data_frame=pca_result, x=\"PC1\", y=\"PC2\", color=\"activity\", opacity=0.25\n",
+    ")\n",
+    "fig.update_layout(\n",
+    "    title=\"PCA of ECFP fingerprints Sampled Ligands only\",\n",
+    "    xaxis_title=\"PC1\",\n",
+    "    yaxis_title=\"PC2\",\n",
+    "    legend_title=\"Compounds type\",\n",
+    "    height=600,\n",
+    "    width=1000,\n",
+    ")\n",
+    "fig.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "guild",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/case_study_data/protein_preparation.ipynb b/notebooks/case_study_data/protein_preparation.ipynb
new file mode 100644
index 0000000..81fe013
--- /dev/null
+++ b/notebooks/case_study_data/protein_preparation.ipynb
@@ -0,0 +1,255 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "from guild.tools.scrapping import download_pdb_files\n",
+    "\n",
+    "tqdm.pandas()\n",
+    "pd.set_option(\"mode.chained_assignment\", None)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Get all GPCR PDB files\n",
+    "These include downlading of PDB files in the notebook folder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "downloaded_pdbs_folder = \"../data/pdbs\"\n",
+    "publication_data_folder = \"../../guild/support/publication_data\"\n",
+    "\n",
+    "os.makedirs(downloaded_pdbs_folder, exist_ok=True)\n",
+    "os.makedirs(publication_data_folder, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "curated_data = pd.read_csv(\n",
+    "    \"../../guild/support/GPCRs_curated_dataset.tsv\",\n",
+    "    sep=\"\\t\",\n",
+    "    header=0,\n",
+    ")\n",
+    "usable_curated_data = curated_data[curated_data[\"Usable\"] == \"YES\"]\n",
+    "usable_curated_data[\"PDB_ID\"] = usable_curated_data[\"PDB_ID\"].str.lower()\n",
+    "usable_curated_pdbs = usable_curated_data[\"PDB_ID\"].tolist()\n",
+    "f\"Number of GPCR PDB files to download: {len(usable_curated_pdbs)}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "failed_pdbs = download_pdb_files(\n",
+    "    usable_curated_pdbs, download_dir=downloaded_pdbs_folder\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Subset to successful PDBs only"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "downloaded_curated_data = usable_curated_data[\n",
+    "    ~usable_curated_data[\"PDB_ID\"].isin(failed_pdbs)\n",
+    "]\n",
+    "f\"Number of GPCR PDB files successfully downloaded: {len(downloaded_curated_data['PDB_ID'].tolist())}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Add UniProt mappings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpcrdb_mapping = pd.read_csv(\n",
+    "    \"../../guild/support/uniprot_gpcrdb_mapping.txt\",\n",
+    "    sep=\" \",\n",
+    ")\n",
+    "final_data = pd.merge(\n",
+    "    gpcrdb_mapping,\n",
+    "    usable_curated_data,\n",
+    "    left_on=\"pdb_id\",\n",
+    "    right_on=\"PDB_ID\",\n",
+    "    how=\"inner\",\n",
+    ")\n",
+    "final_data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_data.to_csv(\n",
+    "    f\"{publication_data_folder}/gpcrdb_protein_data.csv\",\n",
+    "    index=False,\n",
+    ")\n",
+    "len(final_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Subsetting GPCR to limited proteins\n",
+    "\n",
+    "Since each GPCR faimly has multiple proteins, the idea here is to sample \"n\" proteins from each family to be the representative set for publication purposes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "shuffled_data = final_data.sample(frac=1).reset_index(drop=True)\n",
+    "len(shuffled_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Removing families with less than 3 proteins\n",
+    "familes_removed = []\n",
+    "\n",
+    "gpcr_grouped_counts = shuffled_data.groupby(\"gpcrdb_id\").size()\n",
+    "for gpcr_id, gpcr_count in gpcr_grouped_counts.items():\n",
+    "    if gpcr_count < 3:\n",
+    "        familes_removed.append(gpcr_id)\n",
+    "\n",
+    "cleaned_final_data = shuffled_data[~shuffled_data[\"gpcrdb_id\"].isin(familes_removed)]\n",
+    "len(cleaned_final_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpcr_grouped = cleaned_final_data.groupby(\"gpcrdb_id\")\n",
+    "len(gpcr_grouped)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set of 3 proteins per family"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpcr_df_sample_three = gpcr_grouped.head(3).reset_index(drop=True)\n",
+    "len(gpcr_df_sample_three)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpcr_df_sample_three.to_csv(\n",
+    "    f\"{publication_data_folder}/gpcrdb_protein_data_three.csv\",\n",
+    "    index=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set of 1 proteins per family"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpcr_df_sample_one = gpcr_grouped.head(1).reset_index(drop=True)\n",
+    "len(gpcr_df_sample_one)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpcr_df_sample_one.to_csv(\n",
+    "    f\"{publication_data_folder}/gpcrdb_protein_data_one.csv\",\n",
+    "    index=False,\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "guild",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}