From 89be4278b20c40112d63f7391e425fe3fa701d4e Mon Sep 17 00:00:00 2001 From: AKPreto Date: Mon, 4 May 2026 13:51:24 +0000 Subject: [PATCH 1/3] Reviewed data prep notebooks --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index aa0d6b7..254c339 100644 --- a/.gitignore +++ b/.gitignore @@ -56,7 +56,7 @@ DATABASE_SCHEMA.md /.databrickscfg /.data /.local -/notebooks/data_prep/ +/notebooks/ # Chache /.pytest_cache From b4c128543d9734b2ba445fce825e87c4103c1071 Mon Sep 17 00:00:00 2001 From: AKPreto Date: Mon, 4 May 2026 13:55:02 +0000 Subject: [PATCH 2/3] unignoring debug --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 254c339..6ba0fe4 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,6 @@ DATABASE_SCHEMA.md *.svg *.log /notebooks/*.py -/notebooks/debug/*.py # File exceptions !files/autodock_vina_box.txt From 45f2b2ba3597c0f7476226b479c1f4e6c4a3da4d Mon Sep 17 00:00:00 2001 From: AKPreto Date: Mon, 4 May 2026 13:57:15 +0000 Subject: [PATCH 3/3] unignoring debug --- .gitignore | 1 - notebooks/case_study_data/build_decoy.ipynb | 382 ++++++++++ .../case_study_data/ligand_preparation.ipynb | 699 ++++++++++++++++++ .../case_study_data/protein_preparation.ipynb | 255 +++++++ 4 files changed, 1336 insertions(+), 1 deletion(-) create mode 100644 notebooks/case_study_data/build_decoy.ipynb create mode 100644 notebooks/case_study_data/ligand_preparation.ipynb create mode 100644 notebooks/case_study_data/protein_preparation.ipynb diff --git a/.gitignore b/.gitignore index 6ba0fe4..eed5aa6 100644 --- a/.gitignore +++ b/.gitignore @@ -55,7 +55,6 @@ DATABASE_SCHEMA.md /.databrickscfg /.data /.local -/notebooks/ # Chache /.pytest_cache diff --git a/notebooks/case_study_data/build_decoy.ipynb b/notebooks/case_study_data/build_decoy.ipynb new file mode 100644 index 0000000..57cc643 --- /dev/null +++ b/notebooks/case_study_data/build_decoy.ipynb @@ -0,0 +1,382 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "import os\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "\n", + "from guild.tools.ligand_properties import (\n", + " assign_properties,\n", + " compound_filter,\n", + " generate_equal_mw_distributions,\n", + ")\n", + "from guild.transformers.chembl import get_assays_locally, get_decoys_locally" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get protein info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "support_dir = \"../../guild/support\"\n", + "\n", + "os.makedirs(f\"{support_dir}/binders\", exist_ok=True)\n", + "os.makedirs(f\"{support_dir}/decoys\", exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpcrdb_mapping = pd.read_csv(f\"{support_dir}/uniprot_gpcrdb_mapping.txt\", sep=\" \")\n", + "gpcrdb_mapping.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "uniprot_gene_list = set(gpcrdb_mapping[\"uniprot_id\"].tolist())\n", + "len(uniprot_gene_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Update binders to ChEMBL 36" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assay_df = get_assays_locally(uniprot_gene_list, min_mol_wt=250, max_mol_wt=450)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter out compounds with pchembl value <= 0\n", + "assay_df = assay_df[assay_df[\"pchembl_value\"] > 0]\n", + "\n", + "# Assign activity based on pchembl value 6\n", + "assay_df[\"activity\"] = assay_df[\"pchembl_value\"].apply(\n", + " lambda x: \"weak-binder\" if x < 6 else \"strong-binder\"\n", + ")\n", + "chembl_version = assay_df[\"chembl_version\"].values[0]\n", + "chembl_version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BINDER_FILE = f\"{support_dir}/binders/chembl_binders_{chembl_version}.tsv\"\n", + "\n", + "if not os.path.exists(BINDER_FILE):\n", + " assay_df_filtered = assay_df[\n", + " [\n", + " \"protein_symbol\",\n", + " \"activity\",\n", + " \"chembl_id\",\n", + " \"smiles\",\n", + " \"pchembl_value\",\n", + " ]\n", + " ]\n", + " assay_df_filtered[\"chembl_id\"] = \"CHEMBL\" + assay_df_filtered[\"chembl_id\"].astype(\n", + " str\n", + " )\n", + " assay_df_filtered.rename(columns={\"protein_symbol\": \"uniprot_id\"}, inplace=True)\n", + "\n", + " assay_df_filtered = pd.merge(\n", + " assay_df_filtered, gpcrdb_mapping, how=\"outer\", on=\"uniprot_id\"\n", + " )\n", + "\n", + " assay_df_filtered.to_csv(\n", + " f\"{support_dir}/binders/chembl_binders_{chembl_version}.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + " )\n", + "else:\n", + " assay_df_filtered = pd.read_csv(BINDER_FILE, sep=\"\\t\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Update decoys to ChEMBL 36" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MIN_MW = 250\n", + "MAX_MW = 450\n", + "MAX_RING_SIZE = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "decoy_df = get_decoys_locally(\n", + " min_mol_wt=MIN_MW, max_mol_wt=MAX_MW, version=\"chembl_36\"\n", + ") # without the version flag, the latest chembl data is pulled\n", + "decoy_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add molecular properties to the df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(f\"{support_dir}/decoys/{chembl_version}_with_properties.tsv\"):\n", + " decoy_df_with_properties = assign_properties(\n", + " decoy_df[[\"chembl_id\", \"canonical_smiles\"]]\n", + " )\n", + " decoy_df_with_properties[\"scaffold_smiles\"] = decoy_df_with_properties[\n", + " \"scaffold_smiles\"\n", + " ].replace(\"\", None)\n", + "\n", + " decoy_df_with_properties.to_csv(\n", + " f\"{support_dir}/decoys/{chembl_version}_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + " )\n", + "else:\n", + " decoy_df_with_properties = pd.read_csv(\n", + " f\"{support_dir}/decoys/{chembl_version}_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " low_memory=False,\n", + " )\n", + "\n", + "decoy_df_with_properties.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Subset to match the same properties as other ligands" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_decoy_df = pd.merge(\n", + " decoy_df,\n", + " decoy_df_with_properties,\n", + " left_on=\"chembl_id\",\n", + " right_on=\"id\",\n", + " how=\"inner\",\n", + ")\n", + "all_decoy_df.drop(columns=[\"id\"], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_decoy_df.to_csv(\n", + " f\"{support_dir}/decoys/{chembl_version}_decoys_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_decoy_df[\"molecular_weight\"].plot.hist(bins=100)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subset the df based on scaffolds and MW in the distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if os.path.exists(f\"{support_dir}/decoys/{chembl_version}_decoys_filtered.tsv\"):\n", + " decoys_filtered = pd.read_csv(\n", + " f\"{support_dir}/decoys/{chembl_version}_decoys_filtered.tsv\", sep=\"\\t\"\n", + " )\n", + "else:\n", + " decoys_filtered = compound_filter(\n", + " all_decoy_df,\n", + " min_MW=MIN_MW,\n", + " max_MW=MAX_MW,\n", + " scaffold_col=\"scaffold_smiles\",\n", + " ro5_fulfilled_col=\"ro5_fulfilled\",\n", + " largest_ring_size_col=\"max_ring_size\",\n", + " max_per_scaffold=5,\n", + " seed=42,\n", + " MW_column=\"molecular_weight\",\n", + " max_ring_size=MAX_RING_SIZE,\n", + " )\n", + "\n", + " decoys_filtered.to_csv(\n", + " f\"{support_dir}/decoys/{chembl_version}_decoys_filtered.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate decoys sublists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Decoys with 1000 samples\n", + "decoys_sample_1000 = generate_equal_mw_distributions(\n", + " df_1=decoys_filtered,\n", + " mw_column=\"molecular_weight\",\n", + " lipinski_column=\"ro5_fulfilled\",\n", + " n_bins=100,\n", + " target_size=1000,\n", + ")\n", + "\n", + "decoys_sample_1000.to_csv(\n", + " f\"{support_dir}/decoys/{chembl_version}_decoys_1000.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "decoys_sample_1000[\"molecular_weight\"].plot.hist(bins=100)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate 100 decoys list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "decoys_sample_100 = generate_equal_mw_distributions(\n", + " df_1=decoys_filtered,\n", + " mw_column=\"molecular_weight\",\n", + " lipinski_column=\"ro5_fulfilled\",\n", + " n_bins=20,\n", + " target_size=100,\n", + ")\n", + "\n", + "decoys_sample_100.to_csv(\n", + " f\"{support_dir}/decoys/{chembl_version}_decoys_100.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "decoys_sample_100[\"molecular_weight\"].plot.hist(bins=20)\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/case_study_data/ligand_preparation.ipynb b/notebooks/case_study_data/ligand_preparation.ipynb new file mode 100644 index 0000000..79175e3 --- /dev/null +++ b/notebooks/case_study_data/ligand_preparation.ipynb @@ -0,0 +1,699 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "import plotly.graph_objects as go\n", + "from plotly.subplots import make_subplots\n", + "from rdkit import RDLogger\n", + "from rdkit.Chem import AllChem, CanonSmiles, MolFromSmiles, MolToSmiles, PandasTools\n", + "from rdkit.Contrib.NP_Score import npscorer\n", + "from sklearn.decomposition import PCA\n", + "from tqdm import tqdm\n", + "\n", + "from guild.tools.ligand_properties import (\n", + " assign_properties,\n", + " compound_filter,\n", + " generate_equal_mw_distributions,\n", + ")\n", + "\n", + "RDLogger.DisableLog(\"rdApp.info\")\n", + "lg = RDLogger.logger()\n", + "lg.setLevel(RDLogger.CRITICAL)\n", + "tqdm.pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "publication_data_folder = \"../../guild/support/publication_data\"\n", + "os.makedirs(publication_data_folder, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MIN_MW = 250\n", + "MAX_MW = 450\n", + "MAX_RING_SIZE = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def canonicalize_smiles(smiles):\n", + " try:\n", + " return CanonSmiles(smiles)\n", + " except Exception:\n", + " return smiles" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preparing the NP dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_nps = pd.read_csv(\n", + " \"../../guild/support/coconut_csv-02-2026.csv\", sep=\",\", header=0\n", + ")[[\"identifier\",\"canonical_smiles\"]]\n", + "loaded_nps.columns = [\"COCONUT_ID\",\"SMILES\"]\n", + "loaded_nps = loaded_nps.drop_duplicates(subset=\"SMILES\").reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_nps[[\"simplified_id\",\"0_scaffold\"]] = loaded_nps[\"COCONUT_ID\"].str.split(\".\", expand=True)\n", + "loaded_nps = loaded_nps.loc[loaded_nps[\"0_scaffold\"] == \"0\"].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_nps[\"can_smiles\"] = loaded_nps[\"SMILES\"].progress_apply(canonicalize_smiles)\n", + "\n", + "loaded_nps = loaded_nps[[\"COCONUT_ID\", \"can_smiles\"]]\n", + "loaded_nps = loaded_nps.reset_index(drop=True)\n", + "loaded_nps.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding chemical properties" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#if not os.path.exists(\"../../guild/support/coconut_properties.tsv\"):\n", + "loaded_nps_properties = assign_properties(loaded_nps)\n", + "\n", + "# Replace empty strings with None\n", + "loaded_nps_properties[\"scaffold_smiles\"] = loaded_nps_properties[\n", + " \"scaffold_smiles\"\n", + "].replace(\"\", None)\n", + "\n", + "loaded_nps_properties.to_csv(\n", + " \"../../guild/support/coconut_properties.tsv\", sep=\"\\t\", index=None\n", + ")\n", + "#else:\n", + "# loaded_nps_properties = pd.read_csv(\n", + "# \"../../guild/support/coconut_properties.tsv\", sep=\"\\t\"\n", + "# )\n", + "\n", + "loaded_nps_properties.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_nps_with_properties = pd.merge(\n", + " loaded_nps,\n", + " loaded_nps_properties,\n", + " left_on=\"COCONUT_ID\",\n", + " right_on=\"id\",\n", + " how=\"inner\",\n", + ")\n", + "loaded_nps_with_properties.drop(columns=[\"id\"], inplace=True)\n", + "\n", + "#if not os.path.exists(f\"{publication_data_folder}/np_with_properties.tsv\"):\n", + "loaded_nps_with_properties.to_csv(\n", + " f\"{publication_data_folder}/np_with_properties.tsv\", sep=\"\\t\", index=None\n", + ")\n", + "\n", + "len(loaded_nps_with_properties)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subsetting to properties within drug-like space" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_nps_with_properties = loaded_nps_with_properties.dropna(\n", + " subset=[\"molecular_weight\", \"scaffold_smiles\"], how=\"any\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(f\"{publication_data_folder}/np_filtered_with_properties.tsv\"):\n", + " nps_filtered = compound_filter(\n", + " loaded_nps_with_properties,\n", + " min_MW=MIN_MW,\n", + " max_MW=MAX_MW,\n", + " scaffold_col=\"scaffold_smiles\",\n", + " ro5_fulfilled_col=\"ro5_fulfilled\",\n", + " largest_ring_size_col=\"max_ring_size\",\n", + " max_per_scaffold=5,\n", + " seed=42,\n", + " MW_column=\"molecular_weight\",\n", + " max_ring_size=MAX_RING_SIZE,\n", + " )\n", + "\n", + " # Add activity column\n", + " nps_filtered[\"activity\"] = \"Natural Product\"\n", + " nps_filtered.to_csv(\n", + " f\"{publication_data_folder}/np_filtered_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + " )\n", + "\n", + "else:\n", + " nps_filtered = pd.read_csv(\n", + " f\"{publication_data_folder}/np_filtered_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " header=0,\n", + " )\n", + "\n", + "f\"Filtered the NPs from {len(loaded_nps)} to {len(nps_filtered)}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preparing the synthetic dataset\n", + "\n", + "The dataset is from Enamine and cannot be released due to legal issues. Hence, IDs have been masked in the final output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_synthetics = PandasTools.LoadSDF(\n", + " \"../../guild/support/Enamine_Hit_Locator_Library_plated_200000cmpds_20210816.sdf\"\n", + ")\n", + "loaded_synthetics[\"smiles\"] = loaded_synthetics.ROMol.progress_apply(MolToSmiles)\n", + "loaded_synthetics[\"can_smiles\"] = loaded_synthetics[\"smiles\"].progress_apply(\n", + " canonicalize_smiles\n", + ")\n", + "\n", + "loaded_synthetics = loaded_synthetics[[\"Catalog ID\", \"can_smiles\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding chemical properties" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\"../../guild/support/enamine_properties.tsv\"):\n", + " loaded_synthetics_properties = assign_properties(loaded_synthetics)\n", + "\n", + " # Replace empty strings with None\n", + " loaded_synthetics_properties[\"scaffold_smiles\"] = loaded_synthetics_properties[\n", + " \"scaffold_smiles\"\n", + " ].replace(\"\", None)\n", + "\n", + " loaded_synthetics_properties.to_csv(\n", + " \"../../guild/support/enamine_properties.tsv\", sep=\"\\t\", index=None\n", + " )\n", + "else:\n", + " loaded_synthetics_properties = pd.read_csv(\n", + " \"../../guild/support/enamine_properties.tsv\", sep=\"\\t\"\n", + " )\n", + "\n", + "loaded_synthetics_properties.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_synthetics_with_properties = pd.merge(\n", + " loaded_synthetics,\n", + " loaded_synthetics_properties,\n", + " left_on=\"Catalog ID\",\n", + " right_on=\"id\",\n", + " how=\"inner\",\n", + ")\n", + "loaded_synthetics_with_properties.drop(columns=[\"id\"], inplace=True)\n", + "\n", + "if not os.path.exists(f\"{publication_data_folder}/synthetics_with_properties.tsv\"):\n", + " loaded_synthetics_with_properties.to_csv(\n", + " f\"{publication_data_folder}/synthetics_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " index=None,\n", + " )\n", + "\n", + "len(loaded_synthetics_with_properties)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subsetting to properties within drug-like space" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loaded_synthetics_with_properties = loaded_synthetics_with_properties.dropna(\n", + " subset=[\"molecular_weight\", \"scaffold_smiles\"], how=\"any\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(\n", + " f\"{publication_data_folder}/synthetics_filtered_with_properties.tsv\"\n", + "):\n", + " synthetics_filtered = compound_filter(\n", + " loaded_synthetics_with_properties,\n", + " min_MW=MIN_MW,\n", + " max_MW=MAX_MW,\n", + " scaffold_col=\"scaffold_smiles\",\n", + " ro5_fulfilled_col=\"ro5_fulfilled\",\n", + " largest_ring_size_col=\"max_ring_size\",\n", + " max_per_scaffold=5,\n", + " seed=42,\n", + " MW_column=\"molecular_weight\",\n", + " max_ring_size=MAX_RING_SIZE,\n", + " )\n", + "\n", + " # Add activity column\n", + " synthetics_filtered[\"activity\"] = \"Synthetic\"\n", + " synthetics_filtered.to_csv(\n", + " f\"{publication_data_folder}/synthetics_filtered_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + " )\n", + "\n", + "else:\n", + " synthetics_filtered = pd.read_csv(\n", + " f\"{publication_data_folder}/synthetics_filtered_with_properties.tsv\",\n", + " sep=\"\\t\",\n", + " header=0,\n", + " )\n", + "\n", + "f\"Filtered the Synthetics from {len(loaded_synthetics)} to {len(synthetics_filtered)}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploratory analysis of the final sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df = pd.concat(\n", + " [synthetics_filtered.reset_index(drop=True), nps_filtered.reset_index(drop=True)],\n", + " axis=0,\n", + ")\n", + "filtered_df[\"ID\"] = filtered_df[\"Catalog ID\"].fillna(filtered_df[\"COCONUT_ID\"])\n", + "filtered_df.drop(columns=[\"Catalog ID\", \"COCONUT_ID\"], inplace=True)\n", + "filtered_df.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Checking the molecular weight distrbution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.histogram(\n", + " filtered_df,\n", + " x=\"molecular_weight\",\n", + " nbins=100,\n", + " color=\"activity\",\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check the NP score distrbution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np_model = npscorer.readNPModel()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df[\"npscore\"] = filtered_df[\"can_smiles\"].progress_apply(\n", + " lambda x: npscorer.scoreMol(MolFromSmiles(x), np_model)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.histogram(filtered_df, x=\"npscore\", nbins=100, color=\"activity\")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To have non-overlapping NP space, a cut-off of 0.05 is used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nps_binned = filtered_df[\n", + " (filtered_df[\"npscore\"] > 0.0) & (filtered_df[\"activity\"] == \"Natural Product\")\n", + "]\n", + "synthetics_binned = filtered_df[\n", + " (filtered_df[\"npscore\"] < -0.5) & (filtered_df[\"activity\"] == \"Synthetic\")\n", + "]\n", + "\n", + "binned_df = pd.concat(\n", + " [nps_binned.reset_index(drop=True), synthetics_binned.reset_index(drop=True)],\n", + " axis=0,\n", + ")\n", + "len(binned_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.histogram(binned_df, x=\"npscore\", nbins=100, color=\"activity\")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generating ECFP fingerprint and checking their chemical space" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fingerprints(df, smiles_col):\n", + " df[\"ROMol\"] = [\n", + " MolFromSmiles(x)\n", + " for x in tqdm(df[smiles_col], total=len(df), desc=\"Generating molecules\")\n", + " ]\n", + " radius = 3\n", + " nBits = 1024\n", + " ECFP6 = [\n", + " list(AllChem.GetMorganFingerprintAsBitVect(x, radius=radius, nBits=nBits))\n", + " for x in tqdm(df[\"ROMol\"], total=len(df), desc=\"Generating fingerprints\")\n", + " ]\n", + " return pd.DataFrame(ECFP6, index=df.index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fingerprints_df = get_fingerprints(binned_df, \"can_smiles\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca = PCA(n_components=2)\n", + "pca.fit(fingerprints_df)\n", + "pca_result = pca.transform(fingerprints_df)\n", + "pca_result = pd.DataFrame(pca_result, columns=[\"PC1\", \"PC2\"])\n", + "pca_result[\"activity\"] = binned_df[\"activity\"].tolist()\n", + "\n", + "fig = px.scatter(\n", + " data_frame=pca_result, x=\"PC1\", y=\"PC2\", color=\"activity\", opacity=0.25\n", + ")\n", + "fig.update_layout(\n", + " title=\"PCA of ECFP fingerprints\",\n", + " xaxis_title=\"PC1\",\n", + " yaxis_title=\"PC2\",\n", + " legend_title=\"Compounds type\",\n", + " height=600,\n", + " width=1000,\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generating the final dataset of 1000 samples each\n", + "\n", + "This dataset requires an equal molecular weight distrbution along with sampling about 1,000 compounds from each." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "matched_np, matched_synthetic = matched_np, matched_synthetic = (\n", + " generate_equal_mw_distributions(\n", + " df_1=nps_binned,\n", + " df_2=synthetics_binned,\n", + " mw_column=\"molecular_weight\",\n", + " lipinski_column=\"ro5_fulfilled\",\n", + " n_bins=10,\n", + " target_size=1000,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(matched_np), len(matched_synthetic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sampled_ligands = pd.concat([matched_np, matched_synthetic], ignore_index=True)\n", + "col_order = [\"ID\"] + [i for i in sampled_ligands.columns.tolist() if i != \"ID\"]\n", + "sampled_ligands = sampled_ligands[col_order]\n", + "sampled_ligands.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sampled_ligands.to_csv(\n", + " f\"{publication_data_folder}/ligands_sampled_1000.tsv\",\n", + " sep=\"\\t\",\n", + " index=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspecting property of final set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make three histograms, one for each activity, one for each logp, one for each fsp3, one for each topological_surface_area_mapping with subplots\n", + "columns = [\"logp\", \"fsp3\", \"topological_surface_area_mapping\", \"max_ring_size\"]\n", + "activities = sampled_ligands[\"activity\"].unique()\n", + "\n", + "fig = make_subplots(rows=4, cols=1, subplot_titles=columns)\n", + "\n", + "for i, col in enumerate(columns, start=1):\n", + " for activity in activities:\n", + " data = sampled_ligands[sampled_ligands[\"activity\"] == activity][col]\n", + " fig.add_trace(\n", + " go.Histogram(\n", + " x=data,\n", + " nbinsx=100,\n", + " name=activity,\n", + " legendgroup=activity,\n", + " showlegend=True,\n", + " ),\n", + " row=i,\n", + " col=1,\n", + " )\n", + "\n", + "fig.update_layout(barmode=\"overlay\", height=800)\n", + "fig.update_traces(opacity=0.7)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subset_fingerprints_df = get_fingerprints(sampled_ligands, \"can_smiles\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca = PCA(n_components=2)\n", + "pca.fit(fingerprints_df)\n", + "pca_result = pca.transform(fingerprints_df)\n", + "pca_result = pd.DataFrame(pca_result, columns=[\"PC1\", \"PC2\"])\n", + "pca_result[\"activity\"] = binned_df[\"activity\"].tolist()\n", + "\n", + "fig = px.scatter(\n", + " data_frame=pca_result, x=\"PC1\", y=\"PC2\", color=\"activity\", opacity=0.25\n", + ")\n", + "fig.update_layout(\n", + " title=\"PCA of ECFP fingerprints Sampled Ligands only\",\n", + " xaxis_title=\"PC1\",\n", + " yaxis_title=\"PC2\",\n", + " legend_title=\"Compounds type\",\n", + " height=600,\n", + " width=1000,\n", + ")\n", + "fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "guild", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/case_study_data/protein_preparation.ipynb b/notebooks/case_study_data/protein_preparation.ipynb new file mode 100644 index 0000000..81fe013 --- /dev/null +++ b/notebooks/case_study_data/protein_preparation.ipynb @@ -0,0 +1,255 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "\n", + "from guild.tools.scrapping import download_pdb_files\n", + "\n", + "tqdm.pandas()\n", + "pd.set_option(\"mode.chained_assignment\", None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get all GPCR PDB files\n", + "These include downlading of PDB files in the notebook folder." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "downloaded_pdbs_folder = \"../data/pdbs\"\n", + "publication_data_folder = \"../../guild/support/publication_data\"\n", + "\n", + "os.makedirs(downloaded_pdbs_folder, exist_ok=True)\n", + "os.makedirs(publication_data_folder, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "curated_data = pd.read_csv(\n", + " \"../../guild/support/GPCRs_curated_dataset.tsv\",\n", + " sep=\"\\t\",\n", + " header=0,\n", + ")\n", + "usable_curated_data = curated_data[curated_data[\"Usable\"] == \"YES\"]\n", + "usable_curated_data[\"PDB_ID\"] = usable_curated_data[\"PDB_ID\"].str.lower()\n", + "usable_curated_pdbs = usable_curated_data[\"PDB_ID\"].tolist()\n", + "f\"Number of GPCR PDB files to download: {len(usable_curated_pdbs)}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "failed_pdbs = download_pdb_files(\n", + " usable_curated_pdbs, download_dir=downloaded_pdbs_folder\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Subset to successful PDBs only" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "downloaded_curated_data = usable_curated_data[\n", + " ~usable_curated_data[\"PDB_ID\"].isin(failed_pdbs)\n", + "]\n", + "f\"Number of GPCR PDB files successfully downloaded: {len(downloaded_curated_data['PDB_ID'].tolist())}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add UniProt mappings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpcrdb_mapping = pd.read_csv(\n", + " \"../../guild/support/uniprot_gpcrdb_mapping.txt\",\n", + " sep=\" \",\n", + ")\n", + "final_data = pd.merge(\n", + " gpcrdb_mapping,\n", + " usable_curated_data,\n", + " left_on=\"pdb_id\",\n", + " right_on=\"PDB_ID\",\n", + " how=\"inner\",\n", + ")\n", + "final_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_data.to_csv(\n", + " f\"{publication_data_folder}/gpcrdb_protein_data.csv\",\n", + " index=False,\n", + ")\n", + "len(final_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Subsetting GPCR to limited proteins\n", + "\n", + "Since each GPCR faimly has multiple proteins, the idea here is to sample \"n\" proteins from each family to be the representative set for publication purposes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shuffled_data = final_data.sample(frac=1).reset_index(drop=True)\n", + "len(shuffled_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Removing families with less than 3 proteins\n", + "familes_removed = []\n", + "\n", + "gpcr_grouped_counts = shuffled_data.groupby(\"gpcrdb_id\").size()\n", + "for gpcr_id, gpcr_count in gpcr_grouped_counts.items():\n", + " if gpcr_count < 3:\n", + " familes_removed.append(gpcr_id)\n", + "\n", + "cleaned_final_data = shuffled_data[~shuffled_data[\"gpcrdb_id\"].isin(familes_removed)]\n", + "len(cleaned_final_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpcr_grouped = cleaned_final_data.groupby(\"gpcrdb_id\")\n", + "len(gpcr_grouped)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set of 3 proteins per family" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpcr_df_sample_three = gpcr_grouped.head(3).reset_index(drop=True)\n", + "len(gpcr_df_sample_three)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpcr_df_sample_three.to_csv(\n", + " f\"{publication_data_folder}/gpcrdb_protein_data_three.csv\",\n", + " index=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set of 1 proteins per family" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpcr_df_sample_one = gpcr_grouped.head(1).reset_index(drop=True)\n", + "len(gpcr_df_sample_one)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpcr_df_sample_one.to_csv(\n", + " f\"{publication_data_folder}/gpcrdb_protein_data_one.csv\",\n", + " index=False,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "guild", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}