From 26a551aa7a5e4cbcd26f6ac21aa3c60cc72be89d Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Thu, 18 Dec 2025 11:30:19 -0700 Subject: [PATCH 01/12] updated cpjump1 plates selection to only compound treated plates --- .pre-commit-config.yaml | 4 +- .../0.download-data/1.download-data.ipynb | 227 +++++++----------- .../nbconverted/1.download-data.py | 55 ++--- 3 files changed, 119 insertions(+), 167 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index af4b467..0d3300f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: # Python syntax upgrades (should run before linting/formatting) - repo: https://github.com/asottile/pyupgrade - rev: v3.21.1 + rev: v3.21.2 hooks: - id: pyupgrade args: ["--py311-plus"] @@ -38,7 +38,7 @@ repos: # Ruff for linting and formatting Python files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.4 + rev: v0.14.9 hooks: - id: ruff-check args: ["--fix"] diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index de00dba..c1294e1 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -9,7 +9,7 @@ "\n", "This notebook focuses on downloading metadata and single-cell profiles from three key datasets:\n", "\n", - "1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with CRISPR perturbations for downstream analysis.\n", + "1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis.\n", "2. **MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis.\n", "3. **CFReT Dataset**: Normalized and feature-selected single-cell profiles from the CFReT plate are downloaded for downstream analysis." ] @@ -22,7 +22,6 @@ "outputs": [], "source": [ "import sys\n", - "import pprint\n", "import pathlib\n", "import gzip\n", "import zipfile\n", @@ -188,7 +187,7 @@ "source": [ "# setting perturbation type\n", "# other options are \"compound\", \"orf\",\n", - "pert_type = \"crispr\"" + "pert_type = \"compound\"" ] }, { @@ -201,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "b7381913", "metadata": {}, "outputs": [], @@ -236,15 +235,39 @@ "source": [ "## Downloading CPJUMP1 Metadata\n", "\n", - "In this section, we download and process the CPJUMP1 experimental metadata. This metadata contains information about assay plates, batches, and perturbation types, which is essential for organizing and analyzing single-cell profiles. Only plates treated with CRISPR perturbations are selected for downstream analysis." + "In this section, we download the [experimental metadata](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/benchmark/output/experiment-metadata.tsv) for the CPJUMP1 dataset. This metadata provides detailed information about each experimental batch, including plate barcodes, cell lines, perturbation types, and incubation times. 
Access to this metadata is essential for selecting and organizing the relevant subset of CPJUMP1 data for downstream analysis.\n", + "\n", + "For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata)." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "5b8bfe5f", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "plates that will be downloaded are: shape: (12,)\n", + "Series: 'Assay_Plate_Barcode' [str]\n", + "[\n", + "\t\"BR00117011\"\n", + "\t\"BR00117012\"\n", + "\t\"BR00117013\"\n", + "\t\"BR00117015\"\n", + "\t\"BR00117019\"\n", + "\t…\n", + "\t\"BR00117016\"\n", + "\t\"BR00117055\"\n", + "\t\"BR00117009\"\n", + "\t\"BR00117017\"\n", + "\t\"BR00117054\"\n", + "]\n", + "shape: (12, 13)\n" + ] + }, { "data": { "text/html": [ @@ -255,10 +278,10 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (22, 13)
Batch | Plate_Map_Name | Assay_Plate_Barcode | Perturbation | Cell_type | Time | Density | Antibiotics | Cell_line | Time_delay | Times_imaged | Anomaly | Number_of_images
str | str | str | str | str | i64 | i64 | str | str | str | i64 | str | i64
"2020_11_04_CPJUMP1" | "JUMP-Target-1_crispr_platemap" | "BR00116996" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_crispr_platemap" | "BR00116997" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_crispr_platemap" | "BR00116998" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_crispr_platemap" | "BR00116999" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_crispr_platemap" | "BR00117000" | "crispr" | "A549" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "none" | 27640
… | … | … | … | … | … | … | … | … | … | … | … | …
"2020_11_04_CPJUMP1" | "JUMP-Target-1_crispr_platemap" | "BR00118048" | "crispr" | "U2OS" | 96 | 100 | "absent" | "Cas9" | "Day0" | 1 | "Phalloidin" | 27648
"2020_11_04_CPJUMP1_DL" | "JUMP-Target-1_crispr_platemap" | "BR00116996" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
"2020_11_04_CPJUMP1_DL" | "JUMP-Target-1_crispr_platemap" | "BR00116997" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
"2020_11_04_CPJUMP1_DL" | "JUMP-Target-1_crispr_platemap" | "BR00116998" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
"2020_11_04_CPJUMP1_DL" | "JUMP-Target-1_crispr_platemap" | "BR00116999" | "crispr" | "U2OS" | 144 | 100 | "absent" | "Cas9" | "Day0" | 1 | "WGA" | 27648
" + "shape: (12, 13)
Batch | Plate_Map_Name | Assay_Plate_Barcode | Perturbation | Cell_type | Time | Density | Antibiotics | Cell_line | Time_delay | Times_imaged | Anomaly | Number_of_images
str | str | str | str | str | i64 | i64 | str | str | str | i64 | str | i64
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117008" | "compound" | "A549" | 48 | 80 | "absent" | "Parental" | "Day0" | 1 | "none" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117009" | "compound" | "A549" | 48 | 80 | "absent" | "Parental" | "Day0" | 1 | "none" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117010" | "compound" | "U2OS" | 48 | 100 | "absent" | "Parental" | "Day0" | 1 | "Mitotracker" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117011" | "compound" | "U2OS" | 48 | 100 | "absent" | "Parental" | "Day0" | 1 | "none" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117012" | "compound" | "U2OS" | 48 | 100 | "absent" | "Parental" | "Day0" | 1 | "none" | 27648
… | … | … | … | … | … | … | … | … | … | … | … | …
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117016" | "compound" | "A549" | 48 | 100 | "absent" | "Parental" | "Day0" | 1 | "none" | 49152
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117017" | "compound" | "A549" | 48 | 100 | "absent" | "Parental" | "Day0" | 1 | "none" | 49144
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117019" | "compound" | "A549" | 48 | 100 | "absent" | "Parental" | "Day0" | 1 | "none" | 49152
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117054" | "compound" | "A549" | 48 | 120 | "absent" | "Parental" | "Day0" | 1 | "none" | 27648
"2020_11_04_CPJUMP1" | "JUMP-Target-1_compound_platema…" | "BR00117055" | "compound" | "A549" | 48 | 120 | "absent" | "Parental" | "Day0" | 1 | "none" | 27648
" ], "text/plain": [ - "shape: (22, 13)\n", + "shape: (12, 13)\n", "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", "│ Batch ┆ Plate_Map ┆ Assay_Pla ┆ Perturbat ┆ … ┆ Time_dela ┆ Times_ima ┆ Anomaly ┆ Number_o │\n", "│ --- ┆ _Name ┆ te_Barcod ┆ ion ┆ ┆ y ┆ ged ┆ --- ┆ f_images │\n", @@ -266,51 +289,51 @@ "│ ┆ str ┆ --- ┆ str ┆ ┆ str ┆ i64 ┆ ┆ i64 │\n", "│ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ ┆ │\n", "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 6 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 7 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011700 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27640 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 0 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011700 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011700 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ Mitotrack ┆ 27648 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 0 ┆ ┆ ┆ ┆ ┆ er ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 1 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 2 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011804 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ Phalloidi ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 8 ┆ ┆ ┆ ┆ ┆ n ┆ │\n", - "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 6 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ _DL ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 7 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ _DL ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ _DL ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_cris ┆ 9 ┆ ┆ ┆ ┆ ┆ 
┆ │\n", - "│ _DL ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 49152 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 6 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 49144 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 7 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 49152 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011705 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 4 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011705 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", + "│ 4_CPJUMP1 ┆ et-1_comp ┆ 5 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -326,75 +349,30 @@ " CPJUMP1_exp_metadata_url, separator=\"\\t\", has_header=True, encoding=\"utf-8\"\n", ")\n", "\n", - "# filtering the metadata to only includes plates that their perturbation types are crispr\n", - "exp_metadata = exp_metadata.filter(exp_metadata[\"Perturbation\"].str.contains(pert_type))\n", + "# apply a single filter to select only rows matching all criteria\n", + "exp_metadata = exp_metadata.filter(\n", + " (\n", + " exp_metadata[\"Perturbation\"].str.contains(pert_type)\n", + " ) # selecting based on pert type\n", + " & (exp_metadata[\"Time\"] == 48) # time of incubation with compound\n", + " & (\n", + " exp_metadata[\"Cell_type\"].is_in([\"U2OS\", \"A549\"])\n", + " ) # selecting based on cell type\n", + " & (exp_metadata[\"Cell_line\"] == \"Parental\") # selecting only the parental cell line\n", + " & (pl.col(\"Batch\") == \"2020_11_04_CPJUMP1\") # selecting only the specified batch\n", + ")\n", "\n", "# save the experimental metadata as a csv file\n", "exp_metadata.write_csv(exp_metadata_path)\n", "\n", "# display\n", + "print(\n", + " \"plates that will be downloaded are: \", exp_metadata[\"Assay_Plate_Barcode\"].unique()\n", + ")\n", + "print(\"shape: \", exp_metadata.shape)\n", "exp_metadata" ] }, - { - "cell_type": "markdown", - "id": "9121e37c", - "metadata": {}, - "source": [ - "Creating a dictionary to group plates by their corresponding experimental batch\n", - "\n", - "This step organizes the plate barcodes from the experimental metadata into groups based on their batch. Grouping plates by batch is useful for batch-wise data processing and downstream analyses." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "95c20b91", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'2020_11_04_CPJUMP1': ['BR00116996',\n", - " 'BR00116997',\n", - " 'BR00116998',\n", - " 'BR00116999',\n", - " 'BR00117000',\n", - " 'BR00117001',\n", - " 'BR00117002',\n", - " 'BR00117003',\n", - " 'BR00117004',\n", - " 'BR00117005',\n", - " 'BR00118041',\n", - " 'BR00118042',\n", - " 'BR00118043',\n", - " 'BR00118044',\n", - " 'BR00118045',\n", - " 'BR00118046',\n", - " 'BR00118047',\n", - " 'BR00118048'],\n", - " '2020_11_04_CPJUMP1_DL': ['BR00116996',\n", - " 'BR00116997',\n", - " 'BR00116998',\n", - " 'BR00116999']}\n" - ] - } - ], - "source": [ - "# creating a dictionary for the batch and the associated plates with the a batch\n", - "batch_plates_dict = {}\n", - "exp_metadata_batches = exp_metadata[\"Batch\"].unique().to_list()\n", - "\n", - "for batch in exp_metadata_batches:\n", - " batch_plates_dict[batch] = exp_metadata.filter(exp_metadata[\"Batch\"] == batch)[\n", - " \"Assay_Plate_Barcode\"\n", - " ].to_list()\n", - "\n", - "# display batch (Keys) and plates (values) within each batch\n", - "pprint.pprint(batch_plates_dict)" - ] - }, { "cell_type": "markdown", "id": "7021b414", @@ -409,36 +387,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "06783224", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Downloading: 1%|▏ | 229M/16.9G [07:43<9:33:53, 519kB/s] \n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile \u001b[39m\u001b[38;5;132;01m{\u001b[39;00moutput_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m already exists. 
Skipping download.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# downloading mitocheck profiles\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m \u001b[43mdownload_compressed_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_url\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmitocheck_url\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunk_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m8192\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43mextract\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[2], line 71\u001b[0m, in \u001b[0;36mdownload_compressed_file\u001b[0;34m(source_url, output_path, chunk_size, extract)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# using tqdm to track the download progress\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m (\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28mopen\u001b[39m(output_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file,\n\u001b[1;32m 62\u001b[0m tqdm(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 69\u001b[0m ):\n\u001b[1;32m 70\u001b[0m \u001b[38;5;66;03m# iterating over the response content in chunks\u001b[39;00m\n\u001b[0;32m---> 71\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miter_content\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunk_size\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m 72\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m 73\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/requests/models.py:820\u001b[0m, in \u001b[0;36mResponse.iter_content..generate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstream\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 819\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 820\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw\u001b[38;5;241m.\u001b[39mstream(chunk_size, decode_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 822\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ChunkedEncodingError(e)\n", - "File 
\u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/urllib3/response.py:1091\u001b[0m, in \u001b[0;36mHTTPResponse.stream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 1089\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1090\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_fp_closed(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m-> 1091\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1093\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data:\n\u001b[1;32m 1094\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m data\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/urllib3/response.py:980\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 977\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m amt:\n\u001b[1;32m 978\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer\u001b[38;5;241m.\u001b[39mget(amt)\n\u001b[0;32m--> 980\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raw_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 982\u001b[0m flush_decoder \u001b[38;5;241m=\u001b[39m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m (amt \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data)\n\u001b[1;32m 984\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/urllib3/response.py:904\u001b[0m, in \u001b[0;36mHTTPResponse._raw_read\u001b[0;34m(self, amt, read1)\u001b[0m\n\u001b[1;32m 901\u001b[0m fp_closed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclosed\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 903\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_error_catcher():\n\u001b[0;32m--> 904\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mread1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mread1\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m fp_closed \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 905\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data:\n\u001b[1;32m 906\u001b[0m \u001b[38;5;66;03m# Platform-specific: Buggy versions of Python.\u001b[39;00m\n\u001b[1;32m 907\u001b[0m \u001b[38;5;66;03m# Close the connection when no data is returned\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 912\u001b[0m \u001b[38;5;66;03m# not properly close the connection in all cases. There is\u001b[39;00m\n\u001b[1;32m 913\u001b[0m \u001b[38;5;66;03m# no harm in redundantly calling close.\u001b[39;00m\n\u001b[1;32m 914\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mclose()\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/site-packages/urllib3/response.py:887\u001b[0m, in \u001b[0;36mHTTPResponse._fp_read\u001b[0;34m(self, amt, read1)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mread1(amt) \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mread1()\n\u001b[1;32m 885\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 886\u001b[0m \u001b[38;5;66;03m# StringIO doesn't like amt=None\u001b[39;00m\n\u001b[0;32m--> 887\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mread()\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/http/client.py:479\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength:\n\u001b[1;32m 477\u001b[0m \u001b[38;5;66;03m# clip the read to the \"end of response\"\u001b[39;00m\n\u001b[1;32m 478\u001b[0m amt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength\n\u001b[0;32m--> 479\u001b[0m s \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 480\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m s \u001b[38;5;129;01mand\u001b[39;00m amt:\n\u001b[1;32m 481\u001b[0m \u001b[38;5;66;03m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;66;03m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_close_conn()\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/socket.py:720\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 720\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 721\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 722\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/ssl.py:1251\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1247\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m flags \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1249\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m\n\u001b[1;32m 1250\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n\u001b[0;32m-> 1251\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1252\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mrecv_into(buffer, nbytes, flags)\n", - "File \u001b[0;32m~/Software/miniconda3/envs/buscar/lib/python3.12/ssl.py:1103\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1102\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sslobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sslobj\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;28mlen\u001b[39m)\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/mitocheck/mitocheck_profile.zip already exists. Skipping download.\n" ] } ], @@ -476,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "4d9fd47c", "metadata": {}, "outputs": [ diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index 9d4e14e..7ea16d1 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -4,7 +4,7 @@ # # This notebook focuses on downloading metadata and single-cell profiles from three key datasets: # -# 1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with CRISPR perturbations for downstream analysis. +# 1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis. # 2. **MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis. # 3. **CFReT Dataset**: Normalized and feature-selected single-cell profiles from the CFReT plate are downloaded for downstream analysis. @@ -13,7 +13,6 @@ import gzip import pathlib -import pprint import sys import tarfile import zipfile @@ -151,12 +150,12 @@ def download_compressed_file( # setting perturbation type # other options are "compound", "orf", -pert_type = "crispr" +pert_type = "compound" # setting input and output paths -# In[5]: +# In[4]: # setting config path @@ -184,9 +183,11 @@ def download_compressed_file( # ## Downloading CPJUMP1 Metadata # -# In this section, we download and process the CPJUMP1 experimental metadata. This metadata contains information about assay plates, batches, and perturbation types, which is essential for organizing and analyzing single-cell profiles. Only plates treated with CRISPR perturbations are selected for downstream analysis. +# In this section, we download the [experimental metadata](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/benchmark/output/experiment-metadata.tsv) for the CPJUMP1 dataset. This metadata provides detailed information about each experimental batch, including plate barcodes, cell lines, perturbation types, and incubation times. Access to this metadata is essential for selecting and organizing the relevant subset of CPJUMP1 data for downstream analysis. +# +# For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata). 
-# In[6]: +# In[5]: # loading config file and setting experimental metadata URL @@ -199,43 +200,37 @@ def download_compressed_file( CPJUMP1_exp_metadata_url, separator="\t", has_header=True, encoding="utf-8" ) -# filtering the metadata to only includes plates that their perturbation types are crispr -exp_metadata = exp_metadata.filter(exp_metadata["Perturbation"].str.contains(pert_type)) +# apply a single filter to select only rows matching all criteria +exp_metadata = exp_metadata.filter( + ( + exp_metadata["Perturbation"].str.contains(pert_type) + ) # selecting based on pert type + & (exp_metadata["Time"] == 48) # time of incubation with compound + & ( + exp_metadata["Cell_type"].is_in(["U2OS", "A549"]) + ) # selecting based on cell type + & (exp_metadata["Cell_line"] == "Parental") # selecting only the parental cell line + & (pl.col("Batch") == "2020_11_04_CPJUMP1") # selecting only the specified batch +) # save the experimental metadata as a csv file exp_metadata.write_csv(exp_metadata_path) # display +print( + "plates that will be downloaded are: ", exp_metadata["Assay_Plate_Barcode"].unique() +) +print("shape: ", exp_metadata.shape) exp_metadata -# Creating a dictionary to group plates by their corresponding experimental batch -# -# This step organizes the plate barcodes from the experimental metadata into groups based on their batch. Grouping plates by batch is useful for batch-wise data processing and downstream analyses. - -# In[7]: - - -# creating a dictionary for the batch and the associated plates with the a batch -batch_plates_dict = {} -exp_metadata_batches = exp_metadata["Batch"].unique().to_list() - -for batch in exp_metadata_batches: - batch_plates_dict[batch] = exp_metadata.filter(exp_metadata["Batch"] == batch)[ - "Assay_Plate_Barcode" - ].to_list() - -# display batch (Keys) and plates (values) within each batch -pprint.pprint(batch_plates_dict) - - # ## Downloading MitoCheck Data # # In this section, we download the MitoCheck data generated in [this study](https://pmc.ncbi.nlm.nih.gov/articles/PMC3108885/). # # Specifically, we are downloading data that has already been normalized and feature-selected. The normalization and feature selection pipeline is available [here](https://github.com/WayScience/mitocheck_data/tree/main/3.normalize_data). -# In[7]: +# In[6]: # url source for the MitoCheck data @@ -263,7 +258,7 @@ def download_compressed_file( # - Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles) # - The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956). 
-# In[8]: +# In[7]: # setting the source for the CFReT data From 4589a10b8cf275b17b55a1c1701e3d022169fbca Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Thu, 18 Dec 2025 13:32:07 -0700 Subject: [PATCH 02/12] updated download module; removed functions --- .pre-commit-config.yaml | 2 +- .../0.download-data/1.download-data.ipynb | 191 +++--------------- .../nbconverted/1.download-data.py | 167 +++------------ notebooks/nb-configs.yaml | 2 +- 4 files changed, 55 insertions(+), 307 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d3300f..0484715 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: # Ruff for linting and formatting Python files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.9 + rev: v0.14.10 hooks: - id: ruff-check args: ["--fix"] diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index c1294e1..15c7cda 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -23,145 +23,13 @@ "source": [ "import sys\n", "import pathlib\n", - "import gzip\n", - "import zipfile\n", - "import tarfile\n", "\n", - "import requests\n", "import polars as pl\n", - "from tqdm import tqdm\n", "\n", "sys.path.append(\"../../\")\n", "from utils import io_utils" ] }, - { - "cell_type": "markdown", - "id": "b7911fff", - "metadata": {}, - "source": [ - "## Helper functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "39105354", - "metadata": {}, - "outputs": [], - "source": [ - "def download_compressed_file(\n", - " source_url: str,\n", - " output_path: pathlib.Path | str,\n", - " chunk_size: int = 8192,\n", - " extract: bool = True,\n", - ") -> None:\n", - " \"\"\"Downloads a compressed file from a URL with progress tracking.\n", - "\n", - " Downloads a file from the specified URL and saves it to the given output path.\n", - " The download is performed in chunks to handle large files efficiently, and the progress is displayed using\n", - " the `tqdm` library. The function raises exceptions for various error conditions, including\n", - " invalid input types, file system errors, and issues during the download process.\n", - "\n", - " Parameters\n", - " ----------\n", - " source_url : str\n", - " URL to download the file from.\n", - " output_path : pathlib.Path\n", - " Full path where the file should be saved.\n", - " chunk_size : int, optional\n", - " Size of chunks to download in bytes. Defaults to 8192.\n", - " extract : bool, optional\n", - " Whether to extract the compressed file after download. 
Defaults to True.\n", - "\n", - " Raises\n", - " ------\n", - " requests.exceptions.RequestException\n", - " If there is an error during the download request.\n", - " Exception\n", - " For any unexpected error during file writing or progress tracking.\n", - " \"\"\"\n", - "\n", - " # type checking\n", - " if not isinstance(source_url, str):\n", - " raise TypeError(f\"source_url must be a string, got {type(source_url)}\")\n", - " if not isinstance(output_path, (pathlib.Path, str)):\n", - " raise TypeError(\n", - " f\"output_path must be a pathlib.Path or str, got {type(output_path)}\"\n", - " )\n", - " if isinstance(output_path, str):\n", - " output_path = pathlib.Path(output_path)\n", - " if not output_path.parent.exists():\n", - " raise FileNotFoundError(\n", - " f\"Output directory {output_path.parent} does not exist.\"\n", - " )\n", - " if output_path.exists() and not output_path.is_file():\n", - " raise FileExistsError(f\"Output path {output_path} exists and is not a file.\")\n", - "\n", - " # starting downloading process\n", - " try:\n", - " # sending GET request to the source URL\n", - " with requests.get(source_url, stream=True) as response:\n", - " # raise an error if the request was unsuccessful\n", - " response.raise_for_status()\n", - "\n", - " # get the total size of the file from the response headers\n", - " total_size = int(response.headers.get(\"content-length\", 0))\n", - "\n", - " # using tqdm to track the download progress\n", - " with (\n", - " open(output_path, \"wb\") as file,\n", - " tqdm(\n", - " desc=\"Downloading\",\n", - " total=total_size,\n", - " unit=\"B\",\n", - " unit_scale=True,\n", - " unit_divisor=1024,\n", - " ) as pbar,\n", - " ):\n", - " # iterating over the response content in chunks\n", - " for chunk in response.iter_content(chunk_size=chunk_size):\n", - " if chunk:\n", - " file.write(chunk)\n", - "\n", - " # this updates the progress bar\n", - " pbar.update(len(chunk))\n", - "\n", - " # extract the file if requested\n", - " if extract:\n", - " # ensring that the path is a directory if the output path is a file\n", - " # this is necessary for extraction\n", - " extract_dir = output_path\n", - " if extract_dir.is_file():\n", - " extract_dir = output_path.parent\n", - "\n", - " if output_path.suffix == \".gz\":\n", - " # handle gzip files\n", - " extracted_path = output_path.with_suffix(\"\")\n", - " with gzip.open(output_path, \"rb\") as f_in:\n", - " with open(extracted_path, \"wb\") as f_out:\n", - " f_out.write(f_in.read())\n", - " print(f\"Extracted to: {extracted_path}\")\n", - "\n", - " elif output_path.suffix == \".zip\":\n", - " # handle zip files\n", - " with zipfile.ZipFile(output_path, \"r\") as zip_ref:\n", - " zip_ref.extractall(extract_dir)\n", - " print(f\"Extracted to: {extract_dir}\")\n", - "\n", - " elif output_path.suffix in [\".tar\", \".tgz\"] or \".tar.\" in output_path.name:\n", - " # handle tar files\n", - " with tarfile.open(output_path, \"r:*\") as tar_ref:\n", - " tar_ref.extractall(extract_dir)\n", - " print(f\"Extracted to: {extract_dir}\")\n", - "\n", - " # handling exceptions\n", - " except requests.exceptions.RequestException as e:\n", - " raise requests.exceptions.RequestException(f\"Error downloading file: {e}\")\n", - " except Exception as e:\n", - " raise Exception(f\"Unexpected error: {e}\")" - ] - }, { "cell_type": "markdown", "id": "f2647c06", @@ -180,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "0420eb8e", "metadata": {}, "outputs": [], @@ -200,7 +68,7 @@ }, { 
"cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "b7381913", "metadata": {}, "outputs": [], @@ -242,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "5b8bfe5f", "metadata": {}, "outputs": [ @@ -255,15 +123,15 @@ "[\n", "\t\"BR00117011\"\n", "\t\"BR00117012\"\n", - "\t\"BR00117013\"\n", - "\t\"BR00117015\"\n", + "\t\"BR00117008\"\n", + "\t\"BR00117010\"\n", "\t\"BR00117019\"\n", "\t…\n", - "\t\"BR00117016\"\n", - "\t\"BR00117055\"\n", "\t\"BR00117009\"\n", + "\t\"BR00117016\"\n", "\t\"BR00117017\"\n", - "\t\"BR00117054\"\n", + "\t\"BR00117013\"\n", + "\t\"BR00117015\"\n", "]\n", "shape: (12, 13)\n" ] @@ -333,7 +201,7 @@ "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -387,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "06783224", "metadata": {}, "outputs": [ @@ -395,26 +263,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/mitocheck/mitocheck_profile.zip already exists. Skipping download.\n" + "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/mitocheck/mitocheck_profile.parquet already exists. Skipping download.\n" ] } ], "source": [ "# url source for the MitoCheck data\n", "mitocheck_url = nb_configs[\"links\"][\"MitoCheck-profiles-source\"]\n", - "output_path = mitocheck_dir / \"mitocheck_profile.zip\"\n", + "save_path = (mitocheck_dir / \"mitocheck_profile.parquet\").resolve()\n", "\n", - "# checking if the downloaded file already exists\n", - "if output_path.exists():\n", - " print(f\"File {output_path} already exists. Skipping download.\")\n", + "if save_path.exists():\n", + " print(f\"File {save_path} already exists. Skipping download.\")\n", "else:\n", - " # downloading mitocheck profiles\n", - " download_compressed_file(\n", - " source_url=mitocheck_url,\n", - " output_path=output_path,\n", - " chunk_size=8192,\n", - " extract=True,\n", - " )" + " # read and download mitocheck data\n", + " mitocheck_profile = pl.read_csv(mitocheck_url)\n", + " mitocheck_profile.write_parquet(save_path)\n", + "\n", + " # display\n", + " print(\"shape: \", mitocheck_profile.shape)\n", + " mitocheck_profile.head()" ] }, { @@ -433,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "4d9fd47c", "metadata": {}, "outputs": [ @@ -454,15 +321,17 @@ " cfret_dir / \"localhost230405150001_sc_feature_selected.parquet\"\n", ").resolve()\n", "\n", - "# checking if the download already exists if it does not exist\n", - "# download the file\n", + "# check if it exists\n", "if output_path.exists():\n", " print(f\"File {output_path} already exists. 
Skipping download.\")\n", "else:\n", - " download_compressed_file(\n", - " source_url=cfret_source,\n", - " output_path=output_path,\n", - " )" + " # download cfret data\n", + " cfret_df = pl.read_parquet(cfret_source)\n", + " cfret_df.write_parquet(output_path)\n", + "\n", + " # display\n", + " print(\"shape: \", cfret_df.shape)\n", + " cfret_df.head()" ] } ], diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index 7ea16d1..70d4ad5 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -11,141 +11,19 @@ # In[1]: -import gzip import pathlib import sys -import tarfile -import zipfile import polars as pl -import requests -from tqdm import tqdm sys.path.append("../../") from utils import io_utils -# ## Helper functions - -# In[2]: - - -def download_compressed_file( - source_url: str, - output_path: pathlib.Path | str, - chunk_size: int = 8192, - extract: bool = True, -) -> None: - """Downloads a compressed file from a URL with progress tracking. - - Downloads a file from the specified URL and saves it to the given output path. - The download is performed in chunks to handle large files efficiently, and the progress is displayed using - the `tqdm` library. The function raises exceptions for various error conditions, including - invalid input types, file system errors, and issues during the download process. - - Parameters - ---------- - source_url : str - URL to download the file from. - output_path : pathlib.Path - Full path where the file should be saved. - chunk_size : int, optional - Size of chunks to download in bytes. Defaults to 8192. - extract : bool, optional - Whether to extract the compressed file after download. Defaults to True. - - Raises - ------ - requests.exceptions.RequestException - If there is an error during the download request. - Exception - For any unexpected error during file writing or progress tracking. - """ - - # type checking - if not isinstance(source_url, str): - raise TypeError(f"source_url must be a string, got {type(source_url)}") - if not isinstance(output_path, (pathlib.Path, str)): - raise TypeError( - f"output_path must be a pathlib.Path or str, got {type(output_path)}" - ) - if isinstance(output_path, str): - output_path = pathlib.Path(output_path) - if not output_path.parent.exists(): - raise FileNotFoundError( - f"Output directory {output_path.parent} does not exist." 
- ) - if output_path.exists() and not output_path.is_file(): - raise FileExistsError(f"Output path {output_path} exists and is not a file.") - - # starting downloading process - try: - # sending GET request to the source URL - with requests.get(source_url, stream=True) as response: - # raise an error if the request was unsuccessful - response.raise_for_status() - - # get the total size of the file from the response headers - total_size = int(response.headers.get("content-length", 0)) - - # using tqdm to track the download progress - with ( - open(output_path, "wb") as file, - tqdm( - desc="Downloading", - total=total_size, - unit="B", - unit_scale=True, - unit_divisor=1024, - ) as pbar, - ): - # iterating over the response content in chunks - for chunk in response.iter_content(chunk_size=chunk_size): - if chunk: - file.write(chunk) - - # this updates the progress bar - pbar.update(len(chunk)) - - # extract the file if requested - if extract: - # ensring that the path is a directory if the output path is a file - # this is necessary for extraction - extract_dir = output_path - if extract_dir.is_file(): - extract_dir = output_path.parent - - if output_path.suffix == ".gz": - # handle gzip files - extracted_path = output_path.with_suffix("") - with gzip.open(output_path, "rb") as f_in: - with open(extracted_path, "wb") as f_out: - f_out.write(f_in.read()) - print(f"Extracted to: {extracted_path}") - - elif output_path.suffix == ".zip": - # handle zip files - with zipfile.ZipFile(output_path, "r") as zip_ref: - zip_ref.extractall(extract_dir) - print(f"Extracted to: {extract_dir}") - - elif output_path.suffix in [".tar", ".tgz"] or ".tar." in output_path.name: - # handle tar files - with tarfile.open(output_path, "r:*") as tar_ref: - tar_ref.extractall(extract_dir) - print(f"Extracted to: {extract_dir}") - - # handling exceptions - except requests.exceptions.RequestException as e: - raise requests.exceptions.RequestException(f"Error downloading file: {e}") - except Exception as e: - raise Exception(f"Unexpected error: {e}") - - # ## Downloading data # Parameters used in this notebook -# In[3]: +# In[2]: # setting perturbation type @@ -155,7 +33,7 @@ def download_compressed_file( # setting input and output paths -# In[4]: +# In[3]: # setting config path @@ -187,7 +65,7 @@ def download_compressed_file( # # For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata). -# In[5]: +# In[4]: # loading config file and setting experimental metadata URL @@ -230,24 +108,23 @@ def download_compressed_file( # # Specifically, we are downloading data that has already been normalized and feature-selected. The normalization and feature selection pipeline is available [here](https://github.com/WayScience/mitocheck_data/tree/main/3.normalize_data). -# In[6]: +# In[5]: # url source for the MitoCheck data mitocheck_url = nb_configs["links"]["MitoCheck-profiles-source"] -output_path = mitocheck_dir / "mitocheck_profile.zip" +save_path = (mitocheck_dir / "mitocheck_profile.parquet").resolve() -# checking if the downloaded file already exists -if output_path.exists(): - print(f"File {output_path} already exists. Skipping download.") +if save_path.exists(): + print(f"File {save_path} already exists. 
Skipping download.") else: - # downloading mitocheck profiles - download_compressed_file( - source_url=mitocheck_url, - output_path=output_path, - chunk_size=8192, - extract=True, - ) + # read and download mitocheck data + mitocheck_profile = pl.read_csv(mitocheck_url) + mitocheck_profile.write_parquet(save_path) + + # display + print("shape: ", mitocheck_profile.shape) + mitocheck_profile.head() # ## Downloading CFReT Data @@ -258,7 +135,7 @@ def download_compressed_file( # - Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles) # - The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956). -# In[7]: +# In[6]: # setting the source for the CFReT data @@ -269,12 +146,14 @@ def download_compressed_file( cfret_dir / "localhost230405150001_sc_feature_selected.parquet" ).resolve() -# checking if the download already exists if it does not exist -# download the file +# check if it exists if output_path.exists(): print(f"File {output_path} already exists. Skipping download.") else: - download_compressed_file( - source_url=cfret_source, - output_path=output_path, - ) + # download cfret data + cfret_df = pl.read_parquet(cfret_source) + cfret_df.write_parquet(output_path) + + # display + print("shape: ", cfret_df.shape) + cfret_df.head() diff --git a/notebooks/nb-configs.yaml b/notebooks/nb-configs.yaml index 49a5f97..b64f738 100644 --- a/notebooks/nb-configs.yaml +++ b/notebooks/nb-configs.yaml @@ -2,5 +2,5 @@ links: CPJUMP1-experimental-metadata-source: https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/raw/refs/heads/main/benchmark/output/experiment-metadata.tsv CPJUMP-plate-maps-source: https://raw.githubusercontent.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1/refs/heads/main/metadata/platemaps/2020_11_04_CPJUMP1/platemap/JUMP-Target-1_crispr_platemap.txt CPJUMP1-profiles-source: https://cellpainting-gallery.s3.amazonaws.com/cpg0000-jump-pilot/source_4/workspace/profiles - MitoCheck-profiles-source: https://zenodo.org/records/7967386/files/3.normalize_data__normalized_data.zip?download=1 + MitoCheck-profiles-source: https://raw.githubusercontent.com/WayScience/mitocheck_data/main/3.normalize_data/normalized_data/training_data__ic.csv.gz CFReT-profiles-source: https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/raw/refs/heads/main/3.process_cfret_features/data/single_cell_profiles/localhost230405150001_sc_feature_selected.parquet?download= From d9e9ae0a7131cf501c7b32487927a43b930ba9ac Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Thu, 18 Dec 2025 13:41:50 -0700 Subject: [PATCH 03/12] fix metadata naming bug --- .../0.download-data/1.download-data.ipynb | 20 +++++++++++-------- .../nbconverted/1.download-data.py | 6 +++++- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index 15c7cda..f124444 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -87,6 +87,10 @@ "profiles_dir = (data_dir / \"sc-profiles\").resolve()\n", "profiles_dir.mkdir(exist_ok=True)\n", "\n", + "# create cpjump1 directory\n", + "cpjump1_dir = (profiles_dir / \"cpjump1\").resolve()\n", + "cpjump1_dir.mkdir(exist_ok=True)\n", + "\n", "# create mitocheck directory\n", "mitocheck_dir = (profiles_dir / 
\"mitocheck\").resolve()\n", "mitocheck_dir.mkdir(exist_ok=True)\n", @@ -121,17 +125,17 @@ "plates that will be downloaded are: shape: (12,)\n", "Series: 'Assay_Plate_Barcode' [str]\n", "[\n", - "\t\"BR00117011\"\n", + "\t\"BR00117015\"\n", "\t\"BR00117012\"\n", - "\t\"BR00117008\"\n", - "\t\"BR00117010\"\n", + "\t\"BR00117016\"\n", + "\t\"BR00117017\"\n", "\t\"BR00117019\"\n", "\t…\n", + "\t\"BR00117010\"\n", + "\t\"BR00117054\"\n", + "\t\"BR00117055\"\n", "\t\"BR00117009\"\n", - "\t\"BR00117016\"\n", - "\t\"BR00117017\"\n", - "\t\"BR00117013\"\n", - "\t\"BR00117015\"\n", + "\t\"BR00117008\"\n", "]\n", "shape: (12, 13)\n" ] @@ -231,7 +235,7 @@ ")\n", "\n", "# save the experimental metadata as a csv file\n", - "exp_metadata.write_csv(exp_metadata_path)\n", + "exp_metadata.write_csv(cpjump1_dir / f\"cpjump1_{pert_type}_experimental-metadata.csv\")\n", "\n", "# display\n", "print(\n", diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index 70d4ad5..c8142ab 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -50,6 +50,10 @@ profiles_dir = (data_dir / "sc-profiles").resolve() profiles_dir.mkdir(exist_ok=True) +# create cpjump1 directory +cpjump1_dir = (profiles_dir / "cpjump1").resolve() +cpjump1_dir.mkdir(exist_ok=True) + # create mitocheck directory mitocheck_dir = (profiles_dir / "mitocheck").resolve() mitocheck_dir.mkdir(exist_ok=True) @@ -92,7 +96,7 @@ ) # save the experimental metadata as a csv file -exp_metadata.write_csv(exp_metadata_path) +exp_metadata.write_csv(cpjump1_dir / f"cpjump1_{pert_type}_experimental-metadata.csv") # display print( From 8177e25a057807a93ab1b35d415c7aea683571f6 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Fri, 19 Dec 2025 11:12:28 -0700 Subject: [PATCH 04/12] updated nb configs --- notebooks/nb-configs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/nb-configs.yaml b/notebooks/nb-configs.yaml index b64f738..49a5f97 100644 --- a/notebooks/nb-configs.yaml +++ b/notebooks/nb-configs.yaml @@ -2,5 +2,5 @@ links: CPJUMP1-experimental-metadata-source: https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/raw/refs/heads/main/benchmark/output/experiment-metadata.tsv CPJUMP-plate-maps-source: https://raw.githubusercontent.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1/refs/heads/main/metadata/platemaps/2020_11_04_CPJUMP1/platemap/JUMP-Target-1_crispr_platemap.txt CPJUMP1-profiles-source: https://cellpainting-gallery.s3.amazonaws.com/cpg0000-jump-pilot/source_4/workspace/profiles - MitoCheck-profiles-source: https://raw.githubusercontent.com/WayScience/mitocheck_data/main/3.normalize_data/normalized_data/training_data__ic.csv.gz + MitoCheck-profiles-source: https://zenodo.org/records/7967386/files/3.normalize_data__normalized_data.zip?download=1 CFReT-profiles-source: https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/raw/refs/heads/main/3.process_cfret_features/data/single_cell_profiles/localhost230405150001_sc_feature_selected.parquet?download= From 09129f21d0144123ee77f2a8f5e0971dd8af6895 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Fri, 19 Dec 2025 11:31:27 -0700 Subject: [PATCH 05/12] updated download module --- .../0.download-data/1.download-data.ipynb | 222 ++++++++++++++++-- .../nbconverted/1.download-data.py | 194 +++++++++++++-- 2 files changed, 379 insertions(+), 37 deletions(-) diff --git 
a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index f124444..5d93ae2 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -23,13 +23,196 @@ "source": [ "import sys\n", "import pathlib\n", + "import gzip\n", + "import zipfile\n", + "import tarfile\n", "\n", + "import requests\n", "import polars as pl\n", + "from tqdm import tqdm\n", "\n", "sys.path.append(\"../../\")\n", "from utils import io_utils" ] }, + { + "cell_type": "markdown", + "id": "f7ea9b50", + "metadata": {}, + "source": [ + "## Helpler functions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "01c4c0b0", + "metadata": {}, + "outputs": [], + "source": [ + "def download_file(\n", + " source_url: str,\n", + " output_path: pathlib.Path | str,\n", + " chunk_size: int = 8192,\n", + ") -> pathlib.Path:\n", + " \"\"\"Downloads a file from a URL with progress tracking.\n", + "\n", + " Downloads a file from the specified URL and saves it to the given output path.\n", + " The download is performed in chunks to handle large files efficiently, and the progress is displayed using\n", + " the `tqdm` library.\n", + "\n", + " Parameters\n", + " ----------\n", + " source_url : str\n", + " URL to download the file from.\n", + " output_path : pathlib.Path | str\n", + " Full path where the file should be saved.\n", + " chunk_size : int, optional\n", + " Size of chunks to download in bytes. Defaults to 8192.\n", + "\n", + " Returns\n", + " -------\n", + " pathlib.Path\n", + " The path where the file was downloaded.\n", + "\n", + " Raises\n", + " ------\n", + " requests.exceptions.RequestException\n", + " If there is an error during the download request.\n", + " TypeError\n", + " If input types are invalid.\n", + " FileNotFoundError\n", + " If the output directory does not exist.\n", + " \"\"\"\n", + " # type checking\n", + " if not isinstance(source_url, str):\n", + " raise TypeError(f\"source_url must be a string, got {type(source_url)}\")\n", + " if not isinstance(output_path, (pathlib.Path, str)):\n", + " raise TypeError(\n", + " f\"output_path must be a pathlib.Path or str, got {type(output_path)}\"\n", + " )\n", + " if isinstance(output_path, str):\n", + " output_path = pathlib.Path(output_path)\n", + " if not output_path.parent.exists():\n", + " raise FileNotFoundError(\n", + " f\"Output directory {output_path.parent} does not exist.\"\n", + " )\n", + " if output_path.exists() and not output_path.is_file():\n", + " raise FileExistsError(f\"Output path {output_path} exists and is not a file.\")\n", + "\n", + " # starting downloading process\n", + " try:\n", + " # sending GET request to the source URL\n", + " with requests.get(source_url, stream=True) as response:\n", + " # raise an error if the request was unsuccessful\n", + " response.raise_for_status()\n", + "\n", + " # get the total size of the file from the response headers\n", + " total_size = int(response.headers.get(\"content-length\", 0))\n", + "\n", + " # using tqdm to track the download progress\n", + " with (\n", + " open(output_path, \"wb\") as file,\n", + " tqdm(\n", + " desc=\"Downloading\",\n", + " total=total_size,\n", + " unit=\"B\",\n", + " unit_scale=True,\n", + " unit_divisor=1024,\n", + " ) as pbar,\n", + " ):\n", + " # iterating over the response content in chunks\n", + " for chunk in response.iter_content(chunk_size=chunk_size):\n", + " if chunk:\n", + " file.write(chunk)\n", + "\n", + " # this updates the progress bar\n", + " 
pbar.update(len(chunk))\n", + " return output_path\n", + "\n", + " except requests.exceptions.RequestException as e:\n", + " raise requests.exceptions.RequestException(f\"Error downloading file: {e}\")\n", + " except Exception as e:\n", + " raise Exception(f\"Unexpected error during download: {e}\")\n", + "\n", + "\n", + "def extract_file(\n", + " file_path: pathlib.Path | str,\n", + " extract_dir: pathlib.Path | str | None = None,\n", + ") -> None:\n", + " \"\"\"Extracts a compressed file (zip, tar, tar.gz, tgz, gz).\n", + "\n", + " Parameters\n", + " ----------\n", + " file_path : pathlib.Path | str\n", + " Path to the compressed file.\n", + " extract_dir : pathlib.Path | str, optional\n", + " Directory where the file should be extracted. If None, extracts to the same directory as the file.\n", + "\n", + " Returns:\n", + " --------\n", + " None\n", + " Extracted files are saved in the specified extract_dir or in the same\n", + " directory if the extract_dir option is None\n", + "\n", + " \"\"\"\n", + " # type checking\n", + " if isinstance(file_path, str):\n", + " file_path = pathlib.Path(file_path)\n", + "\n", + " if not file_path.exists():\n", + " raise FileNotFoundError(f\"File {file_path} does not exist.\")\n", + "\n", + " if extract_dir is None:\n", + " extract_dir = file_path.parent\n", + " elif isinstance(extract_dir, str):\n", + " extract_dir = pathlib.Path(extract_dir)\n", + "\n", + " extract_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + " try:\n", + " if file_path.suffix == \".gz\" and not file_path.name.endswith(\".tar.gz\"):\n", + " # handle single gzip files\n", + " extracted_path = extract_dir / file_path.with_suffix(\"\").name\n", + " with gzip.open(file_path, \"rb\") as f_in:\n", + " with open(extracted_path, \"wb\") as f_out:\n", + " f_out.write(f_in.read())\n", + " print(f\"Extracted to: {extracted_path}\")\n", + "\n", + " elif file_path.suffix == \".zip\":\n", + " # handle zip files\n", + " with zipfile.ZipFile(file_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(extract_dir)\n", + " print(f\"Extracted to: {extract_dir}\")\n", + "\n", + " elif (\n", + " file_path.suffix in [\".tar\", \".tgz\"]\n", + " or \".tar.\" in file_path.name\n", + " or file_path.name.endswith(\".tar.gz\")\n", + " ):\n", + " # handle tar files\n", + " with tarfile.open(file_path, \"r:*\") as tar_ref:\n", + " tar_ref.extractall(extract_dir)\n", + " print(f\"Extracted to: {extract_dir}\")\n", + " else:\n", + " print(f\"Unsupported file format for extraction: {file_path.suffix}\")\n", + "\n", + " except Exception as e:\n", + " raise Exception(f\"Unexpected error during extraction: {e}\")\n", + "\n", + "\n", + "def download_compressed_file(\n", + " source_url: str,\n", + " output_path: pathlib.Path | str,\n", + " chunk_size: int = 8192,\n", + " extract: bool = True,\n", + ") -> None:\n", + " \"\"\"Downloads and optionally extracts a compressed file.\"\"\"\n", + " downloaded_path = download_file(source_url, output_path, chunk_size)\n", + " if extract:\n", + " extract_file(downloaded_path)" + ] + }, { "cell_type": "markdown", "id": "f2647c06", @@ -48,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "0420eb8e", "metadata": {}, "outputs": [], @@ -68,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "b7381913", "metadata": {}, "outputs": [], @@ -114,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "5b8bfe5f", "metadata": {}, "outputs": [ @@ -125,17 +308,17 @@ "plates 
that will be downloaded are: shape: (12,)\n", "Series: 'Assay_Plate_Barcode' [str]\n", "[\n", + "\t\"BR00117055\"\n", + "\t\"BR00117009\"\n", + "\t\"BR00117013\"\n", "\t\"BR00117015\"\n", + "\t\"BR00117010\"\n", + "\t…\n", "\t\"BR00117012\"\n", + "\t\"BR00117011\"\n", + "\t\"BR00117008\"\n", "\t\"BR00117016\"\n", - "\t\"BR00117017\"\n", - "\t\"BR00117019\"\n", - "\t…\n", - "\t\"BR00117010\"\n", "\t\"BR00117054\"\n", - "\t\"BR00117055\"\n", - "\t\"BR00117009\"\n", - "\t\"BR00117008\"\n", "]\n", "shape: (12, 13)\n" ] @@ -205,7 +388,7 @@ "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -259,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "06783224", "metadata": {}, "outputs": [ @@ -267,25 +450,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/mitocheck/mitocheck_profile.parquet already exists. Skipping download.\n" + "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/mitocheck/normalized_data already exists. Skipping download.\n" ] } ], "source": [ "# url source for the MitoCheck data\n", "mitocheck_url = nb_configs[\"links\"][\"MitoCheck-profiles-source\"]\n", - "save_path = (mitocheck_dir / \"mitocheck_profile.parquet\").resolve()\n", - "\n", + "save_path = (mitocheck_dir / \"normalized_data\").resolve()\n", "if save_path.exists():\n", " print(f\"File {save_path} already exists. Skipping download.\")\n", "else:\n", - " # read and download mitocheck data\n", - " mitocheck_profile = pl.read_csv(mitocheck_url)\n", - " mitocheck_profile.write_parquet(save_path)\n", - "\n", - " # display\n", - " print(\"shape: \", mitocheck_profile.shape)\n", - " mitocheck_profile.head()" + " download_compressed_file(mitocheck_url, save_path)" ] }, { @@ -304,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "4d9fd47c", "metadata": {}, "outputs": [ diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index c8142ab..75b465e 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -11,19 +11,192 @@ # In[1]: +import gzip import pathlib import sys +import tarfile +import zipfile import polars as pl +import requests +from tqdm import tqdm sys.path.append("../../") from utils import io_utils +# ## Helpler functions + +# In[2]: + + +def download_file( + source_url: str, + output_path: pathlib.Path | str, + chunk_size: int = 8192, +) -> pathlib.Path: + """Downloads a file from a URL with progress tracking. + + Downloads a file from the specified URL and saves it to the given output path. + The download is performed in chunks to handle large files efficiently, and the progress is displayed using + the `tqdm` library. + + Parameters + ---------- + source_url : str + URL to download the file from. + output_path : pathlib.Path | str + Full path where the file should be saved. + chunk_size : int, optional + Size of chunks to download in bytes. Defaults to 8192. + + Returns + ------- + pathlib.Path + The path where the file was downloaded. + + Raises + ------ + requests.exceptions.RequestException + If there is an error during the download request. + TypeError + If input types are invalid. 
+ FileNotFoundError + If the output directory does not exist. + """ + # type checking + if not isinstance(source_url, str): + raise TypeError(f"source_url must be a string, got {type(source_url)}") + if not isinstance(output_path, (pathlib.Path, str)): + raise TypeError( + f"output_path must be a pathlib.Path or str, got {type(output_path)}" + ) + if isinstance(output_path, str): + output_path = pathlib.Path(output_path) + if not output_path.parent.exists(): + raise FileNotFoundError( + f"Output directory {output_path.parent} does not exist." + ) + if output_path.exists() and not output_path.is_file(): + raise FileExistsError(f"Output path {output_path} exists and is not a file.") + + # starting downloading process + try: + # sending GET request to the source URL + with requests.get(source_url, stream=True) as response: + # raise an error if the request was unsuccessful + response.raise_for_status() + + # get the total size of the file from the response headers + total_size = int(response.headers.get("content-length", 0)) + + # using tqdm to track the download progress + with ( + open(output_path, "wb") as file, + tqdm( + desc="Downloading", + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + ) as pbar, + ): + # iterating over the response content in chunks + for chunk in response.iter_content(chunk_size=chunk_size): + if chunk: + file.write(chunk) + + # this updates the progress bar + pbar.update(len(chunk)) + return output_path + + except requests.exceptions.RequestException as e: + raise requests.exceptions.RequestException(f"Error downloading file: {e}") + except Exception as e: + raise Exception(f"Unexpected error during download: {e}") + + +def extract_file( + file_path: pathlib.Path | str, + extract_dir: pathlib.Path | str | None = None, +) -> None: + """Extracts a compressed file (zip, tar, tar.gz, tgz, gz). + + Parameters + ---------- + file_path : pathlib.Path | str + Path to the compressed file. + extract_dir : pathlib.Path | str, optional + Directory where the file should be extracted. If None, extracts to the same directory as the file. + + Returns: + -------- + None + Extracted files are saved in the specified extract_dir or in the same + directory if the extract_dir option is None + + """ + # type checking + if isinstance(file_path, str): + file_path = pathlib.Path(file_path) + + if not file_path.exists(): + raise FileNotFoundError(f"File {file_path} does not exist.") + + if extract_dir is None: + extract_dir = file_path.parent + elif isinstance(extract_dir, str): + extract_dir = pathlib.Path(extract_dir) + + extract_dir.mkdir(parents=True, exist_ok=True) + + try: + if file_path.suffix == ".gz" and not file_path.name.endswith(".tar.gz"): + # handle single gzip files + extracted_path = extract_dir / file_path.with_suffix("").name + with gzip.open(file_path, "rb") as f_in: + with open(extracted_path, "wb") as f_out: + f_out.write(f_in.read()) + print(f"Extracted to: {extracted_path}") + + elif file_path.suffix == ".zip": + # handle zip files + with zipfile.ZipFile(file_path, "r") as zip_ref: + zip_ref.extractall(extract_dir) + print(f"Extracted to: {extract_dir}") + + elif ( + file_path.suffix in [".tar", ".tgz"] + or ".tar." 
in file_path.name + or file_path.name.endswith(".tar.gz") + ): + # handle tar files + with tarfile.open(file_path, "r:*") as tar_ref: + tar_ref.extractall(extract_dir) + print(f"Extracted to: {extract_dir}") + else: + print(f"Unsupported file format for extraction: {file_path.suffix}") + + except Exception as e: + raise Exception(f"Unexpected error during extraction: {e}") + + +def download_compressed_file( + source_url: str, + output_path: pathlib.Path | str, + chunk_size: int = 8192, + extract: bool = True, +) -> None: + """Downloads and optionally extracts a compressed file.""" + downloaded_path = download_file(source_url, output_path, chunk_size) + if extract: + extract_file(downloaded_path) + + # ## Downloading data # Parameters used in this notebook -# In[2]: +# In[3]: # setting perturbation type @@ -33,7 +206,7 @@ # setting input and output paths -# In[3]: +# In[4]: # setting config path @@ -69,7 +242,7 @@ # # For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata). -# In[4]: +# In[5]: # loading config file and setting experimental metadata URL @@ -112,23 +285,16 @@ # # Specifically, we are downloading data that has already been normalized and feature-selected. The normalization and feature selection pipeline is available [here](https://github.com/WayScience/mitocheck_data/tree/main/3.normalize_data). -# In[5]: +# In[6]: # url source for the MitoCheck data mitocheck_url = nb_configs["links"]["MitoCheck-profiles-source"] -save_path = (mitocheck_dir / "mitocheck_profile.parquet").resolve() - +save_path = (mitocheck_dir / "normalized_data").resolve() if save_path.exists(): print(f"File {save_path} already exists. Skipping download.") else: - # read and download mitocheck data - mitocheck_profile = pl.read_csv(mitocheck_url) - mitocheck_profile.write_parquet(save_path) - - # display - print("shape: ", mitocheck_profile.shape) - mitocheck_profile.head() + download_compressed_file(mitocheck_url, save_path) # ## Downloading CFReT Data @@ -139,7 +305,7 @@ # - Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles) # - The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956). 
-# In[6]: +# In[7]: # setting the source for the CFReT data From f4386432c3f308033894c7585f6b1da1452e7dc9 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Fri, 19 Dec 2025 11:47:08 -0700 Subject: [PATCH 06/12] updated control subsetting notebook --- notebooks/0.download-data/3.subset-jump-controls.ipynb | 4 ++-- .../0.download-data/nbconverted/3.subset-jump-controls.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/0.download-data/3.subset-jump-controls.ipynb b/notebooks/0.download-data/3.subset-jump-controls.ipynb index cda9089..4ab1758 100644 --- a/notebooks/0.download-data/3.subset-jump-controls.ipynb +++ b/notebooks/0.download-data/3.subset-jump-controls.ipynb @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "6a8dd258", "metadata": {}, "outputs": [], @@ -152,7 +152,7 @@ "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n", "\n", "cpjump1_data_path = (\n", - " profiles_dir / \"cpjump1\" / \"cpjump1_crispr_concat_profiles.parquet\"\n", + " profiles_dir / \"cpjump1\" / \"cpjump1_compound_concat_profiles.parquet\"\n", ").resolve(strict=True)\n", "\n", "\n", diff --git a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py index b3fc534..bfd3dbc 100644 --- a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py +++ b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py @@ -110,7 +110,7 @@ def load_group_stratified_data( # Setting input and output paths -# In[3]: +# In[ ]: # setting data path @@ -123,7 +123,7 @@ def load_group_stratified_data( profiles_dir = (data_dir / "sc-profiles").resolve(strict=True) cpjump1_data_path = ( - profiles_dir / "cpjump1" / "cpjump1_crispr_concat_profiles.parquet" + profiles_dir / "cpjump1" / "cpjump1_compound_concat_profiles.parquet" ).resolve(strict=True) From c176ec9ebf834c885ce650ce68599d0faf4230cb Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Fri, 19 Dec 2025 11:56:30 -0700 Subject: [PATCH 07/12] updated preprocessing module and reran pre-commit --- .../0.download-data/2.preprocessing.ipynb | 64 ++++++++++--------- .../nbconverted/2.preprocessing.py | 64 ++++++++++--------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index 57079ee..6b40b1d 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -194,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3ea207e4", "metadata": {}, "outputs": [], @@ -207,7 +207,7 @@ "\n", "# Experimental metadata\n", "exp_metadata_path = (\n", - " profiles_dir / \"cpjump1\" / \"CPJUMP1-experimental-metadata.csv\"\n", + " profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n", ").resolve(strict=True)\n", "\n", "# Setting CFReT profiles directory\n", @@ -222,10 +222,10 @@ ").resolve(strict=True)\n", "\n", "# setting mitocheck profiles directory\n", - "mitocheck_profiles_dir = (profiles_dir / \"mitocheck\").resolve(strict=True)\n", - "mitocheck_norm_profiles_dir = (mitocheck_profiles_dir / \"normalized_data\").resolve(\n", - " strict=True\n", - ")\n", + "mitocheck_dir = (profiles_dir / \"mitocheck\").resolve(strict=True)\n", + "mitocheck_compressed_profiles_dir = (\n", + " profiles_dir / \"mitocheck\" / \"normalized_data\"\n", + ").resolve(strict=True)\n", "\n", "# output 
directories\n", "cpjump1_output_dir = (profiles_dir / \"cpjump1\").resolve()\n", @@ -241,27 +241,27 @@ "id": "7168a71a", "metadata": {}, "source": [ - "Create a list of paths that only points crispr treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell)" + "Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "c7944fc2", "metadata": {}, "outputs": [], "source": [ "# Load experimental metadata\n", - "# selecting plates that pertains to the cpjump1 CRISPR dataset\n", + "# selecting plates that pertains to the cpjump1 compound dataset\n", "exp_metadata = pl.read_csv(exp_metadata_path)\n", - "crispr_plate_names = (\n", + "compound_plate_names = (\n", " exp_metadata.select(\"Assay_Plate_Barcode\").unique().to_series().to_list()\n", ")\n", - "crispr_plate_paths = [\n", + "compound_plate_paths = [\n", " (profiles_dir / \"cpjump1\" / f\"{plate}_feature_selected_sc_qc.parquet\").resolve(\n", " strict=True\n", " )\n", - " for plate in crispr_plate_names\n", + " for plate in compound_plate_names\n", "]\n", "# Load shared features\n", "with open(shared_features_config_path) as f:\n", @@ -275,9 +275,9 @@ "id": "c6bfd5c7", "metadata": {}, "source": [ - "## Preprocessing CPJUMP1 CRISPR data\n", + "## Preprocessing CPJUMP1 Compound data\n", "\n", - "Using the filtered CRISPR plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list.\n", + "Using the filtered compound plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. 
This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list.\n", "\n", "The concatenation process ensures:\n", "- All profiles use the same feature set for downstream compatibility\n", @@ -288,20 +288,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "f6f7e08d", "metadata": {}, "outputs": [], "source": [ - "# Loading crispr profiles with shared features and concat into a single DataFrame\n", + "# Loading compound profiles with shared features and concat into a single DataFrame\n", "concat_output_path = (\n", - " cpjump1_output_dir / \"cpjump1_crispr_concat_profiles.parquet\"\n", + " cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n", ").resolve()\n", "\n", "# loaded and concatenated profiles\n", "cpjump1_profiles = load_and_concat_profiles(\n", " profile_dir=profiles_dir,\n", - " specific_plates=crispr_plate_paths,\n", + " specific_plates=compound_plate_paths,\n", " shared_features=shared_features,\n", ")\n", "\n", @@ -350,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "c5471d3e", "metadata": {}, "outputs": [], @@ -358,18 +358,18 @@ "# load in mitocheck profiles and save as parquet\n", "# drop first column which is an additional index column\n", "mitocheck_profile = pl.read_csv(\n", - " mitocheck_norm_profiles_dir / \"training_data.csv.gz\",\n", + " mitocheck_compressed_profiles_dir / \"training_data.csv.gz\",\n", ")\n", "mitocheck_profile = mitocheck_profile.select(mitocheck_profile.columns[1:])\n", "\n", "# load in the mitocheck positive controls\n", "mitocheck_pos_control_profiles = pl.read_csv(\n", - " mitocheck_norm_profiles_dir / \"positive_control_data.csv.gz\",\n", + " mitocheck_compressed_profiles_dir / \"positive_control_data.csv.gz\",\n", ")\n", "\n", "# loading in negative control profiles\n", "mitocheck_neg_control_profiles = pl.read_csv(\n", - " mitocheck_norm_profiles_dir / \"negative_control_data.csv.gz\",\n", + " mitocheck_compressed_profiles_dir / \"negative_control_data.csv.gz\",\n", ")\n", "\n", "# insert new column \"Mitocheck_Phenotypic_Class\" for both positive and negative controls\n", @@ -404,7 +404,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "c57da947", "metadata": {}, "outputs": [], @@ -437,12 +437,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "1d7ced04", "metadata": {}, "outputs": [], "source": [ - "# select # naming the metadata of mitocheck profiles\n", + "# manually selecting metadata features that are present across all 3 profiles\n", + "# (negcon, poscon, and training)\n", "mitocheck_meta_data = [\n", " \"Mitocheck_Phenotypic_Class\",\n", " \"Cell_UUID\",\n", @@ -474,8 +475,9 @@ " & set(cp_mitocheck_pos_control_profiles_features)\n", ")\n", "\n", - "# now create a json file that contains the feature space configs\n", - "with open(mitocheck_profiles_dir / \"mitocheck_feature_space_configs.json\", \"w\") as f:\n", + "# create a json file that contains the feature space configs\n", + "# this is shared across all three differe plates: traiing, negcon, and poscon\n", + "with open(mitocheck_dir / \"mitocheck_feature_space_configs.json\", \"w\") as f:\n", " json.dump(\n", " {\n", " \"metadata-features\": mitocheck_meta_data,\n", @@ -488,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "42108980", "metadata": {}, "outputs": [], @@ -517,7 +519,7 @@ "\n", 
"# save concatenated mitocheck profiles\n", "concat_mitocheck_profiles.write_parquet(\n", - " mitocheck_profiles_dir / \"mitocheck_concat_profiles.parquet\"\n", + " mitocheck_dir / \"mitocheck_concat_profiles.parquet\"\n", ")" ] }, @@ -528,14 +530,14 @@ "source": [ "## Preprocessing CFReT Dataset\n", "\n", - "This section preprocesses the CFReT (CRISPR Fluorescent Reporter of Transcription) dataset to ensure compatibility with downstream analysis workflows.\n", + "This section preprocesses the CFReT dataset to ensure compatibility with downstream analysis workflows.\n", "\n", "- **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "1763d383", "metadata": {}, "outputs": [], diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index c586095..b643cef 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -164,7 +164,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # > **Note:** The shared profiles utilized here are sourced from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository. All preprocessing and profile generation steps are performed in that repository, and this notebook focuses on downstream analysis using the generated profiles. -# In[ ]: +# In[3]: # Setting data directory @@ -175,7 +175,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Experimental metadata exp_metadata_path = ( - profiles_dir / "cpjump1" / "CPJUMP1-experimental-metadata.csv" + profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv" ).resolve(strict=True) # Setting CFReT profiles directory @@ -190,10 +190,10 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr ).resolve(strict=True) # setting mitocheck profiles directory -mitocheck_profiles_dir = (profiles_dir / "mitocheck").resolve(strict=True) -mitocheck_norm_profiles_dir = (mitocheck_profiles_dir / "normalized_data").resolve( - strict=True -) +mitocheck_dir = (profiles_dir / "mitocheck").resolve(strict=True) +mitocheck_compressed_profiles_dir = ( + profiles_dir / "mitocheck" / "normalized_data" +).resolve(strict=True) # output directories cpjump1_output_dir = (profiles_dir / "cpjump1").resolve() @@ -204,22 +204,22 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr results_dir.mkdir(exist_ok=True) -# Create a list of paths that only points crispr treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell) +# Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell) -# In[5]: +# In[4]: # Load experimental metadata -# selecting plates that pertains to the cpjump1 CRISPR dataset +# selecting plates that pertains to the cpjump1 compound dataset exp_metadata = pl.read_csv(exp_metadata_path) -crispr_plate_names = ( +compound_plate_names = ( exp_metadata.select("Assay_Plate_Barcode").unique().to_series().to_list() ) -crispr_plate_paths = [ +compound_plate_paths = [ (profiles_dir / "cpjump1" / 
f"{plate}_feature_selected_sc_qc.parquet").resolve( strict=True ) - for plate in crispr_plate_names + for plate in compound_plate_names ] # Load shared features with open(shared_features_config_path) as f: @@ -228,9 +228,9 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr shared_features = loaded_shared_features["shared-features"] -# ## Preprocessing CPJUMP1 CRISPR data +# ## Preprocessing CPJUMP1 Compound data # -# Using the filtered CRISPR plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list. +# Using the filtered compound plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list. # # The concatenation process ensures: # - All profiles use the same feature set for downstream compatibility @@ -238,18 +238,18 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - Data integrity is maintained during the merge operation # - Adding a unique cell id has column `Metadata_cell_id` -# In[6]: +# In[5]: -# Loading crispr profiles with shared features and concat into a single DataFrame +# Loading compound profiles with shared features and concat into a single DataFrame concat_output_path = ( - cpjump1_output_dir / "cpjump1_crispr_concat_profiles.parquet" + cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet" ).resolve() # loaded and concatenated profiles cpjump1_profiles = load_and_concat_profiles( profile_dir=profiles_dir, - specific_plates=crispr_plate_paths, + specific_plates=compound_plate_paths, shared_features=shared_features, ) @@ -290,24 +290,24 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles. -# In[ ]: +# In[6]: # load in mitocheck profiles and save as parquet # drop first column which is an additional index column mitocheck_profile = pl.read_csv( - mitocheck_norm_profiles_dir / "training_data.csv.gz", + mitocheck_compressed_profiles_dir / "training_data.csv.gz", ) mitocheck_profile = mitocheck_profile.select(mitocheck_profile.columns[1:]) # load in the mitocheck positive controls mitocheck_pos_control_profiles = pl.read_csv( - mitocheck_norm_profiles_dir / "positive_control_data.csv.gz", + mitocheck_compressed_profiles_dir / "positive_control_data.csv.gz", ) # loading in negative control profiles mitocheck_neg_control_profiles = pl.read_csv( - mitocheck_norm_profiles_dir / "negative_control_data.csv.gz", + mitocheck_compressed_profiles_dir / "negative_control_data.csv.gz", ) # insert new column "Mitocheck_Phenotypic_Class" for both positive and negative controls @@ -334,7 +334,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis. 
-# In[ ]:
+# In[7]:
 
 
 # Split profiles to only retain cell profiler features
@@ -357,10 +357,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 
 # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles.
 
-# In[ ]:
+# In[8]:
 
 
-# select # naming the metadata of mitocheck profiles
+# manually selecting metadata features that are present across all 3 profiles
+# (negcon, poscon, and training)
 mitocheck_meta_data = [
     "Mitocheck_Phenotypic_Class",
     "Cell_UUID",
@@ -392,8 +393,9 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
     & set(cp_mitocheck_pos_control_profiles_features)
 )
 
-# now create a json file that contains the feature space configs
-with open(mitocheck_profiles_dir / "mitocheck_feature_space_configs.json", "w") as f:
+# create a json file that contains the feature space configs
+# this is shared across all three different plates: training, negcon, and poscon
+with open(mitocheck_dir / "mitocheck_feature_space_configs.json", "w") as f:
     json.dump(
         {
             "metadata-features": mitocheck_meta_data,
@@ -404,7 +406,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
     )
 
 
-# In[ ]:
+# In[9]:
 
 
 # create concatenated mitocheck profiles
@@ -431,18 +433,18 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 
 # save concatenated mitocheck profiles
 concat_mitocheck_profiles.write_parquet(
-    mitocheck_profiles_dir / "mitocheck_concat_profiles.parquet"
+    mitocheck_dir / "mitocheck_concat_profiles.parquet"
 )
 
 
 # ## Preprocessing CFReT Dataset
 #
-# This section preprocesses the CFReT (CRISPR Fluorescent Reporter of Transcription) dataset to ensure compatibility with downstream analysis workflows.
+# This section preprocesses the CFReT dataset to ensure compatibility with downstream analysis workflows.
 #
 # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication
 #
 
-# In[8]:
+# In[10]:
 
 
 # load in cfret profiles and add a unique cell ID
From 22a7a0bd80b0c724ecceffbde7d706415230ff22 Mon Sep 17 00:00:00 2001
From: Erik Serrano
Date: Fri, 19 Dec 2025 11:58:58 -0700
Subject: [PATCH 08/12] reran notebook #3 in module 0

---
 notebooks/0.download-data/3.subset-jump-controls.ipynb          | 2 +-
 notebooks/0.download-data/nbconverted/3.subset-jump-controls.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebooks/0.download-data/3.subset-jump-controls.ipynb b/notebooks/0.download-data/3.subset-jump-controls.ipynb
index 4ab1758..c736f80 100644
--- a/notebooks/0.download-data/3.subset-jump-controls.ipynb
+++ b/notebooks/0.download-data/3.subset-jump-controls.ipynb
@@ -137,7 +137,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "6a8dd258",
    "metadata": {},
    "outputs": [],
diff --git a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py
index bfd3dbc..0be0975 100644
--- a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py
+++ b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py
@@ -110,7 +110,7 @@ def load_group_stratified_data(
 # Setting input and output paths
 
-# In[ ]:
+# In[3]:
 
 
 # setting data path
From 7fa9d451151b1610d7c5e505b9e629823f45372e Mon Sep 17 00:00:00 2001
From: Erik Serrano
Date: Mon, 22 Dec 2025 09:26:10 -0700
Subject: [PATCH 09/12] added moa information to cpjump compound data

---
 .../0.download-data/2.preprocessing.ipynb     | 67 +++++++++++++++----
 .../nbconverted/2.preprocessing.py            | 50 ++++++++++----
 2 files changed, 90 insertions(+), 27 deletions(-)

diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb
index 6b40b1d..033358d 100644
--- a/notebooks/0.download-data/2.preprocessing.ipynb
+++ b/notebooks/0.download-data/2.preprocessing.ipynb
@@ -205,6 +205,11 @@
     "# Setting profiles directory\n",
     "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
     "\n",
+    "# setting connectivity map drug repurposing config\n",
+    "drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
+    "    strict=True\n",
+    ")\n",
+    "\n",
     "# Experimental metadata\n",
     "exp_metadata_path = (\n",
     "    profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
@@ -227,6 +232,7 @@
     "    profiles_dir / \"mitocheck\" / \"normalized_data\"\n",
     ").resolve(strict=True)\n",
     "\n",
+    "\n",
     "# output directories\n",
     "cpjump1_output_dir = (profiles_dir / \"cpjump1\").resolve()\n",
     "cpjump1_output_dir.mkdir(exist_ok=True)\n",
@@ -286,6 +292,14 @@
     "- Adding a unique cell id has column `Metadata_cell_id`"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9ec882fa",
+   "metadata": {},
+   "source": [
+    "We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis."
+ ] + }, { "cell_type": "code", "execution_count": 5, @@ -293,11 +307,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Loading compound profiles with shared features and concat into a single DataFrame\n", - "concat_output_path = (\n", - " cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n", - ").resolve()\n", - "\n", "# loaded and concatenated profiles\n", "cpjump1_profiles = load_and_concat_profiles(\n", " profile_dir=profiles_dir,\n", @@ -306,12 +315,38 @@ ")\n", "\n", "# create an index columm and unique cell ID based on features of a single profiles\n", - "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)\n", + "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)" + ] + }, + { + "cell_type": "markdown", + "id": "3df9bbf5", + "metadata": {}, + "source": [ + "Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adfb9148", + "metadata": {}, + "outputs": [], + "source": [ + "# load drug repurposing moa file and add prefix to metadata columns\n", + "rep_moa_df = pl.read_csv(\n", + " drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n", + ").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n", + "\n", + "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n", + "cpjump1_profiles = cpjump1_profiles.join(\n", + " rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n", + ")\n", "\n", - "# Split meta and features\n", + "# split meta and feature\n", "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n", "\n", - "# Saving metadata and features of the concat profile into a json file\n", + "# save the feature space information into a json file\n", "meta_features_dict = {\n", " \"concat-profiles\": {\n", " \"meta-features\": meta_cols,\n", @@ -321,7 +356,11 @@ "with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n", " json.dump(meta_features_dict, f, indent=4)\n", "\n", - "# save as parquet with defined order of columns\n", + "# save concatenated profiles\n", + "# Loading compound profiles with shared features and concat into a single DataFrame\n", + "concat_output_path = (\n", + " cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n", + ").resolve()\n", "cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)" ] }, @@ -350,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "c5471d3e", "metadata": {}, "outputs": [], @@ -404,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c57da947", "metadata": {}, "outputs": [], @@ -437,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "1d7ced04", "metadata": {}, "outputs": [], @@ -490,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "42108980", "metadata": {}, "outputs": [], @@ -537,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "1763d383", "metadata": {}, "outputs": [], diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py 
b/notebooks/0.download-data/nbconverted/2.preprocessing.py index b643cef..9a4532f 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -173,6 +173,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Setting profiles directory profiles_dir = (data_dir / "sc-profiles").resolve(strict=True) +# setting connectivity map drug repurposing config +drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve( + strict=True +) + # Experimental metadata exp_metadata_path = ( profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv" @@ -195,6 +200,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr profiles_dir / "mitocheck" / "normalized_data" ).resolve(strict=True) + # output directories cpjump1_output_dir = (profiles_dir / "cpjump1").resolve() cpjump1_output_dir.mkdir(exist_ok=True) @@ -238,14 +244,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - Data integrity is maintained during the merge operation # - Adding a unique cell id has column `Metadata_cell_id` +# We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis. + # In[5]: -# Loading compound profiles with shared features and concat into a single DataFrame -concat_output_path = ( - cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet" -).resolve() - # loaded and concatenated profiles cpjump1_profiles = load_and_concat_profiles( profile_dir=profiles_dir, @@ -256,10 +259,27 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # create an index columm and unique cell ID based on features of a single profiles cpjump1_profiles = add_cell_id_hash(cpjump1_profiles) -# Split meta and features + +# Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. 
+# + +# In[ ]: + + +# load drug repurposing moa file and add prefix to metadata columns +rep_moa_df = pl.read_csv( + drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy" +).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x) + +# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname +cpjump1_profiles = cpjump1_profiles.join( + rep_moa_df, on="Metadata_pert_iname", how="left" +) + +# split meta and feature meta_cols, features_cols = split_meta_and_features(cpjump1_profiles) -# Saving metadata and features of the concat profile into a json file +# save the feature space information into a json file meta_features_dict = { "concat-profiles": { "meta-features": meta_cols, @@ -269,7 +289,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr with open(cpjump1_output_dir / "concat_profiles_meta_features.json", "w") as f: json.dump(meta_features_dict, f, indent=4) -# save as parquet with defined order of columns +# save concatenated profiles +# Loading compound profiles with shared features and concat into a single DataFrame +concat_output_path = ( + cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet" +).resolve() cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path) @@ -290,7 +314,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles. -# In[6]: +# In[7]: # load in mitocheck profiles and save as parquet @@ -334,7 +358,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis. -# In[7]: +# In[8]: # Split profiles to only retain cell profiler features @@ -357,7 +381,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles. 
-# In[8]: +# In[9]: # manually selecting metadata features that are present across all 3 profiles @@ -406,7 +430,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr ) -# In[9]: +# In[10]: # create concatenated mitocheck profiles @@ -444,7 +468,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication # -# In[10]: +# In[11]: # load in cfret profiles and add a unique cell ID From 65b46f2ea5d383951cd00eb37f2f3cbd77296297 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Mon, 22 Dec 2025 10:49:42 -0700 Subject: [PATCH 10/12] added moas --- .../0.download-data/2.preprocessing.ipynb | 22 +++++++------- .../nbconverted/2.preprocessing.py | 29 +++++++++++-------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index fc1b8dc..c9323a6 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "0387feba", "metadata": {}, "outputs": [], @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "id": "d0f8b798", "metadata": {}, "outputs": [], @@ -194,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "3ea207e4", "metadata": {}, "outputs": [], @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "id": "c7944fc2", "metadata": {}, "outputs": [], @@ -301,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "f6f7e08d", "metadata": {}, "outputs": [], @@ -332,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "adfb9148", "metadata": {}, "outputs": [], @@ -393,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "id": "c5471d3e", "metadata": {}, "outputs": [], @@ -447,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "id": "c57da947", "metadata": {}, "outputs": [], @@ -480,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "id": "1d7ced04", "metadata": {}, "outputs": [], @@ -533,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "id": "42108980", "metadata": {}, "outputs": [], @@ -580,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "id": "1763d383", "metadata": {}, "outputs": [], diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 14a70ae..8142bdb 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -15,7 +15,7 @@ # # These preprocessing steps ensure that all datasets are standardized, well-documented, and ready for comparative and integrative analyses. -# In[1]: +# In[5]: import json @@ -31,7 +31,7 @@ # # Contains helper function that pertains to this notebook. 
-# In[2]: +# In[6]: def load_and_concat_profiles( @@ -164,7 +164,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # > **Note:** The shared profiles utilized here are sourced from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository. All preprocessing and profile generation steps are performed in that repository, and this notebook focuses on downstream analysis using the generated profiles. -# In[3]: +# In[7]: # Setting data directory @@ -211,7 +211,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell) -# In[4]: +# In[8]: # Load experimental metadata @@ -243,11 +243,16 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - Data integrity is maintained during the merge operation # - Adding a unique cell id has column `Metadata_cell_id` -# In[5]: +# We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis. -# In[5]: +# In[9]: +# Loading compound profiles with shared features and concat into a single DataFrame +concat_output_path = ( + cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet" +).resolve() + # loaded and concatenated profiles cpjump1_profiles = load_and_concat_profiles( profile_dir=profiles_dir, @@ -262,7 +267,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. # -# In[ ]: +# In[10]: # load drug repurposing moa file and add prefix to metadata columns @@ -313,7 +318,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles. -# In[6]: +# In[11]: # load in mitocheck profiles and save as parquet @@ -357,7 +362,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis. -# In[7]: +# In[12]: # Split profiles to only retain cell profiler features @@ -380,7 +385,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles. 
-# In[8]: +# In[13]: # manually selecting metadata features that are present across all 3 profiles @@ -429,7 +434,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr ) -# In[9]: +# In[14]: # create concatenated mitocheck profiles @@ -467,7 +472,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication # -# In[10]: +# In[15]: # load in cfret profiles and add a unique cell ID From 75c43e10158af621ec3d709597e65a605fbdd6e0 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Mon, 22 Dec 2025 11:21:32 -0700 Subject: [PATCH 11/12] updated --- .../0.download-data/2.preprocessing.ipynb | 38 ++++++++----------- .../nbconverted/2.preprocessing.py | 36 ++++++++---------- utils/io_utils.py | 11 ++++++ 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index c9323a6..7872b7f 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "0387feba", "metadata": {}, "outputs": [], @@ -36,7 +36,8 @@ "import polars as pl\n", "\n", "sys.path.append(\"../../\")\n", - "from utils.data_utils import split_meta_and_features, add_cell_id_hash" + "from utils.data_utils import split_meta_and_features, add_cell_id_hash\n", + "from utils.io_utils import load_profiles" ] }, { @@ -51,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "d0f8b798", "metadata": {}, "outputs": [], @@ -93,15 +94,6 @@ " \"All elements in specific_plates must be pathlib.Path objects\"\n", " )\n", "\n", - " def load_profile(file: pathlib.Path) -> pl.DataFrame:\n", - " \"\"\"internal function to load a single profile file.\"\"\"\n", - " profile_df = pl.read_parquet(file)\n", - " meta_cols, _ = split_meta_and_features(profile_df)\n", - " if shared_features is not None:\n", - " # Only select metadata and shared features\n", - " return profile_df.select(meta_cols + shared_features)\n", - " return profile_df\n", - "\n", " # Use specific_plates if provided, otherwise gather all .parquet files\n", " if specific_plates is not None:\n", " # Validate that all specific plate files exist\n", @@ -115,7 +107,9 @@ " raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n", "\n", " # Load and concatenate profiles\n", - " loaded_profiles = [load_profile(f) for f in files_to_load]\n", + " loaded_profiles = [\n", + " load_profiles(f, shared_features=shared_features) for f in files_to_load\n", + " ]\n", "\n", " # Concatenate all loaded profiles\n", " return pl.concat(loaded_profiles, rechunk=True)\n", @@ -194,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "id": "3ea207e4", "metadata": {}, "outputs": [], @@ -251,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "c7944fc2", "metadata": {}, "outputs": [], @@ -301,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "f6f7e08d", "metadata": {}, "outputs": [], @@ -332,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "adfb9148", "metadata": {}, "outputs": [], @@ -393,7 +387,7 @@ }, { "cell_type": "code", - 
"execution_count": 11, + "execution_count": 7, "id": "c5471d3e", "metadata": {}, "outputs": [], @@ -447,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "c57da947", "metadata": {}, "outputs": [], @@ -480,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "id": "1d7ced04", "metadata": {}, "outputs": [], @@ -533,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "id": "42108980", "metadata": {}, "outputs": [], @@ -580,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "id": "1763d383", "metadata": {}, "outputs": [], diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 8142bdb..bb0399a 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -15,7 +15,7 @@ # # These preprocessing steps ensure that all datasets are standardized, well-documented, and ready for comparative and integrative analyses. -# In[5]: +# In[1]: import json @@ -26,12 +26,13 @@ sys.path.append("../../") from utils.data_utils import add_cell_id_hash, split_meta_and_features +from utils.io_utils import load_profiles # ## Helper functions # # Contains helper function that pertains to this notebook. -# In[6]: +# In[2]: def load_and_concat_profiles( @@ -71,15 +72,6 @@ def load_and_concat_profiles( "All elements in specific_plates must be pathlib.Path objects" ) - def load_profile(file: pathlib.Path) -> pl.DataFrame: - """internal function to load a single profile file.""" - profile_df = pl.read_parquet(file) - meta_cols, _ = split_meta_and_features(profile_df) - if shared_features is not None: - # Only select metadata and shared features - return profile_df.select(meta_cols + shared_features) - return profile_df - # Use specific_plates if provided, otherwise gather all .parquet files if specific_plates is not None: # Validate that all specific plate files exist @@ -93,7 +85,9 @@ def load_profile(file: pathlib.Path) -> pl.DataFrame: raise FileNotFoundError(f"No profile files found in {profile_dir}") # Load and concatenate profiles - loaded_profiles = [load_profile(f) for f in files_to_load] + loaded_profiles = [ + load_profiles(f, shared_features=shared_features) for f in files_to_load + ] # Concatenate all loaded profiles return pl.concat(loaded_profiles, rechunk=True) @@ -164,7 +158,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # > **Note:** The shared profiles utilized here are sourced from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository. All preprocessing and profile generation steps are performed in that repository, and this notebook focuses on downstream analysis using the generated profiles. 
-# In[7]: +# In[3]: # Setting data directory @@ -211,7 +205,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell) -# In[8]: +# In[4]: # Load experimental metadata @@ -245,7 +239,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis. -# In[9]: +# In[5]: # Loading compound profiles with shared features and concat into a single DataFrame @@ -267,7 +261,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. # -# In[10]: +# In[6]: # load drug repurposing moa file and add prefix to metadata columns @@ -318,7 +312,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles. -# In[11]: +# In[7]: # load in mitocheck profiles and save as parquet @@ -362,7 +356,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis. -# In[12]: +# In[8]: # Split profiles to only retain cell profiler features @@ -385,7 +379,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles. -# In[13]: +# In[9]: # manually selecting metadata features that are present across all 3 profiles @@ -434,7 +428,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr ) -# In[14]: +# In[10]: # create concatenated mitocheck profiles @@ -472,7 +466,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication # -# In[15]: +# In[11]: # load in cfret profiles and add a unique cell ID diff --git a/utils/io_utils.py b/utils/io_utils.py index ad8f720..6d9f1a9 100644 --- a/utils/io_utils.py +++ b/utils/io_utils.py @@ -9,11 +9,14 @@ import yaml from tqdm import tqdm +from .data_utils import split_meta_and_features + def load_profiles( fpath: str | pathlib.Path, convert_to_f32: bool = False, verbose: bool | None = False, + shared_features: list[str] | None = None, ) -> pl.DataFrame: """Load single-cell profiles from given file path. @@ -29,6 +32,9 @@ def load_profiles( If True, converts all Float64 columns to Float32 to save memory. 
Default is False verbose : bool, optional If True, prints information about the loaded profiles. Default is False. + shared_features : list[str] | None, optional + If provided, only loads metadata columns and these specific feature columns. + Default is None (loads all columns). Returns ------- @@ -61,6 +67,11 @@ def load_profiles( # load profiles loaded_profiles = pl.read_parquet(fpath) + # filter to shared features if provided + if shared_features is not None: + meta_cols, _ = split_meta_and_features(loaded_profiles) + loaded_profiles = loaded_profiles.select(meta_cols + shared_features) + # convert all Float64 columns to Float32 if convert_to_f32 is True if convert_to_f32: loaded_profiles = loaded_profiles.with_columns( From 76f71cb42daf151ff4550225fbdbb44534fe8021 Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:27:58 -0500 Subject: [PATCH 12/12] Update notebooks/0.download-data/nbconverted/2.preprocessing.py Co-authored-by: Jenna Tomkinson <107513215+jenna-tomkinson@users.noreply.github.com> --- notebooks/0.download-data/nbconverted/2.preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index bb0399a..761ef0f 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -258,7 +258,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr cpjump1_profiles = add_cell_id_hash(cpjump1_profiles) -# Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. +# Next, we annotate the compound treatments in the CPJUMP1 dataset. We annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. # # In[6]:
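
For reviewers, here is a minimal, self-contained sketch of the `shared_features` behavior that this patch moves out of the notebook-local `load_profile` helper and into `utils.io_utils.load_profiles`: keep every metadata column and only the requested shared feature columns. The example DataFrame, the feature names, and the `Metadata_`-prefix stand-in for `split_meta_and_features` are illustrative assumptions, not the project's actual data or implementation.

```python
import polars as pl

# Toy profile table; column names are made up for illustration.
df = pl.DataFrame(
    {
        "Metadata_Plate": ["BR00117011", "BR00117011"],
        "Metadata_Well": ["A01", "A02"],
        "Cells_AreaShape_Area": [812.0, 945.0],
        "Nuclei_Intensity_MeanIntensity_DNA": [0.21, 0.18],
        "Cytoplasm_Texture_Contrast": [1.4, 1.7],
    }
)

# Subset of features shared across datasets (hypothetical names).
shared_features = ["Cells_AreaShape_Area", "Nuclei_Intensity_MeanIntensity_DNA"]

# Stand-in for utils.data_utils.split_meta_and_features: assume metadata columns
# are identified by the Metadata_ prefix used throughout these notebooks.
meta_cols = [c for c in df.columns if c.startswith("Metadata_")]

# Same selection load_profiles() now applies when shared_features is provided:
# all metadata columns plus only the shared feature columns.
filtered = df.select(meta_cols + shared_features)
print(filtered.columns)
# ['Metadata_Plate', 'Metadata_Well', 'Cells_AreaShape_Area',
#  'Nuclei_Intensity_MeanIntensity_DNA']
```

Centralizing this in `load_profiles` means the notebook's `load_and_concat_profiles` can simply do `pl.concat([load_profiles(f, shared_features=shared_features) for f in files_to_load], rechunk=True)`, as the patch shows, rather than each notebook carrying its own copy of the selection logic.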