diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb
index 6b40b1d..7872b7f 100644
--- a/notebooks/0.download-data/2.preprocessing.ipynb
+++ b/notebooks/0.download-data/2.preprocessing.ipynb
@@ -36,7 +36,8 @@
     "import polars as pl\n",
     "\n",
     "sys.path.append(\"../../\")\n",
-    "from utils.data_utils import split_meta_and_features, add_cell_id_hash"
+    "from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
+    "from utils.io_utils import load_profiles"
    ]
   },
   {
@@ -93,15 +94,6 @@
     "            \"All elements in specific_plates must be pathlib.Path objects\"\n",
     "        )\n",
     "\n",
-    "    def load_profile(file: pathlib.Path) -> pl.DataFrame:\n",
-    "        \"\"\"internal function to load a single profile file.\"\"\"\n",
-    "        profile_df = pl.read_parquet(file)\n",
-    "        meta_cols, _ = split_meta_and_features(profile_df)\n",
-    "        if shared_features is not None:\n",
-    "            # Only select metadata and shared features\n",
-    "            return profile_df.select(meta_cols + shared_features)\n",
-    "        return profile_df\n",
-    "\n",
     "    # Use specific_plates if provided, otherwise gather all .parquet files\n",
     "    if specific_plates is not None:\n",
     "        # Validate that all specific plate files exist\n",
@@ -115,7 +107,9 @@
     "        raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
     "\n",
     "    # Load and concatenate profiles\n",
-    "    loaded_profiles = [load_profile(f) for f in files_to_load]\n",
+    "    loaded_profiles = [\n",
+    "        load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
+    "    ]\n",
     "\n",
     "    # Concatenate all loaded profiles\n",
     "    return pl.concat(loaded_profiles, rechunk=True)\n",
@@ -205,6 +199,11 @@
     "# Setting profiles directory\n",
     "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
     "\n",
+    "# Setting the path to the Connectivity Map drug repurposing annotation file\n",
+    "drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
+    "    strict=True\n",
+    ")\n",
+    "\n",
     "# Experimental metadata\n",
     "exp_metadata_path = (\n",
     "    profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
@@ -286,6 +285,14 @@
    "- Adding a unique cell id has column `Metadata_cell_id`"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "id": "9ec882fa",
+  "metadata": {},
+  "source": [
+   "We load the per-plate parquet profiles for the compound-treated plates, select the shared feature set, concatenate them into a single Polars DataFrame while preserving metadata, and add a unique `Metadata_cell_id` for each cell. The resulting `cpjump1_profiles` table is then ready for downstream analysis."
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": 5,
@@ -306,12 +313,38 @@
     ")\n",
     "\n",
     "# create an index columm and unique cell ID based on features of a single profiles\n",
-    "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)\n",
+    "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3df9bbf5",
+   "metadata": {},
+   "source": [
+    "Next, we annotate each cell in the CPJUMP1 compound dataset with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "adfb9148",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load the drug repurposing MoA file and add the Metadata_ prefix to its columns\n",
+    "rep_moa_df = pl.read_csv(\n",
+    "    drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
+    ").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
+    "\n",
+    "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
+    "cpjump1_profiles = cpjump1_profiles.join(\n",
+    "    rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
+    ")\n",
     "\n",
-    "# Split meta and features\n",
+    "# split metadata and feature columns\n",
     "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
     "\n",
-    "# Saving metadata and features of the concat profile into a json file\n",
+    "# save the feature space information into a JSON file\n",
     "meta_features_dict = {\n",
     "    \"concat-profiles\": {\n",
     "        \"meta-features\": meta_cols,\n",
@@ -321,7 +354,11 @@
     "with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n",
     "    json.dump(meta_features_dict, f, indent=4)\n",
     "\n",
-    "# save as parquet with defined order of columns\n",
+    "# define the output path for the concatenated profiles\n",
+    "# and save them as parquet with a defined column order\n",
+    "concat_output_path = (\n",
+    "    cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
+    ").resolve()\n",
     "cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)"
    ]
   },
@@ -350,7 +387,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "c5471d3e",
    "metadata": {},
    "outputs": [],
@@ -404,7 +441,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "c57da947",
    "metadata": {},
    "outputs": [],
@@ -437,7 +474,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "1d7ced04",
    "metadata": {},
    "outputs": [],
@@ -490,7 +527,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "42108980",
    "metadata": {},
    "outputs": [],
@@ -537,7 +574,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "1763d383",
    "metadata": {},
    "outputs": [],
diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py
index b643cef..761ef0f 100644
--- a/notebooks/0.download-data/nbconverted/2.preprocessing.py
+++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py
@@ -26,6 +26,7 @@
 sys.path.append("../../")
 from utils.data_utils import add_cell_id_hash, split_meta_and_features
+from utils.io_utils import load_profiles
 
 # ## Helper functions
 #
@@ -71,15 +72,6 @@ def load_and_concat_profiles(
             "All elements in specific_plates must be pathlib.Path objects"
         )
 
-    def load_profile(file: pathlib.Path) -> pl.DataFrame:
-        """internal function to load a single profile file."""
-        profile_df = pl.read_parquet(file)
-        meta_cols, _ = split_meta_and_features(profile_df)
-        if shared_features is not None:
-            # Only select metadata and shared features
-            return profile_df.select(meta_cols + shared_features)
-        return profile_df
-
     # Use specific_plates if provided, otherwise gather all .parquet files
     if specific_plates is not None:
         # Validate that all specific plate files exist
@@ -93,7 +85,9 @@ def load_profile(file: pathlib.Path) -> pl.DataFrame:
         raise FileNotFoundError(f"No profile files found in {profile_dir}")
 
     # Load and concatenate profiles
-    loaded_profiles = [load_profile(f) for f in files_to_load]
+    loaded_profiles = [
+        load_profiles(f, shared_features=shared_features) for f in files_to_load
+    ]
 
     # Concatenate all loaded profiles
     return pl.concat(loaded_profiles, rechunk=True)
@@ -173,6 +167,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 # Setting profiles directory
 profiles_dir = (data_dir / "sc-profiles").resolve(strict=True)
 
+# Setting the path to the Connectivity Map drug repurposing annotation file
+drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve(
+    strict=True
+)
+
 # Experimental metadata
 exp_metadata_path = (
     profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv"
@@ -238,6 +237,8 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 # - Data integrity is maintained during the merge operation
 # - Adding a unique cell id has column `Metadata_cell_id`
 
+# We load the per-plate parquet profiles for the compound-treated plates, select the shared feature set, concatenate them into a single Polars DataFrame while preserving metadata, and add a unique `Metadata_cell_id` for each cell. The resulting `cpjump1_profiles` table is then ready for downstream analysis.
+
 # In[5]:
 
 
@@ -256,10 +257,27 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 # create an index columm and unique cell ID based on features of a single profiles
 cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)
 
-# Split meta and features
+
+# Next, we annotate each cell in the CPJUMP1 compound dataset with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.
+#
+
+# In[6]:
+
+
+# load the drug repurposing MoA file and add the Metadata_ prefix to its columns
+rep_moa_df = pl.read_csv(
+    drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy"
+).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x)
+
+# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname
+cpjump1_profiles = cpjump1_profiles.join(
+    rep_moa_df, on="Metadata_pert_iname", how="left"
+)
+
+# split metadata and feature columns
 meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)
 
-# Saving metadata and features of the concat profile into a json file
+# save the feature space information into a JSON file
 meta_features_dict = {
     "concat-profiles": {
         "meta-features": meta_cols,
@@ -269,7 +287,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 with open(cpjump1_output_dir / "concat_profiles_meta_features.json", "w") as f:
     json.dump(meta_features_dict, f, indent=4)
 
-# save as parquet with defined order of columns
+# define the output path for the concatenated profiles
+# and save them as parquet with a defined column order
+concat_output_path = (
+    cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet"
+).resolve()
 cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)
 
 
@@ -290,7 +312,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 #
 # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles.
 
-# In[6]:
+# In[7]:
 
 
 # load in mitocheck profiles and save as parquet
@@ -334,7 +356,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 
 # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis.
 
-# In[7]:
+# In[8]:
 
 
 # Split profiles to only retain cell profiler features
@@ -357,7 +379,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 
 # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles.
 
-# In[8]:
+# In[9]:
 
 
 # manually selecting metadata features that are present across all 3 profiles
@@ -406,7 +428,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 )
 
 
-# In[9]:
+# In[10]:
 
 
 # create concatenated mitocheck profiles
@@ -444,7 +466,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
 # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication
 #
 
-# In[10]:
+# In[11]:
 
 
 # load in cfret profiles and add a unique cell ID
diff --git a/utils/io_utils.py b/utils/io_utils.py
index ad8f720..6d9f1a9 100644
--- a/utils/io_utils.py
+++ b/utils/io_utils.py
@@ -9,11 +9,14 @@
 import yaml
 from tqdm import tqdm
 
+from .data_utils import split_meta_and_features
+
 
 def load_profiles(
     fpath: str | pathlib.Path,
     convert_to_f32: bool = False,
     verbose: bool | None = False,
+    shared_features: list[str] | None = None,
 ) -> pl.DataFrame:
     """Load single-cell profiles from given file path.
 
@@ -29,6 +32,9 @@ def load_profiles(
         If True, converts all Float64 columns to Float32 to save memory. Default is False
     verbose : bool, optional
         If True, prints information about the loaded profiles. Default is False.
+    shared_features : list[str] | None, optional
+        If provided, only loads metadata columns and these specific feature columns.
+        Default is None (loads all columns).
 
     Returns
     -------
@@ -61,6 +67,11 @@ def load_profiles(
     # load profiles
     loaded_profiles = pl.read_parquet(fpath)
 
+    # filter to shared features if provided
+    if shared_features is not None:
+        meta_cols, _ = split_meta_and_features(loaded_profiles)
+        loaded_profiles = loaded_profiles.select(meta_cols + shared_features)
+
     # convert all Float64 columns to Float32 if convert_to_f32 is True
     if convert_to_f32:
         loaded_profiles = loaded_profiles.with_columns(
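
A minimal usage sketch of the refactored loader, for review context. The directory path and the shared feature list below are hypothetical; in the notebook, shared_features is computed upstream as the set of feature columns common to all plates:

import pathlib

import polars as pl

from utils.io_utils import load_profiles

# hypothetical inputs: a directory of per-plate parquet profiles and a
# hand-picked shared feature list
profile_dir = pathlib.Path("data/sc-profiles/cpjump1")
shared_features = ["Cells_AreaShape_Area", "Nuclei_Intensity_MeanIntensity_DNA"]

# each call returns the metadata columns plus only the shared features,
# so every per-plate frame has the same schema and can be concatenated directly
files_to_load = sorted(profile_dir.glob("*.parquet"))
loaded_profiles = [
    load_profiles(f, shared_features=shared_features) for f in files_to_load
]
cpjump1_profiles = pl.concat(loaded_profiles, rechunk=True)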
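
A quick sanity check that could follow the MoA left join, assuming the Repurposing Hub's moa column arrives as Metadata_moa after the prefix rename (rows whose compound has no Repurposing Hub entry keep null annotation values):

# count how many cells received an MoA annotation from the left join
n_total = cpjump1_profiles.height
n_annotated = cpjump1_profiles.filter(pl.col("Metadata_moa").is_not_null()).height
print(f"{n_annotated}/{n_total} cells carry an MoA annotation")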