Merged
77 changes: 57 additions & 20 deletions notebooks/0.download-data/2.preprocessing.ipynb
@@ -36,7 +36,8 @@
"import polars as pl\n",
"\n",
"sys.path.append(\"../../\")\n",
"from utils.data_utils import split_meta_and_features, add_cell_id_hash"
"from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
"from utils.io_utils import load_profiles"
]
},
{
@@ -93,15 +94,6 @@
" \"All elements in specific_plates must be pathlib.Path objects\"\n",
" )\n",
"\n",
" def load_profile(file: pathlib.Path) -> pl.DataFrame:\n",
" \"\"\"internal function to load a single profile file.\"\"\"\n",
" profile_df = pl.read_parquet(file)\n",
" meta_cols, _ = split_meta_and_features(profile_df)\n",
" if shared_features is not None:\n",
" # Only select metadata and shared features\n",
" return profile_df.select(meta_cols + shared_features)\n",
" return profile_df\n",
"\n",
" # Use specific_plates if provided, otherwise gather all .parquet files\n",
" if specific_plates is not None:\n",
" # Validate that all specific plate files exist\n",
@@ -115,7 +107,9 @@
" raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
"\n",
" # Load and concatenate profiles\n",
" loaded_profiles = [load_profile(f) for f in files_to_load]\n",
" loaded_profiles = [\n",
" load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
" ]\n",
"\n",
" # Concatenate all loaded profiles\n",
" return pl.concat(loaded_profiles, rechunk=True)\n",
@@ -205,6 +199,11 @@
"# Setting profiles directory\n",
"profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
"\n",
"# setting connectivity map drug repurposing config\n",
"drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
" strict=True\n",
")\n",
"\n",
"# Experimental metadata\n",
"exp_metadata_path = (\n",
" profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
@@ -286,6 +285,14 @@
"- Adding a unique cell id has column `Metadata_cell_id`"
]
},
{
"cell_type": "markdown",
"id": "9ec882fa",
"metadata": {},
"source": [
"We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis."
]
},
{
"cell_type": "code",
"execution_count": 5,
@@ -306,12 +313,38 @@
")\n",
"\n",
"# create an index columm and unique cell ID based on features of a single profiles\n",
"cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)\n",
"cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)"
]
},
{
"cell_type": "markdown",
"id": "3df9bbf5",
"metadata": {},
"source": [
"Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "adfb9148",
"metadata": {},
"outputs": [],
"source": [
"# load drug repurposing moa file and add prefix to metadata columns\n",
"rep_moa_df = pl.read_csv(\n",
" drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
"\n",
"# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
"cpjump1_profiles = cpjump1_profiles.join(\n",
" rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
")\n",
"\n",
"# Split meta and features\n",
"# split meta and feature\n",
"meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
"\n",
"# Saving metadata and features of the concat profile into a json file\n",
"# save the feature space information into a json file\n",
"meta_features_dict = {\n",
" \"concat-profiles\": {\n",
" \"meta-features\": meta_cols,\n",
Expand All @@ -321,7 +354,11 @@
"with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n",
" json.dump(meta_features_dict, f, indent=4)\n",
"\n",
"# save as parquet with defined order of columns\n",
"# save concatenated profiles\n",
"# Loading compound profiles with shared features and concat into a single DataFrame\n",
"concat_output_path = (\n",
" cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
").resolve()\n",
"cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)"
]
},
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "c5471d3e",
"metadata": {},
"outputs": [],
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "c57da947",
"metadata": {},
"outputs": [],
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"id": "1d7ced04",
"metadata": {},
"outputs": [],
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "42108980",
"metadata": {},
"outputs": [],
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "1763d383",
"metadata": {},
"outputs": [],
58 changes: 40 additions & 18 deletions notebooks/0.download-data/nbconverted/2.preprocessing.py
@@ -26,6 +26,7 @@

sys.path.append("../../")
from utils.data_utils import add_cell_id_hash, split_meta_and_features
from utils.io_utils import load_profiles

# ## Helper functions
#
@@ -71,15 +72,6 @@ def load_and_concat_profiles(
"All elements in specific_plates must be pathlib.Path objects"
)

def load_profile(file: pathlib.Path) -> pl.DataFrame:
"""internal function to load a single profile file."""
profile_df = pl.read_parquet(file)
meta_cols, _ = split_meta_and_features(profile_df)
if shared_features is not None:
# Only select metadata and shared features
return profile_df.select(meta_cols + shared_features)
return profile_df

# Use specific_plates if provided, otherwise gather all .parquet files
if specific_plates is not None:
# Validate that all specific plate files exist
@@ -93,7 +85,9 @@ def load_profile(file: pathlib.Path) -> pl.DataFrame:
raise FileNotFoundError(f"No profile files found in {profile_dir}")

# Load and concatenate profiles
loaded_profiles = [load_profile(f) for f in files_to_load]
loaded_profiles = [
load_profiles(f, shared_features=shared_features) for f in files_to_load
]

# Concatenate all loaded profiles
return pl.concat(loaded_profiles, rechunk=True)
@@ -173,6 +167,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# Setting profiles directory
profiles_dir = (data_dir / "sc-profiles").resolve(strict=True)

# setting Connectivity Map drug repurposing annotation file path
drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve(
strict=True
)

# Experimental metadata
exp_metadata_path = (
profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv"
@@ -238,6 +237,8 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# - Data integrity is maintained during the merge operation
# - Adding a unique cell id hash column `Metadata_cell_id`

# We load per-plate parquet profiles for the compound-treated plates, select the shared feature set, concatenate them into a single Polars DataFrame while preserving metadata, and add a unique `Metadata_cell_id` to each cell. The resulting `cpjump1_profiles` table is ready for downstream analysis.

# In[5]:


@@ -256,10 +257,27 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# create an index column and unique cell ID based on the features of a single profile
cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)

# Split meta and features

# Next, we annotate each cell in the CPJUMP1 dataset with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.
#

# In[6]:


# load the drug repurposing MoA file and prefix its columns with Metadata_
rep_moa_df = pl.read_csv(
drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy"
).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x)

# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname
cpjump1_profiles = cpjump1_profiles.join(
rep_moa_df, on="Metadata_pert_iname", how="left"
)

# split meta and feature columns
meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)

# Saving metadata and features of the concat profile into a json file
# save the feature space information into a json file
meta_features_dict = {
"concat-profiles": {
"meta-features": meta_cols,
@@ -269,7 +287,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
with open(cpjump1_output_dir / "concat_profiles_meta_features.json", "w") as f:
json.dump(meta_features_dict, f, indent=4)

# save as parquet with defined order of columns
# save concatenated profiles as parquet with a defined column order
concat_output_path = (
cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet"
).resolve()
cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)


Expand All @@ -290,7 +312,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
#
# The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles.

# In[6]:
# In[7]:


# load in mitocheck profiles and save as parquet
@@ -334,7 +356,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr

# Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis.

# In[7]:
# In[8]:


# Split profiles to only retain cell profiler features
@@ -357,7 +379,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr

# Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles.

# In[8]:
# In[9]:


# manually selecting metadata features that are present across all 3 profiles
@@ -406,7 +428,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
)


# In[9]:
# In[10]:


# create concatenated mitocheck profiles
@@ -444,7 +466,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
# - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication
#

# In[10]:
# In[11]:


# load in cfret profiles and add a unique cell ID
11 changes: 11 additions & 0 deletions utils/io_utils.py
@@ -9,11 +9,14 @@
import yaml
from tqdm import tqdm

from .data_utils import split_meta_and_features


def load_profiles(
fpath: str | pathlib.Path,
convert_to_f32: bool = False,
verbose: bool | None = False,
shared_features: list[str] | None = None,
) -> pl.DataFrame:
"""Load single-cell profiles from given file path.

@@ -29,6 +32,9 @@ def load_profiles(
If True, converts all Float64 columns to Float32 to save memory. Default is False
verbose : bool, optional
If True, prints information about the loaded profiles. Default is False.
shared_features : list[str] | None, optional
If provided, only loads metadata columns and these specific feature columns.
Default is None (loads all columns).

Returns
-------
@@ -61,6 +67,11 @@ def load_profiles(
# load profiles
loaded_profiles = pl.read_parquet(fpath)

# filter to shared features if provided
if shared_features is not None:
meta_cols, _ = split_meta_and_features(loaded_profiles)
loaded_profiles = loaded_profiles.select(meta_cols + shared_features)

# convert all Float64 columns to Float32 if convert_to_f32 is True
if convert_to_f32:
loaded_profiles = loaded_profiles.with_columns(
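
A minimal usage sketch of the extended `load_profiles` API introduced in this diff; the plate path and feature names below are hypothetical, chosen only to illustrate the new `shared_features` filter:

```python
import pathlib

from utils.io_utils import load_profiles

# hypothetical per-plate parquet file and shared feature list (illustrative only)
plate_path = pathlib.Path("data/sc-profiles/cpjump1/example_plate.parquet")
shared_features = [
    "Cells_AreaShape_Area",
    "Nuclei_Intensity_IntegratedIntensity_DNA",
]

# returns only the metadata columns plus the listed features;
# with shared_features=None (the default) the full profile is loaded
profile_df = load_profiles(plate_path, shared_features=shared_features)
```

Centralizing the filter in `load_profiles` lets the notebook's `load_and_concat_profiles` delegate per-plate loading to the shared helper instead of carrying its own `load_profile` closure, which is exactly what this diff removes.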