diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 39480995..4dda9426 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -16,6 +16,8 @@
// For web display
"ghcr.io/devcontainers/features/node:1": {},
// For scripting
- "ghcr.io/va-h/devcontainers-features/uv:1": {}
+ "ghcr.io/va-h/devcontainers-features/uv:1": {},
+ // For paxtools
+ "ghcr.io/devcontainers/features/java:1": {}
}
}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 95040b71..f687b4c2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,5 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
-# See https://pre-commit.com/ for documentation
default_language_version:
# Match this to the version specified in environment.yml
python: python3.11
@@ -10,9 +9,9 @@ repos:
hooks:
# Attempts to load all yaml files to verify syntax.
- id: check-yaml
- # Attempts to load all TOML files to verify syntax.
+ # Attempts to load all TOML files to verify syntax.
- id: check-toml
- # Trims trailing whitespace.
+ # Trims trailing whitespace.
- id: trailing-whitespace
# Preserves Markdown hard linebreaks.
args: [--markdown-linebreak-ext=md]
@@ -20,11 +19,15 @@ repos:
types_or: [markdown, python, yaml]
# Skip this Markdown file, which has an example of an input text file within it.
exclude: input/README.md
- - repo: https://github.com/charliermarsh/ruff-pre-commit
- rev: 'v0.0.269'
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: 'v0.15.4'
hooks:
- id: ruff
- repo: https://github.com/google/yamlfmt
rev: v0.17.0
hooks:
- id: yamlfmt
+ - repo: https://github.com/crate-ci/typos
+ rev: v1.34.0
+ hooks:
+ - id: typos
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1c47403b..b6766532 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,7 +2,7 @@
## Helping Out
-There are `TODOs` that better enhance the reproducability and accuracy of datasets or analysis of algorithm outputs, as well as
+There are `TODOs` that better enhance the reproducibility and accuracy of datasets or analysis of algorithm outputs, as well as
[open resolvable issues](https://github.com/Reed-CompBio/spras-benchmarking/).
## Adding a dataset
diff --git a/README.md b/README.md
index 12a6e84b..b0503cb3 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ uv run snakemake --cores 1
## Organization
-There are five primary folders in this repository:
+There are six primary folders in this repository:
```
.
@@ -47,13 +47,14 @@ There are five primary folders in this repository:
├── configs
├── datasets
├── spras
+├── tools
└── web
```
`spras` is the cloned submodule of [SPRAS](https://github.com/reed-compbio/spras), `web` is an
[astro](https://astro.build/) app which generates the `spras-benchmarking` [output](https://reed-compbio.github.io/spras-benchmarking/),
`configs` is the YAML file used to talk to SPRAS, and `datasets` contains the raw data. `cache` is utility for `datasets` which provides a convenient
-way to fetch online files for further processing.
+way to fetch online files for further processing. `tools` contains miscellaneous utilities for dataset processing, covering tasks common across datasets.
The workflow runs as so:
diff --git a/_typos.toml b/_typos.toml
new file mode 100644
index 00000000..8b097fd3
--- /dev/null
+++ b/_typos.toml
@@ -0,0 +1,12 @@
+[type.txt]
+# Ignore data files
+extend-glob = ["*.jsonc", "*.json"]
+check-file = false
+
+[files]
+extend-exclude = [
+ # PANTHER SPRAS formatting contains an intentional typo
+ "datasets/synthetic_data/scripts/panther_spras_formatting.py",
+ # Bad variable names in this file that may be removed later
+ "datasets/diseases/viz/viz.ipynb"
+]
diff --git a/cache/Snakefile b/cache/Snakefile
index a04c3f64..450a1408 100644
--- a/cache/Snakefile
+++ b/cache/Snakefile
@@ -1,34 +1,33 @@
-from cache import link
+from cache import FetchConfig, link
+from cache.directory import CacheItem # for exposing to Snakefiles that import this Snakefile.
from cache.util import uncompress
import urllib.parse
-from dataclasses import dataclass
from typing import Union
from pathlib import Path
-@dataclass
-class FetchConfig:
- directive: list[str]
- uncompress: bool = False
+def stringify_directive(directive: Union[CacheItem, tuple[str, ...]]) -> str:
+ return urllib.parse.quote_plus(directive.name if isinstance(directive, CacheItem) else '/'.join(directive))
def produce_fetch_rules(input_dict: dict[str, Union[FetchConfig, list[str]]]):
"""
Produces fetch rules based on a dictionary mapping
output files to their directory.py-based directive.
"""
- # Map inputs to be wrapped with FetchConfig if list[str]
- input_dict = {k: FetchConfig(v) if isinstance(v, list) else v for k, v in input_dict.items()}
+ # Map inputs to be wrapped with FetchConfig if list[str] or CacheItem
+ input_dict = {k: FetchConfig(v) if isinstance(v, tuple) or isinstance(v, CacheItem) else v for k, v in input_dict.items()}
- directives = [urllib.parse.quote_plus("/".join(directive.directive)) for directive in input_dict.values()]
+ directives = list(input_dict.values())
assert len(directives) == len(set(directives)), "Directives aren't unique!"
for output_file, config in input_dict.items():
# Since placeholders are evaluated when the job is actually ran,
# we pass data using params and output.
rule:
- name: f"fetch_{urllib.parse.quote_plus('/'.join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}"
+ name:
+ f"fetch_{stringify_directive(config.directive)}_to_{urllib.parse.quote_plus(output_file)}"
output: file=output_file
params:
config=config
run:
Path(output.file).parent.mkdir(exist_ok=True)
- link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)
+ link(Path(output.file), params.config)
diff --git a/cache/__init__.py b/cache/__init__.py
index 9e48cf44..8677a219 100644
--- a/cache/__init__.py
+++ b/cache/__init__.py
@@ -2,81 +2,126 @@
This is how spras-benchmarking handles artifact caching. `cache` should be used specifically inside `Snakefile`
"""
+from dataclasses import dataclass
+from typing import Union
from cache.util import uncompress as uncompress_file
-from cache.directory import get_cache_item
+from cache.directory import CacheItem, get_cache_item
from pathlib import Path
import os
from urllib.parse import quote_plus
import pickle
-__all__ = ["link"]
+__all__ = ["FetchConfig", "link"]
dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
artifacts_dir = dir_path / "artifacts"
-def get_artifact_name(directive: list[str]) -> str:
+@dataclass(frozen=True)
+class FetchConfig:
+ directive: Union[CacheItem, tuple[str, ...]]
+ uncompress: bool = False
+
+def get_artifact_name(directive: tuple[str, ...]) -> str:
return quote_plus("/".join(directive))
-def has_expired(directive: list[str]) -> bool:
+def add_suffix(path: Path, suffix: str):
+ return path.with_suffix(path.suffix + suffix)
+
+def has_expired(
+ cache_item: CacheItem,
+ output: Path
+) -> bool:
"""
Check if the artifact metadata associated with a directive has expired.
Avoids re-downloading the artifact if nothing has changed.
"""
- artifact_name = get_artifact_name(directive)
- cache_item = get_cache_item(directive)
- metadata_dir = artifacts_dir / 'metadata'
- metadata_dir.mkdir(exist_ok=True)
- metadata_file = (artifacts_dir / 'metadata' / artifact_name).with_suffix((artifacts_dir / artifact_name).suffix + '.metadata')
+ metadata_file = add_suffix(output, ".metadata")
# metadata never existed: we need to retrieve the new file
if not metadata_file.exists():
- with open(metadata_file, 'wb') as f:
+ with open(metadata_file, "wb") as f:
pickle.dump(cache_item, f)
return True
old_cache_item = None
- with open(metadata_file, 'rb') as f:
+ with open(metadata_file, "rb") as f:
old_cache_item = pickle.load(f)
# metadata expired: re-retrieve the item
if old_cache_item != cache_item:
- with open(metadata_file, 'wb') as f:
+ with open(metadata_file, "wb") as f:
pickle.dump(cache_item, f)
return True
# metadata hasn't changed and already existed: this hasn't expired
return False
-def link(output: str, directive: list[str], uncompress=False):
+def link_with_cache_item(
+ output: Path,
+ cache_item: CacheItem,
+ uncompress: bool = False
+):
+ """
+ Intermediary function for `link`.
+ This does almost all of what `link` is characterized to do in its documentation,
+ except for doing symlinking.
+ """
+ # If `uncompress` is `True`, we make
+ # `output` our 'compressed output.'
+ uncompressed_output = output
+ if uncompress:
+ output = add_suffix(output, ".compressed")
+
+ # Re-download if the file doesn't exist or the directive has expired.
+ # Note that we check for expiration first to trigger metadata creation.
+ if has_expired(cache_item, output) or not output.exists():
+ output.unlink(missing_ok=True)
+ cache_item.download(output)
+
+ if uncompress:
+ uncompressed_artifact_path = add_suffix(output, ".uncompressed")
+ uncompressed_artifact_path.unlink(missing_ok=True)
+ uncompress_file(output, uncompressed_output)
+
+def link(
+ output: str,
+ config: FetchConfig
+):
"""
Links output files from cache.directory directives.
For example,
```py
- link("output/ensg-ensp.tsv", ["BioMart", "ensg-ensp.tsv"])
+ link("output/ensg-ensp.tsv", FetchConfig(["BioMart", "ensg-ensp.tsv"]))
```
- would download and check BioMart's cache for ENSG-ENSP mapping, then symlink the cached output
- (lying somewhere in the cache folder) with the desired `output`.
+ would download and check BioMart's cache for ENSG-ENSP mapping, then:
+ - If `config.directive` is a `CacheItem`, we write the file directly to `output`.
+ - Otherwise, we symlink the cached output (lying somewhere in the cache folder) with the desired `output`
+ to avoid file duplication.
+
+ This function wraps around link_with_cache_item and handles symlinking
+ depending on the type of config.directive.
+ TODO: there is most likely a nicer way to design this.
"""
- artifacts_dir.mkdir(exist_ok=True)
+ if isinstance(config.directive, CacheItem):
+ link_with_cache_item(
+ Path(output),
+ config.directive,
+ config.uncompress
+ )
+ else:
+ artifacts_dir.mkdir(exist_ok=True)
+ artifact_name = get_artifact_name(config.directive)
+ artifact_output = artifacts_dir / artifact_name
- artifact_name = get_artifact_name(directive)
+ link_with_cache_item(
+ artifact_output,
+ get_cache_item(config.directive),
+ config.uncompress
+ )
- Path(output).unlink(missing_ok=True)
+ Path(output).symlink_to(artifact_output)
- # Re-download if the file doesn't exist or the directive has expired.
- cache_item = get_cache_item(directive)
- if not (artifacts_dir / artifact_name).exists() or has_expired(directive):
- (artifacts_dir / artifact_name).unlink(missing_ok=True)
- cache_item.download(artifacts_dir / artifact_name)
-
- if uncompress:
- uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + '.uncompressed')
- uncompressed_artifact_path.unlink(missing_ok=True)
- uncompress_file(artifacts_dir / artifact_name, uncompressed_artifact_path)
- Path(output).symlink_to(uncompressed_artifact_path)
- else:
- Path(output).symlink_to(artifacts_dir / artifact_name)
diff --git a/cache/cli.py b/cache/cli.py
index 2e8d8201..5a4edb10 100644
--- a/cache/cli.py
+++ b/cache/cli.py
@@ -10,20 +10,21 @@
import argparse
from cache.directory import get_cache_item
+
def parse_args():
- parser = argparse.ArgumentParser(
- prog='Cache',
- description='CLI utility for directory.py')
- parser.add_argument('path')
- parser.add_argument('output')
+ parser = argparse.ArgumentParser(prog="Cache", description="CLI utility for directory.py")
+ parser.add_argument("path")
+ parser.add_argument("output")
return parser.parse_args()
+
def main():
args = parse_args()
cache_item = get_cache_item(args.path.split("/"))
cache_item.download(args.output)
+
if __name__ == "__main__":
main()
diff --git a/cache/directory.py b/cache/directory.py
index 38a58635..7a868c86 100644
--- a/cache/directory.py
+++ b/cache/directory.py
@@ -16,12 +16,14 @@
dir_path = Path(__file__).parent.resolve()
# Our cache emits warnings for files with unpinned versions that don't match the cache.
-(dir_path / 'logs').mkdir(exist_ok=True)
-logger.add(dir_path / 'logs' / "cache.log", level="WARNING")
+(dir_path / "logs").mkdir(exist_ok=True)
+logger.add(dir_path / "logs" / "cache.log", level="WARNING")
+
class DownloadFileCheckException(RuntimeError):
"""See Service#download_against_cache for some motivation for this custom error"""
+
@dataclass
class Service:
url: str
@@ -34,17 +36,12 @@ def download(self, output: str | PathLike) -> requests.Response:
# As per https://stackoverflow.com/a/39217788/7589775 to enable download streaming.
with requests.get(self.url, stream=True, headers=self.headers) as response:
response.raw.decode_content = True
- with open(output, 'wb') as f:
+ with open(output, "wb") as f:
shutil.copyfileobj(response.raw, f)
return response
# NOTE: this is slightly yucky code deduplication. The only intended values of `downloaded_file_type` are `pinned` and `unpinned`.
- def download_against_cache(
- self,
- cache: Path,
- downloaded_file_type: str,
- move_output: bool
- ):
+ def download_against_cache(self, cache: Path, downloaded_file_type: str, move_output: bool):
"""
Downloads `this` Service and checks it against the provided `cache` at path. In logs,
the file will be referred to as `downloaded_file_type`.
@@ -58,7 +55,7 @@ def download_against_cache(
logger.info(f"Checking that the {downloaded_file_type} artifact {downloaded_file_path} matches with cached artifact at {cache}...")
if not filecmp.cmp(cache, downloaded_file_path):
- # This entire if-branch is debug schenanigans: we want to be able to easily compare our current cached file to the online file,
+ # This entire if-branch is debug shenanigans: we want to be able to easily compare our current cached file to the online file,
# especially since some `Service`s have special errors that can make the request hard to compare in the browser.
debug_file_path = Path(NamedTemporaryFile(prefix="spras-benchmarking-debug-artifact", delete=False).name)
@@ -68,21 +65,24 @@ def download_against_cache(
else:
shutil.copy(cache, debug_file_path)
# We use a custom error type to prevent any overlap with RuntimeError. I am not sure if there is any.
- raise DownloadFileCheckException(f"The {downloaded_file_type} file {downloaded_file_path} and " + \
- f"cached file originally at {cache} do not match! " + \
- f"Compare the pinned {downloaded_file_path} and the cached {debug_file_path}.")
+ raise DownloadFileCheckException(
+ f"The {downloaded_file_type} file {downloaded_file_path} and "
+ + f"cached file originally at {cache} do not match! "
+ + f"Compare the pinned {downloaded_file_path} and the cached {debug_file_path}."
+ )
else:
# Since we don't clean up pinned_file_path for the above branch's debugging,
# we need to clean it up here.
downloaded_file_path.unlink()
@staticmethod
- def coerce(obj: 'Service | str') -> 'Service':
+ def coerce(obj: "Service | str") -> "Service":
# TODO: This could also be replaced by coercing str to Service in CacheItem via pydantic.
if isinstance(obj, str):
return Service(url=obj)
return obj
+
def fetch_biomart_service(xml: str) -> Service:
"""
Access BioMart data through the BioMart REST API:
@@ -91,7 +91,8 @@ def fetch_biomart_service(xml: str) -> Service:
ROOT = "http://www.ensembl.org/biomart/martservice?query="
return Service(ROOT + urllib.parse.quote_plus(xml))
-@dataclass
+
+@dataclass(frozen=True)
class CacheItem:
"""
Class for differentriating between different ways of fetching data.
@@ -126,6 +127,14 @@ class CacheItem:
We will still error if the status code is not 2XX (a successful request).
"""
+ def __post_init__(self):
+ # Google Drive validation. TODO: remove if move to OSDF.
+ if "uc?id=" not in self.cached or "/view?usp=sharing" in self.cached:
+ raise RuntimeError(
+ "Make sure your Google Drive URLs are in https://drive.google.com/uc?id=... format "
+ + "with no /view?usp=sharing at the end. See CONTRIBUTING.md for more info."
+ )
+
@classmethod
@warnings.deprecated("Pending for removal after the CONTRIBUTING guide is updated.")
def cache_only(cls, name: str, cached: str) -> "CacheItem":
@@ -136,7 +145,7 @@ def download(self, output: str | PathLike):
logger.info(f"Fetching {self.name}...")
logger.info(f"Downloading cache {self.cached} to {output}...")
- gdown.download(self.cached, str(output)) # gdown doesn't have a type signature, but it expects a string :/
+ gdown.download(self.cached, str(output)) # gdown doesn't have a type signature, but it expects a string :/
if self.pinned is not None:
Service.coerce(self.pinned).download_against_cache(cache=Path(output), downloaded_file_type="pinned", move_output=True)
@@ -147,7 +156,7 @@ def download(self, output: str | PathLike):
except DownloadFileCheckException as err:
logger.warning(err)
- # TODO: yikes! same with self.unpinned
+
CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]]
# An *unversioned* directory list.
@@ -176,6 +185,20 @@ def download(self, output: str | PathLike):
cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk",
unpinned="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29",
),
+ # Sources
+ "sources.tsv": CacheItem(
+ # Where KW-0675 is the UniProt keyword for receptors
+ name="UniProt-tagged sources (receptors)",
+ unpinned="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid&format=tsv&query=%28%28keyword%3A%22KW-0675%22%29%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29",
+ cached="https://drive.google.com/uc?id=1VbCLH9yoJ41QhzhsSy9ICAU2MLAAxfJe"
+ ),
+ # Targets
+ "targets.tsv": CacheItem(
+ name="UniProt-tagged targets (transcription factors)",
+ # Where KW-0539 and KW-0805 are the UniProt keywords for the nucleus and transcription regulators, respectively.
+ unpinned="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid&format=tsv&query=%28%28keyword%3AKW-0539%29+OR+%28keyword%3AKW-0805%29%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29",
+ cached="https://drive.google.com/uc?id=1gg_2IO1xHeho8KkcYVIfqHNWSRZx6gd1"
+ ),
# idmapping FTP files. See the associated README:
# https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README
"HUMAN_9606_idmapping_selected.tab.gz": CacheItem(
@@ -190,31 +213,6 @@ def download(self, output: str | PathLike):
),
}
},
- "DISEASES": {
- # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their
- # archived files directory instead.
- "tiga_gene-trait_stats.tsv": CacheItem(
- name="TIGA data",
- cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
- pinned="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
- ),
- "HumanDO.tsv": CacheItem(
- name="Disease ontology data",
- cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
- # DiseaseOntology is a decently updating repository!
- unpinned="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/refs/heads/main/DOreports/HumanDO.tsv",
- ),
- "human_disease_textmining_filtered.tsv": CacheItem(
- name="DISEASES textmining channel",
- cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
- unpinned="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
- ),
- "human_disease_knowledge_filtered.tsv": CacheItem(
- name="DISEASES knowledge channel",
- cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
- unpinned="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
- ),
- },
"BioMart": {
"ensg-ensp.tsv": CacheItem(
name="BioMart ENSG <-> ENSP mapping",
@@ -222,58 +220,6 @@ def download(self, output: str | PathLike):
unpinned=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()),
)
},
- "DepMap": {
- "OmicsProfiles.csv": CacheItem(
- name="DepMap omics metadata",
- cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL",
- pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads",
- ),
- "CRISPRGeneDependency.csv": CacheItem(
- name="DepMap gene dependency probability estimates",
- cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz",
- pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads",
- ),
- "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem(
- name="DepMap genotyped matrix",
- cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh",
- pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads",
- ),
- "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem(
- name="DepMap model-level TPMs",
- cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP",
- pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads",
- ),
- "OmicsCNGeneWGS.csv": CacheItem(
- name="DepMap gene-level copy number data",
- cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub",
- pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads",
- ),
- },
- "KEGG": {
- # For some reason, KEGG requires a Referer header: opening this URL otherwise fails.
- "ko03250.xml": CacheItem(
- name="KEGG 03250",
- cached="https://drive.google.com/uc?id=16dtWKHCQMp2qrLfFDE7nVhbwBCr2H5a9",
- unpinned=Service(
- "https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml",
- headers={'Referer': 'https://www.kegg.jp/pathway/ko03250'})
- )
- },
- "HIV1": {
- # The following files are from https://github.com/gitter-lab/hiv1-aurkb.
- # While the following files do point to the repository's main branch,
- # they aren't expected to actually change.
- "prize_05.tsv": CacheItem(
- name="HIV_05 prizes",
- cached="https://drive.google.com/uc?id=1jVWNRPfYkbqimO44GdzXYB3-7NXhet1m",
- pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_05.csv"
- ),
- "prize_060.tsv": CacheItem(
- name="HIV_060 prizes",
- cached="https://drive.google.com/uc?id=1Aucgp7pcooGr9oT4m2bvYEuYW6186WxQ",
- pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_060.csv"
- )
- },
"iRefIndex": {
# This can also be obtained from the SPRAS repo, though the SPRAS repo removes self loops. We don't.
# (https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt).
@@ -283,63 +229,20 @@ def download(self, output: str | PathLike):
"phosphosite-irefindex13.0-uniprot.txt": CacheItem(
name="iRefIndex v13.0 UniProt interactome",
cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo",
- pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/networks/phosphosite-irefindex13.0-uniprot.txt"
+ pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/networks/phosphosite-irefindex13.0-uniprot.txt",
)
},
- "OsmoticStress": {
- "yeast_pcsf_network.sif": CacheItem(
- # In the paper https://doi.org/10.1016/j.celrep.2018.08.085
- name="Case Study Edge Results, from Supplementary Data 3",
- cached="https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h"
- ),
- # The following files are from https://github.com/gitter-lab/osmotic-stress.
- # While the following files do point to the repository's main branch,
- # they aren't expected to actually change.
- "prizes.txt": CacheItem(
- name="Osmotic Stress Prizes",
- pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt",
- cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg"
- ),
- "ChasmanNetwork-DirUndir.txt": CacheItem(
- name="Network Input",
- pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt",
- cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH"
- ),
- "dummy.txt": CacheItem(
- name="Dummy Nodes File",
- pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt",
- cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU"
- ),
- "_edgeFreq.eda ": CacheItem(
- name="Case Study Omics Integrator Edge Frequencies",
- pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda",
- cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR"
- ),
- "goldStandardUnionDetailed.txt": CacheItem(
- name="Gold Standard Reference Pathways",
- pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt",
- cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T"
- ),
- },
- "EGFR": {
- # The following files are from https://github.com/gitter-lab/tps.
- # While the following files do point to the repository's main branch,
- # they aren't expected to actually change.
- "eight-egfr-reference-all.txt": CacheItem(
- name="EGFR Gold Standard Reference",
- pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt",
- cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw"
+ "PathwayCommons": {
+ "pathways.txt.gz": CacheItem(
+ name="PathwayCommons Pathway Identifiers",
+ cached="https://drive.google.com/uc?id=1SMwuuohuZuNFnTev4zRNJrBnBsLlCHcK",
+ pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/pathways.txt.gz",
),
- "egfr-prizes.txt": CacheItem(
- name="EGFR prizes",
- pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt",
- cached="https://drive.google.com/uc?id=1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj"
- )
},
}
-def get_cache_item(path: list[str]) -> CacheItem:
+def get_cache_item(path: tuple[str, ...]) -> CacheItem:
"""Takes a path and gets the underlying cache item."""
assert len(path) != 0
@@ -352,9 +255,4 @@ def get_cache_item(path: list[str]) -> CacheItem:
if not isinstance(current_item, CacheItem):
raise ValueError(f"Path {path} doesn't lead to a cache item")
- # Google Drive validation. TODO: remove if move to OSDF.
- if "uc?id=" not in current_item.cached or "/view?usp=sharing" in current_item.cached:
- raise RuntimeError("Make sure your Google Drive URLs are in https://drive.google.com/uc?id=... format " + \
- "with no /view?usp=sharing at the end. See CONTRIBUTING.md for more info.")
-
return current_item
diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml
index 9fe9dd94..d0f598e3 100644
--- a/configs/dmmm.yaml
+++ b/configs/dmmm.yaml
@@ -43,7 +43,7 @@ algorithms:
g: 0
datasets:
- # TODO: use old paramaters for datasets
+ # TODO: use old parameters for datasets
# HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml
- label: dmmmhiv_060
node_files: ["processed_prizes_060.txt"]
@@ -55,12 +55,12 @@ datasets:
edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"]
other_files: []
data_dir: "datasets/hiv/processed"
- # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml
+ # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast_osmotic_stress/config.yaml
- label: dmmmyeast
node_files: ["prizes1_dummies.txt"]
edge_files: ["network1.txt"]
other_files: []
- data_dir: "datasets/yeast-osmotic-stress/processed"
+ data_dir: "datasets/yeast_osmotic_stress/processed"
- label: dmmmdiseases_alopecia_areata
data_dir: datasets/diseases
edge_files:
diff --git a/configs/pra.yaml b/configs/pra.yaml
index 3ad77733..36eb5dec 100644
--- a/configs/pra.yaml
+++ b/configs/pra.yaml
@@ -55,4 +55,4 @@ datasets:
# Placeholder
other_files: []
# Relative path from the spras directory
- data_dir: "datasets/rn-muscle-skeletal/processed"
+ data_dir: "datasets/rn_muscle_skeletal/processed"
diff --git a/datasets/README.md b/datasets/README.md
index a53730c9..10a41c2d 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -10,4 +10,9 @@ Many of the datasets here have been stripped of their extra post-analysis. Here,
- [`hiv`](https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking)
- [`diseases`](https://github.com/Reed-CompBio/spras-benchmarking/tree/3c0155567dbc43278531b91f9173f6d4f4486dd8/datasets/diseases)
- [`depmap`](https://github.com/Reed-CompBio/spras-benchmarking/tree/b332c0ab53868f111cb89cd4e9f485e8c19aa9e3/datasets/depmap)
-- [`yeast-osmotic-stress`](https://github.com/Reed-CompBio/spras-benchmarking/tree/8f69dcdf4a52607347fe3a962b753df396e44cda/yeast-osmotic-stress)
+- [`yeast_osmotic_stress`](https://github.com/Reed-CompBio/spras-benchmarking/tree/8f69dcdf4a52607347fe3a962b753df396e44cda/yeast_osmotic_stress)
+
+## `explore` folders
+
+To motivate certain decisions made in-code, such as `synthetic_data`'s PANTHER pathway choices, we provide scripts that use live data
+to assist in data curation. These folders can also contain exploratory CLIs for motivating e.g. magic constants.
diff --git a/datasets/__init__.py b/datasets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/datasets/contributing/raw_generation.py b/datasets/contributing/raw_generation.py
index acbca216..100cfd78 100644
--- a/datasets/contributing/raw_generation.py
+++ b/datasets/contributing/raw_generation.py
@@ -6,14 +6,17 @@
import uuid
import pandas
+
def random_id() -> str:
return uuid.uuid4().hex
+
def assign_ids(graph: networkx.DiGraph) -> networkx.DiGraph:
"""Assigns new IDs to a graph based on `random_id`"""
mapping = {node: random_id() for node in graph}
return networkx.relabel_nodes(graph, mapping)
+
def gnp_noise(graph: networkx.DiGraph, p: float):
"""
The mutative equivalent to networkx.gnp_random_graph,
@@ -23,8 +26,9 @@ def gnp_noise(graph: networkx.DiGraph, p: float):
if random.random() < p:
graph.add_edge(*e)
+
def generate_parser():
- parser = argparse.ArgumentParser(prog='Pathway generator')
+ parser = argparse.ArgumentParser(prog="Pathway generator")
parser.add_argument("--path-count", type=int, default=10)
parser.add_argument("--path-length", type=int, default=7)
@@ -39,6 +43,7 @@ def generate_parser():
parser.add_argument("--interactome-output", type=str, default="interactome.tsv")
return parser
+
def main():
args = generate_parser().parse_args()
@@ -66,13 +71,14 @@ def main():
gold_standard = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"])
# We make the gold standard output a little annoying to force some post-processing with pandas.
gold_standard.insert(1, "Interaction-Type", "pp")
- gold_standard.to_csv(args.gold_standard_output, index=False, sep='\t')
+ gold_standard.to_csv(args.gold_standard_output, index=False, sep="\t")
# and we'll follow along similarly to above to build our interactome.
graph.add_nodes_from((random_id() for _ in range(args.interactome_extra_nodes)))
gnp_noise(graph, args.interactome_noise)
interactome = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"])
- interactome.to_csv(args.interactome_output, index=False, sep='\t')
+ interactome.to_csv(args.interactome_output, index=False, sep="\t")
+
if __name__ == "__main__":
main()
diff --git a/datasets/depmap/Snakefile b/datasets/depmap/Snakefile
index 387a2622..ad9b42ea 100644
--- a/datasets/depmap/Snakefile
+++ b/datasets/depmap/Snakefile
@@ -9,15 +9,35 @@ rule all:
"raw/phosphosite-irefindex13.0-uniprot.txt"
produce_fetch_rules({
- "raw/CRISPRGeneDependency.csv": ["DepMap", "CRISPRGeneDependency.csv"],
- "raw/OmicsProfiles.csv": ["DepMap", "OmicsProfiles.csv"],
- "raw/OmicsSomaticMutationsMatrixDamaging.csv": ["DepMap", "OmicsSomaticMutationsMatrixDamaging.csv"],
- "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv": ["DepMap", "OmicsExpressionProteinCodingGenesTPMLogp1.csv"],
- "raw/OmicsCNGeneWGS.csv": ["DepMap", "OmicsCNGeneWGS.csv"],
- "raw/HUMAN_9606_idmapping.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"], uncompress=True),
- "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
- "raw/SwissProt_9606.tsv": ["UniProt", "9606", "SwissProt_9606.tsv"],
- "raw/phosphosite-irefindex13.0-uniprot.txt": ["iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"],
+ "raw/OmicsProfiles.csv": CacheItem(
+ name="DepMap omics metadata",
+ cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL",
+ pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads",
+ ),
+ "raw/CRISPRGeneDependency.csv": CacheItem(
+ name="DepMap gene dependency probability estimates",
+ cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz",
+ pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads",
+ ),
+ "raw/OmicsSomaticMutationsMatrixDamaging.csv": CacheItem(
+ name="DepMap genotyped matrix",
+ cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh",
+ pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads",
+ ),
+ "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem(
+ name="DepMap model-level TPMs",
+ cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP",
+ pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads",
+ ),
+ "raw/OmicsCNGeneWGS.csv": CacheItem(
+ name="DepMap gene-level copy number data",
+ cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub",
+ pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads",
+ ),
+ "raw/HUMAN_9606_idmapping.tsv": FetchConfig(("UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"), uncompress=True),
+ "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(("UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"), uncompress=True),
+ "raw/SwissProt_9606.tsv": ("UniProt", "9606", "SwissProt_9606.tsv"),
+ "raw/phosphosite-irefindex13.0-uniprot.txt": ("iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"),
})
rule mapping:
diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile
index aed94654..c4969aaf 100644
--- a/datasets/diseases/Snakefile
+++ b/datasets/diseases/Snakefile
@@ -9,13 +9,32 @@ rule all:
"prize_files/diabetes_mellitus_prizes.txt"
produce_fetch_rules({
- "raw/human_disease_textmining_filtered.tsv": ["DISEASES", "human_disease_textmining_filtered.tsv"],
- "raw/human_disease_knowledge_filtered.tsv": ["DISEASES", "human_disease_knowledge_filtered.tsv"],
- "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
- "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"],
- "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"],
- "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
- "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True),
+ # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their
+ # archived files directory instead.
+ "raw/tiga_gene-trait_stats.tsv": CacheItem(
+ name="TIGA data",
+ cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
+ pinned="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
+ ),
+ "raw/HumanDO.tsv": CacheItem(
+ name="Disease ontology data",
+ cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
+ # DiseaseOntology is a decently updating repository! We leave it unpinned
+ unpinned="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/refs/heads/main/DOreports/HumanDO.tsv",
+ ),
+ "raw/human_disease_textmining_filtered.tsv": CacheItem(
+ name="DISEASES textmining channel",
+ cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
+ unpinned="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
+ ),
+ "raw/human_disease_knowledge_filtered.tsv": CacheItem(
+ name="DISEASES knowledge channel",
+ cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
+ unpinned="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
+ ),
+ "raw/ensg-ensp.tsv": ("BioMart", "ensg-ensp.tsv"),
+ "raw/9606.protein.links.full.txt": FetchConfig(("STRING", "9606", "9606.protein.links.full.txt.gz"), uncompress=True),
+ "raw/9606.protein.aliases.txt": FetchConfig(("STRING", "9606", "9606.protein.aliases.txt.gz"), uncompress=True),
})
rule inputs:
diff --git a/datasets/diseases/scripts/interactome.py b/datasets/diseases/scripts/interactome.py
index b0a40b6b..082d3478 100644
--- a/datasets/diseases/scripts/interactome.py
+++ b/datasets/diseases/scripts/interactome.py
@@ -3,6 +3,7 @@
diseases_path = Path(__file__).parent.parent.resolve()
+
def main():
# See /cache/directory.py for information on how this was grabbed.
# 9606 is the organism code for homo sapiens and the required background interactome of DISEASES.
@@ -15,5 +16,6 @@ def main():
(diseases_path / "processed").mkdir(exist_ok=True)
string.to_csv(diseases_path / "processed" / "string_interactome.tsv", sep="\t", index=False, header=False)
+
if __name__ == "__main__":
main()
diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile
index f9af5435..79e856b8 100644
--- a/datasets/egfr/Snakefile
+++ b/datasets/egfr/Snakefile
@@ -13,11 +13,24 @@ rule all:
"processed/interactome.tsv",
produce_fetch_rules({
- "raw/eight-egfr-reference-all.txt": ["EGFR", "eight-egfr-reference-all.txt"],
- "raw/egfr-prizes.txt": ["EGFR", "egfr-prizes.txt"],
- "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
- "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
- "processed/phosphosite-irefindex13.0-uniprot.txt": ["iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"]
+ "raw/9606.protein.links.full.txt": FetchConfig(("STRING", "9606", "9606.protein.links.full.txt.gz"), uncompress=True),
+ "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(("UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"), uncompress=True),
+ "processed/phosphosite-irefindex13.0-uniprot.txt": ("iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"),
+
+ # EGFR-specific files.
+ # The following files are from https://github.com/gitter-lab/tps.
+ # While the following files do point to the repository's main branch,
+ # they aren't expected to actually change.
+ "raw/eight-egfr-reference-all.txt": FetchConfig(CacheItem(
+ name="EGFR Gold Standard Reference",
+ pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt",
+ cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw",
+ )),
+ "raw/egfr-prizes.txt": FetchConfig(CacheItem(
+ name="EGFR prizes",
+ pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt",
+ cached="https://drive.google.com/uc?id=1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj",
+ )),
})
rule process_gold_standard:
@@ -45,4 +58,4 @@ rule map_ensembl:
output:
"processed/prizes.txt",
"processed/gold-standard-nodes.txt"
- shell: "uv run scripts/map_ensembl.py"
\ No newline at end of file
+ shell: "uv run scripts/map_ensembl.py"
diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py
index d8488012..5643b6a4 100644
--- a/datasets/egfr/scripts/map_ensembl.py
+++ b/datasets/egfr/scripts/map_ensembl.py
@@ -2,39 +2,41 @@
from pathlib import Path
from tools.mapping.ensembl_uniprot import idmapping_uniprot_mapping
-egfr_directory = Path(__file__).parent.resolve() / '..'
+egfr_directory = Path(__file__).parent.resolve() / ".."
+
def main():
# We get specifically the STRING nodes, as the mapping from UniProt overeagerly maps
string_nodes = pandas.read_csv(
- egfr_directory / 'processed' / 'interactome.tsv',
- header=None, sep='\t', names=['Interactor1', 'Interactor2', 'Weight', 'Direction'])
- interactor_series = pandas.concat([string_nodes['Interactor1'], string_nodes['Interactor2']], ignore_index=True)
+ egfr_directory / "processed" / "interactome.tsv", header=None, sep="\t", names=["Interactor1", "Interactor2", "Weight", "Direction"]
+ )
+ interactor_series = pandas.concat([string_nodes["Interactor1"], string_nodes["Interactor2"]], ignore_index=True)
# Re-read the uniprot nodes from `process_gold_standard.py`
- nodes = (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').read_text().splitlines()
+ nodes = (egfr_directory / "processed" / "gold-standard-nodes-uniprot.txt").read_text().splitlines()
# and the prizes from `process_prizes.py`
- prizes = pandas.read_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', sep='\t')
+ prizes = pandas.read_csv(egfr_directory / "processed" / "prizes-uniprot.txt", sep="\t")
# We grab our UniProt <-> ENSP mapping
- idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv')
+ idmapping_df = idmapping_uniprot_mapping(egfr_directory / "raw" / "HUMAN_9606_idmapping_selected.tsv")
# Trim it with the interactor series
idmapping_df = idmapping_df[idmapping_df["Ensembl_PRO"].isin(interactor_series)]
# and map the nodes
- idmapping_nodes_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left')
- idmapping_nodes_df = idmapping_nodes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl'])
- idmapping_nodes_df = idmapping_nodes_df[~idmapping_nodes_df['Ensembl_PRO'].isna()]
- nodes = idmapping_nodes_df['Ensembl_PRO'].astype(str).to_list()
- (egfr_directory / 'processed' / 'gold-standard-nodes.txt').write_text("\n".join(nodes))
+ idmapping_nodes_df = pandas.DataFrame(nodes, columns=["UniProtKB-ID"]).merge(idmapping_df, on="UniProtKB-ID", how="left")
+ idmapping_nodes_df = idmapping_nodes_df.drop(columns=["UniProtKB-ID", "UniProtKB-AC", "Ensembl"])
+ idmapping_nodes_df = idmapping_nodes_df[~idmapping_nodes_df["Ensembl_PRO"].isna()]
+ nodes = idmapping_nodes_df["Ensembl_PRO"].astype(str).to_list()
+ (egfr_directory / "processed" / "gold-standard-nodes.txt").write_text("\n".join(nodes))
# and the prizes
- idmapping_prizes_df = prizes.merge(idmapping_df, left_on='NODEID', right_on="UniProtKB-ID", how='inner')
- idmapping_prizes_df = idmapping_prizes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl', 'NODEID'])
- idmapping_prizes_df = idmapping_prizes_df[~idmapping_prizes_df['Ensembl_PRO'].isna()]
- idmapping_prizes_df = idmapping_prizes_df.rename(columns={'Ensembl_PRO': 'NODEID'})
+ idmapping_prizes_df = prizes.merge(idmapping_df, left_on="NODEID", right_on="UniProtKB-ID", how="inner")
+ idmapping_prizes_df = idmapping_prizes_df.drop(columns=["UniProtKB-ID", "UniProtKB-AC", "Ensembl", "NODEID"])
+ idmapping_prizes_df = idmapping_prizes_df[~idmapping_prizes_df["Ensembl_PRO"].isna()]
+ idmapping_prizes_df = idmapping_prizes_df.rename(columns={"Ensembl_PRO": "NODEID"})
idmapping_prizes_df = idmapping_prizes_df[["NODEID", "prize", "active", "dummy", "source"]]
- idmapping_prizes_df.to_csv(egfr_directory / 'processed' / 'prizes.txt', sep='\t', index=False)
+ idmapping_prizes_df.to_csv(egfr_directory / "processed" / "prizes.txt", sep="\t", index=False)
+
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff --git a/datasets/egfr/scripts/process_gold_standard.py b/datasets/egfr/scripts/process_gold_standard.py
index 6f5c8997..445b9262 100644
--- a/datasets/egfr/scripts/process_gold_standard.py
+++ b/datasets/egfr/scripts/process_gold_standard.py
@@ -1,14 +1,16 @@
from pathlib import Path
-egfr_directory = Path(__file__).parent.resolve() / '..'
+egfr_directory = Path(__file__).parent.resolve() / ".."
+
def main():
# First, we remove all PSUEDONODES (and any duplicates)
- nodes = (egfr_directory / 'raw' / 'eight-egfr-reference-all.txt').read_text().splitlines()
+ nodes = (egfr_directory / "raw" / "eight-egfr-reference-all.txt").read_text().splitlines()
nodes = list(set([node for node in nodes if not node.endswith("_PSEUDONODE")]))
- (egfr_directory / 'processed').mkdir(exist_ok=True)
- (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').write_text("\n".join(nodes))
+ (egfr_directory / "processed").mkdir(exist_ok=True)
+ (egfr_directory / "processed" / "gold-standard-nodes-uniprot.txt").write_text("\n".join(nodes))
+
if __name__ == "__main__":
main()
diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py
index 3e1c8cd2..4019f0ad 100644
--- a/datasets/egfr/scripts/process_interactome.py
+++ b/datasets/egfr/scripts/process_interactome.py
@@ -1,18 +1,20 @@
from pathlib import Path
import pandas
-egfr_directory = Path(__file__).parent.resolve() / '..'
+egfr_directory = Path(__file__).parent.resolve() / ".."
+
def main():
- interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.full.txt', sep=' ')
- interactome_df['protein1'] = interactome_df['protein1'].astype(str).str.removeprefix("9606.")
- interactome_df['protein2'] = interactome_df['protein2'].astype(str).str.removeprefix("9606.")
+ interactome_df = pandas.read_csv(egfr_directory / "raw" / "9606.protein.links.full.txt", sep=" ")
+ interactome_df["protein1"] = interactome_df["protein1"].astype(str).str.removeprefix("9606.")
+ interactome_df["protein2"] = interactome_df["protein2"].astype(str).str.removeprefix("9606.")
# Since this is links.full vs links, we need to restrict to a subset of headers before saving the interactome.
interactome_df = interactome_df[["protein1", "protein2", "combined_score"]]
- interactome_df['Direction'] = 'U'
+ interactome_df["Direction"] = "U"
+
+ (egfr_directory / "processed").mkdir(exist_ok=True)
+ interactome_df.to_csv(egfr_directory / "processed" / "interactome.tsv", index=False, header=False, sep="\t")
- (egfr_directory / 'processed').mkdir(exist_ok=True)
- interactome_df.to_csv(egfr_directory / 'processed' / 'interactome.tsv', index=False, header=False, sep='\t')
if __name__ == "__main__":
main()
diff --git a/datasets/egfr/scripts/process_prizes.py b/datasets/egfr/scripts/process_prizes.py
index 8763ac58..1c94c215 100644
--- a/datasets/egfr/scripts/process_prizes.py
+++ b/datasets/egfr/scripts/process_prizes.py
@@ -1,27 +1,20 @@
import pandas
from pathlib import Path
-egfr_directory = Path(__file__).parent.resolve() / '..'
+egfr_directory = Path(__file__).parent.resolve() / ".."
+
def main():
- prizes = pandas.read_csv(
- egfr_directory / 'raw' / 'egfr-prizes.txt', sep='\t',
- header=None, names=['NODEID', 'prize']
- )
- prizes = prizes.loc[~prizes['NODEID'].str.endswith('_PSEUDONODE')]
+ prizes = pandas.read_csv(egfr_directory / "raw" / "egfr-prizes.txt", sep="\t", header=None, names=["NODEID", "prize"])
+ prizes = prizes.loc[~prizes["NODEID"].str.endswith("_PSEUDONODE")]
# TODO: prize: 10 is a magic value.
prizes = pandas.concat(
- [prizes, pandas.DataFrame({
- 'NODEID': ['EGF_HUMAN'],
- 'prize': [10],
- 'dummy': ['True'],
- 'source': ['True']
- })],
- ignore_index=True
+ [prizes, pandas.DataFrame({"NODEID": ["EGF_HUMAN"], "prize": [10], "dummy": ["True"], "source": ["True"]})], ignore_index=True
)
- prizes['active'] = 'True'
+ prizes["active"] = "True"
+
+ prizes.to_csv(egfr_directory / "processed" / "prizes-uniprot.txt", index=False, sep="\t")
- prizes.to_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', index=False, sep='\t')
if __name__ == "__main__":
main()
diff --git a/datasets/hiv/README.md b/datasets/hiv/README.md
index 3eb6e998..d07668b3 100644
--- a/datasets/hiv/README.md
+++ b/datasets/hiv/README.md
@@ -13,7 +13,6 @@ Follow the `Snakemake` directive to find the fetched URLs for these.
- `prize_05.tsv`: Prizes files from HIV expressing Jurkat cells grown for 5 minutes, from the original paper above.
- `prize_060.tsv`: Prizes files from growing for 60 minutes.
-- `ko03250.xml`: KEGG Orthology Pathway ID 03250 (currently unused - was used previously for an attempt at gold standard generation.)
- `HUMAN_9606_idmapping.tsv`: File provided by UniProt, used for mapping UniProt identifiers for `name_mapping.py`.
- `phosphosite-irefindex13.0-uniprot.txt`: The background interactome from the now-gone iRefIndex.
diff --git a/datasets/hiv/Snakefile b/datasets/hiv/Snakefile
index 1320b3f7..0b11e174 100644
--- a/datasets/hiv/Snakefile
+++ b/datasets/hiv/Snakefile
@@ -7,11 +7,21 @@ rule all:
"raw/phosphosite-irefindex13.0-uniprot.txt"
produce_fetch_rules({
- "raw/prize_05.tsv": ["HIV1", "prize_05.tsv"],
- "raw/prize_060.tsv": ["HIV1", "prize_060.tsv"],
- "raw/ko03250.xml": ["KEGG", "ko03250.xml"],
- "raw/phosphosite-irefindex13.0-uniprot.txt": ["iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"],
- "raw/HUMAN_9606_idmapping.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"], uncompress=True),
+ # The following files are from https://github.com/gitter-lab/hiv1-aurkb.
+ # While the following files do point to the repository's main branch,
+ # they aren't expected to actually change.
+ "raw/prize_05.tsv": FetchConfig(CacheItem(
+ name="HIV_05 prizes",
+ cached="https://drive.google.com/uc?id=1jVWNRPfYkbqimO44GdzXYB3-7NXhet1m",
+ pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_05.csv",
+ )),
+ "raw/prize_060.tsv": FetchConfig(CacheItem(
+ name="HIV_060 prizes",
+ cached="https://drive.google.com/uc?id=1Aucgp7pcooGr9oT4m2bvYEuYW6186WxQ",
+ pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_060.csv",
+ )),
+ "raw/phosphosite-irefindex13.0-uniprot.txt": ("iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"),
+ "raw/HUMAN_9606_idmapping.tsv": FetchConfig(("UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"), uncompress=True),
})
rule data_prep:
diff --git a/datasets/hiv/scripts/name_mapping.py b/datasets/hiv/scripts/name_mapping.py
index da78a31e..73482620 100644
--- a/datasets/hiv/scripts/name_mapping.py
+++ b/datasets/hiv/scripts/name_mapping.py
@@ -7,17 +7,20 @@
hiv_directory = Path(__file__).parent.resolve().parent
+
def main():
# See prepare.py for the `node_ids` generation: this is the deduplicated list of node IDs
# from the two prize files in `raw`.
node_ids = (hiv_directory / "intermediate" / "node_set.txt").read_text().split("\n")
- idmapping = pandas.read_csv(hiv_directory / "raw" / "HUMAN_9606_idmapping.tsv",
- sep='\t', header=None, names=["UniProtKB", "Type", "UniProtKB-ID"])
+ idmapping = pandas.read_csv(
+ hiv_directory / "raw" / "HUMAN_9606_idmapping.tsv", sep="\t", header=None, names=["UniProtKB", "Type", "UniProtKB-ID"]
+ )
idmapping = idmapping[idmapping["Type"] == "UniProtKB-ID"]
idmapping = idmapping.drop(columns="Type")
idmapping = idmapping[idmapping["UniProtKB"].isin(node_ids)]
- idmapping.to_csv(hiv_directory / "intermediate" / "mapping.tsv", index=False, sep='\t')
+ idmapping.to_csv(hiv_directory / "intermediate" / "mapping.tsv", index=False, sep="\t")
+
if __name__ == "__main__":
main()
diff --git a/datasets/hiv/scripts/prepare.py b/datasets/hiv/scripts/prepare.py
index 2dee00d0..a93c8435 100644
--- a/datasets/hiv/scripts/prepare.py
+++ b/datasets/hiv/scripts/prepare.py
@@ -3,6 +3,7 @@
hiv_path = Path(__file__).parent.resolve().parent
+
def process_prizes(prizes: pandas.DataFrame):
# Some proteins in the original prize files have the syntax `majorIdentifier-N` where N denotes isoforms.
# We don't particurarly care about any particular isoform when doing pathway reconstruction,
@@ -15,6 +16,7 @@ def process_prizes(prizes: pandas.DataFrame):
return prizes
+
def main():
# Follow `Snakefile` or the README for information about these two files.
prize_05 = process_prizes(pandas.read_csv(hiv_path / "raw" / "prize_05.tsv", sep="\t"))
@@ -27,10 +29,10 @@ def main():
# Save files to the intermediate path
intermediate_path = hiv_path / "intermediate"
intermediate_path.mkdir(exist_ok=True)
- prize_05.to_csv(intermediate_path / "prize_05.tsv", index=False, sep='\t')
- prize_060.to_csv(intermediate_path / "prize_060.tsv", index=False, sep='\t')
+ prize_05.to_csv(intermediate_path / "prize_05.tsv", index=False, sep="\t")
+ prize_060.to_csv(intermediate_path / "prize_060.tsv", index=False, sep="\t")
(intermediate_path / "node_set.txt").write_text("\n".join(node_set))
-if __name__ == '__main__':
- main()
+if __name__ == "__main__":
+ main()
diff --git a/datasets/hiv/scripts/spras_formatting.py b/datasets/hiv/scripts/spras_formatting.py
index d2542e09..3ec7aaf4 100644
--- a/datasets/hiv/scripts/spras_formatting.py
+++ b/datasets/hiv/scripts/spras_formatting.py
@@ -3,30 +3,33 @@
hiv_directory = Path(__file__).parent.resolve().parent
+
def format(prizes: pandas.DataFrame, uniprot_mapping: dict[str, str]):
prizes["Uniprot"] = prizes["Uniprot"].apply(lambda x: uniprot_mapping.get(x))
# We also filter for proteins whose UniProtKB accession numbers no longer exist
# (usually for being wrongly predicted). Older versions of the UniProtKB mapping can be used
# to preserve these invalid protein codes.
- prizes = prizes[prizes['Uniprot'].notnull()]
+ prizes = prizes[prizes["Uniprot"].notnull()]
# Format with SPRAS column names
prizes.columns = ["NODEID", "prize"]
return prizes
+
def main():
# See name_mapping.py for the origins of mapping.tsv
- mapping = pandas.read_csv(hiv_directory / 'intermediate' / 'mapping.tsv', sep='\t')
+ mapping = pandas.read_csv(hiv_directory / "intermediate" / "mapping.tsv", sep="\t")
uniprot_mapping = dict(zip(mapping["UniProtKB"], mapping["UniProtKB-ID"]))
# See prepare.py for the origins of these files.
- prize_05 = format(pandas.read_csv(hiv_directory / "intermediate" / "prize_05.tsv", sep='\t'), uniprot_mapping)
- prize_060 = format(pandas.read_csv(hiv_directory / "intermediate" / "prize_060.tsv", sep='\t'), uniprot_mapping)
+ prize_05 = format(pandas.read_csv(hiv_directory / "intermediate" / "prize_05.tsv", sep="\t"), uniprot_mapping)
+ prize_060 = format(pandas.read_csv(hiv_directory / "intermediate" / "prize_060.tsv", sep="\t"), uniprot_mapping)
prize_05.to_csv(hiv_directory / "processed" / "processed_prizes_05.txt", sep="\t", header=True, index=False)
prize_060.to_csv(hiv_directory / "processed" / "processed_prizes_060.txt", sep="\t", header=True, index=False)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/datasets/rn-muscle-skeletal/.gitignore b/datasets/rn_muscle_skeletal/.gitignore
similarity index 100%
rename from datasets/rn-muscle-skeletal/.gitignore
rename to datasets/rn_muscle_skeletal/.gitignore
diff --git a/datasets/rn-muscle-skeletal/README.md b/datasets/rn_muscle_skeletal/README.md
similarity index 100%
rename from datasets/rn-muscle-skeletal/README.md
rename to datasets/rn_muscle_skeletal/README.md
diff --git a/datasets/rn-muscle-skeletal/Snakefile b/datasets/rn_muscle_skeletal/Snakefile
similarity index 100%
rename from datasets/rn-muscle-skeletal/Snakefile
rename to datasets/rn_muscle_skeletal/Snakefile
diff --git a/datasets/rn-muscle-skeletal/curated/sources.txt b/datasets/rn_muscle_skeletal/curated/sources.txt
similarity index 100%
rename from datasets/rn-muscle-skeletal/curated/sources.txt
rename to datasets/rn_muscle_skeletal/curated/sources.txt
diff --git a/datasets/rn-muscle-skeletal/curated/targets.txt b/datasets/rn_muscle_skeletal/curated/targets.txt
similarity index 100%
rename from datasets/rn-muscle-skeletal/curated/targets.txt
rename to datasets/rn_muscle_skeletal/curated/targets.txt
diff --git a/datasets/rn-muscle-skeletal/process.py b/datasets/rn_muscle_skeletal/process.py
similarity index 100%
rename from datasets/rn-muscle-skeletal/process.py
rename to datasets/rn_muscle_skeletal/process.py
diff --git a/datasets/rn-muscle-skeletal/reproduction/raw/ResponseNetNetwork.json b/datasets/rn_muscle_skeletal/reproduction/raw/ResponseNetNetwork.json
similarity index 100%
rename from datasets/rn-muscle-skeletal/reproduction/raw/ResponseNetNetwork.json
rename to datasets/rn_muscle_skeletal/reproduction/raw/ResponseNetNetwork.json
diff --git a/datasets/synthetic_data/.gitignore b/datasets/synthetic_data/.gitignore
new file mode 100644
index 00000000..fc6eda0f
--- /dev/null
+++ b/datasets/synthetic_data/.gitignore
@@ -0,0 +1,4 @@
+/intermediate
+/processed
+/raw
+/thresholded
diff --git a/datasets/synthetic_data/README.md b/datasets/synthetic_data/README.md
new file mode 100644
index 00000000..b0ab7882
--- /dev/null
+++ b/datasets/synthetic_data/README.md
@@ -0,0 +1,40 @@
+# Synthetic Data
+
+_Synthetic Data_ is a generic dataset label for a class of synthetic pathways provided by [PathwayCommons](https://www.pathwaycommons.org/).
+Currently, we only use [PANTHER](https://pantherdb.org/) pathways from PathwayCommons, specifically enumerated in `./pathways.jsonc`.
+
+This entire workflow can also be done with `uv run snakemake --cores 1` inside this directory, as like any other dataset.
+
+## Workflow
+
+The workflow follows these steps in order:
+
+## PANTHER Pathway Fetching
+
+PANTHER pathways are fetched from a singular OWL file containing a bundled collection of all pathways. Since the OWL file that
+PathwayCommons provides is over 10 GB, we have a separate Snakemake workflow, located under `./panther_pathways`, that trims down the OWL file
+to only contain pathways from PANTHER.
+
+Inside `scripts/fetch_pathway.py`, we use this intermediately-generated (and cached!) OWL file to individually generate associated OWL and
+SIF files for each pathway.
+
+We include `./util/parse_pc_pathways.py`, which takes the `pathways.txt` provided by PathwayCommons and allows us to map the
+human-readable pathway names into [identifiers.org](https://identifiers.org/) identifiers, which we later trim down
+with our provided list of pathway names in `pathways.jsonc` using `list_curated_pathways.py`.
+
+## SIF Pathway Processing
+
+The scripts `process_panther_pathway.py` and `panther_spras_formatting` convert pathways from the fetching step into ones usable by SPRAS, using
+external data:
+- [Sources](http://wlab.ethz.ch/surfaceome/), or `table_S3_surfaceome.xlsx`, (see [original paper](https://doi.org/10.1073/pnas.1808790115))
+are in silico human surfaceome receptors.
+- [Targets](https://guolab.wchscu.cn/AnimalTFDB4//#/), or `Homo_sapiens_TF.tsv`, (see [original paper](https://doi.org/10.1093/nar/gkac907))
+are human transcription factors. We map these to UniProt in `map_transcription_factors.py`.
+
+## Interactome Generation
+
+`interactome.py` uses STRING and UniProt data to produce a UniProt-based interactome.
+
+## Thresholding
+
+Using the interactome and processed pathway files, we threshold pathways. TODO write more about this.
diff --git a/datasets/synthetic_data/Snakefile b/datasets/synthetic_data/Snakefile
new file mode 100644
index 00000000..c9522285
--- /dev/null
+++ b/datasets/synthetic_data/Snakefile
@@ -0,0 +1,131 @@
+include: "../../cache/Snakefile"
+from jsonc_parser.parser import JsoncParser
+import urllib.parse
+
+def make_file_safe(input_str: str) -> str:
+ return urllib.parse.quote(input_str, safe='')
+
+pathways = JsoncParser.parse_file("pathways.jsonc")
+file_compatible_pathways = list(map(make_file_safe, pathways))
+
+# TODO: deduplicate from sampling.py
+thresholds = list(map(str, map(lambda x: (x + 1) / 10, range(10))))
+
+rule all:
+ input:
+ "raw/9606.protein.links.full.v12.0.txt",
+ expand([
+ "thresholded/{threshold}/{pathway}/interactome.txt",
+ "thresholded/{threshold}/{pathway}/gold_standard_edges.txt",
+ ], pathway=file_compatible_pathways, threshold=thresholds)
+
+produce_fetch_rules({
+ "raw/9606.protein.links.full.v12.0.txt": FetchConfig(("STRING", "9606", "9606.protein.links.full.txt.gz"), uncompress=True),
+ "raw/9606.protein.aliases.txt": FetchConfig(("STRING", "9606", "9606.protein.aliases.txt.gz"), uncompress=True),
+
+ "raw/human-interactome/table_S3_surfaceome.xlsx": CacheItem(
+ name="Human surfaceome",
+ unpinned="http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
+ cached="https://docs.google.com/uc?id=1cBXYbDnAJVet0lv3BRrizV5FuqfMbBr0",
+ ),
+ "raw/human-interactome/Homo_sapiens_TF.tsv": CacheItem(
+ name="Human transcription factors",
+ # This server has anti-bot protection, so to respect their wishes, we don't download from the server.
+ # The original URL is https://guolab.wchscu.cn/AnimalTFDB4_static/download/TF_list_final/Homo_sapiens_TF,
+ # which is accessible from https://guolab.wchscu.cn/AnimalTFDB4//#/Download -> Homo sapiens
+ # (also under the Internet Archive as of Feb 2nd, 2026. If the original artifact disappears, the drive link below should suffice.)
+ cached="https://drive.google.com/uc?id=1fVi18GpudUlquRPHgUJl3H1jy54gO-uz",
+ ),
+ "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv": FetchConfig(("UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"), uncompress=True),
+ # See ./panther_pathways for how this file was generated.
+ "raw/pc-panther-biopax.owl": CacheItem(
+ name="PathwayCommons PANTHER-only BioPAX file",
+ cached="https://drive.google.com/uc?id=1MklrD8CJ1BIjh_wWr_g5rrIJ5XJB7FUI"
+ ),
+ "raw/denylist.txt": CacheItem(
+ name="PathwayCommons small molecule denylist",
+ cached="https://drive.google.com/uc?id=1QmISJXPvVljA8oKuNYRUNbJJvZKPa_-u",
+ pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/blacklist.txt",
+ ),
+ "raw/pathways.txt": FetchConfig(("PathwayCommons", "pathways.txt.gz"), uncompress=True)
+})
+
+rule interactome:
+ input:
+ "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv",
+ "raw/9606.protein.links.full.v12.0.txt",
+ "raw/9606.protein.aliases.txt"
+ output: "processed/interactome.tsv"
+ shell:
+ "uv run scripts/interactome.py"
+
+rule process_tfs:
+ input:
+ "raw/human-interactome/Homo_sapiens_TF.tsv",
+ "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv"
+ output:
+ "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
+ shell:
+ "uv run scripts/map_transcription_factors.py"
+
+rule process_panther_pathway:
+ input:
+ "intermediate/pathway-pc-data/{pathway}.sif",
+ "raw/human-interactome/table_S3_surfaceome.xlsx",
+ "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
+ output:
+ "intermediate/{pathway}/edges.txt",
+ "intermediate/{pathway}/nodes.txt",
+ "intermediate/{pathway}/sources.txt",
+ "intermediate/{pathway}/targets.txt",
+ "intermediate/{pathway}/prizes.txt"
+ shell:
+ 'uv run scripts/process_panther_pathway.py "{wildcards.pathway}"'
+
+rule make_spras_compatible:
+ input:
+ # We use the interactome for trimming
+ "processed/interactome.tsv",
+ "intermediate/{pathway}/edges.txt",
+ "intermediate/{pathway}/nodes.txt",
+ "intermediate/{pathway}/sources.txt",
+ "intermediate/{pathway}/targets.txt",
+ "intermediate/{pathway}/prizes.txt"
+ output:
+ "processed/pathways/{pathway}/node_prizes.txt",
+ "processed/pathways/{pathway}/gs_edges.txt",
+ "processed/pathways/{pathway}/gs_nodes.txt"
+ shell:
+ 'uv run scripts/panther_spras_formatting.py "{wildcards.pathway}"'
+
+rule threshold:
+ input:
+ "processed/pathways/{pathway}/node_prizes.txt",
+ "processed/pathways/{pathway}/gs_edges.txt",
+ "processed/interactome.tsv"
+ output:
+ expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds),
+ expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds)
+ shell:
+ 'uv run scripts/sampling.py "{wildcards.pathway}" --percentage_thresholding_multiplier 0.9'
+
+rule make_pathway_map:
+ input:
+ "raw/pathways.txt"
+ output:
+ "intermediate/curated_pathways_id_mapping.json"
+ shell:
+ "uv run scripts/list_curated_pathways.py"
+
+rule process_pathways:
+ input:
+ "intermediate/curated_pathways_id_mapping.json",
+ "raw/pc-panther-biopax.owl"
+ params:
+ # A little trick from https://stackoverflow.com/a/71327709/7589775
+ pathway=lambda wildcards: wildcards.get("pathway")
+ output:
+ "intermediate/pathway-pc-data/{pathway}.owl",
+ "intermediate/pathway-pc-data/{pathway}.sif"
+ shell:
+ 'uv run scripts/fetch_pathway.py "{params.pathway}"'
diff --git a/datasets/synthetic_data/__init__.py b/datasets/synthetic_data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/datasets/synthetic_data/explore/.gitignore b/datasets/synthetic_data/explore/.gitignore
new file mode 100644
index 00000000..ac8367c6
--- /dev/null
+++ b/datasets/synthetic_data/explore/.gitignore
@@ -0,0 +1 @@
+/full_stats.tsv
diff --git a/datasets/synthetic_data/explore/pathway_statistics.py b/datasets/synthetic_data/explore/pathway_statistics.py
new file mode 100644
index 00000000..2f7ceb23
--- /dev/null
+++ b/datasets/synthetic_data/explore/pathway_statistics.py
@@ -0,0 +1,68 @@
+"""
+Reports on pathway statistics located under `processed`.
+"""
+
+from pathlib import Path
+import networkx
+import pandas
+import urllib.parse
+
+from tools.sample import find_connected_sources_targets
+
+
+# From SPRAS. TODO: import once SPRAS uses pixi
+def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame:
+ mask = df["Direction"] == "U"
+ new_df = df[mask].copy(deep=True)
+ new_df["Interactor1"], new_df["Interactor2"] = new_df["Interactor2"], new_df["Interactor1"]
+ new_df["Direction"] = "D"
+ df.loc[mask, "Direction"] = "D"
+ df = pandas.concat([df, new_df], ignore_index=True)
+ return df
+
+
+current_directory = Path(__file__).parent.resolve()
+synthetic_directory = current_directory / ".."
+
+
+def main():
+ data_entries = []
+
+ # We identify pathways by their gold standard edges, since we have a few other files mixed in with `processed`.
+ for pathway_folder in (synthetic_directory / "processed" / "pathways").rglob("*/"):
+ gs_edges_graph = networkx.from_pandas_edgelist(
+ convert_undirected_to_directed(
+ pandas.read_csv(pathway_folder / "gs_edges.txt", sep="\t", names=["Interactor1", "Interactor2", "Rank", "Direction"])
+ ),
+ "Interactor1",
+ "Interactor2",
+ create_using=networkx.DiGraph,
+ )
+ node_prizes = pandas.read_csv(pathway_folder / "node_prizes.txt", sep="\t")
+
+ sources = list(node_prizes[node_prizes["sources"] == True]["NODEID"])
+ targets = list(node_prizes[node_prizes["targets"] == True]["NODEID"])
+
+ connected_sources_targets = find_connected_sources_targets(
+ sources,
+ targets,
+ gs_edges_graph,
+ )
+ data_entries.append(
+ (
+ urllib.parse.unquote(pathway_folder.stem),
+ len(sources),
+ len(targets),
+ (float(len(connected_sources_targets)) / float(len(sources) * len(targets))) if len(sources) * len(targets) != 0 else 0.0,
+ )
+ )
+
+ data_df = pandas.DataFrame(data_entries, columns=("Name", "Sources", "Targets", "Connected Percentage"))
+ data_df.to_csv(current_directory / "full_stats.tsv", sep="\t", index=False)
+
+ filtered_df = data_df.loc[data_df["Sources"] != 0].loc[data_df["Targets"] != 0].loc[data_df["Connected Percentage"] != 0]
+ print(filtered_df)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/panther_pathways/.gitignore b/datasets/synthetic_data/panther_pathways/.gitignore
new file mode 100644
index 00000000..525b1c26
--- /dev/null
+++ b/datasets/synthetic_data/panther_pathways/.gitignore
@@ -0,0 +1,3 @@
+/raw
+/intermediate
+/output
diff --git a/datasets/synthetic_data/panther_pathways/README.md b/datasets/synthetic_data/panther_pathways/README.md
new file mode 100644
index 00000000..ecc9ad6b
--- /dev/null
+++ b/datasets/synthetic_data/panther_pathways/README.md
@@ -0,0 +1,9 @@
+# panther_pathways
+
+PathwayCommons provides the multi-GB file `pc-biopax.owl`. We need to extract specific pathways from this file.
+PaxTools, instead of streaming this XML file, opts to load the entire file into memory. Since this is infeasible
+in any cheap CI system, we instead opt to make this a separate workflow: it takes `pc-biopax.owl`, along with
+all PANTHER pathways (TODO: this can be generalized), and generates a new OWL file that contains all PANTHER pathways.
+
+Then, instead of extracting files from the large OWL file above, we use this smaller OWL file in the `../` dataset
+where we then split pathways individually.
diff --git a/datasets/synthetic_data/panther_pathways/Snakefile b/datasets/synthetic_data/panther_pathways/Snakefile
new file mode 100644
index 00000000..f7e355a0
--- /dev/null
+++ b/datasets/synthetic_data/panther_pathways/Snakefile
@@ -0,0 +1,23 @@
+include: "../../../cache/Snakefile"
+
+rule all:
+ input:
+ "output/pc-panther-biopax.owl"
+
+produce_fetch_rules({
+ "raw/pc-biopax.owl": CacheItem(
+ name="PathwayCommons Universal BioPAX file",
+ cached="https://drive.google.com/uc?id=1R7uE2ky7fGlZThIWCOblu7iqbpC-aRr0",
+ pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/pc-biopax.owl.gz",
+ ),
+ "raw/pathways.txt": FetchConfig(["PathwayCommons", "pathways.txt.gz"], uncompress=True)
+})
+
+rule fetch_from_owl:
+ input:
+ "raw/pc-biopax.owl",
+ "raw/pathways.txt"
+ output:
+ "output/pc-panther-biopax.owl"
+ shell:
+ "uv run fetch_from_owl.py"
diff --git a/datasets/synthetic_data/panther_pathways/__init__.py b/datasets/synthetic_data/panther_pathways/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/datasets/synthetic_data/panther_pathways/fetch_from_owl.py b/datasets/synthetic_data/panther_pathways/fetch_from_owl.py
new file mode 100644
index 00000000..1cfca4a7
--- /dev/null
+++ b/datasets/synthetic_data/panther_pathways/fetch_from_owl.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+from paxtools.fetch import fetch
+from datasets.synthetic_data.util.parse_pc_pathways import parse_pc_pathways
+
+current_directory = Path(__file__).parent.resolve()
+
+
+def main():
+ pathways_df = parse_pc_pathways(current_directory / "raw" / "pathways.txt")
+ print("Fetching pathways... [This may take some time. On the author's desktop machine, it took 15 minutes.]")
+ (current_directory / "output").mkdir(exist_ok=True)
+ fetch(
+ current_directory / "raw" / "pc-biopax.owl",
+ output=(current_directory / "output" / "pc-panther-biopax.owl"),
+ uris=list(pathways_df["PATHWAY_URI"]),
+ memory=f"{2 ** (16 - 1)}m", # this is why we don't run this in CI! This is 32gb of memory.
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/pathways.jsonc b/datasets/synthetic_data/pathways.jsonc
new file mode 100644
index 00000000..2530d8ec
--- /dev/null
+++ b/datasets/synthetic_data/pathways.jsonc
@@ -0,0 +1,37 @@
+[
+ // All commented out pathways do not have enough sources, targets, or connections.
+ // To see more, uncomment all commented-out pathways and run explore/pathway_statistics.py.
+
+ // Commonly known as the "CCKR signaling map", PathwayCommons also does not map this one correctly:
+ // TODO: report to PathwayCommons: see https://apps.pathwaycommons.org/pathways?uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP06959
+ // "Gastrin_CCK2R_240212",
+ "Wnt signaling pathway",
+ "VEGF signaling pathway",
+ "Toll receptor signaling pathway",
+ "TGF-beta signaling pathway",
+ "PDGF signaling pathway",
+ "Notch signaling pathway",
+ "JAK/STAT signaling pathway",
+ "Interleukin signaling pathway",
+ "Interferon-gamma signaling pathway",
+ // "Integrin signalling pathway",
+ // "Insulin/IGF pathway-protein kinase B signaling cascade",
+ "Inflammation mediated by chemokine and cytokine signaling pathway",
+ "Hedgehog signaling pathway",
+ // "FGF signaling pathway",
+ "FAS signaling pathway",
+ // TODO: report to PathwayCommons: see https://apps.pathwaycommons.org/pathways?uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00019.
+ // We want to add the Endothelin signaling pathway, but it is currently labelled under "untitled."
+ "EGF receptor signaling pathway",
+ "Cadherin signaling pathway",
+ "Apoptosis signaling pathway",
+ // "Ras Pathway",
+ "PI3 kinase pathway",
+ "p38 MAPK pathway",
+ // "Insulin/IGF pathway-mitogen activated protein kinase kinase/MAP kinase cascade",
+ // "p53 pathway",
+ // "Hypoxia response via HIF activation",
+ // "Oxidative stress response",
+ "B cell activation"
+ // "T cell activation"
+]
diff --git a/datasets/synthetic_data/scripts/__init__.py b/datasets/synthetic_data/scripts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/datasets/synthetic_data/scripts/fetch_pathway.py b/datasets/synthetic_data/scripts/fetch_pathway.py
new file mode 100644
index 00000000..51012706
--- /dev/null
+++ b/datasets/synthetic_data/scripts/fetch_pathway.py
@@ -0,0 +1,50 @@
+import argparse
+import json
+from pathlib import Path
+
+from paxtools.fetch import fetch
+from paxtools.sif import toSIF
+import urllib.parse
+
+synthetic_directory = Path(__file__).parent.parent.resolve()
+
+
+def parser():
+ parser = argparse.ArgumentParser(prog="PANTHER pathway fetcher")
+
+ parser.add_argument("pathway_name", type=str)
+
+ return parser
+
+
+def main():
+ args = parser().parse_args()
+ curated_pathways_df = json.loads((synthetic_directory / "intermediate" / "curated_pathways_id_mapping.json").read_text())
+ associated_id = curated_pathways_df[urllib.parse.unquote(args.pathway_name)]
+
+ pathway_data_dir = synthetic_directory / "intermediate" / "pathway-pc-data"
+ pathway_data_dir.mkdir(exist_ok=True, parents=True)
+
+ fetch(
+ synthetic_directory / "raw" / "pc-panther-biopax.owl",
+ pathway_data_dir / Path(args.pathway_name).with_suffix(".owl"),
+ denylist=synthetic_directory / "raw" / "denylist.txt",
+ uris=[associated_id],
+ absolute=True,
+ )
+
+ toSIF(
+ pathway_data_dir / Path(args.pathway_name).with_suffix(".owl"),
+ pathway_data_dir / Path(args.pathway_name).with_suffix(".sif"),
+ # See the paxtools library for information about how these settings were retrieved.
+ # These are directly from PathwayCommons.
+ denylist=str(synthetic_directory / "raw" / "denylist.txt"),
+ chemDb=["chebi"],
+ seqDb=["hgnc"],
+ exclude=["NEIGHBOR_OF"],
+ extended=True,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/scripts/interactome.py b/datasets/synthetic_data/scripts/interactome.py
new file mode 100644
index 00000000..ab1494de
--- /dev/null
+++ b/datasets/synthetic_data/scripts/interactome.py
@@ -0,0 +1,68 @@
+import pandas
+from pathlib import Path
+from tools.mapping.ensembl_uniprot import idmapping_as_ensp_uniprot_mapping, idmapping_uniprot_mapping
+
+current_directory = Path(__file__).parent.resolve()
+interactome_folder = current_directory / ".." / "raw" / "human-interactome"
+
+
+def main():
+ # Convert the interactome to SPRAS format
+ print("Reading interactome...")
+ interactome_df = pandas.read_csv(
+ current_directory / ".." / "raw" / "9606.protein.links.full.v12.0.txt", sep=" ", usecols=["protein1", "protein2", "combined_score"]
+ )
+ interactome_df.columns = ["Protein1", "Protein2", "Weight"]
+
+ # We also want to representatively remove a certain percentage of elements from the interactome,
+ # to make sure our interactome downsampling preserves edge weight distributions
+ # (we don't care to preserve other major topological properties just yet.)
+ # since this file is large, we opt for streaming the interactome for removing edges instead
+
+ print("Initially processing interactome...")
+ interactome_df["Weight"] = interactome_df["Weight"].div(1000) # scores are from 1-1000: we normalize from 0-1.
+ interactome_df["Direction"] = "U"
+ print("Sorting interactome...")
+ interactome_df = interactome_df.sort_values("Weight", kind="stable")
+ interactome_df = interactome_df.reset_index(drop=True)
+
+ print("Fetching mapping data...")
+
+ # Mapping ENSP IDs to ENSG IDs through the STRING aliases file
+ string_aliases = pandas.read_csv(current_directory / ".." / "raw" / "9606.protein.aliases.txt", sep="\t", usecols=["#string_protein_id", "alias"])
+ string_aliases.columns = ["ENSG", "ENSP"]
+ string_aliases = string_aliases.drop_duplicates()
+
+ # (ENSG) idmapping -> (ENSG <-> ENSP) -> (ENSP) idmapping
+ idmapping_df = idmapping_uniprot_mapping(interactome_folder / "HUMAN_9606_idmapping_selected.tsv")
+ idmapping_df = idmapping_as_ensp_uniprot_mapping(idmapping_df)
+
+ print("Mapping interactome...")
+ # We also use astype(str) as these are read as numpy objects for convenience, but this messes with merging
+ interactome_df["Protein1"] = interactome_df["Protein1"].str.removeprefix("9606.").astype(str)
+ interactome_df["Protein2"] = interactome_df["Protein2"].str.removeprefix("9606.").astype(str)
+
+ interactome_df = (
+ interactome_df.merge(idmapping_df, left_on="Protein1", right_on="Ensembl", how="left")
+ .drop(columns=["Protein1", "Ensembl"])
+ .rename(columns={"UniProtKB-AC": "Protein1"})
+ .drop_duplicates()
+ )
+ interactome_df = (
+ interactome_df.merge(idmapping_df, left_on="Protein2", right_on="Ensembl", how="left")
+ .drop(columns=["Protein2", "Ensembl"])
+ .rename(columns={"UniProtKB-AC": "Protein2"})
+ )
+
+ interactome_df = interactome_df.dropna(subset=["Protein1", "Protein2"]).reset_index(drop=True)
+ interactome_df = interactome_df[["Protein1", "Protein2", "Weight", "Direction"]]
+
+ print("Counting weight counts...")
+ interactome_df["Weight"].value_counts(sort=False).to_csv(current_directory / ".." / "processed" / "weight-counts.tsv", sep="\t")
+
+ print("Saving interactome...")
+ interactome_df.to_csv(current_directory / ".." / "processed" / "interactome.tsv", sep="\t", header=False, index=False)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/scripts/list_curated_pathways.py b/datasets/synthetic_data/scripts/list_curated_pathways.py
new file mode 100644
index 00000000..a5892680
--- /dev/null
+++ b/datasets/synthetic_data/scripts/list_curated_pathways.py
@@ -0,0 +1,28 @@
+import json
+from pathlib import Path
+from jsonc_parser.parser import JsoncParser
+
+from datasets.synthetic_data.util.parse_pc_pathways import parse_pc_pathways
+
+synthetic_directory = Path(__file__).parent.parent.resolve()
+
+
+def main():
+ # TODO: pass as arguments
+ pathways_df = parse_pc_pathways(synthetic_directory / "raw" / "pathways.txt")
+
+ # We use the top-level pathways.jsonc, which is a hand-curated list of pathways, as it is not deterministically
+ # automatable to decide whether or not a pathway is a signaling pathway. Yet.
+ pathway_mapping: dict[str, str] = {}
+ curated_pathways = JsoncParser.parse_file(synthetic_directory / "pathways.jsonc")
+ for pathway in curated_pathways:
+ selected_pathways = pathways_df.loc[pathways_df["DISPLAY_NAME"] == pathway].reset_index(drop=True)
+ selected_pathways_count = len(selected_pathways.index)
+ if selected_pathways_count != 1:
+ raise RuntimeError(f"{pathway} references {selected_pathways_count} pathways, when we need to uniquely get one!")
+ pathway_mapping[pathway] = selected_pathways["PATHWAY_URI"].loc[0]
+ (synthetic_directory / "intermediate" / "curated_pathways_id_mapping.json").write_text(json.dumps(pathway_mapping, indent=4))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/scripts/map_transcription_factors.py b/datasets/synthetic_data/scripts/map_transcription_factors.py
new file mode 100644
index 00000000..e5ab00a9
--- /dev/null
+++ b/datasets/synthetic_data/scripts/map_transcription_factors.py
@@ -0,0 +1,21 @@
+import pandas
+from pathlib import Path
+from tools.mapping.ensembl_uniprot import idmapping_uniprot_mapping, idmapping_as_ensg_uniprot_mapping
+
+current_directory = Path(__file__).parent.resolve()
+
+interactome_folder = current_directory / ".." / "raw" / "human-interactome"
+
+
+def main():
+ tf_df = pandas.read_csv(interactome_folder / "Homo_sapiens_TF.tsv", sep="\t", header=0)
+ idmapping_selected_df = idmapping_uniprot_mapping(interactome_folder / "HUMAN_9606_idmapping_selected.tsv")
+ idmapping_selected_df = idmapping_as_ensg_uniprot_mapping(idmapping_selected_df)
+ tf_df = tf_df.merge(idmapping_selected_df, on="Ensembl", how="inner")
+ tf_df = tf_df.explode("UniProtKB-AC")
+ tf_df = tf_df.fillna("NA")
+ tf_df.to_csv(interactome_folder / "Homo_sapiens_TF_Uniprot.tsv", header=True, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/scripts/panther_spras_formatting.py b/datasets/synthetic_data/scripts/panther_spras_formatting.py
new file mode 100644
index 00000000..78661ea1
--- /dev/null
+++ b/datasets/synthetic_data/scripts/panther_spras_formatting.py
@@ -0,0 +1,111 @@
+import pandas as pd
+from pathlib import Path
+from datasets.synthetic_data.scripts.util.parser import parser
+from tools.trim import trim_data_file
+
+synthetic_directory = Path(__file__).parent.parent.resolve()
+
+processed_directory = synthetic_directory / "processed"
+intermediate_directory = synthetic_directory / "intermediate"
+
+directed = [
+ "controls-state-change-of",
+ "controls-transport-of",
+ "controls-phosphorylation-of",
+ "controls-expression-of",
+ "catalysis-precedes",
+ "consumption-controlled-by",
+ "controls-production-of",
+ "controls-transport-of-chemical",
+ "chemical-affects",
+ "used-to-produce",
+ "consumption-controled-by",
+]
+
+undirected = ["in-complex-with", "interacts-with", "neighbor-of", "reacts-with"]
+
+
+def raise_unknown_direction(dir: str):
+ raise ValueError(f"Unknown direction {dir}")
+
+
+def main():
+ pathway = Path(parser().parse_args().pathway)
+ pathway_folder = intermediate_directory / pathway
+
+ processed_directory.mkdir(exist_ok=True)
+
+ # Create our output folder within the pathway directory
+ out_folder = processed_directory / "pathways" / pathway
+ out_folder.mkdir(exist_ok=True, parents=True)
+
+ nodes_file = pathway_folder / "nodes.txt"
+ nodes_df = pd.read_csv(nodes_file, sep="\t")
+
+ # a dictionary mapping gene -> Uniprot accession ID
+ gene_to_uniprot = pd.Series(nodes_df["uniprot"].values, index=nodes_df["NODE"]).to_dict()
+
+ # nodes
+ nodes_uniprot = nodes_df[["uniprot"]]
+ nodes_uniprot.to_csv(out_folder / "gs_nodes.txt", sep="\t", index=False, header=False)
+
+ # edges
+ edges_df = pd.read_csv(pathway_folder / "edges.txt", sep="\t", header=0)
+ edges_df = edges_df.rename(columns={"NODE1": "Interactor1", "NODE2": "Interactor2"})
+ edges_df["Interactor1"] = edges_df["Interactor1"].map(gene_to_uniprot)
+ edges_df["Interactor2"] = edges_df["Interactor2"].map(gene_to_uniprot)
+ edges_df["Rank"] = 1
+ edges_df["Direction"] = edges_df["INTERACTION_TYPE"].apply(
+ lambda x: "D" if x in directed else ("U" if x in undirected else raise_unknown_direction(x))
+ )
+ edges_df = edges_df.drop(columns="INTERACTION_TYPE")
+
+ # remove duplicate rows
+ # sort by (node1 and node2) to ensure deterministic sorting
+ edges_df = edges_df.sort_values(by=["Interactor1", "Interactor2"], ascending=True, ignore_index=True)
+ undirected_mask = edges_df["Direction"] == "U"
+ min_nodes = edges_df.loc[undirected_mask, ["Interactor1", "Interactor2"]].min(axis=1)
+ max_nodes = edges_df.loc[undirected_mask, ["Interactor1", "Interactor2"]].max(axis=1)
+ edges_df.loc[undirected_mask, "Interactor1"] = min_nodes
+ edges_df.loc[undirected_mask, "Interactor2"] = max_nodes
+
+ # keep 1 directed and 1 undirected edge if both exist
+ # since rank is 1, we don't need to sort by rank.
+ edges_df = edges_df.sort_values(by=["Interactor1", "Interactor2", "Direction"], ascending=True, ignore_index=True)
+ edges_df = edges_df.drop_duplicates(keep="first", ignore_index=True)
+ # We trim the gold standard edges against the interactome
+ interactome_df = pd.read_csv(
+ processed_directory / "interactome.tsv",
+ sep="\t",
+ header=None,
+ names=["Interactor1", "Interactor2", "Weight", "Direction"],
+ dtype={"Interactor1": str, "Interactor2": str},
+ )
+ edges_df = edges_df.merge(interactome_df, how="inner", on=["Interactor1", "Interactor2"])
+ # We don't care about extraneous information provided by the interactome.
+ edges_df = edges_df.drop(columns=["Direction_y", "Weight"]).rename(columns={"Direction_x": "Direction"})
+ edges_df.to_csv(out_folder / "gs_edges.txt", sep="\t", index=False, header=False)
+
+ # prizes, targets, sources
+ prizes_file = pathway_folder / "prizes.txt"
+ prizes_df = pd.read_csv(prizes_file, sep="\t")
+
+ target_file = pathway_folder / "targets.txt"
+ target_df = pd.read_csv(target_file, sep="\t")
+
+ source_file = pathway_folder / "sources.txt"
+ source_df = pd.read_csv(source_file, sep="\t")
+
+ # final resulting df combining all the sources, targets, and prizes
+ prizes_df["sources"] = prizes_df["uniprot"].isin(source_df["uniprot"])
+ prizes_df["targets"] = prizes_df["uniprot"].isin(target_df["uniprot"])
+ prizes_df["dummy"] = ""
+ prizes_df.rename(columns={"uniprot": "NODEID", "prizes": "prize"}, inplace=True)
+ result_df = prizes_df[["NODEID", "prize", "sources", "targets", "active", "dummy"]]
+ # We trim the data file against the gold standard (which was already trimmed against the interactome)
+ data_df = trim_data_file(data_df=result_df, gold_standard_df=edges_df)
+ data_df.to_csv(out_folder / "node_prizes.txt", sep="\t", index=False, header=True)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/scripts/process_panther_pathway.py b/datasets/synthetic_data/scripts/process_panther_pathway.py
new file mode 100644
index 00000000..de62466f
--- /dev/null
+++ b/datasets/synthetic_data/scripts/process_panther_pathway.py
@@ -0,0 +1,75 @@
+import io
+import pandas as pd
+from pathlib import Path
+
+from datasets.synthetic_data.scripts.util.parser import parser
+
+synthetic_directory = Path(__file__).parent.parent.resolve()
+
+data_directory = synthetic_directory / "intermediate" / "pathway-pc-data"
+interactome_folder = synthetic_directory / "raw" / "human-interactome"
+
+
+def process_pathway(file: Path, folder: Path):
+ file_content = file.read_text()
+ # This file has two csv files stacked on top of each other.
+ # This is the header that we are looking for
+ needle = "PARTICIPANT\tPARTICIPANT_TYPE\tPARTICIPANT_NAME\tUNIFICATION_XREF\tRELATIONSHIP_XREF"
+
+ edges, nodes = file_content.split(needle)
+ # Re-add the header
+ nodes = needle + nodes
+ # https://stackoverflow.com/a/65018984/7589775 read the text
+ # as a file.
+ edges_df = pd.read_csv(io.StringIO(edges), header=0, sep="\t")
+ nodes_df = pd.read_csv(io.StringIO(nodes), header=0, sep="\t")
+
+ # First, get the relevant info from the edges
+ edges_df = edges_df[["PARTICIPANT_A", "INTERACTION_TYPE", "PARTICIPANT_B"]]
+ edges_df.columns = ["NODE1", "INTERACTION_TYPE", "NODE2"]
+ # removing ChEBI identifiers: these aren't proteins and we therefore are not interested in them.
+ edges_df = edges_df[~edges_df["NODE1"].str.startswith("chebi:")]
+ edges_df = edges_df[~edges_df["NODE2"].str.startswith("chebi:")]
+
+ # Do the same for the nodes
+ nodes_df = nodes_df[["PARTICIPANT", "UNIFICATION_XREF"]]
+ nodes_df.columns = ["NODE", "uniprot"]
+ # removing the chebi: prefix
+ nodes_df = nodes_df[~nodes_df["NODE"].str.startswith("chebi:")]
+ # and remove the uniprot: prefix
+ nodes_df["uniprot"] = nodes_df["uniprot"].str.removeprefix("uniprot:")
+
+ # Save edges and nodes
+ edges_df.to_csv(folder / "edges.txt", header=True, index=False, sep="\t")
+ nodes_df.to_csv(folder / "nodes.txt", header=True, index=False, sep="\t")
+
+ # Then, we need to get the sources and targets, save them,
+ # and mark them with 1.0 prizes:
+
+ # First, for our targets, or transcription factors
+ human_tfs = pd.read_csv(interactome_folder / "Homo_sapiens_TF_Uniprot.tsv", sep="\t")
+ human_tfs = nodes_df.merge(human_tfs, how="inner", left_on="uniprot", right_on="UniProtKB-AC")
+ human_tfs = human_tfs[["NODE", "uniprot"]]
+ human_tfs.to_csv(folder / "targets.txt", sep="\t", index=False)
+
+ # Then, for our receptors. NOTE: we skip the first row since it's empty in the XLSX, so this might break if the surfaceome authors fix this.
+ human_receptors = pd.read_excel(interactome_folder / "table_S3_surfaceome.xlsx", sheet_name="in silico surfaceome only", skiprows=1)
+ human_receptors = human_receptors[["UniProt accession", "Ensembl gene", "Membranome Almen main-class"]]
+ human_receptors = human_receptors[human_receptors["Membranome Almen main-class"] == "Receptors"]
+ human_receptors = nodes_df.merge(human_receptors, how="inner", left_on="uniprot", right_on="UniProt accession")
+ human_receptors = human_receptors[["NODE", "uniprot"]]
+ human_receptors.to_csv(folder / "sources.txt", sep="\t", index=False)
+
+ # Finally, scores
+ scores = pd.concat([human_tfs, human_receptors]).drop_duplicates()
+ scores["prizes"] = 1
+ scores["active"] = "true"
+ scores.to_csv(folder / "prizes.txt", sep="\t", index=False)
+
+
+if __name__ == "__main__":
+ pathway = parser().parse_args().pathway
+ pathway_file = data_directory / Path(pathway).with_suffix(".sif")
+ intermediate_folder = synthetic_directory / "intermediate" / pathway
+ intermediate_folder.mkdir(parents=True, exist_ok=True)
+ process_pathway(pathway_file, intermediate_folder)
diff --git a/datasets/synthetic_data/scripts/sampling.py b/datasets/synthetic_data/scripts/sampling.py
new file mode 100644
index 00000000..fc5058b8
--- /dev/null
+++ b/datasets/synthetic_data/scripts/sampling.py
@@ -0,0 +1,138 @@
+import pandas
+from pathlib import Path
+import collections
+from typing import OrderedDict, NamedTuple
+
+import urllib.parse
+from tools.sample import attempt_sample
+from tools.trim import trim_data_file
+from datasets.synthetic_data.scripts.util.parser import parser
+import random
+
+synthetic_directory = Path(__file__).parent.parent.resolve()
+
+
+# From SPRAS. TODO: import once SPRAS uses pixi
+def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame:
+ mask = df["Direction"] == "U"
+ new_df = df[mask].copy(deep=True)
+ new_df["Interactor1"], new_df["Interactor2"] = new_df["Interactor2"], new_df["Interactor1"]
+ new_df["Direction"] = "D"
+ df.loc[mask, "Direction"] = "D"
+ df = pandas.concat([df, new_df], ignore_index=True)
+ return df
+
+
+def count_weights() -> OrderedDict[int, int]:
+ """Returns an ordered map (lowest to highest weight) from the weight to the number of elements the weight has"""
+ weight_counts = pandas.read_csv(synthetic_directory / "processed" / "weight-counts.tsv", sep="\t")
+ return collections.OrderedDict(sorted({int(k * 1000): int(v) for k, v in dict(weight_counts.values).items()}.items()))
+
+
+def read_pathway(pathway_name: str) -> pandas.DataFrame:
+ """
+ Returns the directed-coerced pathway from a pathway name,
+ with columns Interactor1 -> Interactor2.
+ """
+ pathway_df = pandas.read_csv(
+ synthetic_directory / "processed" / "pathways" / pathway_name / "gs_edges.txt",
+ sep="\t",
+ names=["Interactor1", "Interactor2", "Weight", "Direction"],
+ )
+ # We consider an undirected edge to be two directed edges
+ pathway_df = convert_undirected_to_directed(pathway_df)
+ return pathway_df[["Interactor1", "Interactor2"]]
+
+
+class SourcesTargets(NamedTuple):
+ sources: list[str]
+ targets: list[str]
+
+
+def get_node_data(pathway_name: str) -> pandas.DataFrame:
+ return pandas.read_csv(
+ synthetic_directory / "processed" / "pathways" / pathway_name / "node_prizes.txt", sep="\t", usecols=["NODEID", "sources", "targets"]
+ )
+
+
+def sources_and_targets(pathway_node_prizes_df: pandas.DataFrame) -> SourcesTargets:
+ """
+ Returns the sources and targets associated with a particular pathway
+ """
+ sources: list[str] = list(pathway_node_prizes_df[pathway_node_prizes_df["sources"] == True]["NODEID"])
+ targets: list[str] = list(pathway_node_prizes_df[pathway_node_prizes_df["targets"] == True]["NODEID"])
+
+ return SourcesTargets(sources, targets)
+
+
+def main():
+ arg_parser = parser()
+ arg_parser.add_argument("--seed", help="The randomness seed to use", type=int, required=False)
+ arg_parser.add_argument("--amount", help="The amount of thresholds to use", type=int, default=10)
+ arg_parser.add_argument(
+ "--percentage_thresholding_multiplier",
+ help="The percentage multiplier to threshold by, " + "to decouple the sampling percentage from the actual required percentage of connections",
+ type=float,
+ default=1.0,
+ )
+
+ args = arg_parser.parse_args()
+ pathway_location = args.pathway
+ pathway_name = urllib.parse.unquote(pathway_location)
+ if args.seed is not None:
+ random.seed(args.seed)
+
+ print("Reading interactome...")
+ interactome_df = pandas.read_csv(
+ synthetic_directory / "processed" / "interactome.tsv",
+ header=None,
+ sep="\t",
+ names=["Interactor1", "Interactor2", "Weight", "Direction"],
+ usecols=[0, 1],
+ )
+
+ # For performance reasons (groupby is quite slow), we sample in the interactome using the pre-computed weight-counts.tsv file
+ weight_mapping = count_weights()
+
+ # Get information about the pathway
+ pathway_df = read_pathway(pathway_location)
+ node_data_df = get_node_data(pathway_location)
+ sources, targets = sources_and_targets(node_data_df)
+
+ percentages = list(map(lambda x: (x + 1) / args.amount, range(args.amount)))
+ for percentage_to_sample in percentages:
+ percentage_to_require = percentage_to_sample * args.percentage_thresholding_multiplier
+
+ output_directory = synthetic_directory / "thresholded" / str(percentage_to_sample) / pathway_location
+ output_directory.mkdir(exist_ok=True, parents=True)
+ output_interactome = output_directory / "interactome.txt"
+ output_gold_standard = output_directory / "gold_standard_edges.txt"
+
+ print(f"Sampling with {percentage_to_sample * 100:.1f}% of edges...")
+ attempt_number = 1
+ while (
+ attempt_sample(
+ pathway_name,
+ pathway_df,
+ percentage_to_sample,
+ percentage_to_require,
+ weight_mapping,
+ interactome_df,
+ sources,
+ targets,
+ output_interactome=output_interactome,
+ output_gold_standard=output_gold_standard,
+ )
+ is None
+ ):
+ attempt_number += 1
+ print(f"Attempt number {attempt_number}")
+
+ # We're done sampling:
+ (output_directory / "attempt-number.txt").write_text(str(attempt_number))
+ # we need to trim our data file as well. We do this already in process_panther_pathway, though.
+ trim_data_file(data_df=node_data_df, gold_standard_df=pathway_df).to_csv(output_directory / "node_prizes.tsv", sep="\t", index=False)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/datasets/synthetic_data/scripts/util/__init__.py b/datasets/synthetic_data/scripts/util/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/datasets/synthetic_data/scripts/util/parser.py b/datasets/synthetic_data/scripts/util/parser.py
new file mode 100644
index 00000000..41b5b2db
--- /dev/null
+++ b/datasets/synthetic_data/scripts/util/parser.py
@@ -0,0 +1,20 @@
+import argparse
+from pathlib import Path
+
+from jsonc_parser.parser import JsoncParser
+import urllib.parse
+
+synthetic_directory = Path(__file__).parent.parent.parent.resolve()
+
+
+# TODO: deduplicate from ../Snakefile
+def make_file_safe(input_str: str) -> str:
+ return urllib.parse.quote(input_str, safe="")
+
+
+def parser():
+ parser = argparse.ArgumentParser(prog="PANTHER pathway parser")
+
+ parser.add_argument("pathway", choices=list(map(make_file_safe, JsoncParser.parse_file(synthetic_directory / "pathways.jsonc"))))
+
+ return parser
diff --git a/datasets/synthetic_data/util/__init__.py b/datasets/synthetic_data/util/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/datasets/synthetic_data/util/parse_pc_pathways.py b/datasets/synthetic_data/util/parse_pc_pathways.py
new file mode 100644
index 00000000..81cc9cc4
--- /dev/null
+++ b/datasets/synthetic_data/util/parse_pc_pathways.py
@@ -0,0 +1,25 @@
+from io import StringIO
+from os import PathLike
+from pathlib import Path
+
+import pandas
+
+
+def parse_pc_pathways(pathways_txt_path: str | PathLike) -> pandas.DataFrame:
+ """
+ Parses a pathways.txt file from the PathwayCommons FTP store into a DataFrame of the sort
+ PATHWAY_URI\tDISPLAY_NAME
+
+ such that DATASOURCE is filtered by PANTHER. TODO: generalize to other datasources
+ """
+ # We have two tables: the latter actually has more data, so we use that one instead.
+ # These two tables are separated by two newlines.
+ needle = "\n\n"
+ _, text = Path(pathways_txt_path).read_text().split(needle)
+
+ pathways_df = pandas.read_csv(StringIO(text), sep="\t")
+ pathways_df = pathways_df.loc[pathways_df["DATASOURCE"] == "PANTHER"]
+ pathways_df = pathways_df.loc[pathways_df["NUM_DIRECT_COMPONENT_OR_STEP_PROCESSES"] != 0]
+ pathways_df = pathways_df.reset_index(drop=True)
+ pathways_df = pathways_df[["PATHWAY_URI", "DISPLAY_NAME"]]
+ return pathways_df
diff --git a/datasets/yeast-osmotic-stress/Snakefile b/datasets/yeast-osmotic-stress/Snakefile
deleted file mode 100644
index 0507f176..00000000
--- a/datasets/yeast-osmotic-stress/Snakefile
+++ /dev/null
@@ -1,32 +0,0 @@
-include: "../../cache/Snakefile"
-
-rule all:
- input:
- "processed/prizes1_dummies.txt",
- "processed/network1.txt"
-
-# Not all of these files are used. Most of these were used in the original yeast-osmotic-stress data processing (see ../README.md)
-produce_fetch_rules({
- "raw/yeast_pcsf_network.sif": ["OsmoticStress", "yeast_pcsf_network.sif"],
- "raw/prizes.txt": ["OsmoticStress", "prizes.txt"],
- "raw/ChasmanNetwork-DirUndir.txt": ["OsmoticStress", "ChasmanNetwork-DirUndir.txt"],
- "raw/dummy.txt": ["OsmoticStress", "dummy.txt"],
- "raw/_edgeFreq.eda": ["OsmoticStress", "_edgeFreq.eda"],
- "raw/goldStandardUnionDetailed.txt": ["OsmoticStress", "goldStandardUnionDetailed.txt"]
-})
-
-rule process_prizes:
- input:
- "raw/prizes.txt"
- output:
- "processed/prizes1_dummies.txt"
- shell:
- "uv run process_prizes.py"
-
-rule copy_network:
- input:
- "raw/ChasmanNetwork-DirUndir.txt"
- output:
- "processed/network1.txt"
- shell:
- "cp raw/ChasmanNetwork-DirUndir.txt processed/network1.txt"
diff --git a/datasets/yeast-osmotic-stress/.gitignore b/datasets/yeast_osmotic_stress/.gitignore
similarity index 100%
rename from datasets/yeast-osmotic-stress/.gitignore
rename to datasets/yeast_osmotic_stress/.gitignore
diff --git a/datasets/yeast-osmotic-stress/README.md b/datasets/yeast_osmotic_stress/README.md
similarity index 98%
rename from datasets/yeast-osmotic-stress/README.md
rename to datasets/yeast_osmotic_stress/README.md
index 7751020e..d2f35d7f 100644
--- a/datasets/yeast-osmotic-stress/README.md
+++ b/datasets/yeast_osmotic_stress/README.md
@@ -23,7 +23,7 @@ There are other raw files inside the Snakefile, but we don't use them here. We f
## Future Work
-(_Note: results are from [this `config.yaml`](https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml)_).
+(_Note: results are from [this `config.yaml`](https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast_osmotic_stress/config.yaml)_).
One huge factor in why my results may have been different than the original case study has to do with the lack of a dummy node parameter implemented in the SPRAS version of Omics Integrator 1, which allows a user to pass a file with a list of dummy nodes that the algorithm has to start its reconstructions through. This feature has since been added to SPRAS.
In the case study they ran the tuned parameters with a Beta of 1.75 and r of 0.01 (to add edge noise) and generated 1000 forests. In my case Omics integrator doesn't have a way to run multiple outputs with the same parameter combination in order to ensemble the results and look at edge frequencies. My work around was to use `np.linspace` with a range between 1 and 2 and running 250 - 1000 parameter combinations. The idea being to run parameters as close to 1.75 as possible and compare the outputs.
diff --git a/datasets/yeast_osmotic_stress/Snakefile b/datasets/yeast_osmotic_stress/Snakefile
new file mode 100644
index 00000000..3eafdfde
--- /dev/null
+++ b/datasets/yeast_osmotic_stress/Snakefile
@@ -0,0 +1,59 @@
+include: "../../cache/Snakefile"
+
+rule all:
+ input:
+ "processed/prizes1_dummies.txt",
+ "processed/network1.txt"
+
+# Not all of these files are used. Most of these were used in the original yeast_osmotic_stress data processing (see ../README.md)
+produce_fetch_rules({
+ "raw/yeast_pcsf_network.sif": CacheItem(
+ # In the paper https://doi.org/10.1016/j.celrep.2018.08.085
+ name="Case Study Edge Results, from Supplementary Data 3",
+ cached="https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h",
+ ),
+ # The following files are from https://github.com/gitter-lab/osmotic-stress.
+ # While the following files do point to the repository's main branch,
+ # they aren't expected to actually change.
+ "raw/prizes.txt": CacheItem(
+ name="Osmotic Stress Prizes",
+ pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt",
+ cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg",
+ ),
+ "raw/ChasmanNetwork-DirUndir.txt": CacheItem(
+ name="Network Input",
+ pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt",
+ cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH",
+ ),
+ "raw/dummy.txt": CacheItem(
+ name="Dummy Nodes File",
+ pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt",
+ cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU",
+ ),
+ "raw/_edgeFreq.eda": CacheItem(
+ name="Case Study Omics Integrator Edge Frequencies",
+ pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda",
+ cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR",
+ ),
+ "raw/goldStandardUnionDetailed.txt": CacheItem(
+ name="Gold Standard Reference Pathways",
+ pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt",
+ cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T",
+ ),
+})
+
+rule process_prizes:
+ input:
+ "raw/prizes.txt"
+ output:
+ "processed/prizes1_dummies.txt"
+ shell:
+ "uv run process_prizes.py"
+
+rule copy_network:
+ input:
+ "raw/ChasmanNetwork-DirUndir.txt"
+ output:
+ "processed/network1.txt"
+ shell:
+ "cp raw/ChasmanNetwork-DirUndir.txt processed/network1.txt"
diff --git a/datasets/yeast-osmotic-stress/process_prizes.py b/datasets/yeast_osmotic_stress/process_prizes.py
similarity index 100%
rename from datasets/yeast-osmotic-stress/process_prizes.py
rename to datasets/yeast_osmotic_stress/process_prizes.py
diff --git a/pyproject.toml b/pyproject.toml
index b2112572..a532ceff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,10 +6,15 @@ readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"gdown>=5.2.0",
+ "jsonc-parser>=1.1.5",
"loguru>=0.7.3",
"more-itertools>=10.7.0",
"networkx>=3.6.1",
+ "openpyxl>=3.1.5",
"pandas>=2.3.0",
+ "paxtools>=6.0.0.2",
+ "pydantic>=2.12.5",
+ "requests>=2.32.5",
]
[dependency-groups]
@@ -20,12 +25,19 @@ dev = [
[tool.ruff]
line-length = 150
+exclude = ["spras"]
+
+[tool.ruff.lint]
+extend-ignore = ["E712"]
[tool.uv]
package = true
-[tool.setuptools.packages]
-find = {namespaces = false}
+[tool.setuptools.packages.find]
+namespaces = false
+where = ["./"]
+include = ["*"]
+exclude = ["spras*"]
[build-system]
requires = ["setuptools", "wheel", "pip"]
diff --git a/run_snakemake.sh b/run_snakemake.sh
index 137fff2f..456e4c57 100755
--- a/run_snakemake.sh
+++ b/run_snakemake.sh
@@ -9,15 +9,16 @@
set -o errexit
set -o nounset
-# Forcibly use the current CWD
+# Forcibly use the CWD
cd "$(dirname "$0")"
main() {
- uv run snakemake --cores 1 -d datasets/yeast-osmotic-stress -s datasets/yeast-osmotic-stress/Snakefile
+ uv run snakemake --cores 1 -d datasets/yeast_osmotic_stress -s datasets/yeast_osmotic_stress/Snakefile
uv run snakemake --cores 1 -d datasets/hiv -s datasets/hiv/Snakefile
uv run snakemake --cores 1 -d datasets/diseases -s datasets/diseases/Snakefile
- uv run snakemake --cores 1 -d datasets/rn-muscle-skeletal -s datasets/rn-muscle-skeletal/Snakefile
+ uv run snakemake --cores 1 -d datasets/rn_muscle_skeletal -s datasets/rn_muscle_skeletal/Snakefile
uv run snakemake --cores 1 -d datasets/depmap -s datasets/depmap/Snakefile
+ uv run snakemake --cores 1 -d datasets/synthetic_data -s datasets/synthetic_data/Snakefile
uv run snakemake --cores 1 -d datasets/egfr -s datasets/egfr/Snakefile
}
diff --git a/tools/mapping/ensembl_uniprot.py b/tools/mapping/ensembl_uniprot.py
index 73b548d9..2968e710 100644
--- a/tools/mapping/ensembl_uniprot.py
+++ b/tools/mapping/ensembl_uniprot.py
@@ -13,10 +13,8 @@
ENSG or ENSP.
"""
-def handle_ensembl_list(
- idmapping_df: pandas.DataFrame,
- column_name: str
-) -> pandas.DataFrame:
+
+def handle_ensembl_list(idmapping_df: pandas.DataFrame, column_name: str) -> pandas.DataFrame:
idmapping_df = idmapping_df[idmapping_df[column_name].notnull()]
# Handle our ;-delimited list
idmapping_df[column_name] = idmapping_df[column_name].str.split("; ")
@@ -26,16 +24,15 @@ def handle_ensembl_list(
idmapping_df = idmapping_df.reset_index(drop=True)
return idmapping_df
-def idmapping_uniprot_mapping(
- path: str | os.PathLike
- ) -> pandas.DataFrame:
+
+def idmapping_uniprot_mapping(path: str | os.PathLike) -> pandas.DataFrame:
"""
Gets the UniProt mapping file (`*_idmapping_selected`) as a dataframe with columns
UniProtKB-AC: High-quality UniProt IDs
Ensembl: ENSG
Ensembl_PRO: ENSG (Ensembl Protein IDs)
"""
- # The very powerful UniProt-provided mapping file: its Ensembl mappings are a semicolon-delimeted list of Emsembl IDs containing
+ # The very powerful UniProt-provided mapping file: its Ensembl mappings are a semicolon-delimited list of Ensembl IDs containing
# attached isoforms (and not all UniProtKB-AC identifiers have those!) so we'll need to do some extra post-processing.
# This is `*_idmapping_selected`.
idmapping_selected_df = pandas.read_csv(
@@ -50,8 +47,10 @@ def idmapping_uniprot_mapping(
idmapping_selected_df = handle_ensembl_list(idmapping_selected_df, "Ensembl_PRO")
return idmapping_selected_df
+
def idmapping_as_ensg_uniprot_mapping(uniprot_mapping: pandas.DataFrame):
return uniprot_mapping.drop(columns=["Ensembl_PRO"])
+
def idmapping_as_ensp_uniprot_mapping(uniprot_mapping: pandas.DataFrame):
return uniprot_mapping.drop(columns=["Ensembl"]).rename(columns={"Ensembl_PRO": "Ensembl"})
diff --git a/tools/sample.py b/tools/sample.py
index c5c56eb9..e85e0e3f 100644
--- a/tools/sample.py
+++ b/tools/sample.py
@@ -10,10 +10,14 @@
from typing import OrderedDict, Optional
import os
+
def count_weights(weights: dict[float, int]) -> OrderedDict[float, int]:
"""
Returns an ordered map (lowest to highest weight) from the
- weight to the number of elements the weight has.
+ weight to the number of elements of that weight.
+
+ This is to preserve the weight distribution across the interactome
+ when we sample it.
The full workflow for this function should be:
```python
@@ -22,27 +26,26 @@ def count_weights(weights: dict[float, int]) -> OrderedDict[float, int]:
"""
return collections.OrderedDict(sorted({k: int(v) for k, v in weights.items()}.items()))
-def find_connected_sources_targets(
- sources: list[str],
- targets: list[str],
- graph: networkx.Graph
-) -> list[tuple[str, str]]:
+
+def find_connected_sources_targets(sources: list[str], targets: list[str], graph: networkx.DiGraph) -> list[tuple[str, str]]:
connections: list[tuple[str, str]] = []
for source, target in itertools.product(sources, targets):
if graph.has_node(source) and graph.has_node(target) and networkx.has_path(graph, source, target):
connections.append((source, target))
return connections
+
def attempt_sample(
pathway_name: str,
pathway_df: pandas.DataFrame,
- percentage: float,
+ percentage_to_sample: float,
+ percentage_to_require: float,
weight_mapping: OrderedDict[int, int],
interactome_df: pandas.DataFrame,
sources: list[str],
targets: list[str],
output_interactome: str | os.PathLike,
- output_gold_standard: str | os.PathLike
+ output_gold_standard: str | os.PathLike,
) -> Optional[list[tuple[str, str]]]:
# TODO: generalize to node prizes/actives
"""
@@ -52,36 +55,35 @@ def attempt_sample(
returning the connections between {sources} and {targets},
or None if the target percentage failed.
"""
- interactome_df = sample_interactome(interactome_df, weight_mapping, percentage)
+ sampled_interactome_df = sample_interactome(interactome_df, weight_mapping, percentage_to_sample)
print(f"Merging {pathway_name} with interactome...")
- # While we are merging this graph, we are preparing to compare the connectedness of the prev[ious] and curr[ent] (merged) graph.
- prev_graph = networkx.from_pandas_edgelist(pathway_df, source="Interactor1", target="Interactor2")
+ # While we are merging this graph, we are preparing to compare the connectedness of the prev[ious] and curr[ent] (merged) graph
+ # where the previous graph is the one before we restrict the gold standard to the sampled interactome
+ prev_graph = networkx.from_pandas_edgelist(pathway_df, source="Interactor1", target="Interactor2", create_using=networkx.DiGraph)
prev_connections = find_connected_sources_targets(sources, targets, prev_graph)
+ # and the current graph is the one restricted to the sampled interactome.
print("Checking for pathway connectedness...")
- pathway_df = pathway_df.merge(interactome_df, how="inner", on=["Interactor1", "Interactor2"])
- curr_graph = networkx.from_pandas_edgelist(pathway_df, source="Interactor1", target="Interactor2")
+ pathway_df = pathway_df.merge(sampled_interactome_df, how="inner", on=["Interactor1", "Interactor2"])
+ curr_graph = networkx.from_pandas_edgelist(pathway_df, source="Interactor1", target="Interactor2", create_using=networkx.DiGraph)
curr_connections = find_connected_sources_targets(sources, targets, curr_graph)
# We ask that at least `percentage` of the sources and targets are connected with one another.
- connection_percentage = float(len(curr_connections)) / float(len(prev_connections))
+ connection_percentage = float(len(curr_connections)) / float(len(prev_connections)) if len(prev_connections) != 0 else 0
- if percentage < connection_percentage:
- print(f"Got {connection_percentage * 100:.1f}% connections above the {percentage * 100:.1f}% threshold.")
+ if percentage_to_require <= connection_percentage:
+ print(f"Got {connection_percentage * 100:.1f}% connections above the {percentage_to_require * 100:.1f}% required percentage threshold.")
pathway_df.to_csv(output_gold_standard, sep="\t", index=False, header=False)
- interactome_df.to_csv(output_interactome, sep='\t', index=False, header=False)
+ sampled_interactome_df.to_csv(output_interactome, sep="\t", index=False, header=False)
return curr_connections
- print(f"Failed {connection_percentage * 100:.1f}% connections below the {percentage * 100:.1f}% threshold.")
+ print(f"Failed {connection_percentage * 100:.1f}% connections below the {percentage_to_require * 100:.1f}% required percentage threshold.")
return None
-def sample_interactome(
- interactome_df: pandas.DataFrame,
- weight_mapping: OrderedDict[int, int],
- percentage: float
-):
+
+def sample_interactome(interactome_df: pandas.DataFrame, weight_mapping: OrderedDict[int, int], percentage: float):
"""
- Samples an interactome with its weight_counts dictionary. (See `count_weights` for generating `weight_counts`.)
+ Samples X% of an interactome using its weight_counts dictionary. (See `count_weights` for generating `weight_counts`.)
"""
if percentage > 1:
raise RuntimeError(f"Got a percentage above 1 ({percentage})?")
@@ -91,6 +93,9 @@ def sample_interactome(
print("Creating item samples...")
full_list: list[int] = []
curr_v = 0
+ # Sampling a percentage of the edges from each weight bucket is equivalent to
+ # sampling a percentage of the full interactome such that the weight
+ # distribution is preserved, since the buckets partition edges by weight.
for k, v in weight_mapping.items():
full_list.extend(map(lambda x: x + curr_v, random.sample(range(1, v), round(percentage * v))))
curr_v += v
diff --git a/tools/trim.py b/tools/trim.py
index 64b89ca0..0561c99f 100644
--- a/tools/trim.py
+++ b/tools/trim.py
@@ -1,5 +1,6 @@
import pandas
+
def trim_data_file(data_df: pandas.DataFrame, gold_standard_df: pandas.DataFrame) -> pandas.DataFrame:
"""
Trims the associated SPRAS data file with the nodes in the gold standard file.
diff --git a/uv.lock b/uv.lock
index f3f25d5d..7568b4bc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2,6 +2,15 @@ version = 1
revision = 3
requires-python = ">=3.13"
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
+]
+
[[package]]
name = "appdirs"
version = "1.4.4"
@@ -154,6 +163,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/05/d1/8952806fbf9583004ab479d8f58a9496c3d35f6b6009ddd458bdd9978eaf/dpath-2.2.0-py3-none-any.whl", hash = "sha256:b330a375ded0a0d2ed404440f6c6a715deae5313af40bbb01c8a41d891900576", size = 17618, upload-time = "2024-06-12T22:08:01.881Z" },
]
+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
+]
+
[[package]]
name = "fastjsonschema"
version = "2.21.2"
@@ -269,6 +287,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
]
+[[package]]
+name = "jsonc-parser"
+version = "1.1.5"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/35/85/bec7d88eca76c6d014d8efc17b0a9babb0810213e974bff3b7928158216b/jsonc-parser-1.1.5.tar.gz", hash = "sha256:7126d17725b0413cd40af4297d9f6412c4181a62135e4c41cdf8f6a82c5936e6", size = 4460, upload-time = "2021-05-23T19:11:05.661Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/62/be/7d27877578c43decdc38a5fac0c7aecde2e0d2f7b50869131ef9bfd1d36e/jsonc_parser-1.1.5-py3-none-any.whl", hash = "sha256:abd1db76a4c6d1733ec7bb5340a89c49cbc878a181a1e7947ee6719eedf2c6cc", size = 5671, upload-time = "2021-05-23T19:11:04.556Z" },
+]
+
[[package]]
name = "jsonschema"
version = "4.25.1"
@@ -445,6 +472,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/9e/1652778bce745a67b5fe05adde60ed362d38eb17d919a540e813d30f6874/numpy-2.3.2-cp314-cp314t-win_arm64.whl", hash = "sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631", size = 10544226, upload-time = "2025-07-24T20:56:34.509Z" },
]
+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "et-xmlfile" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
+]
+
[[package]]
name = "packaging"
version = "25.0"
@@ -481,6 +520,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cd/d7/612123674d7b17cf345aad0a10289b2a384bff404e0463a83c4a3a59d205/pandas-2.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d2c3554bd31b731cd6490d94a28f3abb8dd770634a9e06eb6d2911b9827db370", size = 13186141, upload-time = "2025-08-21T10:28:05.377Z" },
]
+[[package]]
+name = "paxtools"
+version = "6.0.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e4/7e/bcaa11908406cdffb5eeae6797244e99d4d5111e0a4e22a7eea13607af52/paxtools-6.0.0.2.tar.gz", hash = "sha256:0a313311fa313fc222c53874dfa5641b86cbedc49a2fb5da3b120949f7b84377", size = 13691523, upload-time = "2026-02-25T08:23:37.519Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f0/e7/369d85283551f0770e16ce7054a62b4eb92107d72d2108fc563856323a67/paxtools-6.0.0.2-py3-none-any.whl", hash = "sha256:2301d374f0f5a7a72c7dcff5528ab3b0cd524c946a7246984e4221c8d36d0430", size = 13695547, upload-time = "2026-02-25T08:23:34.992Z" },
+]
+
[[package]]
name = "platformdirs"
version = "4.4.0"
@@ -530,6 +578,74 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/15/8d/a6a9d58c929a869f7f1b99b3d37b3f14ef63e2826eef581416338d686c3f/pulp-3.2.2-py3-none-any.whl", hash = "sha256:d3ca5ff11a28b3e7b2508a992d7e51f3533471d89305f0560b5fe3b6cc821043", size = 16385354, upload-time = "2025-07-29T11:42:01.829Z" },
]
+
+[[package]]
+name = "pydantic"
+version = "2.12.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "annotated-types" },
+ { name = "pydantic-core" },
+ { name = "typing-extensions" },
+ { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" },
+]
+
+[[package]]
+name = "pydantic-core"
+version = "2.41.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" },
+ { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" },
+ { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" },
+ { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" },
+ { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" },
+ { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" },
+ { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" },
+ { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" },
+ { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" },
+ { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" },
+ { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" },
+ { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" },
+ { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" },
+ { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" },
+ { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" },
+ { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" },
+ { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" },
+ { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" },
+ { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" },
+ { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" },
+ { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" },
+ { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" },
+ { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" },
+ { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" },
+ { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" },
+ { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" },
+ { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" },
+ { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" },
+]
+
[[package]]
name = "pyreadline3"
version = "3.5.4"
@@ -859,10 +975,15 @@ version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "gdown" },
+ { name = "jsonc-parser" },
{ name = "loguru" },
{ name = "more-itertools" },
{ name = "networkx" },
+ { name = "openpyxl" },
{ name = "pandas" },
+ { name = "paxtools" },
+ { name = "pydantic" },
+ { name = "requests" },
]
[package.dev-dependencies]
@@ -874,10 +995,15 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "gdown", specifier = ">=5.2.0" },
+ { name = "jsonc-parser", specifier = ">=1.1.5" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "more-itertools", specifier = ">=10.7.0" },
{ name = "networkx", specifier = ">=3.6.1" },
+ { name = "openpyxl", specifier = ">=3.1.5" },
{ name = "pandas", specifier = ">=2.3.0" },
+ { name = "paxtools", specifier = ">=6.0.0.2" },
+ { name = "pydantic", specifier = ">=2.12.5" },
+ { name = "requests", specifier = ">=2.32.5" },
]
[package.metadata.requires-dev]
@@ -934,6 +1060,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
]
+
+[[package]]
+name = "typing-inspection"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
+]
+
[[package]]
name = "tzdata"
version = "2025.2"
diff --git a/web/src/lib/outputStyle.ts b/web/src/lib/outputStyle.ts
index a65783c5..4c848d3b 100644
--- a/web/src/lib/outputStyle.ts
+++ b/web/src/lib/outputStyle.ts
@@ -41,11 +41,11 @@ const dataCategories = {
},
rn: {
name: "ResponseNet",
- directory: "rn-muscle-skeletal",
+ directory: "rn_muscle_skeletal",
},
yeast: {
name: "Yeast",
- directory: "yeast-osmotic-stress",
+ directory: "yeast_osmotic_stress",
},
egfr: {
name: "EGFR",
diff --git a/web/src/pages/[uid]/index.astro b/web/src/pages/[uid]/index.astro
index e8e3871e..9037417c 100644
--- a/web/src/pages/[uid]/index.astro
+++ b/web/src/pages/[uid]/index.astro
@@ -36,7 +36,7 @@ const output = parseOutputString(uid);
// We get the raw files associated to this specific run
const subPaths = getDataFiles().filter((path) => path.startsWith(asFolderName(output)));
-// The paramater config content
+// The parameter config content
const parametersCode = (
await import(`../../../../output/logs/parameters-${output.algorithm}-params-${output.paramsHash}.yaml?raw`)
).default;
diff --git a/web/src/pages/description.md b/web/src/pages/description.md
index 7a5a6518..fb0d2c96 100644
--- a/web/src/pages/description.md
+++ b/web/src/pages/description.md
@@ -13,7 +13,7 @@ All information provided is orchestrated through our GitHub Actions pipeline, an
## Format
Each run's slug has the type, the (dataset) category, the dataset, the
-algorithm, and the paramaters [hash](https://en.wikipedia.org/wiki/Hash_function).
+algorithm, and the parameters [hash](https://en.wikipedia.org/wiki/Hash_function).
There are also pages related to different categories of these runs: