Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
b0327a2
feat: spras_revision
tristan-f-r Jul 9, 2025
8cec738
style: fmt
tristan-f-r Jul 9, 2025
5683392
test: summary
tristan-f-r Jul 10, 2025
af90ce0
docs(test_summary): mention preprocessing motivation
tristan-f-r Jul 10, 2025
6141874
test(analysis/summary): use input from /input instead
tristan-f-r Jul 10, 2025
440a2d4
docs(test/analysis): mention dual integration testing
tristan-f-r Jul 10, 2025
d9e852b
test(analysis/summary): use test/analysis provided gold standard
tristan-f-r Jul 10, 2025
abb0eb9
style: fmt
tristan-f-r Jul 10, 2025
60185fc
chore: don't repeat docs inside analysis configs
tristan-f-r Jul 10, 2025
e6bd6a0
feat: get working with cytoscape
tristan-f-r Jul 11, 2025
f9a3081
style: fmt
tristan-f-r Jul 11, 2025
77fc3b4
test: remove nondet from analysis
tristan-f-r Jul 11, 2025
0592850
fix: get input pathways at runtime
tristan-f-r Jul 11, 2025
0b6413d
Merge branch 'umain' into hash
tristan-f-r Aug 4, 2025
1817157
fix: rm run
tristan-f-r Aug 4, 2025
c077d91
Merge branch 'main' into hash
tristan-f-r Aug 14, 2025
50f2195
fix: correct for pydantic
tristan-f-r Aug 14, 2025
d3a088b
fix: attach spras revision inside gs_values
tristan-f-r Aug 14, 2025
8e3b898
chore: drop re import
tristan-f-r Aug 14, 2025
1ada504
Merge branch 'main' into hash
tristan-f-r Aug 27, 2025
34a40ad
fix: correct tests
tristan-f-r Aug 27, 2025
5d2c6d0
Merge branch 'main' into hash
tristan-f-r Sep 9, 2025
ef15781
Merge branch 'main' into hash
tristan-f-r Sep 24, 2025
8d5019b
fix: correct Snakefile
tristan-f-r Sep 24, 2025
9949572
fix: use correct gs variable
tristan-f-r Sep 25, 2025
3cd25e8
Merge branch 'main' into hash
tristan-f-r Oct 24, 2025
0965a68
test: correct config
tristan-f-r Oct 25, 2025
a169505
fix: correct name again
tristan-f-r Oct 25, 2025
eec09f2
Merge branch 'main' into hash
tristan-f-r Jan 10, 2026
a8d71bd
test: fix files
tristan-f-r Jan 10, 2026
e12fc75
apply suggestions
tristan-f-r Jan 17, 2026
977bf5a
clean, fix: strip project_directory
tristan-f-r Jan 17, 2026
8500bcb
fix: correct equality on not SPRAS pyproject.toml
tristan-f-r Jan 17, 2026
112db39
chore: grammar
tristan-f-r Jan 17, 2026
c7262ed
chore: move attach_spras_revision out of Snakefile
tristan-f-r Jan 18, 2026
f69a0f3
Merge branch 'main' into hash
tristan-f-r Jan 31, 2026
72e30bf
fix: properly resolve merge conflict
tristan-f-r Jan 31, 2026
c71b652
fix: undo mistaken merge conflict
tristan-f-r Jan 31, 2026
6b941e0
chore: drop unnecessary self.datasets initialization
tristan-f-r Jan 31, 2026
fbf0ceb
feat: dynamic spras versioning
tristan-f-r Jan 31, 2026
edc0369
chore: error handling on setup.pu
tristan-f-r Jan 31, 2026
3a1251d
docs: note on git commit hashes
tristan-f-r Jan 31, 2026
d330d6a
chore: drop git magic
tristan-f-r Jan 31, 2026
5e31d06
feat: correctly parse RECORD
tristan-f-r Jan 31, 2026
dba2b45
style: fmt
tristan-f-r Jan 31, 2026
90b4e1f
feat: optional spras revision
tristan-f-r Feb 11, 2026
fd5a490
docs: osdf_immutable info; ci: debug
tristan-f-r Feb 11, 2026
210897b
ci: ??????
tristan-f-r Feb 11, 2026
816dd28
fix: don't use distribution files, opt for purepath
tristan-f-r Feb 11, 2026
cd78a2a
style: fmt
tristan-f-r Feb 11, 2026
b025b7d
fix: tag iff osdf immutable, correct functools.partial sig
tristan-f-r Feb 11, 2026
8ce8c31
apply suggestions
tristan-f-r Feb 14, 2026
9bbf7cf
docs: info on spras revision, change names
tristan-f-r Feb 14, 2026
9ce6241
docs: clarify confusing symbol choice
tristan-f-r Feb 14, 2026
f7cabd8
refactor: move revision out
tristan-f-r Mar 9, 2026
eddcf67
fix: spelling err
tristan-f-r Mar 9, 2026
9ab902a
docs: on editable spras installs
tristan-f-r Mar 9, 2026
4b37700
docs: design
tristan-f-r Mar 16, 2026
46fff30
docs(design): notes about record files
tristan-f-r Mar 16, 2026
39f6cbc
docs(design): flag typo
tristan-f-r Mar 16, 2026
809dfb3
Merge remote-tracking branch 'upstream/main' into hash
tristan-f-r Mar 16, 2026
0b57f8c
Merge branch 'umain' into hash
tristan-f-r Mar 16, 2026
d7bf7df
refactor(Snakefile): isolate algorithm assignment
tristan-f-r Mar 16, 2026
2799cc1
docs(design): use correct parameter name
tristan-f-r Mar 16, 2026
5250f6a
docs: osdf design clarification
tristan-f-r Mar 16, 2026
a42000e
chore(test/analysis): drop unused config settings
tristan-f-r Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, cytoscape
from spras.config.revision import detach_spras_revision
import spras.config.config as _config

# Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
Expand Down Expand Up @@ -34,7 +35,6 @@ def get_dataset(_datasets, label):
algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())

dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]

Expand Down Expand Up @@ -209,14 +209,16 @@ checkpoint prepare_input:
# Use the algorithm's generate_inputs function to load the merged dataset, extract the relevant columns,
# and write the output files specified by required_inputs
# The filename_map provides the output file path for each required input file type
filename_map = {input_type: SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', f'{input_type}.txt']) for input_type in runner.get_required_inputs(wildcards.algorithm)}
runner.prepare_inputs(wildcards.algorithm, input.dataset_file, filename_map)
algorithm = detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)
filename_map = {input_type: SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', f'{input_type}.txt']) for input_type in runner.get_required_inputs(algorithm)}
runner.prepare_inputs(algorithm, input.dataset_file, filename_map)

# Collect the prepared input files from the specified directory
# If the directory does not exist for this dataset-algorithm pair, the checkpoint will detect that
# prepare_input needs to be run and will then automatically re-rerun downstream rules like reconstruct
# If the directory does exist but some of the required input files are missing, Snakemake will not automatically
# run prepare_input
# run prepare_inputs

# It only checks for the output of prepare_input, which is a directory
# Therefore, manually remove the entire directory if any of the expected prepared input file are missing so that
# prepare_inputs is run, the directory and prepared input files are re-generated, and the reconstruct rule is run again
Expand All @@ -227,7 +229,7 @@ def collect_prepared_input(wildcards):
prepared_dir = SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs'])

# Construct the list of expected prepared input files for the reconstruction algorithm
prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}.txt',type=runner.get_required_inputs(algorithm=wildcards.algorithm))
prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}.txt',type=runner.get_required_inputs(algorithm=detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)))
# If the directory is missing, do nothing because the missing output triggers running prepare_input
if os.path.isdir(prepared_dir):
# First, check if .snakemake_timestamp, the last written file in a directory rule,
Expand Down Expand Up @@ -273,23 +275,23 @@ rule reconstruct:
# Create a copy so that the updates are not written to the parameters logfile
params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
# Declare the input files as a dictionary.
inputs = dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True))
inputs = dict(zip(runner.get_required_inputs(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)), *{input}, strict=True))
# Remove the _spras_run_name parameter added for keeping track of the run name for parameters.yml
if '_spras_run_name' in params:
params.pop('_spras_run_name')
runner.run(wildcards.algorithm, inputs, output.pathway_file, params, container_settings)
runner.run(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), inputs, output.pathway_file, params, container_settings)

# Original pathway reconstruction output to universal output
# Use PRRunner as a wrapper to call the algorithm-specific parse_output
rule parse_output:
input:
input:
raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])
run:
params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
params['dataset'] = input.dataset_file
runner.parse_output(wildcards.algorithm, input.raw_file, output.standardized_file, params)
runner.parse_output(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), input.raw_file, output.standardized_file, params)

# TODO: reuse in the future once we make summary work for mixed graphs. See https://github.com/Reed-CompBio/spras/issues/128
# Collect summary statistics for a single pathway
Expand Down
9 changes: 9 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

# If enabled, this tags all output files with a SPRAS 'revision version'.
# By default, this will be the hash of all the SPRAS files in the PyPA installation. This option will not work if SPRAS was not installed
# in a PyPA-compliant manner (PyPA-compliant installations include but are not limited to pip, poetry, uv, conda, pixi.)
# For some files, the 'SPRAS revision' may be tied to the specific format version that file is on.
#
# By default, this is disabled, as it can make output file names confusing. Here, it's set to true since we use this
# configuration file for testing.
immutable_files: true

# Collection of container options
containers:
# Specify the container framework used by each PRM wrapper. Valid options include:
Expand Down
30 changes: 30 additions & 0 deletions docs/contributing/design.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
SPRAS Designs
=============

SPRAS makes a few high-level design decisions. We motivate them here.

.. Right now, this only talks about immutable outputs. In the future, this may include, and is not limited to:
.. container-agnostic volumes, directionality, parameter tuning, and typed configs/algorithms.

Immutable Outputs
-----------------

During benchmarking runs, SPRAS data is uploaded to the `Open Science
Data Federation <https://osg-htc.org/services/osdf>`__. OSDF enforces an
immutable file structure, where files can never be deleted or rewritten.
By default, SPRAS does not have immutable files. However, in SPRAS
configurations, the ``immutable_files`` parameter can be enabled to make
files fully immutable where no file with the same file name will be
written with different data.

To do this, SPRAS tags all datasets, gold standards, and algorithms with
a version hash, which is effectively the current version of how SPRAS
processes that data in-code.

In implementation, this version hash is the hash of the `RECORD
<https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file>`__ file,
which contains hashes of all 'installed' files. When SPRAS is not installed
in development mode (i.e. without the ``--editable`` flag), the ``RECORD`` file
hashes all Python source files, leading to the desired effect that
the version hash changes when the source code changes. In development mode,
the ``RECORD`` file does not change when source code is changed.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ methods (PRMs) to omics data.
contributing/index
contributing/maintain
contributing/patching
contributing/design

.. toctree::
:maxdepth: 1
Expand Down
2 changes: 1 addition & 1 deletion docs/prms/bowtiebuilder.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
BowTieBuilder
==========
=============

BowTieBuilder is a pathway reconstruction algorithm which constructs pathways in a 'bowtie'-like
fashion, finding the intersections of shortest paths between sources and targets and using those nodes as a basis
Expand Down
2 changes: 1 addition & 1 deletion spras/analysis/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
algo_with_params: list) -> pd.DataFrame:
algo_with_params: list[str]) -> pd.DataFrame:
"""
Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
Expand Down
32 changes: 26 additions & 6 deletions spras/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"""

import copy as copy
import functools
import itertools as it
import warnings
from pathlib import Path
Expand All @@ -22,6 +23,7 @@
import yaml

from spras.config.container_schema import ProcessedContainerSettings
from spras.config.revision import attach_spras_revision, spras_revision
from spras.config.schema import DatasetSchema, RawConfig
from spras.util import LoosePathLike, NpHashEncoder, hash_params_sha1_base32

Expand Down Expand Up @@ -59,8 +61,6 @@ def __init__(self, raw_config: dict[str, Any]):
self.hash_length = parsed_raw_config.hash_length
# Container settings used by PRMs.
self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length)
# The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
self.algorithms = None
# A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations.
# Only includes algorithms that are set to be run with 'include: true'.
self.algorithm_params: dict[str, dict[str, Any]] = dict()
Expand Down Expand Up @@ -88,6 +88,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.analysis_include_ml_aggregate_algo = None
# A Boolean specifying whether to run the evaluation per algorithm analysis
self.analysis_include_evaluation_aggregate_algo = None
# Specifies whether the files should be OSDF-immutable (i.e. the file names change when the file itself changes)
self.immutable_files = parsed_raw_config.immutable_files

self.process_config(parsed_raw_config)

Expand Down Expand Up @@ -117,6 +119,12 @@ def process_datasets(self, raw_config: RawConfig):
# Currently assumes all datasets have a label and the labels are unique
# When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
# Convert to dicts to simplify the yaml logging

for dataset in raw_config.datasets:
dataset.label = attach_spras_revision(self.immutable_files, dataset.label)
for gold_standard in raw_config.gold_standards:
gold_standard.label = attach_spras_revision(self.immutable_files, gold_standard.label)

for dataset in raw_config.datasets:
label = dataset.label
if label.lower() in [key.lower() for key in self.datasets.keys()]:
Expand All @@ -130,8 +138,14 @@ def process_datasets(self, raw_config: RawConfig):
dataset_labels = set(self.datasets.keys())
gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
for label in gold_standard_dataset_labels:
if label not in dataset_labels:
if attach_spras_revision(self.immutable_files, label) not in dataset_labels:
raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
# We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above.
for key, gold_standard in self.gold_standards.items():
self.gold_standards[key]["dataset_labels"] = map(
functools.partial(attach_spras_revision, self.immutable_files),
gold_standard["dataset_labels"]
)

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
Expand All @@ -148,8 +162,10 @@ def process_algorithms(self, raw_config: RawConfig):
"""
prior_params_hashes = set()
self.algorithm_params = dict()
self.algorithms = raw_config.algorithms
for alg in self.algorithms:
# We copy raw_config.algorithms to avoid mutating the original config
# when we attach the SPRAS revision to algorithm names later.
for alg in raw_config.algorithms[:]:
alg.name = attach_spras_revision(self.immutable_files, alg.name)
if alg.include:
# This dict maps from parameter combinations hashes to parameter combination dictionaries
self.algorithm_params[alg.name] = dict()
Expand Down Expand Up @@ -187,7 +203,11 @@ def process_algorithms(self, raw_config: RawConfig):
run_dict[param] = float(value)
if isinstance(value, np.ndarray):
run_dict[param] = value.tolist()
params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
hash_run_dict = copy.deepcopy(run_dict)
if self.immutable_files:
# Incorporates the `spras_revision` into the hash
hash_run_dict["_spras_rev"] = spras_revision()
params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
if params_hash in prior_params_hashes:
raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
f'(current length {self.hash_length}).')
Expand Down
72 changes: 72 additions & 0 deletions spras/config/revision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
The revision is an optional hash associated to all files in the designated output directory
to make sure that file _names_ are immutable. We attach the revision to three labels:

- Datasets
- Gold standards
- Algorithms

In the future, the spras revision may change depending on what files are affected (e.g. specific algorithms
will have specific revisions that change as they get updated) to avoid unnecessary running in the
Reed-CompBio/spras-benchmarking repository.

This is an optional feature, as the `spras_revision` function below is dependent on a RECORD file
(described in the docstring associated with `spras_revision`.)

We provide the convenient attach_spras_revision used in ./config.py, and `detach_spras_revision` used to get
rid of the revision for algorithms specifically.
"""

import functools
import hashlib
import importlib.metadata
import sysconfig
from pathlib import Path


@functools.cache
def spras_revision() -> str:
    """
    Get the current revision of SPRAS as an 8-character hex digest.

    Note: This is not dependent on the SPRAS release version number nor the git commit, but rather solely on the PyPA RECORD file,
    (https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file), which contains
    hashes of all of the installed SPRAS files [excluding RECORD itself], and is also included in the package distribution.
    This means that, when developing SPRAS, `spras_revision` will be updated when spras is initially installed. However, for editable
    pip installs (e.g. from `pip install -e .`), the `spras_revision` will not be updated,
    as the RECORD file only contains metadata: https://setuptools.pypa.io/en/latest/userguide/development_mode.html.

    Raises RuntimeError if spras is not installed as a package, or if the package metadata
    exists but the RECORD file cannot be found (e.g. a non-PyPA-compliant installation).
    """
    record_path = None
    try:
        site_packages_path = sysconfig.get_path("purelib")  # where .dist-info is located.

        record_path = Path(
            site_packages_path,
            f"spras-{importlib.metadata.version('spras')}.dist-info",
            "RECORD"
        )
        with open(record_path, 'rb', buffering=0) as f:
            # Truncated to the magic value 8, the length of the short git revision.
            return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
    except importlib.metadata.PackageNotFoundError as err:
        raise RuntimeError('spras is not an installed pip-module: did you forget to install SPRAS as a module?') from err
    except FileNotFoundError as err:
        # The distribution metadata resolved but RECORD is missing; surface a clearer
        # error than the raw FileNotFoundError so users know the install is unsupported.
        raise RuntimeError(f'could not find the RECORD file at {record_path}: '
                           'SPRAS may not have been installed in a PyPA-compliant manner') from err


def attach_spras_revision(immutable_files: bool, label: str) -> str:
    """
    Attach the SPRAS revision to a label.
    This function signature may become more complex as specific labels get versioned.

    @param label: The label to attach the SPRAS revision to.
    @param immutable_files: if False, this function is equivalent to `id`.
    """
    if not immutable_files:
        # Immutability is off: leave the label untouched.
        return label
    # We use the `_` separator here instead of `-` as summary, analysis, and gold standard parts of the
    # Snakemake workflow process file names by splitting on hyphens to produce new jobs.
    # If this was separated with a hyphen, we would mess with that string manipulation logic.
    return f"{label}_{spras_revision()}"

def detach_spras_revision(immutable_files: bool, attached_label: str) -> str:
    """
    The inverse of `attach_spras_revision`.

    @param attached_label: A label that may carry a `_`-separated SPRAS revision suffix.
    @param immutable_files: if False, this function is equivalent to `id`.
    """
    if not immutable_files:
        return attached_label
    # `rpartition` splits on the LAST `_`, so for all b, s:
    # detach_spras_revision(b, attach_spras_revision(b, s)) == s.
    head, _sep, _revision = attached_label.rpartition("_")
    return head
9 changes: 9 additions & 0 deletions spras/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,15 @@ class ReconstructionSettings(BaseModel):

class RawConfig(BaseModel):
containers: ContainerSettings
immutable_files: bool = False
"""
If enabled, this tags all files with their local file version.
Most files do not have a specific version, and by default, this will be the hash of
all the SPRAS files in the PyPA installation. This option will not work if SPRAS was not installed
in a PyPA-compliant manner (PyPA-compliant installations include but are not limited to pip, poetry, uv, conda, pixi.)

By default, this is disabled, as it can make output file names confusing.
"""

hash_length: int = DEFAULT_HASH_LENGTH
"The length of the hash used to identify a parameter combination"
Expand Down
Loading
Loading