Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
b0327a2
feat: spras_revision
tristan-f-r Jul 9, 2025
8cec738
style: fmt
tristan-f-r Jul 9, 2025
5683392
test: summary
tristan-f-r Jul 10, 2025
af90ce0
docs(test_summary): mention preprocessing motivation
tristan-f-r Jul 10, 2025
6141874
test(analysis/summary): use input from /input instead
tristan-f-r Jul 10, 2025
440a2d4
docs(test/analysis): mention dual integration testing
tristan-f-r Jul 10, 2025
d9e852b
test(analysis/summary): use test/analysis provided gold standard
tristan-f-r Jul 10, 2025
abb0eb9
style: fmt
tristan-f-r Jul 10, 2025
60185fc
chore: don't repeat docs inside analysis configs
tristan-f-r Jul 10, 2025
e6bd6a0
feat: get working with cytoscape
tristan-f-r Jul 11, 2025
f9a3081
style: fmt
tristan-f-r Jul 11, 2025
77fc3b4
test: remove nondet from analysis
tristan-f-r Jul 11, 2025
0592850
fix: get input pathways at runtime
tristan-f-r Jul 11, 2025
0b6413d
Merge branch 'umain' into hash
tristan-f-r Aug 4, 2025
1817157
fix: rm run
tristan-f-r Aug 4, 2025
c077d91
Merge branch 'main' into hash
tristan-f-r Aug 14, 2025
50f2195
fix: correct for pydantic
tristan-f-r Aug 14, 2025
d3a088b
fix: attach spras revision inside gs_values
tristan-f-r Aug 14, 2025
8e3b898
chore: drop re import
tristan-f-r Aug 14, 2025
1ada504
Merge branch 'main' into hash
tristan-f-r Aug 27, 2025
34a40ad
fix: correct tests
tristan-f-r Aug 27, 2025
5d2c6d0
Merge branch 'main' into hash
tristan-f-r Sep 9, 2025
ef15781
Merge branch 'main' into hash
tristan-f-r Sep 24, 2025
8d5019b
fix: correct Snakefile
tristan-f-r Sep 24, 2025
9949572
fix: use correct gs variable
tristan-f-r Sep 25, 2025
3cd25e8
Merge branch 'main' into hash
tristan-f-r Oct 24, 2025
0965a68
test: correct config
tristan-f-r Oct 25, 2025
a169505
fix: correct name again
tristan-f-r Oct 25, 2025
eec09f2
Merge branch 'main' into hash
tristan-f-r Jan 10, 2026
a8d71bd
test: fix files
tristan-f-r Jan 10, 2026
e12fc75
apply suggestions
tristan-f-r Jan 17, 2026
977bf5a
clean, fix: strip project_directory
tristan-f-r Jan 17, 2026
8500bcb
fix: correct equality on not SPRAS pyproject.toml
tristan-f-r Jan 17, 2026
112db39
chore: grammar
tristan-f-r Jan 17, 2026
c7262ed
chore: move attach_spras_revision out of Snakefile
tristan-f-r Jan 18, 2026
f69a0f3
Merge branch 'main' into hash
tristan-f-r Jan 31, 2026
72e30bf
fix: properly resolve merge conflict
tristan-f-r Jan 31, 2026
c71b652
fix: undo mistaken merge conflict
tristan-f-r Jan 31, 2026
6b941e0
chore: drop unnecessary self.datasets initialization
tristan-f-r Jan 31, 2026
fbf0ceb
feat: dynamic spras versioning
tristan-f-r Jan 31, 2026
edc0369
chore: error handling on setup.pu
tristan-f-r Jan 31, 2026
3a1251d
docs: note on git commit hashes
tristan-f-r Jan 31, 2026
d330d6a
chore: drop git magic
tristan-f-r Jan 31, 2026
5e31d06
feat: correctly parse RECORD
tristan-f-r Jan 31, 2026
dba2b45
style: fmt
tristan-f-r Jan 31, 2026
90b4e1f
feat: optional spras revision
tristan-f-r Feb 11, 2026
fd5a490
docs: osdf_immutable info; ci: debug
tristan-f-r Feb 11, 2026
210897b
ci: ??????
tristan-f-r Feb 11, 2026
816dd28
fix: don't use distribution files, opt for purepath
tristan-f-r Feb 11, 2026
cd78a2a
style: fmt
tristan-f-r Feb 11, 2026
b025b7d
fix: tag iff osdf immutable, correct functools.partial sig
tristan-f-r Feb 11, 2026
8ce8c31
apply suggestions
tristan-f-r Feb 14, 2026
9bbf7cf
docs: info on spras revision, change names
tristan-f-r Feb 14, 2026
9ce6241
docs: clarify confusing symbol choice
tristan-f-r Feb 14, 2026
f7cabd8
refactor: move revision out
tristan-f-r Mar 9, 2026
eddcf67
fix: spelling err
tristan-f-r Mar 9, 2026
9ab902a
docs: on editable spras installs
tristan-f-r Mar 9, 2026
4b37700
docs: design
tristan-f-r Mar 16, 2026
46fff30
docs(design): notes about record files
tristan-f-r Mar 16, 2026
39f6cbc
docs(design): flag typo
tristan-f-r Mar 16, 2026
809dfb3
Merge remote-tracking branch 'upstream/main' into hash
tristan-f-r Mar 16, 2026
0b57f8c
Merge branch 'umain' into hash
tristan-f-r Mar 16, 2026
d7bf7df
refactor(Snakefile): isolate algorithm assignment
tristan-f-r Mar 16, 2026
2799cc1
docs(design): use correct parameter name
tristan-f-r Mar 16, 2026
5250f6a
docs: osdf design clarification
tristan-f-r Mar 16, 2026
a42000e
chore(test/analysis): drop unused config settings
tristan-f-r Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, cytoscape
from spras.config.revision import detach_spras_revision
import spras.config.config as _config

# Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
Expand Down Expand Up @@ -34,7 +35,6 @@ def get_dataset(_datasets, label):
algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())

dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]

Expand Down Expand Up @@ -209,14 +209,16 @@ checkpoint prepare_input:
# Use the algorithm's generate_inputs function to load the merged dataset, extract the relevant columns,
# and write the output files specified by required_inputs
# The filename_map provides the output file path for each required input file type
filename_map = {input_type: SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', f'{input_type}.txt']) for input_type in runner.get_required_inputs(wildcards.algorithm)}
runner.prepare_inputs(wildcards.algorithm, input.dataset_file, filename_map)
algorithm = detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)
filename_map = {input_type: SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', f'{input_type}.txt']) for input_type in runner.get_required_inputs(algorithm)}
runner.prepare_inputs(algorithm, input.dataset_file, filename_map)

# Collect the prepared input files from the specified directory
# If the directory does not exist for this dataset-algorithm pair, the checkpoint will detect that
# prepare_input needs to be run and will then automatically re-rerun downstream rules like reconstruct
# If the directory does exist but some of the required input files are missing, Snakemake will not automatically
# run prepare_input
# run prepare_inputs

# It only checks for the output of prepare_input, which is a directory
# Therefore, manually remove the entire directory if any of the expected prepared input file are missing so that
# prepare_inputs is run, the directory and prepared input files are re-generated, and the reconstruct rule is run again
Expand All @@ -227,7 +229,7 @@ def collect_prepared_input(wildcards):
prepared_dir = SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs'])

# Construct the list of expected prepared input files for the reconstruction algorithm
prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}.txt',type=runner.get_required_inputs(algorithm=wildcards.algorithm))
prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}.txt',type=runner.get_required_inputs(algorithm=detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)))
# If the directory is missing, do nothing because the missing output triggers running prepare_input
if os.path.isdir(prepared_dir):
# First, check if .snakemake_timestamp, the last written file in a directory rule,
Expand Down Expand Up @@ -273,23 +275,23 @@ rule reconstruct:
# Create a copy so that the updates are not written to the parameters logfile
params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
# Declare the input files as a dictionary.
inputs = dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True))
inputs = dict(zip(runner.get_required_inputs(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)), *{input}, strict=True))
# Remove the _spras_run_name parameter added for keeping track of the run name for parameters.yml
if '_spras_run_name' in params:
params.pop('_spras_run_name')
runner.run(wildcards.algorithm, inputs, output.pathway_file, params, container_settings)
runner.run(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), inputs, output.pathway_file, params, container_settings)

# Original pathway reconstruction output to universal output
# Use PRRunner as a wrapper to call the algorithm-specific parse_output
rule parse_output:
input:
input:
raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])
run:
params = reconstruction_params(wildcards.algorithm, wildcards.params).copy()
params['dataset'] = input.dataset_file
runner.parse_output(wildcards.algorithm, input.raw_file, output.standardized_file, params)
runner.parse_output(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), input.raw_file, output.standardized_file, params)

# TODO: reuse in the future once we make summary work for mixed graphs. See https://github.com/Reed-CompBio/spras/issues/128
# Collect summary statistics for a single pathway
Expand Down
9 changes: 9 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

# If enabled, this tags all output files with a SPRAS 'revision version'.
# By default, this will be the hash of all the SPRAS files in the PyPA installation. This option will not work if SPRAS was not installed
# in a PyPA-compliant manner (PyPA-compliant installations include but are not limited to pip, poetry, uv, conda, pixi.)
# For some files, the 'SPRAS revision' may be tied to the specific format version that file is on.
#
# By default, this is disabled, as it can make output file names confusing. Here, it's set to true since we use this
# configuration file for testing.
immutable_files: true

# Collection of container options
containers:
# Specify the container framework used by each PRM wrapper. Valid options include:
Expand Down
30 changes: 30 additions & 0 deletions docs/contributing/design.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
SPRAS Designs
=============

SPRAS makes a few high-level design decisions. We motivate them here.

.. Right now, this only talks about immutable outputs. In the future, this may include, and is not limited to:
.. container-agnostic volumes, directionality, parameter tuning, and typed configs/algorithms.

Immutable Outputs
-----------------

During benchmarking runs, SPRAS data is uploaded to the `Open Science
Data Federation <https://osg-htc.org/services/osdf>`__. OSDF enforces an
immutable file structure, where files can never be deleted or rewritten.
By default, SPRAS does not have immutable files. However, in SPRAS
configurations, the ``immutable_files`` parameter can be enabled to make
files fully immutable where no file with the same file name will be
written with different data.

To do this, SPRAS tags all datasets, gold standards, and algorithms with
a version hash, which is effectively the current version of how SPRAS
processes that data in-code.

In implementation, this version hash is the hash of the `RECORD
<https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file>`__ file,
which contains hashes of all 'installed' files. When SPRAS is not installed
in development mode (i.e. without the ``--editable`` flag), the ``RECORD`` file
hashes all Python source files, leading to the desired effect that
the version hash changes when the source code changes. In development mode,
the ``RECORD`` file does not change when source code is changed.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ methods (PRMs) to omics data.
contributing/index
contributing/maintain
contributing/patching
contributing/design

.. toctree::
:maxdepth: 1
Expand Down
2 changes: 1 addition & 1 deletion docs/prms/bowtiebuilder.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
BowTieBuilder
==========
=============

BowTieBuilder is a pathway reconstruction algorithm which constructs pathways in a 'bowtie'-like
fashion, finding the intersections of shortest paths between sources and targets and using those nodes as a basis
Expand Down
2 changes: 1 addition & 1 deletion spras/analysis/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
algo_with_params: list) -> pd.DataFrame:
algo_with_params: list[str]) -> pd.DataFrame:
"""
Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
Expand Down
32 changes: 26 additions & 6 deletions spras/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"""

import copy as copy
import functools
import itertools as it
import warnings
from pathlib import Path
Expand All @@ -22,6 +23,7 @@
import yaml

from spras.config.container_schema import ProcessedContainerSettings
from spras.config.revision import attach_spras_revision, spras_revision
from spras.config.schema import DatasetSchema, RawConfig
from spras.util import LoosePathLike, NpHashEncoder, hash_params_sha1_base32

Expand Down Expand Up @@ -59,8 +61,6 @@ def __init__(self, raw_config: dict[str, Any]):
self.hash_length = parsed_raw_config.hash_length
# Container settings used by PRMs.
self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length)
# The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key.
self.algorithms = None
# A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations.
# Only includes algorithms that are set to be run with 'include: true'.
self.algorithm_params: dict[str, dict[str, Any]] = dict()
Expand Down Expand Up @@ -88,6 +88,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.analysis_include_ml_aggregate_algo = None
# A Boolean specifying whether to run the evaluation per algorithm analysis
self.analysis_include_evaluation_aggregate_algo = None
# Specifies whether the files should be OSDF-immutable (i.e. the file names change when the file itself changes)
self.immutable_files = parsed_raw_config.immutable_files

self.process_config(parsed_raw_config)

Expand Down Expand Up @@ -117,6 +119,12 @@ def process_datasets(self, raw_config: RawConfig):
# Currently assumes all datasets have a label and the labels are unique
# When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
# Convert to dicts to simplify the yaml logging

for dataset in raw_config.datasets:
dataset.label = attach_spras_revision(self.immutable_files, dataset.label)
for gold_standard in raw_config.gold_standards:
gold_standard.label = attach_spras_revision(self.immutable_files, gold_standard.label)

for dataset in raw_config.datasets:
label = dataset.label
if label.lower() in [key.lower() for key in self.datasets.keys()]:
Expand All @@ -130,8 +138,14 @@ def process_datasets(self, raw_config: RawConfig):
dataset_labels = set(self.datasets.keys())
gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
for label in gold_standard_dataset_labels:
if label not in dataset_labels:
if attach_spras_revision(self.immutable_files, label) not in dataset_labels:
raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
# We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above.
for key, gold_standard in self.gold_standards.items():
self.gold_standards[key]["dataset_labels"] = map(
functools.partial(attach_spras_revision, self.immutable_files),
gold_standard["dataset_labels"]
)

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
Expand All @@ -148,8 +162,10 @@ def process_algorithms(self, raw_config: RawConfig):
"""
prior_params_hashes = set()
self.algorithm_params = dict()
self.algorithms = raw_config.algorithms
for alg in self.algorithms:
# We copy raw_config.algorithms to avoid mutating the original config
# when we attach the SPRAS revision to algorithm names later.
for alg in raw_config.algorithms[:]:
alg.name = attach_spras_revision(self.immutable_files, alg.name)
if alg.include:
# This dict maps from parameter combinations hashes to parameter combination dictionaries
self.algorithm_params[alg.name] = dict()
Expand Down Expand Up @@ -187,7 +203,11 @@ def process_algorithms(self, raw_config: RawConfig):
run_dict[param] = float(value)
if isinstance(value, np.ndarray):
run_dict[param] = value.tolist()
params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
hash_run_dict = copy.deepcopy(run_dict)
if self.immutable_files:
# Incorporates the `spras_revision` into the hash
hash_run_dict["_spras_rev"] = spras_revision()
params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
if params_hash in prior_params_hashes:
raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
f'(current length {self.hash_length}).')
Expand Down
72 changes: 72 additions & 0 deletions spras/config/revision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
The revision is an optional hash associated to all files in the designated output directory
to make sure that file _names_ are immutable. We attach the revision to three labels:

- Datasets
- Gold standards
- Algorithms

In the future, the spras revision may change depending on what files are affected (e.g. specific algorithms
will have specific revisions that change as they get updated) to avoid unnecessary running in the
Reed-CompBio/spras-benchmarking repository.

This is an optional feature, as the `spras_revision` function below is dependent on a RECORD file
(described in the docstring associated with `spras_revision`.)

We provide the convenient attach_spras_revision used in ./config.py, and `detach_spras_revision` used to get
rid of the revision for algorithms specifically.
"""

import functools
import hashlib
import importlib.metadata
import sysconfig
from pathlib import Path


@functools.cache
def spras_revision() -> str:
    """
    Get the current revision of SPRAS as an 8-character hex digest.

    Note: This is not dependent on the SPRAS release version number nor the git commit, but rather solely on the PyPA RECORD file,
    (https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file), which contains
    hashes of all of the installed SPRAS files [excluding RECORD itself], and is also included in the package distribution.
    This means that, when developing SPRAS, `spras_revision` will be updated when spras is initially installed. However, for editable
    pip installs (e.g. from `pip install -e .`), the `spras_revision` will not be updated,
    as the RECORD file only contains metadata: https://setuptools.pypa.io/en/latest/userguide/development_mode.html.

    Raises RuntimeError if spras is not installed as a package, or if the package metadata
    exists but the RECORD file cannot be found (e.g. a non-PyPA-compliant installation).
    """
    record_path = None
    try:
        site_packages_path = sysconfig.get_path("purelib")  # where .dist-info is located.

        record_path = Path(
            site_packages_path,
            f"spras-{importlib.metadata.version('spras')}.dist-info",
            "RECORD"
        )
        with open(record_path, 'rb', buffering=0) as f:
            # Truncated to the magic value 8, the length of the short git revision.
            return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
    except importlib.metadata.PackageNotFoundError as err:
        raise RuntimeError('spras is not an installed pip-module: did you forget to install SPRAS as a module?') from err
    except FileNotFoundError as err:
        # The distribution metadata resolved but RECORD is missing; surface a clearer
        # error than the raw FileNotFoundError so users know the install is unsupported.
        raise RuntimeError(f'could not find the RECORD file at {record_path}: '
                           'SPRAS may not have been installed in a PyPA-compliant manner') from err


def attach_spras_revision(immutable_files: bool, label: str) -> str:
    """
    Attach the SPRAS revision to a label.
    This function signature may become more complex as specific labels get versioned.

    @param label: The label to attach the SPRAS revision to.
    @param immutable_files: if False, this function is equivalent to `id`.
    """
    if not immutable_files:
        # Immutability is off: leave the label untouched.
        return label
    # We use the `_` separator here instead of `-` as summary, analysis, and gold standard parts of the
    # Snakemake workflow process file names by splitting on hyphens to produce new jobs.
    # If this was separated with a hyphen, we would mess with that string manipulation logic.
    return f"{label}_{spras_revision()}"

def detach_spras_revision(immutable_files: bool, attached_label: str) -> str:
    """
    The inverse of `attach_spras_revision`.

    @param attached_label: A label that may carry a `_`-separated SPRAS revision suffix.
    @param immutable_files: if False, this function is equivalent to `id`.
    """
    if not immutable_files:
        return attached_label
    # `rpartition` splits on the LAST `_`, so for all b, s:
    # detach_spras_revision(b, attach_spras_revision(b, s)) == s.
    head, _sep, _revision = attached_label.rpartition("_")
    return head
9 changes: 9 additions & 0 deletions spras/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,15 @@ class ReconstructionSettings(BaseModel):

class RawConfig(BaseModel):
containers: ContainerSettings
immutable_files: bool = False
"""
If enabled, this tags all files with their local file version.
Most files do not have a specific version, and by default, this will be the hash of
all the SPRAS files in the PyPA installation. This option will not work if SPRAS was not installed
in a PyPA-compliant manner (PyPA-compliant installations include but are not limited to pip, poetry, uv, conda, pixi.)

By default, this is disabled, as it can make output file names confusing.
"""

hash_length: int = DEFAULT_HASH_LENGTH
"The length of the hash used to identify a parameter combination"
Expand Down
Loading
Loading