TalusBio · jspaezp · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -21,10 +21,10 @@ jobs:
           - "--no-default-features"  # serial
     steps:
       - uses: actions/checkout@v6
-      - name: Apply crate patches (rustyms gnome.dat stub)
-        run: |
-          cargo install --git https://github.com/jspaezp/cargo-patch-crate patch-crate --locked
-          cargo patch-crate
+      # - name: Apply crate patches (rustyms gnome.dat stub)
+      #   run: |
+      #     cargo install --git https://github.com/jspaezp/cargo-patch-crate patch-crate --locked
+      #     cargo patch-crate
       - name: Build
         run: cargo build --verbose -p timsseek ${{ matrix.features }}
       - name: Run tests

diff --git a/.gitignore b/.gitignore
@@ -37,6 +37,7 @@ results.json
 *search_results*/*
 
 wandb/*
+bench_out/
 docs/superpowers/*
 .plans/*
 .claude/*

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -81,8 +81,9 @@ tempfile = "3.23.0"
 # rustyms into target/patch/ via timsseek_cli's build.rs. On a clean checkout
 # run `cargo patch-crate` once to populate target/patch/ before building, or
 # trigger a first build of timsseek_cli to let its build.rs handle it.
-[patch.crates-io]
-rustyms = { path = "./target/patch/rustyms-0.11.0" }
+## DISABLED FOR NOW: the rustyms [patch.crates-io] override below is commented out.
+## [patch.crates-io]
+## rustyms = { path = "./target/patch/rustyms-0.11.0" }
 
 [workspace.lints.clippy]
 len_without_is_empty = "allow"

diff --git a/bench/README.md b/bench/README.md
@@ -0,0 +1,88 @@
+# bench
+
+Fixture-driven bench harness for `timsseek`. Each fixture is a TOML in `bench/fixtures/` pointing at S3 URIs.
+
+## Run a fixture
+
+    uv run --group bench python -m bench.wandb_bench hela_iccoff_gt20peps
+    uv run --group bench python -m bench.wandb_bench --all
+    uv run --group bench python -m bench.wandb_bench --match 'hela*'
+
+Outputs land under `bench_out/` (gitignored): `logs/<name>-<ts>/`, `parquets/<name>-<ts>-classified.parquet`, `plots/<name>-fdr_curve-<ts>.png`, `plots/<name>-mainscore_hist-<ts>.png`. Wandb runs go to `jspaezp/timsseek`.
+
+Fixtures with `entrapment_peptides` set automatically run entrapment classification, emit the lower-bound and combined FDP estimators (plus the matched FDP estimator if `pairing` is set), and produce a 4-group `main_score` histogram (target/decoy × class=target/entrap).
+
+## Push a new fixture
+
+Requires `aws` CLI (auth via env / profile).
+
+### Foreign-species entrapment (Algorithm 2 of Noble et al, FDRBench paper)
+
+    uv run --group bench python -m bench.push_fixture \
+      --name hela_iccoff_human_yeast \
+      --bucket terraform-workstations-bucket --prefix jspaezp/timsseek_fixtures \
+      --db UP000005640 \
+      --entrap-db UP000002311 \
+      --raw ~/data/decompressed_timstof/250225_Desnaux_200ng_Hela_ICC_off_DIA.d \
+      --config bench/configs/default.toml \
+      --entrap-ratio 1.0 \
+      --request-delay-ms 250
+
+Pipeline: digest target + entrap (trypsin, 1 missed cleavage), filter to length 7-30, drop entrap peptides that also appear in target, randomly subsample entrap to `r × |target|` (seed=42; override with `--seed`). Uploads `target.peptides.txt`, `entrap.peptides.txt`, `database.peptides.txt` (union) and builds the speclib via `speclib_build_cli --peptides s3://.../database.peptides.txt`.
+
+Records `entrapment_mode = "foreign"` and the actual achieved `entrapment_ratio` on the fixture.
+
+### Shuffled entrapment (Algorithm 1 — paired estimator)
+
+    uv run --group bench python -m bench.push_fixture \
+      --name hela_iccoff_shuffled \
+      --bucket terraform-workstations-bucket --prefix jspaezp/timsseek_fixtures \
+      --db UP000005640 \
+      --entrap-db SHUFFLED \
+      --raw ~/data/decompressed_timstof/250225_Desnaux_200ng_Hela_ICC_off_DIA.d \
+      --config bench/configs/default.toml \
+      --entrap-ratio 1.0 \
+      --request-delay-ms 250
+
+Pipeline: digest target, length-filter, then for each surviving target peptide generate r distinct shuffles (interior permuted, C-term residue fixed). Targets that can't produce r unique shuffles are dropped. With `--entrap-ratio 1.0` the runner also emits `pairing.tsv` enabling the matched FDP estimator.
+
+Records `entrapment_mode = "shuffled"` plus `pairing` URI when r=1.
+
+### Common flags
+
+`--db` (and `--entrap-db`, `--calib-db`) accept: local `*.fasta(.gz)` / `*.fa(.gz)`, local `*.txt` accession list (one accession per line), `s3://...` URI of a FASTA, `UPxxxxxxxxx` proteome ID, bare UniProt accession, or the literal `SHUFFLED` for `--entrap-db` only.
+
+`fetch_proteome` defaults to `reviewed:true` (Swiss-Prot only). Pass full proteomes via fasta or accession list if you need TrEMBL.
+
+Other flags: `--peptide-min-len 7`, `--peptide-max-len 30`, `--missed-cleavages 1`, `--seed 42`. `--request-delay-ms` throttles speclib_build_cli's koina calls (default 500).
+
+After upload, hand-edit the generated `bench/fixtures/<name>.toml` to add a description, then `git add bench/fixtures/<name>.toml`.
+
+Re-running `push_fixture` is idempotent by default: existing S3 objects are skipped (single files via `aws s3 ls`; `.d` directory via `aws s3 sync`). Pass `--force` to re-upload everything.
+
+## FDP estimators
+
+Per Noble et al, FDRBench paper (Table S2). When entrapment is configured:
+
+| Estimator | Formula | Available when |
+|---|---|---|
+| Lower bound | `n_e / (n_e + n_t)` | always |
+| Combined (avg upper bound) | `n_e × (1 + 1/r) / (n_e + n_t)` | always |
+| Matched (k=1, avg upper bound) | `(n_e + n_p_s_t + 2·n_p_t_s) / (n_e + n_t)` | shuffled mode + r=1 |
+
+`r = entrapment_ratio` is recorded on the fixture from the actual ratio achieved at push time. Counts are walked over `is_target=True` rows only (post-competition target winners; decoy wins are TDA-style FPs, separate from entrapment FPs).
+
+`compute_fdr_curve` emits all available estimator columns; `plot_fdr_curve` overlays them.
+
+## Stage a fixture for offline / repeated runs
+
+When iterating on a fixture, pull its inputs to a local cache once, then run against the staged copy:
+
+    uv run --group bench python -m bench.stage_fixture hela_iccoff_human_yeast
+    uv run --group bench python -m bench.wandb_bench --fixtures-dir bench_out/staged hela_iccoff_human_yeast
+
+Defaults: cache root `bench_out/cache/<name>/` (override via `--cache-dir` or `BENCH_CACHE_DIR` env), output TOML `bench_out/staged/<name>.toml` (override via `--out`). Already-cached files are skipped on re-stage; `--force` re-downloads. Inputs that are already absolute local paths are referenced as-is (no copy).
+
+## Schema
+
+See `bench/_fixture_schema.py` for the canonical TOML schema. Required: `inputs.target_peptides`, `inputs.speclib`, `inputs.raw`. Optional: `inputs.entrapment_peptides` (with `entrapment_ratio` and `entrapment_mode`), `inputs.pairing` (only valid when `entrapment_mode = "shuffled"`), `inputs.calibration_speclib`. URIs accept `s3://` or absolute local paths (the latter for staged fixtures; `push_fixture` always emits `s3://`).
diff --git a/bench/__init__.py b/bench/__init__.py
diff --git a/bench/_db_resolver.py b/bench/_db_resolver.py
@@ -0,0 +1,158 @@
+"""Polymorphic --db spec parsing and resolution.
+
+Classifies one CLI value into one of: local fasta file, local accession-list
+text file, remote s3 fasta, uniprot proteome ID, uniprot accession, or the
+literal ``SHUFFLED`` sentinel (classified here, but rejected by resolve_dbs()).
+Also provides resolve_dbs() to turn a list of specs into a merged FASTA file.
+"""
+
+from __future__ import annotations
+
+import enum
+import gzip
+import os
+import re
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+from bench._s3 import s3_download_file
+from bench._uniprot import fetch_accession_batch, fetch_proteome
+
+# Recognised FASTA file suffixes (compared against the lower-cased spec).
+# Longest suffixes first so 'foo.fasta.gz' does not short-circuit on '.fasta'.
+_FASTA_EXTS = (".fasta.gz", ".fa.gz", ".fasta", ".fa")
+# Suffixes of local files that are treated as accession lists.
+_TXT_EXTS = (".txt",)
+# Proteome identifier: the literal "UP" followed by exactly nine digits.
+_UNIPROT_PROTEOME_RE = re.compile(r"^UP\d{9}$")
+# Accession: the first alternative matches 6 characters; the second matches
+# 6 or 10 characters (a 2-character prefix plus one or two 4-character groups).
+_UNIPROT_ACCESSION_RE = re.compile(
+    r"^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
+)
+
+
+class DbSpecKind(enum.Enum):
+    """The categories a single --db value can be classified into."""
+
+    LOCAL_FASTA = "local_fasta"  # existing local path with a FASTA suffix
+    ACCESSION_LIST_FILE = "accession_list_file"  # existing local .txt path
+    S3_FASTA = "s3_fasta"  # spec starting with "s3://"
+    UNIPROT_PROTEOME = "uniprot_proteome"  # "UP" + nine digits
+    UNIPROT_ACCESSION = "uniprot_accession"  # matches the accession regex
+    SHUFFLED = "shuffled"  # the literal string "SHUFFLED"
+
+
+@dataclass(frozen=True)
+class DbSpec:
+    """Immutable pairing of a classified kind with the spec string it came from."""
+
+    # Category assigned by classify_db_spec.
+    kind: DbSpecKind
+    value: str  # original spec string
+
+
+def classify_db_spec(spec: str) -> DbSpec:
+    """Map one --db value onto a DbSpec.
+
+    Checks run in this order: the ``SHUFFLED`` literal, an ``s3://`` prefix,
+    an existing local path (classified by its lower-cased suffix), a UniProt
+    proteome ID, and finally a UniProt accession.
+
+    Raises:
+        ValueError: if the spec is an existing path with an unrecognised
+            extension, or matches none of the supported forms.
+    """
+    if spec == "SHUFFLED":
+        return DbSpec(DbSpecKind.SHUFFLED, spec)
+    if spec.startswith("s3://"):
+        return DbSpec(DbSpecKind.S3_FASTA, spec)
+
+    # Anything present on disk is handled as a local file before the UniProt
+    # patterns are tried, so an on-disk name never reaches the ID regexes.
+    if os.path.exists(spec):
+        lowered = spec.lower()
+        if lowered.endswith(_FASTA_EXTS):
+            return DbSpec(DbSpecKind.LOCAL_FASTA, spec)
+        if lowered.endswith(_TXT_EXTS):
+            return DbSpec(DbSpecKind.ACCESSION_LIST_FILE, spec)
+        # Present on disk but with an unknown suffix: refuse instead of guessing.
+        raise ValueError(
+            f"unrecognised --db spec: {spec!r}"
+            " (file exists but extension is not .fasta/.fa/.txt)"
+        )
+
+    uniprot_patterns = (
+        (_UNIPROT_PROTEOME_RE, DbSpecKind.UNIPROT_PROTEOME),
+        (_UNIPROT_ACCESSION_RE, DbSpecKind.UNIPROT_ACCESSION),
+    )
+    for pattern, kind in uniprot_patterns:
+        if pattern.match(spec):
+            return DbSpec(kind, spec)
+
+    raise ValueError(
+        f"unrecognised --db spec: {spec!r}"
+        " (not s3://, not a local .fasta/.txt, not UP..., not an accession)"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Resolution helpers
+# ---------------------------------------------------------------------------
+
+
+def _read_local_fasta_text(path: str) -> str:
+    """Return the full text of a local FASTA file.
+
+    A path whose lower-cased string form ends in ``.gz`` is read through
+    gzip in text mode; any other path is read as plain text.
+    """
+    fasta_path = Path(path)
+    is_gzipped = str(fasta_path).lower().endswith(".gz")
+    if not is_gzipped:
+        return fasta_path.read_text()
+    with gzip.open(fasta_path, "rt") as handle:
+        return handle.read()
+
+
+def _read_accession_list_file(path: str) -> list[str]:
+    """Read a text file and return its non-blank lines, stripped, in file order."""
+    stripped_rows = (row.strip() for row in Path(path).read_text().splitlines())
+    return [accession for accession in stripped_rows if accession]
+
+
+def resolve_dbs(specs: list[str], output_path: Path) -> None:
+    """Resolve a list of --db specs into a concatenated FASTA at output_path.
+
+    Individual UniProt accessions (UNIPROT_ACCESSION kind) are coalesced into
+    a single fetch_accession_batch call, emitted at the position of the first
+    bare accession. Accession-list files each produce their own batch call.
+    Other spec kinds are fetched independently and appended in CLI order.
+
+    Args:
+        specs: raw --db values; each is classified with classify_db_spec.
+        output_path: destination FASTA; missing parent directories are created.
+
+    Raises:
+        ValueError: if a spec cannot be classified, or if SHUFFLED is passed
+            (it has no FASTA representation).
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    classified = [classify_db_spec(raw) for raw in specs]
+
+    # Reject SHUFFLED up front, before any download or network fetch happens.
+    for spec in classified:
+        if spec.kind is DbSpecKind.SHUFFLED:
+            raise ValueError(
+                "SHUFFLED is not resolvable to a fasta"
+                " — push_fixture handles it directly"
+            )
+
+    # Coalesce all bare UNIPROT_ACCESSION specs into one batch.
+    bare_accessions: list[str] = [
+        s.value for s in classified if s.kind is DbSpecKind.UNIPROT_ACCESSION
+    ]
+
+    fragments: list[str] = []
+    pending_accessions_flushed = False
+
+    for spec in classified:
+        if spec.kind is DbSpecKind.UNIPROT_ACCESSION:
+            # Flush the whole accession batch on the first encounter.
+            if not pending_accessions_flushed:
+                fragments.append(fetch_accession_batch(bare_accessions))
+                pending_accessions_flushed = True
+            # Subsequent accessions are already included in the batch above.
+        elif spec.kind is DbSpecKind.LOCAL_FASTA:
+            fragments.append(_read_local_fasta_text(spec.value))
+        elif spec.kind is DbSpecKind.S3_FASTA:
+            # _read_local_fasta_text chooses the gzip reader from the file
+            # extension, so the temp file must keep a ".gz" suffix when the
+            # remote object is gzipped; otherwise the compressed bytes would
+            # be read as plain text.
+            is_gzipped = spec.value.lower().endswith(".gz")
+            suffix = ".fasta.gz" if is_gzipped else ".fasta"
+            # delete=False: the file is closed before the download writes to
+            # it by name, and is removed explicitly in the finally below.
+            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+                tmp_path = tmp.name
+            try:
+                s3_download_file(spec.value, tmp_path)
+                fragments.append(_read_local_fasta_text(tmp_path))
+            finally:
+                Path(tmp_path).unlink(missing_ok=True)
+        elif spec.kind is DbSpecKind.UNIPROT_PROTEOME:
+            fragments.append(fetch_proteome(spec.value))
+        elif spec.kind is DbSpecKind.ACCESSION_LIST_FILE:
+            accs = _read_accession_list_file(spec.value)
+            fragments.append(fetch_accession_batch(accs))
+        else:
+            raise AssertionError(f"unhandled kind {spec.kind}")
+
+    with output_path.open("w") as f:
+        for chunk in fragments:
+            # Guarantee a newline between fragments so the next header line
+            # never gets glued onto the previous sequence.
+            if not chunk.endswith("\n"):
+                chunk = chunk + "\n"
+            f.write(chunk)
diff --git a/bench/_digest.py b/bench/_digest.py
@@ -0,0 +1,63 @@
+"""Trypsin digestion + length filter helpers.
+
+Trypsin rule: cleave C-terminal to K or R, NOT when followed by P.
+Missed cleavages are concatenations of N+1 contiguous base segments.
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+# Cleavage site: after K or R that is NOT followed by P.
+# Use a regex split that emits peptides ending in K|R (or the final tail).
+# The pattern is zero-width (lookbehind + negative lookahead), so splitting on
+# it consumes no residues; re.split only splits on empty matches on Python 3.7+.
+_CLEAVE = re.compile(r"(?<=[KR])(?!P)")
+
+
+def parse_fasta(path: str | Path) -> dict[str, str]:
+    """Read a FASTA file into a ``{header: sequence}`` mapping.
+
+    The key is the whole header line with the leading ``>`` removed and
+    surrounding whitespace stripped. The value is that record's sequence
+    lines joined together. Blank lines are skipped, lines before the first
+    header are discarded, and a repeated header keeps the last sequence seen.
+    """
+    records: dict[str, str] = {}
+    header: str | None = None
+    chunks: list[str] = []
+
+    def _commit() -> None:
+        # Store the record being accumulated, if a header has been seen.
+        if header is not None:
+            records[header] = "".join(chunks)
+
+    with Path(path).open("r") as handle:
+        for line in map(str.rstrip, handle):
+            if not line:
+                continue
+            if not line.startswith(">"):
+                chunks.append(line)
+                continue
+            # New header: close the previous record and start a fresh one.
+            _commit()
+            header = line[1:].strip()
+            chunks = []
+    _commit()
+    return records
+
+
+def digest_protein(sequence: str, missed_cleavages: int = 1) -> list[str]:
+    """Split one protein sequence into peptides at the ``_CLEAVE`` sites.
+
+    The result holds every run of 1 to ``missed_cleavages + 1`` contiguous
+    base segments, ordered by start segment and then by increasing length.
+    """
+    segments = list(filter(None, _CLEAVE.split(sequence)))
+    total = len(segments)
+    max_width = missed_cleavages + 1
+    return [
+        "".join(segments[start : start + width])
+        for start in range(total)
+        for width in range(1, max_width + 1)
+        if start + width <= total
+    ]
+
+
+def digest_proteins(
+    proteins: dict[str, str], missed_cleavages: int = 1
+) -> set[str]:
+    """Digest every sequence in ``proteins`` and return the set of distinct peptides.
+
+    Only the mapping's values are used; the keys are ignored.
+    """
+    return {
+        peptide
+        for sequence in proteins.values()
+        for peptide in digest_protein(sequence, missed_cleavages=missed_cleavages)
+    }
+
+
+def length_filter(peptides: set[str], min_len: int = 7, max_len: int = 30) -> set[str]:
+    """Return the peptides whose length lies in ``[min_len, max_len]`` inclusive."""
+    kept: set[str] = set()
+    for peptide in peptides:
+        size = len(peptide)
+        if size < min_len or size > max_len:
+            continue
+        kept.add(peptide)
+    return kept