diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7eb7ad85..3d561ba4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -45,7 +45,7 @@ jobs: artifact_name: windows-x64 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Install Rust uses: actions-rust-lang/setup-rust-toolchain@v1 @@ -60,6 +60,12 @@ jobs: if: matrix.target == 'x86_64-unknown-linux-musl' run: sudo apt-get update && sudo apt-get install -y musl-tools + - name: Apply crate patches (rustyms gnome.dat stub) + shell: bash + run: | + cargo install --git https://github.com/jspaezp/cargo-patch-crate patch-crate --locked + cargo patch-crate + - name: Build Binaries shell: bash run: | diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index a2df5e7a..49011395 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -21,6 +21,10 @@ jobs: - "--no-default-features" # serial steps: - uses: actions/checkout@v6 + - name: Apply crate patches (rustyms gnome.dat stub) + run: | + cargo install --git https://github.com/jspaezp/cargo-patch-crate patch-crate --locked + cargo patch-crate - name: Build run: cargo build --verbose -p timsseek ${{ matrix.features }} - name: Run tests diff --git a/.gitignore b/.gitignore index 77b25729..b85c21eb 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,9 @@ bench.bash bench_*.bash quichbench*.bash run_instrument.bash +run_remote.bash +stage.bash +here.toml *.log results.tsv config_use.json diff --git a/Cargo.lock b/Cargo.lock index 1e93a85a..68033636 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -159,7 +159,7 @@ dependencies = [ [[package]] name = "alloc_track" -version = "0.30.0" +version = "0.31.0" [[package]] name = "allocator-api2" @@ -288,7 +288,7 @@ dependencies = [ [[package]] name = "array2d" -version = "0.30.0" +version = "0.31.0" dependencies = [ "serde", ] @@ -315,10 +315,7 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", - "arrow-csv", "arrow-data", - "arrow-ipc", - "arrow-json", "arrow-ord", "arrow-row", "arrow-schema", @@ -391,21 +388,6 @@ dependencies = [ "ryu", ] -[[package]] -name = "arrow-csv" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" -dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", - "chrono", - "csv", - "csv-core", - "regex", -] - [[package]] name = "arrow-data" version = "57.3.0" @@ -433,30 +415,6 @@ dependencies = [ "flatbuffers", ] -[[package]] -name = "arrow-json" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "chrono", - "half", - "indexmap", - "itoa", - "lexical-core", - "memchr", - "num-traits", - "ryu", - "serde_core", - "serde_json", - "simdutf8", -] - [[package]] name = "arrow-ord" version = "57.3.0" @@ -1174,15 +1132,6 @@ dependencies = [ "vsimd", ] -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bincode" version = "2.0.1" @@ -1398,7 +1347,7 @@ dependencies = [ [[package]] name = "calibrt" -version = "0.30.0" +version = "0.31.0" dependencies = [ "array2d", "insta", @@ -2310,6 +2259,17 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -2361,15 +2321,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" -[[package]] -name = "font-types" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d9237c6d82152100c691fb77ea18037b402bcc7257d2c876a4ffac81bc22a1c" -dependencies = [ - "bytemuck", -] - [[package]] name = "foreign-types" version = "0.5.0" @@ -3251,12 +3202,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" -[[package]] -name = "identity-hash" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfdd7caa900436d8f13b2346fe10257e0c05c1f1f9e351f4f5d57c03bd5f45da" - [[package]] name = "idna" version = "1.1.0" @@ -3761,7 +3706,7 @@ dependencies = [ [[package]] name = "micromzpaf" -version = "0.30.0" +version = "0.31.0" dependencies = [ "rustyms", "serde", @@ -3826,35 +3771,6 @@ dependencies = [ "pxfm", ] -[[package]] -name = "mzdata" -version = "0.59.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e6c319f4111abe43bbcc33433b727968e30163f0a90e440386978656df8685c" -dependencies = [ - "base64-simd", - "bitflags 2.11.0", - "bytemuck", - "chrono", - "flate2", - "identity-hash", - "indexmap", - "log", - "mzpeaks", - "num-traits", - "regex", - "thiserror 2.0.18", -] - -[[package]] -name = "mzpeaks" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "543be9eac70437bfc915b3339e6ae4f23dc034922f13eb2535dcc19e7e9e9481" -dependencies = [ - "num-traits", -] - [[package]] name = "naga" version = "27.0.3" @@ -4711,16 +4627,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "probability" -version = "0.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42746b805e424b759d46c22c65dc66ccca057a2db96e9db4fda6c337a287e485" -dependencies = [ - "random", - "special", -] - [[package]] name = "proc-macro-crate" version = "3.5.0" @@ -4987,12 +4893,6 @@ dependencies = [ "getrandom 0.3.4", ] -[[package]] -name = "random" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "474c42c904f04dfe2a595a02f71e1a0e5e92ffb5761cc9a4c02140b93b8dd504" - [[package]] name = "range-alloc" version = "0.1.5" @@ -5031,16 +4931,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "read-fonts" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b634fabf032fab15307ffd272149b622260f55974d9fad689292a5d33df02e5" -dependencies = [ - "bytemuck", - "font-types", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -5431,28 +5321,18 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyms" version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "011d3d672ae44d5e07db0488d855f2b5ed178e3d6bb7ef5b18c6415c20bbd61e" dependencies = [ - "bincode 2.0.1", + "bincode", "context_error", "flate2", "itertools 0.14.0", - "mzdata", - "ndarray", "ordered-float 5.3.0", - "paste", - "probability", - "rand 0.9.3", - "rayon", "regex", "serde", "serde_json", "similar", - "swash", "thin-vec", "uom", - "zeno", ] [[package]] @@ -5695,16 +5575,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" -[[package]] -name = "skrifa" -version = "0.40.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbdfe3d2475fbd7ddd1f3e5cf8288a30eb3e5f95832829570cd88115a7434ac" -dependencies = [ - "bytemuck", - "read-fonts", -] - [[package]] name = "slab" version = "0.4.12" @@ -5826,18 +5696,9 @@ dependencies = [ "smallvec", ] -[[package]] -name = "special" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b89cf0d71ae639fdd8097350bfac415a41aabf1d5ddd356295fdc95f09760382" -dependencies = [ - "libm", -] - [[package]] name = "speclib_build_cli" -version = "0.30.0" +version = "0.31.0" dependencies = [ "bloomfilter", "clap", @@ -5848,6 +5709,7 @@ dependencies = [ "serde", "serde_json", "tempfile", + "tims_stage", "timsseek", "tokio", "toml", @@ -5903,17 +5765,6 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" -[[package]] -name = "swash" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "842f3cd369c2ba38966204f983eaa5e54a8e84a7d7159ed36ade2b6c335aae64" -dependencies = [ - "skrifa", - "yazi", - "zeno", -] - [[package]] name = "syn" version = "1.0.109" @@ -6097,9 +5948,24 @@ dependencies = [ "time-core", ] +[[package]] +name = "tims_stage" +version = "0.31.0" +dependencies = [ + "bytes", + "filetime", + "indicatif", + "serde", + "tempfile", + "thiserror 2.0.18", + "timscentroid", + "tracing", + "url", +] + [[package]] name = "timscentroid" -version = "0.30.0" +version = "0.31.0" dependencies = [ "arrow", "async-trait", @@ -6111,6 +5977,7 @@ dependencies = [ "geo", "geo-types", "half", + "indicatif", "object_store", "once_cell", "parquet", @@ -6129,11 +5996,10 @@ dependencies = [ [[package]] name = "timsquery" -version = "0.30.0" +version = "0.31.0" dependencies = [ "array2d", "arrow", - "bincode 1.3.3", "bon", "csv", "half", @@ -6143,16 +6009,18 @@ dependencies = [ "rayon", "serde", "serde_json", + "tempfile", + "thiserror 2.0.18", + "tims_stage", "timscentroid", "timsrust", "tinyvec", "tracing", - "zstd", ] [[package]] name = "timsquery_cli" -version = "0.30.0" +version = "0.31.0" dependencies = [ "clap", "half", @@ -6171,7 +6039,7 @@ dependencies = [ [[package]] name = "timsquery_pyo3" -version = "0.30.0" +version = "0.31.0" dependencies = [ "numpy", "pyo3", @@ -6182,7 +6050,7 @@ dependencies = [ [[package]] name = "timsquery_viewer" -version = "0.30.0" +version = "0.31.0" dependencies = [ "calibrt", "clap", @@ -6226,7 +6094,7 @@ dependencies = [ [[package]] name = "timsseek" -version = "0.30.0" +version = "0.31.0" dependencies = [ "array2d", "arrow", @@ -6242,6 +6110,8 @@ dependencies = [ "rustyms", "serde", "serde_json", + "smallvec", + "tempfile", "timscentroid", "timsquery", "timsrust", @@ -6251,9 +6121,10 @@ dependencies = [ [[package]] name = "timsseek_cli" -version = "0.30.0" +version = "0.31.0" dependencies = [ "alloc_track", + "chrono", "clap", "indicatif", "mimalloc", @@ -6261,6 +6132,8 @@ dependencies = [ "regex", "serde", "serde_json", + "tempfile", + "tims_stage", "timscentroid", "timsquery", "timsrust", @@ -6554,6 +6427,16 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.23" @@ -6564,12 +6447,15 @@ dependencies = [ "nu-ansi-term", "once_cell", "regex-automata", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -7863,12 +7749,6 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "yazi" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e01738255b5a16e78bbb83e7fbba0a1e7dd506905cfc53f4622d89015a03fbb5" - [[package]] name = "yoke" version = "0.8.2" @@ -7989,12 +7869,6 @@ dependencies = [ "zvariant", ] -[[package]] -name = "zeno" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6df3dc4292935e51816d896edcd52aa30bc297907c26167fec31e2b0c6a32524" - [[package]] name = "zerocopy" version = "0.8.48" diff --git a/Cargo.toml b/Cargo.toml index b1477b8f..d8ca2ce4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "rust/timsquery_cli", "rust/timsquery_viewer", "rust/speclib_build_cli", + "rust/tims_stage", "python/timsquery_pyo3" ] default-members = [ @@ -25,11 +26,12 @@ default-members = [ "rust/timsquery", "rust/timsquery_cli", "rust/timsquery_viewer", - "rust/speclib_build_cli" + "rust/speclib_build_cli", + "rust/tims_stage" ] [workspace.package] -version = "0.30.0" +version = "0.31.0" edition = "2024" authors = ["Sebastian Paez"] license = "Apache-2.0" @@ -42,6 +44,7 @@ serde_json = "1.0.141" tracing-subscriber = { version = "0.3.18", features = [ "registry", "env-filter", + "json", ] } timsrust = { git = "https://github.com/jspaezp/timsrust", branch = "experimental/recalib_iter" } rusqlite = "0.37.0" @@ -49,22 +52,38 @@ rayon = "1.5" clap = { version = "4.5.27", features = ["derive"] } indicatif = { version = "0.18.0", features = ["rayon"] } -# Parquet and arrow usually need to be kept in sync -# Its not needed but makes binaries smaller +# Parquet and arrow usually need to be kept in sync. +# arrow: drop csv/ipc/json default features — we don't read/write those formats. +# parquet: keep default compression codecs (zstd/snap/lz4/brotli/flate2) so we can +# read speclibs written with any of them. parquet_derive = { version = "57.1" } parquet = { version = "57.1" } -arrow = { version = "57.1" } +arrow = { version = "57.1", default-features = false } mimalloc = { version = "0.1.46", features = ["secure"] } thiserror = "2" insta = { version = "1.34.0" } bon = "3.8.1" tinyvec = { features = ["alloc", "serde"], version = "1.10.0" } -rustyms = "0.11.0" +smallvec = { version = "1.13", features = ["const_generics", "union"] } +rustyms = { version = "0.11.0", default-features = false } csv = "1.3" tempfile = "3.23.0" +# Patch rustyms to drop gnome.dat (60MB glycan ontology, unused here) and +# panic with a clear message if anything reaches the GNOME code path. +# Saves ~60MB in release binaries. The actual edits live in +# patches/rustyms+0.11.0.patch and are applied to target/patch/rustyms-0.11.0 +# by the `patch-crate` build-dep + build.rs in the crates below. +# Patch rustyms to drop gnome.dat (60MB glycan ontology) + panic-with-link on +# access. Diff lives in patches/rustyms+0.11.0.patch; `patch-crate` extracts +# rustyms into target/patch/ via timsseek_cli's build.rs. On a clean checkout +# run `cargo patch-crate` once to populate target/patch/ before building, or +# trigger a first build of timsseek_cli to let its build.rs handle it. +[patch.crates-io] +rustyms = { path = "./target/patch/rustyms-0.11.0" } + [workspace.lints.clippy] len_without_is_empty = "allow" @@ -73,5 +92,6 @@ lto = 'thin' codegen-units = 1 panic = 'abort' opt-level = 3 -# Needed for flamegraph +strip = "symbols" +# Flip `strip` off and `debug = true` on when profiling / flamegraph. # debug = true diff --git a/README.md b/README.md index 74eac30d..20a03915 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,21 @@ cargo run --release --bin timsseek -- \ --dotd-files $DOTD_FILE ``` +## S3 inputs + +Both CLIs accept `s3://` URIs anywhere a path is accepted (AWS / MinIO / R2). `.d` can be a directory, `.tar`, or S3 prefix; `.idx` sidecars short-circuit staging. + +```bash +timsseek --raw-inputs s3://bkt/sample.d.tar \ + --speclib-uri s3://bkt/lib.msgpack.zst \ + --output-uri s3://bkt/runs/out + +speclib_build_cli --fasta s3://bkt/proteome.fasta \ + --output s3://bkt/lib.msgpack.zst +``` + +Auth via AWS default chain. MinIO/R2: set `AWS_ENDPOINT_URL`. See `docs/development.md` for `[staging]` config + env var list. + ## Development See [docs/development.md](docs/development.md) for dev utilities, compile flags, env vars, Taskfile targets, and scripts. diff --git a/Taskfile.yml b/Taskfile.yml index c1b5953a..e8644602 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -11,6 +11,17 @@ tasks: cmds: - cargo deny check + # One-time setup after a fresh clone. Installs patch-crate (if missing) and + # applies patches/rustyms+0.11.0.patch into target/patch/rustyms-0.11.0 so + # [patch.crates-io] in Cargo.toml resolves. Without this, first `cargo build` + # errors with "failed to read manifest at target/patch/rustyms-0.11.0/...". + # Uses the jspaezp fork of cargo-patch-crate (dropped the cargo-as-library + # sqlite transitive — faster install, no system sqlite needed). + setup: + cmds: + - command -v cargo-patch-crate >/dev/null 2>&1 || cargo install --git https://github.com/jspaezp/cargo-patch-crate patch-crate --locked + - cargo patch-crate + test: cmds: - cargo test -- --nocapture diff --git a/docs/development.md b/docs/development.md index 9f0e12f6..1b3c9426 100644 --- a/docs/development.md +++ b/docs/development.md @@ -2,6 +2,19 @@ Reference for working on timsbuktoolkit. All binaries ship `--help` for CLI flags. Environment variables are NOT listed by `--help`; see the Env vars table below. +## First-time setup + +`rustyms`'s 60 MB `gnome.dat` glycan database is stripped from the build via +`[patch.crates-io]` in the workspace `Cargo.toml`, pointing at +`target/patch/rustyms-0.11.0/` which must be populated before cargo runs. On a +fresh clone: + +```bash +task setup # installs jspaezp/cargo-patch-crate fork + applies patches/ +``` + +Then `cargo build` normally. CI runs the same step. Details in `patches/rustyms+0.11.0.patch`. + ## Binaries | Binary | Crate | Purpose | @@ -46,6 +59,33 @@ Not shown by `--help`. Read directly via `std::env::var`. Per-crate: `rust/timsseek/Taskfile.yml` adds a watch loop (`task timsseek`) — rebuild + test + fmt + clippy on source change. +## S3 staging + +`timsseek_cli` config: + +```toml +[staging] +tempdir_root = "/scratch/timsseek" # default: system temp +max_prefix_keys = 256 +save_sidecar = false # write .idx next to raw input +stale_sweep_age_hours = 24 # 0 disables startup sweep +``` + +Startup sweeps `timsseek-staging-*` subdirs older than threshold that lack a `.lock` sentinel. Reclaims tempdirs from SIGKILL'd/crashed runs. + +Env vars (via `object_store` default chain): `AWS_{ACCESS_KEY_ID,SECRET_ACCESS_KEY,SESSION_TOKEN,REGION}`, `AWS_ENDPOINT_URL` (MinIO/R2), `AWS_S3_FORCE_PATH_STYLE` (auto-on with endpoint). + +Enable with `--features aws` on `timscentroid` / `tims_stage`. Default build omits AWS SDK. + +MinIO smoke test (needs pre-seeded bucket): + +```bash +MINIO_TEST_ENDPOINT=http://localhost:9000 \ +AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin \ +MINIO_TEST_BUCKET=tims-stage-ci \ +cargo test -p tims_stage --features aws --test minio_smoke +``` + ## Tracked scripts | Path | Purpose | diff --git a/patches/rustyms+0.11.0.patch b/patches/rustyms+0.11.0.patch new file mode 100644 index 00000000..78d07211 --- /dev/null +++ b/patches/rustyms+0.11.0.patch @@ -0,0 +1,30 @@ +diff --git a/src/ontology.rs b/src/ontology.rs +index e6c0853..1816f57 100644 +--- a/src/ontology.rs ++++ b/src/ontology.rs +@@ -224,14 +224,19 @@ mod databases { + }); + /// Get the Gnome ontology + /// # Panics +- /// Panics when the modifications are not correctly provided at compile time, always report a panic if it occurs here. ++ /// Always — the timsbuktoolkit workspace stubs out the 60MB gnome.dat to ++ /// avoid shipping glycan ontology data its pipeline never reads. ++ /// If you genuinely need glycan lookup, revert the patch at ++ /// `timsbuktoolkit/patches/rustyms+0.11.0.patch` and restore the full ++ /// `src/databases/gnome.dat` blob. + pub(super) static GNOME: LazyLock = LazyLock::new(|| { +- bincode::serde::decode_from_slice::( +- include_bytes!("databases/gnome.dat"), +- Configuration::default(), ++ panic!( ++ "Glycan (GNOme) ontology is unavailable in this build: gnome.dat was \ ++ stubbed out to save ~60 MB because the timsbuktoolkit pipeline does \ ++ not use it. If you hit this, you're decoding a glycan modification — \ ++ please report at https://github.com/jspaezp/timsbuktoolkit/issues so \ ++ we can restore the full database (or make the gate finer-grained)." + ) +- .unwrap() +- .0 + }); + /// Get the Resid ontology + /// # Panics diff --git a/pyproject.toml b/pyproject.toml index 39cde712..49ab9269 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "timsseek-workspace" -version = "0.30.0" +version = "0.31.0" requires-python = ">=3.11,<3.13" dependencies = [] @@ -31,7 +31,7 @@ preview = true select = ["E", "F", "T20", "I"] [tool.bumpver] -current_version = "0.30.0" +current_version = "0.31.0" version_pattern = "MAJOR.MINOR.PATCH[-PYTAGNUM]" tag_message = "v{new_version}" commit_message = "chore: bump version to {new_version}" diff --git a/python/timsquery_pyo3/Cargo.toml b/python/timsquery_pyo3/Cargo.toml index 618f6151..4ec295cb 100644 --- a/python/timsquery_pyo3/Cargo.toml +++ b/python/timsquery_pyo3/Cargo.toml @@ -13,5 +13,14 @@ timsquery = { path = "../../rust/timsquery" } timscentroid = { path = "../../rust/timscentroid" } rayon = { workspace = true } -pyo3 = { version = "0.25", features = ["extension-module"] } +pyo3 = { version = "0.25" } numpy = "0.25" + +[features] +default = [] +# Enable when building the wheel via maturin so pyo3 doesn't try to link +# libpython (symbols resolve at runtime from the host Python). Plain +# `cargo build -p timsquery_pyo3` without this feature builds but the +# produced cdylib is only usable inside a Python interpreter that +# provides libpython at load time. +extension-module = ["pyo3/extension-module"] diff --git a/python/timsquery_pyo3/pyproject.toml b/python/timsquery_pyo3/pyproject.toml index 13fdac4d..2b67b812 100644 --- a/python/timsquery_pyo3/pyproject.toml +++ b/python/timsquery_pyo3/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "timsquery_pyo3" -version = "0.30.0" +version = "0.31.0" requires-python = ">=3.11,<3.14" classifiers = [ "Programming Language :: Rust", @@ -13,5 +13,5 @@ classifiers = [ dynamic = ["version"] [tool.maturin] -features = ["pyo3/extension-module"] +features = ["extension-module"] manifest-path = "Cargo.toml" diff --git a/python/timsquery_pyo3/uv.lock b/python/timsquery_pyo3/uv.lock new file mode 100644 index 00000000..1ae170d8 --- /dev/null +++ b/python/timsquery_pyo3/uv.lock @@ -0,0 +1,7 @@ +version = 1 +revision = 3 +requires-python = ">=3.11, <3.14" + +[[package]] +name = "timsquery-pyo3" +source = { editable = "." } diff --git a/rust/speclib_build_cli/Cargo.toml b/rust/speclib_build_cli/Cargo.toml index 9699e85e..456fe6f7 100644 --- a/rust/speclib_build_cli/Cargo.toml +++ b/rust/speclib_build_cli/Cargo.toml @@ -15,6 +15,7 @@ path = "src/main.rs" [dependencies] timsseek = { path = "../timsseek" } micromzpaf = { path = "../micromzpaf" } +tims_stage = { path = "../tims_stage" } rustyms = { workspace = true } clap = { workspace = true } @@ -23,11 +24,15 @@ serde_json = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } indicatif = { workspace = true } +tempfile = { workspace = true } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } tokio = { version = "1", features = ["rt-multi-thread", "macros"] } toml = "0.8" bloomfilter = "1" -[dev-dependencies] -tempfile = { workspace = true } +[features] +default = ["aws"] +aws = ["tims_stage/aws"] +gcp = ["tims_stage/gcp"] +azure = ["tims_stage/azure"] diff --git a/rust/speclib_build_cli/src/cli.rs b/rust/speclib_build_cli/src/cli.rs index 336a5d6d..ce27fd62 100644 --- a/rust/speclib_build_cli/src/cli.rs +++ b/rust/speclib_build_cli/src/cli.rs @@ -6,13 +6,13 @@ use std::path::PathBuf; #[command(name = "speclib_build", version, about)] pub struct Cli { // ── Input ────────────────────────────────────────────────────────────── - /// FASTA file to digest into peptides. + /// FASTA URI (local path or s3://...). #[arg(long)] - pub fasta: Option, + pub fasta: Option, - /// Pre-digested peptide list (one bare sequence per line). + /// Peptide list URI (local path or s3://...). One bare sequence per line. #[arg(long)] - pub peptide_list: Option, + pub peptide_list: Option, // ── Config ───────────────────────────────────────────────────────────── /// TOML config file; CLI flags override values from this file. @@ -20,9 +20,10 @@ pub struct Cli { pub config: Option, // ── Output ───────────────────────────────────────────────────────────── - /// Output path for the spectral library (default: library.msgpack.zst). + /// Output URI for the spectral library (local path or s3://...; default: + /// library.msgpack.zst). #[arg(long, short = 'o')] - pub output: Option, + pub output: Option, // ── Modifications ────────────────────────────────────────────────────── /// Fixed modification, e.g. `Carbamidomethyl@C`. Repeatable. diff --git a/rust/speclib_build_cli/src/config.rs b/rust/speclib_build_cli/src/config.rs index dc4456ee..0c24bff1 100644 --- a/rust/speclib_build_cli/src/config.rs +++ b/rust/speclib_build_cli/src/config.rs @@ -1,5 +1,4 @@ use serde::Deserialize; -use std::path::PathBuf; use crate::cli::Cli; @@ -62,14 +61,14 @@ fn default_max_ion_mz() -> f32 { fn default_min_ions() -> usize { 3 } -fn default_output() -> PathBuf { - PathBuf::from("library.msgpack.zst") +fn default_output() -> String { + "library.msgpack.zst".to_string() } // ── Sub-structs ────────────────────────────────────────────────────────────── #[derive(Debug, Deserialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct DigestionConfig { /// Enzyme name (currently only "trypsin" is recognised). pub enzyme: String, @@ -90,7 +89,7 @@ impl Default for DigestionConfig { } #[derive(Debug, Deserialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct ModificationsConfig { /// Fixed modifications applied to every matching residue, e.g. ["Carbamidomethyl@C"]. pub fixed: Vec, @@ -111,7 +110,7 @@ impl Default for ModificationsConfig { } #[derive(Debug, Deserialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct ChargesConfig { pub min: u8, pub max: u8, @@ -127,7 +126,7 @@ impl Default for ChargesConfig { } #[derive(Debug, Deserialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct DecoysConfig { /// Strategy for generating decoy entries: "none", "reverse", or "edge_mutate". pub strategy: String, @@ -142,7 +141,7 @@ impl Default for DecoysConfig { } #[derive(Debug, Deserialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct PredictionConfig { pub fragment_model: String, pub rt_model: String, @@ -168,7 +167,7 @@ impl Default for PredictionConfig { } #[derive(Debug, Deserialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct FiltersConfig { pub max_ions: usize, pub min_mz: f32, @@ -195,15 +194,16 @@ impl Default for FiltersConfig { // ── Top-level config ───────────────────────────────────────────────────────── #[derive(Debug, Deserialize)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct SpeclibBuildConfig { // Inputs — not directly deserialised from TOML but set after merging CLI args. + // URIs: either local paths or remote (s3://...). #[serde(skip)] - pub fasta: Option, + pub fasta: Option, #[serde(skip)] - pub peptide_list: Option, + pub peptide_list: Option, - pub output: PathBuf, + pub output: String, pub digestion: DigestionConfig, pub modifications: ModificationsConfig, pub charges: ChargesConfig, diff --git a/rust/speclib_build_cli/src/dedup.rs b/rust/speclib_build_cli/src/dedup.rs index bb238042..ddc2e2b8 100644 --- a/rust/speclib_build_cli/src/dedup.rs +++ b/rust/speclib_build_cli/src/dedup.rs @@ -1,16 +1,16 @@ use bloomfilter::Bloom; use std::collections::HashMap; -use timsseek::DigestSlice; +use timsseek::ProteinSlice; pub struct PeptideDedup; impl PeptideDedup { - /// Deduplicate DigestSlices using bloom filter fast-pass + bucket HashMap. + /// Deduplicate ProteinSlices using bloom filter fast-pass + bucket HashMap. /// `estimated_count` used to preallocate bloom + HashMap capacity. - pub fn dedup(slices: Vec, estimated_count: usize) -> Vec { + pub fn dedup(slices: Vec, estimated_count: usize) -> Vec { let est = estimated_count.max(64); let mut bloom = Bloom::new_for_fp_rate(est, 0.01); - let mut buckets: HashMap<(u8, u16), Vec> = HashMap::with_capacity(est / 20); + let mut buckets: HashMap<(u8, u16), Vec> = HashMap::with_capacity(est / 20); for slice in slices { let seq = slice.as_str().as_bytes(); @@ -47,10 +47,10 @@ mod tests { use std::sync::Arc; use timsseek::models::DecoyMarking; - fn make_slice(seq: &str) -> DigestSlice { + fn make_slice(seq: &str) -> ProteinSlice { let s: Arc = seq.into(); let len = s.len() as u16; - DigestSlice::new(s, 0..len, DecoyMarking::Target, 0) + ProteinSlice::new(s, 0..len, DecoyMarking::Target, 0) } #[test] @@ -88,8 +88,8 @@ mod tests { // Two slices from same Arc, same range → deduped let backbone: Arc = "PEPTIDEPINKTOMATO".into(); let len = backbone.len() as u16; - let slice1 = DigestSlice::new(backbone.clone(), 0..len, DecoyMarking::Target, 0); - let slice2 = DigestSlice::new(backbone.clone(), 0..len, DecoyMarking::Target, 0); + let slice1 = ProteinSlice::new(backbone.clone(), 0..len, DecoyMarking::Target, 0); + let slice2 = ProteinSlice::new(backbone.clone(), 0..len, DecoyMarking::Target, 0); let slices = vec![slice1, slice2]; let result = PeptideDedup::dedup(slices, 10); assert_eq!(result.len(), 1); diff --git a/rust/speclib_build_cli/src/entry.rs b/rust/speclib_build_cli/src/entry.rs index 1d6495d6..e5686aa6 100644 --- a/rust/speclib_build_cli/src/entry.rs +++ b/rust/speclib_build_cli/src/entry.rs @@ -47,18 +47,13 @@ pub fn strip_mods(seq: &str) -> String { out } -/// Convert our short mod notation to ProForma for rustyms. -/// `C[U:4]` → `C[UNIMOD:4]`, mass-shift mods like `[+57.02]` pass through. -/// Note: rustyms parses `[U:N]` as element Uranium, not UNIMOD — must expand. -fn to_proforma(seq: &str) -> String { - seq.replace("[U:", "[UNIMOD:") -} +use timsseek::models::sequence::normalize_to_proforma; /// Compute the monoisotopic precursor m/z using rustyms. /// Input should be the modified sequence (mods included in mass). fn compute_precursor_mz(modified_seq: &str, charge: u8) -> Option { use rustyms::prelude::*; - let proforma = to_proforma(modified_seq); + let proforma = normalize_to_proforma(modified_seq); let peptide = Peptidoform::pro_forma(&proforma, None).ok()?; let linear = peptide.as_linear()?; let formulas = linear.formulas(); @@ -72,7 +67,7 @@ fn compute_precursor_mz(modified_seq: &str, charge: u8) -> Option { /// Count carbon and sulphur from modified sequence (mods affect formula). fn count_cs_modified(modified_seq: &str) -> Option<(u16, u16)> { - let proforma = to_proforma(modified_seq); + let proforma = normalize_to_proforma(modified_seq); count_carbon_sulphur_in_sequence(&proforma).ok() } diff --git a/rust/speclib_build_cli/src/pipeline.rs b/rust/speclib_build_cli/src/pipeline.rs index b6bb0bb1..6e451e94 100644 --- a/rust/speclib_build_cli/src/pipeline.rs +++ b/rust/speclib_build_cli/src/pipeline.rs @@ -1,10 +1,14 @@ -use std::io::BufRead; +use std::io::{ + BufRead, + Write, +}; use indicatif::{ ProgressBar, + ProgressFinish, ProgressStyle, }; -use timsseek::DigestSlice; +use timsseek::ProteinSlice; use timsseek::data_sources::speclib::SpeclibWriter; use timsseek::digest::digestion::{ DigestionEnd, @@ -111,9 +115,27 @@ async fn flush_batch( pub async fn run(config: &SpeclibBuildConfig) -> Result<(), Box> { // ── Phase 1: Get base peptides ────────────────────────────────────────── - let base_peptides: Vec = if let Some(fasta_path) = &config.fasta { - tracing::info!("Reading FASTA: {}", fasta_path.display()); - let collection = ProteinSequenceCollection::from_fasta_file(fasta_path)?; + let base_peptides: Vec = if let Some(fasta_uri) = &config.fasta { + tracing::info!("Reading FASTA: {}", fasta_uri); + let collection = if is_remote_uri(fasta_uri) { + // Download remote FASTA to a tempfile (preserving extension for any + // downstream sniffing), then parse as usual. + let mut reader = tims_stage::open_reader(fasta_uri) + .map_err(|e| format!("open fasta {fasta_uri}: {e}"))?; + let ext = std::path::Path::new(fasta_uri.trim_end_matches('/')) + .extension() + .and_then(|s| s.to_str()) + .unwrap_or("fasta"); + let mut tf = tempfile::Builder::new() + .prefix("speclib-fasta-") + .suffix(&format!(".{ext}")) + .tempfile()?; + std::io::copy(&mut reader, tf.as_file_mut())?; + tf.as_file_mut().flush()?; + ProteinSequenceCollection::from_fasta_file(tf.path())? + } else { + ProteinSequenceCollection::from_fasta_file(std::path::Path::new(fasta_uri))? + }; let total_aa: usize = collection.sequences.iter().map(|p| p.sequence.len()).sum(); tracing::info!( "Read {} proteins ({} amino acids)", @@ -160,18 +182,19 @@ pub async fn run(config: &SpeclibBuildConfig) -> Result<(), Box = Vec::new(); + } else if let Some(list_uri) = &config.peptide_list { + tracing::info!("Reading peptide list: {}", list_uri); + let raw_reader = tims_stage::open_reader(list_uri) + .map_err(|e| format!("open peptide list {list_uri}: {e}"))?; + let reader = std::io::BufReader::new(raw_reader); + let mut slices: Vec = Vec::new(); for (i, line) in reader.lines().enumerate() { let line = line?; let seq = line.trim().to_string(); if seq.is_empty() || seq.starts_with('#') { continue; } - slices.push(DigestSlice::from_string(seq, false, i as u32)); + slices.push(ProteinSlice::from_string(seq, false, i as u32)); } tracing::info!("Read {} peptides from list", slices.len()); @@ -240,7 +263,36 @@ pub async fn run(config: &SpeclibBuildConfig) -> Result<(), Box = if remote_output { + let ext = std::path::Path::new(output_uri.trim_end_matches('/')) + .extension() + .and_then(|s| s.to_str()) + .unwrap_or("msgpack.zst"); + let tf = tempfile::Builder::new() + .prefix("speclib-out-") + .suffix(&format!(".{ext}")) + .tempfile()?; + Some(tf) + } else { + // Ensure parent directory exists for local output. + if let Some(parent) = std::path::Path::new(output_uri).parent() { + if !parent.as_os_str().is_empty() { + std::fs::create_dir_all(parent)?; + } + } + None + }; + let working_path: std::path::PathBuf = if let Some(tf) = &output_tempfile { + tf.path().to_path_buf() + } else { + std::path::PathBuf::from(output_uri) + }; + + let out_file = std::fs::File::create(&working_path)?; let mut writer = SpeclibWriter::new_msgpack_zstd(out_file)?; // Estimate total items for progress bar: peptides * mod_variants * charges * (1 + decoy) @@ -333,18 +385,28 @@ pub async fn run(config: &SpeclibBuildConfig) -> Result<(), Box) -> (Vec, usize) { +fn filter_nonstandard_aa(peptides: Vec) -> (Vec, usize) { let before = peptides.len(); - let kept: Vec = peptides + let kept: Vec = peptides .into_iter() .filter(|p| !p.as_str().chars().any(|c| NONSTANDARD_AA.contains(&c))) .collect(); diff --git a/rust/tims_stage/Cargo.toml b/rust/tims_stage/Cargo.toml new file mode 100644 index 00000000..72551942 --- /dev/null +++ b/rust/tims_stage/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "tims_stage" +version.workspace = true +edition.workspace = true +license.workspace = true + +[dependencies] +timscentroid = { path = "../timscentroid" } +serde = { workspace = true, features = ["derive"] } +thiserror = { workspace = true } +tracing = { workspace = true } +tempfile = { workspace = true } +indicatif = { workspace = true } +bytes = "1" +url = "2" + +[dev-dependencies] +tempfile = { workspace = true } +filetime = "0.2" + +[features] +default = [] +aws = ["timscentroid/aws"] +gcp = ["timscentroid/gcp"] +azure = ["timscentroid/azure"] + +[lints] +workspace = true diff --git a/rust/tims_stage/src/backend.rs b/rust/tims_stage/src/backend.rs new file mode 100644 index 00000000..0dc79337 --- /dev/null +++ b/rust/tims_stage/src/backend.rs @@ -0,0 +1,174 @@ +//! StagingBackend trait, PerRunTempdir implementation, StagedDotD handle, +//! plus a startup sweep for stale tempdirs. + +use crate::error::StageError; +use crate::resolve::SourceSpec; +use std::path::{ + Path, + PathBuf, +}; + +pub trait StagingBackend { + fn stage(&self, spec: &SourceSpec) -> Result; +} + +#[derive(Debug, Clone)] +pub struct StagingConfig { + /// None = system default temp dir. + pub tempdir_root: Option, + /// Hard cap on prefix listing size. + pub max_prefix_keys: usize, + /// Age threshold for the startup sweep. `0` disables the sweep. + pub stale_sweep_age_hours: u64, +} + +impl Default for StagingConfig { + fn default() -> Self { + Self { + tempdir_root: None, + max_prefix_keys: 256, + stale_sweep_age_hours: 24, + } + } +} + +/// RAII handle over a per-run tempdir + its inner `sample.d/` path. +/// +/// Callers MUST keep `StagedDotD` alive for as long as they use the path — +/// the tempdir is deleted on drop, and `AsRef` does not extend the +/// borrow beyond the handle's lifetime. Do NOT store `&Path` derived from +/// `as_ref()` past the end of the scope that holds the handle. +pub struct StagedDotD { + /// The tempdir owning `dotd`. `None` when the source is a local `.d` + /// that we borrow in place (no tempdir to drop). + _tempdir: Option, + dotd: PathBuf, +} + +impl StagedDotD { + pub(crate) fn owned(tempdir: tempfile::TempDir, dotd: PathBuf) -> Self { + Self { + _tempdir: Some(tempdir), + dotd, + } + } +} + +impl AsRef for StagedDotD { + fn as_ref(&self) -> &Path { + &self.dotd + } +} + +/// MVP-only staging backend: per-run tempdir, deleted when the +/// `StagedDotD` drops. The Phase 9 startup sweep (added later) will reclaim +/// any tempdirs leaked by abnormal termination. +pub struct PerRunTempdir { + cfg: StagingConfig, +} + +impl PerRunTempdir { + pub fn new(cfg: StagingConfig) -> Result { + if cfg.stale_sweep_age_hours > 0 { + let root = cfg.tempdir_root.clone().unwrap_or_else(std::env::temp_dir); + sweep_stale(&root, cfg.stale_sweep_age_hours)?; + } + Ok(Self { cfg }) + } + + pub fn config(&self) -> &StagingConfig { + &self.cfg + } + + /// Create a namespaced tempdir under the configured root. The + /// `timsseek-staging-` prefix is what the startup sweep uses to + /// identify our tempdirs. + pub(crate) fn new_tempdir(&self) -> Result { + let builder = tempfile::Builder::new() + .prefix("timsseek-staging-") + .to_owned(); + let td = match &self.cfg.tempdir_root { + Some(root) => builder.tempdir_in(root).map_err(StageError::Io)?, + None => builder.tempdir().map_err(StageError::Io)?, + }; + Ok(td) + } +} + +impl StagingBackend for PerRunTempdir { + fn stage(&self, spec: &SourceSpec) -> Result { + match spec { + SourceSpec::S3Tar { .. } => crate::tar::stage_s3_tar(self, spec), + SourceSpec::S3Prefix { .. } => crate::prefix::stage_s3_prefix(self, spec), + SourceSpec::LocalTar { .. } => crate::tar::stage_local_tar(self, spec), + } + } +} + +fn sweep_stale(root: &Path, age_hours: u64) -> Result<(), StageError> { + let threshold = std::time::SystemTime::now() + .checked_sub(std::time::Duration::from_secs(age_hours * 3600)) + .unwrap_or(std::time::UNIX_EPOCH); + let rd = match std::fs::read_dir(root) { + Ok(x) => x, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(()), + Err(e) => return Err(StageError::Io(e)), + }; + for entry in rd.flatten() { + let name = entry.file_name().to_string_lossy().to_string(); + if !name.starts_with("timsseek-staging-") { + continue; + } + let meta = match entry.metadata() { + Ok(m) => m, + Err(_) => continue, + }; + if !meta.is_dir() { + continue; + } + let path = entry.path(); + let has_lock = path.join(".lock").exists(); + let mtime = meta.modified().ok(); + if has_lock { + // Live session. Only force-remove if it's very stale (2x threshold) + // AND still claims to be locked — best-effort for crashed runs. + if let Some(mt) = mtime { + let hard_threshold = std::time::SystemTime::now() + .checked_sub(std::time::Duration::from_secs(age_hours * 2 * 3600)) + .unwrap_or(std::time::UNIX_EPOCH); + if mt < hard_threshold { + tracing::info!(?path, "removing very-stale locked tempdir"); + let _ = std::fs::remove_dir_all(&path); + } + } + continue; + } + if let Some(mt) = mtime { + if mt < threshold { + tracing::info!(?path, "sweeping stale tempdir"); + let _ = std::fs::remove_dir_all(&path); + } + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn default_config_has_256_cap_and_24h_sweep() { + let c = StagingConfig::default(); + assert_eq!(c.max_prefix_keys, 256); + assert_eq!(c.stale_sweep_age_hours, 24); + } + + #[test] + fn tempdir_name_has_namespace_prefix() { + let backend = PerRunTempdir::new(StagingConfig::default()).unwrap(); + let td = backend.new_tempdir().unwrap(); + let name = td.path().file_name().unwrap().to_string_lossy().to_string(); + assert!(name.starts_with("timsseek-staging-"), "got {}", name); + } +} diff --git a/rust/tims_stage/src/common.rs b/rust/tims_stage/src/common.rs new file mode 100644 index 00000000..b886dd61 --- /dev/null +++ b/rust/tims_stage/src/common.rs @@ -0,0 +1,37 @@ +//! Shared internal constants + helpers for the staging impls. + +use crate::error::{ + StageError, + redact_uri, +}; +use timscentroid::serialization::SerializationError; + +/// Filenames required inside a `.d` for it to be openable by timsrust. +pub(crate) const REQUIRED_DOTD_FILES: &[&str] = &["analysis.tdf", "analysis.tdf_bin"]; + +/// TTY-aware progress bar. Returns a hidden bar in non-TTY environments so +/// stderr isn't spammed when piped to a file. +pub(crate) fn make_bar(size: u64, label: &str) -> indicatif::ProgressBar { + use std::io::IsTerminal; + if !std::io::stderr().is_terminal() { + return indicatif::ProgressBar::hidden(); + } + let bar = indicatif::ProgressBar::new(size); + bar.set_style( + indicatif::ProgressStyle::with_template( + "{spinner:.green} {msg} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {bytes}/{total_bytes} ({eta})", + ) + .unwrap(), + ); + bar.set_message(label.to_string()); + bar +} + +/// Wrap a `SerializationError` into a `StageError::Transport` with a +/// query-stripped URI so presigned credentials never hit the error chain. +pub(crate) fn transport_err(uri: &str) -> impl FnOnce(SerializationError) -> StageError + '_ { + move |e| StageError::Transport { + uri: redact_uri(uri), + source: e, + } +} diff --git a/rust/tims_stage/src/download.rs b/rust/tims_stage/src/download.rs new file mode 100644 index 00000000..a2beb784 --- /dev/null +++ b/rust/tims_stage/src/download.rs @@ -0,0 +1,48 @@ +//! Stream a URI to a local file. +//! +//! For URIs that may exceed RAM (speclib parquet, large FASTAs), this is the +//! right primitive — it streams via `StorageProvider::get_to_file` for remote +//! and is a plain file copy for local. Unlike [`crate::open_reader`] it does +//! not buffer the whole payload in memory, so no size cap is imposed. + +use crate::common::transport_err; +use crate::error::StageError; +use crate::uri::{ + is_remote_uri, + split_uri, +}; +use std::path::Path; +use timscentroid::StorageProvider; + +pub fn download_to_file(uri: &str, dst: &Path) -> Result<(), StageError> { + if let Some(parent) = dst.parent() { + std::fs::create_dir_all(parent).map_err(StageError::Io)?; + } + if is_remote_uri(uri) { + let (loc, key) = split_uri(uri)?; + let provider = StorageProvider::open(loc).map_err(transport_err(uri))?; + let bar = indicatif::ProgressBar::hidden(); + provider + .get_to_file(&key, dst, &bar) + .map_err(transport_err(uri))?; + } else { + std::fs::copy(uri, dst).map_err(StageError::Io)?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn copies_local() { + let dir = TempDir::new().unwrap(); + let src = dir.path().join("src.bin"); + let dst = dir.path().join("nested/dst.bin"); + std::fs::write(&src, b"payload").unwrap(); + download_to_file(src.to_str().unwrap(), &dst).unwrap(); + assert_eq!(std::fs::read(&dst).unwrap(), b"payload"); + } +} diff --git a/rust/tims_stage/src/error.rs b/rust/tims_stage/src/error.rs new file mode 100644 index 00000000..16f8bd38 --- /dev/null +++ b/rust/tims_stage/src/error.rs @@ -0,0 +1,75 @@ +//! Errors produced by the tims_stage crate. + +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum StageError { + #[error("unknown URI suffix: {0} (expected .idx, .d, .d/, or .tar)")] + UnknownSuffix(String), + + #[error("URI malformed: {0}")] + InvalidUri(String), + + #[error("prefix '{prefix}' yields more than {cap} entries; refusing to list")] + PrefixCapExceeded { prefix: String, cap: usize }, + + #[error(".d source missing required basenames {missing:?}; found {found:?}")] + MissingRequiredFiles { + missing: Vec, + found: Vec, + }, + + #[error( + "GNU long-name tar entries are not supported; rebuild with `tar --format=ustar` or upload as an S3 prefix" + )] + UnsupportedTarFeature, + + #[error("tar payload short read: expected {expected} bytes, got {actual}")] + ShortRead { expected: u64, actual: u64 }, + + #[error("shape mismatch: {0}")] + ShapeMismatch(String), + + #[error("storage transport error at {uri}: {source}")] + Transport { + uri: String, + #[source] + source: timscentroid::serialization::SerializationError, + }, + + #[error("io error: {0}")] + Io(#[from] std::io::Error), + + #[error( + "payload at {uri} is {size} bytes, exceeds in-memory load cap of {cap}; \ + use a streaming/staging path instead" + )] + PayloadTooLarge { uri: String, size: u64, cap: u64 }, +} + +/// Strip query strings from a URI before embedding in an error message — +/// presigned-style URLs carry credentials in the query. +pub(crate) fn redact_uri(uri: &str) -> String { + match uri.split_once('?') { + Some((base, _)) => base.to_string(), + None => uri.to_string(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn redact_uri_strips_query_string() { + assert_eq!(redact_uri("s3://bkt/key?X-Sig=abc"), "s3://bkt/key"); + assert_eq!(redact_uri("/tmp/a.bin"), "/tmp/a.bin"); + } + + #[test] + fn stage_error_display_has_context() { + let e = StageError::UnknownSuffix("/foo.txt".into()); + let s = format!("{e}"); + assert!(s.contains("/foo.txt")); + } +} diff --git a/rust/tims_stage/src/lib.rs b/rust/tims_stage/src/lib.rs new file mode 100644 index 00000000..2b4bea7b --- /dev/null +++ b/rust/tims_stage/src/lib.rs @@ -0,0 +1,49 @@ +//! S3-aware input resolution and staging for TimsTOF pipelines. +//! +//! This crate implements Layers 1 (resolution) and 2 (staging policy) of the +//! design at `docs/superpowers/specs/2026-04-21-s3-staging-design.md`. Layer 0 +//! primitives live in `timscentroid::storage`. Layer 3 orchestration lives in +//! `timsquery`. + +pub mod backend; +pub(crate) mod common; +pub mod download; +pub mod error; +pub mod open; +pub mod prefix; +pub mod resolve; +pub mod tar; +pub mod upload; +pub mod uri; + +pub use backend::{ + PerRunTempdir, + StagedDotD, + StagingBackend, + StagingConfig, +}; +pub use download::download_to_file; +pub use error::StageError; +pub use open::{ + DEFAULT_IN_MEMORY_CAP, + open_reader, + open_reader_with_cap, + uri_exists, +}; +pub use resolve::{ + Resolved, + SourceSpec, + resolve, +}; +pub use upload::{ + DEFAULT_UPLOAD_CAP, + upload_file, + upload_file_with_cap, +}; +pub use uri::{ + canonical_uri, + expand_local_uri, + is_remote_uri, + sidecar_of, + split_uri, +}; diff --git a/rust/tims_stage/src/open.rs b/rust/tims_stage/src/open.rs new file mode 100644 index 00000000..9f691039 --- /dev/null +++ b/rust/tims_stage/src/open.rs @@ -0,0 +1,123 @@ +//! Generic URI → sync `Read` for payloads that fit in RAM. +//! +//! Remote URIs are fetched whole via `StorageProvider::get_bytes` and wrapped +//! in `std::io::Cursor`. Callers see a boxed `Read`. Local paths get a +//! `BufReader`. Rationale: the `object_store` GetResult body stream is +//! async; bridging it to a sync `Read` via `block_on_or_in_place` on every +//! `read()` call is reentrancy-hostile under rayon, so we front-load into a +//! single buffer. +//! +//! **This path is only appropriate for small inputs** (speclibs, FASTAs, +//! peptide lists — typically MBs to low-GB). The default cap is +//! [`DEFAULT_IN_MEMORY_CAP`] (4 GiB); remote payloads that exceed this abort +//! with [`StageError::PayloadTooLarge`] *before* the full GET. For streaming +//! reads of larger objects (e.g. raw `.d` bundles), use the tar / prefix +//! staging path instead. +//! +//! Callers that know their payload is bounded by some stricter limit can call +//! [`open_reader_with_cap`] directly. +//! +//! Local files are not size-checked: they do not count against the remote +//! cap because `File::open` is cheap and callers already control local disk. + +use crate::common::transport_err; +use crate::error::StageError; +use crate::uri::{ + is_remote_uri, + split_uri, +}; +use bytes::Bytes; +use std::io::{ + Cursor, + Read, +}; +use timscentroid::StorageProvider; + +/// Default ceiling on the size of a remote payload loaded into RAM via +/// [`open_reader`]. Intended to catch "someone pointed this at a raw .d" bugs +/// early, not to be a production resource limit. +pub const DEFAULT_IN_MEMORY_CAP: u64 = 4 * 1024 * 1024 * 1024; + +pub fn open_reader(uri: &str) -> Result, StageError> { + open_reader_with_cap(uri, DEFAULT_IN_MEMORY_CAP) +} + +/// Cheap existence probe for a URI. Remote URIs dispatch a HEAD through the +/// provider; local paths call `Path::exists`. Network failures surface as +/// [`StageError::Transport`] — only `NotFound` translates to `Ok(false)`. +pub fn uri_exists(uri: &str) -> Result { + if is_remote_uri(uri) { + let (loc, key) = split_uri(uri)?; + let provider = StorageProvider::open(loc).map_err(transport_err(uri))?; + provider.exists(&key).map_err(transport_err(uri)) + } else { + Ok(std::path::Path::new(uri).exists()) + } +} + +pub fn open_reader_with_cap(uri: &str, max_bytes: u64) -> Result, StageError> { + if is_remote_uri(uri) { + let bytes = fetch_remote_bytes(uri, max_bytes)?; + Ok(Box::new(Cursor::new(bytes))) + } else { + let f = std::fs::File::open(uri).map_err(StageError::Io)?; + Ok(Box::new(std::io::BufReader::new(f))) + } +} + +fn fetch_remote_bytes(uri: &str, max_bytes: u64) -> Result { + let (loc, key) = split_uri(uri)?; + let provider = StorageProvider::open(loc).map_err(transport_err(uri))?; + let meta = provider.head(&key).map_err(transport_err(uri))?; + let size = meta.size; + if size > max_bytes { + return Err(StageError::PayloadTooLarge { + uri: crate::error::redact_uri(uri).to_string(), + size, + cap: max_bytes, + }); + } + provider.get_bytes(&key).map_err(transport_err(uri)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Read; + use tempfile::TempDir; + + #[test] + fn reads_local_file() { + let dir = TempDir::new().unwrap(); + let p = dir.path().join("a.txt"); + std::fs::write(&p, b"hello").unwrap(); + let mut r = open_reader(p.to_str().unwrap()).unwrap(); + let mut s = String::new(); + r.read_to_string(&mut s).unwrap(); + assert_eq!(s, "hello"); + } + + #[test] + fn local_file_bypasses_cap() { + // Local files are not size-checked; pick a trivially small cap and + // show it doesn't matter for local paths. + let dir = TempDir::new().unwrap(); + let p = dir.path().join("a.txt"); + std::fs::write(&p, b"hello").unwrap(); + let mut r = open_reader_with_cap(p.to_str().unwrap(), 1).unwrap(); + let mut s = String::new(); + r.read_to_string(&mut s).unwrap(); + assert_eq!(s, "hello"); + } + + #[test] + fn errors_for_missing_local_file() { + let result = open_reader("/definitely/not/a/real/file.txt"); + assert!(result.is_err()); + if let Err(StageError::Io(_)) = result { + // success + } else { + panic!("expected StageError::Io"); + } + } +} diff --git a/rust/tims_stage/src/prefix.rs b/rust/tims_stage/src/prefix.rs new file mode 100644 index 00000000..074f84b0 --- /dev/null +++ b/rust/tims_stage/src/prefix.rs @@ -0,0 +1,76 @@ +use crate::backend::{ + PerRunTempdir, + StagedDotD, +}; +use crate::common::{ + REQUIRED_DOTD_FILES as REQUIRED, + make_bar, + transport_err, +}; +use crate::error::StageError; +use crate::resolve::SourceSpec; +use timscentroid::StorageProvider; + +pub(crate) fn stage_s3_prefix( + backend: &PerRunTempdir, + spec: &SourceSpec, +) -> Result { + let SourceSpec::S3Prefix { loc, prefix } = spec else { + unreachable!() + }; + let step = timscentroid::TimedStep::begin("Staging .d (prefix)"); + let uri_for_err = format!("{loc:?}/{prefix}"); + + // Read-only: do not create local dirs. + let provider = StorageProvider::open(loc.clone()).map_err(transport_err(&uri_for_err))?; + + let listing = provider + .list_capped(prefix, backend.config().max_prefix_keys) + .map_err(|e| match e { + timscentroid::serialization::SerializationError::PrefixCapExceeded { cap, prefix } => { + StageError::PrefixCapExceeded { cap, prefix } + } + other => StageError::Transport { + uri: crate::error::redact_uri(&uri_for_err), + source: other, + }, + })?; + + // Filter to required basenames. + let mut by_name: std::collections::HashMap<&'static str, _> = Default::default(); + for meta in &listing { + let full = meta.location.to_string(); + let bn = full.rsplit('/').next().unwrap_or(full.as_str()).to_string(); + if let Some(r) = REQUIRED.iter().find(|r| ***r == bn) { + by_name.insert(*r, meta); + } + } + let missing: Vec = REQUIRED + .iter() + .filter(|r| !by_name.contains_key(*r)) + .map(|s| s.to_string()) + .collect(); + if !missing.is_empty() { + return Err(StageError::MissingRequiredFiles { + missing, + found: listing.iter().map(|m| m.location.to_string()).collect(), + }); + } + + let tempdir = backend.new_tempdir()?; + let dotd = tempdir.path().join("sample.d"); + std::fs::create_dir_all(&dotd).map_err(StageError::Io)?; + std::fs::File::create(tempdir.path().join(".lock")).map_err(StageError::Io)?; + + for bn in REQUIRED { + let meta = by_name.get(bn).expect("preflight guarantees presence"); + let bar = make_bar(meta.size, bn); + let full_key = meta.location.to_string(); + provider + .get_to_file(&full_key, &dotd.join(bn), &bar) + .map_err(transport_err(&full_key))?; + bar.finish_and_clear(); + } + step.finish(); + Ok(StagedDotD::owned(tempdir, dotd)) +} diff --git a/rust/tims_stage/src/resolve.rs b/rust/tims_stage/src/resolve.rs new file mode 100644 index 00000000..e055c24e --- /dev/null +++ b/rust/tims_stage/src/resolve.rs @@ -0,0 +1,204 @@ +//! Decision-tree resolution from URI string to Resolved branch. +//! +//! See `docs/superpowers/specs/2026-04-21-s3-staging-design.md` §"Resolution +//! decision tree". + +use crate::error::{ + StageError, + redact_uri, +}; +use crate::uri::{ + LocKind, + NameKind, + parse_uri_shape, + sidecar_of, +}; +use std::path::PathBuf; +use timscentroid::{ + StorageLocation, + StorageProvider, +}; + +#[derive(Debug)] +pub enum Resolved { + /// An `.idx` directory on the local filesystem. + LocalIdx { loc: StorageLocation }, + /// An `.idx` directory in remote object storage. + RemoteIdx { loc: StorageLocation }, + /// A `.d` directory on the local filesystem. + LocalDotD { path: PathBuf }, + /// Requires materialization to a local tempdir before indexing. + Stageable { spec: SourceSpec }, +} + +#[derive(Debug, Clone)] +pub enum SourceSpec { + S3Tar { + loc: StorageLocation, + key: String, + }, + S3Prefix { + loc: StorageLocation, + prefix: String, + }, + LocalTar { + path: PathBuf, + }, +} + +/// Resolve a URI to a concrete Resolved variant. May perform at most one +/// remote round-trip: a HEAD for `/metadata.json`. No staging +/// or downloading happens here. +pub fn resolve(uri: &str) -> Result { + let shape = parse_uri_shape(uri)?; + + // If user already pointed at an .idx, no sidecar hunt. + if shape.name == NameKind::Idx { + let loc = idx_location_for(uri)?; + return Ok(match shape.loc { + LocKind::Local => Resolved::LocalIdx { loc }, + LocKind::Remote => Resolved::RemoteIdx { loc }, + }); + } + + // Check for an .idx sidecar. The `.idx` is a DIRECTORY containing + // `metadata.json` + parquet shards; probe by existence of that metadata + // file inside the directory. + let sidecar = sidecar_of(uri); + if sidecar_ready(&sidecar)? { + let loc = idx_location_for(&sidecar)?; + return Ok(match shape.loc { + LocKind::Local => Resolved::LocalIdx { loc }, + LocKind::Remote => Resolved::RemoteIdx { loc }, + }); + } + + // No sidecar. Dispatch by raw-data shape. + match (shape.loc, shape.name) { + (LocKind::Local, NameKind::DotD) => Ok(Resolved::LocalDotD { + path: PathBuf::from(uri.trim_end_matches('/')), + }), + (LocKind::Local, NameKind::Tar) => Ok(Resolved::Stageable { + spec: SourceSpec::LocalTar { + path: PathBuf::from(uri), + }, + }), + (LocKind::Remote, NameKind::DotD) => { + let (loc, prefix) = crate::uri::split_uri(uri)?; + Ok(Resolved::Stageable { + spec: SourceSpec::S3Prefix { loc, prefix }, + }) + } + (LocKind::Remote, NameKind::Tar) => { + let (loc, key) = crate::uri::split_uri(uri)?; + Ok(Resolved::Stageable { + spec: SourceSpec::S3Tar { loc, key }, + }) + } + (_, NameKind::Idx) => unreachable!("handled above"), + } +} + +/// Build a `StorageLocation` that ROOTS at the `.idx` directory itself. +/// For local paths: `StorageLocation::Local()`. For remote URIs: +/// `StorageLocation::Url()`. +fn idx_location_for(idx_uri: &str) -> Result { + let trimmed = idx_uri.trim_end_matches('/'); + if crate::uri::is_remote_uri(trimmed) { + StorageLocation::from_url(trimmed) + .map_err(|e| StageError::InvalidUri(format!("{idx_uri}: {e}"))) + } else { + Ok(StorageLocation::from_path(trimmed)) + } +} + +/// Probe whether an `.idx` directory is ready by checking for its +/// `metadata.json`. Uses the non-creating `StorageProvider::open` so we +/// don't inadvertently materialize local directories during resolution. +fn sidecar_ready(sidecar_uri: &str) -> Result { + let loc = match idx_location_for(sidecar_uri) { + Ok(l) => l, + Err(_) => return Ok(false), + }; + let provider = match StorageProvider::open(loc) { + Ok(p) => p, + Err(_) => return Ok(false), + }; + provider + .exists("metadata.json") + .map_err(|e| StageError::Transport { + uri: redact_uri(sidecar_uri), + source: e, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + /// Build a fake `.idx` directory that passes the sidecar probe: it must + /// contain a `metadata.json` file. Content doesn't matter for resolve(). + fn fake_idx_dir(root: &std::path::Path, name: &str) -> std::path::PathBuf { + let p = root.join(name); + std::fs::create_dir(&p).unwrap(); + std::fs::write(p.join("metadata.json"), b"{}").unwrap(); + p + } + + #[test] + fn local_idx_resolves_directly() { + let dir = TempDir::new().unwrap(); + let idx = fake_idx_dir(dir.path(), "sample.d.idx"); + let r = resolve(idx.to_str().unwrap()).unwrap(); + assert!(matches!(r, Resolved::LocalIdx { .. })); + } + + #[test] + fn local_dotd_with_sidecar_becomes_local_idx() { + let dir = TempDir::new().unwrap(); + let dotd = dir.path().join("sample.d"); + std::fs::create_dir(&dotd).unwrap(); + fake_idx_dir(dir.path(), "sample.d.idx"); + let r = resolve(dotd.to_str().unwrap()).unwrap(); + assert!(matches!(r, Resolved::LocalIdx { .. })); + } + + #[test] + fn local_dotd_without_sidecar_becomes_local_dotd() { + let dir = TempDir::new().unwrap(); + let dotd = dir.path().join("sample.d"); + std::fs::create_dir(&dotd).unwrap(); + let r = resolve(dotd.to_str().unwrap()).unwrap(); + assert!(matches!(r, Resolved::LocalDotD { .. })); + } + + #[test] + fn local_tar_without_sidecar_becomes_stageable_localtar() { + let dir = TempDir::new().unwrap(); + let tar = dir.path().join("sample.d.tar"); + std::fs::write(&tar, b"fake").unwrap(); + let r = resolve(tar.to_str().unwrap()).unwrap(); + assert!(matches!( + r, + Resolved::Stageable { + spec: SourceSpec::LocalTar { .. } + } + )); + } + + #[test] + fn local_dotd_with_empty_idx_dir_does_not_shortcut() { + // An .idx dir with no metadata.json is NOT a valid sidecar. + let dir = TempDir::new().unwrap(); + let dotd = dir.path().join("sample.d"); + std::fs::create_dir(&dotd).unwrap(); + std::fs::create_dir(dir.path().join("sample.d.idx")).unwrap(); + let r = resolve(dotd.to_str().unwrap()).unwrap(); + assert!( + matches!(r, Resolved::LocalDotD { .. }), + "empty .idx dir should not shortcut; got {:?}", + r + ); + } +} diff --git a/rust/tims_stage/src/tar.rs b/rust/tims_stage/src/tar.rs new file mode 100644 index 00000000..788d985f --- /dev/null +++ b/rust/tims_stage/src/tar.rs @@ -0,0 +1,467 @@ +use crate::backend::{ + PerRunTempdir, + StagedDotD, +}; +use crate::common::{ + REQUIRED_DOTD_FILES as REQUIRED, + make_bar, + transport_err, +}; +use crate::error::StageError; +use crate::resolve::SourceSpec; +use bytes::Bytes; +use std::collections::BTreeMap; +use std::io::{ + Read, + Seek, + SeekFrom, +}; +use std::path::Path; +use timscentroid::{ + StorageLocation, + StorageProvider, +}; + +const BLOCK: usize = 512; +const PREFETCH: usize = 64 * 1024; + +/// Abstraction over "something we can pull byte ranges out of". +/// +/// `read_range` is for small header reads (up to a few KiB). For large +/// payload copies (often GBs), use `copy_range_to_file` which streams — the +/// S3 impl routes that through one range-GET whose response body writes +/// directly to disk. Never loop `read_range` for big payloads: each call +/// is a separate HTTP GET on S3. +pub(crate) trait TarReader { + fn read_range(&mut self, offset: u64, len: usize) -> Result; + fn total_len(&self) -> u64; + fn copy_range_to_file( + &mut self, + offset: u64, + size: u64, + dst: &Path, + bar: &indicatif::ProgressBar, + ) -> Result<(), StageError>; +} + +/// Range-GET backed reader over an S3 object. Prefetches the first 64 KiB at +/// construction so walks that fit in the prefetch window complete in one +/// round-trip. +pub(crate) struct S3TarReader { + provider: StorageProvider, + key: String, + size: u64, + /// Prefix buffer covering `[0, prefix.len())` bytes of the object. + prefix: Bytes, +} + +impl S3TarReader { + pub(crate) fn new(loc: &StorageLocation, key: &str) -> Result { + let uri_for_err = format!("{loc:?}/{key}"); + let provider = StorageProvider::open(loc.clone()).map_err(transport_err(&uri_for_err))?; + let meta = provider.head(key).map_err(transport_err(&uri_for_err))?; + let size = meta.size; + let window = std::cmp::min(PREFETCH as u64, size); + let prefix = if window == 0 { + Bytes::new() + } else { + provider + .range_get(key, 0..window) + .map_err(transport_err(&uri_for_err))? + }; + Ok(Self { + provider, + key: key.to_string(), + size, + prefix, + }) + } +} + +impl TarReader for S3TarReader { + fn read_range(&mut self, offset: u64, len: usize) -> Result { + let end = offset + .checked_add(len as u64) + .ok_or_else(|| StageError::ShapeMismatch("offset overflow".into()))?; + if end <= self.prefix.len() as u64 { + let start = offset as usize; + return Ok(self.prefix.slice(start..start + len)); + } + let got = self + .provider + .range_get(&self.key, offset..end) + .map_err(transport_err(&self.key))?; + if got.len() < len { + return Err(StageError::ShortRead { + expected: len as u64, + actual: got.len() as u64, + }); + } + Ok(got) + } + + fn total_len(&self) -> u64 { + self.size + } + + fn copy_range_to_file( + &mut self, + offset: u64, + size: u64, + dst: &Path, + bar: &indicatif::ProgressBar, + ) -> Result<(), StageError> { + let end = offset + .checked_add(size) + .ok_or_else(|| StageError::ShapeMismatch("offset+size overflow".into()))?; + self.provider + .range_get_to_file(&self.key, offset..end, dst, bar) + .map_err(transport_err(&self.key)) + } +} + +/// File-backed reader over a local `.tar`. +pub(crate) struct LocalTarReader { + file: std::fs::File, + size: u64, +} + +impl LocalTarReader { + pub(crate) fn new(path: &Path) -> Result { + let file = std::fs::File::open(path).map_err(StageError::Io)?; + let size = file.metadata().map_err(StageError::Io)?.len(); + Ok(Self { file, size }) + } +} + +impl TarReader for LocalTarReader { + fn read_range(&mut self, offset: u64, len: usize) -> Result { + self.file + .seek(SeekFrom::Start(offset)) + .map_err(StageError::Io)?; + let mut buf = vec![0u8; len]; + self.file.read_exact(&mut buf).map_err(StageError::Io)?; + Ok(Bytes::from(buf)) + } + + fn total_len(&self) -> u64 { + self.size + } + + fn copy_range_to_file( + &mut self, + offset: u64, + size: u64, + dst: &Path, + bar: &indicatif::ProgressBar, + ) -> Result<(), StageError> { + use std::io::Write; + if let Some(parent) = dst.parent() { + std::fs::create_dir_all(parent).map_err(StageError::Io)?; + } + let mut out = std::fs::File::create(dst).map_err(StageError::Io)?; + self.file + .seek(SeekFrom::Start(offset)) + .map_err(StageError::Io)?; + let mut remaining = size; + let mut buf = vec![0u8; 4 * 1024 * 1024]; + while remaining > 0 { + let take = std::cmp::min(buf.len() as u64, remaining) as usize; + self.file + .read_exact(&mut buf[..take]) + .map_err(StageError::Io)?; + out.write_all(&buf[..take]).map_err(StageError::Io)?; + bar.inc(take as u64); + remaining -= take as u64; + } + Ok(()) + } +} + +// -------- ustar header parsing -------- + +#[derive(Debug, Clone)] +struct TarHeader { + name: String, + size: u64, + typeflag: u8, +} + +fn parse_header(block: &[u8]) -> Result { + if block.len() != BLOCK { + return Err(StageError::ShapeMismatch(format!( + "expected 512 B header, got {}", + block.len() + ))); + } + let name = { + let raw = &block[0..100]; + let end = raw.iter().position(|&b| b == 0).unwrap_or(raw.len()); + std::str::from_utf8(&raw[..end]) + .map_err(|e| StageError::ShapeMismatch(format!("header name not UTF-8: {e}")))? + .to_string() + }; + let size = parse_octal(&block[124..136])?; + let typeflag = block[156]; + Ok(TarHeader { + name, + size, + typeflag, + }) +} + +fn parse_octal(raw: &[u8]) -> Result { + let s = std::str::from_utf8(raw) + .map_err(|e| StageError::ShapeMismatch(format!("octal field not ASCII: {e}")))? + .trim_end_matches(|c: char| c == '\0' || c == ' ') + .trim_start_matches(' '); + if s.is_empty() { + return Ok(0); + } + u64::from_str_radix(s, 8) + .map_err(|e| StageError::ShapeMismatch(format!("bad octal `{s}`: {e}"))) +} + +fn is_zero_block(b: &[u8]) -> bool { + b.iter().all(|&x| x == 0) +} + +fn basename_of(path: &str) -> &str { + path.rsplit('/').next().unwrap_or(path) +} + +/// Walk headers until all required basenames are located, or EOA is reached. +/// Returns a map `basename -> (payload_offset, payload_size)`. +/// +/// Typeflag handling: +/// - `'0'` / `0` : regular file — record if basename is required. +/// - `'x'` / `'g'`: pax extended header — skip payload. +/// - `'L'` / `'K'`: GNU long-name record — hard error. +/// - anything else: unknown — skip payload, don't record. +/// +/// Permissive-by-default; the corpus-inventory step will tell us which +/// branches can be pruned later. +fn walk_tar_index(reader: &mut dyn TarReader) -> Result, StageError> { + let mut out: BTreeMap = BTreeMap::new(); + let total = reader.total_len(); + let mut offset: u64 = 0; + let mut zero_run = 0; + while offset + BLOCK as u64 <= total { + let block = reader.read_range(offset, BLOCK)?; + if is_zero_block(&block) { + zero_run += 1; + if zero_run >= 2 { + break; + } + offset += BLOCK as u64; + continue; + } + zero_run = 0; + let h = parse_header(&block)?; + let payload_offset = offset + BLOCK as u64; + let padded = ((h.size + BLOCK as u64 - 1) / BLOCK as u64) * BLOCK as u64; + match h.typeflag { + b'0' | 0 => { + let bn = basename_of(&h.name).to_string(); + if REQUIRED.contains(&bn.as_str()) { + out.insert(bn, (payload_offset, h.size)); + } + } + b'x' | b'g' => { /* pax extended header — skip payload */ } + b'L' | b'K' => return Err(StageError::UnsupportedTarFeature), + _ => { /* unknown typeflag — skip payload */ } + } + offset = payload_offset + padded; + if REQUIRED.iter().all(|r| out.contains_key(*r)) { + break; + } + } + let missing: Vec = REQUIRED + .iter() + .filter(|r| !out.contains_key(**r)) + .map(|s| s.to_string()) + .collect(); + if !missing.is_empty() { + return Err(StageError::MissingRequiredFiles { + missing, + found: out.keys().cloned().collect(), + }); + } + Ok(out) +} + +// -------- stage helpers -------- + +pub(crate) fn stage_s3_tar( + backend: &PerRunTempdir, + spec: &SourceSpec, +) -> Result { + let SourceSpec::S3Tar { loc, key } = spec else { + unreachable!() + }; + let step = timscentroid::TimedStep::begin("Staging .d (tar)"); + let mut reader = S3TarReader::new(loc, key)?; + let index = walk_tar_index(&mut reader)?; + let staged = materialize(backend, &mut reader, &index)?; + step.finish(); + Ok(staged) +} + +pub(crate) fn stage_local_tar( + backend: &PerRunTempdir, + spec: &SourceSpec, +) -> Result { + let SourceSpec::LocalTar { path } = spec else { + unreachable!() + }; + let step = timscentroid::TimedStep::begin("Staging .d (local tar)"); + let mut reader = LocalTarReader::new(path)?; + let index = walk_tar_index(&mut reader)?; + let staged = materialize(backend, &mut reader, &index)?; + step.finish(); + Ok(staged) +} + +fn materialize( + backend: &PerRunTempdir, + reader: &mut dyn TarReader, + index: &BTreeMap, +) -> Result { + let tempdir = backend.new_tempdir()?; + let dotd = tempdir.path().join("sample.d"); + std::fs::create_dir_all(&dotd).map_err(StageError::Io)?; + std::fs::File::create(tempdir.path().join(".lock")).map_err(StageError::Io)?; + for bn in REQUIRED { + let (offset, size) = *index.get(*bn).expect("preflight guarantees presence"); + let bar = make_bar(size, bn); + reader.copy_range_to_file(offset, size, &dotd.join(bn), &bar)?; + bar.finish_and_clear(); + } + Ok(StagedDotD::owned(tempdir, dotd)) +} + +// -------- walker unit tests -------- + +#[cfg(test)] +mod walker_tests { + use super::*; + + fn build_ustar_header(name: &str, size: u64, typeflag: u8) -> Vec { + let mut buf = vec![0u8; BLOCK]; + let nb = name.as_bytes(); + buf[..nb.len()].copy_from_slice(nb); + let s = format!("{:011o}", size); + buf[124..124 + s.len()].copy_from_slice(s.as_bytes()); + buf[135] = 0; + buf[156] = typeflag; + for i in 148..156 { + buf[i] = b' '; + } + buf + } + + fn pad_to_block(payload: &mut Vec) { + let pad = (BLOCK - payload.len() % BLOCK) % BLOCK; + payload.extend(std::iter::repeat(0u8).take(pad)); + } + + struct InMemTar { + data: Vec, + } + + impl TarReader for InMemTar { + fn read_range(&mut self, offset: u64, len: usize) -> Result { + let o = offset as usize; + if o + len > self.data.len() { + return Err(StageError::ShortRead { + expected: len as u64, + actual: (self.data.len() - o) as u64, + }); + } + Ok(Bytes::copy_from_slice(&self.data[o..o + len])) + } + + fn total_len(&self) -> u64 { + self.data.len() as u64 + } + + fn copy_range_to_file( + &mut self, + offset: u64, + size: u64, + dst: &Path, + bar: &indicatif::ProgressBar, + ) -> Result<(), StageError> { + let bytes = self.read_range(offset, size as usize)?; + std::fs::write(dst, &bytes).map_err(StageError::Io)?; + bar.inc(bytes.len() as u64); + Ok(()) + } + } + + fn minimal_tar(entries: &[(&str, Vec, u8)]) -> Vec { + let mut out = Vec::new(); + for (name, payload, typeflag) in entries { + out.extend_from_slice(&build_ustar_header(name, payload.len() as u64, *typeflag)); + out.extend_from_slice(payload); + pad_to_block(&mut out); + } + out.extend(std::iter::repeat(0u8).take(BLOCK * 2)); + out + } + + #[test] + fn walker_finds_required_basenames_flat_tar() { + let data = minimal_tar(&[ + ("analysis.tdf", vec![1u8; 10], b'0'), + ("analysis.tdf_bin", vec![2u8; 4096], b'0'), + ]); + let mut r = InMemTar { data }; + let idx = walk_tar_index(&mut r).unwrap(); + assert!(idx.contains_key("analysis.tdf")); + assert!(idx.contains_key("analysis.tdf_bin")); + } + + #[test] + fn walker_finds_basenames_nested_in_sample_dot_d() { + let data = minimal_tar(&[ + ("sample.d/analysis.tdf", vec![1u8; 10], b'0'), + ("sample.d/analysis.tdf_bin", vec![2u8; 4096], b'0'), + ]); + let mut r = InMemTar { data }; + let idx = walk_tar_index(&mut r).unwrap(); + assert_eq!(idx.len(), 2); + } + + #[test] + fn walker_skips_pax_x_header() { + let data = minimal_tar(&[ + ("PaxHeader/extra", vec![1u8; 32], b'x'), + ("analysis.tdf", vec![1u8; 10], b'0'), + ("analysis.tdf_bin", vec![2u8; 4096], b'0'), + ]); + let mut r = InMemTar { data }; + let idx = walk_tar_index(&mut r).unwrap(); + assert_eq!(idx.len(), 2); + } + + #[test] + fn walker_errors_on_gnu_longname() { + let data = minimal_tar(&[ + ("LongNameBlock", vec![1u8; 32], b'L'), + ("analysis.tdf", vec![1u8; 10], b'0'), + ("analysis.tdf_bin", vec![2u8; 4096], b'0'), + ]); + let mut r = InMemTar { data }; + let err = walk_tar_index(&mut r).unwrap_err(); + assert!(matches!(err, StageError::UnsupportedTarFeature)); + } + + #[test] + fn walker_errors_on_missing_basenames() { + let data = minimal_tar(&[("some_other_file", vec![1u8; 10], b'0')]); + let mut r = InMemTar { data }; + let err = walk_tar_index(&mut r).unwrap_err(); + assert!(matches!(err, StageError::MissingRequiredFiles { .. })); + } +} diff --git a/rust/tims_stage/src/upload.rs b/rust/tims_stage/src/upload.rs new file mode 100644 index 00000000..41033fe4 --- /dev/null +++ b/rust/tims_stage/src/upload.rs @@ -0,0 +1,80 @@ +//! Upload a local file to a destination URI (local path or S3). +//! +//! The whole file is read into memory before the PUT. **Only suitable for +//! small outputs** (result JSONs, parquet shards, run reports). For the +//! default cap see [`DEFAULT_UPLOAD_CAP`]. To stream a large `.d` bundle +//! instead, bundle it into a tar and use the staging pipeline in reverse +//! (not yet implemented — avoid pointing `upload_file` at multi-GB payloads). + +use crate::common::transport_err; +use crate::error::StageError; +use crate::uri::{ + is_remote_uri, + split_uri, +}; +use std::path::Path; +use timscentroid::StorageProvider; + +/// Default ceiling on the size of a payload uploaded via [`upload_file`]. +/// Matches [`crate::open::DEFAULT_IN_MEMORY_CAP`] (4 GiB). +pub const DEFAULT_UPLOAD_CAP: u64 = 4 * 1024 * 1024 * 1024; + +pub fn upload_file(local: &Path, dest_uri: &str) -> Result<(), StageError> { + upload_file_with_cap(local, dest_uri, DEFAULT_UPLOAD_CAP) +} + +pub fn upload_file_with_cap( + local: &Path, + dest_uri: &str, + max_bytes: u64, +) -> Result<(), StageError> { + let size = std::fs::metadata(local).map_err(StageError::Io)?.len(); + if size > max_bytes { + return Err(StageError::PayloadTooLarge { + uri: crate::error::redact_uri(dest_uri).to_string(), + size, + cap: max_bytes, + }); + } + let bytes = std::fs::read(local).map_err(StageError::Io)?; + if is_remote_uri(dest_uri) { + let (loc, key) = split_uri(dest_uri)?; + // Writes go through `new` (creates parent dir for local). + let provider = StorageProvider::new(loc).map_err(transport_err(dest_uri))?; + provider + .write_bytes(&key, bytes) + .map_err(transport_err(dest_uri))?; + } else { + if let Some(parent) = Path::new(dest_uri).parent() { + std::fs::create_dir_all(parent).map_err(StageError::Io)?; + } + std::fs::write(dest_uri, &bytes).map_err(StageError::Io)?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn copies_local_to_local() { + let dir = TempDir::new().unwrap(); + let src = dir.path().join("src.bin"); + let dst = dir.path().join("subdir/dst.bin"); + std::fs::write(&src, b"payload").unwrap(); + upload_file(&src, dst.to_str().unwrap()).unwrap(); + assert_eq!(std::fs::read(&dst).unwrap(), b"payload"); + } + + #[test] + fn rejects_oversize_payload() { + let dir = TempDir::new().unwrap(); + let src = dir.path().join("src.bin"); + let dst = dir.path().join("dst.bin"); + std::fs::write(&src, b"1234567890").unwrap(); + let result = upload_file_with_cap(&src, dst.to_str().unwrap(), 5); + assert!(matches!(result, Err(StageError::PayloadTooLarge { .. }))); + } +} diff --git a/rust/tims_stage/src/uri.rs b/rust/tims_stage/src/uri.rs new file mode 100644 index 00000000..db54a96d --- /dev/null +++ b/rust/tims_stage/src/uri.rs @@ -0,0 +1,249 @@ +//! URI shape parsing + sidecar convention + single canonical splitter. + +use crate::error::StageError; +use std::path::Path; +use timscentroid::StorageLocation; + +/// Where an URI points — local filesystem or remote object store. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum LocKind { + Local, + Remote, +} + +/// What kind of artifact the URI names, by suffix. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum NameKind { + Idx, + DotD, + Tar, +} + +#[derive(Debug, Clone)] +pub(crate) struct UriShape { + pub loc: LocKind, + pub name: NameKind, + #[allow(dead_code)] + pub raw: String, +} + +/// Classify a URI string by scheme + suffix. Pure logic, no I/O. +pub(crate) fn parse_uri_shape(uri: &str) -> Result { + let trimmed = uri.trim_end_matches('/'); + let loc = if is_remote_uri(uri) { + LocKind::Remote + } else { + LocKind::Local + }; + let name = if trimmed.ends_with(".idx") { + NameKind::Idx + } else if trimmed.ends_with(".tar") { + NameKind::Tar + } else if trimmed.ends_with(".d") { + NameKind::DotD + } else { + return Err(StageError::UnknownSuffix(uri.to_string())); + }; + Ok(UriShape { + loc, + name, + raw: uri.to_string(), + }) +} + +/// Cheap test for a supported remote URI scheme. +/// Exported so consumers don't re-implement `uri.starts_with("s3://") || ...`. +pub fn is_remote_uri(uri: &str) -> bool { + remote_scheme_prefix(uri).is_some() +} + +/// Expand `~/` (and bare `~`) in local-path URIs to `$HOME`. Remote URIs +/// pass through unchanged. `Path::exists()` and `File::open()` don't +/// recognise `~`; callers pulling paths from config files / non-shell +/// sources should run this before any filesystem probe. +pub fn expand_local_uri(uri: &str) -> String { + if is_remote_uri(uri) { + return uri.to_string(); + } + let home = std::env::var_os("HOME"); + match (uri, &home) { + ("~", Some(h)) => h.to_string_lossy().into_owned(), + (u, Some(h)) if u.starts_with("~/") => { + let mut p = std::path::PathBuf::from(h); + p.push(&u[2..]); + p.to_string_lossy().into_owned() + } + _ => uri.to_string(), + } +} + +fn remote_scheme_prefix(uri: &str) -> Option<&'static str> { + if uri.starts_with("s3://") { + Some("s3://") + } else if uri.starts_with("gs://") { + Some("gs://") + } else if uri.starts_with("az://") { + Some("az://") + } else { + None + } +} + +/// Canonical `.idx` sidecar URI. Strips one trailing `/` so `sample.d/` → +/// `sample.d.idx`, not `sample.d/.idx`. Matches the existing index_serde +/// convention for local paths. +pub fn sidecar_of(uri: &str) -> String { + let trimmed = uri.strip_suffix('/').unwrap_or(uri); + format!("{trimmed}.idx") +} + +/// Split `s3://bucket/path/to/obj` into +/// (`StorageLocation::from_url("s3://bucket")`, `"path/to/obj"`). +/// For local paths, returns +/// (`StorageLocation::from_path(parent)`, `file_name`). One splitter used by +/// all downstream call sites. +pub fn split_uri(uri: &str) -> Result<(StorageLocation, String), StageError> { + let trimmed = uri.trim_end_matches('/'); + if let Some(scheme_prefix) = remote_scheme_prefix(trimmed) { + let rest = &trimmed[scheme_prefix.len()..]; + let (bucket, key) = rest.split_once('/').unwrap_or((rest, "")); + let loc = StorageLocation::from_url(format!("{scheme_prefix}{bucket}")) + .map_err(|e| StageError::InvalidUri(format!("{uri}: {e}")))?; + return Ok((loc, key.to_string())); + } + let p = Path::new(trimmed); + let parent = p.parent().unwrap_or_else(|| Path::new(".")); + let tail = p + .file_name() + .ok_or_else(|| StageError::InvalidUri(format!("missing filename in {uri}")))? + .to_string_lossy() + .to_string(); + Ok((StorageLocation::from_path(parent), tail)) +} + +/// Canonical URI used as a cache key / sidecar-origin identifier. +/// +/// - Strips exactly one trailing `/` if present (so `foo.d/` and `foo.d` +/// yield the same key). +/// - Preserves scheme + host + key verbatim for remote URIs (S3 keys are +/// case-sensitive). +/// - For local paths, runs `std::fs::canonicalize` when the path exists; +/// otherwise returns the trimmed input unchanged. +pub fn canonical_uri(uri: &str) -> String { + let trimmed = uri.trim_end_matches('/').to_string(); + if is_remote_uri(&trimmed) { + return trimmed; + } + match std::fs::canonicalize(&trimmed) { + Ok(p) => p.to_string_lossy().to_string(), + Err(_) => trimmed, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn recognizes_local_dotd() { + let s = parse_uri_shape("/tmp/sample.d").unwrap(); + assert_eq!(s.loc, LocKind::Local); + assert_eq!(s.name, NameKind::DotD); + } + + #[test] + fn recognizes_local_dotd_with_trailing_slash() { + let s = parse_uri_shape("/tmp/sample.d/").unwrap(); + assert_eq!(s.name, NameKind::DotD); + } + + #[test] + fn recognizes_s3_tar() { + let s = parse_uri_shape("s3://bkt/sample.d.tar").unwrap(); + assert_eq!(s.loc, LocKind::Remote); + assert_eq!(s.name, NameKind::Tar); + } + + #[test] + fn recognizes_s3_idx() { + let s = parse_uri_shape("s3://bkt/sample.d.idx").unwrap(); + assert_eq!(s.name, NameKind::Idx); + } + + #[test] + fn errors_on_unknown_suffix() { + let err = parse_uri_shape("/tmp/some.txt").unwrap_err(); + assert!(matches!(err, StageError::UnknownSuffix(_))); + } + + #[test] + fn sidecar_strips_trailing_slash() { + assert_eq!(sidecar_of("/tmp/sample.d"), "/tmp/sample.d.idx"); + assert_eq!(sidecar_of("/tmp/sample.d/"), "/tmp/sample.d.idx"); + } + + #[test] + fn sidecar_of_s3_prefix() { + assert_eq!(sidecar_of("s3://bkt/sample.d/"), "s3://bkt/sample.d.idx"); + } + + #[test] + fn sidecar_of_s3_tar() { + assert_eq!( + sidecar_of("s3://bkt/sample.d.tar"), + "s3://bkt/sample.d.tar.idx" + ); + } + + #[test] + fn splits_s3_uri() { + let (_loc, key) = split_uri("s3://bkt/prefix/a.bin").unwrap(); + assert_eq!(key, "prefix/a.bin"); + } + + #[test] + fn splits_s3_bucket_root() { + let (_loc, key) = split_uri("s3://bkt/a.bin").unwrap(); + assert_eq!(key, "a.bin"); + } + + #[test] + fn splits_local_path() { + let (_loc, key) = split_uri("/tmp/a.bin").unwrap(); + assert_eq!(key, "a.bin"); + } + + #[test] + fn canonical_uri_strips_trailing_slash_on_remote() { + assert_eq!(canonical_uri("s3://bkt/foo/"), "s3://bkt/foo"); + } + + #[test] + fn canonical_uri_local_existing_path_is_absolute() { + let d = tempfile::TempDir::new().unwrap(); + let p = d.path().join("x"); + std::fs::write(&p, b"").unwrap(); + let got = canonical_uri(p.to_str().unwrap()); + assert!(std::path::Path::new(&got).is_absolute()); + } + + #[test] + fn canonical_uri_local_nonexistent_returns_trimmed() { + let got = canonical_uri("/tmp/definitely-does-not-exist-xyz123/"); + assert_eq!(got, "/tmp/definitely-does-not-exist-xyz123"); + } + + #[test] + fn expand_local_uri_tilde() { + // SAFETY: test-only env mutation; tests are serial within this module. + unsafe { + std::env::set_var("HOME", "/home/test"); + } + assert_eq!(expand_local_uri("~/data/a.txt"), "/home/test/data/a.txt"); + assert_eq!(expand_local_uri("~"), "/home/test"); + assert_eq!(expand_local_uri("/abs/path"), "/abs/path"); + assert_eq!(expand_local_uri("relative/path"), "relative/path"); + // Remote URIs pass through even if they start with ~ by accident. + assert_eq!(expand_local_uri("s3://bkt/key"), "s3://bkt/key"); + } +} diff --git a/rust/tims_stage/tests/common/minio.rs b/rust/tims_stage/tests/common/minio.rs new file mode 100644 index 00000000..8b273237 --- /dev/null +++ b/rust/tims_stage/tests/common/minio.rs @@ -0,0 +1,10 @@ +//! MinIO test fixture (requires `MINIO_TEST_ENDPOINT` env var). +//! Returns None when unset; callers should skip the test gracefully. + +pub fn minio_endpoint() -> Option { + std::env::var("MINIO_TEST_ENDPOINT").ok() +} + +pub fn minio_bucket() -> String { + std::env::var("MINIO_TEST_BUCKET").unwrap_or_else(|_| "tims-stage-ci".to_string()) +} diff --git a/rust/tims_stage/tests/common/mod.rs b/rust/tims_stage/tests/common/mod.rs new file mode 100644 index 00000000..f6e21a85 --- /dev/null +++ b/rust/tims_stage/tests/common/mod.rs @@ -0,0 +1,3 @@ +//! Shared test fixtures. + +pub mod minio; diff --git a/rust/tims_stage/tests/local_tar_staging.rs b/rust/tims_stage/tests/local_tar_staging.rs new file mode 100644 index 00000000..6a4e66a3 --- /dev/null +++ b/rust/tims_stage/tests/local_tar_staging.rs @@ -0,0 +1,98 @@ +use std::io::Write; +use tempfile::TempDir; +use tims_stage::{ + PerRunTempdir, + SourceSpec, + StagingBackend, + StagingConfig, +}; + +fn write_ustar_header(buf: &mut Vec, name: &str, size: u64, typeflag: u8) { + let start = buf.len(); + buf.resize(start + 512, 0); + let blk = &mut buf[start..start + 512]; + let nb = name.as_bytes(); + blk[..nb.len()].copy_from_slice(nb); + let s = format!("{:011o}", size); + blk[124..124 + s.len()].copy_from_slice(s.as_bytes()); + blk[156] = typeflag; + for b in &mut blk[148..156] { + *b = b' '; + } +} + +fn pad(buf: &mut Vec) { + let p = (512 - buf.len() % 512) % 512; + buf.extend(std::iter::repeat(0u8).take(p)); +} + +#[test] +fn stage_local_tar_materializes_sample_d_wrapper() { + let dir = TempDir::new().unwrap(); + let tar_path = dir.path().join("sample.d.tar"); + let mut tar = Vec::new(); + let tdf_body = vec![1u8; 7]; + let tdf_bin_body = vec![2u8; 1024]; + write_ustar_header(&mut tar, "analysis.tdf", tdf_body.len() as u64, b'0'); + tar.extend_from_slice(&tdf_body); + pad(&mut tar); + write_ustar_header( + &mut tar, + "analysis.tdf_bin", + tdf_bin_body.len() as u64, + b'0', + ); + tar.extend_from_slice(&tdf_bin_body); + pad(&mut tar); + tar.extend(std::iter::repeat(0u8).take(1024)); + std::fs::File::create(&tar_path) + .unwrap() + .write_all(&tar) + .unwrap(); + + let backend = PerRunTempdir::new(StagingConfig::default()).unwrap(); + let spec = SourceSpec::LocalTar { path: tar_path }; + let staged = backend.stage(&spec).unwrap(); + + assert!(staged.as_ref().join("analysis.tdf").is_file()); + assert!(staged.as_ref().join("analysis.tdf_bin").is_file()); + assert_eq!( + std::fs::read(staged.as_ref().join("analysis.tdf")).unwrap(), + tdf_body + ); + assert_eq!( + std::fs::read(staged.as_ref().join("analysis.tdf_bin")).unwrap(), + tdf_bin_body + ); +} + +#[test] +fn stage_local_tar_tempdir_vanishes_on_drop() { + let dir = TempDir::new().unwrap(); + let tar_path = dir.path().join("sample.d.tar"); + let mut tar = Vec::new(); + write_ustar_header(&mut tar, "analysis.tdf", 3, b'0'); + tar.extend_from_slice(&[1, 2, 3]); + pad(&mut tar); + write_ustar_header(&mut tar, "analysis.tdf_bin", 3, b'0'); + tar.extend_from_slice(&[4, 5, 6]); + pad(&mut tar); + tar.extend(std::iter::repeat(0u8).take(1024)); + std::fs::File::create(&tar_path) + .unwrap() + .write_all(&tar) + .unwrap(); + + let backend = PerRunTempdir::new(StagingConfig::default()).unwrap(); + let spec = SourceSpec::LocalTar { path: tar_path }; + let tempdir_path_copy; + { + let staged = backend.stage(&spec).unwrap(); + tempdir_path_copy = staged.as_ref().parent().unwrap().to_path_buf(); + assert!(tempdir_path_copy.is_dir()); + } + assert!( + !tempdir_path_copy.is_dir(), + "tempdir should be gone after drop" + ); +} diff --git a/rust/tims_stage/tests/minio_smoke.rs b/rust/tims_stage/tests/minio_smoke.rs new file mode 100644 index 00000000..d7a9662e --- /dev/null +++ b/rust/tims_stage/tests/minio_smoke.rs @@ -0,0 +1,57 @@ +mod common; +use common::minio; + +/// Round-trips a synthetic `.d` prefix through MinIO. +/// +/// Marked `#[ignore]` so it never runs in the default suite. CI and local +/// dev must invoke explicitly: +/// +/// MINIO_TEST_ENDPOINT=http://localhost:9000 \ +/// MINIO_TEST_BUCKET=tims-stage-ci \ +/// AWS_ACCESS_KEY_ID=... AWS_SECRET_ACCESS_KEY=... \ +/// cargo test -p tims_stage --features aws --test minio_smoke -- --ignored +/// +/// Fixtures are seeded by the test itself (two tiny objects) so no external +/// bucket state is assumed. A missing `MINIO_TEST_ENDPOINT` is a hard failure +/// when the test runs — silent passes would mask CI misconfiguration. +#[test] +#[ignore = "requires MinIO endpoint + aws feature; run explicitly with --ignored"] +fn stage_s3_prefix_against_minio() { + let endpoint = minio::minio_endpoint() + .expect("MINIO_TEST_ENDPOINT must be set when running this ignored test"); + unsafe { + std::env::set_var("AWS_ENDPOINT_URL", &endpoint); + std::env::set_var("AWS_S3_FORCE_PATH_STYLE", "true"); + } + + let bucket = minio::minio_bucket(); + // Unique prefix per run so parallel CI jobs / repeated invocations don't collide. + let run_id = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos(); + let uri_prefix = format!("s3://{bucket}/smoke/{run_id}/sample.d/"); + + let tmp = tempfile::TempDir::new().unwrap(); + let tdf = tmp.path().join("analysis.tdf"); + let tdf_bin = tmp.path().join("analysis.tdf_bin"); + std::fs::write(&tdf, b"fake-tdf-payload").unwrap(); + std::fs::write(&tdf_bin, b"fake-tdf-bin-payload").unwrap(); + + tims_stage::upload_file(&tdf, &format!("{uri_prefix}analysis.tdf")).unwrap(); + tims_stage::upload_file(&tdf_bin, &format!("{uri_prefix}analysis.tdf_bin")).unwrap(); + + let backend = tims_stage::PerRunTempdir::new(tims_stage::StagingConfig::default()).unwrap(); + let resolved = tims_stage::resolve(&uri_prefix).unwrap(); + let spec = match resolved { + tims_stage::Resolved::Stageable { spec } => spec, + other => panic!("expected Stageable, got {other:?}"), + }; + let staged = + ::stage(&backend, &spec).unwrap(); + + let got_tdf = std::fs::read(staged.as_ref().join("analysis.tdf")).unwrap(); + let got_bin = std::fs::read(staged.as_ref().join("analysis.tdf_bin")).unwrap(); + assert_eq!(got_tdf, b"fake-tdf-payload"); + assert_eq!(got_bin, b"fake-tdf-bin-payload"); +} diff --git a/rust/tims_stage/tests/prefix_staging.rs b/rust/tims_stage/tests/prefix_staging.rs new file mode 100644 index 00000000..84d711df --- /dev/null +++ b/rust/tims_stage/tests/prefix_staging.rs @@ -0,0 +1,56 @@ +use tempfile::TempDir; +use tims_stage::{ + PerRunTempdir, + SourceSpec, + StagingBackend, + StagingConfig, +}; +use timscentroid::StorageLocation; + +#[test] +fn stage_s3_prefix_on_local_fs() { + let root = TempDir::new().unwrap(); + let sample_dir = root.path().join("sample.d"); + std::fs::create_dir(&sample_dir).unwrap(); + std::fs::write(sample_dir.join("analysis.tdf"), b"tdf").unwrap(); + std::fs::write(sample_dir.join("analysis.tdf_bin"), b"tdfbin").unwrap(); + + let loc = StorageLocation::from_path(root.path()); + let backend = PerRunTempdir::new(StagingConfig::default()).unwrap(); + let spec = SourceSpec::S3Prefix { + loc, + prefix: "sample.d".to_string(), + }; + let staged = backend.stage(&spec).unwrap(); + + assert_eq!( + std::fs::read(staged.as_ref().join("analysis.tdf")).unwrap(), + b"tdf" + ); + assert_eq!( + std::fs::read(staged.as_ref().join("analysis.tdf_bin")).unwrap(), + b"tdfbin" + ); +} + +#[test] +fn stage_s3_prefix_errors_on_missing_files() { + let root = TempDir::new().unwrap(); + let sample_dir = root.path().join("sample.d"); + std::fs::create_dir(&sample_dir).unwrap(); + std::fs::write(sample_dir.join("analysis.tdf"), b"tdf").unwrap(); + // missing analysis.tdf_bin + + let loc = StorageLocation::from_path(root.path()); + let backend = PerRunTempdir::new(StagingConfig::default()).unwrap(); + let spec = SourceSpec::S3Prefix { + loc, + prefix: "sample.d".to_string(), + }; + let err = match backend.stage(&spec) { + Ok(_) => panic!("expected error, got Ok"), + Err(e) => e, + }; + let msg = format!("{err}"); + assert!(msg.contains("analysis.tdf_bin"), "{msg}"); +} diff --git a/rust/tims_stage/tests/sweep.rs b/rust/tims_stage/tests/sweep.rs new file mode 100644 index 00000000..803afe15 --- /dev/null +++ b/rust/tims_stage/tests/sweep.rs @@ -0,0 +1,85 @@ +use filetime::FileTime; +use std::time::{ + Duration, + SystemTime, +}; +use tempfile::TempDir; +use tims_stage::{ + PerRunTempdir, + StagingConfig, +}; + +fn backdate(path: &std::path::Path, hours_ago: u64) { + let ts = SystemTime::now() - Duration::from_secs(hours_ago * 3600); + filetime::set_file_mtime(path, FileTime::from_system_time(ts)).unwrap(); +} + +#[test] +fn sweep_removes_old_unlocked_tempdirs() { + let root = TempDir::new().unwrap(); + let stale = root.path().join("timsseek-staging-ABC123"); + std::fs::create_dir(&stale).unwrap(); + backdate(&stale, 48); + let cfg = StagingConfig { + tempdir_root: Some(root.path().to_path_buf()), + stale_sweep_age_hours: 24, + ..StagingConfig::default() + }; + let _backend = PerRunTempdir::new(cfg).unwrap(); + assert!( + !stale.is_dir(), + "48h-old unlocked dir should have been swept" + ); +} + +#[test] +fn sweep_preserves_recent_tempdirs() { + let root = TempDir::new().unwrap(); + let fresh = root.path().join("timsseek-staging-FRESH"); + std::fs::create_dir(&fresh).unwrap(); + // mtime defaults to now; no backdating. + let cfg = StagingConfig { + tempdir_root: Some(root.path().to_path_buf()), + stale_sweep_age_hours: 24, + ..StagingConfig::default() + }; + let _backend = PerRunTempdir::new(cfg).unwrap(); + assert!(fresh.is_dir()); +} + +#[test] +fn sweep_preserves_locked_tempdirs() { + let root = TempDir::new().unwrap(); + let locked = root.path().join("timsseek-staging-LIVE"); + std::fs::create_dir(&locked).unwrap(); + std::fs::File::create(locked.join(".lock")).unwrap(); + backdate(&locked, 36); + let cfg = StagingConfig { + tempdir_root: Some(root.path().to_path_buf()), + stale_sweep_age_hours: 24, + ..StagingConfig::default() + }; + let _backend = PerRunTempdir::new(cfg).unwrap(); + assert!( + locked.is_dir(), + "locked dir should survive sweep when mtime < 2x threshold" + ); +} + +#[test] +fn sweep_ignores_unrelated_dirs() { + let root = TempDir::new().unwrap(); + let unrelated = root.path().join("my-important-data"); + std::fs::create_dir(&unrelated).unwrap(); + backdate(&unrelated, 48); + let cfg = StagingConfig { + tempdir_root: Some(root.path().to_path_buf()), + stale_sweep_age_hours: 24, + ..StagingConfig::default() + }; + let _backend = PerRunTempdir::new(cfg).unwrap(); + assert!( + unrelated.is_dir(), + "non-namespaced dir should be ignored by sweep" + ); +} diff --git a/rust/timscentroid/Cargo.toml b/rust/timscentroid/Cargo.toml index b361acee..3068e130 100644 --- a/rust/timscentroid/Cargo.toml +++ b/rust/timscentroid/Cargo.toml @@ -22,12 +22,13 @@ chrono = { version = "0.4", features = ["serde"] } # Storage abstraction (always available) object_store = "0.12" -tokio = { version = "1", features = ["rt", "rt-multi-thread"] } +tokio = { version = "1", features = ["rt", "rt-multi-thread", "io-util"] } futures = "0.3" bytes = "1" url = "2" once_cell = "1" async-trait = "0.1" +indicatif = { workspace = true } aws-config = { version = "1.0" , optional = true } aws-credential-types = { version = "1.0", optional = true } diff --git a/rust/timscentroid/src/lib.rs b/rust/timscentroid/src/lib.rs index 395e3fc2..96b686c2 100644 --- a/rust/timscentroid/src/lib.rs +++ b/rust/timscentroid/src/lib.rs @@ -6,8 +6,12 @@ pub mod lazy; pub mod rt_mapping; pub mod serialization; pub mod storage; +pub mod timings; pub mod utils; +#[doc(inline)] +pub use timings::TimedStep; + #[doc(inline)] pub use geometry::QuadrupoleIsolationScheme; diff --git a/rust/timscentroid/src/serialization/mod.rs b/rust/timscentroid/src/serialization/mod.rs index f8adec49..86bb5336 100644 --- a/rust/timscentroid/src/serialization/mod.rs +++ b/rust/timscentroid/src/serialization/mod.rs @@ -100,6 +100,10 @@ pub enum SerializationError { expected: &'static str, found: String, }, + PrefixCapExceeded { + prefix: String, + cap: usize, + }, } impl fmt::Display for SerializationError { @@ -136,6 +140,13 @@ impl fmt::Display for SerializationError { expected, found ) } + Self::PrefixCapExceeded { prefix, cap } => { + write!( + f, + "prefix yields more than {} entries; refusing to list (prefix={})", + cap, prefix + ) + } } } } @@ -518,16 +529,11 @@ impl IndexedTimstofPeaks { storage: StorageProvider, meta: TimscentroidMetadata, ) -> Result { - // Choose loading strategy based on storage type: - // - Local: CPU bottleneck → parallel loading benefits from multi-core - // - Cloud: Network I/O bottleneck → serial loading avoids connection overhead - let use_parallel = storage.is_local(); - - if use_parallel { - Self::load_from_storage_parallel(storage, meta) - } else { - Self::load_from_storage_serial(storage, meta) - } + // Always parallel — measured 144s serial vs Result { - // Load MS1 first (normalize path for cross-platform compatibility) - let ms1_path = normalize_storage_path(&meta.ms1_peaks.relative_path); - let ms1_cols = storage.read_parquet_peaks(&ms1_path)?; - let (ms1_peaks, _stats) = IndexedPeakGroup::new_from_soa( - ms1_cols, - meta.ms1_peaks.cycle_to_rt_ms.clone(), - meta.ms1_peaks.bucket_size, - ); - - // Load MS2 groups sequentially - let ms2_window_groups: Result, SerializationError> = meta - .ms2_window_groups - .iter() - .map(|group_meta| { - // Normalize path separators for cross-platform compatibility - let path = normalize_storage_path(&group_meta.group_info.relative_path); - let cols = storage.read_parquet_peaks(&path)?; - - let (group, _stats) = IndexedPeakGroup::new_from_soa( - cols, - group_meta.group_info.cycle_to_rt_ms.clone(), - group_meta.group_info.bucket_size, - ); - Ok((group_meta.quadrupole_isolation.clone(), group)) - }) - .collect(); - - Ok(Self { - ms1_peaks, - ms2_window_groups: ms2_window_groups?, - }) - } } // Helper function to write parquet to bytes (used by cloud storage). diff --git a/rust/timscentroid/src/storage.rs b/rust/timscentroid/src/storage.rs index 01aab38c..369ba2c7 100644 --- a/rust/timscentroid/src/storage.rs +++ b/rust/timscentroid/src/storage.rs @@ -169,6 +169,157 @@ impl StorageProvider { }) } + /// Non-creating constructor for read-only callers. Does NOT `create_dir_all` + /// on a local path — use `new` when you want writes (which may implicitly + /// create the parent directory). + pub fn open(location: StorageLocation) -> Result { + let (store, is_local, prefix): (Arc, bool, String) = match location { + StorageLocation::Local(path) => ( + Arc::new(LocalFileSystem::new_with_prefix(path)?), + true, + String::new(), + ), + StorageLocation::Url(url) => { + let prefix = url.path().trim_start_matches('/').to_string(); + (block_on_or_in_place(parse_url(&url))?, false, prefix) + } + }; + Ok(Self { + store, + is_local, + prefix, + metrics: None, + }) + } + + /// HEAD an object. Returns its metadata. + pub fn head(&self, key: &str) -> Result { + let full = self.qualified_path(key); + let store = self.store.clone(); + block_on_or_in_place( + async move { store.head(&full).await.map_err(SerializationError::from) }, + ) + } + + /// Check whether an object exists. Returns `Ok(false)` for NotFound, + /// `Err(..)` for other transport errors. + pub fn exists(&self, key: &str) -> Result { + match self.head(key) { + Ok(_) => Ok(true), + Err(SerializationError::ObjectStore(object_store::Error::NotFound { .. })) => Ok(false), + Err(e) => Err(e), + } + } + + /// Fetch the whole object into memory. + pub fn get_bytes(&self, key: &str) -> Result { + let full = self.qualified_path(key); + let store = self.store.clone(); + block_on_or_in_place(async move { + let got = store.get(&full).await.map_err(SerializationError::from)?; + let bytes = got.bytes().await.map_err(SerializationError::from)?; + Ok(bytes) + }) + } + + /// Fetch a specific byte range. Errors on short read — S3 returns 416 on + /// out-of-bounds, but `LocalFileSystem` silently truncates to EOF, so we + /// post-check the returned length against the requested length. No + /// pre-HEAD; the tar walker issues many small range GETs and doubling + /// the request count is expensive. + pub fn range_get( + &self, + key: &str, + range: std::ops::Range, + ) -> Result { + use object_store::GetRange; + let full = self.qualified_path(key); + let store = self.store.clone(); + let expected = range.end.saturating_sub(range.start); + block_on_or_in_place(async move { + let opts = object_store::GetOptions { + range: Some(GetRange::Bounded(range)), + ..Default::default() + }; + let got = store + .get_opts(&full, opts) + .await + .map_err(SerializationError::from)?; + let bytes = got.bytes().await.map_err(SerializationError::from)?; + if (bytes.len() as u64) < expected { + return Err(SerializationError::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!( + "range_get short read: expected {} bytes, got {}", + expected, + bytes.len() + ), + ))); + } + Ok(bytes) + }) + } + + /// Stream a specific byte range directly to a local file, ticking the + /// progress bar per chunk. One HTTP GET with a Range header; the + /// response body streams to disk without buffering the whole range in + /// memory. Use this for large tar payloads; prefer `get_to_file` when + /// you want the whole object. + pub fn range_get_to_file( + &self, + key: &str, + range: std::ops::Range, + dst: &Path, + bar: &indicatif::ProgressBar, + ) -> Result<(), SerializationError> { + use futures::StreamExt; + use object_store::GetRange; + let full = self.qualified_path(key); + let store = self.store.clone(); + let dst = dst.to_path_buf(); + let expected = range.end.saturating_sub(range.start); + block_on_or_in_place(async move { + if let Some(parent) = dst.parent() { + std::fs::create_dir_all(parent)?; + } + let mut file = std::fs::File::create(&dst)?; + let opts = object_store::GetOptions { + range: Some(GetRange::Bounded(range)), + ..Default::default() + }; + let got = store + .get_opts(&full, opts) + .await + .map_err(SerializationError::from)?; + let mut stream = got.into_stream(); + let mut written: u64 = 0; + while let Some(chunk) = stream.next().await { + let chunk = chunk.map_err(SerializationError::from)?; + use std::io::Write; + file.write_all(&chunk)?; + bar.inc(chunk.len() as u64); + written += chunk.len() as u64; + } + if written < expected { + return Err(SerializationError::Io(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("range_get_to_file short read: expected {expected}, got {written}"), + ))); + } + Ok(()) + }) + } + + /// Build a fully-qualified `ObjectPath` from a caller-provided key, applying + /// the provider's URL-path prefix when present. + fn qualified_path(&self, key: &str) -> ObjectPath { + if self.prefix.is_empty() { + ObjectPath::from(key) + } else { + ObjectPath::from(format!("{}/{}", self.prefix.trim_end_matches('/'), key)) + } + } + /// Enable instrumentation for this storage provider /// /// This wraps the underlying ObjectStore with an InstrumentedStore that tracks: @@ -291,10 +442,14 @@ impl StorageProvider { std::io::ErrorKind::Other }; - return Err(SerializationError::Io(std::io::Error::new( - error_kind, - format!("Failed to read object at path {}: {}", full_path, e), - ))); + let msg = format!("Failed to read object at path {}: {}", full_path, e); + return Err(SerializationError::Io( + if error_kind == std::io::ErrorKind::Other { + std::io::Error::other(msg) + } else { + std::io::Error::new(error_kind, msg) + }, + )); } }; let bytes = result.bytes().await?; @@ -321,12 +476,48 @@ impl StorageProvider { /// Write bytes to a file pub fn write_bytes(&self, path: &str, data: Vec) -> Result<(), SerializationError> { - let full_path = self.build_path(path); - let object_path = ObjectPath::from(full_path.as_str()); - block_on_or_in_place(async { - self.store - .put(&object_path, Bytes::from(data).into()) - .await?; + use object_store::buffered::BufWriter; + use tokio::io::AsyncWriteExt; + let full = self.qualified_path(path); + let store = self.store.clone(); + block_on_or_in_place(async move { + let mut w = BufWriter::new(store, full); + w.write_all(&data).await.map_err(|e| { + SerializationError::Io(std::io::Error::other(format!("buffered write failed: {e}"))) + })?; + w.shutdown().await.map_err(|e| { + SerializationError::Io(std::io::Error::other(format!( + "buffered write shutdown failed: {e}" + ))) + })?; + Ok(()) + }) + } + + /// Stream an object into a local file, ticking the progress bar per chunk. + pub fn get_to_file( + &self, + key: &str, + dst: &Path, + bar: &indicatif::ProgressBar, + ) -> Result<(), SerializationError> { + use futures::StreamExt; + let full = self.qualified_path(key); + let store = self.store.clone(); + let dst = dst.to_path_buf(); + block_on_or_in_place(async move { + if let Some(parent) = dst.parent() { + std::fs::create_dir_all(parent)?; + } + let mut file = std::fs::File::create(&dst)?; + let got = store.get(&full).await.map_err(SerializationError::from)?; + let mut stream = got.into_stream(); + while let Some(chunk) = stream.next().await { + let chunk = chunk.map_err(SerializationError::from)?; + use std::io::Write; + file.write_all(&chunk)?; + bar.inc(chunk.len() as u64); + } Ok(()) }) } @@ -343,6 +534,36 @@ impl StorageProvider { Ok(()) } + /// List up to `cap` entries under `prefix`. Errors with + /// `SerializationError::PrefixCapExceeded` as soon as the listing yields + /// more than `cap` items. This is a fail-fast mechanism to prevent callers + /// from accidentally listing a whole bucket. + pub fn list_capped( + &self, + prefix: &str, + cap: usize, + ) -> Result, SerializationError> { + use futures::StreamExt; + let full = self.qualified_path(prefix); + let store = self.store.clone(); + let prefix_display = full.to_string(); + block_on_or_in_place(async move { + let mut stream = store.list(Some(&full)); + let mut out = Vec::with_capacity(cap); + while let Some(item) = stream.next().await { + let meta = item.map_err(SerializationError::from)?; + out.push(meta); + if out.len() > cap { + return Err(SerializationError::PrefixCapExceeded { + prefix: prefix_display.clone(), + cap, + }); + } + } + Ok(out) + }) + } + /// Read indexed peaks from a parquet file into SoA columns. /// /// Uses `ParquetObjectReader` with async streaming for both local and cloud storage. @@ -416,9 +637,7 @@ async fn parse_url(url: &url::Url) -> Result, Serialization let credentials = credentials_provider .provide_credentials() .await - .map_err(|e| { - SerializationError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)) - })?; + .map_err(|e| SerializationError::Io(std::io::Error::other(e)))?; // 3. Initialize the builder using the resolved credentials let mut builder = AmazonS3Builder::new() @@ -499,3 +718,118 @@ async fn parse_url(url: &url::Url) -> Result, Serialization } } } + +#[cfg(test)] +mod s3_layer0_tests { + use super::*; + use tempfile::TempDir; + + fn local_provider() -> (TempDir, StorageProvider) { + let dir = TempDir::new().unwrap(); + let loc = StorageLocation::from_path(dir.path()); + let p = StorageProvider::new(loc).unwrap(); + (dir, p) + } + + #[test] + fn head_reports_size_for_existing_file() { + let (_dir, p) = local_provider(); + p.write_bytes("a.bin", vec![1u8; 17]).unwrap(); + let meta = p.head("a.bin").unwrap(); + assert_eq!(meta.size, 17u64); + } + + #[test] + fn head_errors_for_missing_file() { + let (_dir, p) = local_provider(); + assert!(p.head("missing.bin").is_err()); + } + + #[test] + fn exists_returns_true_for_existing_file() { + let (_dir, p) = local_provider(); + p.write_bytes("a.bin", vec![1u8; 3]).unwrap(); + assert!(p.exists("a.bin").unwrap()); + } + + #[test] + fn exists_returns_false_for_missing_file() { + let (_dir, p) = local_provider(); + assert!(!p.exists("missing.bin").unwrap()); + } + + #[test] + fn get_bytes_roundtrips() { + let (_dir, p) = local_provider(); + let payload = b"hello world".to_vec(); + p.write_bytes("a.bin", payload.clone()).unwrap(); + let out = p.get_bytes("a.bin").unwrap(); + assert_eq!(out.as_ref(), payload.as_slice()); + } + + #[test] + fn range_get_returns_exact_slice() { + let (_dir, p) = local_provider(); + let payload: Vec = (0..100u8).collect(); + p.write_bytes("a.bin", payload.clone()).unwrap(); + let slice = p.range_get("a.bin", 10..50).unwrap(); + assert_eq!(slice.as_ref(), &payload[10..50]); + } + + #[test] + fn range_get_errors_on_out_of_bounds() { + let (_dir, p) = local_provider(); + p.write_bytes("a.bin", vec![0u8; 10]).unwrap(); + assert!(p.range_get("a.bin", 0..100).is_err()); + } + + #[test] + fn list_capped_returns_all_below_cap() { + let (_dir, p) = local_provider(); + p.write_bytes("sample.d/analysis.tdf", vec![0u8; 3]) + .unwrap(); + p.write_bytes("sample.d/analysis.tdf_bin", vec![0u8; 3]) + .unwrap(); + let list = p.list_capped("sample.d", 10).unwrap(); + assert_eq!(list.len(), 2); + let names: Vec = list.iter().map(|m| m.location.to_string()).collect(); + assert!(names.iter().any(|n| n.ends_with("/analysis.tdf"))); + assert!(names.iter().any(|n| n.ends_with("/analysis.tdf_bin"))); + } + + #[test] + fn list_capped_errors_on_cap_exceeded() { + let (_dir, p) = local_provider(); + for i in 0..5 { + p.write_bytes(&format!("many/f{i}.bin"), vec![0u8; 1]) + .unwrap(); + } + let err = p.list_capped("many", 3).unwrap_err(); + assert!(matches!( + err, + SerializationError::PrefixCapExceeded { cap: 3, .. } + )); + } + + #[test] + fn get_to_file_copies_object_to_local_path() { + let (_dir, p) = local_provider(); + p.write_bytes("a.bin", vec![7u8; 100]).unwrap(); + let out_dir = TempDir::new().unwrap(); + let dst = out_dir.path().join("a_copy.bin"); + let bar = indicatif::ProgressBar::hidden(); + p.get_to_file("a.bin", &dst, &bar).unwrap(); + let got = std::fs::read(&dst).unwrap(); + assert_eq!(got, vec![7u8; 100]); + } + + #[test] + fn write_bytes_uses_multipart_for_large_buffers() { + // Local filesystem doesn't care; exercise the path. + let (_dir, p) = local_provider(); + let big = vec![9u8; 16 * 1024 * 1024]; + p.write_bytes("big.bin", big.clone()).unwrap(); + let got = p.get_bytes("big.bin").unwrap(); + assert_eq!(got.as_ref(), big.as_slice()); + } +} diff --git a/rust/timscentroid/src/timings.rs b/rust/timscentroid/src/timings.rs new file mode 100644 index 00000000..7fa97c1d --- /dev/null +++ b/rust/timscentroid/src/timings.rs @@ -0,0 +1,82 @@ +//! Progressive CLI timing output. +//! +//! [`TimedStep`] prints a dot-padded label immediately, opens a tracing span, +//! then appends elapsed time when the work finishes. + +use std::fmt; +use std::time::{ + Duration, + Instant, +}; + +/// A timed step that prints a dot-padded label immediately, opens a tracing +/// span, and appends elapsed time on finish. +/// +/// ```ignore +/// let step = TimedStep::begin("Loading speclib"); +/// let speclib = load_speclib()?; +/// let elapsed = step.finish_with(format_args!("{} entries", speclib.len())); +/// // terminal: "Loading speclib .......... 834.567ms (225178 entries)" +/// ``` +pub struct TimedStep { + start: Instant, + stderr: bool, + _span: tracing::span::EnteredSpan, +} + +/// Column width for dot-padded labels on stdout. +const LABEL_WIDTH: usize = 26; + +impl TimedStep { + /// Dot-pad `label` to stdout, open a tracing span, flush, start clock. + pub fn begin(label: impl fmt::Display) -> Self { + let label = label.to_string(); + let span = tracing::info_span!("step", label = label.as_str()); + let dots = LABEL_WIDTH.saturating_sub(label.len() + 1); + if dots > 0 { + print!("{label} {:. Self { + let label = label.to_string(); + let span = tracing::info_span!("step", label = label.as_str()); + eprint!("{label}"); + Self { + start: Instant::now(), + stderr: true, + _span: span.entered(), + } + } + + /// Print ` {elapsed:?}\n`, return Duration. + pub fn finish(self) -> Duration { + let d = self.start.elapsed(); + self.emit(format_args!(" {:?}", d)); + d + } + + /// Print ` {elapsed:?} ({detail})\n`, return Duration. + pub fn finish_with(self, detail: impl fmt::Display) -> Duration { + let d = self.start.elapsed(); + self.emit(format_args!(" {:?} ({})", d, detail)); + d + } + + fn emit(&self, msg: fmt::Arguments<'_>) { + if self.stderr { + eprintln!("{msg}"); + } else { + println!("{msg}"); + } + } +} diff --git a/rust/timsquery/Cargo.toml b/rust/timsquery/Cargo.toml index 4d3c722f..747b293a 100644 --- a/rust/timsquery/Cargo.toml +++ b/rust/timsquery/Cargo.toml @@ -5,15 +5,12 @@ edition.workspace = true license.workspace = true [dependencies] -bincode = "1.3" -zstd = "0.13" - - # Workspace member deps array2d = { path = "../array2d" } timscentroid = { path = "../timscentroid" } micromzpaf = { path = "../micromzpaf" } +tims_stage = { path = "../tims_stage" } # Workspace-inherited deps timsrust = { workspace = true } @@ -27,6 +24,10 @@ bon = { workspace = true } csv = { workspace = true } parquet = { workspace = true } arrow = { workspace = true } +thiserror = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } [lib] diff --git a/rust/timsquery/src/errors.rs b/rust/timsquery/src/errors.rs index 2ae17a78..7327bc53 100644 --- a/rust/timsquery/src/errors.rs +++ b/rust/timsquery/src/errors.rs @@ -46,6 +46,9 @@ impl From for DataReadingError { pub enum UnsupportedDataError { NoMS2DataError, CloudRawDataNotSupported { url: String, suggestion: String }, + LazyMaterializationUnsupported, + CacheDisabled, + InvalidCacheUrl { url: String }, } impl Display for UnsupportedDataError { @@ -59,6 +62,19 @@ impl Display for UnsupportedDataError { url, suggestion ) } + Self::LazyMaterializationUnsupported => write!( + f, + "Lazy -> eager materialization not yet implemented; \ + load the index eagerly from source instead" + ), + Self::CacheDisabled => write!( + f, + "Cache is disabled; cannot convert eager index to lazy \ + without a cache location" + ), + Self::InvalidCacheUrl { url } => { + write!(f, "Cache URL is not a valid StorageLocation: {}", url) + } } } } diff --git a/rust/timsquery/src/lib.rs b/rust/timsquery/src/lib.rs index e8cfff37..c317c5cb 100644 --- a/rust/timsquery/src/lib.rs +++ b/rust/timsquery/src/lib.rs @@ -61,3 +61,8 @@ pub mod ion { } pub use ion::IonAnnot; + +pub use crate::serde::index_serde::{ + LoadIndexError, + load_index, +}; diff --git a/rust/timsquery/src/serde/index_serde.rs b/rust/timsquery/src/serde/index_serde.rs index afc7ef24..57d2b79a 100644 --- a/rust/timsquery/src/serde/index_serde.rs +++ b/rust/timsquery/src/serde/index_serde.rs @@ -158,7 +158,7 @@ impl IndexedPeaksHandle { // TODO: Implement lazy -> eager materialization // The lazy type needs to expose its storage location so we can reload as eager Err(crate::errors::DataReadingError::UnsupportedDataError( - crate::errors::UnsupportedDataError::NoMS2DataError, + crate::errors::UnsupportedDataError::LazyMaterializationUnsupported, )) } } @@ -194,15 +194,22 @@ impl IndexedPeaksHandle { Self::Eager(eager) => { if matches!(cache_location, CacheLocation::Disabled) { return Err(crate::errors::DataReadingError::UnsupportedDataError( - crate::errors::UnsupportedDataError::NoMS2DataError, + crate::errors::UnsupportedDataError::CacheDisabled, )); } + let cache_url_repr = match &cache_location { + CacheLocation::Url(u) => u.clone(), + CacheLocation::Local(p) => p.display().to_string(), + CacheLocation::Auto | CacheLocation::Disabled => String::new(), + }; let storage_location = match cache_location.to_storage_location() { Some(Ok(loc)) => loc, Some(Err(_)) | None => { return Err(crate::errors::DataReadingError::UnsupportedDataError( - crate::errors::UnsupportedDataError::NoMS2DataError, + crate::errors::UnsupportedDataError::InvalidCacheUrl { + url: cache_url_repr, + }, )); } }; @@ -824,3 +831,96 @@ pub fn load_index_auto( } } } + +// --------------------------------------------------------------------------- +// Layer-3 orchestrator: `load_index` +// --------------------------------------------------------------------------- + +use tims_stage::{ + Resolved, + StageError, + StagingBackend, + canonical_uri, + resolve, + sidecar_of, +}; +use timscentroid::serialization::SerializationError; + +/// Load an eagerly-materialized index from a URI. +/// +/// - `uri` — local path or `s3://` URL. May point at a `.idx` directory, a +/// `.d` directory, a `.tar`, or an S3 `.d` prefix. +/// - `backend` — the staging backend to use when the URI resolves to +/// `Stageable`. Typically `&tims_stage::PerRunTempdir::new(cfg)?`. +/// - `save_sidecar` — when true, and we built an index from a `.d`/tar/prefix +/// input, write a `.idx` sidecar to `sidecar_of(uri)` so the next run +/// short-circuits on the sidecar fast-path. +/// - `centroid_cfg` — centroiding configuration threaded through +/// `from_timstof_file` on the build paths. +pub fn load_index( + uri: &str, + backend: &dyn StagingBackend, + save_sidecar: bool, + centroid_cfg: CentroidingConfig, +) -> Result { + let canon = canonical_uri(uri); + match resolve(&canon)? { + Resolved::LocalIdx { loc } | Resolved::RemoteIdx { loc } => { + IndexedTimstofPeaks::load_from_storage(loc).map_err(LoadIndexError::Load) + } + Resolved::LocalDotD { path } => { + let idx = build_index(&path, centroid_cfg)?; + if save_sidecar { + write_sidecar(&canon, &idx)?; + } + Ok(idx) + } + Resolved::Stageable { spec } => { + let staged = backend.stage(&spec).map_err(LoadIndexError::Stage)?; + let idx = build_index(staged.as_ref(), centroid_cfg)?; + if save_sidecar { + write_sidecar(&canon, &idx)?; + } + Ok(idx) + // staged drops here (RAII) — tempdir removed. + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum LoadIndexError { + #[error(transparent)] + Stage(#[from] StageError), + #[error("index build from local .d failed: {0}")] + Build(String), + #[error("index load from storage failed: {0:?}")] + Load(SerializationError), + #[error("sidecar save failed: {0:?}")] + SidecarWrite(SerializationError), +} + +fn build_index( + dotd: &Path, + centroid_cfg: CentroidingConfig, +) -> Result { + let path_str = dotd + .to_str() + .ok_or_else(|| LoadIndexError::Build(format!("path is not valid UTF-8: {:?}", dotd)))?; + let tt = TimsTofPath::new(path_str) + .map_err(|e| LoadIndexError::Build(format!("failed to open {:?}: {:?}", dotd, e)))?; + let (idx, _stats) = IndexedTimstofPeaks::from_timstof_file(&tt, centroid_cfg); + Ok(idx) +} + +fn write_sidecar(orig_uri: &str, idx: &IndexedTimstofPeaks) -> Result<(), LoadIndexError> { + let sidecar_uri = sidecar_of(orig_uri); + let loc = if tims_stage::is_remote_uri(&sidecar_uri) { + StorageLocation::from_url(&sidecar_uri).map_err(|e| { + LoadIndexError::Stage(StageError::InvalidUri(format!("{sidecar_uri}: {e}"))) + })? + } else { + StorageLocation::from_path(&sidecar_uri) + }; + idx.save_to_storage(loc, SerializationConfig::default()) + .map_err(LoadIndexError::SidecarWrite) +} diff --git a/rust/timsquery/tests/load_index.rs b/rust/timsquery/tests/load_index.rs new file mode 100644 index 00000000..a107fe74 --- /dev/null +++ b/rust/timsquery/tests/load_index.rs @@ -0,0 +1,55 @@ +use std::path::PathBuf; +use tempfile::TempDir; +use tims_stage::{ + PerRunTempdir, + StagingConfig, +}; +use timscentroid::CentroidingConfig; +use timsquery::load_index; + +fn fixture_dotd() -> PathBuf { + // Adjust this path to a real fixture if available. The tests are + // #[ignore]'d so CI does not require the fixture. + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_data/sample.d") +} + +#[test] +#[ignore = "requires local .d fixture"] +fn load_index_from_local_dotd() { + let backend = PerRunTempdir::new(StagingConfig::default()).unwrap(); + let uri = fixture_dotd().to_string_lossy().to_string(); + let _idx = load_index(&uri, &backend, false, CentroidingConfig::default()) + .expect("load_index should succeed"); +} + +#[test] +#[ignore = "requires local .d fixture"] +fn sidecar_roundtrip() { + let backend = PerRunTempdir::new(StagingConfig::default()).unwrap(); + let dotd = fixture_dotd(); + let work = TempDir::new().unwrap(); + let dotd_copy = work.path().join("sample.d"); + copy_dir_recursive(&dotd, &dotd_copy).unwrap(); + let uri = dotd_copy.to_string_lossy().to_string(); + let _idx1 = load_index(&uri, &backend, true, CentroidingConfig::default()).unwrap(); + assert!( + work.path().join("sample.d.idx/metadata.json").is_file(), + "sidecar .idx should contain metadata.json after build" + ); + let _idx2 = load_index(&uri, &backend, false, CentroidingConfig::default()).unwrap(); +} + +fn copy_dir_recursive(src: &std::path::Path, dst: &std::path::Path) -> std::io::Result<()> { + std::fs::create_dir_all(dst)?; + for entry in std::fs::read_dir(src)? { + let entry = entry?; + let ft = entry.file_type()?; + let dst_child = dst.join(entry.file_name()); + if ft.is_dir() { + copy_dir_recursive(&entry.path(), &dst_child)?; + } else { + std::fs::copy(entry.path(), &dst_child)?; + } + } + Ok(()) +} diff --git a/rust/timsseek/Cargo.toml b/rust/timsseek/Cargo.toml index 11d78df8..6a1221ae 100644 --- a/rust/timsseek/Cargo.toml +++ b/rust/timsseek/Cargo.toml @@ -30,6 +30,10 @@ tracing = { workspace = true } rayon = { workspace = true, optional = true } parquet = { workspace = true } arrow = { workspace = true } +smallvec = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } [features] default = ["rayon"] diff --git a/rust/timsseek/DATA_FLOW.md b/rust/timsseek/DATA_FLOW.md index 3aeaf089..79f8d3ee 100644 --- a/rust/timsseek/DATA_FLOW.md +++ b/rust/timsseek/DATA_FLOW.md @@ -302,7 +302,11 @@ The `as_feature()` method produces the following features for rescoring: ``` QueryItemToScore -├── digest: DigestSlice sequence + modifications +├── digest: Peptide raw sequence + optional parsed residues/mods +│ ├── raw: Arc on-disk modified-sequence string +│ ├── parsed: Option None iff speclib gate=off (all-or-none) +│ ├── decoy: DecoyMarking Target / ReversedDecoy +│ └── decoy_group: u32 ├── query: TimsElutionGroup precursor/fragment m/z, RT, mobility └── expected_intensity: ExpectedIntensities diff --git a/rust/timsseek/src/data_sources/speclib.rs b/rust/timsseek/src/data_sources/speclib.rs index b3638348..c6ddf1aa 100644 --- a/rust/timsseek/src/data_sources/speclib.rs +++ b/rust/timsseek/src/data_sources/speclib.rs @@ -1,8 +1,12 @@ use crate::errors::LibraryReadingError; use crate::fragment_mass::elution_group_converter::isotope_dist_from_seq; -use crate::models::{ - DecoyMarking, - DigestSlice, +use crate::models::DecoyMarking; +use crate::models::sequence::{ + Peptide, + SeqFormat, + SpeclibMeta, + normalize_to_proforma, + parse_sequence, }; use crate::{ ExpectedIntensities, @@ -116,28 +120,24 @@ impl PrecursorEntry { } } -impl From for DigestSlice { - fn from(x: PrecursorEntry) -> Self { - let decoy = if x.decoy { - DecoyMarking::ReversedDecoy - } else { - DecoyMarking::Target - }; - let seq: Arc = x.sequence.clone().into(); - if seq.as_ref().len() >= u16::MAX as usize { - panic!("Sequence too long (gt: {}): {}", u16::MAX, seq); - } - let range = 0u16..seq.as_ref().len() as u16; - DigestSlice::new(seq, range, decoy, x.decoy_group) - } -} - impl TryFrom for QueryItemToScore { type Error = LibraryReadingError; fn try_from(x: SerSpeclibElement) -> Result { let charge = x.precursor.charge; - let precursor = x.precursor.into(); + let raw: Arc = x.precursor.sequence.clone().into(); + let normalized = normalize_to_proforma(&raw); + let parsed = parse_sequence(&normalized); + let precursor = Peptide { + raw, + parsed, + decoy: if x.precursor.decoy { + DecoyMarking::ReversedDecoy + } else { + DecoyMarking::Target + }, + decoy_group: x.precursor.decoy_group, + }; let ref_eg = x.elution_group; let expected_intensity = ExpectedIntensities::try_from_pairs( ref_eg @@ -222,11 +222,25 @@ fn convert_diann_to_query_item( extra: DiannPrecursorExtras, decoy_group_id: u32, ) -> Result { - let digest = DigestSlice::from_string( - extra.stripped_peptide.clone(), - extra.is_decoy, - decoy_group_id, - ); + // Prefer modified form for raw; fall back to stripped if empty. + let raw_src = if extra.modified_peptide.is_empty() { + extra.stripped_peptide.clone() + } else { + extra.modified_peptide.clone() + }; + let raw: Arc = raw_src.into(); + let normalized = normalize_to_proforma(&raw); + let parsed = parse_sequence(&normalized); + let digest = Peptide { + raw, + parsed, + decoy: if extra.is_decoy { + DecoyMarking::ReversedDecoy + } else { + DecoyMarking::Target + }, + decoy_group: decoy_group_id, + }; let (rebuilt_eg, precursor_pairs) = match isotope_dist_from_seq(&extra.stripped_peptide) { Ok(isotope_ratios) => { @@ -287,8 +301,12 @@ fn create_mass_shifted_decoy( decoy_group_id: u32, mass_shift_da: f64, ) -> Result { - let target_sequence: String = target.digest.clone().into(); - let decoy_digest = DigestSlice::from_string(target_sequence, true, decoy_group_id); + let decoy_digest = Peptide { + raw: target.digest.raw.clone(), + parsed: target.digest.parsed.clone(), + decoy: DecoyMarking::ReversedDecoy, + decoy_group: decoy_group_id, + }; let charge = target.query.precursor_charge(); let mz_shift = mass_shift_da / charge as f64; @@ -385,7 +403,10 @@ fn convert_elution_group_collection( }) .collect::, _>>()?; - Ok(Speclib { elems }) + Ok(Speclib { + elems, + meta: SpeclibMeta::default(), + }) } ElutionGroupCollection::MzpafLabels(egs, Some(FileReadingExtras::Skyline(extras))) => { if egs.len() != extras.len() { @@ -414,7 +435,10 @@ fn convert_elution_group_collection( }) .collect::, _>>()?; - Ok(Speclib { elems }) + Ok(Speclib { + elems, + meta: SpeclibMeta::default(), + }) } ElutionGroupCollection::MzpafLabels(_, None) => { Err(LibraryReadingError::UnsupportedFormat { @@ -435,6 +459,7 @@ fn convert_elution_group_collection( #[derive(Debug, Clone)] pub struct Speclib { pub elems: Vec, + pub meta: SpeclibMeta, } struct SpeclibIterator<'a> { @@ -647,6 +672,38 @@ impl Iterator for MessagePackReader { } } +/// Inspect every `Peptide.parsed`. If any is `None`, zero them all, flip +/// `parsable_sequences` off, log counts. Returns the finalized meta. +/// +/// MUST run AFTER all decoy generation (including `create_mass_shifted_decoy` +/// and DIA-NN target-decoy pairing). Otherwise decoys generated post-gate +/// inherit `parsed: Some(...)` from their target while the gate said off. +fn apply_parse_gate(items: &mut [QueryItemToScore], expected_format: SeqFormat) -> SpeclibMeta { + let total = items.len(); + let parsed = items.iter().filter(|q| q.digest.parsed.is_some()).count(); + let parsable = parsed == total && total > 0; + if !parsable { + tracing::warn!( + "speclib load: n_total={}, n_parsed={}, n_unparsed={}; sequence features disabled", + total, + parsed, + total.saturating_sub(parsed) + ); + for q in items.iter_mut() { + q.digest.parsed = None; + } + } else { + tracing::info!( + "speclib load: n_total={}, all parsed; sequence features enabled", + total + ); + } + SpeclibMeta { + parsable_sequences: parsable, + sequence_format: expected_format, + } +} + impl Speclib { pub fn iter<'a>(&'a self) -> impl Iterator + 'a { SpeclibIterator { @@ -690,7 +747,9 @@ impl Speclib { let speclib = convert_elution_group_collection(collection)?; // Apply decoy strategy to the loaded speclib - let speclib = Self::apply_decoy_strategy(speclib, decoy_strategy)?; + let mut speclib = Self::apply_decoy_strategy(speclib, decoy_strategy)?; + let meta = apply_parse_gate(&mut speclib.elems, SeqFormat::Modified); + speclib.meta = meta; speclib.log_entry_stats(); Ok(speclib) } @@ -723,6 +782,7 @@ impl Speclib { ); Ok(Speclib { elems: targets.into_iter().chain(decoys).collect(), + meta: SpeclibMeta::default(), }) } @@ -766,7 +826,10 @@ impl Speclib { all_entries.len() * 2 / 3 ); - Ok(Speclib { elems: all_entries }) + Ok(Speclib { + elems: all_entries, + meta: SpeclibMeta::default(), + }) } DecoyStrategy::IfMissing => { @@ -778,6 +841,7 @@ impl Speclib { ); Ok(Speclib { elems: targets.into_iter().chain(decoys).collect(), + meta: SpeclibMeta::default(), }) } else { tracing::warn!( @@ -815,7 +879,10 @@ impl Speclib { all_entries.len() * 2 / 3 ); - Ok(Speclib { elems: all_entries }) + Ok(Speclib { + elems: all_entries, + meta: SpeclibMeta::default(), + }) } } } @@ -837,10 +904,15 @@ impl Speclib { let elements: Result, _> = reader.collect(); let elems = elements?; - let speclib = Self { elems }; + let speclib = Self { + elems, + meta: SpeclibMeta::default(), + }; // Apply decoy strategy to the loaded speclib - let speclib = Self::apply_decoy_strategy(speclib, decoy_strategy)?; + let mut speclib = Self::apply_decoy_strategy(speclib, decoy_strategy)?; + let meta = apply_parse_gate(&mut speclib.elems, SeqFormat::Modified); + speclib.meta = meta; speclib.log_entry_stats(); Ok(speclib) } @@ -848,6 +920,7 @@ impl Speclib { pub fn sample() -> Self { Self { elems: vec![QueryItemToScore::sample()], + meta: SpeclibMeta::default(), } } @@ -981,7 +1054,10 @@ mod tests { .map(QueryItemToScore::try_from) .collect::>()?; - Ok(Self { elems: speclib }) + Ok(Self { + elems: speclib, + meta: SpeclibMeta::default(), + }) } } @@ -1099,6 +1175,25 @@ mod tests { ); } + #[test] + fn test_diann_tsv_parsable_gate() { + let test_file = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .join("timsquery") + .join("tests") + .join("diann_io_files") + .join("sample_lib.txt"); + + let speclib = Speclib::from_file(&test_file, crate::models::DecoyStrategy::default()) + .expect("Failed to load DIA-NN TSV library"); + + assert!( + speclib.meta.parsable_sequences, + "DIA-NN sample fixture should all parse with modified_peptide" + ); + } + #[test] fn test_load_skyline_csv_library() { let test_file = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -1493,4 +1588,83 @@ mod tests { let items: Vec<_> = reader.collect::, _>>().unwrap(); assert_eq!(items.len(), 2); } + + #[cfg(test)] + fn fixture_query_item( + raw_seq: &str, + parsed: Option, + ) -> QueryItemToScore { + let elem = SerSpeclibElement::new( + PrecursorEntry::new(raw_seq.to_string(), 2, false, 0), + ReferenceEG::new( + 0, + 500.0, + vec![0, 1, 2], + vec![300.0, 400.0], + vec![ + IonAnnot::try_from("y1").unwrap(), + IonAnnot::try_from("y2").unwrap(), + ], + vec![1.0, 0.5, 0.2], + vec![0.8, 0.3], + 0.75, + 120.0, + ), + ); + let mut q: QueryItemToScore = elem.try_into().expect("fixture convert"); + q.digest.parsed = parsed; + q + } + + #[test] + fn test_parse_gate_off_on_poisoned_row() { + use crate::models::sequence::{ + ParsedSequence, + SeqFormat, + }; + use smallvec::SmallVec; + + let mut items = vec![ + fixture_query_item( + "PEPTIDEK", + Some(ParsedSequence { + residues: SmallVec::new(), + mods: SmallVec::new(), + }), + ), + fixture_query_item("GARBAGE!!!", None), + ]; + let meta = apply_parse_gate(&mut items, SeqFormat::Modified); + assert!(!meta.parsable_sequences); + assert!(items.iter().all(|q| q.digest.parsed.is_none())); + } + + #[test] + fn test_parse_gate_on_when_all_parsed() { + use crate::models::sequence::{ + ParsedSequence, + SeqFormat, + }; + use smallvec::SmallVec; + + let mut items = vec![ + fixture_query_item( + "PEPTIDEK", + Some(ParsedSequence { + residues: SmallVec::new(), + mods: SmallVec::new(), + }), + ), + fixture_query_item( + "ABCDEK", + Some(ParsedSequence { + residues: SmallVec::new(), + mods: SmallVec::new(), + }), + ), + ]; + let meta = apply_parse_gate(&mut items, SeqFormat::Modified); + assert!(meta.parsable_sequences); + assert!(items.iter().all(|q| q.digest.parsed.is_some())); + } } diff --git a/rust/timsseek/src/digest/digestion.rs b/rust/timsseek/src/digest/digestion.rs index 1fa09592..7bf90d4d 100644 --- a/rust/timsseek/src/digest/digestion.rs +++ b/rust/timsseek/src/digest/digestion.rs @@ -1,6 +1,6 @@ use crate::models::{ DecoyMarking, - DigestSlice, + ProteinSlice, }; use regex::Regex; use std::ops::Range; @@ -82,7 +82,7 @@ impl DigestionParameters { sites } - pub fn digest(&self, sequence: Arc) -> Vec { + pub fn digest(&self, sequence: Arc) -> Vec { let sites = self.cleavage_sites(sequence.as_ref()); let num_sites = sites.len(); let mut out = Vec::new(); @@ -99,7 +99,7 @@ impl DigestionParameters { if span < self.min_length || span > self.max_length { return; } - out.push(DigestSlice::new( + out.push(ProteinSlice::new( sequence.clone(), start as u16..end as u16, DecoyMarking::Target, @@ -112,7 +112,7 @@ impl DigestionParameters { out } - pub fn digest_multiple(&self, sequences: &[Arc]) -> Vec { + pub fn digest_multiple(&self, sequences: &[Arc]) -> Vec { let mut out = Vec::new(); let mut last_id = 0; sequences.iter().for_each(|seq| { diff --git a/rust/timsseek/src/lib.rs b/rust/timsseek/src/lib.rs index 68fbd5b7..ffab4650 100644 --- a/rust/timsseek/src/lib.rs +++ b/rust/timsseek/src/lib.rs @@ -15,8 +15,8 @@ pub use micromzpaf; pub use data_sources::Speclib; pub use models::{ DecoyStrategy, - DigestSlice, ExpectedIntensities, + ProteinSlice, QueryItemToScore, }; pub use scoring::{ diff --git a/rust/timsseek/src/ml/cv.rs b/rust/timsseek/src/ml/cv.rs index 4ed034a7..4371d38a 100644 --- a/rust/timsseek/src/ml/cv.rs +++ b/rust/timsseek/src/ml/cv.rs @@ -7,6 +7,7 @@ pub use forust_ml::constraints::{ pub use forust_ml::errors::ForustError; pub use forust_ml::gradientbooster::{ GrowPolicy, + ImportanceMethod, MissingNodeTreatment, }; pub use forust_ml::metric::{ @@ -19,7 +20,11 @@ pub use forust_ml::{ GradientBooster, Matrix, }; -pub use std::collections::HashSet; +use serde::Serialize; +pub use std::collections::{ + HashMap, + HashSet, +}; pub struct GBMConfig { iterations: usize, @@ -354,6 +359,28 @@ impl PrecomputedFeatures { } } +/// Per-feature summary within a fold. `mean` is computed over finite +/// values only (NaN/+-Inf skipped). `nan_ratio` is the fraction of +/// non-finite values seen for this feature in the fold (0.0..=1.0). +#[derive(Debug, Serialize)] +pub struct FeatureStat { + pub name: &'static str, + pub mean: f32, + pub nan_ratio: f32, +} + +/// Per-fold feature statistics. `feature_stats` preserves `feature_names` +/// insertion order. `feature_importance` sorted by gain descending +/// (top features first) so the JSON reads top-down. +#[derive(Debug, Serialize)] +pub struct FoldStats { + pub fold: u8, + pub feature_stats: Vec, + pub feature_importance: Vec<(&'static str, f32)>, +} + +pub type RescoreFeatureStats = Vec; + pub struct CrossValidatedScorer { n_folds: u8, data: Vec, @@ -473,6 +500,93 @@ impl CrossValidatedScorer { self.data } + /// Read-only access to the scored items (e.g. to query feature names). + pub fn data(&self) -> &[T] { + &self.data + } + + /// Compute per-fold feature means + Forust Gain importance. + /// + /// `names` must be the feature-name list in the same order `as_feature` emits. + /// Folds with no classifier (shouldn't happen post-fit) produce empty maps. + pub fn feature_stats(&self, names: &[&'static str]) -> RescoreFeatureStats { + let mut out: RescoreFeatureStats = Vec::with_capacity(self.n_folds as usize); + for fold in 0..self.n_folds { + // --- Gain importance from the booster (sorted desc by gain) --- + let importance: Vec<(&'static str, f32)> = match self + .fold_classifiers + .get(fold as usize) + .and_then(|o| o.as_ref()) + { + Some(booster) => { + let raw_imp = + booster.calculate_feature_importance(ImportanceMethod::Gain, true); + debug_assert!( + raw_imp.keys().all(|&idx| idx < names.len()), + "forust importance index exceeds feature name list" + ); + let mut pairs: Vec<(&'static str, f32)> = raw_imp + .into_iter() + .filter_map(|(idx, v)| names.get(idx).map(|n| (*n, v))) + .collect(); + pairs + .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + pairs + } + None => Vec::new(), + }; + + // --- Per-feature finite-only means + NaN ratio --- + let mut sums: Vec = vec![0.0; names.len()]; + let mut finite_counts: Vec = vec![0; names.len()]; + let mut nan_counts: Vec = vec![0; names.len()]; + let mut n: usize = 0; + for (i, item) in self.data.iter().enumerate() { + if self.assigned_fold.get(i) == Some(&fold) { + let mut seen = 0usize; + for (j, v) in item.as_feature().into_iter().enumerate() { + seen += 1; + if j < sums.len() { + if v.is_finite() { + sums[j] += v; + finite_counts[j] += 1; + } else { + nan_counts[j] += 1; + } + } + } + debug_assert_eq!(seen, names.len(), "as_feature length drift for item {i}"); + n += 1; + } + } + let feature_stats: Vec = if n > 0 { + names + .iter() + .zip(sums.iter()) + .zip(finite_counts.iter()) + .zip(nan_counts.iter()) + .map(|(((name, s), fc), nc)| { + let mean = if *fc > 0 { *s / *fc as f64 } else { f64::NAN }; + FeatureStat { + name: *name, + mean: mean as f32, + nan_ratio: *nc as f32 / n as f32, + } + }) + .collect() + } else { + Vec::new() + }; + + out.push(FoldStats { + fold, + feature_stats, + feature_importance: importance, + }); + } + out + } + fn next_fold(&self, fold: u8) -> u8 { let mut maybe_next = fold + 1; if maybe_next >= self.n_folds { diff --git a/rust/timsseek/src/ml/mod.rs b/rust/timsseek/src/ml/mod.rs index 7afc7fcb..476c17db 100644 --- a/rust/timsseek/src/ml/mod.rs +++ b/rust/timsseek/src/ml/mod.rs @@ -1,5 +1,6 @@ pub mod cv; pub mod qvalues; +pub use cv::RescoreFeatureStats; pub use qvalues::rescore; #[derive(Clone, Copy, Debug, PartialEq, Eq)] diff --git a/rust/timsseek/src/ml/qvalues.rs b/rust/timsseek/src/ml/qvalues.rs index 7f311156..0a26c454 100644 --- a/rust/timsseek/src/ml/qvalues.rs +++ b/rust/timsseek/src/ml/qvalues.rs @@ -3,6 +3,7 @@ use super::cv::{ DataBuffer, FeatureLike, GBMConfig, + RescoreFeatureStats, }; use super::{ LabelledScore, @@ -90,7 +91,7 @@ pub fn report_qvalues_at_thresholds( /// deterministic; this seals the only remaining entropy source. const RESCORE_SHUFFLE_SEED: u64 = 42; -pub fn rescore(mut data: Vec) -> Vec { +pub fn rescore(mut data: Vec) -> (Vec, RescoreFeatureStats) { let config = GBMConfig::default(); // Canonicalize input order before the seeded shuffle. Upstream @@ -113,6 +114,13 @@ pub fn rescore(mut data: Vec) -> Vec { .fit(&mut DataBuffer::default(), &mut DataBuffer::default()) .unwrap(); + let names: Vec<&'static str> = scorer + .data() + .first() + .map(|c| c.feature_names()) + .unwrap_or_default(); + let stats = scorer.feature_stats(&names); + let mut scored = scorer.score(); // Sort by score descending #[cfg(feature = "rayon")] @@ -123,9 +131,10 @@ pub fn rescore(mut data: Vec) -> Vec { debug!("Best:\n{:#?}", scored.first()); debug!("Worst:\n{:#?}", scored.last()); - scored.into_iter().map(|c| c.into_final()).collect() + (scored.into_iter().map(|c| c.into_final()).collect(), stats) } +use crate::models::AA_COUNT_NAMES; use crate::scoring::results::{ CompetedCandidate, FinalResult, @@ -143,115 +152,193 @@ fn mean_abs_error(errs: &[f32]) -> f64 { // CompetedCandidate: FeatureLike + LabelledScore // --------------------------------------------------------------------------- -impl FeatureLike for CompetedCandidate { - fn as_feature(&self) -> impl IntoIterator + '_ { +impl CompetedCandidate { + /// Single source of truth for feature values and names. + /// + /// Base block: 88 dims always present. + /// Sequence block: 22 dims appended when `peptide.parsed.is_some()`. + /// The gate is speclib-wide: either all candidates in a run have Some, + /// or all have None — so vector length is stable within a single fit. + fn named_features(&self) -> Vec<(f64, &'static str)> { let s = &self.scoring; - - vec![ - (s.precursor_mz / 5.0).round(), - s.precursor_charge as f64, - s.precursor_mobility as f64, - s.calibrated_rt_seconds.round() as f64, - s.n_scored_fragments as f64, + let mut v: Vec<(f64, &'static str)> = vec![ + // Identity / precursor + ((s.precursor_mz / 5.0).round(), "precursor_mz_round5"), + (s.precursor_charge as f64, "precursor_charge"), + (s.precursor_mobility as f64, "precursor_mobility"), + ( + s.calibrated_rt_seconds.round() as f64, + "calibrated_rt_seconds_round", + ), + (s.n_scored_fragments as f64, "n_scored_fragments"), // Combined - s.main_score as f64, - (s.main_score / s.delta_next) as f64, - s.delta_next as f64, - s.delta_second_next as f64, - s.obs_rt_seconds as f64, - s.obs_mobility as f64, - (s.obs_rt_seconds - s.calibrated_rt_seconds) as f64, - s.calibrated_sq_delta_rt as f64, - s.delta_ms1_ms2_mobility as f64, - s.sq_delta_ms1_ms2_mobility as f64, - s.rising_cycles as f64, - s.falling_cycles as f64, + (s.main_score as f64, "main_score"), + ((s.main_score / s.delta_next) as f64, "main_over_delta_next"), + (s.delta_next as f64, "delta_next"), + (s.delta_second_next as f64, "delta_second_next"), + (s.obs_rt_seconds as f64, "obs_rt_seconds"), + (s.obs_mobility as f64, "obs_mobility"), + ( + (s.obs_rt_seconds - s.calibrated_rt_seconds) as f64, + "rt_err", + ), + (s.calibrated_sq_delta_rt as f64, "calibrated_sq_delta_rt"), + (s.delta_ms1_ms2_mobility as f64, "delta_ms1_ms2_mobility"), + ( + s.sq_delta_ms1_ms2_mobility as f64, + "sq_delta_ms1_ms2_mobility", + ), + (s.rising_cycles as f64, "rising_cycles"), + (s.falling_cycles as f64, "falling_cycles"), // MS2 - s.npeaks as f64, - s.apex_lazyscore as f64, - (s.ms2_summed_intensity as f64).ln_1p(), - s.ms2_lazyscore as f64, - s.ms2_isotope_lazyscore as f64, - s.ms2_isotope_lazyscore_ratio as f64, - s.lazyscore_z as f64, - s.lazyscore_vs_baseline as f64, + (s.npeaks as f64, "npeaks"), + (s.apex_lazyscore as f64, "apex_lazyscore"), + ( + (s.ms2_summed_intensity as f64).ln_1p(), + "ms2_summed_intensity_ln1p", + ), + (s.ms2_lazyscore as f64, "ms2_lazyscore"), + (s.ms2_isotope_lazyscore as f64, "ms2_isotope_lazyscore"), + ( + s.ms2_isotope_lazyscore_ratio as f64, + "ms2_isotope_lazyscore_ratio", + ), + (s.lazyscore_z as f64, "lazyscore_z"), + (s.lazyscore_vs_baseline as f64, "lazyscore_vs_baseline"), // Split product & apex features - (s.split_product_score as f64).ln_1p(), - (s.cosine_au as f64).ln_1p(), - (s.scribe_au as f64).ln_1p(), - s.cosine_cg as f64, - s.scribe_cg as f64, - s.cosine_weighted_coelution as f64, - s.cosine_gradient_consistency as f64, - s.scribe_weighted_coelution as f64, - s.scribe_gradient_consistency as f64, - s.peak_shape as f64, - s.ratio_cv as f64, - s.centered_apex as f64, - s.precursor_coelution as f64, - s.fragment_coverage as f64, - s.precursor_apex_match as f64, - s.xic_quality as f64, - s.fragment_apex_agreement as f64, - s.isotope_correlation as f64, - s.gaussian_correlation as f64, - s.per_frag_gaussian_corr as f64, - // MS2 per-ion errors - s.ms2_mz_errors[0] as f64, - s.ms2_mz_errors[1] as f64, - s.ms2_mz_errors[2] as f64, - s.ms2_mz_errors[3] as f64, - s.ms2_mz_errors[4] as f64, - s.ms2_mz_errors[5] as f64, - s.ms2_mz_errors[6] as f64, - s.ms2_mobility_errors[0] as f64, - s.ms2_mobility_errors[1] as f64, - s.ms2_mobility_errors[2] as f64, - s.ms2_mobility_errors[3] as f64, - s.ms2_mobility_errors[4] as f64, - s.ms2_mobility_errors[5] as f64, - s.ms2_mobility_errors[6] as f64, + ( + (s.split_product_score as f64).ln_1p(), + "split_product_score_ln1p", + ), + ((s.cosine_au as f64).ln_1p(), "cosine_au_ln1p"), + ((s.scribe_au as f64).ln_1p(), "scribe_au_ln1p"), + (s.cosine_cg as f64, "cosine_cg"), + (s.scribe_cg as f64, "scribe_cg"), + ( + s.cosine_weighted_coelution as f64, + "cosine_weighted_coelution", + ), + ( + s.cosine_gradient_consistency as f64, + "cosine_gradient_consistency", + ), + ( + s.scribe_weighted_coelution as f64, + "scribe_weighted_coelution", + ), + ( + s.scribe_gradient_consistency as f64, + "scribe_gradient_consistency", + ), + (s.peak_shape as f64, "peak_shape"), + (s.ratio_cv as f64, "ratio_cv"), + (s.centered_apex as f64, "centered_apex"), + (s.precursor_coelution as f64, "precursor_coelution"), + (s.fragment_coverage as f64, "fragment_coverage"), + (s.precursor_apex_match as f64, "precursor_apex_match"), + (s.xic_quality as f64, "xic_quality"), + (s.fragment_apex_agreement as f64, "fragment_apex_agreement"), + (s.isotope_correlation as f64, "isotope_correlation"), + (s.gaussian_correlation as f64, "gaussian_correlation"), + (s.per_frag_gaussian_corr as f64, "per_frag_gaussian_corr"), + // MS2 per-ion errors (7 mz + 7 mobility) + (s.ms2_mz_errors[0] as f64, "ms2_mz_err_0"), + (s.ms2_mz_errors[1] as f64, "ms2_mz_err_1"), + (s.ms2_mz_errors[2] as f64, "ms2_mz_err_2"), + (s.ms2_mz_errors[3] as f64, "ms2_mz_err_3"), + (s.ms2_mz_errors[4] as f64, "ms2_mz_err_4"), + (s.ms2_mz_errors[5] as f64, "ms2_mz_err_5"), + (s.ms2_mz_errors[6] as f64, "ms2_mz_err_6"), + (s.ms2_mobility_errors[0] as f64, "ms2_mob_err_0"), + (s.ms2_mobility_errors[1] as f64, "ms2_mob_err_1"), + (s.ms2_mobility_errors[2] as f64, "ms2_mob_err_2"), + (s.ms2_mobility_errors[3] as f64, "ms2_mob_err_3"), + (s.ms2_mobility_errors[4] as f64, "ms2_mob_err_4"), + (s.ms2_mobility_errors[5] as f64, "ms2_mob_err_5"), + (s.ms2_mobility_errors[6] as f64, "ms2_mob_err_6"), // MS1 - (s.ms1_summed_intensity as f64).ln_1p(), - // MS1 per-ion errors - s.ms1_mz_errors[0] as f64, - s.ms1_mz_errors[1] as f64, - s.ms1_mz_errors[2] as f64, - s.ms1_mobility_errors[0] as f64, - s.ms1_mobility_errors[1] as f64, - s.ms1_mobility_errors[2] as f64, + ( + (s.ms1_summed_intensity as f64).ln_1p(), + "ms1_summed_intensity_ln1p", + ), + // MS1 per-ion errors (3 mz + 3 mobility) + (s.ms1_mz_errors[0] as f64, "ms1_mz_err_0"), + (s.ms1_mz_errors[1] as f64, "ms1_mz_err_1"), + (s.ms1_mz_errors[2] as f64, "ms1_mz_err_2"), + (s.ms1_mobility_errors[0] as f64, "ms1_mob_err_0"), + (s.ms1_mobility_errors[1] as f64, "ms1_mob_err_1"), + (s.ms1_mobility_errors[2] as f64, "ms1_mob_err_2"), // Relative intensities - s.ms1_intensity_ratios[0] as f64, - s.ms1_intensity_ratios[1] as f64, - s.ms1_intensity_ratios[2] as f64, - s.ms2_intensity_ratios[0] as f64, - s.ms2_intensity_ratios[1] as f64, - s.ms2_intensity_ratios[2] as f64, - s.ms2_intensity_ratios[3] as f64, - s.ms2_intensity_ratios[4] as f64, - s.ms2_intensity_ratios[5] as f64, - s.ms2_intensity_ratios[6] as f64, - self.delta_group as f64, - self.delta_group_ratio as f64, - s.calibrated_rt_seconds as f64, - s.calibrated_sq_delta_rt as f64, + (s.ms1_intensity_ratios[0] as f64, "ms1_intensity_ratio_0"), + (s.ms1_intensity_ratios[1] as f64, "ms1_intensity_ratio_1"), + (s.ms1_intensity_ratios[2] as f64, "ms1_intensity_ratio_2"), + (s.ms2_intensity_ratios[0] as f64, "ms2_intensity_ratio_0"), + (s.ms2_intensity_ratios[1] as f64, "ms2_intensity_ratio_1"), + (s.ms2_intensity_ratios[2] as f64, "ms2_intensity_ratio_2"), + (s.ms2_intensity_ratios[3] as f64, "ms2_intensity_ratio_3"), + (s.ms2_intensity_ratios[4] as f64, "ms2_intensity_ratio_4"), + (s.ms2_intensity_ratios[5] as f64, "ms2_intensity_ratio_5"), + (s.ms2_intensity_ratios[6] as f64, "ms2_intensity_ratio_6"), + (self.delta_group as f64, "delta_group"), + (self.delta_group_ratio as f64, "delta_group_ratio"), + (s.calibrated_rt_seconds as f64, "calibrated_rt_seconds"), // Derived intensity features - { - let ratios = &s.ms2_intensity_ratios; - ratios - .iter() - .filter(|r| r.is_finite()) - .fold(f32::NEG_INFINITY, |a, &b| a.max(b)) as f64 - }, + ( + { + let ratios = &s.ms2_intensity_ratios; + ratios + .iter() + .filter(|r| r.is_finite()) + .fold(f32::NEG_INFINITY, |a, &b| a.max(b)) as f64 + }, + "ms2_intensity_ratios_max", + ), // Interaction features - (s.main_score * s.delta_next) as f64, - (s.split_product_score * s.fragment_coverage) as f64, + ( + (s.main_score * s.delta_next) as f64, + "main_times_delta_next", + ), + ( + (s.split_product_score * s.fragment_coverage) as f64, + "split_product_x_coverage", + ), // Summary error features - mean_abs_error(&s.ms2_mz_errors), - mean_abs_error(&s.ms2_mobility_errors), - mean_abs_error(&s.ms1_mz_errors), - mean_abs_error(&s.ms1_mobility_errors), - ] + (mean_abs_error(&s.ms2_mz_errors), "ms2_mz_mean_abs_error"), + ( + mean_abs_error(&s.ms2_mobility_errors), + "ms2_mob_mean_abs_error", + ), + (mean_abs_error(&s.ms1_mz_errors), "ms1_mz_mean_abs_error"), + ( + mean_abs_error(&s.ms1_mobility_errors), + "ms1_mob_mean_abs_error", + ), + ]; + + // Sequence-derived block. Gated — all-or-none per speclib load. + if let Some(counts) = s.peptide.aa_counts() { + let length = s.peptide.length().unwrap() as f64; + let n_mods = s.peptide.n_mods().unwrap() as f64; + v.push((length, "peptide_length")); + for (i, c) in counts.iter().enumerate() { + v.push((*c, AA_COUNT_NAMES[i])); + } + v.push((n_mods, "peptide_n_mods")); + } + v + } + + pub fn feature_names(&self) -> Vec<&'static str> { + self.named_features().into_iter().map(|(_, n)| n).collect() + } +} + +impl FeatureLike for CompetedCandidate { + fn as_feature(&self) -> impl IntoIterator + '_ { + self.named_features() + .into_iter() + .map(|(v, _)| v) + .collect::>() } fn get_y(&self) -> f64 { @@ -402,3 +489,172 @@ mod tests { } } } + +#[cfg(test)] +mod feature_tests { + use super::*; + use crate::models::DecoyMarking; + use crate::models::sequence::{ + AminoAcid, + ParsedSequence, + Peptide, + }; + use crate::scoring::results::{ + CompetedCandidate, + ScoringFields, + }; + use smallvec::smallvec; + use std::sync::Arc; + + fn base_scoring_fields(peptide: Peptide) -> ScoringFields { + ScoringFields { + peptide, + library_id: 0, + decoy_group_id: 0, + precursor_mz: 500.0, + precursor_charge: 2, + precursor_mobility: 0.9, + is_target: true, + library_rt: 60.0, + calibrated_rt_seconds: 3600.0, + obs_rt_seconds: 3601.0, + calibrated_sq_delta_rt: 1.0, + obs_mobility: 0.91, + delta_ms1_ms2_mobility: 0.01, + sq_delta_ms1_ms2_mobility: 0.0001, + main_score: 10.0, + delta_next: 2.0, + delta_second_next: 1.0, + apex_lazyscore: 5.0, + ms2_lazyscore: 4.0, + ms2_isotope_lazyscore: 3.0, + ms2_isotope_lazyscore_ratio: 0.5, + lazyscore_z: 2.0, + lazyscore_vs_baseline: 1.5, + split_product_score: 0.8, + cosine_au: 0.7, + scribe_au: 0.6, + cosine_cg: 0.5, + scribe_cg: 0.4, + cosine_weighted_coelution: 0.9, + cosine_gradient_consistency: 0.85, + scribe_weighted_coelution: 0.88, + scribe_gradient_consistency: 0.82, + peak_shape: 0.95, + ratio_cv: 0.1, + centered_apex: 0.5, + precursor_coelution: 0.9, + fragment_coverage: 0.8, + precursor_apex_match: 0.7, + xic_quality: 0.75, + fragment_apex_agreement: 0.85, + isotope_correlation: 0.9, + gaussian_correlation: 0.88, + per_frag_gaussian_corr: 0.87, + rising_cycles: 3, + falling_cycles: 2, + npeaks: 5, + n_scored_fragments: 6, + ms2_summed_intensity: 1000.0, + ms1_summed_intensity: 500.0, + ms2_mz_errors: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], + ms2_mobility_errors: [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07], + ms1_mz_errors: [0.1, 0.2, 0.3], + ms1_mobility_errors: [0.01, 0.02, 0.03], + ms2_intensity_ratios: [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3], + ms1_intensity_ratios: [0.9, 0.8, 0.7], + } + } + + fn sample_competed_candidate_parsed() -> CompetedCandidate { + let parsed = ParsedSequence { + // PEPTIDEK — 8 residues + residues: smallvec![ + AminoAcid::from_ascii(b'P'), + AminoAcid::from_ascii(b'E'), + AminoAcid::from_ascii(b'P'), + AminoAcid::from_ascii(b'T'), + AminoAcid::from_ascii(b'I'), + AminoAcid::from_ascii(b'D'), + AminoAcid::from_ascii(b'E'), + AminoAcid::from_ascii(b'K'), + ], + mods: smallvec![], + }; + let peptide = Peptide { + raw: Arc::from("PEPTIDEK"), + parsed: Some(parsed), + decoy: DecoyMarking::Target, + decoy_group: 0, + }; + CompetedCandidate { + scoring: base_scoring_fields(peptide), + delta_group: 1.0, + delta_group_ratio: 0.5, + discriminant_score: 0.0, + qvalue: 1.0, + } + } + + fn sample_competed_candidate_unparsed() -> CompetedCandidate { + let peptide = Peptide { + raw: Arc::from("PEPTIDEK"), + parsed: None, + decoy: DecoyMarking::Target, + decoy_group: 0, + }; + CompetedCandidate { + scoring: base_scoring_fields(peptide), + delta_group: 1.0, + delta_group_ratio: 0.5, + discriminant_score: 0.0, + qvalue: 1.0, + } + } + + #[test] + fn features_and_names_same_length() { + let cand = sample_competed_candidate_parsed(); + let feats: Vec = cand.as_feature().into_iter().collect(); + let names = cand.feature_names(); + assert_eq!(feats.len(), names.len()); + } + + #[test] + fn sequence_block_present_when_gate_on() { + let cand = sample_competed_candidate_parsed(); + let names = cand.feature_names(); + assert!(names.contains(&"peptide_length")); + assert!(names.contains(&"aa_count_A")); + assert!(names.contains(&"aa_count_Y")); + assert!(names.contains(&"peptide_n_mods")); + assert_eq!(names[names.len() - 22], "peptide_length"); + assert_eq!(names[names.len() - 1], "peptide_n_mods"); + } + + #[test] + fn sequence_block_absent_when_gate_off() { + let cand = sample_competed_candidate_unparsed(); + let names = cand.feature_names(); + assert!(!names.contains(&"peptide_length")); + assert!(!names.contains(&"peptide_n_mods")); + } + + #[test] + fn gate_delta_is_22_dims() { + let on = sample_competed_candidate_parsed().feature_names().len(); + let off = sample_competed_candidate_unparsed().feature_names().len(); + assert_eq!(on - off, 22); + } + + #[test] + fn base_feature_count_locked() { + let off = sample_competed_candidate_unparsed().feature_names().len(); + assert_eq!( + off, 86, + "base (gate-off) feature count is locked at 86; update this test if the set intentionally changes" + ); + let on = sample_competed_candidate_parsed().feature_names().len(); + assert_eq!(on, 108, "gate-on total is 86 base + 22 sequence = 108"); + } +} diff --git a/rust/timsseek/src/models/decoy.rs b/rust/timsseek/src/models/decoy.rs index bebcecf9..c7496f29 100644 --- a/rust/timsseek/src/models/decoy.rs +++ b/rust/timsseek/src/models/decoy.rs @@ -1,13 +1,9 @@ use serde::Serialize; -/// The different labels that denote if a sequence is a decoy or not. -/// -/// NOTE: The main difference between the decoy and reversed decoy is that the reversed decoy -/// has already been reversed, thus converting it to a string can be done as-is. +/// Target vs decoy marker on a peptide. #[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq, std::hash::Hash, PartialOrd, Ord)] pub enum DecoyMarking { Target, - NonReversedDecoy, ReversedDecoy, } @@ -15,44 +11,15 @@ impl DecoyMarking { pub fn as_str(&self) -> &'static str { match self { DecoyMarking::Target => "Target", - DecoyMarking::NonReversedDecoy => "Decoy", DecoyMarking::ReversedDecoy => "Decoy", } } pub fn is_decoy(&self) -> bool { - match self { - DecoyMarking::Target => false, - DecoyMarking::NonReversedDecoy => true, - DecoyMarking::ReversedDecoy => true, - } + matches!(self, DecoyMarking::ReversedDecoy) } pub fn is_target(&self) -> bool { - !self.is_decoy() - } -} - -/// Helper function to convert a sequence into its decoy form -pub(crate) fn as_decoy_string(sequence: &str) -> String { - let mut sequence = sequence.to_string(); - let inner_rev = sequence[1..(sequence.len() - 1)] - .chars() - .rev() - .collect::(); - sequence.replace_range(1..(sequence.len() - 1), &inner_rev); - sequence -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_decoy() { - let sequence = "PEPTIDEPINK"; - let decoy = as_decoy_string(sequence); - assert_eq!(sequence, "PEPTIDEPINK"); - assert_eq!(decoy, "PNIPEDITPEK"); + matches!(self, DecoyMarking::Target) } } diff --git a/rust/timsseek/src/models/digest.rs b/rust/timsseek/src/models/digest.rs index df08d517..85505494 100644 --- a/rust/timsseek/src/models/digest.rs +++ b/rust/timsseek/src/models/digest.rs @@ -1,7 +1,4 @@ -use super::decoy::{ - DecoyMarking, - as_decoy_string, -}; +use super::decoy::DecoyMarking; use serde::Serialize; use std::collections::HashSet; use std::fmt::Display as FmtDisplay; @@ -9,14 +6,14 @@ use std::ops::Range; use std::sync::Arc; #[derive(Debug, Clone, PartialEq, Eq)] -pub struct DigestSlice { +pub struct ProteinSlice { ref_seq: Arc, range: Range, pub decoy: DecoyMarking, pub decoy_group: u32, } -impl Serialize for DigestSlice { +impl Serialize for ProteinSlice { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -26,7 +23,7 @@ impl Serialize for DigestSlice { } } -impl DigestSlice { +impl ProteinSlice { pub fn new( ref_seq: Arc, range: Range, @@ -41,9 +38,9 @@ impl DigestSlice { } } - pub fn from_string(seq: String, decoy: bool, decoy_group: u32) -> DigestSlice { + pub fn from_string(seq: String, decoy: bool, decoy_group: u32) -> ProteinSlice { let len = seq.len(); - DigestSlice { + ProteinSlice { ref_seq: seq.into(), range: 0..(len as u16), decoy: if decoy { @@ -55,21 +52,6 @@ impl DigestSlice { } } - pub fn as_decoy(&self) -> DigestSlice { - DigestSlice { - ref_seq: self.ref_seq.clone(), - range: self.range.clone(), - decoy: DecoyMarking::NonReversedDecoy, - decoy_group: self.decoy_group, - } - } - - pub fn as_decoy_string(&self) -> String { - as_decoy_string( - &self.ref_seq.as_ref()[(self.range.start as usize)..(self.range.end as usize)], - ) - } - pub fn len(&self) -> usize { self.range.len() } @@ -79,42 +61,29 @@ impl DigestSlice { } /// Returns the peptide sequence as a string slice without allocation. - /// For Target and ReversedDecoy, returns the raw slice. - /// For NonReversedDecoy, this returns the ORIGINAL (non-reversed) sequence — - /// use `Into::` if you need the decoy form. pub fn as_str(&self) -> &str { &self.ref_seq[self.range.start as usize..self.range.end as usize] } pub fn is_decoy(&self) -> bool { - matches!( - self.decoy, - DecoyMarking::ReversedDecoy | DecoyMarking::NonReversedDecoy - ) + matches!(self.decoy, DecoyMarking::ReversedDecoy) } } -impl FmtDisplay for DigestSlice { +impl FmtDisplay for ProteinSlice { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let local_str: String = self.clone().into(); write!(f, "{}", local_str) } } -impl From for String { - // TODO make this a cow ... maybe ... - fn from(x: DigestSlice) -> Self { - let tmp = &x.ref_seq.as_ref()[(x.range.start as usize)..(x.range.end as usize)]; - - match x.decoy { - DecoyMarking::Target => tmp.to_string(), - DecoyMarking::ReversedDecoy => tmp.to_string(), - DecoyMarking::NonReversedDecoy => as_decoy_string(tmp), - } +impl From for String { + fn from(x: ProteinSlice) -> Self { + x.ref_seq.as_ref()[(x.range.start as usize)..(x.range.end as usize)].to_string() } } -pub fn deduplicate_digests(mut digest_slices: Vec) -> Vec { +pub fn deduplicate_digests(mut digest_slices: Vec) -> Vec { let mut seen = HashSet::new(); digest_slices.retain(|x| { let local_str: String = x.clone().into(); @@ -134,26 +103,26 @@ mod tests { let seq: Arc = "PEPTIDEPINKTOMATOTOMATO".into(); let seq2: Arc = "PEPTIDEPINKTOMATO".into(); let seq2_rep: Arc = "PEPTIDEPINKTOMATO".into(); - let digests: Vec = vec![ - DigestSlice { + let digests: Vec = vec![ + ProteinSlice { ref_seq: seq.clone(), range: (0_u16..seq.as_ref().len() as u16), decoy: DecoyMarking::Target, decoy_group: 1, }, - DigestSlice { + ProteinSlice { ref_seq: seq.clone(), range: (0_u16..seq2.as_ref().len() as u16), // Note the short length decoy: DecoyMarking::Target, decoy_group: 1, }, - DigestSlice { + ProteinSlice { ref_seq: seq2.clone(), range: (0_u16..seq2.as_ref().len() as u16), decoy: DecoyMarking::Target, decoy_group: 1, }, - DigestSlice { + ProteinSlice { ref_seq: seq2_rep.clone(), range: (0_u16..seq2_rep.as_ref().len() as u16), decoy: DecoyMarking::Target, @@ -169,7 +138,7 @@ mod tests { #[test] fn test_as_str() { let seq: Arc = "PEPTIDEPINKTOMATO".into(); - let slice = DigestSlice::new(seq.clone(), 0..7, DecoyMarking::Target, 0); + let slice = ProteinSlice::new(seq.clone(), 0..7, DecoyMarking::Target, 0); assert_eq!(slice.as_str(), "PEPTIDE"); assert_eq!(slice.as_str().as_bytes(), b"PEPTIDE"); } @@ -178,8 +147,8 @@ mod tests { fn test_from_string() { let seq = "PEPTIDEPINKTOMATO".to_string(); let expect_len = seq.len(); - let digests = DigestSlice::from_string(seq.clone(), false, 1); - let digests_decoy = DigestSlice::from_string(seq.clone(), true, 2); + let digests = ProteinSlice::from_string(seq.clone(), false, 1); + let digests_decoy = ProteinSlice::from_string(seq.clone(), true, 2); assert_eq!(digests.len(), expect_len); assert_eq!(digests.decoy, DecoyMarking::Target); assert_eq!(digests_decoy.len(), expect_len); diff --git a/rust/timsseek/src/models/mod.rs b/rust/timsseek/src/models/mod.rs index 7ff8dc06..214a364b 100644 --- a/rust/timsseek/src/models/mod.rs +++ b/rust/timsseek/src/models/mod.rs @@ -2,14 +2,30 @@ mod decoy; mod decoy_strategy; mod digest; pub mod query_item; +pub mod sequence; pub use decoy::DecoyMarking; pub use decoy_strategy::DecoyStrategy; pub use digest::{ - DigestSlice, + ProteinSlice, deduplicate_digests, }; pub use query_item::{ ExpectedIntensities, QueryItemToScore, }; +pub use sequence::{ + AA_COUNT_NAMES, + AminoAcid, + CANONICAL_AA_INDICES, + CANONICAL_AA_LETTERS, + Mod, + ModEntry, + POS_C_TERM, + POS_N_TERM, + ParsedSequence, + Peptide, + SeqFormat, + SpeclibMeta, + UNKNOWN_AA, +}; diff --git a/rust/timsseek/src/models/query_item.rs b/rust/timsseek/src/models/query_item.rs index 37049f23..6c9c92d0 100644 --- a/rust/timsseek/src/models/query_item.rs +++ b/rust/timsseek/src/models/query_item.rs @@ -1,4 +1,4 @@ -use super::DigestSlice; +use crate::models::sequence::Peptide; use micromzpaf::IonAnnot; use serde::{ Deserialize, @@ -182,7 +182,7 @@ impl ExpectedIntensities { #[derive(Debug, Clone)] pub struct QueryItemToScore { // Kinda hate this - pub digest: DigestSlice, + pub digest: Peptide, pub query: TimsElutionGroup, pub expected_intensity: ExpectedIntensities, } @@ -225,8 +225,13 @@ impl QueryItemToScore { (2, 0.3), ), }; - let pepseq = "PEPTIDEPINKPEPTIDE".into(); - let digest = DigestSlice::from_string(pepseq, false, 1); + let raw: std::sync::Arc = "PEPTIDEPINKPEPTIDE".into(); + let digest = Peptide { + raw, + parsed: None, + decoy: crate::models::decoy::DecoyMarking::Target, + decoy_group: 1, + }; let query = eg; let expected_intensity = ei; QueryItemToScore { diff --git a/rust/timsseek/src/models/sequence.rs b/rust/timsseek/src/models/sequence.rs new file mode 100644 index 00000000..f916b0dc --- /dev/null +++ b/rust/timsseek/src/models/sequence.rs @@ -0,0 +1,393 @@ +//! Peptide and mod representation for per-entry speclib rows. +//! +//! `Peptide` is used on scoring paths. `ProteinSlice` stays on FASTA digestion. + +use crate::models::decoy::DecoyMarking; +use serde::Serialize; +use smallvec::SmallVec; +use std::sync::Arc; + +/// Amino acid stored as alphabet offset `c - b'A'` (0..=25). `u8::MAX` +/// means "unrecognized / non-alpha". Unreachable slots in count buffers +/// (B=1, J=9, O=14, U=20, X=23, Z=25) stay zero. +/// +/// Chosen over canonical-20 index storage so parse cost is one subtraction +/// per residue (no 20-scan `.find`) and `aa_counts` inner loop is branchless. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct AminoAcid(pub u8); + +pub const UNKNOWN_AA: AminoAcid = AminoAcid(u8::MAX); + +/// Canonical output order for the 20-dim count vector: +/// A C D E F G H I K L M N P Q R S T V W Y. +/// Do not reorder — `FEATURE_NAMES` follows this. +pub const CANONICAL_AA_INDICES: [usize; 20] = [ + 0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 24, +]; + +pub const CANONICAL_AA_LETTERS: [u8; 20] = *b"ACDEFGHIKLMNPQRSTVWY"; + +/// Feature-vector names for the 20-dim AA-count block, derived from +/// [`CANONICAL_AA_LETTERS`] so the order can never drift out of sync. +/// `AA_COUNT_NAMES[i]` is `format!("aa_count_{}", CANONICAL_AA_LETTERS[i] as char)` +/// with a single one-time allocation leaked to `&'static str`. +pub static AA_COUNT_NAMES: std::sync::LazyLock<[&'static str; 20]> = + std::sync::LazyLock::new(|| { + let mut out: [&'static str; 20] = [""; 20]; + for (i, &c) in CANONICAL_AA_LETTERS.iter().enumerate() { + let s = format!("aa_count_{}", c as char); + out[i] = Box::leak(s.into_boxed_str()); + } + out + }); + +impl AminoAcid { + pub fn from_ascii(c: u8) -> AminoAcid { + if c.is_ascii_uppercase() { + AminoAcid(c - b'A') + } else { + UNKNOWN_AA + } + } + + /// Returns `true` for any ASCII uppercase letter (`A`..=`Z`), including + /// non-standard codes `B/J/O/U/X/Z`. These non-canonical codes occupy + /// slots in the 26-wide count buffer but are never projected into the + /// 20-dim `aa_counts` output. For "is this one of the canonical 20?", + /// check `CANONICAL_AA_INDICES.contains(&(self.0 as usize))`. + pub fn is_known(self) -> bool { + self.0 < 26 + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Mod { + Unimod(u16), + Mass(f32), +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct ModEntry { + /// Residue position. Sentinel values: `POS_N_TERM` (0xFF) / `POS_C_TERM` (0xFE). + /// Otherwise a 0-based residue index, valid range `0..=253`. Sequences + /// longer than 254 residues cannot be represented — the parser rejects them + /// in `parse_sequence` rather than truncating here. + pub pos: u8, + pub kind: Mod, +} + +pub const POS_N_TERM: u8 = 0xFF; +pub const POS_C_TERM: u8 = 0xFE; + +#[derive(Debug, Clone, PartialEq)] +pub struct ParsedSequence { + pub residues: SmallVec<[AminoAcid; 32]>, + pub mods: SmallVec<[ModEntry; 2]>, +} + +impl ParsedSequence { + pub fn aa_counts(&self) -> [f64; 20] { + // Count into 26-wide alphabet buffer (branchless hot loop). + let mut tmp = [0.0_f64; 26]; + for r in &self.residues { + if r.is_known() { + tmp[r.0 as usize] += 1.0; + } + } + // Project to canonical 20-dim output. + let mut out = [0.0_f64; 20]; + for (i, &idx) in CANONICAL_AA_INDICES.iter().enumerate() { + out[i] = tmp[idx]; + } + out + } +} + +#[derive(Debug, Clone)] +pub struct Peptide { + pub raw: Arc, + pub parsed: Option, + pub decoy: DecoyMarking, + pub decoy_group: u32, +} + +impl Peptide { + pub fn as_str(&self) -> &str { + &self.raw + } + + /// Byte length of the raw sequence string (not residue count). + pub fn len(&self) -> usize { + self.raw.len() + } + + pub fn is_empty(&self) -> bool { + self.raw.is_empty() + } + + /// `true` when this entry is a decoy (reversed or mass-shifted). + pub fn is_decoy(&self) -> bool { + matches!( + self.decoy, + crate::models::decoy::DecoyMarking::ReversedDecoy + ) + } + + pub fn length(&self) -> Option { + self.parsed.as_ref().map(|p| p.residues.len() as u8) + } + + pub fn aa_counts(&self) -> Option<[f64; 20]> { + self.parsed.as_ref().map(|p| p.aa_counts()) + } + + pub fn n_mods(&self) -> Option { + self.parsed.as_ref().map(|p| p.mods.len() as u8) + } +} + +impl Serialize for Peptide { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_str(&self.raw) + } +} + +/// Speclib-level metadata. Lives on `Speclib` struct. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum SeqFormat { + /// Bare AA only, no mods (e.g. `PEPTIDEK`). + Plain, + /// Modified ProForma-able form (`[UNIMOD:4]`, `[+15.995]`, `_..._` OK). + Modified, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct SpeclibMeta { + pub parsable_sequences: bool, + pub sequence_format: SeqFormat, +} + +impl Default for SpeclibMeta { + fn default() -> Self { + Self { + parsable_sequences: false, + sequence_format: SeqFormat::Plain, + } + } +} + +/// Parse a ProForma-normalized peptide string into our thin representation. +/// Returns `None` on parse error, non-linear peptides, or mods outside the +/// `Unimod`/`Mass` subset. Off the hot path — runs once per speclib load. +pub fn parse_sequence(normalized: &str) -> Option { + use rustyms::prelude::IsAminoAcid; + use rustyms::sequence::Peptidoform; + + let pf = Peptidoform::pro_forma(normalized, None).ok()?; + let linear = pf.into_linear()?; + + let mut residues: SmallVec<[AminoAcid; 32]> = SmallVec::new(); + let mut mods: SmallVec<[ModEntry; 2]> = SmallVec::new(); + + for (i, el) in linear.sequence().iter().enumerate() { + if i > 253 { + return None; + } + let c = el.aminoacid.pro_forma_definition().chars().next()?; + residues.push(AminoAcid::from_ascii(c as u8)); + + for m in &el.modifications { + let kind = modification_to_mod(m)?; + mods.push(ModEntry { pos: i as u8, kind }); + } + } + + for m in linear.get_n_term() { + let kind = modification_to_mod(m)?; + mods.push(ModEntry { + pos: POS_N_TERM, + kind, + }); + } + for m in linear.get_c_term() { + let kind = modification_to_mod(m)?; + mods.push(ModEntry { + pos: POS_C_TERM, + kind, + }); + } + + Some(ParsedSequence { residues, mods }) +} + +fn modification_to_mod(m: &rustyms::sequence::Modification) -> Option { + use rustyms::ontology::Ontology; + use rustyms::sequence::{ + Modification, + SimpleModificationInner, + }; + let simple = match m { + Modification::Simple(s) => s, + _ => return None, // Cross-link / ambiguous — out of v1 scope + }; + match simple.as_ref() { + SimpleModificationInner::Mass(mass) => Some(Mod::Mass(mass.value as f32)), + SimpleModificationInner::Database { id, .. } => { + if id.ontology == Ontology::Unimod { + Some(Mod::Unimod(id.id? as u16)) + } else { + None + } + } + _ => None, + } +} + +/// Coerce DIA-NN / short-form modified-sequence strings into rustyms-parseable +/// ProForma. Strips `_..._` wrapping used by DIA-NN and normalizes UNIMOD tag +/// casing (`[UniMod:`, `[Unimod:`, `[U:` → `[UNIMOD:`). Pass-through otherwise. +/// Off the hot path — allocates on every replacement, which is fine at load. +pub fn normalize_to_proforma(raw: &str) -> String { + let trimmed = raw.trim_matches('_'); + // Fast path: plain-AA sequences (no mod tags) skip the replace chain. + if !trimmed.contains('[') { + return trimmed.to_owned(); + } + // Longer prefixes first so `[Uni...` doesn't partially-rewrite over `[U:`. + trimmed + .replace("[UniMod:", "[UNIMOD:") + .replace("[Unimod:", "[UNIMOD:") + .replace("[U:", "[UNIMOD:") +} + +#[cfg(test)] +mod tests { + use super::*; + use smallvec::smallvec; + + fn ci(ch: u8) -> usize { + CANONICAL_AA_LETTERS.iter().position(|&x| x == ch).unwrap() + } + + #[test] + fn aa_counts_peptidek() { + let seq = ParsedSequence { + residues: "PEPTIDEK".bytes().map(AminoAcid::from_ascii).collect(), + mods: smallvec![], + }; + let c = seq.aa_counts(); + assert_eq!(c[ci(b'P')], 2.0); + assert_eq!(c[ci(b'E')], 2.0); + assert_eq!(c[ci(b'T')], 1.0); + assert_eq!(c[ci(b'I')], 1.0); + assert_eq!(c[ci(b'D')], 1.0); + assert_eq!(c[ci(b'K')], 1.0); + assert_eq!(c[ci(b'A')], 0.0); + assert_eq!(c.iter().sum::(), 8.0); + } + + #[test] + fn aa_counts_skip_unknown() { + // 'X' maps to a non-canonical alpha slot (tmp[23]); never projected to out[]. + let seq = ParsedSequence { + residues: "AXA".bytes().map(AminoAcid::from_ascii).collect(), + mods: smallvec![], + }; + let c = seq.aa_counts(); + assert_eq!(c[ci(b'A')], 2.0); + assert_eq!(c.iter().sum::(), 2.0); + } + + #[test] + fn aa_encoding_alpha_offset() { + assert_eq!(AminoAcid::from_ascii(b'A').0, 0); + assert_eq!(AminoAcid::from_ascii(b'Y').0, 24); + assert_eq!(AminoAcid::from_ascii(b'Z').0, 25); + assert_eq!(AminoAcid::from_ascii(b'!').0, u8::MAX); + } + + #[test] + fn normalize_strips_underscores() { + assert_eq!(normalize_to_proforma("_PEPTIDEK_"), "PEPTIDEK"); + } + + #[test] + fn normalize_diann_unimod_case() { + assert_eq!( + normalize_to_proforma("_LSHPGC[UniMod:4]K_"), + "LSHPGC[UNIMOD:4]K" + ); + assert_eq!( + normalize_to_proforma("_C[Unimod:4]TVPGHK_"), + "C[UNIMOD:4]TVPGHK" + ); + } + + #[test] + fn normalize_short_u_form() { + assert_eq!( + normalize_to_proforma("PEPTC[U:4]IDEK"), + "PEPTC[UNIMOD:4]IDEK" + ); + } + + #[test] + fn normalize_mass_shift_unchanged() { + assert_eq!( + normalize_to_proforma("PEPTM[+15.995]IDEK"), + "PEPTM[+15.995]IDEK" + ); + } + + #[test] + fn normalize_plain_unchanged() { + assert_eq!(normalize_to_proforma("PEPTIDEK"), "PEPTIDEK"); + } + + #[test] + fn parse_plain() { + let p = parse_sequence("PEPTIDEK").expect("parse"); + assert_eq!(p.residues.len(), 8); + assert_eq!(p.mods.len(), 0); + assert_eq!(p.residues[0], AminoAcid::from_ascii(b'P')); + assert_eq!(p.residues[7], AminoAcid::from_ascii(b'K')); + } + + #[test] + fn parse_with_unimod() { + // PEPTC[UNIMOD:4]IDEK = 9 residues: P-E-P-T-C-I-D-E-K, C at index 4 + let p = parse_sequence("PEPTC[UNIMOD:4]IDEK").expect("parse"); + assert_eq!(p.residues.len(), 9); + assert_eq!(p.mods.len(), 1); + assert_eq!(p.mods[0].pos, 4); + assert_eq!(p.mods[0].kind, Mod::Unimod(4)); + } + + #[test] + fn parse_with_mass_shift() { + // PEPTM[+15.995]IDEK = 9 residues: P-E-P-T-M-I-D-E-K, M at index 4 + let p = parse_sequence("PEPTM[+15.995]IDEK").expect("parse"); + assert_eq!(p.residues.len(), 9); + assert_eq!(p.mods.len(), 1); + assert_eq!(p.mods[0].pos, 4); + match p.mods[0].kind { + Mod::Mass(m) => assert!((m - 15.995).abs() < 1e-3), + _ => panic!("expected Mass variant"), + } + } + + #[test] + fn parse_rejects_garbage() { + assert!(parse_sequence("not a peptide!!!").is_none()); + } + + #[test] + fn parse_n_term_mod() { + // Acetyl (UNIMOD:1) on n-term + let p = parse_sequence("[Acetyl]-PEPTIDEK").expect("parse"); + assert_eq!(p.residues.len(), 8); + assert_eq!(p.mods.len(), 1); + assert_eq!(p.mods[0].pos, POS_N_TERM); + assert_eq!(p.mods[0].kind, Mod::Unimod(1)); + } +} diff --git a/rust/timsseek/src/scoring/apex_finding.rs b/rust/timsseek/src/scoring/apex_finding.rs index 4dda4ee9..4706d954 100644 --- a/rust/timsseek/src/scoring/apex_finding.rs +++ b/rust/timsseek/src/scoring/apex_finding.rs @@ -33,10 +33,8 @@ use super::{ }; use crate::IonAnnot; use crate::errors::DataProcessingError; -use crate::models::{ - DigestSlice, - ExpectedIntensities, -}; +use crate::models::ExpectedIntensities; +use crate::models::sequence::Peptide; use crate::scoring::scores::apex_features::{ ApexFeatures, SCRIBE_FLOOR, @@ -81,7 +79,7 @@ pub struct CandidateContext { #[derive(Debug, Clone)] pub struct PeptideMetadata { /// The peptide sequence and modification information. - pub digest: DigestSlice, + pub digest: Peptide, /// The precursor charge state. pub charge: u8, diff --git a/rust/timsseek/src/scoring/mod.rs b/rust/timsseek/src/scoring/mod.rs index d8079f21..21d8f7ac 100644 --- a/rust/timsseek/src/scoring/mod.rs +++ b/rust/timsseek/src/scoring/mod.rs @@ -34,6 +34,7 @@ pub use timings::{ PipelineReport, PrescoreTimings, RunReport, + RunStatus, ScoreTimings, }; diff --git a/rust/timsseek/src/scoring/parquet_writer.rs b/rust/timsseek/src/scoring/parquet_writer.rs index ad71fc25..65e8bd0c 100644 --- a/rust/timsseek/src/scoring/parquet_writer.rs +++ b/rust/timsseek/src/scoring/parquet_writer.rs @@ -3,6 +3,7 @@ use arrow::datatypes::*; use arrow::record_batch::RecordBatch; use parquet::arrow::ArrowWriter; use parquet::basic::Compression; +use parquet::file::metadata::KeyValue; use parquet::file::properties::WriterProperties; use std::fs::File; use std::path::Path; @@ -75,7 +76,7 @@ pub fn build_record_batch(results: &[FinalResult]) -> std::io::Result std::io::Result DataType::Utf8, StringArray(|r: &FinalResult| Some(r.scoring.sequence.as_str())); + "sequence" => DataType::Utf8, StringArray(|r: &FinalResult| Some(r.scoring.peptide.as_str())); "library_id" => DataType::UInt32, UInt32Array(|r: &FinalResult| Some(r.scoring.library_id)); "decoy_group_id" => DataType::UInt32, UInt32Array(|r: &FinalResult| Some(r.scoring.decoy_group_id)); "precursor_mz" => DataType::Float64, Float64Array(|r: &FinalResult| Some(r.scoring.precursor_mz)); @@ -283,7 +284,11 @@ pub struct ResultParquetWriter { } impl ResultParquetWriter { - pub fn new(path: impl AsRef, row_group_size: usize) -> std::io::Result { + pub fn new( + path: impl AsRef, + row_group_size: usize, + parsable_sequences: bool, + ) -> std::io::Result { let file = match File::create_new(path.as_ref()) { Ok(f) => f, Err(err) => { @@ -296,8 +301,13 @@ impl ResultParquetWriter { let empty_batch = build_record_batch(&[])?; let schema = empty_batch.schema(); + let kv = vec![KeyValue { + key: "parsable_sequences".to_string(), + value: Some(parsable_sequences.to_string()), + }]; let props = WriterProperties::builder() .set_compression(Compression::SNAPPY) + .set_key_value_metadata(Some(kv)) .build(); let writer = @@ -335,3 +345,45 @@ impl ResultParquetWriter { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use parquet::file::reader::{ + FileReader, + SerializedFileReader, + }; + use std::fs::File; + + #[test] + fn parsable_sequences_key_in_parquet_metadata() { + let tmp = tempfile::NamedTempFile::new().expect("tmpfile"); + let path = tmp.path().to_path_buf(); + // Close the NamedTempFile so ResultParquetWriter can create the file + // (File::create_new fails if the path already exists on some platforms, + // but NamedTempFile holds it open; drop it first). + drop(tmp); + { + let writer = ResultParquetWriter::new(&path, 1024, true).expect("create writer"); + writer.close().expect("close"); + } + let file = File::open(&path).expect("open"); + let reader = SerializedFileReader::new(file).expect("reader"); + let meta = reader.metadata().file_metadata(); + let kv_list = meta.key_value_metadata().expect("kv metadata present"); + let found: Vec<_> = kv_list + .iter() + .filter(|k| k.key == "parsable_sequences") + .collect(); + assert_eq!( + found.len(), + 1, + "expected exactly one parsable_sequences key" + ); + assert_eq!( + found[0].value.as_deref(), + Some("true"), + "parsable_sequences value should be 'true'" + ); + } +} diff --git a/rust/timsseek/src/scoring/results.rs b/rust/timsseek/src/scoring/results.rs index e090f07c..4a0a4064 100644 --- a/rust/timsseek/src/scoring/results.rs +++ b/rust/timsseek/src/scoring/results.rs @@ -8,6 +8,7 @@ use super::apex_finding::{ use super::offsets::MzMobilityOffsets; use super::pipeline::SecondaryLazyScores; use crate::errors::DataProcessingError; +use crate::models::sequence::Peptide; use super::{ NUM_MS1_IONS, @@ -18,7 +19,7 @@ use super::{ #[derive(Debug, Clone, Serialize)] pub struct ScoringFields { // Identity - pub sequence: String, + pub peptide: Peptide, pub library_id: u32, pub decoy_group_id: u32, pub precursor_mz: f64, @@ -194,7 +195,7 @@ impl SetField { #[derive(Debug, Default)] pub struct ScoredCandidateBuilder { // --- Identity --- - sequence: SetField, + peptide: SetField, library_id: SetField, decoy_group_id: SetField, precursor_mz: SetField, @@ -274,7 +275,7 @@ impl ScoredCandidateBuilder { /// Populate identity fields and reference values from peptide metadata. pub fn with_metadata(mut self, metadata: &PeptideMetadata) -> Self { self.library_id = SetField::Some(metadata.library_id); - self.sequence = SetField::Some(String::from(metadata.digest.clone())); + self.peptide = SetField::Some(metadata.digest.clone()); self.is_target = SetField::Some(metadata.digest.decoy.is_target()); self.decoy_group_id = SetField::Some(metadata.digest.decoy_group); self.precursor_charge = SetField::Some(metadata.charge); @@ -408,7 +409,7 @@ impl ScoredCandidateBuilder { let scoring = ScoringFields { // Identity - sequence: expect_some!(sequence), + peptide: expect_some!(peptide), library_id: expect_some!(library_id), decoy_group_id: expect_some!(decoy_group_id), precursor_mz: expect_some!(precursor_mz), diff --git a/rust/timsseek/src/scoring/timings.rs b/rust/timsseek/src/scoring/timings.rs index e218eafa..e7ba14b3 100644 --- a/rust/timsseek/src/scoring/timings.rs +++ b/rust/timsseek/src/scoring/timings.rs @@ -8,15 +8,14 @@ //! //! - [`timed!`] — times a block and accumulates elapsed into a `Duration` field. //! - [`TimedStep`] — progressive CLI output: prints a label immediately, then -//! appends elapsed time when the work finishes. +//! appends elapsed time when the work finishes. Re-exported from +//! [`timscentroid::TimedStep`]. use super::skip::SkipCounts; use serde::Serialize; -use std::fmt; -use std::time::{ - Duration, - Instant, -}; +use std::time::Duration; + +pub use timscentroid::TimedStep; /// Time a block, accumulate elapsed into `$target`, return the block's value. /// @@ -36,78 +35,6 @@ macro_rules! timed { }}; } -/// A timed step that prints a dot-padded label immediately, opens a tracing -/// span, and appends elapsed time on finish. -/// -/// ```ignore -/// let step = TimedStep::begin("Loading speclib"); -/// let speclib = load_speclib()?; -/// let elapsed = step.finish_with(format_args!("{} entries", speclib.len())); -/// // terminal: "Loading speclib .......... 834.567ms (225178 entries)" -/// ``` -pub struct TimedStep { - start: Instant, - stderr: bool, - _span: tracing::span::EnteredSpan, -} - -/// Column width for dot-padded labels on stdout. -const LABEL_WIDTH: usize = 26; - -impl TimedStep { - /// Dot-pad `label` to stdout, open a tracing span, flush, start clock. - pub fn begin(label: impl fmt::Display) -> Self { - let label = label.to_string(); - let span = tracing::info_span!("step", label = label.as_str()); - let dots = LABEL_WIDTH.saturating_sub(label.len() + 1); - if dots > 0 { - print!("{label} {:. Self { - let label = label.to_string(); - let span = tracing::info_span!("step", label = label.as_str()); - eprint!("{label}"); - Self { - start: Instant::now(), - stderr: true, - _span: span.entered(), - } - } - - /// Print ` {elapsed:?}\n`, return Duration. - pub fn finish(self) -> Duration { - let d = self.start.elapsed(); - self.emit(format_args!(" {:?}", d)); - d - } - - /// Print ` {elapsed:?} ({detail})\n`, return Duration. - pub fn finish_with(self, detail: impl fmt::Display) -> Duration { - let d = self.start.elapsed(); - self.emit(format_args!(" {:?} ({})", d, detail)); - d - } - - fn emit(&self, msg: fmt::Arguments<'_>) { - if self.stderr { - eprintln!("{msg}"); - } else { - println!("{msg}"); - } - } -} - /// Accumulated timing measurements for the four scoring stages. /// /// Each field tracks the total time spent in that stage across all queries. @@ -233,20 +160,46 @@ pub struct PipelineReport { pub phase3_skips: SkipCounts, } +/// Terminal state of a run. `Completed` is the normal happy path; `Aborted` +/// means the batch loop stopped early (e.g. I/O error) and the file list is +/// a partial prefix of what was requested. Consumers reading `run_report.json` +/// should treat anything other than `Completed` as incomplete. +#[derive(Debug, Default, Serialize, Clone, Copy, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum RunStatus { + #[default] + Completed, + Aborted, +} + /// Top-level report for an entire CLI invocation. /// Contains shared loading costs and per-file pipeline reports. #[derive(Debug, Default, Serialize)] pub struct RunReport { + /// Terminal status of the run. See [`RunStatus`]. + pub status: RunStatus, + /// When `status == Aborted`, a short human-readable reason. `None` on + /// successful runs. + #[serde(skip_serializing_if = "Option::is_none")] + pub abort_reason: Option, pub load_speclib_ms: u64, pub load_calib_lib_ms: u64, pub speclib_entries: usize, pub calib_lib_entries: usize, pub files: Vec, + /// Top-level run artifacts (run_report.json, config_used.json, etc). + #[serde(default)] + pub artifacts: Vec, } -/// Per-file report: file name + pipeline report. +/// Per-file report: file name + pipeline report + output locations. +/// `outputs` lists every file written for this sample (parquet + sidecars) +/// at their final destination URI so a downstream consumer can transfer +/// them with no extra discovery step. #[derive(Debug, Serialize)] pub struct FileReport { pub file_name: String, pub pipeline: PipelineReport, + #[serde(default)] + pub outputs: Vec, } diff --git a/rust/timsseek_cli/Cargo.toml b/rust/timsseek_cli/Cargo.toml index ab5c3eca..7c78a643 100644 --- a/rust/timsseek_cli/Cargo.toml +++ b/rust/timsseek_cli/Cargo.toml @@ -4,13 +4,26 @@ version.workspace = true edition.workspace = true license.workspace = true +# rustyms is patched (see workspace Cargo.toml `[patch.crates-io]` + patches/) +# to drop the 60MB gnome.dat glycan database we don't use. Fresh-clone setup: +# task setup (or: cargo install --git patch-crate --locked && cargo patch-crate) +# The [patch.crates-io] path in the workspace Cargo.toml requires target/patch/ +# to exist before any cargo resolution — build.rs automation isn't possible +# because cargo reads [patch] before any build script runs, even with the +# jspaezp fork that fixed the tool's own bootstrap paradox. +[package.metadata.patch] +crates = ["rustyms"] + [dependencies] timsseek = { path = "../timsseek", default-features = false } timsquery = { path = "../timsquery" } timscentroid = { path = "../timscentroid" } +tims_stage = { path = "../tims_stage" } alloc_track = { path = "../alloc_track" } +tempfile = { workspace = true } regex = "1.11.1" +chrono = { version = "0.4", default-features = false, features = ["clock"] } # tracing-profile = { version="0.10.11",features=["perfetto"] } tracing-profile = { git = "https://github.com/jspaezp/tracing-profile.git" , branch = "feat/aggregate_common", optional = true } @@ -37,7 +50,11 @@ path = "src/sample_speclib.rs" mimalloc = { workspace = true, features = ["secure"] } [features] -default = ["parallel"] +default = ["parallel", "aws"] parallel = ["timsseek/rayon"] instrumentation = ["timsseek/instrumentation", "tracing-profile"] track-alloc = ["alloc_track/enabled"] +# S3 is default (see above). GCS/Azure stay opt-in — add when actually needed. +aws = ["tims_stage/aws", "timscentroid/aws"] +gcp = ["tims_stage/gcp", "timscentroid/gcp"] +azure = ["tims_stage/azure", "timscentroid/azure"] diff --git a/rust/timsseek_cli/assets/default_config.toml b/rust/timsseek_cli/assets/default_config.toml index 21b0e86c..e38e81c1 100644 --- a/rust/timsseek_cli/assets/default_config.toml +++ b/rust/timsseek_cli/assets/default_config.toml @@ -10,10 +10,10 @@ # # The CLI also accepts JSON configs; TOML is preferred for hand-authored ones. -## Input spectral library (optional — `--speclib-file` overrides this). +## Input spectral library (optional — `--speclib-uri` overrides this). # [input] # type = "speclib" -# path = "path/to/library.msgpack.zst" +# uri = "path/to/library.msgpack.zst" [analysis] @@ -69,6 +69,18 @@ min_rt_tolerance_minutes = 0.5 calibration_query_rt_window_minutes = 0.083333336 # 5 seconds (5 / 60) -## Output directory (optional — `--output-dir` overrides this). +## Output directory (optional — `--output-uri` overrides this). # [output] -# directory = "./results" +# uri = "./results" + + +## Staging controls for S3/remote raw inputs (all optional). +## Tempdirs are created under `tempdir_root` (system temp when unset); +## remote prefixes are capped at `max_prefix_keys` keys; `save_sidecar` +## persists a `.idx` next to the input so repeat runs skip the build; +## `stale_sweep_age_hours` reclaims leaked tempdirs from prior crashes. +# [staging] +# tempdir_root = "/scratch/timsseek" +# max_prefix_keys = 256 +# save_sidecar = false +# stale_sweep_age_hours = 24 diff --git a/rust/timsseek_cli/examples/load_bench.rs b/rust/timsseek_cli/examples/load_bench.rs new file mode 100644 index 00000000..311d7ca9 --- /dev/null +++ b/rust/timsseek_cli/examples/load_bench.rs @@ -0,0 +1,125 @@ +//! Benchmark index loading through each `tims_stage::resolve()` branch +//! without the full search pipeline. +//! +//! Reads one URI from the `LOAD_BENCH_URI` env var. Resolves, stages (if +//! needed), loads/builds the index, prints explicit timings for each +//! phase, and exits. No scoring, no output writes. +//! +//! Use this to measure: +//! - S3 tar staging time vs prefix staging time vs local-idx load vs +//! remote-idx load. +//! - Per-phase breakdown (resolve probe, stage, build/load). +//! +//! Build + run: +//! cargo build --release --features aws -p timsseek_cli --example load_bench +//! LOAD_BENCH_URI='s3://bkt/sample.d.tar' \ +//! ./target/release/examples/load_bench +//! +//! Env: +//! LOAD_BENCH_URI (required) — URI of the raw input to resolve+load. +//! RUST_LOG (optional) — default "info,tims_stage=info,timscentroid=info". + +use std::time::Instant; + +use tims_stage::{ + PerRunTempdir, + Resolved, + StagingBackend, + StagingConfig, +}; +use timscentroid::{ + CentroidingConfig, + IndexedTimstofPeaks, + StorageLocation, +}; +use timsrust::TimsTofPath; +use tracing::info; + +fn main() -> Result<(), Box> { + let filter = std::env::var("RUST_LOG") + .unwrap_or_else(|_| "info,tims_stage=info,timscentroid=info".to_string()); + tracing_subscriber::fmt() + .with_env_filter(filter) + .with_target(true) + .with_writer(std::io::stderr) + .init(); + + let uri = std::env::var("LOAD_BENCH_URI").map_err(|_| "LOAD_BENCH_URI env var required")?; + + info!(%uri, "load_bench start"); + let t_total = Instant::now(); + + // Phase 1: resolve (pure + one HEAD for sidecar on non-.idx inputs). + let t = Instant::now(); + let resolved = tims_stage::resolve(&uri)?; + let resolve_ms = t.elapsed().as_millis(); + info!( + elapsed_ms = resolve_ms, + kind = kind_of(&resolved), + "resolve done" + ); + + // Phase 2 + 3: stage (if Stageable) + load/build. + match resolved { + Resolved::LocalIdx { loc } | Resolved::RemoteIdx { loc } => { + let t = Instant::now(); + let _idx = IndexedTimstofPeaks::load_from_storage(loc)?; + info!( + elapsed_ms = t.elapsed().as_millis(), + "load_from_storage done" + ); + } + Resolved::LocalDotD { path } => { + let t = Instant::now(); + let tt = TimsTofPath::new(path.to_str().unwrap()) + .map_err(|e| format!("TimsTofPath::new: {e:?}"))?; + let (_idx, stats) = + IndexedTimstofPeaks::from_timstof_file(&tt, CentroidingConfig::default()); + info!( + elapsed_ms = t.elapsed().as_millis(), + ms1_time_ms = stats.ms1_total_time.as_millis(), + ms2_time_ms = stats.ms2_total_time.as_millis(), + "from_timstof_file (local .d) done" + ); + } + Resolved::Stageable { spec } => { + let backend = PerRunTempdir::new(StagingConfig::default())?; + let t = Instant::now(); + let staged = backend.stage(&spec)?; + let stage_ms = t.elapsed().as_millis(); + info!(elapsed_ms = stage_ms, dotd = ?staged.as_ref(), "stage done"); + + let t = Instant::now(); + let tt = TimsTofPath::new(staged.as_ref().to_str().unwrap()) + .map_err(|e| format!("TimsTofPath::new: {e:?}"))?; + let (_idx, stats) = + IndexedTimstofPeaks::from_timstof_file(&tt, CentroidingConfig::default()); + info!( + elapsed_ms = t.elapsed().as_millis(), + ms1_time_ms = stats.ms1_total_time.as_millis(), + ms2_time_ms = stats.ms2_total_time.as_millis(), + "from_timstof_file (staged) done" + ); + } + }; + + info!(total_ms = t_total.elapsed().as_millis(), "load_bench end"); + Ok(()) +} + +fn kind_of(r: &Resolved) -> &'static str { + match r { + Resolved::LocalIdx { .. } => "LocalIdx", + Resolved::RemoteIdx { .. } => "RemoteIdx", + Resolved::LocalDotD { .. } => "LocalDotD", + Resolved::Stageable { spec } => match spec { + tims_stage::SourceSpec::S3Tar { .. } => "Stageable(S3Tar)", + tims_stage::SourceSpec::S3Prefix { .. } => "Stageable(S3Prefix)", + tims_stage::SourceSpec::LocalTar { .. } => "Stageable(LocalTar)", + }, + } +} + +// Silence unused import if StorageLocation is re-exported only when feature is on. +#[allow(dead_code)] +fn _unused(_: Option) {} diff --git a/rust/timsseek_cli/src/cli.rs b/rust/timsseek_cli/src/cli.rs index 92d6ab8f..ab94eb1c 100644 --- a/rust/timsseek_cli/src/cli.rs +++ b/rust/timsseek_cli/src/cli.rs @@ -21,23 +21,24 @@ pub struct Cli { /// Path to the .d file(s) (will over-write the config file) /// Can be specified multiple times for batch processing - #[arg(short, long, value_name = "FILE")] - pub dotd_files: Vec, + #[arg(long = "raw-inputs", alias = "dotd-files", value_name = "URI")] + pub raw_inputs: Vec, /// Path to the speclib file (will over-write the config file) - #[arg(short, long)] - pub speclib_file: Option, + #[arg(long = "speclib-uri", alias = "speclib-file")] + pub speclib_uri: Option, - /// Path to a calibration library (optional). + /// URI of a calibration library (optional). /// If provided, Phase 1 prescore uses this library for calibrant selection, /// while Phase 3 scoring uses the main speclib. /// If not provided, the main speclib is used for both phases. - #[arg(long)] - pub calib_lib: Option, + /// Accepts local paths or s3:// URIs. + #[arg(long, value_name = "URI")] + pub calib_lib: Option, /// Path to the output directory - #[arg(short, long)] - pub output_dir: Option, + #[arg(long = "output-uri", short = 'o', alias = "output-dir")] + pub output_uri: Option, /// Overwrite existing output directory if it exists #[arg(short = 'O', long)] @@ -64,4 +65,8 @@ pub struct Cli { /// Write the default TOML configuration to the given path and exit. #[arg(long, value_name = "PATH")] pub write_default_config: Option, + + /// Skip writing results.feature_stats.json sidecar after rescoring. + #[arg(long)] + pub no_feature_stats: bool, } diff --git a/rust/timsseek_cli/src/config.rs b/rust/timsseek_cli/src/config.rs index 86012111..148f5cae 100644 --- a/rust/timsseek_cli/src/config.rs +++ b/rust/timsseek_cli/src/config.rs @@ -2,7 +2,6 @@ use serde::{ Deserialize, Serialize, }; -use std::path::PathBuf; use timsquery::Tolerance; use timsquery::models::tolerance::{ MobilityTolerance, @@ -26,19 +25,56 @@ pub struct Config { #[serde(default = "CalibrationConfig::default")] pub calibration: CalibrationConfig, pub output: Option, + #[serde(default)] + pub staging: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(deny_unknown_fields)] +pub struct StagingConfig { + #[serde(default)] + pub tempdir_root: Option, + #[serde(default = "default_max_prefix_keys")] + pub max_prefix_keys: usize, + #[serde(default)] + pub save_sidecar: bool, + #[serde(default = "default_sweep_age")] + pub stale_sweep_age_hours: u64, +} + +fn default_max_prefix_keys() -> usize { + 256 +} +fn default_sweep_age() -> u64 { + 24 +} + +impl Default for StagingConfig { + fn default() -> Self { + Self { + tempdir_root: None, + max_prefix_keys: 256, + save_sidecar: false, + stale_sweep_age_hours: 24, + } + } } #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "type", deny_unknown_fields)] pub enum InputConfig { #[serde(rename = "speclib")] - Speclib { path: PathBuf }, + Speclib { + #[serde(alias = "path")] + uri: String, + }, } #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(deny_unknown_fields)] pub struct AnalysisConfig { - pub dotd_files: Option>, + #[serde(alias = "dotd_files")] + pub raw_inputs: Option>, pub chunk_size: usize, pub tolerance: Tolerance, @@ -49,7 +85,8 @@ pub struct AnalysisConfig { #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(deny_unknown_fields)] pub struct OutputConfig { - pub directory: PathBuf, + #[serde(alias = "directory")] + pub uri: String, } impl Config { @@ -64,7 +101,7 @@ impl Config { Config { input: None, analysis: AnalysisConfig { - dotd_files: None, + raw_inputs: None, chunk_size: 20000, tolerance: Tolerance { ms: MzTolerance::Ppm((15.0, 15.0)), @@ -76,6 +113,7 @@ impl Config { }, calibration: CalibrationConfig::default(), output: None, + staging: None, } } } diff --git a/rust/timsseek_cli/src/main.rs b/rust/timsseek_cli/src/main.rs index 039ba05e..a8f005d3 100644 --- a/rust/timsseek_cli/src/main.rs +++ b/rust/timsseek_cli/src/main.rs @@ -4,16 +4,21 @@ mod errors; mod processing; use clap::Parser; -use timsquery::TimsTofPath; -use timsquery::serde::load_index_auto; use timsquery::utils::TupleRange; +use timsquery::{ + CentroidingConfig, + IndexedTimstofPeaks, + load_index, +}; use timsseek::scoring::Scorer; use timsseek::scoring::timings::TimedStep; use tracing::{ error, info, + info_span, }; use tracing_subscriber::filter::EnvFilter; +use tracing_subscriber::fmt::format::FmtSpan; use tracing_subscriber::fmt::{ self, }; @@ -49,13 +54,18 @@ static GLOBAL: alloc_track::TrackingAllocator = alloc_track::TrackingAllocator:: /// Validated inputs ready for processing struct ValidatedInputs { - dotd_files: Vec, - speclib_path: std::path::PathBuf, - calib_lib_path: Option, - output_directory: std::path::PathBuf, + raw_inputs: Vec, + speclib_uri: String, + calib_lib_uri: Option, + output_uri: String, overwrite: bool, } +use tims_stage::{ + expand_local_uri, + is_remote_uri, +}; + /// Validates all inputs and outputs before processing begins. /// Returns ValidatedInputs on success, or an error if any validation fails. fn validate_inputs( @@ -64,19 +74,22 @@ fn validate_inputs( ) -> std::result::Result { info!("Validating inputs and outputs before processing..."); - // Get list of raw files to process - let dotd_files = match config.analysis.dotd_files.clone() { - Some(files) => files, + // Get list of raw files to process. `~` in local paths is not expanded + // by `Path::exists` / `File::open`; canonicalise via tims_stage once so + // every downstream consumer (validation, staging, probes) sees the same + // expanded form. + let raw_inputs: Vec = match config.analysis.raw_inputs.clone() { + Some(files) => files.iter().map(|f| expand_local_uri(f)).collect(), None => { return Err(errors::CliError::Config { - source: "No raw files provided, please provide dotd_files in either the config file or with the --dotd-files flag".to_string(), + source: "No raw files provided, please provide raw_inputs in either the config file or with the --raw-inputs flag".to_string(), }); } }; - // Get speclib path - let speclib_path = match &config.input { - Some(InputConfig::Speclib { path }) => path.clone(), + // Get speclib URI + let speclib_uri: String = match &config.input { + Some(InputConfig::Speclib { uri }) => expand_local_uri(uri), None => { return Err(errors::CliError::Config { source: "No input specified".to_string(), @@ -84,9 +97,9 @@ fn validate_inputs( } }; - // Get output directory - let output_directory = match &config.output { - Some(output_config) => output_config.directory.clone(), + // Get output URI (local path or s3:// etc.) + let output_uri: String = match &config.output { + Some(output_config) => expand_local_uri(&output_config.uri), None => { return Err(errors::CliError::Config { source: "No output directory specified".to_string(), @@ -94,180 +107,268 @@ fn validate_inputs( } }; - // Validate speclib exists - if !speclib_path.exists() { + // Validate speclib exists (local only — remote resolution happens at open) + if !is_remote_uri(&speclib_uri) && !std::path::Path::new(&speclib_uri).exists() { return Err(errors::CliError::Io { source: "Speclib file does not exist".to_string(), - path: Some(speclib_path.to_string_lossy().to_string()), + path: Some(speclib_uri.clone()), }); } - info!("✓ Speclib file exists: {:?}", speclib_path); + info!("✓ Speclib URI: {}", speclib_uri); // Validate calib lib if provided - let calib_lib_path = args.calib_lib.clone(); - if let Some(ref path) = calib_lib_path { - if !path.exists() { + let calib_lib_uri: Option = args.calib_lib.as_deref().map(expand_local_uri); + if let Some(ref uri) = calib_lib_uri { + if !is_remote_uri(uri) && !std::path::Path::new(uri).exists() { return Err(errors::CliError::Io { source: "Calibration library file does not exist".to_string(), - path: Some(path.to_string_lossy().to_string()), + path: Some(uri.clone()), }); } - info!("✓ Calibration library exists: {:?}", path); + info!("✓ Calibration library URI: {}", uri); } - // Validate all raw files exist - for dotd_file in &dotd_files { - if !dotd_file.exists() { + // Validate all raw inputs: local-path existence only. Remote URIs are + // resolved by tims_stage at staging time. + for raw_uri in &raw_inputs { + if !is_remote_uri(raw_uri) && !std::path::Path::new(raw_uri.as_str()).exists() { return Err(errors::CliError::Io { source: "Raw file does not exist".to_string(), - path: Some(dotd_file.to_string_lossy().to_string()), + path: Some(raw_uri.clone()), }); } } - info!("✓ All {} raw file(s) exist", dotd_files.len()); + info!("✓ All {} raw input(s) validated", raw_inputs.len()); - // Check if output directory exists and handle based on overwrite flag - if output_directory.exists() && !args.overwrite { - return Err(errors::CliError::Config { - source: format!( - "Output directory {:?} already exists. Use --overwrite/-O to overwrite.", - output_directory - ), - }); - } + // Local-output path checks: writability probe. Remote outputs skip this + // — the upload itself is the write test. + if !is_remote_uri(&output_uri) { + let output_dir_path = std::path::Path::new(&output_uri); - // Create output directory and test write permissions - match std::fs::create_dir_all(&output_directory) { - Ok(_) => { - if args.overwrite { - info!("✓ Using existing output directory (overwrite mode)"); - } else { - info!("✓ Created output directory"); + match std::fs::create_dir_all(output_dir_path) { + Ok(_) => { + info!("✓ Output directory ready"); } - } - Err(e) => { - return Err(errors::CliError::Io { - source: format!("Failed to create output directory: {}", e), - path: Some(output_directory.to_string_lossy().to_string()), - }); - } - }; + Err(e) => { + return Err(errors::CliError::Io { + source: format!("Failed to create output directory: {}", e), + path: Some(output_uri.clone()), + }); + } + }; - // Test write permissions by creating a test file - let test_file = output_directory.join(".write_test"); - match std::fs::write(&test_file, "test") { - Ok(_) => { - let _ = std::fs::remove_file(&test_file); - info!("✓ Output directory is writable"); - } - Err(e) => { - return Err(errors::CliError::Io { - source: format!("Output directory is not writable: {}", e), - path: Some(output_directory.to_string_lossy().to_string()), - }); + let test_file = output_dir_path.join(".write_test"); + match std::fs::write(&test_file, "test") { + Ok(_) => { + let _ = std::fs::remove_file(&test_file); + info!("✓ Output directory is writable"); + } + Err(e) => { + return Err(errors::CliError::Io { + source: format!("Output directory is not writable: {}", e), + path: Some(output_uri.clone()), + }); + } } } - // Validate per-file output subdirectories - for dotd_file in &dotd_files { - let file_stem = dotd_file - .file_stem() - .and_then(|s| s.to_str()) - .ok_or_else(|| errors::CliError::Io { + // Proactive overwrite check: probe every artifact this run will write + // (local or remote) and abort up-front if any exists and --overwrite + // isn't set. Fails fast before heavy analysis instead of after. + // + // IMPORTANT — keep the two artifact lists below in sync with the writer + // sites. If you add or rename an output file, update both here and the + // writer. Drift-aware writer sites (search for `ARTIFACT-LIST`): + // per-sample: + // - processing.rs — results.parquet, performance_report.json + // - main.rs overwrite-cleanup block (same two) + // run-level: + // - main.rs — run_report.json, config_used.json + // - OutputSink::finalize_run call site + if !args.overwrite { + let mut collisions: Vec = Vec::new(); + for raw_uri in &raw_inputs { + let sample = sample_name_from_uri(raw_uri).ok_or_else(|| errors::CliError::Io { source: "Unable to extract file stem".to_string(), - path: Some(dotd_file.to_string_lossy().to_string()), + path: Some(raw_uri.clone()), })?; - let file_output_dir = output_directory.join(file_stem); - - // Check if per-file directory exists when not in overwrite mode - if file_output_dir.exists() && !args.overwrite { + // ARTIFACT-LIST (per-sample) + for artifact in ["results.parquet", "performance_report.json"] { + let uri = join_output_uri(&output_uri, &format!("{sample}/{artifact}")); + if probe_uri_exists(&uri)? { + collisions.push(uri); + } + } + } + // ARTIFACT-LIST (run-level) + for artifact in ["run_report.json", "config_used.json"] { + let uri = join_output_uri(&output_uri, artifact); + if probe_uri_exists(&uri)? { + collisions.push(uri); + } + } + if !collisions.is_empty() { + let list = collisions + .iter() + .take(8) + .map(|s| format!(" - {s}")) + .collect::>() + .join("\n"); + let more = if collisions.len() > 8 { + format!("\n ... and {} more", collisions.len() - 8) + } else { + String::new() + }; return Err(errors::CliError::Config { source: format!( - "Output subdirectory {:?} already exists. Use --overwrite/-O to overwrite.", - file_output_dir + "{} output artifact(s) already exist; pass --overwrite/-O to replace them:\n{list}{more}", + collisions.len() ), }); } + info!("✓ No output collisions (checked local + remote artifacts)"); + } else { + info!("✓ --overwrite set; existing output artifacts will be replaced"); } - info!("✓ All output subdirectories validated"); info!("All validations passed! Starting processing..."); Ok(ValidatedInputs { - dotd_files, - speclib_path, - calib_lib_path, - output_directory, + raw_inputs, + speclib_uri, + calib_lib_uri, + output_uri, overwrite: args.overwrite, }) } -fn get_frag_range(file: &TimsTofPath) -> Result, errors::CliError> { - let reader = file - .load_frame_reader() - .map_err(|e| errors::CliError::DataReading { - source: format!("Failed to load frame reader: {:?}", e), - })?; - let dia_windows = reader - .dia_windows - .as_ref() - .ok_or_else(|| errors::CliError::DataReading { - source: "File does not contain DIA windows — is this a DIA run?".to_string(), - })?; +/// Join an artifact path onto a base output URI. For remote URIs this is a +/// plain `base/rel` concat (single trailing slash); for local paths it goes +/// through `PathBuf::join` so OS-specific separators are respected. +fn join_output_uri(base: &str, rel: &str) -> String { + if is_remote_uri(base) { + format!("{}/{}", base.trim_end_matches('/'), rel) + } else { + std::path::Path::new(base) + .join(rel) + .to_string_lossy() + .to_string() + } +} - let upper_mz = dia_windows - .iter() - .map(|w| { - w.isolation_mz - .iter() - .zip(w.isolation_width.iter()) - .map(|(imz, iw)| imz + (iw / 2.0)) - .fold(0.0, f64::max) - }) - .fold(0.0, f64::max); - - let lower_mz = dia_windows - .iter() - .map(|w| { - w.isolation_mz - .iter() - .zip(w.isolation_width.iter()) - .map(|(imz, iw)| imz - (iw / 2.0)) - .fold(f64::MAX, f64::min) - }) - .fold(f64::MAX, f64::min); - - TupleRange::try_new(lower_mz, upper_mz).map_err(|e| errors::CliError::DataReading { - source: format!("Invalid DIA m/z range: {:?}", e), +/// Cheap existence probe: local `Path::exists` or remote HEAD. +fn probe_uri_exists(uri: &str) -> Result { + tims_stage::uri_exists(uri).map_err(|e| errors::CliError::Io { + source: format!("existence probe {uri}: {e}"), + path: Some(uri.to_string()), }) } +/// Extract a sample name from a raw-input URI. Strips trailing slashes and +/// each of `.idx`, `.tar`, `.d` in turn so `sample.d.tar`, `sample.d/`, +/// `sample.d.idx/` all collapse to `sample`. Previously used short-circuit +/// `.or_else` which left `.d` in place on `.d.tar` inputs. +fn sample_name_from_uri(uri: &str) -> Option { + let trimmed = uri.trim_end_matches('/'); + let mut stem = trimmed.rsplit('/').next()?; + // Strip known suffixes in outer-to-inner order, looping so chained + // suffixes (e.g. `.d.tar`) collapse fully. Order matters: `.idx`/`.tar` + // come off before `.d` so they can't leave a bare `.d` behind. + loop { + let before = stem; + for ext in [".idx", ".tar", ".d"] { + if let Some(s) = stem.strip_suffix(ext) { + stem = s; + } + } + if stem == before { + break; + } + } + if stem.is_empty() { + None + } else { + Some(stem.to_string()) + } +} + +#[cfg(test)] +mod sample_name_tests { + use super::sample_name_from_uri; + #[test] + fn local_dotd_plain() { + assert_eq!(sample_name_from_uri("/data/run.d").as_deref(), Some("run")); + } + #[test] + fn local_dotd_trailing_slash() { + assert_eq!(sample_name_from_uri("/data/run.d/").as_deref(), Some("run")); + } + #[test] + fn s3_tar_collapses_both_suffixes() { + assert_eq!( + sample_name_from_uri("s3://bkt/run.d.tar").as_deref(), + Some("run") + ); + } + #[test] + fn s3_idx_directory() { + assert_eq!( + sample_name_from_uri("s3://bkt/run.d.idx/").as_deref(), + Some("run") + ); + } +} + +fn get_frag_range_from_index( + index: &IndexedTimstofPeaks, +) -> Result, errors::CliError> { + // `IndexedTimstofPeaks::fragmented_range` already folds over every + // ms2 window-group via `QuadrupoleIsolationScheme::fragmented_range`, + // which is defined on the ring-shape geometry (AABB/Trapezoid/Polygon). + // It panics only if there are no window groups — treat that as a + // non-DIA run and surface a readable error instead. + // + // NOTE: the task spec proposed reaching into raw `isolation_mz` / + // `isolation_width` Vec fields, but `QuadrupoleIsolationScheme` + // stores classified ring shapes (not the raw arrays). Using the + // existing public accessor is both correct and narrower. + let result = + std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| index.fragmented_range())); + match result { + Ok(r) => Ok(r), + Err(_) => Err(errors::CliError::DataReading { + source: "Index has no MS2 window groups — is this a DIA run?".to_string(), + }), + } +} + fn process_single_file( - dotd_file: &std::path::Path, + raw_uri: &str, + backend: &tims_stage::PerRunTempdir, + save_sidecar: bool, speclib: &timsseek::data_sources::speclib::Speclib, calib_lib: Option<&timsseek::data_sources::speclib::Speclib>, config: &Config, base_output_dir: &std::path::Path, overwrite: bool, max_qvalue: f32, + no_feature_stats: bool, ) -> std::result::Result { - info!("Processing file: {:?}", dotd_file); - - let timstofpath = - TimsTofPath::new(dotd_file.to_str().unwrap()).map_err(|e| errors::CliError::Io { - source: format!("Failed to open raw file: {:?}", e), - path: Some(dotd_file.to_string_lossy().to_string()), - })?; + let file_name = std::path::Path::new(raw_uri) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(raw_uri); + let _file_span = info_span!("file", name = file_name).entered(); + info!("Processing raw input: {}", raw_uri); let step = TimedStep::begin("Loading index"); - let index = load_index_auto( - dotd_file.to_str().ok_or_else(|| errors::CliError::Io { - source: "Invalid path encoding".to_string(), - path: None, - })?, - None, - )? - .into_eager()?; + let index = + load_index(raw_uri, backend, save_sidecar, CentroidingConfig::default()).map_err(|e| { + errors::CliError::Io { + source: format!("load_index({raw_uri}): {e}"), + path: Some(raw_uri.to_string()), + } + })?; let load_index_ms = step.finish().as_millis() as u64; alloc_track::snap!("Loading index"); @@ -285,7 +386,7 @@ fn process_single_file( let index = index.rebucket(new_bucket_size); step.finish(); - let fragmented_range = get_frag_range(&timstofpath)?; + let fragmented_range = get_frag_range_from_index(&index)?; let pipeline = Scorer { index, @@ -293,21 +394,19 @@ fn process_single_file( fragmented_range, }; - let file_stem = dotd_file - .file_stem() - .and_then(|s| s.to_str()) - .ok_or_else(|| errors::CliError::Io { - source: "Unable to extract file stem".to_string(), - path: Some(dotd_file.to_string_lossy().to_string()), - })?; - let file_output_dir = base_output_dir.join(file_stem); + let file_stem = sample_name_from_uri(raw_uri).ok_or_else(|| errors::CliError::Io { + source: "Unable to derive sample name from URI".to_string(), + path: Some(raw_uri.to_string()), + })?; + let file_output_dir = base_output_dir.join(&file_stem); std::fs::create_dir_all(&file_output_dir).map_err(|e| errors::CliError::Io { source: format!("Failed to create output subdirectory: {}", e), path: Some(file_output_dir.to_string_lossy().to_string()), })?; - // If overwrite mode, delete the specific files we're about to write + // If overwrite mode, delete the specific files we're about to write. + // ARTIFACT-LIST (per-sample): keep in sync with validate_inputs proactive check. if overwrite { let results_file = file_output_dir.join("results.parquet"); if results_file.exists() { @@ -327,7 +426,7 @@ fn process_single_file( } let file_output_config = OutputConfig { - directory: file_output_dir, + uri: file_output_dir.to_string_lossy().to_string(), }; let report = processing::run_pipeline( @@ -339,15 +438,329 @@ fn process_single_file( max_qvalue, load_index_ms, &config.calibration, + no_feature_stats, ) .map_err(|e| errors::CliError::DataReading { source: format!("{}", e), })?; - info!("Successfully processed {:?}", dotd_file); + info!("Successfully processed {}", raw_uri); Ok(report) } +/// Handles local-vs-remote output routing. When `dest_uri` is an s3:// / +/// gs:// / az:// URL, writes into a tempdir and uploads per-sample; when +/// local, writes directly. +struct OutputSink { + dest_uri: String, + working_dir: std::path::PathBuf, + remote: bool, + _tempdir: Option, +} + +impl OutputSink { + fn new(dest_uri: &str) -> Result { + if is_remote_uri(dest_uri) { + let td = tempfile::Builder::new() + .prefix("timsseek-output-") + .tempdir() + .map_err(|e| errors::CliError::Io { + source: format!("output tempdir: {e}"), + path: None, + })?; + let working_dir = td.path().to_path_buf(); + Ok(Self { + dest_uri: dest_uri.to_string(), + working_dir, + remote: true, + _tempdir: Some(td), + }) + } else { + std::fs::create_dir_all(dest_uri).map_err(|e| errors::CliError::Io { + source: format!("create output dir: {e}"), + path: Some(dest_uri.to_string()), + })?; + Ok(Self { + dest_uri: dest_uri.to_string(), + working_dir: std::path::PathBuf::from(dest_uri), + remote: false, + _tempdir: None, + }) + } + } + + fn sample_dir(&self, sample: &str) -> std::path::PathBuf { + self.working_dir.join(sample) + } + + fn root(&self) -> &std::path::Path { + &self.working_dir + } + + /// Final destination URI for a sample's output directory — the s3:// / + /// gs:// / local path where files *end up*, not the working tempdir. + /// Use this for user-facing output (log lines, reports) so users see the + /// real location instead of a transient tempdir that will be wiped. + fn dest_uri_for_sample(&self, sample: &str) -> String { + if self.remote { + format!("{}/{}", self.dest_uri.trim_end_matches('/'), sample) + } else { + self.sample_dir(sample).to_string_lossy().into_owned() + } + } + + /// Upload and remove a per-sample subdir after the sample has finished + /// writing; no-op for local destinations. + fn finalize_sample(&self, sample: &str) -> Result<(), errors::CliError> { + if !self.remote { + return Ok(()); + } + let local = self.sample_dir(sample); + let sample_dest = self.dest_uri_for_sample(sample); + for entry in std::fs::read_dir(&local).map_err(|e| errors::CliError::Io { + source: format!("read sample dir: {e}"), + path: Some(local.to_string_lossy().to_string()), + })? { + let entry = entry.map_err(|e| errors::CliError::Io { + source: format!("read dir entry: {e}"), + path: None, + })?; + let bn = entry.file_name().to_string_lossy().to_string(); + let dest = format!("{sample_dest}/{bn}"); + tims_stage::upload_file(&entry.path(), &dest).map_err(|e| errors::CliError::Io { + source: format!("upload {dest}: {e}"), + path: None, + })?; + } + std::fs::remove_dir_all(&local).map_err(|e| errors::CliError::Io { + source: format!("cleanup sample dir: {e}"), + path: Some(local.to_string_lossy().to_string()), + })?; + Ok(()) + } + + /// Upload named top-level files (run_report.json, config_used.json) + /// that exist in the working dir; no-op for local destinations. + fn finalize_run(&self, files: &[&str]) -> Result<(), errors::CliError> { + if !self.remote { + return Ok(()); + } + for bn in files { + let local = self.working_dir.join(bn); + if !local.exists() { + continue; + } + let dest = format!("{}/{}", self.dest_uri.trim_end_matches('/'), bn); + tims_stage::upload_file(&local, &dest).map_err(|e| errors::CliError::Io { + source: format!("upload {dest}: {e}"), + path: None, + })?; + } + Ok(()) + } +} + +/// Load a Speclib from a local path or remote URI. +/// +/// `Speclib::from_file` takes a `Path` (it sniffs format by extension), so +/// remote URIs are streamed to a tempfile via `tims_stage::download_to_file` +/// — preserving the original basename so extension-based format detection +/// still works. Streaming (not `open_reader`) is used because speclibs can +/// be multi-GB parquet files. +fn speclib_from_uri( + uri: &str, + decoy_strategy: timsseek::DecoyStrategy, +) -> Result< + ( + timsseek::data_sources::speclib::Speclib, + Option, + ), + errors::CliError, +> { + if !is_remote_uri(uri) { + let path = std::path::Path::new(uri); + let lib = timsseek::data_sources::speclib::Speclib::from_file(path, decoy_strategy) + .map_err(|e| errors::CliError::Config { + source: format!("Failed to load speclib {uri}: {:?}", e), + })?; + return Ok((lib, None)); + } + + let trimmed = uri.trim_end_matches('/'); + let fname = trimmed + .rsplit('/') + .next() + .filter(|s| !s.is_empty()) + .unwrap_or("speclib.bin"); + let td = tempfile::Builder::new() + .prefix("timsseek-speclib-") + .tempdir() + .map_err(|e| errors::CliError::Io { + source: format!("speclib tempdir: {e}"), + path: None, + })?; + let local = td.path().join(fname); + tims_stage::download_to_file(uri, &local).map_err(|e| errors::CliError::Io { + source: format!("download speclib {uri}: {e}"), + path: Some(uri.to_string()), + })?; + let lib = timsseek::data_sources::speclib::Speclib::from_file(&local, decoy_strategy).map_err( + |e| errors::CliError::Config { + source: format!("Failed to load speclib {uri}: {:?}", e), + }, + )?; + Ok((lib, Some(td))) +} + +/// Handle returned by [`init_tracing`]. Holds resources whose lifetime must +/// span the entire run — notably the `instrumentation`-feature flush guard, +/// which flushes aggregated spans on drop. Callers bind with a `_`-prefixed +/// name and drop at end of scope. +/// +/// The guard is type-erased via `Box` so we don't have to name the +/// `tracing_profile` guard type from outside the feature-gated section. +/// Dropping the box runs the guard's `Drop` impl. +struct TracingHandle { + #[cfg(feature = "instrumentation")] + _tree_guard: Box, + #[cfg(not(feature = "instrumentation"))] + _empty: (), +} + +/// Install the tracing subscriber. Returns a handle whose lifetime keeps +/// the flush guard alive for the whole run. +/// +/// Resolution order for the log file: +/// 1. `--log-path -` → stderr-only, no file +/// 2. `--log-path

` → that path verbatim +/// 3. default, local output → `/timsseek-.log` +/// 4. default, no/remote output → `./timsseek-.log` in CWD +/// +/// The timestamp suffix (`YYYYMMDDTHHMMSS`, local time) avoids clobbering +/// previous runs that share the same directory. Tracing spans/logs +/// always go to a file unless `--log-path -` explicitly opts in to +/// stderr — matches the "no cli args, never tracing logs on terminal" +/// contract. +fn init_tracing(args: &Cli, config: &Config) -> TracingHandle { + let log_file_path: Option = match args.log_path { + Some(ref p) if p.to_str() == Some("-") => None, + Some(ref p) => Some(p.clone()), + None => { + let base: std::path::PathBuf = args + .output_uri + .as_ref() + .or(config.output.as_ref().map(|o| &o.uri)) + .filter(|d| !is_remote_uri(d.as_str())) + .map(std::path::PathBuf::from) + .unwrap_or_else(|| { + std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from(".")) + }); + let ts = chrono::Local::now().format("%Y%m%dT%H%M%S"); + Some(base.join(format!("timsseek-{ts}.log"))) + } + }; + + let build_env_filter = || { + EnvFilter::builder() + .with_default_directive( + args.log_level + .parse() + .unwrap_or_else(|_| "info".parse().unwrap()), + ) + .from_env_lossy() + .add_directive("forust_ml=warn".parse().unwrap()) + .add_directive("timscentroid::storage=warn".parse().unwrap()) + }; + + let (file_layer, stderr_warn_layer, stderr_all_layer) = + if let Some(ref log_path) = log_file_path { + if let Some(parent) = log_path.parent() { + let _ = std::fs::create_dir_all(parent); + } + let log_file = std::fs::File::create(log_path).expect("Failed to create log file"); + let fl = fmt::layer() + .with_writer(std::sync::Mutex::new(log_file)) + .with_filter(build_env_filter()); + let sl = fmt::layer() + .with_writer(std::io::stderr) + .without_time() + .with_filter(tracing_subscriber::filter::LevelFilter::WARN); + (Some(fl), Some(sl), None) + } else { + let sl = fmt::layer() + .with_writer(std::io::stderr) + .with_filter(build_env_filter()); + (None, None, Some(sl)) + }; + + let spans_layer = log_file_path.as_ref().map(|log_path| { + let mut spans_path = log_path.clone(); + let fname = spans_path + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or("timsseek.log") + .to_string(); + spans_path.set_file_name(format!("{fname}.spans.jsonl")); + let spans_file = std::fs::File::create(&spans_path).expect("Failed to create spans file"); + fmt::layer() + .json() + .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE) + .with_writer(std::sync::Mutex::new(spans_file)) + .with_filter(build_env_filter()) + }); + + #[cfg(feature = "instrumentation")] + let perf_filter = EnvFilter::builder() + .with_default_directive("trace".parse().unwrap()) + .with_env_var("RUST_PERF_LOG") + .from_env_lossy() + .add_directive("forust_ml::gradientbooster=warn".parse().unwrap()); + + #[cfg(feature = "instrumentation")] + let events_filter = tracing_subscriber::filter::filter_fn(|metadata| !metadata.is_event()); + + #[cfg(feature = "instrumentation")] + let (tree_layer, _tree_guard) = PrintTreeLayer::new(PrintTreeConfig { + attention_above_percent: 25.0, + relevant_above_percent: 2.5, + hide_below_percent: 0.0, + display_unaccounted: true, + no_color: false, + accumulate_spans_count: false, + accumulate_events: false, + aggregate_similar_siblings: true, + }); + #[cfg(feature = "instrumentation")] + let tree_layer = tree_layer + .with_filter(perf_filter) + .with_filter(events_filter); + + let reg = tracing_subscriber::registry() + .with(file_layer) + .with(spans_layer) + .with(stderr_warn_layer) + .with(stderr_all_layer); + + #[cfg(feature = "instrumentation")] + let reg = reg.with(tree_layer); + + reg.init(); + + println!("timsseek v{}", env!("CARGO_PKG_VERSION")); + match log_file_path { + Some(ref log_path) => println!("Log: {}", log_path.display()), + None => println!("Log: (--log-path -)"), + } + println!(); + + TracingHandle { + #[cfg(feature = "instrumentation")] + _tree_guard: Box::new(_tree_guard), + #[cfg(not(feature = "instrumentation"))] + _empty: (), + } +} + fn main() { if let Err(e) = run() { eprintln!("{e}"); @@ -408,22 +821,22 @@ fn run() -> std::result::Result<(), errors::CliError> { }; // Override config with command line arguments if provided - if !args.dotd_files.is_empty() { - config.analysis.dotd_files = Some(args.dotd_files.clone()); + if !args.raw_inputs.is_empty() { + config.analysis.raw_inputs = Some(args.raw_inputs.clone()); } - if let Some(ref speclib_file) = args.speclib_file { + if let Some(ref speclib_uri) = args.speclib_uri { config.input = Some(InputConfig::Speclib { - path: speclib_file.clone(), + uri: speclib_uri.clone(), }); } if config.input.is_none() { return Err(errors::CliError::Config { - source: "No input provided, please provide one in either the config file or with the --speclib-file flag".to_string(), + source: "No input provided, please provide one in either the config file or with the --speclib-uri flag".to_string(), }); } - if let Some(ref output_dir) = args.output_dir { + if let Some(ref output_uri) = args.output_uri { config.output = Some(OutputConfig { - directory: output_dir.clone(), + uri: output_uri.clone(), }); } @@ -432,107 +845,42 @@ fn run() -> std::result::Result<(), errors::CliError> { config.analysis.decoy_strategy = strategy; } - // === Set up tracing subscriber === - // We defer this until after config/validation so we know the output directory for the log file. - - // Determine log file path - let log_file_path = match args.log_path { - Some(ref p) if p.to_str() == Some("-") => None, // stderr-only mode - Some(ref p) => Some(p.clone()), - None => args - .output_dir - .as_ref() - .or(config.output.as_ref().map(|o| &o.directory)) - .map(|d| d.join("timsseek.log")), - }; - - // Build the env filter for the main logging layer - let env_filter = EnvFilter::builder() - .with_default_directive( - args.log_level - .parse() - .unwrap_or_else(|_| "info".parse().unwrap()), - ) - .from_env_lossy() - .add_directive("forust_ml=warn".parse().unwrap()) - .add_directive("timscentroid::storage=warn".parse().unwrap()); - - // Use Option layers so we can build a single subscriber type regardless - // of whether we're writing to a log file or to stderr. - let (file_layer, stderr_warn_layer, stderr_all_layer) = - if let Some(ref log_path) = log_file_path { - // File mode: log file gets env_filter, stderr gets WARN+ only - if let Some(parent) = log_path.parent() { - let _ = std::fs::create_dir_all(parent); - } - let log_file = std::fs::File::create(log_path).expect("Failed to create log file"); - let fl = fmt::layer() - .with_writer(std::sync::Mutex::new(log_file)) - .with_filter(env_filter); - let sl = fmt::layer() - .with_writer(std::io::stderr) - .without_time() - .with_filter(tracing_subscriber::filter::LevelFilter::WARN); - (Some(fl), Some(sl), None) - } else { - // stderr-only mode (--log-path -) - let sl = fmt::layer() - .with_writer(std::io::stderr) - .with_filter(env_filter); - (None, None, Some(sl)) - }; - - #[cfg(feature = "instrumentation")] - let perf_filter = EnvFilter::builder() - .with_default_directive("trace".parse().unwrap()) - .with_env_var("RUST_PERF_LOG") - .from_env_lossy() - .add_directive("forust_ml::gradientbooster=warn".parse().unwrap()); - - #[cfg(feature = "instrumentation")] - let events_filter = tracing_subscriber::filter::filter_fn(|metadata| !metadata.is_event()); - - #[cfg(feature = "instrumentation")] - let (tree_layer, _guard) = PrintTreeLayer::new(PrintTreeConfig { - attention_above_percent: 25.0, - relevant_above_percent: 2.5, - hide_below_percent: 0.0, - display_unaccounted: true, - no_color: false, - accumulate_spans_count: false, - accumulate_events: false, - aggregate_similar_siblings: true, - }); - #[cfg(feature = "instrumentation")] - let tree_layer = tree_layer - .with_filter(perf_filter) - .with_filter(events_filter); - - let reg = tracing_subscriber::registry() - .with(file_layer) - .with(stderr_warn_layer) - .with(stderr_all_layer); - - #[cfg(feature = "instrumentation")] - let reg = reg.with(tree_layer); - - reg.init(); - - // Print version and log path to stdout - if let Some(ref log_path) = log_file_path { - println!("timsseek v{}", env!("CARGO_PKG_VERSION")); - println!("Log: {}", log_path.display()); - println!(); - } + // Install tracing subscriber. The returned handle carries the log file + // path (if any) plus — under the `instrumentation` feature — the + // PrintTreeLayer flush guard that must outlive every traced span. Hold + // it in `run()`'s scope so drop order flushes after all work completes. + let _tracing = init_tracing(&args, &config); info!("Parsed configuration: {:#?}", config.clone()); alloc_track::snap!("start"); let validated = validate_inputs(&config, &args)?; - let config_output_path = validated.output_directory.join("config_used.json"); + // Build staging backend once from [staging] config (falls back to + // defaults when absent). The sweep runs in `PerRunTempdir::new`. + let staging_cfg = config.staging.clone().unwrap_or_default(); + let save_sidecar_flag = staging_cfg.save_sidecar; + let backend = tims_stage::PerRunTempdir::new(tims_stage::StagingConfig { + tempdir_root: staging_cfg.tempdir_root.clone(), + max_prefix_keys: staging_cfg.max_prefix_keys, + stale_sweep_age_hours: staging_cfg.stale_sweep_age_hours, + }) + .map_err(|e| errors::CliError::Io { + source: format!("staging backend: {e}"), + path: None, + })?; + + // Route outputs through a local tempdir when the destination is remote + // (s3://, gs://, az://). Per-sample subdirs are uploaded + removed + // after each sample finishes; run-level files are uploaded after the + // batch. + let sink = OutputSink::new(&validated.output_uri)?; - // If overwrite mode, delete existing config file + // ARTIFACT-LIST (run-level): keep in sync with validate_inputs proactive check. + let config_output_path = sink.root().join("config_used.json"); + + // If overwrite mode, delete existing config file (local-only; remote + // will simply overwrite on upload). if validated.overwrite && config_output_path.exists() { std::fs::remove_file(&config_output_path).map_err(|e| errors::CliError::Io { source: format!("Failed to remove existing config file: {}", e), @@ -551,50 +899,39 @@ fn run() -> std::result::Result<(), errors::CliError> { info!("Wrote final configuration to {:?}", config_output_path); let mut run_report = timsseek::scoring::RunReport::default(); - let mut failed_files: Vec<(std::path::PathBuf, errors::CliError)> = Vec::new(); - let mut successful_files: Vec = Vec::new(); + let mut failed_files: Vec<(String, errors::CliError)> = Vec::new(); + let mut successful_files: Vec = Vec::new(); // Load speclib once (shared across all files) let step = TimedStep::begin("Loading speclib"); info!( - "Building database from speclib file {:?}", - validated.speclib_path + "Building database from speclib URI {}", + validated.speclib_uri ); info!( "Decoy generation strategy: {}", config.analysis.decoy_strategy ); - let speclib = timsseek::data_sources::speclib::Speclib::from_file( - &validated.speclib_path, - config.analysis.decoy_strategy, - ) - .map_err(|e| errors::CliError::Config { - source: format!("Failed to load speclib: {:?}", e), - })?; + let (speclib, _speclib_td) = + speclib_from_uri(&validated.speclib_uri, config.analysis.decoy_strategy)?; let load_speclib_ms = step .finish_with(format_args!("{} entries", speclib.len())) .as_millis() as u64; alloc_track::snap!("Loading speclib"); // Load calibration library once (if provided) - let (calib_lib, load_calib_lib_ms) = match &validated.calib_lib_path { - Some(p) => { + let (calib_lib, _calib_td, load_calib_lib_ms) = match &validated.calib_lib_uri { + Some(uri) => { let step = TimedStep::begin("Loading calib lib"); - info!("Loading calibration library from {:?}", p); - let lib = timsseek::data_sources::speclib::Speclib::from_file( - p, - config.analysis.decoy_strategy, - ) - .map_err(|e| errors::CliError::Config { - source: format!("Failed to load calib lib: {:?}", e), - })?; + info!("Loading calibration library from {}", uri); + let (lib, td) = speclib_from_uri(uri, config.analysis.decoy_strategy)?; let ms = step .finish_with(format_args!("{} entries", lib.len())) .as_millis() as u64; alloc_track::snap!("Loading calib lib"); - (Some(lib), ms) + (Some(lib), td, ms) } - None => (None, 0), + None => (None, None, 0), }; run_report.load_speclib_ms = load_speclib_ms; @@ -602,59 +939,145 @@ fn run() -> std::result::Result<(), errors::CliError> { run_report.speclib_entries = speclib.len(); run_report.calib_lib_entries = calib_lib.as_ref().map_or(0, |l| l.len()); - let total_files = validated.dotd_files.len(); - info!("Processing {} raw file(s)", total_files); + let total_files = validated.raw_inputs.len(); + info!("Processing {} raw input(s)", total_files); - for (idx, dotd_file) in validated.dotd_files.iter().enumerate() { + for (idx, raw_uri) in validated.raw_inputs.iter().enumerate() { info!( - "Processing file {} of {}: {:?}", + "Processing input {} of {}: {}", idx + 1, total_files, - dotd_file + raw_uri ); + let sample_name = match sample_name_from_uri(raw_uri) { + Some(s) => s, + None => { + let e = errors::CliError::Io { + source: "Unable to derive sample name from URI".to_string(), + path: Some(raw_uri.clone()), + }; + error!("Failed to process {}: {}", raw_uri, e); + failed_files.push((raw_uri.clone(), e)); + continue; + } + }; + + // Per-file wall clock, printed as a footer so user sees total time per + // input even when several are batched. Intermediate phase output from + // `processing::run_pipeline` lands between the header and footer. + println!("=== [{}/{}] {} ===", idx + 1, total_files, sample_name); + let file_start = std::time::Instant::now(); + let sample_dest = sink.dest_uri_for_sample(&sample_name); + match process_single_file( - dotd_file, + raw_uri, + &backend, + save_sidecar_flag, &speclib, calib_lib.as_ref(), &config, - &validated.output_directory, + sink.root(), validated.overwrite, args.max_qvalue, + args.no_feature_stats, ) { Ok(report) => { - successful_files.push(dotd_file.clone()); + if let Err(e) = sink.finalize_sample(&sample_name) { + error!("Failed to finalize sample {}: {}", sample_name, e); + println!( + "=== [{}/{}] {} failed upload after {:?} ===", + idx + 1, + total_files, + sample_name, + file_start.elapsed() + ); + run_report.status = timsseek::scoring::timings::RunStatus::Aborted; + run_report.abort_reason = + Some(format!("upload failure on sample {sample_name}: {e}")); + failed_files.push((raw_uri.clone(), e)); + error!("Aborting batch due to upload failure"); + break; + } + println!("Output: {sample_dest}"); + println!( + "=== [{}/{}] {} done in {:?} ===", + idx + 1, + total_files, + sample_name, + file_start.elapsed() + ); + successful_files.push(raw_uri.clone()); + let mut outputs = vec![format!("{sample_dest}/results.parquet")]; + if !args.no_feature_stats { + outputs.push(format!( + "{sample_dest}/{}", + processing::FEATURE_STATS_FILENAME + )); + outputs.push(format!( + "{sample_dest}/{}", + processing::FEATURE_IMPORTANCE_FILENAME + )); + } run_report.files.push(timsseek::scoring::FileReport { - file_name: dotd_file.to_string_lossy().to_string(), + file_name: raw_uri.clone(), pipeline: report, + outputs, }); } Err(e) => { - error!("Failed to process {:?}: {}", dotd_file, e); + error!("Failed to process {}: {}", raw_uri, e); + println!( + "=== [{}/{}] {} FAILED after {:?}: {} ===", + idx + 1, + total_files, + sample_name, + file_start.elapsed(), + e + ); // I/O errors are likely systemic (disk full, permissions) — // abort the batch instead of failing every remaining file. if matches!(e, errors::CliError::Io { .. }) { - failed_files.push((dotd_file.clone(), e)); + run_report.status = timsseek::scoring::timings::RunStatus::Aborted; + run_report.abort_reason = Some(format!("I/O error on {raw_uri}: {e}")); + failed_files.push((raw_uri.clone(), e)); error!("Aborting batch due to I/O error"); break; } - failed_files.push((dotd_file.clone(), e)); + failed_files.push((raw_uri.clone(), e)); } } } - // Write run-level report - let run_report_path = validated.output_directory.join("run_report.json"); + // Write run-level report into the sink's working dir. + // ARTIFACT-LIST (run-level): keep in sync with validate_inputs proactive check. + let finalize_step = TimedStep::begin("Finalize run"); + // Populate top-level artifact list with final destination URIs before + // serialization so the report self-describes where to fetch everything. + let dest_root = sink + .dest_uri_for_sample("") + .trim_end_matches('/') + .to_string(); + run_report.artifacts = vec![ + format!("{dest_root}/run_report.json"), + format!("{dest_root}/config_used.json"), + ]; + let run_report_path = sink.root().join("run_report.json"); if let Ok(json) = serde_json::to_string_pretty(&run_report) { let _ = std::fs::write(&run_report_path, json); info!("Wrote run report to {:?}", run_report_path); } + // Upload run-level artifacts for remote destinations (no-op locally). + // ARTIFACT-LIST (run-level): keep in sync with validate_inputs proactive check. + sink.finalize_run(&["run_report.json", "config_used.json"])?; + finalize_step.finish(); + info!("Successfully processed {} file(s)", successful_files.len()); if !failed_files.is_empty() { error!("Failed to process {} file(s):", failed_files.len()); for (file, err) in &failed_files { - error!(" {:?}: {}", file, err); + error!(" {}: {}", file, err); } return Err(errors::CliError::Config { source: format!("Failed to process {} file(s)", failed_files.len()), diff --git a/rust/timsseek_cli/src/processing.rs b/rust/timsseek_cli/src/processing.rs index 6969e483..47bcda32 100644 --- a/rust/timsseek_cli/src/processing.rs +++ b/rust/timsseek_cli/src/processing.rs @@ -1,9 +1,9 @@ use super::config::OutputConfig; use indicatif::{ ProgressBar, + ProgressFinish, ProgressIterator, ProgressStyle, - ProgressFinish, }; use std::io::IsTerminal; use timsquery::models::tolerance::{ @@ -21,7 +21,10 @@ use timsquery::{ use timsseek::data_sources::speclib::Speclib; use timsseek::errors::TimsSeekError; use timsseek::ml::qvalues::report_qvalues_at_thresholds; -use timsseek::ml::rescore; +use timsseek::ml::{ + RescoreFeatureStats, + rescore, +}; use timsseek::rt_calibration::{ CalibRtError, CalibratedGrid, @@ -68,7 +71,9 @@ fn make_progress_bar(len: u64, label: &str) -> ProgressBar { label )) .unwrap(); - ProgressBar::new(len).with_style(style).with_finish(ProgressFinish::AndLeave) + ProgressBar::new(len) + .with_style(style) + .with_finish(ProgressFinish::AndLeave) } /// Check that two speclibs are on a compatible RT scale. @@ -128,6 +133,65 @@ fn check_rt_scale_compatibility(main_lib: &Speclib, calib_lib: &Speclib) { } } +pub const FEATURE_STATS_FILENAME: &str = "results.feature_stats.tsv"; +pub const FEATURE_IMPORTANCE_FILENAME: &str = "results.feature_importance.tsv"; + +fn write_tsv( + path: &std::path::Path, + header: &str, + mut write_rows: impl FnMut(&mut String), + label: &str, +) -> std::io::Result<()> { + use std::fmt::Write as _; + let mut buf = String::new(); + writeln!(buf, "{header}").unwrap(); + write_rows(&mut buf); + std::fs::write(path, buf)?; + eprintln!("wrote {label}: {}", path.display()); + tracing::info!(path = %path.display(), "wrote {} tsv", label); + Ok(()) +} + +fn write_feature_stats_sidecar( + stats: &RescoreFeatureStats, + parquet_path: &std::path::Path, +) -> std::io::Result<()> { + use std::fmt::Write as _; + + write_tsv( + &parquet_path.with_file_name(FEATURE_STATS_FILENAME), + "name\tmean\tmissing\tfold", + |buf| { + for fold in stats.iter() { + for fs in fold.feature_stats.iter() { + writeln!( + buf, + "{}\t{}\t{}\t{}", + fs.name, fs.mean, fs.nan_ratio, fold.fold + ) + .unwrap(); + } + } + }, + "feature stats", + )?; + + write_tsv( + &parquet_path.with_file_name(FEATURE_IMPORTANCE_FILENAME), + "name\tgain\tfold", + |buf| { + for fold in stats.iter() { + for (name, gain) in fold.feature_importance.iter() { + writeln!(buf, "{}\t{}\t{}", name, gain, fold.fold).unwrap(); + } + } + }, + "feature importance", + )?; + + Ok(()) +} + #[cfg_attr( feature = "instrumentation", tracing::instrument(skip_all, level = "trace") @@ -140,6 +204,7 @@ pub fn execute_pipeline( out_path: &OutputConfig, max_qvalue: f32, calib_config: &CalibrationConfig, + no_feature_stats: bool, ) -> std::result::Result { // === PHASE 1: Broad prescore -> collect top calibrants === // Use calibration library if provided, otherwise fall back to main speclib @@ -268,7 +333,7 @@ pub fn execute_pipeline( let (rt_lo_ms, rt_hi_ms) = pipeline.index.ms1_cycle_mapping().range_milis(); let rt_lo = rt_lo_ms as f64 / 1000.0; let rt_hi = rt_hi_ms as f64 / 1000.0; - let cal_json_path = out_path.directory.join("calibration.json"); + let cal_json_path = std::path::Path::new(&out_path.uri).join("calibration.json"); if let Err(e) = calibration.save_json( &cal_points_tuples, [rt_lo, rt_hi], @@ -319,7 +384,7 @@ pub fn execute_pipeline( // === PHASE 5: Rescore === let step = TimedStep::begin("Phase 5: Rescore"); - let data = rescore(competed); + let (data, feature_stats) = rescore(competed); let phase5_ms = step.finish().as_millis() as u64; alloc_track::snap!("Phase 5: Rescore"); @@ -344,14 +409,17 @@ pub fn execute_pipeline( // === PHASE 6: Write Parquet output === let step = TimedStep::begin("Phase 6: Write output"); - let out_path_pq = out_path.directory.join("results.parquet"); - let mut pq_writer = - timsseek::scoring::parquet_writer::ResultParquetWriter::new(&out_path_pq, 20_000).map_err( - |e| TimsSeekError::Io { - path: out_path_pq.clone().into(), - source: e, - }, - )?; + // ARTIFACT-LIST (per-sample): keep in sync with validate_inputs in main.rs. + let out_path_pq = std::path::Path::new(&out_path.uri).join("results.parquet"); + let mut pq_writer = timsseek::scoring::parquet_writer::ResultParquetWriter::new( + &out_path_pq, + 20_000, + speclib.meta.parsable_sequences, + ) + .map_err(|e| TimsSeekError::Io { + path: out_path_pq.clone().into(), + source: e, + })?; for res in data.into_iter() { if res.qvalue <= max_qvalue { pq_writer.add(res).map_err(|e| TimsSeekError::Io { @@ -368,10 +436,18 @@ pub fn execute_pipeline( alloc_track::snap!("Phase 6: Write output"); info!("Wrote final results to {:?}", out_path_pq); - // Key result to stdout + if !no_feature_stats { + if let Err(e) = write_feature_stats_sidecar(&feature_stats, &out_path_pq) { + // Non-fatal: log and continue. + tracing::warn!("Failed to write feature_stats sidecar: {}", e); + } + } + + // Key result to stdout. The final output URI is printed by main.rs + // per-file footer — out_path_pq here is the local working path (which + // is a tempdir for remote destinations), not the eventual location. println!(); println!("{} targets at 1% FDR", targets_at_1pct_qval); - println!("Output: {}", out_path_pq.display()); Ok(PipelineReport { load_index_ms: 0, // set by caller after return @@ -762,7 +838,7 @@ fn target_decoy_compete(mut results: Vec) -> Vec) -> Vec) -> Vec std::result::Result { - let performance_report_path = output.directory.join("performance_report.json"); + // ARTIFACT-LIST (per-sample): keep in sync with validate_inputs in main.rs. + let performance_report_path = std::path::Path::new(&output.uri).join("performance_report.json"); let mut timings = execute_pipeline( speclib, @@ -927,6 +1005,7 @@ pub fn run_pipeline( output, max_qvalue, calib_config, + no_feature_stats, )?; timings.load_index_ms = load_index_ms; // Write per-file report diff --git a/scripts/plot_spans.py b/scripts/plot_spans.py new file mode 100644 index 00000000..49a7cb11 --- /dev/null +++ b/scripts/plot_spans.py @@ -0,0 +1,252 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "matplotlib", +# ] +# /// +"""Gantt plot of tracing spans from a timsseek `.spans.jsonl` file. + +Produced by the JSON `fmt` layer in `init_tracing` with +`FmtSpan::NEW | FmtSpan::CLOSE`. Each CLOSE event carries `span.name`, +fields, and `time.busy` / `time.idle`. We match NEW↔CLOSE by +(thread, span name, nearest unfinished open) and draw one bar per span, +colored by span name, grouped by thread. + +Usage: + uv run scripts/plot_spans.py [-o out.png] [--wandb run-name] +""" + +from __future__ import annotations + +import argparse +import json +import re +from collections import defaultdict +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path + +import matplotlib.pyplot as plt + + +@dataclass +class Span: + name: str + label: str + thread: str + start: float # seconds since run start + end: float + busy_s: float + idle_s: float + depth: int + + +_DUR_RE = re.compile(r"^\s*([\d.]+)\s*(ns|us|µs|ms|s|m)\s*$") + + +def parse_duration(s: str) -> float: + """Parse tracing duration strings like '1.23ms', '45s', '2m' → seconds.""" + m = _DUR_RE.match(s) + if not m: + return 0.0 + val, unit = float(m.group(1)), m.group(2) + return { + "ns": 1e-9, + "us": 1e-6, + "µs": 1e-6, + "ms": 1e-3, + "s": 1.0, + "m": 60.0, + }[unit] * val + + +def parse_ts(s: str) -> float: + # tracing JSON layer emits RFC3339 UTC with `Z` + return datetime.fromisoformat(s.replace("Z", "+00:00")).timestamp() + + +def load_spans(path: Path) -> list[Span]: + opens: dict[tuple[str, str], list[tuple[float, int]]] = defaultdict(list) + depth_by_thread: dict[str, int] = defaultdict(int) + closed: list[Span] = [] + t0: float | None = None + + for line in path.read_text().splitlines(): + line = line.strip() + if not line: + continue + try: + ev = json.loads(line) + except json.JSONDecodeError: + continue + + msg = ev.get("fields", {}).get("message", "") + span = ev.get("span") or {} + name = span.get("name") or ev.get("target", "") + if not name: + continue + thread = ev.get("threadName") or ev.get("thread_name") or "main" + ts = parse_ts(ev["timestamp"]) + if t0 is None: + t0 = ts + + key = (thread, name) + if msg == "new": + depth = depth_by_thread[thread] + depth_by_thread[thread] += 1 + opens[key].append((ts, depth)) + elif msg == "close": + if not opens[key]: + continue + start, depth = opens[key].pop() + depth_by_thread[thread] = max(0, depth_by_thread[thread] - 1) + fields = ev.get("fields", {}) + busy = parse_duration(str(fields.get("time.busy", "0ns"))) + idle = parse_duration(str(fields.get("time.idle", "0ns"))) + label_fields = {k: v for k, v in span.items() if k != "name"} + # `step{label=...}` and `file{name=...}` are the dominant spans + # — show only the field value, not the wrapper `step[label=x]`. + if name == "step" and "label" in label_fields: + label = str(label_fields["label"]) + elif name == "file" and "name" in label_fields: + label = str(label_fields["name"]) + elif label_fields: + label = ( + f"{name}[" + + ",".join(f"{k}={v}" for k, v in label_fields.items()) + + "]" + ) + else: + label = name + closed.append( + Span( + name=name, + label=label, + thread=thread, + start=start - t0, + end=ts - t0, + busy_s=busy, + idle_s=idle, + depth=depth, + ) + ) + return closed + + +def plot(spans: list[Span], out: Path, min_ms: float) -> None: + if not spans: + raise SystemExit("no spans parsed") + + total_end = max(s.end for s in spans) + kept = [s for s in spans if (s.end - s.start) * 1000.0 >= min_ms] + if not kept: + raise SystemExit(f"no spans >= {min_ms}ms; lower --min-ms") + + # One lane per (thread, depth) — avoids vertical squish when depth is high. + lane_keys = sorted({(s.thread, s.depth) for s in kept}) + lane_idx = {k: i for i, k in enumerate(lane_keys)} + + # Color by first-seen order for stable legend. + names: list[str] = [] + seen: set[str] = set() + for s in sorted(kept, key=lambda x: x.start): + if s.name not in seen: + names.append(s.name) + seen.add(s.name) + cmap = plt.get_cmap("tab20") + color_for = {n: cmap(i % 20) for i, n in enumerate(names)} + + # Height scales with lanes so no squish. Extra headroom for title + stats. + fig_h = max(4.5, 0.45 * len(lane_keys) + 2.5) + fig, ax = plt.subplots(figsize=(16, fig_h)) + + bar_h = 0.78 + for s in kept: + y = lane_idx[(s.thread, s.depth)] + width = max(s.end - s.start, 1e-4) + ax.barh( + y, + width, + left=s.start, + height=bar_h, + color=color_for[s.name], + edgecolor="black", + linewidth=0.4, + alpha=0.85, + ) + if width / total_end > 0.025: + busy_pct = 100.0 * s.busy_s / max(width, 1e-9) + ax.text( + s.start + width / 2, + y, + f"{s.label}\n{width:.2f}s · busy {busy_pct:.0f}%", + ha="center", + va="center", + fontsize=7, + clip_on=True, + ) + + ax.set_yticks(range(len(lane_keys))) + ax.set_yticklabels([f"{t} · d{d}" for t, d in lane_keys]) + ax.set_xlabel("time since run start (s)") + ax.set_ylabel("thread · depth") + ax.set_title( + f"tracing spans · {len(kept)}/{len(spans)} shown " + f"(>= {min_ms:g}ms) · {total_end:.1f}s total" + ) + ax.set_xlim(left=-0.02 * total_end, right=total_end * 1.02) + ax.invert_yaxis() + ax.grid(axis="x", linestyle=":", alpha=0.4) + + # Per-name totals panel as stats footer. + name_totals: dict[str, float] = defaultdict(float) + name_counts: dict[str, int] = defaultdict(int) + for s in kept: + name_totals[s.name] += s.end - s.start + name_counts[s.name] += 1 + top = sorted(name_totals.items(), key=lambda kv: -kv[1])[:8] + stats = " · ".join(f"{n}:{t:.1f}s (x{name_counts[n]})" for n, t in top) + fig.text(0.01, 0.01, stats, fontsize=7, color="#444") + + handles = [plt.Rectangle((0, 0), 1, 1, color=color_for[n], label=n) for n in names] + ax.legend( + handles=handles, + loc="upper left", + bbox_to_anchor=(1.01, 1.0), + fontsize=8, + frameon=False, + ) + + fig.tight_layout() + fig.savefig(out, dpi=140, bbox_inches="tight") + print(f"wrote {out}") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("spans_jsonl", type=Path) + ap.add_argument("-o", "--out", type=Path, default=None) + ap.add_argument( + "--min-ms", + type=float, + default=50.0, + help="hide spans shorter than this (default 50ms) — filters out bucket-sort noise", + ) + ap.add_argument("--wandb", metavar="RUN_NAME", default=None) + ap.add_argument("--wandb-project", default="timsseek-spans") + args = ap.parse_args() + + out = args.out or args.spans_jsonl.with_suffix(".gantt.png") + spans = load_spans(args.spans_jsonl) + plot(spans, out, min_ms=args.min_ms) + + if args.wandb: + import wandb # noqa: PLC0415 + + run = wandb.init(project=args.wandb_project, name=args.wandb, reinit=True) # ty: ignore[unresolved-attribute] + run.log({"spans_gantt": wandb.Image(str(out))}) # ty: ignore[unresolved-attribute] + run.finish() + + +if __name__ == "__main__": + main() diff --git a/uv.lock b/uv.lock index 8bb8738c..3325b063 100644 --- a/uv.lock +++ b/uv.lock @@ -682,7 +682,7 @@ wheels = [ [[package]] name = "timsseek-workspace" -version = "0.30.0" +version = "0.31.0" source = { virtual = "." } [package.dev-dependencies]