diff --git a/.github/workflows/cleanup_pr_tags.yml b/.github/workflows/cleanup_pr_tags.yml
new file mode 100644
index 0000000..4473b76
--- /dev/null
+++ b/.github/workflows/cleanup_pr_tags.yml
@@ -0,0 +1,28 @@
+name: Cleanup PR Tags
+
+on:
+  pull_request:
+    types: [closed]
+
+jobs:
+  delete-tag:
+    if: contains(github.event.pull_request.labels.*.name, 'build-artifact')
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # Needed to delete tags
+    steps:
+      - name: Delete PR Tag
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const tagName = `pr-build-${context.issue.number}`;
+            try {
+              await github.rest.git.deleteRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `tags/${tagName}`,
+              });
+              console.log(`Successfully deleted tag: ${tagName}`);
+            } catch (error) {
+              console.log(`Tag ${tagName} not found or already deleted.`);
+            }
\ No newline at end of file
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index dfd41f6..bd334bd 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,39 +3,41 @@ name: Release
 on:
   push:
     tags:
-      - 'v?[0-9]+.*' # Trigger on version tags
+      - 'v?[0-9]+.*'
+  pull_request:
+    # 'synchronize' triggers on every push to the PR
+    # 'reopened' triggers if you close and open the PR again
+    types: [labeled, synchronize, reopened]
 
 env:
   CARGO_TERM_COLOR: always
-  # Define the binaries to package
-  # TODO: use our taskfile to centralize the build
-  RELEASE_BINARIES: timsseek timsquery_cli # timsseek_rts
+  RELEASE_BINARIES: timsseek timsquery_cli timsquery_viewer
 
 jobs:
-  create-release:
-    runs-on: ubuntu-latest
-    outputs:
-      upload_url: ${{ steps.create_release.outputs.upload_url }}
-    steps:
-      - name: Create Release
-        id: create_release
-        uses: softprops/action-gh-release@v2
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          tag_name: ${{ github.ref }}
-          release_name: Release ${{ github.ref }}
-          draft: false
-          prerelease: false
-
-  linux-builds:
-    needs: create-release
-    runs-on: ubuntu-latest
+  build-and-release:
+    # Run on tags OR if the PR has the 'build-artifact' label
+    if: >
+      github.event_name == 'push' ||
+      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'build-artifact'))
+    name: Build (${{ matrix.target }})
+    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        target:
-          - x86_64-unknown-linux-gnu
-          # - aarch64-unknown-linux-gnu
+        include:
+          - os: ubuntu-latest
+            target: x86_64-unknown-linux-gnu
+            artifact_name: linux-x86_64
+          - os: macos-latest # M-series
+            target: aarch64-apple-darwin
+            artifact_name: macos-arm64
+          # - os: macos-latest
+          #   target: x86_64-apple-darwin
+          #   artifact_name: macos-x86_64
+          - os: windows-latest
+            target: x86_64-pc-windows-msvc
+            artifact_name: windows-x64
+
     steps:
       - uses: actions/checkout@v4
 
@@ -45,112 +47,40 @@
           target: ${{ matrix.target }}
           cache: true
 
-      - name: Build
+      - name: Build Binaries
+        shell: bash
         run: |
-          rustup target add ${{ matrix.target }}
-          for binary in $RELEASE_BINARIES; do
-            cargo build --release --bin $binary --target ${{ matrix.target }}
-          done
-
-      - name: Prepare binaries
-        run: |
-          mkdir artifacts
           for binary in $RELEASE_BINARIES; do
-            cp "target/${{ matrix.target }}/release/$binary" artifacts/
+            cargo build --release --bin $binary --target ${{ matrix.target }}
           done
-          cd artifacts
-          tar czf ../${{ matrix.target }}.tar.gz *
 
-      - name: Upload Release Assets
-        uses: actions/upload-release-asset@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.create-release.outputs.upload_url }}
-          asset_path: ${{ matrix.target }}.tar.gz
-          asset_name: ${{ matrix.target }}.tar.gz
-          asset_content_type: application/gzip
-
-  macos-builds:
-    needs: create-release
-    runs-on: macos-latest
-    strategy:
-      matrix:
-        target:
-          - x86_64-apple-darwin
-          - aarch64-apple-darwin
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Rust
-        uses: actions-rust-lang/setup-rust-toolchain@v1
-        with:
-          target: ${{ matrix.target }}
-          cache: true
-
-
-      - name: Build
+      - name: Package Artifacts
+        shell: bash
         run: |
-          rustup target add ${{ matrix.target }}
-          for binary in $RELEASE_BINARIES; do
-            cargo build --release --bin $binary --target ${{ matrix.target }}
-          done
-
-      - name: Prepare binaries
-        run: |
-          mkdir artifacts
+          mkdir dist
           for binary in $RELEASE_BINARIES; do
-            cp "target/${{ matrix.target }}/release/$binary" artifacts/
+            [ "${{ matrix.os }}" = "windows-latest" ] && EXT=".exe" || EXT=""
+            cp "target/${{ matrix.target }}/release/${binary}${EXT}" dist/
           done
-          cd artifacts
-          tar czf ../${{ matrix.target }}.tar.gz *
-
-      - name: Upload Release Assets
-        uses: actions/upload-release-asset@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+          if [ "${{ matrix.os }}" = "windows-latest" ]; then
+            7z a "${{ matrix.artifact_name }}.zip" ./dist/*
+          else
+            tar -czf "${{ matrix.artifact_name }}.tar.gz" -C dist .
+          fi
+
+      - name: Upload to GitHub Release
+        uses: softprops/action-gh-release@v2
         with:
-          upload_url: ${{ needs.create-release.outputs.upload_url }}
-          asset_path: ${{ matrix.target }}.tar.gz
-          asset_name: ${{ matrix.target }}.tar.gz
-          asset_content_type: application/gzip
-
-  # windows-builds:
-  #   needs: create-release
-  #   runs-on: windows-latest
-  #   steps:
-  #     - uses: actions/checkout@v4
-
-  #     - name: Install Rust
-  #       uses: actions-rust-lang/setup-rust-toolchain@v1
-  #       with:
-  #         target: x86_64-pc-windows-msvc
-  #         cache: true
-
-  #     - name: Build
-  #       run: |
-  #         rustup target add x86_64-pc-windows-msvc
-  #         for binary in $RELEASE_BINARIES; do
-  #           cargo build --release --bin $binary --features="build-binary" --target x86_64-pc-windows-msvc
-  #         done
-
-  #     - name: Prepare binaries
-  #       shell: bash
-  #       run: |
-  #         mkdir artifacts
-  #         for binary in $RELEASE_BINARIES; do
-  #           cp "target/x86_64-pc-windows-msvc/release/$binary.exe" artifacts/
-  #         done
-  #         cd artifacts
-  #         tar czf ../x86_64-pc-windows-msvc.tar.gz *
-
-  #     - name: Upload Release Assets
-  #       uses: actions/upload-release-asset@v1
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #       with:
-  #         upload_url: ${{ needs.create-release.outputs.upload_url }}
-  #         asset_path: x86_64-pc-windows-msvc.tar.gz
-  #         asset_name: x86_64-pc-windows-msvc.tar.gz
-  #         asset_content_type: application/gzip
-
+          # Use the tag name for pushes, or a special 'pr-#' tag for PRs
+          tag_name: "${{ github.event_name == 'push' && github.ref_name || format('pr-build-{0}', github.event.pull_request.number) }}"
+          name: "${{ github.event_name == 'push' && github.ref_name || format('Pre-release Build (PR #{0})', github.event.pull_request.number) }}"
+          # Mark as prerelease if it's a PR
+          prerelease: ${{ github.event_name == 'pull_request' }}
+          draft: false
+          make_latest: ${{ github.event_name == 'push' && 'true' || 'false' }}
+          files: |
+            *.tar.gz
+            *.zip
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 4a729d9..d3a6bb2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -297,17 +297,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum =
"cb372a7cbcac02a35d3fb7b3fc1f969ec078e871f9bb899bf00a2e1809bec8a3" dependencies = [ "arrow-arith", - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-cast 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", "arrow-csv", - "arrow-data 57.1.0", - "arrow-ipc 57.1.0", + "arrow-data", + "arrow-ipc", "arrow-json", "arrow-ord", "arrow-row", - "arrow-schema 57.1.0", - "arrow-select 57.1.0", + "arrow-schema", + "arrow-select", "arrow-string", ] @@ -317,30 +317,14 @@ version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f377dcd19e440174596d83deb49cd724886d91060c07fec4f67014ef9d54049" dependencies = [ - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "num-traits", ] -[[package]] -name = "arrow-array" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" -dependencies = [ - "ahash", - "arrow-buffer 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "chrono", - "half", - "hashbrown 0.16.1", - "num", -] - [[package]] name = "arrow-array" version = "57.1.0" @@ -348,9 +332,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eaff85a44e9fa914660fb0d0bb00b79c4a3d888b5334adb3ea4330c84f002" dependencies = [ "ahash", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "half", "hashbrown 0.16.1", @@ -359,17 +343,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arrow-buffer" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" -dependencies = [ - "bytes", - "half", - "num", -] - [[package]] name = "arrow-buffer" version = "57.1.0" @@ -382,38 +355,18 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arrow-cast" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" -dependencies = [ - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", - "atoi", - "base64", - "chrono", - "half", - "lexical-core", - "num", - "ryu", -] - [[package]] name = "arrow-cast" version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3d131abb183f80c450d4591dc784f8d7750c50c6e2bc3fcaad148afc8361271" dependencies = [ - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", "arrow-ord", - "arrow-schema 57.1.0", - "arrow-select 57.1.0", + "arrow-schema", + "arrow-select", "atoi", "base64", "chrono", @@ -429,65 +382,39 @@ version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2275877a0e5e7e7c76954669366c2aa1a829e340ab1f612e647507860906fb6b" dependencies = [ - "arrow-array 57.1.0", - "arrow-cast 57.1.0", - "arrow-schema 57.1.0", + "arrow-array", + "arrow-cast", + "arrow-schema", "chrono", "csv", "csv-core", "regex", ] -[[package]] -name = "arrow-data" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" -dependencies = [ - "arrow-buffer 56.2.0", - "arrow-schema 56.2.0", - "half", 
- "num", -] - [[package]] name = "arrow-data" version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05738f3d42cb922b9096f7786f606fcb8669260c2640df8490533bb2fa38c9d3" dependencies = [ - "arrow-buffer 57.1.0", - "arrow-schema 57.1.0", + "arrow-buffer", + "arrow-schema", "half", "num-integer", "num-traits", ] -[[package]] -name = "arrow-ipc" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" -dependencies = [ - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", - "flatbuffers", -] - [[package]] name = "arrow-ipc" version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d09446e8076c4b3f235603d9ea7c5494e73d441b01cd61fb33d7254c11964b3" dependencies = [ - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", - "arrow-select 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "flatbuffers", ] @@ -497,11 +424,11 @@ version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "371ffd66fa77f71d7628c63f209c9ca5341081051aa32f9c8020feb0def787c0" dependencies = [ - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-cast 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", "chrono", "half", "indexmap", @@ -521,11 +448,11 @@ version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cbc94fc7adec5d1ba9e8cd1b1e8d6f72423b33fe978bf1f46d970fafab787521" dependencies = [ - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", - "arrow-select 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", ] [[package]] @@ -534,39 +461,19 @@ version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "169676f317157dc079cc5def6354d16db63d8861d61046d2f3883268ced6f99f" dependencies = [ - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "half", ] -[[package]] -name = "arrow-schema" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" - [[package]] name = "arrow-schema" version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d27609cd7dd45f006abae27995c2729ef6f4b9361cde1ddd019dc31a5aa017e0" -[[package]] -name = "arrow-select" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" -dependencies = [ - "ahash", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "num", -] - [[package]] name = "arrow-select" version = "57.1.0" @@ -574,10 +481,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae980d021879ea119dd6e2a13912d81e64abed372d53163e804dfe84639d8010" dependencies = [ "ahash", - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num-traits", ] @@ 
-587,11 +494,11 @@ version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf35e8ef49dcf0c5f6d175edee6b8af7b45611805333129c541a8b89a0fc0534" dependencies = [ - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-data 57.1.0", - "arrow-schema 57.1.0", - "arrow-select 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "memchr", "num-traits", "regex", @@ -612,7 +519,7 @@ checksum = "45403b49e3954a4b8428a0ac21a4b7afadccf92bfd96273f1a58cd4812496ae0" dependencies = [ "generic-array 0.12.4", "generic-array 0.13.3", - "generic-array 0.14.9", + "generic-array 0.14.7", "stable_deref_trait", ] @@ -709,16 +616,16 @@ dependencies = [ "futures-lite", "parking", "polling", - "rustix 1.1.2", + "rustix 1.1.3", "slab", "windows-sys 0.61.2", ] [[package]] name = "async-lock" -version = "3.4.1" +version = "3.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fd03604047cee9b6ce9de9f70c6cd540a0520c813cbd49bae61f33ab80ed1dc" +checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" dependencies = [ "event-listener", "event-listener-strategy", @@ -751,7 +658,7 @@ dependencies = [ "cfg-if", "event-listener", "futures-lite", - "rustix 1.1.2", + "rustix 1.1.3", ] [[package]] @@ -777,7 +684,7 @@ dependencies = [ "cfg-if", "futures-core", "futures-io", - "rustix 1.1.2", + "rustix 1.1.3", "signal-hook-registry", "slab", "windows-sys 0.61.2", @@ -880,12 +787,370 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-config" +version = "1.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96571e6996817bf3d58f6b569e4b9fd2e9d2fcf9f7424eed07b2ce9bb87535e5" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "hex", + "http 1.4.0", + "ring", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cd362783681b15d136480ad555a099e82ecd8e2d10a841e14dfd0078d67fee3" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-lc-rs" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a88aab2464f1f25453baa7a07c84c5b7684e274054ba06817f382357f77a288" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45afffdee1e7c9126814751f88dddc747f41d91da16c9551a0f1e8a11e788a1" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + +[[package]] +name = "aws-runtime" +version = "1.5.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d81b5b2898f6798ad58f484856768bca817e3cd9de0974c24ae0f1113fe88f1b" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http-body 0.4.6", + 
"percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.91.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee6402a36f27b52fe67661c6732d684b2635152b676aa2babbfb5204f99115d" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.93.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a45a7f750bbd170ee3677671ad782d90b894548f4e4ae168302c57ec9de5cb3e" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.95.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55542378e419558e6b1f398ca70adb0b2088077e79ad9f14eb09441f2f7b2164" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "fastrand", + "http 0.2.12", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e523e1c4e8e7e8ff219d732988e22bfeae8a1cafdbe6d9eca1546fa080be7c" +dependencies = [ + "aws-credential-types", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "percent-encoding", + "sha2", + "time", + "tracing", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ee19095c7c4dda59f1697d028ce704c24b2d33c6718790c7f1d5a3015b4107c" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-http" +version = "0.62.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "826141069295752372f8203c17f28e30c464d22899a43a0c9fd9c458d469c88b" +dependencies = [ + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "futures-util", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-http-client" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59e62db736db19c488966c8d787f52e6270be565727236fd5579eaa301e7bc4a" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "h2", + "http 1.4.0", + "hyper", + "hyper-rustls", + "hyper-util", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.61.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49fa1213db31ac95288d981476f78d05d9cbb0353d22cdf3472cc05bb02f6551" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = 
"aws-smithy-observability" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17f616c3f2260612fe44cede278bafa18e73e6479c4e393e2c4518cf2a9a228a" +dependencies = [ + "aws-smithy-runtime-api", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae5d689cf437eae90460e944a58b5668530d433b4ff85789e69d2f2a556e057d" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a392db6c583ea4a912538afb86b7be7c5d8887d91604f50eb55c262ee1b4a5f5" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-client", + "aws-smithy-observability", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "pin-project-lite", + "pin-utils", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab0d43d899f9e508300e587bf582ba54c27a452dd0a9ea294690669138ae14a2" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.4.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "905cb13a9895626d49cf2ced759b062d913834c7482c38e49557eac4e6193f01" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11b2f670422ff42bf7065031e72b45bc52a3508bd089f743ea90731ca2b6ea57" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d980627d2dd7bfc32a3c025685a033eeab8d365cc840c631ef59d1b8f428164" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + [[package]] name = "bincode" version = "1.3.3" @@ -895,6 +1160,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bit-set" version = 
"0.8.0" @@ -921,6 +1206,9 @@ name = "bitflags" version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +dependencies = [ + "serde_core", +] [[package]] name = "block" @@ -928,6 +1216,15 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array 0.14.7", +] + [[package]] name = "block2" version = "0.5.1" @@ -1007,9 +1304,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.0" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" [[package]] name = "bytemuck" @@ -1049,9 +1346,19 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + [[package]] name = "calibrt" -version = "0.20.1" +version = "0.22.0" dependencies = [ "insta", "rand 0.8.5", @@ -1082,7 +1389,7 @@ checksum = "cb9f6e1368bd4621d2c86baa7e37de77a938adf5221e5dd3d6133340101b309e" dependencies = [ "bitflags 2.10.0", "polling", - "rustix 1.1.2", + "rustix 1.1.3", "slab", "tracing", ] @@ -1106,16 +1413,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "138efcf0940a02ebf0cc8d1eff41a1682a46b431630f4c52450d6265876021fa" dependencies = [ "calloop 0.14.3", - "rustix 1.1.2", + "rustix 1.1.3", "wayland-backend", "wayland-client", ] [[package]] name = "cc" -version = "1.2.49" +version = "1.2.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90583009037521a116abf44494efecd645ba48b6622457080f080b85544e2215" +checksum = "9f50d563227a1c37cc0a263f64eca3334388c01c5e4c4861a9def205c614383c" dependencies = [ "find-msvc-tools", "jobserver", @@ -1213,6 +1520,15 @@ dependencies = [ "error-code", ] +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + [[package]] name = "codespan-reporting" version = "0.12.0" @@ -1263,9 +1579,9 @@ dependencies = [ [[package]] name = "console" -version = "0.16.1" +version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4" +checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4" dependencies = [ "encode_unicode", "libc", @@ -1294,6 +1610,15 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "context_error" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da7e1b8dc6f4cdc4f6b897d6aa1b7eaec6d95331bdb765d2a51cdd948e157ee0" +dependencies = [ + "serde", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -1355,6 +1680,15 @@ 
dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1401,6 +1735,16 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array 0.14.7", + "typenum", +] + [[package]] name = "csv" version = "1.4.0" @@ -1463,6 +1807,26 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "deranged" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + [[package]] name = "dispatch" version = "0.2.0" @@ -1533,6 +1897,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b14ccef22fc6f5a8f4d7d768562a182c04ce9a3b3157b91390b52ddfdf1a76" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "duplicate" version = "2.0.1" @@ -1581,6 +1951,7 @@ dependencies = [ "glow", "glutin", "glutin-winit", + "home", "image", "js-sys", "log", @@ -1591,6 +1962,8 @@ dependencies = [ "percent-encoding", "profiling", "raw-window-handle", + "ron 0.11.0", + "serde", "static_assertions", "wasm-bindgen", "wasm-bindgen-futures", @@ -1614,6 +1987,7 @@ dependencies = [ "log", "nohash-hasher", "profiling", + "ron 0.11.0", "serde", "smallvec", "unicode-segmentation", @@ -1655,6 +2029,7 @@ dependencies = [ "objc2-ui-kit", "profiling", "raw-window-handle", + "serde", "smithay-clipboard", "web-time", "webbrowser", @@ -1670,6 +2045,7 @@ dependencies = [ "duplicate", "egui", "paste", + "serde", ] [[package]] @@ -1812,6 +2188,7 @@ dependencies = [ "nohash-hasher", "parking_lot", "profiling", + "rayon", "serde", ] @@ -1919,9 +2296,9 @@ checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" [[package]] name = "flatbuffers" -version = "25.9.23" +version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09b6620799e7340ebd9968d2e0708eb82cf1971e9a16821e2091b6d6e475eed5" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ "bitflags 2.10.0", "rustc_version", @@ -1962,6 +2339,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +[[package]] +name = "font-types" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39a654f404bbcbd48ea58c617c2993ee91d1cb63727a37bf2323a4edeed1b8c5" +dependencies = [ + "bytemuck", +] + [[package]] name = "foreign-types" version = "0.5.0" @@ -2012,6 +2398,27 @@ 
dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.31" @@ -2019,6 +2426,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -2027,6 +2435,17 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.31" @@ -2057,6 +2476,12 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + [[package]] name = "futures-task" version = "0.3.31" @@ -2069,9 +2494,11 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ + "futures-channel", "futures-core", "futures-io", "futures-macro", + "futures-sink", "futures-task", "memchr", "pin-project-lite", @@ -2099,9 +2526,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.9" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -2158,7 +2585,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bd49230192a3797a9a4d6abe9b3eed6f7fa4c8a8a4947977c6f80025f92cbd8" dependencies = [ - "rustix 1.1.2", + "rustix 1.1.3", "windows-link 0.2.1", ] @@ -2169,8 +2596,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -2180,9 +2609,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasip2", + "wasm-bindgen", ] [[package]] @@ -2325,6 +2756,25 @@ dependencies = [ "bitflags 2.10.0", ] +[[package]] +name = "h2" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 
1.4.0", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "2.7.1" @@ -2401,7 +2851,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "634bd4d29cbf24424d0a4bfcbf80c6960129dc24424752a7d1d1390607023422" dependencies = [ "as-slice", - "generic-array 0.14.9", + "generic-array 0.14.7", "hash32 0.1.1", "stable_deref_trait", ] @@ -2453,6 +2903,154 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "httparse", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.4.0", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + 
"tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + [[package]] name = "i_float" version = "1.15.0" @@ -2607,6 +3205,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" +[[package]] +name = "identity-hash" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfdd7caa900436d8f13b2346fe10257e0c05c1f1f9e351f4f5d57c03bd5f45da" + [[package]] name = "idna" version = "1.1.0" @@ -2650,6 +3254,8 @@ checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" dependencies = [ "equivalent", "hashbrown 0.16.1", + "serde", + "serde_core", ] [[package]] @@ -2658,7 +3264,7 @@ version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88" dependencies = [ - "console 0.16.1", + "console 0.16.2", "portable-atomic", "rayon", "unicode-width", @@ -2668,13 +3274,14 @@ dependencies = [ [[package]] name = "insta" -version = "1.44.3" +version = "1.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5c943d4415edd8153251b6f197de5eb1640e56d84e8d9159bea190421c73698" +checksum = "b76866be74d68b1595eb8060cb9191dca9c021db2316558e52ddc5d55d41b66c" dependencies = [ "console 0.15.11", "once_cell", "similar", + "tempfile", ] [[package]] @@ -2683,6 +3290,22 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" @@ -2700,18 +3323,18 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" dependencies = [ "either", ] [[package]] name = "itoa" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +checksum = "7ee5b5339afb4c41626dde77b7a611bd4f2c202b897852b4bcf5d03eddc61010" [[package]] name = "jni" @@ -2869,13 +3492,13 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" +checksum = 
"df15f6eac291ed1cf25865b1ee60399f57e7c227e7f51bdbd4c5270396a9ed50" dependencies = [ "bitflags 2.10.0", "libc", - "redox_syscall 0.5.18", + "redox_syscall 0.6.0", ] [[package]] @@ -2891,9 +3514,9 @@ dependencies = [ [[package]] name = "libz-rs-sys" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15413ef615ad868d4d65dce091cb233b229419c7c0c4bcaa746c0901c49ff39c" +checksum = "c10501e7805cee23da17c7790e59df2870c0d4043ec6d03f67d31e2b53e77415" dependencies = [ "zlib-rs", ] @@ -2954,13 +3577,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] -name = "lz4_flex" -version = "0.11.5" +name = "lru-slab" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" -dependencies = [ - "twox-hash", -] +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" @@ -2989,6 +3609,26 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.7.6" @@ -3030,7 +3670,7 @@ dependencies = [ [[package]] name = "micromzpaf" -version = "0.20.1" +version = "0.22.0" dependencies = [ "rustyms", "serde", @@ -3074,16 +3714,56 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + [[package]] name = "moxcms" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80986bbbcf925ebd3be54c26613d861255284584501595cf418320c078945608" +checksum = "ac9557c559cd6fc9867e122e20d2cbefc9ca29d80d027a8e39310920ed2f0a97" dependencies = [ "num-traits", "pxfm", ] +[[package]] +name = "mzdata" +version = "0.59.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e6c319f4111abe43bbcc33433b727968e30163f0a90e440386978656df8685c" +dependencies = [ + "base64-simd", + "bitflags 2.10.0", + "bytemuck", + "chrono", + "flate2", + "identity-hash", + "indexmap", + "log", + "mzpeaks", + "num-traits", + "regex", + "thiserror 2.0.17", +] + +[[package]] +name = "mzpeaks" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "543be9eac70437bfc915b3339e6ae4f23dc034922f13eb2535dcc19e7e9e9481" +dependencies = [ + "num-traits", +] + [[package]] name = "naga" version = "27.0.3" @@ -3110,6 +3790,21 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + 
"rawpointer", +] + [[package]] name = "ndk" version = "0.9.0" @@ -3173,25 +3868,11 @@ checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" [[package]] name = "nu-ansi-term" -version = "0.50.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" -dependencies = [ - "windows-sys 0.61.2", -] - -[[package]] -name = "num" -version = "0.4.3" +version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", + "windows-sys 0.61.2", ] [[package]] @@ -3202,6 +3883,7 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", + "serde", ] [[package]] @@ -3211,25 +3893,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "num-traits", + "serde", ] [[package]] -name = "num-integer" -version = "0.1.46" +name = "num-conv" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" [[package]] -name = "num-iter" -version = "0.1.45" +name = "num-integer" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", - "num-integer", "num-traits", ] @@ -3239,9 +3917,9 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ - "num-bigint", "num-integer", "num-traits", + "serde", ] [[package]] @@ -3557,6 +4235,44 @@ dependencies = [ "objc2-foundation 0.2.2", ] +[[package]] +name = "object_store" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "form_urlencoded", + "futures", + "http 1.4.0", + "http-body-util", + "httparse", + "humantime", + "hyper", + "itertools 0.14.0", + "md-5", + "parking_lot", + "percent-encoding", + "quick-xml 0.38.4", + "rand 0.9.2", + "reqwest", + "ring", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -3569,6 +4285,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + [[package]] name = "orbclient" version = "0.3.49" @@ -3587,17 +4309,6 @@ dependencies = [ "num-traits", ] -[[package]] 
-name = "ordered-float" -version = "4.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" -dependencies = [ - "num-traits", - "rand 0.8.5", - "serde", -] - [[package]] name = "ordered-float" version = "5.1.0" @@ -3605,6 +4316,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f4779c6901a562440c3786d08192c6fbda7c1c2060edd10006b05ee35d10f2d" dependencies = [ "num-traits", + "rand 0.8.5", + "serde", ] [[package]] @@ -3617,6 +4330,12 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + [[package]] name = "owned_ttf_parser" version = "0.25.1" @@ -3655,39 +4374,6 @@ dependencies = [ "windows-link 0.2.1", ] -[[package]] -name = "parquet" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" -dependencies = [ - "ahash", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-data 56.2.0", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "half", - "hashbrown 0.16.1", - "lz4_flex 0.11.5", - "num", - "num-bigint", - "paste", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "twox-hash", - "zstd", -] - [[package]] name = "parquet" version = "57.1.0" @@ -3695,40 +4381,43 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be3e4f6d320dd92bfa7d612e265d7d08bba0a240bab86af3425e1d255a511d89" dependencies = [ "ahash", - "arrow-array 57.1.0", - "arrow-buffer 57.1.0", - "arrow-cast 57.1.0", - "arrow-data 57.1.0", - "arrow-ipc 57.1.0", - "arrow-schema 57.1.0", - "arrow-select 57.1.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", "base64", "brotli", "bytes", "chrono", "flate2", + "futures", "half", "hashbrown 0.16.1", - "lz4_flex 0.12.0", + "lz4_flex", "num-bigint", "num-integer", "num-traits", + "object_store", "paste", "seq-macro", "simdutf8", "snap", "thrift", + "tokio", "twox-hash", "zstd", ] [[package]] name = "parquet_derive" -version = "56.2.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34cb149bbb159c7a4554718ddd31ad9fa6356310162b5c0524b8894e5702554c" +checksum = "777a89bd0515e7948516ce6d3f41c16b3a32de71f7eb99c5fdb55456b695b227" dependencies = [ - "parquet 56.2.0", + "parquet", "proc-macro2", "quote", "syn 2.0.111", @@ -3868,7 +4557,7 @@ dependencies = [ "concurrent-queue", "hermit-abi", "pin-project-lite", - "rustix 1.1.2", + "rustix 1.1.3", "windows-sys 0.61.2", ] @@ -3880,9 +4569,9 @@ checksum = "2f3a9f18d041e6d0e102a0a46750538147e5e8992d3b4873aaafee2520b00ce3" [[package]] name = "portable-atomic" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +checksum = "f59e70c4aef1e55797c2e8fd94a4f2a973fc972cfde0e0b05f683667b0cd39dd" [[package]] name = "portable-atomic-util" @@ -3902,6 +4591,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -4007,6 +4702,71 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash 2.1.1", + "rustls", + "socket2", + "thiserror 2.0.17", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash 2.1.1", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.17", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + [[package]] name = "quote" version = "1.0.42" @@ -4101,6 +4861,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20675572f6f24e9e76ef639bc5552774ed45f1c30e2951e1e99c59888861c539" +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.11.0" @@ -4121,6 +4887,16 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "read-fonts" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717cf23b488adf64b9d711329542ba34de147df262370221940dfabc2c91358" +dependencies = [ + "bytemuck", + "font-types", +] + [[package]] name = "redox_syscall" version = "0.4.1" @@ -4139,6 +4915,15 @@ dependencies = [ "bitflags 2.10.0", ] +[[package]] +name = "redox_syscall" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec96166dafa0886eb81fe1c0a388bece180fbef2135f97c1e2cf8302e74b43b5" +dependencies = [ + "bitflags 2.10.0", +] + [[package]] name = "regex" version = "1.12.2" @@ -4162,6 +4947,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" + [[package]] name = "regex-syntax" version = "0.8.8" @@ -4174,11 +4965,53 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b30a45b0cd0bcca8037f3d0dc3421eaf95327a17cad11964fb8179b4fc4832" +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "futures-core", + "futures-util", + "h2", + "http 1.4.0", + 
"http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", +] + [[package]] name = "rfd" -version = "0.15.4" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef2bee61e6cffa4635c72d7d81a84294e28f0930db0ddcb0f66d10244674ebed" +checksum = "a15ad77d9e70a92437d8f74c35d99b4e4691128df018833e99f90bcd36152672" dependencies = [ "ashpd", "block2 0.6.2", @@ -4195,27 +5028,38 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "windows-sys 0.59.0", + "windows-sys 0.60.2", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.16", + "libc", + "untrusted", + "windows-sys 0.52.0", ] [[package]] name = "rmp" -version = "0.8.14" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4" +checksum = "4ba8be72d372b2c9b35542551678538b562e7cf86c3315773cae48dfbfe7790c" dependencies = [ - "byteorder", "num-traits", - "paste", ] [[package]] name = "rmp-serde" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db" +checksum = "72f81bee8c8ef9b577d1681a70ebbc962c232461e397b22c208c43c04b67a155" dependencies = [ - "byteorder", "rmp", "serde", ] @@ -4226,6 +5070,33 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e27ee8bb91ca0adcf0ecb116293afa12d393f9c2b9b9cd54d33e8078fe19839" +[[package]] +name = "ron" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db09040cc89e461f1a265139777a2bde7f8d8c67c4936f700c63ce3e2904d468" +dependencies = [ + "base64", + "bitflags 2.10.0", + "serde", + "serde_derive", + "unicode-ident", +] + +[[package]] +name = "ron" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd490c5b18261893f14449cbd28cb9c0b637aebf161cd77900bfdedaff21ec32" +dependencies = [ + "bitflags 2.10.0", + "once_cell", + "serde", + "serde_derive", + "typeid", + "unicode-ident", +] + [[package]] name = "rstar" version = "0.8.4" @@ -4328,24 +5199,82 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.10.0", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", + "bitflags 2.10.0", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +dependencies = [ + "bitflags 2.10.0", + "errno", + "libc", + "linux-raw-sys 0.11.0", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.35" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" +dependencies = [ + "aws-lc-rs", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9980d917ebb0c0536119ba501e90834767bffc3d60641457fd84a1f3fd337923" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +dependencies = [ + "web-time", + "zeroize", ] [[package]] -name = "rustix" -version = "1.1.2" +name = "rustls-webpki" +version = "0.103.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" dependencies = [ - "bitflags 2.10.0", - "errno", - "libc", - "linux-raw-sys 0.11.0", - "windows-sys 0.61.2", + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", ] [[package]] @@ -4356,27 +5285,36 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyms" -version = "0.8.3" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4831e9fe077e1d905b993838753820e0d70f81a20bf86f8837374c0823523353" +checksum = "011d3d672ae44d5e07db0488d855f2b5ed178e3d6bb7ef5b18c6415c20bbd61e" dependencies = [ - "bincode", + "bincode 2.0.1", + "context_error", "flate2", - "itertools 0.12.1", - "ordered-float 4.6.0", + "itertools 0.14.0", + "mzdata", + "ndarray", + "ordered-float 5.1.0", + "paste", "probability", + "rand 0.9.2", "rayon", "regex", "serde", + "serde_json", "similar", + "swash", + "thin-vec", "uom", + "zeno", ] [[package]] name = "ryu" -version = "1.0.20" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +checksum = "62049b2877bf12821e8f9ad256ee38fdc31db7387ec2d3b3f403024de2034aea" [[package]] name = "same-file" @@ -4387,6 +5325,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "scoped-tls" version = "1.0.1" @@ -4412,6 +5359,29 @@ dependencies = [ "tiny-skia", ] +[[package]] +name = "security-framework" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.10.1", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" version = "1.0.27" @@ -4456,15 +5426,15 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.145" +version = "1.0.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +checksum = "6af14725505314343e673e9ecb7cd7e8a36aa9791eb936235a3567cc31447ae4" dependencies = [ "itoa", "memchr", - "ryu", "serde", "serde_core", + "zmij", ] [[package]] @@ -4478,6 +5448,29 @@ dependencies = [ "syn 2.0.111", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -4526,6 +5519,16 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +[[package]] +name = "skrifa" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c31071dedf532758ecf3fed987cdb4bd9509f900e026ab684b4ecb81ea49841" +dependencies = [ + "bytemuck", + "read-fonts", +] + [[package]] name = "slab" version = "0.4.11" @@ -4585,7 +5588,7 @@ dependencies = [ "libc", "log", "memmap2", - "rustix 1.1.2", + "rustix 1.1.3", "thiserror 2.0.17", "wayland-backend", "wayland-client", @@ -4625,6 +5628,16 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" +[[package]] +name = "socket2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + [[package]] name = "spade" version = "2.15.0" @@ -4688,6 +5701,23 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "swash" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47846491253e976bdd07d0f9cc24b7daf24720d11309302ccbbc6e6b6e53550a" +dependencies = [ + "skrifa", + "yazi", + "zeno", +] + [[package]] name = "syn" version = "1.0.109" @@ -4710,6 +5740,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + [[package]] name = "synstructure" version = "0.13.2" @@ -4723,14 +5762,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.23.0" +version = "3.24.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ "fastrand", "getrandom 0.3.4", "once_cell", - "rustix 1.1.2", + "rustix 1.1.3", "windows-sys 0.61.2", ] @@ -4743,6 +5782,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "thin-vec" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "144f754d318415ac792f9d69fc87abbbfc043ce2ef041c60f16ad828f638717d" +dependencies = [ + "serde", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -4817,30 +5865,71 @@ dependencies = [ "zune-jpeg", ] +[[package]] +name = "time" +version = "0.3.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" + +[[package]] +name = "time-macros" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "timscentroid" -version = "0.20.1" +version = "0.22.0" dependencies = [ "arrow", + "async-trait", + "aws-config", + "aws-credential-types", + "bytes", "chrono", + "futures", "geo", "geo-types", "half", - "parquet 57.1.0", + "object_store", + "once_cell", + "parquet", + "rand 0.9.2", "rayon", "serde", "serde_json", + "tempfile", "thiserror 2.0.17", "timsrust", + "tokio", "tracing", "tracing-subscriber", + "url", ] [[package]] name = "timsquery" -version = "0.20.1" +version = "0.22.0" dependencies = [ - "bincode", + "bincode 1.3.3", "bon", "csv", "half", @@ -4858,7 +5947,7 @@ dependencies = [ [[package]] name = "timsquery_cli" -version = "0.20.1" +version = "0.22.0" dependencies = [ "clap", "half", @@ -4866,6 +5955,7 @@ dependencies = [ "rayon", "serde", "serde_json", + "tempfile", "thiserror 2.0.17", "timscentroid", "timsquery", @@ -4876,8 +5966,9 @@ dependencies = [ [[package]] name = "timsquery_viewer" -version = "0.20.1" +version = "0.22.0" dependencies = [ + "clap", "eframe", "egui", "egui_dock", @@ -4886,12 +5977,14 @@ dependencies = [ "mimalloc", "rayon", "rfd", + "ron 0.12.0", "serde", "serde_json", "thiserror 2.0.17", "timscentroid", "timsquery", "timsrust", + "timsseek", "tracing", "tracing-subscriber", ] @@ -4899,12 +5992,12 @@ dependencies = [ [[package]] name = "timsrust" version = "0.5.0" -source = "git+https://github.com/jspaezp/timsrust?branch=experimental%2Frecalib_iter#5f50da4f846efab3e878f4aee4637c0182b192e0" +source = "git+https://github.com/jspaezp/timsrust?branch=experimental%2Frecalib_iter#1ae76212863aee08e574c80aed505c3e359d758e" dependencies = [ "bytemuck", "linreg", "memmap2", - "parquet 56.2.0", + "parquet", "rayon", "rusqlite", "serde", @@ -4915,12 +6008,12 @@ dependencies = [ [[package]] name = "timsseek" -version = "0.20.1" +version = "0.22.0" dependencies = [ "calibrt", "forust-ml", "micromzpaf", - "parquet 57.1.0", + "parquet", "parquet_derive", "rand 0.9.2", "rayon", @@ -4930,6 +6023,7 @@ dependencies = [ "rustyms", "serde", "serde_json", + "timscentroid", "timsquery", "timsrust", 
"tracing", @@ -4938,7 +6032,7 @@ dependencies = [ [[package]] name = "timsseek_cli" -version = "0.20.1" +version = "0.22.0" dependencies = [ "clap", "indicatif", @@ -4955,22 +6049,6 @@ dependencies = [ "tracing-subscriber", ] -[[package]] -name = "timsseek_rts" -version = "0.20.1" -dependencies = [ - "clap", - "rayon", - "regex", - "serde", - "serde_json", - "timsquery", - "timsrust", - "timsseek", - "tracing", - "tracing-subscriber", -] - [[package]] name = "tiny-keccak" version = "2.0.2" @@ -5031,20 +6109,70 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokio" +version = "1.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +dependencies = [ + "bytes", + "libc", + "mio", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-macros" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + [[package]] name = "toml_datetime" -version = "0.7.3" +version = "0.7.5+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" +checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.9" +version = "0.23.10+spec-1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d7cbc3b4b49633d57a0509303158ca50de80ae32c265093b24c414705807832" +checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" dependencies = [ "indexmap", "toml_datetime", @@ -5054,18 +6182,63 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.4" +version = "1.0.6+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" +checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" dependencies = [ "winnow", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags 2.10.0", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 
1.0.1", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + [[package]] name = "tracing" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d15d90a0b5c19378952d479dc858407149d7bb45a14de0142f6c534b16fc647" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "log", "pin-project-lite", @@ -5086,9 +6259,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.35" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a04e24fab5c89c6a36eb8558c9656f30d81de51dfa4d3b45f26b21d61fa0a6c" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", "valuable", @@ -5136,6 +6309,12 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "ttf-parser" version = "0.25.1" @@ -5157,6 +6336,12 @@ dependencies = [ "rustc-hash 2.1.1", ] +[[package]] +name = "typeid" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" + [[package]] name = "typenum" version = "1.19.0" @@ -5204,12 +6389,26 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "uom" -version = "0.35.0" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8362194c7a9845a7a7f3562173d6e1da3f24f7132018cb78fe77a5b4474187b2" +checksum = "cd5cfe7d84f6774726717f358a37f5bca8fca273bed4de40604ad129d1107b49" dependencies = [ + "num-bigint", + "num-complex", "num-rational", "num-traits", "serde", @@ -5275,6 +6474,18 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "walkdir" version = "2.5.0" @@ -5285,6 +6496,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "want" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -5358,6 +6578,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wayland-backend" version = "0.3.11" @@ -5366,7 +6599,7 @@ checksum = "673a33c33048a5ade91a6b139580fa174e19fb0d23f396dca9fa15f2e1e49b35" dependencies = [ "cc", "downcast-rs", - "rustix 1.1.2", + "rustix 1.1.3", "scoped-tls", "smallvec", "wayland-sys", @@ -5379,7 +6612,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c66a47e840dc20793f2264eb4b3e4ecb4b75d91c0dd4af04b456128e0bdd449d" dependencies = [ "bitflags 2.10.0", - "rustix 1.1.2", + "rustix 1.1.3", "wayland-backend", "wayland-scanner", ] @@ -5401,7 +6634,7 @@ version = "0.31.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "447ccc440a881271b19e9989f75726d60faa09b95b0200a9b7eb5cc47c3eeb29" dependencies = [ - "rustix 1.1.2", + "rustix 1.1.3", "wayland-client", "xcursor", ] @@ -6255,7 +7488,7 @@ dependencies = [ "libc", "libloading", "once_cell", - "rustix 1.1.2", + "rustix 1.1.3", "x11rb-protocol", ] @@ -6296,6 +7529,18 @@ version = "0.8.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f" +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "yazi" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01738255b5a16e78bbb83e7fbba0a1e7dd506905cfc53f4622d89015a03fbb5" + [[package]] name = "yoke" version = "0.8.1" @@ -6417,6 +7662,12 @@ dependencies = [ "zvariant", ] +[[package]] +name = "zeno" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6df3dc4292935e51816d896edcd52aa30bc297907c26167fec31e2b0c6a32524" + [[package]] name = "zerocopy" version = "0.8.31" @@ -6458,6 +7709,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zerotrie" version = "0.2.3" @@ -6493,9 +7750,15 @@ dependencies = [ [[package]] name = "zlib-rs" -version = "0.5.4" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" + +[[package]] +name = "zmij" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51f936044d677be1a1168fae1d03b583a285a5dd9d8cbf7b24c23aa1fc775235" +checksum = "9e404bcd8afdaf006e529269d3e85a743f9480c3cef60034d77860d02964f3ba" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 3b4eb76..8bfd82b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,14 +6,13 @@ members = [ "rust/timscentroid", "rust/timsseek", "rust/timsseek_cli", - "rust/timsseek_rts", 
"rust/timsquery", "rust/timsquery_cli", "rust/timsquery_viewer" ] [workspace.package] -version = "0.20.1" +version = "0.22.0" edition = "2024" authors = ["Sebastian Paez"] license = "Apache-2.0" @@ -35,6 +34,7 @@ indicatif = { version = "0.18.0", features = ["rayon"] } # Parquet and arrow usually need to be kept in sync # Its not needed but makes binaries smaller +parquet_derive = { version = "57.1" } parquet = { version = "57.1" } arrow = { version = "57.1" } @@ -43,9 +43,11 @@ thiserror = "2" insta = { version = "1.34.0" } bon = "3.8.1" tinyvec = { features = ["alloc", "serde"], version = "1.10.0" } -rustyms = "0.8.3" +rustyms = "0.11.0" csv = "1.3" +tempfile = "3.23.0" + [profile.release] lto = 'thin' codegen-units = 1 diff --git a/pyproject.toml b/pyproject.toml index cca1b6a..9878760 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [project] name = "timsseek-workspace" -version = "0.19.0" +version = "0.22.0" requires-python = ">=3.11,<3.13" dependencies = [ + "jupyter[python]>=1.1.1", "speclib_builder[ml]", "timsseek_rescore", "timsseek_rts_receiver", @@ -53,7 +54,7 @@ packages = [ ] [tool.bumpver] -current_version = "0.19.0" +current_version = "0.22.0" version_pattern = "MAJOR.MINOR.PATCH[-PYTAGNUM]" tag_message = "v{new_version}" commit_message = "chore: bump version to {new_version}" @@ -65,4 +66,3 @@ push = true "python/*/pyproject.toml" = ['version = "{version}"$'] "pyproject.toml" = ['version = "{version}"$'] "Cargo.toml" = ['version = "{version}"$'] -"rust/*/Cargo.toml" = ['version = "{version}"$'] diff --git a/python/speclib_builder/pyproject.toml b/python/speclib_builder/pyproject.toml index 09ce9d8..f941729 100644 --- a/python/speclib_builder/pyproject.toml +++ b/python/speclib_builder/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "speclib_builder" -version = "0.19.0" +version = "0.22.0" requires-python = ">=3.11,<3.13" dependencies = [ "rich", diff --git a/python/timsseek_rescore/pyproject.toml b/python/timsseek_rescore/pyproject.toml index 5789698..e45c6a7 100644 --- a/python/timsseek_rescore/pyproject.toml +++ b/python/timsseek_rescore/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "timsseek_rescore" -version = "0.19.0" +version = "0.22.0" requires-python = ">=3.11,<3.13" dependencies = [ "polars", diff --git a/python/timsseek_rts_receiver/pyproject.toml b/python/timsseek_rts_receiver/pyproject.toml index a56ef3e..d3d2498 100644 --- a/python/timsseek_rts_receiver/pyproject.toml +++ b/python/timsseek_rts_receiver/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "timsseek_rts_receiver" -version = "0.19.0" +version = "0.22.0" requires-python = ">=3.11,<3.13" description = "Add your description here" dependencies = [ diff --git a/rust/micromzpaf/src/lib.rs b/rust/micromzpaf/src/lib.rs index 414fbc7..99ba242 100644 --- a/rust/micromzpaf/src/lib.rs +++ b/rust/micromzpaf/src/lib.rs @@ -392,34 +392,34 @@ impl TryFrom for IonSeriesOrdinal { }) } let tmp = match value { - FragmentType::a(ordinal) => IonSeriesOrdinal::a { + FragmentType::a(ordinal, _) => IonSeriesOrdinal::a { ordinal: try_convert_ordinal(ordinal.series_number, 'a')?, }, - FragmentType::b(ordinal) => IonSeriesOrdinal::b { + FragmentType::b(ordinal, _) => IonSeriesOrdinal::b { ordinal: try_convert_ordinal(ordinal.series_number, 'b')?, }, - FragmentType::c(ordinal) => IonSeriesOrdinal::c { + FragmentType::c(ordinal, _) => IonSeriesOrdinal::c { ordinal: try_convert_ordinal(ordinal.series_number, 'c')?, }, - FragmentType::d(ordinal) => 
IonSeriesOrdinal::d { + FragmentType::d(ordinal, _, _, _, _) => IonSeriesOrdinal::d { ordinal: try_convert_ordinal(ordinal.series_number, 'd')?, }, - FragmentType::v(ordinal) => IonSeriesOrdinal::v { + FragmentType::v(ordinal, _, _, _) => IonSeriesOrdinal::v { ordinal: try_convert_ordinal(ordinal.series_number, 'v')?, }, - FragmentType::w(ordinal) => IonSeriesOrdinal::w { + FragmentType::w(ordinal, _, _, _, _) => IonSeriesOrdinal::w { ordinal: try_convert_ordinal(ordinal.series_number, 'w')?, }, - FragmentType::x(ordinal) => IonSeriesOrdinal::x { + FragmentType::x(ordinal, _) => IonSeriesOrdinal::x { ordinal: try_convert_ordinal(ordinal.series_number, 'x')?, }, - FragmentType::y(ordinal) => IonSeriesOrdinal::y { + FragmentType::y(ordinal, _) => IonSeriesOrdinal::y { ordinal: try_convert_ordinal(ordinal.series_number, 'y')?, }, - FragmentType::z(ordinal) => IonSeriesOrdinal::z { + FragmentType::z(ordinal, _) => IonSeriesOrdinal::z { ordinal: try_convert_ordinal(ordinal.series_number, 'z')?, }, - FragmentType::precursor => IonSeriesOrdinal::precursor, + FragmentType::Precursor => IonSeriesOrdinal::precursor, _ => { return Err(IonParsingError::Custom { error: format!("Unsupported fragment type: {value:?}"), diff --git a/rust/timscentroid/Cargo.toml b/rust/timscentroid/Cargo.toml index 933d60d..e58b42a 100644 --- a/rust/timscentroid/Cargo.toml +++ b/rust/timscentroid/Cargo.toml @@ -16,14 +16,34 @@ serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } -parquet = { workspace = true } +parquet = { workspace = true, features = ["async", "object_store"] } arrow = { workspace = true } chrono = { version = "0.4", features = ["serde"] } +# Storage abstraction (always available) +object_store = "0.12" +tokio = { version = "1", features = ["rt", "rt-multi-thread"] } +futures = "0.3" +bytes = "1" +url = "2" +once_cell = "1" +async-trait = "0.1" +aws-config = { version = "1.0" , optional = true } +aws-credential-types = { version = "1.0", optional = true } # Example dependencies +[dev-dependencies] tracing-subscriber = { workspace = true, features = ["env-filter"], default-features = true} +rand = "0.9.2" +tempfile = "3" +tokio = { version = "1", features = ["macros"] } # Not using it RN but looks like a super solid way to make # builder patterns # bon = "3.7.2" + +[features] +default = [] +aws = ["object_store/aws", "aws-config", "aws-credential-types"] +gcp = ["object_store/gcp"] +azure = ["object_store/azure"] diff --git a/rust/timscentroid/README.md b/rust/timscentroid/README.md new file mode 100644 index 0000000..ef337ba --- /dev/null +++ b/rust/timscentroid/README.md @@ -0,0 +1,307 @@ +# timscentroid + +Efficient indexing and lazy loading for timsTOF mass spectrometry data. + +## Quick Start + +### Basic Usage (Local Files) + +```rust +use timscentroid::{IndexedTimstofPeaks, CentroidingConfig}; +use timscentroid::lazy::LazyIndexedTimstofPeaks; +use timscentroid::utils::{TupleRange, OptionallyRestricted::*}; + +// 1. Index your timsTOF data +let index = IndexedTimstofPeaks::from_timstof_file( + &file, + CentroidingConfig::default() +); + +// 2. Save to disk +index.save_to_directory("./indexed_peaks")?; + +// 3. Load lazily (only loads metadata, ~50ms) +let lazy_index = LazyIndexedTimstofPeaks::load_from_directory("./indexed_peaks")?; + +// 4. 
Query peaks (loads relevant data on-demand) +let mz_range = TupleRange::try_new(400.0, 500.0)?; +for peak in lazy_index.query_peaks_ms1(mz_range, Unrestricted, Unrestricted) { + println!("m/z: {}, intensity: {}", peak.mz, peak.intensity); +} +``` + +## Installation + +Add to your `Cargo.toml`: + +```toml +[dependencies] +timscentroid = "0.21" +``` + +For cloud storage support, enable the appropriate features: + +```toml +[dependencies] +timscentroid = { version = "0.21", features = ["aws"] } # For S3 +# or +timscentroid = { version = "0.21", features = ["gcp"] } # For GCS +# or +timscentroid = { version = "0.21", features = ["azure"] } # For Azure +``` + +## Cloud Storage + +### Saving to Cloud + +```rust +// Save to S3 +index.save_to_url("s3://my-bucket/indexed_peaks/")?; + +// Save to Google Cloud Storage +index.save_to_url("gs://my-bucket/indexed_peaks/")?; + +// Save to Azure Blob Storage +index.save_to_url("az://my-container/indexed_peaks/")?; +``` + +### Loading from Cloud + +```rust +// Load from S3 +let lazy_index = LazyIndexedTimstofPeaks::load_from_url( + "s3://my-bucket/indexed_peaks/" +)?; + +// Query works the same as local +let peaks = lazy_index.query_peaks_ms1(mz_range, Unrestricted, Unrestricted); +``` + +### Authentication + +Cloud storage uses default credential chains: + +**AWS S3:** +```bash +# Option 1: AWS credentials file +cat ~/.aws/credentials +#[default] +#aws_access_key_id = YOUR_KEY +#aws_secret_access_key = YOUR_SECRET + +# Option 2: Environment variables +export AWS_ACCESS_KEY_ID=YOUR_KEY +export AWS_SECRET_ACCESS_KEY=YOUR_SECRET + +# Option 3: IAM role (when running on EC2) +``` + +**Google Cloud Storage:** +```bash +# Option 1: Service account key +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account-key.json + +# Option 2: gcloud auth +gcloud auth application-default login +``` + +**Azure Blob Storage:** +```bash +# Option 1: Connection string +export AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;..." + +# Option 2: Account key +export AZURE_STORAGE_ACCOUNT=myaccount +export AZURE_STORAGE_KEY=mykey +``` + +## Async vs Sync API + +The library provides both async and blocking APIs. 
Use async when you're already in an async context: + +### Async (Recommended in async contexts) + +```rust +use timscentroid::lazy::LazyIndexedTimstofPeaks; + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + // Load asynchronously (doesn't block executor thread) + let lazy_index = LazyIndexedTimstofPeaks::load_from_url_async( + "s3://my-bucket/indexed_peaks/" + ).await?; + + // Save an existing index asynchronously + index.save_to_url_async("s3://my-bucket/output/").await?; + + Ok(()) +} +``` + +### Blocking (Simpler for scripts) + +```rust +fn main() -> Result<(), Box<dyn std::error::Error>> { + // Blocks current thread (spawns internal runtime) + let lazy_index = LazyIndexedTimstofPeaks::load_from_url( + "s3://my-bucket/indexed_peaks/" + )?; + + // This also blocks + index.save_to_url("s3://my-bucket/output/")?; + + Ok(()) +} +``` + +## Configuration + +### Serialization Settings + +Control parquet file characteristics: + +```rust +use timscentroid::serialization::SerializationConfig; +use parquet::basic::{Compression, ZstdLevel}; + +let config = SerializationConfig { + row_group_size: 100_000, // Peaks per row group + write_batch_size: 8192, // Rows per write batch + compression: Compression::ZSTD(ZstdLevel::try_new(3)?), +}; + +index.save_to_directory_with_config("./indexed_peaks", config)?; +``` + +**Row group size recommendations:** +- Small datasets (< 1M peaks): 10k-50k per row group +- Medium datasets (1M-10M peaks): 50k-200k per row group +- Large datasets (> 10M peaks): 200k-1M per row group + +Smaller row groups allow more granular queries but add per-group overhead. + +### Centroiding Settings + +```rust +use timscentroid::CentroidingConfig; + +let config = CentroidingConfig { + max_peaks: 20_000, + mz_ppm_tol: 5.0, + ..Default::default() +}; + +let index = IndexedTimstofPeaks::from_timstof_file(&file, config); +``` + +## Working with S3-Compatible Services + +DigitalOcean Spaces, MinIO, Wasabi, and other S3-compatible services work via environment variables: + +```bash +# Set the custom endpoint +export AWS_ENDPOINT_URL="https://nyc3.digitaloceanspaces.com" +export AWS_ACCESS_KEY_ID="your-spaces-key" +export AWS_SECRET_ACCESS_KEY="your-spaces-secret" +export AWS_REGION="nyc3" +``` + +```rust +// Use s3:// URL - will connect to custom endpoint +let index = LazyIndexedTimstofPeaks::load_from_url("s3://my-space/data/")?; +``` + +## Performance Notes + +**Lazy loading initialization:** +- Local: ~20-50ms (reads metadata.json only) +- Cloud: ~100-150ms (includes network round-trip) + +**First query (cold):** +- Local: 20-50ms per row group +- Cloud: ~200ms for concurrent row group fetching + +**Concurrent row group fetching:** +The library automatically fetches multiple parquet row groups in parallel when querying cloud storage, providing ~5x speedup compared to sequential fetching.
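+
+To take advantage of this from your own async code, you can build several query futures and await them together. The following is a minimal sketch, not one of the crate's documented examples: it mirrors the `query_peaks_ms2_async` + `futures::future::join_all` pattern used in the `load_benchmark` example of this PR, and the m/z windows, the `count_peaks` helper, and the already-loaded `index` are placeholders.
+
+```rust
+use futures::future::join_all;
+use timscentroid::lazy::LazyIndexedTimstofPeaks;
+use timscentroid::utils::{TupleRange, OptionallyRestricted::*};
+
+// Hypothetical helper: issue one MS2 query per precursor/fragment window and
+// await them together so row groups can be fetched in parallel.
+async fn count_peaks(index: &LazyIndexedTimstofPeaks) -> usize {
+    // Placeholder windows (precursor m/z, fragment m/z)
+    let windows = [(650.0_f32, 700.0_f32), (700.0, 750.0), (750.0, 800.0)];
+
+    let tasks: Vec<_> = windows
+        .iter()
+        .map(|(prec, frag)| {
+            let prec = TupleRange::try_new(*prec, *prec + 0.05).unwrap();
+            let frag = TupleRange::try_new(*frag, *frag + 0.05).unwrap();
+            index.query_peaks_ms2_async(prec, frag, Unrestricted, Unrestricted)
+        })
+        .collect();
+
+    // All futures are awaited concurrently; errors are skipped here for brevity
+    let results = join_all(tasks).await;
+    results
+        .iter()
+        .filter_map(|r| r.as_ref().ok())
+        .flat_map(|groups| groups.iter())
+        .map(|(_wg, peaks)| peaks.len())
+        .sum()
+}
+```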
+ +## Examples + +### Query with Filters + +```rust +use timscentroid::utils::{TupleRange, OptionallyRestricted::*}; + +let mz_range = TupleRange::try_new(400.0, 500.0)?; +let rt_range = TupleRange::try_new(100, 200)?; // Cycle indices +let im_range = TupleRange::try_new( + half::f16::from_f32(0.8), + half::f16::from_f32(1.2) +)?; + +let peaks: Vec<_> = lazy_index + .query_peaks_ms1( + mz_range, + Restricted(rt_range), + Restricted(im_range) + ) + .collect(); + +println!("Found {} peaks", peaks.len()); +``` + +### Multiple Queries (Reuse Index) + +```rust +// Load once +let lazy_index = LazyIndexedTimstofPeaks::load_from_directory("./indexed_peaks")?; + +// Query multiple times (efficient) +for window_start in (400..=1000).step_by(100) { + let mz_range = TupleRange::try_new(window_start as f32, (window_start + 100) as f32)?; + let peaks: Vec<_> = lazy_index + .query_peaks_ms1(mz_range, Unrestricted, Unrestricted) + .collect(); + println!("m/z {}-{}: {} peaks", window_start, window_start + 100, peaks.len()); +} +``` + +## Troubleshooting + +### "Unsupported URL scheme" Error + +Enable the appropriate feature flag in `Cargo.toml`: + +```toml +timscentroid = { version = "0.21", features = ["aws"] } +``` + +### Authentication Errors + +Verify credentials are configured: + +```bash +# AWS +aws s3 ls s3://my-bucket/ + +# GCP +gcloud auth application-default login + +# Azure +az storage blob list --account-name myaccount --container-name mycontainer +``` + +### Slow Cloud Performance + +- Ensure you're in the same region as your data +- Check row group size configuration +- Monitor cloud provider logs for rate limiting +- Consider using a cloud VM near your data + +## Architecture + +**Storage abstraction:** Uses the `object_store` crate for unified local/cloud access + +**Async runtime:** Single global Tokio current-thread runtime (~1.5MB), created lazily on first use + +**API design:** Async methods are primary, blocking wrappers provided for convenience + +**Zero-cost abstraction:** No runtime overhead for storage operations, feature flags only control URL parsing diff --git a/rust/timscentroid/examples/load_benchmark.rs b/rust/timscentroid/examples/load_benchmark.rs new file mode 100644 index 0000000..e4a0248 --- /dev/null +++ b/rust/timscentroid/examples/load_benchmark.rs @@ -0,0 +1,210 @@ +/// Compare local vs S3 lazy loading performance +use half::f16; +use timscentroid::StorageLocation; +use timscentroid::lazy::LazyIndexedTimstofPeaks; +use timscentroid::utils::{ + OptionallyRestricted, + TupleRange, +}; + +use OptionallyRestricted::{ + Restricted, + Unrestricted, +}; + +fn main() { + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); + + let serialized_dir = std::path::Path::new("./serialized_peaks"); + let s3_url = "s3://terraform-workstations-bucket/jspaezp/serialized_peaks_test"; + + if !serialized_dir.exists() { + eprintln!("Error: Run `cargo run --example serialization --release` first"); + std::process::exit(1); + } + + println!("Lazy Loading Benchmark: Local vs S3\n"); + const NUM_ITERATIONS: usize = 5; + + // Benchmark init times + println!("Local init:"); + let local_times = benchmark_init(serialized_dir, "Local", NUM_ITERATIONS); + let local_avg = avg(&local_times); + + println!("\nS3 init ({}):", s3_url); + let s3_times = benchmark_init_s3(s3_url, "S3", NUM_ITERATIONS); + let s3_avg = avg(&s3_times); + + println!( + "\nInit time - Local: {:.2?}, S3: {:.2?} (Local is {:.1}x faster)", + local_avg, + s3_avg, + 
s3_avg.as_secs_f64() / local_avg.as_secs_f64() + ); + + // Load indices for query tests + let index_local = + LazyIndexedTimstofPeaks::load_from_storage(StorageLocation::from_path(serialized_dir)) + .unwrap() + .with_instrumentation("Local"); + let index_s3 = + LazyIndexedTimstofPeaks::load_from_storage(StorageLocation::from_url(s3_url).unwrap()) + .unwrap() + .with_instrumentation("S3"); + + println!("\nQuery tests:"); + compare_results(&index_local, &index_s3); +} + +fn benchmark_init(dir: &std::path::Path, label: &str, n: usize) -> Vec<std::time::Duration> { + (0..n) + .map(|i| { + let start = std::time::Instant::now(); + let _index = + LazyIndexedTimstofPeaks::load_from_storage(StorageLocation::from_path(dir)) + .unwrap() + .with_instrumentation(label); + let elapsed = start.elapsed(); + println!(" {}: {:.2?}", i + 1, elapsed); + elapsed + }) + .collect() +} + +fn benchmark_init_s3(url: &str, label: &str, n: usize) -> Vec<std::time::Duration> { + (0..n) + .map(|i| { + let start = std::time::Instant::now(); + let _index = + LazyIndexedTimstofPeaks::load_from_storage(StorageLocation::from_url(url).unwrap()) + .unwrap() + .with_instrumentation(label); + let elapsed = start.elapsed(); + println!(" {}: {:.2?}", i + 1, elapsed); + elapsed + }) + .collect() +} + +fn avg(times: &[std::time::Duration]) -> std::time::Duration { + times.iter().sum::<std::time::Duration>() / times.len() as u32 +} + +fn compare_results(index_local: &LazyIndexedTimstofPeaks, index_s3: &LazyIndexedTimstofPeaks) { + let queries = build_queries(&mut rand::rng()); + + // Sync queries + println!("\n Sync API:"); + let local_sync = test_querying(index_local, &queries); + println!( + " Local: {} peaks, {:.2?}/query", + local_sync.npeaks, local_sync.query_time + ); + let s3_sync = test_querying(index_s3, &queries); + println!( + " S3: {} peaks, {:.2?}/query", + s3_sync.npeaks, s3_sync.query_time + ); + + // Async queries + println!("\n Async API:"); + let runtime = tokio::runtime::Runtime::new().unwrap(); + let local_async = runtime.block_on(test_querying_async(index_local, &queries)); + println!( + " Local: {} peaks, {:.2?}/query", + local_async.npeaks, local_async.query_time + ); + let s3_async = runtime.block_on(test_querying_async(index_s3, &queries)); + println!( + " S3: {} peaks, {:.2?}/query", + s3_async.npeaks, s3_async.query_time + ); + + println!("\n Speedup (async vs sync):"); + println!( + " Local: {:.1}x", + local_sync.query_time.as_secs_f64() / local_async.query_time.as_secs_f64() + ); + println!( + " S3: {:.1}x", + s3_sync.query_time.as_secs_f64() / s3_async.query_time.as_secs_f64() + ); + + index_local.print_metrics("Local"); + index_s3.print_metrics("S3"); +} + +#[derive(Debug)] +struct QueryResult { + npeaks: usize, + query_time: std::time::Duration, +} + +fn test_querying( + index: &LazyIndexedTimstofPeaks, + queries: &[((f32, f32), (f32, f32), (f16, f16))], +) -> QueryResult { + let start = std::time::Instant::now(); + let mut npeaks = 0; + + for (prec, frag, im) in queries { + let prec = TupleRange::try_new(prec.0, prec.1).unwrap(); + let frag = TupleRange::try_new(frag.0, frag.1).unwrap(); + let im = TupleRange::try_new(im.0, im.1).unwrap(); + for (_wg, peaks_vec) in index.query_peaks_ms2(prec, frag, Unrestricted, Restricted(im)) { + npeaks += peaks_vec.len(); + } + } + + QueryResult { + npeaks, + query_time: start.elapsed() / queries.len() as u32, + } +} + +async fn test_querying_async( + index: &LazyIndexedTimstofPeaks, + queries: &[((f32, f32), (f32, f32), (f16, f16))], +) -> QueryResult { + let start = std::time::Instant::now(); + + let tasks: Vec<_> = queries
+ .iter() + .map(|(prec, frag, im)| { + let prec = TupleRange::try_new(prec.0, prec.1).unwrap(); + let frag = TupleRange::try_new(frag.0, frag.1).unwrap(); + let im = TupleRange::try_new(im.0, im.1).unwrap(); + index.query_peaks_ms2_async(prec, frag, Unrestricted, Restricted(im)) + }) + .collect(); + + let all_results = futures::future::join_all(tasks).await; + let npeaks = all_results + .iter() + .filter_map(|r| r.as_ref().ok()) + .flat_map(|results| results.iter()) + .map(|(_wg, peaks)| peaks.len()) + .sum(); + + QueryResult { + npeaks, + query_time: start.elapsed() / queries.len() as u32, + } +} + +fn build_queries(rng: &mut impl rand::Rng) -> Vec<((f32, f32), (f32, f32), (f16, f16))> { + (0..10) + .map(|_| { + let prec_start: f32 = rng.random_range(600.0..800.0); + let frag_start: f32 = rng.random_range(600.0..800.0); + let im_start = f16::from_f32(rng.random_range(0.7..1.1)); + ( + (prec_start, prec_start + 0.05), + (frag_start, frag_start + 0.05), + (im_start, im_start + f16::from_f32(0.1)), + ) + }) + .collect() +} diff --git a/rust/timscentroid/examples/local_instrumented_benchmark.rs b/rust/timscentroid/examples/local_instrumented_benchmark.rs new file mode 100644 index 0000000..2217440 --- /dev/null +++ b/rust/timscentroid/examples/local_instrumented_benchmark.rs @@ -0,0 +1,201 @@ +use OptionallyRestricted::{ + Restricted, + Unrestricted, +}; +use half::f16; +use rand::SeedableRng; +use timscentroid::StorageLocation; +use timscentroid::lazy::LazyIndexedTimstofPeaks; +use timscentroid::utils::{ + OptionallyRestricted, + TupleRange, +}; + +fn main() { + let serialized_dir = std::path::Path::new("./serialized_peaks"); + + // Check if serialized data exists locally + if !serialized_dir.exists() { + eprintln!("Error: Serialized data not found at {:?}", serialized_dir); + eprintln!("Please run `cargo run --example serialization --release` first"); + std::process::exit(1); + } + + println!("=== Local Instrumented Benchmark ===\n"); + + // Configuration + const NUM_QUERIES: usize = 100; // Increased from 10 now that we're fast! 
+ const RANDOM_SEED: u64 = 42; // Deterministic seed for reproducibility + + // Load index with instrumentation + println!("Loading lazy index..."); + let location = StorageLocation::from_path(serialized_dir); + let index = LazyIndexedTimstofPeaks::load_from_storage(location) + .unwrap() + .with_instrumentation("Local Benchmark"); + + // Generate deterministic test queries + let queries = { + let mut rng = rand::rngs::StdRng::seed_from_u64(RANDOM_SEED); + build_queries(&mut rng, NUM_QUERIES) + }; + + println!("Running {} queries (Sync API)...\n", queries.len()); + + // Run sync queries + let start = std::time::Instant::now(); + let mut total_peaks = 0; + let mut total_intensity = 0.0; + + for (i, (prec, frag, im)) in queries.iter().enumerate() { + let prec = TupleRange::try_new(prec.0, prec.1).unwrap(); + let frag = TupleRange::try_new(frag.0, frag.1).unwrap(); + let im = TupleRange::try_new(im.0, im.1).unwrap(); + + let results = index.query_peaks_ms2(prec, frag, Unrestricted, Restricted(im)); + + for (_wg, peaks_vec) in results { + for peak in peaks_vec { + total_intensity += peak.intensity as f64; + total_peaks += 1; + } + } + + if i == 0 { + // Print metrics after first query to see initialization overhead + println!("=== After First Query ==="); + index.print_metrics("First Query"); + println!(); + } + } + + let elapsed = start.elapsed(); + println!("\n=== Final Results ==="); + println!("Total time: {:?}", elapsed); + println!("Time per query: {:?}", elapsed / queries.len() as u32); + println!("Total peaks: {}", total_peaks); + println!( + "Peaks per query: {:.2}", + total_peaks as f64 / queries.len() as f64 + ); + println!("Total intensity: {:.2e}", total_intensity); + + println!("\n=== Final Metrics ==="); + index.print_metrics("All Queries"); + + // Now test async API + println!("\n\n=== Testing Async API ===\n"); + + let location = StorageLocation::from_path(serialized_dir); + let index_async = LazyIndexedTimstofPeaks::load_from_storage(location) + .unwrap() + .with_instrumentation("Local Async Benchmark"); + + // Regenerate same queries for async test + let queries_async = { + let mut rng = rand::rngs::StdRng::seed_from_u64(RANDOM_SEED); + build_queries(&mut rng, NUM_QUERIES) + }; + + let runtime = tokio::runtime::Runtime::new().unwrap(); + + println!( + "Running {} queries concurrently (Async API)...\n", + queries_async.len() + ); + + let start = std::time::Instant::now(); + let (total_peaks_async, total_intensity_async) = runtime.block_on(async { + let mut tasks = Vec::new(); + + for (prec, frag, im) in &queries_async { + let prec = TupleRange::try_new(prec.0, prec.1).unwrap(); + let frag = TupleRange::try_new(frag.0, frag.1).unwrap(); + let im = TupleRange::try_new(im.0, im.1).unwrap(); + + tasks.push(index_async.query_peaks_ms2_async(prec, frag, Unrestricted, Restricted(im))); + } + + let all_results = futures::future::join_all(tasks).await; + + let mut total_peaks = 0; + let mut total_intensity = 0.0; + + for result in all_results { + match result { + Ok(results) => { + for (_wg, peaks_vec) in results { + for peak in peaks_vec { + total_intensity += peak.intensity as f64; + total_peaks += 1; + } + } + } + Err(e) => { + eprintln!("Query error: {}", e); + } + } + } + + (total_peaks, total_intensity) + }); + + let elapsed_async = start.elapsed(); + + println!("\n=== Async API Results ==="); + println!("Total time: {:?}", elapsed_async); + println!("Time per query: {:?}", elapsed_async / queries.len() as u32); + println!("Total peaks: {}", total_peaks_async); + println!( + 
"Peaks per query: {:.2}", + total_peaks_async as f64 / queries.len() as f64 + ); + println!("Total intensity: {:.2e}", total_intensity_async); + + println!("\n=== Async API Metrics ==="); + index_async.print_metrics("All Async Queries"); + + // Compare + println!("\n=== Performance Comparison ==="); + let speedup = elapsed.as_secs_f64() / elapsed_async.as_secs_f64(); + println!("Async is {:.2}x faster than sync", speedup); + + // Verify correctness + if total_peaks == total_peaks_async && (total_intensity - total_intensity_async).abs() < 0.001 { + println!("✓ Sync and async results match!"); + } else { + println!("✗ Results differ!"); + println!( + " Sync: {} peaks, {:.2e} intensity", + total_peaks, total_intensity + ); + println!( + " Async: {} peaks, {:.2e} intensity", + total_peaks_async, total_intensity_async + ); + } +} + +fn build_queries( + rng: &mut impl rand::Rng, + num_queries: usize, +) -> Vec<((f32, f32), (f32, f32), (f16, f16))> { + let mut out = Vec::with_capacity(num_queries); + for _ in 0..num_queries { + let prec_start: f32 = rng.random_range(600.0..800.0); + let prec_end = prec_start + 0.05; + let prec = (prec_start, prec_end).try_into().unwrap(); + + let frag_start: f32 = rng.random_range(600.0..800.0); + let frag_end = frag_start + 0.05; + let frag = (frag_start, frag_end).try_into().unwrap(); + + let im_start_i: f32 = rng.random_range(0.7..1.1); + let im_start: f16 = f16::from_f32(im_start_i); + let im_end = im_start + (f16::from_f32(0.1f32)); + let im = (im_start, im_end).try_into().unwrap(); + + out.push((prec, frag, im)); + } + out +} diff --git a/rust/timscentroid/examples/row_group_benchmark.rs b/rust/timscentroid/examples/row_group_benchmark.rs new file mode 100644 index 0000000..cb63466 --- /dev/null +++ b/rust/timscentroid/examples/row_group_benchmark.rs @@ -0,0 +1,232 @@ +/// Benchmark different row group sizes for Parquet serialization +/// +/// Compares disk usage, load times, and query efficiency across different row group sizes +/// to find the optimal balance for lazy loading from cloud storage. 
+use half::f16; +use rand::SeedableRng; +use timscentroid::lazy::LazyIndexedTimstofPeaks; +use timscentroid::serialization::SerializationConfig; +use timscentroid::utils::{ + OptionallyRestricted, + TupleRange, +}; +use timscentroid::{ + CentroidingConfig, + IndexedTimstofPeaks, + StorageLocation, +}; +use timsrust::TimsTofPath; + +use OptionallyRestricted::{ + Restricted, + Unrestricted, +}; + +const NUM_QUERIES: usize = 500; + +fn main() { + const DATA_FILE: &str = + "/Users/sebastianpaez/data/decompressed_timstof/250225_Desnaux_200ng_Hela_ICC_on_DIA.d/"; + const RANDOM_SEED: u64 = 42; + + let row_group_sizes = vec![ + ("4K", 4_096), + ("10K", 10_000), + ("50K", 50_000), + ("100K", 100_000), + ("500K", 500_000), + ("1M", 1_000_000), + ]; + + println!( + "Row Group Size Benchmark - {} queries per config", + NUM_QUERIES + ); + println!("Data: {}\n", DATA_FILE); + + // Load and index data + let file = TimsTofPath::new(DATA_FILE).expect("Failed to open TimsTOF file"); + let config = CentroidingConfig { + max_peaks: 20_000, + mz_ppm_tol: 5.0, + im_pct_tol: 3.0, + early_stop_iterations: 200, + }; + + let start = std::time::Instant::now(); + let (index, stats) = IndexedTimstofPeaks::from_timstof_file(&file, config); + println!("Indexed in {:?}: {}\n", start.elapsed(), stats); + + // Generate deterministic queries + let queries = { + let mut rng = rand::rngs::StdRng::seed_from_u64(RANDOM_SEED); + build_queries(&mut rng, NUM_QUERIES) + }; + + // Test each row group size + let mut results = Vec::new(); + for (label, row_group_size) in &row_group_sizes { + println!("Testing {} ({} peaks/group)", label, row_group_size); + let result = test_row_group_size(&index, *row_group_size, label, &queries); + results.push((label.to_string(), *row_group_size, result)); + } + + print_results(&results); +} + +#[derive(Debug, Clone)] +struct BenchmarkResult { + disk_mb: f64, + eager_load_ms: f64, + lazy_init_ms: f64, + query_ms: f64, + bytes_per_peak: f64, + num_gets: usize, +} + +fn test_row_group_size( + index: &IndexedTimstofPeaks, + row_group_size: usize, + label: &str, + queries: &[((f32, f32), (f32, f32), (f16, f16))], +) -> BenchmarkResult { + let output_dir = std::path::PathBuf::from(format!("./benchmark_rg_{}", label)); + if output_dir.exists() { + std::fs::remove_dir_all(&output_dir).unwrap(); + } + + // Serialize + let config = SerializationConfig { + compression: parquet::basic::Compression::SNAPPY, + row_group_size, + write_batch_size: 8192, + }; + index + .save_to_directory_with_config(&output_dir, config) + .unwrap(); + + let disk_mb = calculate_directory_size(&output_dir).unwrap() as f64 / 1_048_576.0; + + // Eager load + let start = std::time::Instant::now(); + let location = StorageLocation::from_path(&output_dir); + let _eager = IndexedTimstofPeaks::load_from_storage(location.clone()).unwrap(); + let eager_load_ms = start.elapsed().as_secs_f64() * 1000.0; + + // Lazy load + queries + let start = std::time::Instant::now(); + let lazy_index = LazyIndexedTimstofPeaks::load_from_storage(location) + .unwrap() + .with_instrumentation(format!("RG_{}", label)); + let lazy_init_ms = start.elapsed().as_secs_f64() * 1000.0; + + let query_start = std::time::Instant::now(); + let mut total_peaks = 0; + for (prec, frag, im) in queries { + let prec = TupleRange::try_new(prec.0, prec.1).unwrap(); + let frag = TupleRange::try_new(frag.0, frag.1).unwrap(); + let im = TupleRange::try_new(im.0, im.1).unwrap(); + let results = lazy_index.query_peaks_ms2(prec, frag, Unrestricted, Restricted(im)); + for (_wg, 
peaks_vec) in results { + total_peaks += peaks_vec.len(); + } + } + let query_ms = query_start.elapsed().as_secs_f64() * 1000.0 / queries.len() as f64; + + let metrics = lazy_index.metrics().unwrap().snapshot(); + let bytes_per_peak = metrics.bytes_read as f64 / total_peaks.max(1) as f64; + + println!( + " {:.1} MB, eager: {:.0}ms, lazy: {:.0}ms, query: {:.2}ms, {:.0} B/peak, {} GETs", + disk_mb, eager_load_ms, lazy_init_ms, query_ms, bytes_per_peak, metrics.get_count + ); + + std::fs::remove_dir_all(&output_dir).unwrap(); + + BenchmarkResult { + disk_mb, + eager_load_ms, + lazy_init_ms, + query_ms, + bytes_per_peak, + num_gets: metrics.get_count, + } +} + +fn print_results(results: &[(String, usize, BenchmarkResult)]) { + println!( + "\n{:<8} {:<10} {:<10} {:<10} {:<10} {:<12} {:<6}", + "Size", "Disk(MB)", "Eager(ms)", "Lazy(ms)", "Query(ms)", "Bytes/Peak", "GETs" + ); + println!("{}", "-".repeat(72)); + + for (label, _size, r) in results { + println!( + "{:<8} {:<10.1} {:<10.0} {:<10.0} {:<10.2} {:<12.0} {:<6}", + label, + r.disk_mb, + r.eager_load_ms, + r.lazy_init_ms, + r.query_ms, + r.bytes_per_peak, + r.num_gets + ); + } + + // Find best configurations + let best_disk = results + .iter() + .min_by(|a, b| a.2.disk_mb.partial_cmp(&b.2.disk_mb).unwrap()) + .unwrap(); + let best_query = results + .iter() + .min_by(|a, b| a.2.bytes_per_peak.partial_cmp(&b.2.bytes_per_peak).unwrap()) + .unwrap(); + + println!( + "\nBest disk usage: {} ({:.1} MB)", + best_disk.0, best_disk.2.disk_mb + ); + println!( + "Best query efficiency: {} ({:.0} bytes/peak, {} GETs)", + best_query.0, best_query.2.bytes_per_peak, best_query.2.num_gets + ); + + println!("\nS3 estimated (150ms/GET):"); + for (label, _size, r) in results { + let s3_time = (r.num_gets as f64 * 150.0) + r.query_ms; + println!(" {:<8} -> ~{:.0} ms/query", label, s3_time); + } +} + +fn build_queries( + rng: &mut impl rand::Rng, + num_queries: usize, +) -> Vec<((f32, f32), (f32, f32), (f16, f16))> { + (0..num_queries) + .map(|_| { + let prec_start: f32 = rng.random_range(600.0..800.0); + let frag_start: f32 = rng.random_range(600.0..800.0); + let im_start = f16::from_f32(rng.random_range(0.7..1.1)); + ( + (prec_start, prec_start + 0.04), + (frag_start, frag_start + 0.04), + (im_start, im_start + f16::from_f32(0.1)), + ) + }) + .collect() +} + +fn calculate_directory_size(path: &std::path::Path) -> std::io::Result { + let mut total_size = 0u64; + for entry in std::fs::read_dir(path)? 
{ + let entry = entry?; + let metadata = entry.metadata()?; + if metadata.is_file() { + total_size += metadata.len(); + } else if metadata.is_dir() { + total_size += calculate_directory_size(&entry.path())?; + } + } + Ok(total_size) +} diff --git a/rust/timscentroid/examples/serialization.rs b/rust/timscentroid/examples/serialization.rs index 2655149..1cb4883 100644 --- a/rust/timscentroid/examples/serialization.rs +++ b/rust/timscentroid/examples/serialization.rs @@ -3,6 +3,7 @@ use timscentroid::utils::OptionallyRestricted; use timscentroid::{ CentroidingConfig, IndexedTimstofPeaks, + StorageLocation, }; use timsrust::TimsTofPath; @@ -66,7 +67,8 @@ fn main() { // Deserialize from disk println!("\n=== Deserialization ==="); let start_read = std::time::Instant::now(); - let index_loaded = IndexedTimstofPeaks::load_from_directory(output_dir).unwrap(); + let location = StorageLocation::from_path(output_dir); + let index_loaded = IndexedTimstofPeaks::load_from_storage(location).unwrap(); let read_time = start_read.elapsed(); println!("Deserialization time: {:?}", read_time); @@ -80,7 +82,10 @@ fn main() { "Speedup (load vs centroid): {:.1}x faster", centroid_time.as_secs_f64() / read_time.as_secs_f64() ); + compare_results(&index_original, &index_loaded); +} +fn compare_results(index_original: &IndexedTimstofPeaks, index_loaded: &IndexedTimstofPeaks) { // Test querying on both to ensure functional equivalence println!("\n=== Query Test (Original) ==="); let original_result = test_querying(&index_original); diff --git a/rust/timscentroid/src/geometry.rs b/rust/timscentroid/src/geometry.rs index b69e4a3..5b943c6 100644 --- a/rust/timscentroid/src/geometry.rs +++ b/rust/timscentroid/src/geometry.rs @@ -136,7 +136,16 @@ impl QuadrupoleIsolationScheme { } pub(crate) fn from_quad(quad: &QuadrupoleSettings, ims_converter: impl Fn(f64) -> f64) -> Self { - let geom = quad_to_geometry(quad, ims_converter); + let xxyys = quad_to_xxyy(quad, ims_converter); + let geom = xxyys_to_geometry(&xxyys); + Self { inner: geom } + } + + /// Creates a QuadrupoleIsolationScheme from an iterator of (mz_start, mz_end, im_start, im_end) tuples. + /// Each tuple represents a quadrupole window. + pub fn from_xxyy>(iter: T) -> Self { + let xxyys: Vec<(f64, f64, f64, f64)> = iter.collect(); + let geom = xxyys_to_geometry(&xxyys); Self { inner: geom } } } @@ -147,27 +156,15 @@ fn connect_edges(left: &[(f64, f64)], right: &[(f64, f64)]) -> Polygon { Polygon::new(all_points.into(), vec![]) } -// We let geo-rs do the heavy lifting of geometry operations -// so we just make possibly a lot of boxes and then let it simplify -// the geometry for us. 
-fn quad_to_geometry( +fn quad_to_xxyy( quad: &QuadrupoleSettings, ims_converter: impl Fn(f64) -> f64, -) -> MultiPolygon { +) -> Vec<(f64, f64, f64, f64)> { assert!(quad.scan_starts.len() == quad.scan_ends.len()); assert!(quad.scan_starts.len() == quad.isolation_mz.len()); assert!(quad.scan_starts.len() == quad.isolation_width.len()); assert!(quad.scan_starts.len() == quad.collision_energy.len()); - if quad.scan_starts.is_empty() { - return MultiPolygon(vec![]); - } - let mut polygons = Vec::new(); - let mut curr_left_edge = Vec::new(); - let mut curr_right_edge = Vec::new(); - - let mut last_scan_end: Option = None; - let mut last_quad_range: Option<(f64, f64)> = None; - + let mut result = Vec::new(); for i in 0..quad.scan_starts.len() { let scan_start = quad.scan_starts[i]; let scan_end = quad.scan_ends[i]; @@ -177,7 +174,26 @@ fn quad_to_geometry( let mz_width = quad.isolation_width[i]; let mz_start = mz_center - mz_width / 2.0; let mz_end = mz_center + mz_width / 2.0; + result.push((mz_start, mz_end, im_start, im_end)); + } + result +} +// We let geo-rs do the heavy lifting of geometry operations +// so we just make possibly a lot of boxes and then let it simplify +// the geometry for us. +fn xxyys_to_geometry(xxyys: &[(f64, f64, f64, f64)]) -> MultiPolygon { + if xxyys.is_empty() { + return MultiPolygon(vec![]); + } + let mut polygons = Vec::new(); + let mut curr_left_edge = Vec::new(); + let mut curr_right_edge = Vec::new(); + + let mut last_mobility_end: Option = None; + let mut last_quad_range: Option<(f64, f64)> = None; + + for (mz_start, mz_end, im_start, im_end) in xxyys.iter().copied() { let mut flush_polygon = false; if let Some((last_mz_start, last_mz_end)) = last_quad_range { @@ -188,11 +204,11 @@ fn quad_to_geometry( } } - if let Some(last_end) = last_scan_end { + if let Some(last_end) = last_mobility_end { // If the current im_start is less than the last_im_end, // we have an overlap in the IMS dimension. 
// We also need the quads to overlap in the mz dimension - if scan_start != last_end { + if im_start != last_end { flush_polygon = true; } } @@ -212,7 +228,7 @@ fn quad_to_geometry( curr_right_edge.push((mz_end, im_start)); curr_right_edge.push((mz_end, im_end)); - last_scan_end = Some(quad.scan_ends[i]); + last_mobility_end = Some(im_end); last_quad_range = Some((mz_start, mz_end)); } @@ -250,7 +266,8 @@ mod tests { isolation_width: vec![40.0, 40.0], collision_energy: vec![20.0, 22.0], }; - let geom = quad_to_geometry(&dia_pasef_window, dummy_ims_converter); + let xxyys = quad_to_xxyy(&dia_pasef_window, dummy_ims_converter); + let geom = xxyys_to_geometry(&xxyys); println!("Geometry: {:?}", geom); assert_eq!(geom.0.len(), 2); } @@ -271,9 +288,56 @@ mod tests { isolation_width: vec![20., 20., 20., 20., 20., 20.], collision_energy: vec![20.0, 22.0, 24.0, 26.0, 28.0, 30.0], }; - let geom = quad_to_geometry(&dia_pasef_window, dummy_ims_converter); + let geom = quad_to_xxyy(&dia_pasef_window, dummy_ims_converter); + let geom = xxyys_to_geometry(&geom); println!("Geometry: {:?}", geom); assert_eq!(geom.0.len(), 1); + + // Extract that one polygon + let poly = &geom.0[0]; + let coords: Vec<(f64, f64)> = poly.exterior().coords().map(|c| (c.x, c.y)).collect(); + println!("Coords: {:?}", coords); + let max_ims = dia_pasef_window + .scan_starts + .first() + .map(|s| dummy_ims_converter(*s as f64)) + .unwrap(); + let min_ims = dia_pasef_window + .scan_ends + .last() + .map(|s| dummy_ims_converter(*s as f64)) + .unwrap(); + + // Since this layout is in essence a trapezoid, it gets summarized to 5 points + // (the simplification removes intermediate points) + // The first two edges are the left side (mz 90-110 at max ims to min ims) + // The next two edges are the right side (mz 85-105 at min ims to max ims) + // (note: this connects the last point of the first edge to the first point of the second edge) + // And the last point closes the polygon back to the start + let expect = [ + (90.0, max_ims), + (85.0, min_ims), + (105.0, min_ims), + (110.0, max_ims), + (90.0, max_ims), + ]; + + for i in 0..expect.len() { + let (exp_mz, exp_im) = expect[i]; + let (got_mz, got_im) = coords[i]; + assert!( + (exp_mz - got_mz).abs() < 0.001, + "Expected mz {:.2}, got {:.2}", + exp_mz, + got_mz + ); + assert!( + (exp_im - got_im).abs() < 0.001, + "Expected im {:.4}, got {:.4}", + exp_im, + got_im + ); + } } #[test] diff --git a/rust/timscentroid/src/indexing.rs b/rust/timscentroid/src/indexing.rs index 192b02f..28d30c2 100644 --- a/rust/timscentroid/src/indexing.rs +++ b/rust/timscentroid/src/indexing.rs @@ -14,7 +14,14 @@ use timsrust::{ FramePeaks, Metadata, }; +use tracing::instrument; +use crate::rt_mapping::{ + CycleToRTMapping, + MS1CycleIndex, + RTIndex, + WindowCycleIndex, +}; use crate::utils::OptionallyRestricted::{ Restricted, Unrestricted, @@ -43,11 +50,11 @@ use crate::geometry::QuadrupoleIsolationScheme; // Which is not horrendous tbh ... #[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)] -pub struct IndexedPeak { +pub struct IndexedPeak { pub mz: f32, pub intensity: f32, pub mobility_ook0: f16, - pub cycle_index: u32, + pub cycle_index: T, } /// Main struct of the whole crate. @@ -60,8 +67,11 @@ pub struct IndexedPeak { /// 2. 
Query the peaks using [IndexedTimstofPeaks::query_peaks_ms1] or [IndexedTimstofPeaks::query_peaks_ms2] #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] pub struct IndexedTimstofPeaks { - pub(crate) ms2_window_groups: Vec<(QuadrupoleIsolationScheme, IndexedPeakGroup)>, - pub(crate) ms1_peaks: IndexedPeakGroup, + pub(crate) ms2_window_groups: Vec<( + QuadrupoleIsolationScheme, + IndexedPeakGroup, + )>, + pub(crate) ms1_peaks: IndexedPeakGroup, } /// Statistics about the indexing process. @@ -87,6 +97,36 @@ impl std::fmt::Display for IndexBuildingStats { } impl IndexedTimstofPeaks { + /// Create an IndexedTimstofPeaks from pre-built components. + /// + /// This is useful for testing - you can build the MS1 and MS2 groups + /// using `IndexedPeakGroup::new()` with mock data, then combine them here. + /// + /// # Example + /// ```ignore + /// let ms1_peaks = vec![/* ... */]; + /// let (ms1_group, _) = IndexedPeakGroup::new(ms1_peaks, cycle_to_rt, 4096); + /// + /// let ms2_groups = vec![ + /// (quad_geometry1, ms2_group1), + /// (quad_geometry2, ms2_group2), + /// ]; + /// + /// let index = IndexedTimstofPeaks::from_parts(ms1_group, ms2_groups); + /// ``` + pub fn from_parts( + ms1_peaks: IndexedPeakGroup, + ms2_window_groups: Vec<( + QuadrupoleIsolationScheme, + IndexedPeakGroup, + )>, + ) -> Self { + Self { + ms1_peaks, + ms2_window_groups, + } + } + pub fn from_timstof_file( file: &TimsTofPath, centroiding_config: CentroidingConfig, @@ -159,9 +199,9 @@ impl IndexedTimstofPeaks { pub fn query_peaks_ms1( &self, mz_range: TupleRange, - cycle_range: OptionallyRestricted>, + cycle_range: OptionallyRestricted>, im_range: OptionallyRestricted>, - ) -> impl Iterator { + ) -> impl Iterator> { self.ms1_peaks.query_peaks(mz_range, cycle_range, im_range) } @@ -181,17 +221,17 @@ impl IndexedTimstofPeaks { &self, precursor_range_mz: TupleRange, mz_range: TupleRange, - cycle_range: OptionallyRestricted>, + cycle_range: OptionallyRestricted>, im_range: OptionallyRestricted>, ) -> impl Iterator< Item = ( &QuadrupoleIsolationScheme, - impl Iterator, + impl Iterator>, ), > { self.filter_precursor_ranges(precursor_range_mz, im_range) .map(move |(wg_info, peak_group)| { - let im_range = im_range.map(|r| { + let local_im_range = im_range.map(|r| { wg_info .intersects_ranges( ( @@ -209,7 +249,7 @@ impl IndexedTimstofPeaks { ( wg_info, - peak_group.query_peaks(mz_range, cycle_range, im_range), + peak_group.query_peaks(mz_range, cycle_range, local_im_range), ) }) } @@ -218,7 +258,12 @@ impl IndexedTimstofPeaks { &self, precursor_range_mz: TupleRange, ion_mobility_range: OptionallyRestricted>, - ) -> impl Iterator { + ) -> impl Iterator< + Item = &( + QuadrupoleIsolationScheme, + IndexedPeakGroup, + ), + > { let f64_mz_range = ( precursor_range_mz.start() as f64, precursor_range_mz.end() as f64, @@ -230,11 +275,12 @@ impl IndexedTimstofPeaks { }) } - pub fn rt_ms_to_cycle_index(&self, rt_ms: u32) -> u32 { - // Q: do I need to clamp to 0-max cycle index? - self.ms1_peaks - .cycle_to_rt_ms - .partition_point(|x| *x <= rt_ms) as u32 + pub fn ms1_cycle_mapping(&self) -> &CycleToRTMapping { + &self.ms1_peaks.cycle_to_rt_ms + } + + pub fn rt_ms_to_cycle_index(&self, rt_ms: u32) -> MS1CycleIndex { + self.ms1_peaks.cycle_to_rt_ms.ms_to_closest_index(rt_ms) } /// Read MS1 frames and return an IndexedPeakGroup along with building stats. 
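// Illustrative sketch (not part of this diff): converting retention times to the new
// typed MS1 cycle indices and using them to restrict an MS1 query. The m/z window and
// RT values are made up; `Restricted`/`Unrestricted` are the `OptionallyRestricted`
// variants and `try_into` builds a `TupleRange` from an index pair, as in the tests.
let start = index.rt_ms_to_cycle_index(60_000);  // ~1 min
let end = index.rt_ms_to_cycle_index(120_000);   // ~2 min
let n_peaks = index
    .query_peaks_ms1(
        TupleRange::try_new(500.0, 500.02).unwrap(),
        Restricted((start, end).try_into().unwrap()),
        Unrestricted,
    )
    .count();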
@@ -244,7 +290,10 @@ impl IndexedTimstofPeaks { frame_reader: &FrameReader, metadata: &Metadata, centroiding_config: CentroidingConfig, - ) -> (IndexedPeakGroup, IndexedPeakGroupBuildingStats) { + ) -> ( + IndexedPeakGroup, + IndexedPeakGroupBuildingStats, + ) { IndexedPeakGroup::read_with_filter( frame_reader, metadata, @@ -259,7 +308,10 @@ impl IndexedTimstofPeaks { centroiding_config: CentroidingConfig, ) -> Result< ( - Vec<(QuadrupoleIsolationScheme, IndexedPeakGroup)>, + Vec<( + QuadrupoleIsolationScheme, + IndexedPeakGroup, + )>, IndexedPeakGroupBuildingStats, ), (), @@ -305,20 +357,22 @@ impl IndexedTimstofPeaks { /// Represents a group of indexed peaks, organized into buckets based on m/z ranges. /// Each bucket internally sorted by retention time (rt). #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct IndexedPeakGroup { - pub(crate) peaks: Vec, +pub struct IndexedPeakGroup { + pub(crate) peaks: Vec>, pub(crate) bucket_mz_ranges: Vec>, pub(crate) bucket_size: usize, - pub(crate) cycle_to_rt_ms: Vec, + pub(crate) cycle_to_rt_ms: CycleToRTMapping, } #[derive(Debug, Clone, Copy)] -struct PeakBucket<'a> { - inner: &'a [IndexedPeak], +struct PeakBucket<'a, T: RTIndex> { + inner: &'a [IndexedPeak], } -impl<'a> PeakBucket<'a> { - fn find_cycle_range(&'a self, cycle_range: TupleRange) -> std::ops::Range { +impl<'a, T: RTIndex> PeakBucket<'a, T> { + // Returns a range such that all peaks within self.inner[range] + // have cycle_index within the provided cycle_range. + fn find_cycle_range(&'a self, cycle_range: TupleRange) -> std::ops::Range { let start_idx = self .inner .partition_point(|x| x.cycle_index < cycle_range.start()); @@ -332,8 +386,11 @@ impl<'a> PeakBucket<'a> { } } -impl<'a> From<&'a [IndexedPeak]> for PeakBucket<'a> { - fn from(value: &'a [IndexedPeak]) -> Self { +impl<'a, T: RTIndex> From<&'a [IndexedPeak]> for PeakBucket<'a, T> { + /// NOTE: by calling this you are ASSURING that the input slice is sorted by cycle_index + /// If this is not the case the behavior of methods on PeakBucket is undefined. + /// or will panic! + fn from(value: &'a [IndexedPeak]) -> Self { assert!(value.first().unwrap().cycle_index <= value.last().unwrap().cycle_index); Self { inner: value } } @@ -404,53 +461,99 @@ impl IndexedPeakGroupBuildingStats { } } -impl IndexedPeakGroup { +impl IndexedPeakGroup { /// Query peaks based on m/z, rt, and im ranges. pub fn query_peaks( &self, mz_range: TupleRange, - cycle_range: OptionallyRestricted>, + cycle_range: OptionallyRestricted>, im_range: OptionallyRestricted>, - ) -> impl Iterator { + ) -> impl Iterator> { QueryPeaksIterator::new(self, mz_range, cycle_range, im_range) } /// Create a new IndexedPeakGroup from a vector of peaks. /// + /// This is the **canonical** way to build an IndexedPeakGroup - all bucketing + /// logic lives here to ensure consistency. + /// /// NOTE: This internally uses `par_sort_unstable` to sort the peaks /// so in theory it should not be called within a parallel loop. + #[instrument(level = "info", skip_all, fields(num_peaks = peaks.len()))] pub(crate) fn new( - mut peaks: Vec, - cycle_to_rt_ms: Vec, + mut peaks: Vec>, + cycle_to_rt_ms: CycleToRTMapping, bucket_size: usize, ) -> (Self, IndexedPeakGroupStats) { + // TODO: Implement a "try_from_parts" that does not do any sorting or bucketing + // but requires the user to pass the ranges. + // I need to add serialization to that so we can read it. 
let st = std::time::Instant::now(); - peaks.par_sort_unstable_by(|x, y| x.mz.partial_cmp(&y.mz).unwrap()); - let sort_time = st.elapsed(); - assert!(peaks.first().unwrap().mz <= peaks.last().unwrap().mz); + + // This is meant to be a "fast-pass-check" in case the peaks had been previously sorted + // into buckets (like using ::new and then ::unpack/::pack or serialization/deserialization) + let needs_sorting = !Self::check_bucket_sorted_heuristic(&peaks, bucket_size); + // let needs_sorting = true; + if needs_sorting { + peaks.par_sort_unstable_by(|x, y| x.mz.partial_cmp(&y.mz).unwrap()); + // If we have a single bucket that has already been sorted in "index order" + // the last can be less than the first! (since the re-sort internally sorts by cycle_index) + assert!( + peaks.first().unwrap().mz <= peaks.last().unwrap().mz, + "Peaks should be sorted by m/z after sorting step {:?} - {:?} [{};{}]", + peaks.first().unwrap(), + peaks.last().unwrap(), + peaks.len(), + bucket_size + ); + } + // These are a bit slow tbh ... + // But I really want to be alerted if something is wrong + // and at the end of the day its less than 40ms even for large datasets + // and happens max 20 times per file. assert!(peaks.iter().all(|x| x.intensity >= 0.0)); - assert!( - peaks - .iter() - .all(|x| x.cycle_index < cycle_to_rt_ms.len() as u32) - ); + let max_cycle = T::new(cycle_to_rt_ms.len() as u32 - 1); + assert!(peaks.iter().all(|x| x.cycle_index <= max_cycle)); + let sort_time = st.elapsed(); let st = std::time::Instant::now(); let bucket_mz_ranges: Vec<_> = peaks .par_chunks_mut(bucket_size) .map(|chunk| { - let start = chunk.first().unwrap().mz; - let end = chunk.last().unwrap().mz; - chunk.sort_unstable_by(|x, y| { - x.cycle_index - .partial_cmp(&y.cycle_index) - .unwrap() - .then(x.mobility_ook0.partial_cmp(&y.mobility_ook0).unwrap()) - }); + let (start, end) = if needs_sorting { + let start = chunk.first().unwrap().mz; + let end = chunk.last().unwrap().mz; + chunk.sort_unstable_by(|x, y| { + x.cycle_index + .partial_cmp(&y.cycle_index) + .unwrap() + .then(x.mobility_ook0.partial_cmp(&y.mobility_ook0).unwrap()) + }); + (start, end) + } else { + let mut start = f32::MAX; + let mut end = f32::MIN; + for peak in chunk.iter() { + if peak.mz < start { + start = peak.mz; + } + if peak.mz > end { + end = peak.mz; + } + } + (start, end) + }; TupleRange::try_new(start, end).expect("Incoming vec should have been sorted") }) .collect(); + if needs_sorting { + assert!( + Self::check_bucket_sorted_heuristic(&peaks, bucket_size), + "Peaks should be bucket sorted after bucketing step" + ); + } + let bucket_time = st.elapsed(); let tmp = Self { @@ -469,11 +572,87 @@ impl IndexedPeakGroup { (tmp, stats) } + #[instrument(level = "info", skip_all, fields(num_peaks = peaks.len(), result))] + /// After some benchmarking this can take ~30ms in the worst case for large datasets. + /// where the data is already in the correct order. + /// The fail fast path (most first-time builds will be unsorted) is 17us. + pub fn check_bucket_sorted_heuristic(peaks: &[IndexedPeak], bucket_size: usize) -> bool { + // We check that: + // 1. the max value of each bucket is <= min value of the next bucket + // 2. 
each bucket is sorted by cycle_index + let mut last_max = f32::MIN; + let buckets_ordered = peaks.chunks(bucket_size).all(|bucket| { + let curr_min = bucket + .iter() + .map(|x| x.mz) + .fold(f32::INFINITY, |a, b| a.min(b)); + if curr_min < last_max { + return false; + } + let curr_max = bucket + .iter() + .map(|x| x.mz) + .fold(f32::NEG_INFINITY, |a, b| a.max(b)); + last_max = curr_max; + true + }); + if !buckets_ordered { + tracing::Span::current().record("result", false); + return false; + } + + let internally_sorted_heuristic: bool = + peaks.chunks(bucket_size).enumerate().all(|(i, x)| { + if i == 0 { + // Do the full check for the first bucket + return x.windows(2).all(|w| w[0].cycle_index <= w[1].cycle_index); + } + let first_val = x.first().unwrap().cycle_index; + let last_val = x.last().unwrap().cycle_index; + last_val >= first_val + }); + + let res = internally_sorted_heuristic && buckets_ordered; + tracing::Span::current().record("result", res); + res + } + + pub fn unpack(self) -> (Vec>, Vec, usize, Vec>) { + let Self { + peaks, + bucket_mz_ranges, + bucket_size, + cycle_to_rt_ms, + } = self; + ( + peaks, + cycle_to_rt_ms.unpack(), + bucket_size, + bucket_mz_ranges, + ) + } + + /// Create a new IndexedPeakGroup for testing purposes. + /// + /// I only make this public for testing - in real use cases + /// You should never be used + #[doc(hidden)] + pub fn testing_new( + peaks: Vec>, + cycle_to_rt_ms: CycleToRTMapping, + bucket_size: usize, + ) -> (Self, IndexedPeakGroupStats) { + if cfg!(test) { + panic!("Not intended to run in production code") + } + Self::new(peaks, cycle_to_rt_ms, bucket_size) + } + fn aproximate_memory_usage(&self) -> usize { let self_mem = std::mem::size_of::(); let bucket_mem = std::mem::size_of::>() * (self.bucket_mz_ranges.capacity()); - let peak_mem = self.peaks.capacity() * std::mem::size_of::(); + let peak_mem = self.peaks.capacity() * std::mem::size_of::>(); self_mem + bucket_mem + peak_mem } @@ -523,10 +702,7 @@ impl IndexedPeakGroup { } /// Read frames from a FrameReader that match a given filter function. - #[tracing::instrument( - level = "debug", - skip(frame_reader, metadata, filter, centroiding_config) - )] + #[tracing::instrument(level = "debug", skip_all)] fn read_with_filter( frame_reader: &FrameReader, metadata: &Metadata, @@ -599,13 +775,13 @@ impl IndexedPeakGroup { mz, intensity, mobility_ook0, - cycle_index, + cycle_index: T::new(cycle_index), } }); // let res_arc = all_peaks.clone(); // let mut res = res_arc.lock().unwrap(); // res.extend(tmp); - Ok((tmp.collect::>(), reason)) + Ok((tmp.collect::>>(), reason)) }, ) .map(|x| match x { @@ -640,7 +816,8 @@ impl IndexedPeakGroup { // In theory there should be no zeros ... // 2**12 = 4096 peaks per bucket - let (out, stats) = IndexedPeakGroup::new(inner_vec, cycle_to_rt_ms, 2usize.pow(12)); + let cycle_mapping = CycleToRTMapping::new(cycle_to_rt_ms); + let (out, stats) = IndexedPeakGroup::new(inner_vec, cycle_mapping, 2usize.pow(12)); ( out, IndexedPeakGroupBuildingStats { @@ -664,7 +841,7 @@ impl IndexedPeakGroup { /// Get a specific bucket by its index. /// Returns None if the index is out of bounds. 
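// Illustrative sketch (not part of this diff): the crate-internal unpack/rebuild round
// trip that the bucket-sorted fast path is meant to serve (`new` is pub(crate), so this
// only applies inside the crate). Because the unpacked peaks are already bucket-ordered,
// the heuristic lets the rebuild skip the parallel m/z re-sort.
let (peaks, rt_ms, bucket_size, _mz_ranges) = group.unpack();
let (_rebuilt, _stats) = IndexedPeakGroup::new(peaks, CycleToRTMapping::new(rt_ms), bucket_size);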
- fn get_bucket(&self, bucket_idx: usize) -> Option> { + fn get_bucket(&self, bucket_idx: usize) -> Option> { self.get_bucket_range(bucket_idx) .map(|r| PeakBucket::from(&self.peaks[r])) } @@ -682,7 +859,7 @@ impl IndexedPeakGroup { fn print_glimpse(&self) { let num_buckets = self.bucket_mz_ranges.len(); let num_peaks = self.peaks.len(); - let mem_usage = num_peaks * std::mem::size_of::(); + let mem_usage = num_peaks * std::mem::size_of::>(); println!("IndexedPeakGroup Glimpse:"); println!(" Number of peaks: {}", num_peaks); println!(" Number of buckets: {}", num_buckets); @@ -698,23 +875,23 @@ impl IndexedPeakGroup { /// from attempting a flat map over multiple iterators that borrow from self /// when querying the peaks. #[derive(Debug)] -struct QueryPeaksIterator<'a> { - indexed_window_group: &'a IndexedPeakGroup, +struct QueryPeaksIterator<'a, T: RTIndex> { + indexed_window_group: &'a IndexedPeakGroup, mz_range: TupleRange, - cycle_range: OptionallyRestricted>, + cycle_range: OptionallyRestricted>, im_range: OptionallyRestricted>, bucket_idx: usize, bucket_end: usize, position_in_bucket: usize, end_of_current_bucket: usize, - current_bucket: Option>, + current_bucket: Option>, } -impl<'a> QueryPeaksIterator<'a> { +impl<'a, T: RTIndex> QueryPeaksIterator<'a, T> { pub fn new( - indexed_window_group: &'a IndexedPeakGroup, + indexed_window_group: &'a IndexedPeakGroup, mz_range: TupleRange, - cycle_range: OptionallyRestricted>, + cycle_range: OptionallyRestricted>, im_range: OptionallyRestricted>, ) -> Self { let bucket_range = indexed_window_group.query_bucket_range(mz_range); @@ -733,7 +910,10 @@ impl<'a> QueryPeaksIterator<'a> { } } -impl<'a> QueryPeaksIterator<'a> { +impl<'a, T: RTIndex> QueryPeaksIterator<'a, T> { + // Advance to the next bucket that matches the m/z range. + // None if there are no more buckets. + // Returns the index of the advanced bucket. fn advance_bucket(&mut self) -> Option { if self.bucket_idx >= self.bucket_end { return None; @@ -758,7 +938,7 @@ impl<'a> QueryPeaksIterator<'a> { Some(self.bucket_idx - 1) } - fn next_in_current_bucket(&mut self) -> Option<&'a IndexedPeak> { + fn next_in_current_bucket(&mut self) -> Option<&'a IndexedPeak> { // Use a loop instead of recursion to avoid stack overflow when many // consecutive peaks don't match the filter criteria (common in dense spectra). 
while let Some(bucket) = self.current_bucket.as_ref() { @@ -783,8 +963,8 @@ impl<'a> QueryPeaksIterator<'a> { } } -impl<'a> Iterator for QueryPeaksIterator<'a> { - type Item = &'a IndexedPeak; +impl<'a, T: RTIndex> Iterator for QueryPeaksIterator<'a, T> { + type Item = &'a IndexedPeak; fn next(&mut self) -> Option { loop { @@ -805,13 +985,13 @@ mod tests { use super::*; use half::f16; - fn tuples_to_peaks(data: &[(f32, f32, f32, u32)]) -> Vec { + fn tuples_to_peaks(data: &[(f32, f32, f32, u32)]) -> Vec> { data.iter() - .map(|&(mz, intensity, im, cycle_index)| IndexedPeak { + .map(|&(mz, intensity, im, cycle_index)| IndexedPeak:: { mz, intensity, mobility_ook0: f16::from_f32(im), - cycle_index, + cycle_index: T::new(cycle_index), }) .collect() } @@ -828,13 +1008,13 @@ mod tests { (100.0, 400.0, 1.0, 4u32), ]; - let peaks = tuples_to_peaks(&test_data); - let bucket = PeakBucket::from(&peaks[..]); + let peaks = tuples_to_peaks::(&test_data); + let bucket = PeakBucket::::from(&peaks[..]); // let rt_range: std::ops::Range = 1900..4100; - let cycle_range = (1, 3); + let cycle_range = (MS1CycleIndex::new(1), MS1CycleIndex::new(3)); let rt_idx_range = bucket.find_cycle_range(cycle_range.try_into().unwrap()); let out: Vec<_> = bucket.inner[rt_idx_range].to_vec(); - let expected = tuples_to_peaks(&[ + let expected = tuples_to_peaks::(&[ (100.0, 250.0, 1.0, 1), (100.0, 250.0, 1.0, 1), (100.0, 250.0, 1.0, 1), @@ -844,4 +1024,54 @@ mod tests { assert_eq!(out, expected); } + + #[test] + fn test_checking_bucketing() { + let test_data = vec![ + // mz, intensity, im, cycle_index + // So everything here is sorted by both mz and cycle_index + (100.0, 200.0, 1.0, 0u32), + (100.0, 250.0, 1.0, 1u32), + (100.0, 250.0, 1.0, 1u32), + (100.0, 250.0, 1.0, 1u32), + (150.0, 300.0, 1.0, 2u32), + (150.0, 350.0, 1.0, 3u32), + (200.0, 400.0, 1.0, 4u32), + ]; + let peaks = tuples_to_peaks::(&test_data); + let check_res = IndexedPeakGroup::check_bucket_sorted_heuristic(&peaks, 4); + assert!(check_res); + + let test_data = vec![ + // mz, intensity, im, cycle_index + // all mz's from bucket N are less than (or equal to) all mz's from bucket N+1 + (200.0, 200.0, 1.0, 0u32), + (100.0, 250.0, 1.0, 2u32), + (50.0, 250.0, 1.0, 23u32), + // End bucket 1 (internally sorted by cycle, not mz) + (2000.0, 250.0, 1.0, 1u32), + (1900.0, 300.0, 1.0, 2u32), + (1800.0, 350.0, 1.0, 3u32), + // End bucket 2 + (20_000.0, 400.0, 1.0, 4u32), + ]; + let peaks = tuples_to_peaks::(&test_data); + let check_res = IndexedPeakGroup::check_bucket_sorted_heuristic(&peaks, 3); + assert!(check_res); + let check_res = IndexedPeakGroup::check_bucket_sorted_heuristic(&peaks, 2); + // This should fail because in bucket 2 we have mz's less than in bucket 1 + assert!(!check_res); + // Any bucket number other than 3 should fail + for bucket_size in [1, 2, 4, 5, 10] { + if bucket_size != 3 { + let check_res = + IndexedPeakGroup::check_bucket_sorted_heuristic(&peaks, bucket_size); + assert!( + !check_res, + "Bucket size {} should fail the check", + bucket_size + ); + } + } + } } diff --git a/rust/timscentroid/src/instrumentation.rs b/rust/timscentroid/src/instrumentation.rs new file mode 100644 index 0000000..431aefd --- /dev/null +++ b/rust/timscentroid/src/instrumentation.rs @@ -0,0 +1,382 @@ +//! Instrumentation and observability for storage operations +//! +//! This module provides wrappers around ObjectStore that track metrics: +//! - Number of GET/PUT/HEAD operations +//! - Total bytes transferred +//! - Time spent in each operation +//! 
- Detailed operation logs + +use async_trait::async_trait; +use bytes::Bytes; +use futures::stream::BoxStream; +use object_store::path::Path; +use object_store::{ + GetOptions, + GetResult, + ListResult, + MultipartUpload, + ObjectMeta, + ObjectStore, + PutMultipartOptions, + PutOptions, + PutPayload, + PutResult, + Result, +}; +use std::fmt::Display; +use std::ops::Range; +use std::sync::Arc; +use std::sync::atomic::{ + AtomicU64, + AtomicUsize, + Ordering, +}; + +/// Metrics collected during storage operations +#[derive(Debug, Clone)] +pub struct StorageMetrics { + // Operation counts + pub get_count: Arc, + pub put_count: Arc, + pub head_count: Arc, + pub delete_count: Arc, + pub list_count: Arc, + + // Bytes transferred + pub bytes_read: Arc, + pub bytes_written: Arc, + + // Time spent (in microseconds) + pub get_time_us: Arc, + pub put_time_us: Arc, + pub head_time_us: Arc, +} + +impl Default for StorageMetrics { + fn default() -> Self { + Self::new() + } +} + +impl StorageMetrics { + pub fn new() -> Self { + Self { + get_count: Arc::new(AtomicUsize::new(0)), + put_count: Arc::new(AtomicUsize::new(0)), + head_count: Arc::new(AtomicUsize::new(0)), + delete_count: Arc::new(AtomicUsize::new(0)), + list_count: Arc::new(AtomicUsize::new(0)), + bytes_read: Arc::new(AtomicU64::new(0)), + bytes_written: Arc::new(AtomicU64::new(0)), + get_time_us: Arc::new(AtomicU64::new(0)), + put_time_us: Arc::new(AtomicU64::new(0)), + head_time_us: Arc::new(AtomicU64::new(0)), + } + } + + /// Reset all metrics to zero + pub fn reset(&self) { + self.get_count.store(0, Ordering::SeqCst); + self.put_count.store(0, Ordering::SeqCst); + self.head_count.store(0, Ordering::SeqCst); + self.delete_count.store(0, Ordering::SeqCst); + self.list_count.store(0, Ordering::SeqCst); + self.bytes_read.store(0, Ordering::SeqCst); + self.bytes_written.store(0, Ordering::SeqCst); + self.get_time_us.store(0, Ordering::SeqCst); + self.put_time_us.store(0, Ordering::SeqCst); + self.head_time_us.store(0, Ordering::SeqCst); + } + + /// Get snapshot of current metrics + pub fn snapshot(&self) -> MetricsSnapshot { + MetricsSnapshot { + get_count: self.get_count.load(Ordering::SeqCst), + put_count: self.put_count.load(Ordering::SeqCst), + head_count: self.head_count.load(Ordering::SeqCst), + delete_count: self.delete_count.load(Ordering::SeqCst), + list_count: self.list_count.load(Ordering::SeqCst), + bytes_read: self.bytes_read.load(Ordering::SeqCst), + bytes_written: self.bytes_written.load(Ordering::SeqCst), + get_time_us: self.get_time_us.load(Ordering::SeqCst), + put_time_us: self.put_time_us.load(Ordering::SeqCst), + head_time_us: self.head_time_us.load(Ordering::SeqCst), + } + } +} + +/// Immutable snapshot of metrics +#[derive(Debug, Clone, Copy)] +pub struct MetricsSnapshot { + pub get_count: usize, + pub put_count: usize, + pub head_count: usize, + pub delete_count: usize, + pub list_count: usize, + pub bytes_read: u64, + pub bytes_written: u64, + pub get_time_us: u64, + pub put_time_us: u64, + pub head_time_us: u64, +} + +impl MetricsSnapshot { + pub fn print_report(&self, label: &str) { + println!("\n=== Storage Metrics: {} ===", label); + println!("Operations:"); + println!( + " GET: {:>6} calls, {:>10.2} MB, {:>8.2} ms total, {:>6.2} ms/call", + self.get_count, + self.bytes_read as f64 / 1_024_000.0, + self.get_time_us as f64 / 1_000.0, + if self.get_count > 0 { + self.get_time_us as f64 / 1_000.0 / self.get_count as f64 + } else { + 0.0 + } + ); + println!( + " HEAD: {:>6} calls, {:>8.2} ms total, {:>6.2} ms/call", + 
self.head_count, + self.head_time_us as f64 / 1_000.0, + if self.head_count > 0 { + self.head_time_us as f64 / 1_000.0 / self.head_count as f64 + } else { + 0.0 + } + ); + println!( + " PUT: {:>6} calls, {:>10.2} MB, {:>8.2} ms total", + self.put_count, + self.bytes_written as f64 / 1_024_000.0, + self.put_time_us as f64 / 1_000.0 + ); + println!(" DELETE: {:>6} calls", self.delete_count); + println!(" LIST: {:>6} calls", self.list_count); + println!("\nTotals:"); + println!( + " Total operations: {}", + self.get_count + self.put_count + self.head_count + self.delete_count + self.list_count + ); + println!( + " Total bytes read: {:.2} MB", + self.bytes_read as f64 / 1_024_000.0 + ); + println!( + " Total bytes written: {:.2} MB", + self.bytes_written as f64 / 1_024_000.0 + ); + println!( + " Total I/O time: {:.2} ms", + (self.get_time_us + self.put_time_us + self.head_time_us) as f64 / 1_000.0 + ); + } +} + +/// ObjectStore wrapper that tracks metrics +pub struct InstrumentedStore { + inner: Arc, + metrics: Arc, + label: String, +} + +impl InstrumentedStore { + pub fn new( + inner: Arc, + metrics: Arc, + label: impl Into, + ) -> Self { + Self { + inner, + metrics, + label: label.into(), + } + } + + pub fn metrics(&self) -> Arc { + self.metrics.clone() + } + + pub fn print_metrics(&self) { + self.metrics.snapshot().print_report(&self.label); + } +} + +impl std::fmt::Debug for InstrumentedStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "InstrumentedStore({})", self.label) + } +} + +impl Display for InstrumentedStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "InstrumentedStore({})", self.label) + } +} + +#[async_trait] +impl ObjectStore for InstrumentedStore { + async fn put(&self, location: &Path, payload: PutPayload) -> Result { + let start = std::time::Instant::now(); + let bytes_len = payload.content_length(); + + let result = self.inner.put(location, payload).await; + + let elapsed = start.elapsed(); + self.metrics.put_count.fetch_add(1, Ordering::SeqCst); + self.metrics + .bytes_written + .fetch_add(bytes_len as u64, Ordering::SeqCst); + self.metrics + .put_time_us + .fetch_add(elapsed.as_micros() as u64, Ordering::SeqCst); + + result + } + + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + let start = std::time::Instant::now(); + let bytes_len = payload.content_length(); + + let result = self.inner.put_opts(location, payload, opts).await; + + let elapsed = start.elapsed(); + self.metrics.put_count.fetch_add(1, Ordering::SeqCst); + self.metrics + .bytes_written + .fetch_add(bytes_len as u64, Ordering::SeqCst); + self.metrics + .put_time_us + .fetch_add(elapsed.as_micros() as u64, Ordering::SeqCst); + + result + } + + async fn get(&self, location: &Path) -> Result { + let start = std::time::Instant::now(); + let result = self.inner.get(location).await; + let elapsed = start.elapsed(); + + self.metrics.get_count.fetch_add(1, Ordering::SeqCst); + self.metrics + .get_time_us + .fetch_add(elapsed.as_micros() as u64, Ordering::SeqCst); + + // Track bytes read when the result is consumed + if let Ok(get_result) = &result { + let bytes_len = get_result.meta.size; + self.metrics + .bytes_read + .fetch_add(bytes_len, Ordering::SeqCst); + } + + result + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let start = std::time::Instant::now(); + let result = self.inner.get_opts(location, options).await; + let elapsed = 
start.elapsed(); + + self.metrics.get_count.fetch_add(1, Ordering::SeqCst); + self.metrics + .get_time_us + .fetch_add(elapsed.as_micros() as u64, Ordering::SeqCst); + + if let Ok(get_result) = &result { + let bytes_len = get_result.meta.size; + self.metrics + .bytes_read + .fetch_add(bytes_len, Ordering::SeqCst); + } + + result + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let start = std::time::Instant::now(); + let result = self.inner.get_range(location, range.clone()).await; + let elapsed = start.elapsed(); + + self.metrics.get_count.fetch_add(1, Ordering::SeqCst); + self.metrics + .get_time_us + .fetch_add(elapsed.as_micros() as u64, Ordering::SeqCst); + + if let Ok(bytes) = &result { + self.metrics + .bytes_read + .fetch_add(bytes.len() as u64, Ordering::SeqCst); + } + + result + } + + async fn head(&self, location: &Path) -> Result { + let start = std::time::Instant::now(); + let result = self.inner.head(location).await; + let elapsed = start.elapsed(); + + self.metrics.head_count.fetch_add(1, Ordering::SeqCst); + self.metrics + .head_time_us + .fetch_add(elapsed.as_micros() as u64, Ordering::SeqCst); + + result + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.metrics.delete_count.fetch_add(1, Ordering::SeqCst); + self.inner.delete(location).await + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { + self.metrics.list_count.fetch_add(1, Ordering::SeqCst); + self.inner.list(prefix) + } + + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'static, Result> { + self.metrics.list_count.fetch_add(1, Ordering::SeqCst); + self.inner.list_with_offset(prefix, offset) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + self.metrics.list_count.fetch_add(1, Ordering::SeqCst); + self.inner.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.inner.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.inner.copy_if_not_exists(from, to).await + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.inner.rename_if_not_exists(from, to).await + } + + async fn put_multipart(&self, location: &Path) -> Result> { + self.metrics.put_count.fetch_add(1, Ordering::SeqCst); + self.inner.put_multipart(location).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOptions, + ) -> Result> { + self.metrics.put_count.fetch_add(1, Ordering::SeqCst); + self.inner.put_multipart_opts(location, opts).await + } +} diff --git a/rust/timscentroid/src/lazy/mod.rs b/rust/timscentroid/src/lazy/mod.rs new file mode 100644 index 0000000..cdb97d6 --- /dev/null +++ b/rust/timscentroid/src/lazy/mod.rs @@ -0,0 +1,573 @@ +//! Lazy loading for indexed timsTOF peaks. +//! +//! # Example +//! +//! ```no_run +//! use timscentroid::lazy::LazyIndexedTimstofPeaks; +//! use timscentroid::utils::{TupleRange, OptionallyRestricted::*}; +//! use timscentroid::StorageLocation; +//! +//! # fn main() -> Result<(), Box> { +//! // Fast initialization: loads only metadata.json +//! let location = StorageLocation::from_path("indexed_peaks/"); +//! let lazy_index = LazyIndexedTimstofPeaks::load_from_storage(location)?; +//! +//! // Query peaks (loads needed row groups on first access) +//! let peaks: Vec<_> = lazy_index.query_peaks_ms1( +//! TupleRange::try_new(400.0, 500.0).unwrap(), // m/z range +//! 
Unrestricted, // all retention times +//! Unrestricted, // all ion mobilities +//! ).collect(); +//! # Ok(()) +//! # } +//! ``` +//! +//! # Future Optimizations +//! +//! The current implementation uses the existing parquet file layout where row groups +//! (~2M peaks) don't align with buckets (4096 peaks). This causes some over-reading. +//! +//! Future optimization: Reorganize parquet files to align bucket boundaries with row +//! group boundaries. This would enable more precise loading without per-bucket statistics, +//! at the cost of more row groups (smaller row group size). +pub mod query; + +use crate::geometry::QuadrupoleIsolationScheme; +use crate::indexing::IndexedPeak; +use crate::lazy::query::ParquetQuerier; +use crate::rt_mapping::{ + MS1CycleIndex, + RTIndex, + WindowCycleIndex, +}; +use crate::serialization::{ + PeakGroupMetadata, + SerializationError, + TimscentroidMetadata, +}; +use crate::storage::{ + RUNTIME, + StorageLocation, + StorageProvider, +}; +use crate::utils::{ + OptionallyRestricted, + TupleRange, +}; +use half::f16; +use std::fmt::Debug; +use std::sync::Arc; + +/// Lazy-loading indexed peaks with on-demand row group loading +#[derive(Clone)] +pub struct LazyIndexedTimstofPeaks { + storage: StorageProvider, + ms1_metadata: PeakGroupMetadata, + ms2_metadata: Vec<( + QuadrupoleIsolationScheme, + PeakGroupMetadata, + )>, + // OPTIMIZATION: Pre-initialized queriers for reuse across queries + // This eliminates the need to create new queriers (and fetch metadata) on every query + ms1_querier: Arc, + ms2_queriers: Vec>, +} + +impl Debug for LazyIndexedTimstofPeaks { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LazyIndexedTimstofPeaks") + .field("storage", &self.storage) + .field("ms1_metadata", &self.ms1_metadata) + .field("ms2_metadata", &self.ms2_metadata) + .finish() + } +} + +impl LazyIndexedTimstofPeaks { + /// Enable instrumentation for this lazy index + /// + /// This wraps the storage provider with metrics tracking and recreates all queriers + /// with the instrumented storage. Returns a new instance with instrumentation enabled. 
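// Illustrative sketch (not part of this diff): enabling instrumentation on a lazy index
// and reporting the collected storage metrics after querying. The URL and label here are
// made up for the example.
let lazy = LazyIndexedTimstofPeaks::load_from_url("s3://my-bucket/indexed_peaks/")?
    .with_instrumentation("s3-run");
// ... run query_peaks_ms1 / query_peaks_ms2_async calls here ...
lazy.print_metrics("s3-run");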
+ pub fn with_instrumentation(mut self, label: impl Into) -> Self { + // Wrap the storage with instrumentation + self.storage = self.storage.with_instrumentation(label); + + // CRITICAL: Recreate all queriers with the instrumented storage + // The old queriers hold references to the unwrapped storage and would bypass instrumentation + + // Recreate MS1 querier + let ms1_relative_path = self.ms1_metadata.relative_path.to_str().unwrap(); + self.ms1_querier = Arc::new( + RUNTIME + .block_on(ParquetQuerier::new_async( + self.storage.clone(), + ms1_relative_path, + )) + .expect("Failed to recreate MS1 querier with instrumentation"), + ); + + // Recreate MS2 queriers + self.ms2_queriers.clear(); + for (_, group_metadata) in &self.ms2_metadata { + let relative_path = group_metadata.relative_path.to_str().unwrap(); + let querier = Arc::new( + RUNTIME + .block_on(ParquetQuerier::new_async( + self.storage.clone(), + relative_path, + )) + .expect("Failed to recreate MS2 querier with instrumentation"), + ); + self.ms2_queriers.push(querier); + } + + self + } + + /// Print storage metrics if instrumentation is enabled + pub fn print_metrics(&self, label: &str) { + self.storage.print_metrics(label); + } + + /// Get storage metrics if instrumentation is enabled + pub fn metrics(&self) -> Option<&crate::instrumentation::StorageMetrics> { + self.storage.metrics() + } +} + +impl LazyIndexedTimstofPeaks { + /// Load from cloud storage URL (async version) + /// + /// Supports s3://, gs://, az:// URLs depending on enabled feature flags. + pub async fn load_from_url_async(url: impl AsRef) -> Result { + let location = StorageLocation::from_url(url)?; + Self::load_from_storage_async(location).await + } + + /// Load from cloud storage URL (blocking version) + /// + /// This blocks the current thread. If you're in an async context, + /// prefer `load_from_url_async`. + pub fn load_from_url(url: impl AsRef) -> Result { + RUNTIME.block_on(Self::load_from_url_async(url)) + } + + /// Load from any storage location (async version) + pub async fn load_from_storage_async( + location: StorageLocation, + ) -> Result { + let start = std::time::Instant::now(); + + let storage = StorageProvider::new(location)?; + + // Read metadata.json (using async version) + let metadata_json = storage.read_to_string_async("metadata.json").await?; + let meta: TimscentroidMetadata = serde_json::from_str(&metadata_json)?; + + // Validate version + if meta.version != "1.0" { + // This might be over-engineered for now but something tells me we might want to + // support multiple versions in the future ... 
+ return Err(SerializationError::SchemaVersionMismatch { + expected: "1.0", + found: meta.version, + }); + } + + let ms2_metadata: Vec<( + QuadrupoleIsolationScheme, + PeakGroupMetadata, + )> = meta + .ms2_window_groups + .into_iter() + .map(|m| (m.quadrupole_isolation, m.group_info)) + .collect(); + + // OPTIMIZATION: Create queriers during initialization for reuse + // This eliminates the need to create queriers (and fetch metadata) on every query + + // Create MS1 querier + let ms1_relative_path = meta.ms1_peaks.relative_path.to_str().unwrap(); + let ms1_querier = Arc::new( + ParquetQuerier::new_async(storage.clone(), ms1_relative_path) + .await + .map_err(|e| { + SerializationError::Io(std::io::Error::other(format!( + "Error creating MS1 querier: {}", + e + ))) + })?, + ); + + // Create MS2 queriers for each window group + let mut ms2_queriers = Vec::with_capacity(ms2_metadata.len()); + for (_, group_metadata) in &ms2_metadata { + let relative_path = group_metadata.relative_path.to_str().unwrap(); + let querier = Arc::new( + ParquetQuerier::new_async(storage.clone(), relative_path) + .await + .map_err(|e| { + SerializationError::Io(std::io::Error::other(format!( + "Error creating MS2 querier: {}", + e + ))) + })?, + ); + ms2_queriers.push(querier); + } + + let elapsed = start.elapsed(); + eprintln!("Lazy loading initialization: {:?}", elapsed); + + Ok(Self { + storage, + ms1_metadata: meta.ms1_peaks, + ms2_metadata, + ms1_querier, + ms2_queriers, + }) + } + + /// Load from any storage location (blocking version) + /// + /// This blocks the current thread. If you're in an async context, + /// prefer `load_from_storage_async`. + pub fn load_from_storage(location: StorageLocation) -> Result { + RUNTIME.block_on(Self::load_from_storage_async(location)) + } + + pub fn ms1_metadata(&self) -> &PeakGroupMetadata { + &self.ms1_metadata + } + + pub fn ms1_cycle_mapping(&self) -> &crate::rt_mapping::CycleToRTMapping { + &self.ms1_metadata.cycle_to_rt_ms + } + + pub fn rt_ms_to_cycle_index(&self, rt_ms: u32) -> MS1CycleIndex { + self.ms1_metadata.cycle_to_rt_ms.ms_to_closest_index(rt_ms) + } +} + +impl LazyIndexedTimstofPeaks { + /// Query MS1 peaks with lazy loading + /// + /// This method performs query planning to identify relevant row groups, + /// loads them on-demand (with caching), and returns an iterator over matching peaks. + pub fn query_peaks_ms1( + &self, + mz_range: TupleRange, + cycle_range: OptionallyRestricted>, + im_range: OptionallyRestricted>, + ) -> impl Iterator> { + let relative_path = self.ms1_metadata.relative_path.to_str().unwrap(); + self.query_peaks_file(relative_path, mz_range, cycle_range, im_range) + } + + /// Query peaks from a parquet file asynchronously (async helper) + /// + /// OPTIMIZATION: Takes a pre-created querier instead of creating one, + /// eliminating metadata fetching on every query + async fn query_peaks_file_with_querier_async( + &self, + querier: &ParquetQuerier, + mz_range: TupleRange, + cycle_range: OptionallyRestricted>, + im_range: OptionallyRestricted>, + ) -> Result>, SerializationError> { + // Use the provided querier (no metadata fetch!) 
+ + let mz_range_range = mz_range.start()..mz_range.end(); + let im_range_opt = match im_range { + OptionallyRestricted::Restricted(r) => Some(r.start()..r.end()), + OptionallyRestricted::Unrestricted => None, + }; + + // Query asynchronously + let record_batch = querier + .query_async(mz_range_range, im_range_opt) + .await + .map_err(|e| { + SerializationError::Io(std::io::Error::other(format!( + "Error querying Parquet: {}", + e + ))) + })?; + + // Convert RecordBatch to Vec of IndexedPeak + let peaks: Vec<_> = record_batch + .column_by_name("mz") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .zip( + record_batch + .column_by_name("intensity") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter(), + ) + .zip( + record_batch + .column_by_name("mobility_ook0") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter(), + ) + .zip( + record_batch + .column_by_name("cycle_index") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter(), + ) + .filter_map(|(((mz, intensity), mobility), cycle)| { + if let (Some(mz), Some(intensity), Some(mobility), Some(cycle)) = + (mz, intensity, mobility, cycle) + { + let mobility_f16: f16 = mobility; + let cycle_u32: u32 = cycle; + + // Apply cycle range filter if specified + if let OptionallyRestricted::Restricted(cycle_range) = &cycle_range + && !cycle_range.contains(cycle_u32) + { + return None; + } + + Some(IndexedPeak { + mz, + intensity, + mobility_ook0: mobility_f16, + cycle_index: T::new(cycle_u32), + }) + } else { + None + } + }) + .collect(); + + Ok(peaks) + } + + fn query_peaks_file( + &self, + relative_path: &str, + mz_range: TupleRange, + cycle_range: OptionallyRestricted>, + im_range: OptionallyRestricted>, + ) -> impl Iterator> { + // Create querier with storage provider + let querier = match ParquetQuerier::new(self.storage.clone(), relative_path) { + Ok(q) => q, + Err(e) => { + eprintln!("Error initializing ParquetQuerier: {}", e); + return vec![].into_iter(); + } + }; + + let mz_range_range = mz_range.start()..mz_range.end(); + let im_range_opt = match im_range { + OptionallyRestricted::Restricted(r) => Some(r.start()..r.end()), + OptionallyRestricted::Unrestricted => None, + }; + let record_batch = match querier.query(mz_range_range, im_range_opt) { + Ok(batch) => batch, + Err(e) => { + eprintln!("Error querying Parquet: {}", e); + // return vec![].into_iter(); + eprintln!("Error querying Parquet file: {}", e); + todo!("Sebastian has been to lazy to make this a good error ... 
fix me!"); + } + }; + + // Convert RecordBatch to iterator of IndexedPeak + let peaks: Vec<_> = record_batch + .column_by_name("mz") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .zip( + record_batch + .column_by_name("intensity") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter(), + ) + .zip( + record_batch + .column_by_name("mobility_ook0") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter(), + ) + .zip( + record_batch + .column_by_name("cycle_index") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter(), + ) + .filter_map(|(((mz, intensity), mobility), cycle)| { + if let (Some(mz), Some(intensity), Some(mobility), Some(cycle)) = + (mz, intensity, mobility, cycle) + { + let mobility_f16: f16 = mobility; + let cycle_u32: u32 = cycle; + + // Apply cycle range filter if specified + if let OptionallyRestricted::Restricted(cycle_range) = &cycle_range + && !cycle_range.contains(cycle_u32) + { + return None; + } + + Some(IndexedPeak { + mz, + intensity, + mobility_ook0: mobility_f16, + cycle_index: T::new(cycle_u32), + }) + } else { + None + } + }) + .collect(); + + peaks.into_iter() + } + + /// Query MS2 peaks with lazy loading (async version with concurrent execution) + /// + /// This async method queries multiple MS2 groups concurrently, significantly improving + /// performance for cloud storage by overlapping network I/O operations. + /// + /// Returns a vector of (isolation_scheme, peaks) pairs for window groups that match + /// the precursor range. + pub async fn query_peaks_ms2_async( + &self, + precursor_range_mz: TupleRange, + mz_range: TupleRange, + cycle_range: OptionallyRestricted>, + im_range: OptionallyRestricted>, + ) -> Result< + Vec<( + QuadrupoleIsolationScheme, + Vec>, + )>, + SerializationError, + > { + // Use futures::stream to handle async iteration properly + use futures::stream::{ + FuturesUnordered, + StreamExt, + }; + + let mut futures = FuturesUnordered::new(); + + // OPTIMIZATION: Use pre-created queriers instead of creating new ones + // Iterate through metadata and queriers together + for ((isolation_scheme, _group_metadata), querier) in + self.ms2_metadata.iter().zip(self.ms2_queriers.iter()) + { + // Filter by precursor range using geometric intersection + let f64_mz_range = ( + precursor_range_mz.start() as f64, + precursor_range_mz.end() as f64, + ); + let f64_im_range = im_range.map(|r| (r.start().to_f64(), r.end().to_f64())); + + if !isolation_scheme + .intersects(f64_mz_range, f64_im_range.unwrap_or((f64::MIN, f64::MAX))) + { + continue; + } + + let local_im_range = im_range.map(|r| { + isolation_scheme + .intersects_ranges( + ( + precursor_range_mz.start() as f64, + precursor_range_mz.end() as f64, + ), + (r.start().to_f64(), r.end().to_f64()), + ) + .map(|(im_start, im_end)| { + TupleRange::try_new(f16::from_f64(im_start), f16::from_f64(im_end)) + .expect("Intersect should always be valid") + }) + .expect("Since we filtered before the precursor range. 
It should intersect") + }); + + let isolation_scheme = isolation_scheme.clone(); + let querier = querier.clone(); // Arc clone is cheap + + // Push the future into the unordered set - this will execute concurrently + futures.push(async move { + let peaks = self + .query_peaks_file_with_querier_async::( + &querier, + mz_range, + cycle_range, + local_im_range, + ) + .await?; + Ok::<_, SerializationError>((isolation_scheme, peaks)) + }); + } + + // Collect all results concurrently + let mut results = Vec::new(); + while let Some(result) = futures.next().await { + results.push(result?); + } + + Ok(results) + } + + /// Query MS2 peaks with lazy loading (blocking version) + /// + /// This is a blocking wrapper around `query_peaks_ms2_async()`. For better performance, + /// prefer using `query_peaks_ms2_async()` in async contexts. + /// + /// Returns an iterator over (isolation_scheme, peaks_iterator) pairs for + /// window groups that match the precursor range. + pub fn query_peaks_ms2( + &self, + precursor_range_mz: TupleRange, + mz_range: TupleRange, + cycle_range: OptionallyRestricted>, + im_range: OptionallyRestricted>, + ) -> Vec<( + QuadrupoleIsolationScheme, + Vec>, + )> { + RUNTIME + .block_on(self.query_peaks_ms2_async( + precursor_range_mz, + mz_range, + cycle_range, + im_range, + )) + .unwrap_or_else(|e| { + eprintln!("Error in query_peaks_ms2: {}", e); + Vec::new() + }) + } +} diff --git a/rust/timscentroid/src/lazy/query.rs b/rust/timscentroid/src/lazy/query.rs new file mode 100644 index 0000000..f7d9ac4 --- /dev/null +++ b/rust/timscentroid/src/lazy/query.rs @@ -0,0 +1,330 @@ +use std::ops::Range; +use std::sync::Arc; + +use crate::storage::{ + RUNTIME, + StorageProvider, +}; +use arrow::array::{ + AsArray, + BooleanArray, + Float16Array, + Float32Array, +}; +use arrow::compute::kernels::cmp as array_cmp; +use arrow::compute::{ + and, + concat_batches, +}; +use arrow::datatypes::{ + Float16Type, + Float32Type, +}; +use arrow::error::ArrowError; +use arrow::record_batch::RecordBatch; +use futures::stream::StreamExt; +use half::f16; +use object_store::path::Path as ObjectPath; +use parquet::arrow::arrow_reader::{ + ArrowPredicate, + ArrowReaderMetadata, + RowFilter, +}; +use parquet::arrow::async_reader::{ + ParquetObjectReader, + ParquetRecordBatchStreamBuilder, +}; +use parquet::file::metadata::{ + ParquetMetaData, + RowGroupMetaData, +}; +use parquet::file::statistics::Statistics; + +use crate::serialization::PeakSchema; + +pub struct ParquetQuerier { + storage: StorageProvider, + relative_path: String, + peak_schema: PeakSchema, + file_metadata: ArrowReaderMetadata, +} + +struct FilterPredicate { + mz_range: Range, + ims_range: Option>, + projection_mask: parquet::arrow::ProjectionMask, +} + +impl FilterPredicate { + fn new( + mz_range: Range, + ims_range: Option>, + index_mz: usize, + index_ims: usize, + file_metadata: Arc, + ) -> Self { + // Define the projection mask to only include the necessary columns + let projection_mask = parquet::arrow::ProjectionMask::leaves( + file_metadata.file_metadata().schema_descr(), + vec![index_mz, index_ims], + ); + + Self { + mz_range, + ims_range, + projection_mask, + } + } +} + +impl ArrowPredicate for FilterPredicate { + fn evaluate(&mut self, batch: RecordBatch) -> Result { + // 1. Filter Column A + // Note we are referencing by index 0 because of the projection order + // tht we defined earlier. 
+ let col_mz = batch + .column_by_name("mz") + .unwrap() + .as_primitive::(); + + // Logic: A >= start && A < end + // Note: Arrow compute kernels handle nulls automatically (usually propagating them or treating as false) + let a_gte = array_cmp::gt_eq(&col_mz, &Float32Array::new_scalar(self.mz_range.start))?; + let a_lt = array_cmp::lt_eq(&col_mz, &Float32Array::new_scalar(self.mz_range.end))?; + let mask_a = and(&a_gte, &a_lt)?; + + if let Some(ims_range) = &self.ims_range { + // 2. Filter Column B + let col_ims = batch + .column_by_name("mobility_ook0") + .unwrap() + .as_primitive::(); + // Logic: B >= start && B < end + let b_gte = array_cmp::gt_eq(&col_ims, &Float16Array::new_scalar(ims_range.start))?; + let b_lt = array_cmp::lt_eq(&col_ims, &Float16Array::new_scalar(ims_range.end))?; + let mask_b = and(&b_gte, &b_lt)?; + + // 3. Combine: Mask A AND Mask B + and(&mask_a, &mask_b) + } else { + Ok(mask_a) + } + } + + fn projection(&self) -> &parquet::arrow::ProjectionMask { + &self.projection_mask + } +} + +impl ParquetQuerier { + /// Initialize with storage provider and relative path, load metadata, and validate schema (async version). + /// + /// This async method fetches the parquet file metadata once and caches it for reuse in subsequent queries. + /// This eliminates the need to refetch metadata on every query, significantly improving performance + /// for cloud storage where each metadata fetch can add 50-100ms of latency. + pub async fn new_async( + storage: StorageProvider, + relative_path: &str, + ) -> Result> { + let object_store = storage.as_object_store(); + let full_path = storage.build_path(relative_path); + let object_path = ObjectPath::from(full_path.as_str()); + + // Fetch metadata (just the footer - lightweight operation) + let mut reader = ParquetObjectReader::new(object_store, object_path.clone()); + + // Load metadata using ArrowReaderMetadata for caching + let file_metadata = + ArrowReaderMetadata::load_async(&mut reader, Default::default()).await?; + + let schema = file_metadata.schema(); + let peak_schema = + PeakSchema::validate(schema).map_err(|e| format!("Schema validation error: {}", e))?; + + Ok(Self { + storage, + relative_path: relative_path.to_string(), + peak_schema, + file_metadata, + }) + } + + /// Initialize with storage provider and relative path, load metadata, and validate schema (blocking version). + /// + /// This is a blocking wrapper around `new_async()`. For better performance in async contexts, + /// prefer using `new_async()` directly. + pub fn new( + storage: StorageProvider, + relative_path: &str, + ) -> Result> { + RUNTIME.block_on(Self::new_async(storage, relative_path)) + } + + /// Query parquet file with predicates (async version). + /// + /// This async method reuses the cached metadata from `new_async()`, avoiding the overhead + /// of refetching metadata on every query. This is critical for cloud storage performance + /// where metadata fetches can add significant latency. + /// + /// The query uses predicate pushdown to: + /// 1. Skip row groups based on statistics + /// 2. Fetch only needed row groups via concurrent HTTP range requests + /// 3. 
Apply predicates during decoding + pub async fn query_async( + &self, + mz_range: Range, + ims_range: Option>, + ) -> Result> { + let object_store = self.storage.as_object_store(); + let full_path = self.storage.build_path(&self.relative_path); + let object_path = ObjectPath::from(full_path.as_str()); + + // Create async reader + let reader = ParquetObjectReader::new(object_store, object_path); + + // 1. Identify which Row Groups to read based on Statistics + let parquet_metadata = self.file_metadata.metadata(); + let mz_col_idx = self.peak_schema.mz_idx(); + let ims_col_idx = self.peak_schema.mobility_idx(); + + // Filter the row group indices + let row_groups_to_fetch: Vec = parquet_metadata + .row_groups() + .iter() + .enumerate() + .filter(|(_idx, rg_meta)| { + // Check MZ Range + if !overlap_check_f32(rg_meta, mz_col_idx, &mz_range) { + return false; + } + + // Check IMS Range (if exists) + if let Some(r) = &ims_range + && !overlap_check_f16(rg_meta, ims_col_idx, r) + { + return false; + } + + true + }) + .map(|(idx, _)| idx) + .collect(); + + // If no row groups match, return empty early + if row_groups_to_fetch.is_empty() { + let schema = self.file_metadata.schema(); + return Ok(RecordBatch::new_empty(schema.clone())); + } + + // 2. Initialize Builder with Cached Metadata + let mut builder = + ParquetRecordBatchStreamBuilder::new_with_metadata(reader, self.file_metadata.clone()); + + // 3. APPLY THE PRUNING (This is the missing link) + builder = builder.with_row_groups(row_groups_to_fetch); + + // 4. Set up the RowFilter (Keep this! It filters specific rows within the kept groups) + let pred = FilterPredicate::new( + mz_range, + ims_range, + mz_col_idx, + ims_col_idx, + parquet_metadata.clone(), + ); + let filter = RowFilter::new(vec![Box::new(pred)]); + builder = builder.with_row_filter(filter); + + let mut stream = builder.build()?; + let schema = stream.schema().clone(); + let mut batches = Vec::new(); + + while let Some(batch) = stream.next().await { + batches.push(batch?); + } + + if batches.is_empty() { + return Ok(RecordBatch::new_empty(schema)); + } + + let combined = concat_batches(&schema, &batches)?; + Ok(combined) + } + + /// Query parquet file with predicates (blocking version). + /// + /// This is a blocking wrapper around `query_async()`. For better performance in async contexts, + /// prefer using `query_async()` directly. + pub fn query( + &self, + mz_range: Range, + ims_range: Option>, + ) -> Result> { + RUNTIME.block_on(self.query_async(mz_range, ims_range)) + } +} + +// --- Helper Functions for Statistic Checking --- + +fn overlap_check_f32(rg: &RowGroupMetaData, col_idx: usize, range: &Range) -> bool { + let col_meta = rg.column(col_idx); + + if let Some(stats) = col_meta.statistics() { + match stats { + Statistics::Float(value_stats) => { + match (value_stats.min_opt(), value_stats.max_opt()) { + (Some(min), Some(max)) => { + // Check for overlap: !(Max < Start || Min >= End) + // Simplified: Max >= Start && Min < End + return *max >= range.start && *min < range.end; + } + _ => return true, // Missing min/max stats? Safety fallback: read it. + } + } + _ => return true, // Stats exist but wrong type? Safety fallback: read it. + } + } + // If stats are missing, we must read the group to be safe + true +} + +fn overlap_check_f16(rg: &RowGroupMetaData, col_idx: usize, range: &Range) -> bool { + let col_meta = rg.column(col_idx); + + // 1. 
Check if stats exist + let stats = match col_meta.statistics() { + Some(s) => s, + None => return true, // No stats, must read to be safe + }; + + // 2. Match strict Physical Types + match stats { + // Float16 is ALWAYS stored as FixedLenByteArray in Parquet + Statistics::FixedLenByteArray(stats) => { + let (min_bytes, max_bytes) = match (stats.min_opt(), stats.max_opt()) { + (Some(min), Some(max)) => (min, max), + _ => return true, // Missing min/max stats? Safety fallback: read it. + }; + + // Safety: Float16 must be exactly 2 bytes + if min_bytes.len() != 2 || max_bytes.len() != 2 { + return true; + } + + // Copy to array for conversion + let min_arr: [u8; 2] = min_bytes.as_ref().try_into().unwrap(); + let max_arr: [u8; 2] = max_bytes.as_ref().try_into().unwrap(); + + // Decode Little Endian (Parquet Standard) + let min_val = f16::from_le_bytes(min_arr); + let max_val = f16::from_le_bytes(max_arr); + + // Check overlap: Max >= Start && Min < End + max_val >= range.start && min_val < range.end + } + // If the writer did something weird (like storing it as Int32), + // we can't prune safely, so we default to reading the row group. + _ => true, + } +} + +// Tests are in the integration test file diff --git a/rust/timscentroid/src/lib.rs b/rust/timscentroid/src/lib.rs index 6b80525..395e3fc 100644 --- a/rust/timscentroid/src/lib.rs +++ b/rust/timscentroid/src/lib.rs @@ -1,7 +1,11 @@ pub mod centroiding; pub mod geometry; pub mod indexing; +pub mod instrumentation; +pub mod lazy; +pub mod rt_mapping; pub mod serialization; +pub mod storage; pub mod utils; #[doc(inline)] @@ -16,3 +20,9 @@ pub use indexing::{ #[doc(inline)] pub use centroiding::CentroidingConfig; + +#[doc(inline)] +pub use storage::{ + StorageLocation, + StorageProvider, +}; diff --git a/rust/timscentroid/src/rt_mapping.rs b/rust/timscentroid/src/rt_mapping.rs new file mode 100644 index 0000000..5806399 --- /dev/null +++ b/rust/timscentroid/src/rt_mapping.rs @@ -0,0 +1,261 @@ +//! The main purpose of this module is to provide some ... domain-knowledge +//! to map indices to retention times. (In essence make more idiomatic the conversion +//! between RT in milliseconds and index in the timsTOF data ... and to distinguish +//! between MS1 and MS2 RTs). +//! +//! I am aware that this seems like a single use bad abstraction but +//! on the longer term It will help with more complex RT mapping needs +//! (eg: schemas where MS1 and MS2 RTs are not aligned 1:1). + +use serde::{ + Deserialize, + Serialize, +}; +use std::fmt::Debug; +use std::ops::{ + Range, + RangeInclusive, +}; + +/// Represents an index into the MS1 cycles. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MS1CycleIndex { + pub index: u32, +} + +/// Represents an index into the MS2 windows. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct WindowCycleIndex { + /// TODO: add a field for the window number? + /// In theory an u16 should be enough here. 
+ pub index: u32, +} + +pub trait RTIndex: + Copy + PartialEq + Eq + PartialOrd + Ord + std::fmt::Debug + Send + Sync + Debug +{ + fn index(&self) -> usize; + fn new(index: u32) -> Self; + fn as_u32(&self) -> u32; +} +impl RTIndex for MS1CycleIndex { + fn index(&self) -> usize { + self.index as usize + } + + fn new(index: u32) -> Self { + MS1CycleIndex { index } + } + + fn as_u32(&self) -> u32 { + self.index + } +} +impl RTIndex for WindowCycleIndex { + fn index(&self) -> usize { + self.index as usize + } + + fn new(index: u32) -> Self { + WindowCycleIndex { index } + } + + fn as_u32(&self) -> u32 { + self.index + } +} + +#[derive(Clone)] +pub struct CycleToRTMapping { + // Q: Do I want to newtype the u32 to make clear its meant + // for RT in milliseconds? + rt_milis: Vec, + phantom: std::marker::PhantomData, +} + +fn glimpse_slc_u32(slice: &[u32]) -> String { + if slice.len() <= 5 { + format!("{:?}", slice) + } else { + format!( + "[{}, {}, ..., {}, {}] (len={})", + slice[0], + slice[1], + slice[slice.len() - 2], + slice[slice.len() - 1], + slice.len() + ) + } +} + +impl Debug for CycleToRTMapping { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CycleToRTMapping") + .field("rt_milis", &glimpse_slc_u32(&self.rt_milis)) + .finish() + } +} + +impl CycleToRTMapping { + pub fn new(rt_milis: Vec) -> Self { + // TODO: check that rt_milis is sorted ascendingly? + Self { + rt_milis, + phantom: std::marker::PhantomData, + } + } + + pub fn ms_to_closest_index(&self, rt_milis: u32) -> T { + // Binary search for the closest RT in milliseconds + let pos = self.rt_milis.partition_point(|x| *x <= rt_milis); + if pos == 0 { + T::new(0) + } else if pos >= self.rt_milis.len() { + T::new((self.rt_milis.len() - 1) as u32) + } else { + let before = self.rt_milis[pos - 1]; + let after = self.rt_milis[pos]; + if rt_milis - before <= after - rt_milis { + T::new((pos - 1) as u32) + } else { + T::new(pos as u32) + } + } + } + + pub fn range_milis(&self) -> (u32, u32) { + ( + *self.rt_milis.first().unwrap(), + *self.rt_milis.last().unwrap(), + ) + } + + pub fn len(&self) -> usize { + self.rt_milis.len() + } + + pub fn unpack(self) -> Vec { + self.rt_milis + } + + pub fn rt_milis_for_index(&self, index: &T) -> Result { + self.rt_milis + .get(index.index()) + .copied() + .ok_or(RTMappingError::IndexOutOfBounds) + } + + pub fn get_slice(&self, range: Range) -> Result<&[u32], RTMappingError> { + let start = range.start.index(); + let end = range.end.index(); + if start >= self.len() || end > self.len() || start > end { + return Err(RTMappingError::IndexOutOfBounds); + } + Ok(&self.rt_milis[start..end]) + } + + pub fn get_inclusive_slice(&self, range: RangeInclusive) -> Result<&[u32], RTMappingError> { + let start = range.start().index(); + let end = range.end().index(); + if start >= self.len() || end >= self.len() || start > end { + return Err(RTMappingError::IndexOutOfBounds); + } + Ok(&self.rt_milis[start..=end]) + } +} + +#[derive(Debug)] +pub enum RTMappingError { + IndexOutOfBounds, +} + +impl Serialize for MS1CycleIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_u32(self.index) + } +} + +impl<'de> Deserialize<'de> for MS1CycleIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let index = u32::deserialize(deserializer)?; + Ok(MS1CycleIndex { index }) + } +} + +impl Serialize for WindowCycleIndex { + fn serialize(&self, serializer: S) -> Result + where + S: 
serde::Serializer, + { + serializer.serialize_u32(self.index) + } +} + +impl<'de> Deserialize<'de> for WindowCycleIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let index = u32::deserialize(deserializer)?; + Ok(WindowCycleIndex { index }) + } +} + +impl Serialize for CycleToRTMapping { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + self.rt_milis.serialize(serializer) + } +} + +impl<'de, T: RTIndex> Deserialize<'de> for CycleToRTMapping { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let rt_milis = Vec::::deserialize(deserializer)?; + Ok(CycleToRTMapping::new(rt_milis)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_ms1_cycle_index_serialization() { + let index = MS1CycleIndex { index: 42 }; + let serialized = serde_json::to_string(&index).unwrap(); + assert_eq!(serialized, "42"); + let deserialized: MS1CycleIndex = serde_json::from_str(&serialized).unwrap(); + assert_eq!(deserialized.index, 42); + } + + #[test] + fn test_window_cycle_index_serialization() { + let index = WindowCycleIndex { index: 99 }; + let serialized = serde_json::to_string(&index).unwrap(); + assert_eq!(serialized, "99"); + let deserialized: WindowCycleIndex = serde_json::from_str(&serialized).unwrap(); + assert_eq!(deserialized.index, 99); + } + + #[test] + fn test_container_serde() { + let mapping: CycleToRTMapping = + CycleToRTMapping::new(vec![1000, 2000, 3000]); + let serialized = serde_json::to_string(&mapping).unwrap(); + assert_eq!(serialized, r#"[1000,2000,3000]"#); + let deserialized: CycleToRTMapping = + serde_json::from_str(&serialized).unwrap(); + assert_eq!(deserialized.rt_milis, vec![1000, 2000, 3000]); + } +} diff --git a/rust/timscentroid/src/serialization/mod.rs b/rust/timscentroid/src/serialization/mod.rs index 2ab6e33..e38a9ba 100644 --- a/rust/timscentroid/src/serialization/mod.rs +++ b/rust/timscentroid/src/serialization/mod.rs @@ -1,29 +1,11 @@ //! High-performance serialization for indexed timsTOF peaks. //! -//! This module provides efficient Parquet-based serialization of [`IndexedTimstofPeaks`] -//! with optimizations for mass spectrometry data. -//! -//! # Features -//! -//! - **Parallel I/O**: MS1 and MS2 groups are written/read concurrently for maximum throughput -//! - **Native f16 support**: Mobility data stored as Float16 without conversion overhead -//! - **Optimized encodings**: BYTE_STREAM_SPLIT for floats, DELTA_BINARY_PACKED for integers -//! - **Configurable compression**: Choose speed vs size tradeoff for your workflow -//! -//! # Performance -//! -//! The default configuration was optimized through extensive benchmarking across compression -//! algorithms (ZSTD, SNAPPY, LZ4, uncompressed), compression levels (1-9), and row group sizes -//! (250K-5M). For typical datasets (~100M peaks): -//! -//! - **Write time**: ~3-4 seconds (with parallel I/O) -//! - **Read time**: ~1-2 seconds (10-20x faster than centroiding) -//! - **Disk usage**: ~750 MB (20% smaller than uncompressed) +//! The only struct here that actually matters is [`IndexedTimstofPeaks`] //! //! # Examples //! //! ```no_run -//! use timscentroid::{IndexedTimstofPeaks, CentroidingConfig}; +//! use timscentroid::{IndexedTimstofPeaks, CentroidingConfig, StorageLocation}; //! use timscentroid::serialization::SerializationConfig; //! use timsrust::TimsTofPath; //! @@ -37,23 +19,12 @@ //! index.save_to_directory("indexed_peaks/")?; //! //! 
// Load from disk (much faster than re-indexing) -//! let loaded = IndexedTimstofPeaks::load_from_directory("indexed_peaks/")?; +//! let location = StorageLocation::from_path("indexed_peaks/"); +//! let loaded = IndexedTimstofPeaks::load_from_storage(location)?; //! -//! // Use custom settings for specific needs -//! index.save_to_directory_with_config( -//! "fast_peaks/", -//! SerializationConfig::speed_optimized() -//! )?; //! # Ok(()) //! # } //! ``` -//! -//! # Configuration Presets -//! -//! - **Default**: ZSTD(1), 2M row groups - best overall balance -//! - **Speed optimized**: Uncompressed, 5M row groups - fastest load times -//! - **Balanced**: SNAPPY, 2M row groups - fast decompression, good size -//! - **Max compression**: ZSTD(6), 2M row groups - smallest files, slower I/O use crate::geometry::QuadrupoleIsolationScheme; use crate::indexing::{ @@ -61,6 +32,16 @@ use crate::indexing::{ IndexedPeakGroup, IndexedTimstofPeaks, }; +use crate::rt_mapping::{ + CycleToRTMapping, + MS1CycleIndex, + RTIndex, + WindowCycleIndex, +}; +use crate::storage::{ + StorageLocation, + StorageProvider, +}; use arrow::array::{ Array, Float16Array, @@ -77,7 +58,7 @@ use parquet::arrow::ArrowWriter; use parquet::basic::{ Compression, Encoding, - ZstdLevel, + // ZstdLevel, // Sometimes I want to use zstd level 1. }; use parquet::file::properties::{ EnabledStatistics, @@ -90,7 +71,6 @@ use serde::{ Serialize, }; use std::fmt; -use std::fs::File; use std::path::{ Path, PathBuf, @@ -104,6 +84,8 @@ pub enum SerializationError { Parquet(parquet::errors::ParquetError), Arrow(arrow::error::ArrowError), Json(serde_json::Error), + ObjectStore(object_store::Error), + UrlParse(url::ParseError), MissingColumn { name: String, available: Vec, @@ -126,6 +108,8 @@ impl fmt::Display for SerializationError { Self::Parquet(e) => write!(f, "Parquet error: {}", e), Self::Arrow(e) => write!(f, "Arrow error: {}", e), Self::Json(e) => write!(f, "JSON error: {}", e), + Self::ObjectStore(e) => write!(f, "Object store error: {}", e), + Self::UrlParse(e) => write!(f, "URL parse error: {}", e), Self::MissingColumn { name, available } => { write!( f, @@ -181,6 +165,18 @@ impl From for SerializationError { } } +impl From for SerializationError { + fn from(e: object_store::Error) -> Self { + Self::ObjectStore(e) + } +} + +impl From for SerializationError { + fn from(e: url::ParseError) -> Self { + Self::UrlParse(e) + } +} + const SCHEMA_VERSION: &str = "1.0"; /// Configuration for parquet serialization @@ -188,9 +184,9 @@ const SCHEMA_VERSION: &str = "1.0"; pub struct SerializationConfig { /// Compression algorithm and level pub compression: Compression, - /// Number of rows per row group (affects memory usage and parallelism) + // Number of rows per row group (affects memory usage and parallelism) pub row_group_size: usize, - /// Write batch size for internal buffering + /// Write batch size for internal buffering, the square of this is the row group size pub write_batch_size: usize, } @@ -200,40 +196,9 @@ impl Default for SerializationConfig { // levels, and row group sizes. This configuration provides the best balance // of read/write speed and disk space for typical MS data (100M+ peaks). Self { - compression: Compression::ZSTD(ZstdLevel::try_new(1).expect("ZSTD level 1 is valid")), - row_group_size: 2_000_000, - write_batch_size: 8192, - } - } -} - -impl SerializationConfig { - /// Maximum read speed (uncompressed, large row groups). - /// Use when disk space is not a concern and load speed is critical. 
- pub fn speed_optimized() -> Self { - Self { - compression: Compression::UNCOMPRESSED, - row_group_size: 5_000_000, - write_batch_size: 8192, - } - } - - /// Balanced speed and compression using SNAPPY. - /// Faster decompression than ZSTD with moderate compression. - pub fn balanced() -> Self { - Self { + // compression: Compression::ZSTD(ZstdLevel::try_new(1).expect("ZSTD level 1 is valid")), compression: Compression::SNAPPY, - row_group_size: 2_000_000, - write_batch_size: 8192, - } - } - - /// Maximum compression for archival or network transfer. - /// Slower read/write but smallest disk footprint. - pub fn max_compression() -> Self { - Self { - compression: Compression::ZSTD(ZstdLevel::try_new(6).expect("ZSTD level 6 is valid")), - row_group_size: 2_000_000, + row_group_size: 100_000, write_batch_size: 8192, } } @@ -243,14 +208,14 @@ impl SerializationConfig { pub struct TimscentroidMetadata { pub version: String, pub created_at: String, - pub ms1_peaks: PeakGroupMetadata, + pub ms1_peaks: PeakGroupMetadata, pub ms2_window_groups: Vec, } -#[derive(Serialize, Deserialize)] -pub struct PeakGroupMetadata { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct PeakGroupMetadata { pub relative_path: PathBuf, - pub cycle_to_rt_ms: Vec, + pub cycle_to_rt_ms: CycleToRTMapping, pub bucket_size: usize, } @@ -258,11 +223,11 @@ pub struct PeakGroupMetadata { pub struct Ms2GroupMetadata { pub id: usize, pub quadrupole_isolation: QuadrupoleIsolationScheme, - pub group_info: PeakGroupMetadata, + pub group_info: PeakGroupMetadata, } /// Expected schema for IndexedPeak parquet files -struct PeakSchema { +pub(crate) struct PeakSchema { mz_idx: usize, intensity_idx: usize, mobility_idx: usize, @@ -270,8 +235,26 @@ struct PeakSchema { } impl PeakSchema { + pub(crate) fn mz_idx(&self) -> usize { + self.mz_idx + } + + #[allow(dead_code)] + pub(crate) fn intensity_idx(&self) -> usize { + self.intensity_idx + } + + pub(crate) fn mobility_idx(&self) -> usize { + self.mobility_idx + } + + #[allow(dead_code)] + pub(crate) fn cycle_idx(&self) -> usize { + self.cycle_idx + } + /// Create the canonical Arrow schema for IndexedPeak data - fn canonical() -> Schema { + pub(crate) fn canonical() -> Schema { Schema::new(vec![ Field::new("mz", DataType::Float32, false), Field::new("intensity", DataType::Float32, false), @@ -281,7 +264,7 @@ impl PeakSchema { } /// Validate that a schema matches our expected format and return column indices - fn validate(schema: &Schema) -> Result { + pub(crate) fn validate(schema: &Schema) -> Result { let field_names: Vec = schema.fields().iter().map(|f| f.name().clone()).collect(); let find_col = @@ -317,26 +300,98 @@ impl PeakSchema { } impl IndexedTimstofPeaks { - /// Save indexed peaks to a directory using default serialization settings + /// Save indexed peaks to a directory (async version) + pub async fn save_to_directory_async( + &self, + directory: impl AsRef, + ) -> Result<(), SerializationError> { + let location = StorageLocation::from_path(directory); + self.save_to_storage_async(location, SerializationConfig::default()) + .await + } + + /// Save indexed peaks to a directory (blocking version) + /// + /// This blocks the current thread. If you're in an async context, + /// prefer `save_to_directory_async`. 
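+    ///
+    /// Illustrative call (the directory name is just an example):
+    /// `index.save_to_directory("indexed_peaks/")?;`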
pub fn save_to_directory(&self, directory: impl AsRef) -> Result<(), SerializationError> { - self.save_to_directory_with_config(directory, SerializationConfig::default()) + let location = StorageLocation::from_path(directory); + self.save_to_storage(location, SerializationConfig::default()) } - /// Save indexed peaks to a directory with custom serialization settings + /// Save indexed peaks with custom config (async version) + pub async fn save_to_directory_with_config_async( + &self, + directory: impl AsRef, + config: SerializationConfig, + ) -> Result<(), SerializationError> { + let location = StorageLocation::from_path(directory); + self.save_to_storage_async(location, config).await + } + + /// Save indexed peaks with custom config (blocking version) + /// + /// This blocks the current thread. If you're in an async context, + /// prefer `save_to_directory_with_config_async`. pub fn save_to_directory_with_config( &self, directory: impl AsRef, config: SerializationConfig, ) -> Result<(), SerializationError> { - let directory = directory.as_ref(); - if !directory.exists() { - std::fs::create_dir_all(directory)?; - } + let location = StorageLocation::from_path(directory); + self.save_to_storage(location, config) + } - let ms2_dir = directory.join("ms2"); - if !ms2_dir.exists() { - std::fs::create_dir(&ms2_dir)?; - } + /// Save to cloud storage URL (async version) + pub async fn save_to_url_async(&self, url: impl AsRef) -> Result<(), SerializationError> { + let location = StorageLocation::from_url(url)?; + self.save_to_storage_async(location, SerializationConfig::default()) + .await + } + + /// Save to cloud storage URL (blocking version) + /// + /// This blocks the current thread. If you're in an async context, + /// prefer `save_to_url_async`. + pub fn save_to_url(&self, url: impl AsRef) -> Result<(), SerializationError> { + let location = StorageLocation::from_url(url)?; + self.save_to_storage(location, SerializationConfig::default()) + } + + /// Save to any storage location (async version) + pub async fn save_to_storage_async( + &self, + location: StorageLocation, + config: SerializationConfig, + ) -> Result<(), SerializationError> { + // Note: Currently this is not truly async - it uses blocking I/O internally. + // This wrapper exists for API consistency and future optimization. + self.save_to_storage_impl(location, config) + } + + /// Save to any storage location (blocking version) + /// + /// This blocks the current thread. If you're in an async context, + /// prefer `save_to_storage_async`. 
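+    ///
+    /// Sketch of a call; the S3 URL is hypothetical, and local paths work the same way
+    /// via `StorageLocation::from_path`:
+    /// `index.save_to_storage(StorageLocation::from_url("s3://my-bucket/peaks")?, SerializationConfig::default())?;`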
+ pub fn save_to_storage( + &self, + location: StorageLocation, + config: SerializationConfig, + ) -> Result<(), SerializationError> { + self.save_to_storage_impl(location, config) + } + + /// Internal implementation of save_to_storage + fn save_to_storage_impl( + &self, + location: StorageLocation, + config: SerializationConfig, + ) -> Result<(), SerializationError> { + let storage = StorageProvider::new(location)?; + + // Ensure directories exist (no-op for cloud, creates dirs for local) + storage.ensure_directory("")?; + storage.ensure_directory("ms2")?; // Parallel write: MS1 and all MS2 groups written concurrently let (ms1_result, ms2_results): ( @@ -345,12 +400,11 @@ impl IndexedTimstofPeaks { ) = rayon::join( // Write MS1 in parallel thread || { - let ms1_filename = "ms1.parquet"; - let ms1_path = directory.join(ms1_filename); - write_peaks_to_parquet(&self.ms1_peaks.peaks, &ms1_path, config)?; + let ms1_bytes = write_peaks_to_parquet_bytes(&self.ms1_peaks.peaks, config)?; + storage.write_bytes("ms1.parquet", ms1_bytes)?; Ok(PeakGroupMetadata { - relative_path: PathBuf::from(ms1_filename), + relative_path: PathBuf::from("ms1.parquet"), cycle_to_rt_ms: self.ms1_peaks.cycle_to_rt_ms.clone(), bucket_size: self.ms1_peaks.bucket_size, }) @@ -362,9 +416,10 @@ impl IndexedTimstofPeaks { .enumerate() .map(|(i, (quad, group))| { let filename = format!("group_{}.parquet", i); - let path = ms2_dir.join(&filename); + let path = format!("ms2/{}", filename); - write_peaks_to_parquet(&group.peaks, &path, config)?; + let bytes = write_peaks_to_parquet_bytes(&group.peaks, config)?; + storage.write_bytes(&path, bytes)?; Ok(Ms2GroupMetadata { id: i, @@ -383,6 +438,7 @@ impl IndexedTimstofPeaks { let ms1_meta = ms1_result?; let ms2_metas: Vec<_> = ms2_results.into_iter().collect::>()?; + // Write metadata.json let meta = TimscentroidMetadata { version: SCHEMA_VERSION.to_string(), created_at: chrono::Utc::now().to_rfc3339(), @@ -390,17 +446,21 @@ impl IndexedTimstofPeaks { ms2_window_groups: ms2_metas, }; - let meta_file = File::create(directory.join("metadata.json"))?; - serde_json::to_writer_pretty(meta_file, &meta)?; + let metadata_json = serde_json::to_string_pretty(&meta)?; + storage.write_bytes("metadata.json", metadata_json.into_bytes())?; Ok(()) } - pub fn load_from_directory(directory: impl AsRef) -> Result { - let directory = directory.as_ref(); - let meta_path = directory.join("metadata.json"); - let file = File::open(meta_path)?; - let meta: TimscentroidMetadata = serde_json::from_reader(file)?; + /// Load indexed peaks from cloud storage (async version) + pub async fn load_from_storage_async( + location: StorageLocation, + ) -> Result { + let storage = StorageProvider::new(location)?; + + // Read metadata.json + let metadata_json = storage.read_to_string_async("metadata.json").await?; + let meta: TimscentroidMetadata = serde_json::from_str(&metadata_json)?; // Validate schema version if meta.version != SCHEMA_VERSION { @@ -410,7 +470,51 @@ impl IndexedTimstofPeaks { }); } - // Parallel read: MS1 and all MS2 groups loaded concurrently + // Use storage-aware loading strategy + Self::load_from_storage_impl(storage, meta) + } + + /// Load indexed peaks from cloud storage (blocking version) + pub fn load_from_storage(location: StorageLocation) -> Result { + let storage = StorageProvider::new(location)?; + + // Read metadata.json + let metadata_json = storage.read_to_string("metadata.json")?; + let meta: TimscentroidMetadata = serde_json::from_str(&metadata_json)?; + + // Validate schema version 
+ if meta.version != SCHEMA_VERSION { + return Err(SerializationError::SchemaVersionMismatch { + expected: SCHEMA_VERSION, + found: meta.version, + }); + } + + // Use storage-aware loading strategy (serial for cloud, parallel for local) + Self::load_from_storage_impl(storage, meta) + } + + fn load_from_storage_impl( + storage: StorageProvider, + meta: TimscentroidMetadata, + ) -> Result { + // Choose loading strategy based on storage type: + // - Local: CPU bottleneck → parallel loading benefits from multi-core + // - Cloud: Network I/O bottleneck → serial loading avoids connection overhead + let use_parallel = storage.is_local(); + + if use_parallel { + Self::load_from_storage_parallel(storage, meta) + } else { + Self::load_from_storage_serial(storage, meta) + } + } + + /// Parallel loading: optimized for local storage where CPU is the bottleneck + fn load_from_storage_parallel( + storage: StorageProvider, + meta: TimscentroidMetadata, + ) -> Result { let (ms1_result, ms2_results): ( Result<_, SerializationError>, Vec>, @@ -418,8 +522,8 @@ impl IndexedTimstofPeaks { // Load MS1 in parallel thread || { let ms1_peaks_vec = - read_peaks_from_parquet(&directory.join(&meta.ms1_peaks.relative_path))?; - // Stats are recomputed on-the-fly; no need to persist them + storage.read_parquet_peaks(meta.ms1_peaks.relative_path.to_str().unwrap())?; + let (ms1_peaks, _stats) = IndexedPeakGroup::new( ms1_peaks_vec, meta.ms1_peaks.cycle_to_rt_ms.clone(), @@ -432,10 +536,10 @@ impl IndexedTimstofPeaks { meta.ms2_window_groups .par_iter() .map(|group_meta| { - let peaks_vec = read_peaks_from_parquet( - &directory.join(&group_meta.group_info.relative_path), + let peaks_vec = storage.read_parquet_peaks( + group_meta.group_info.relative_path.to_str().unwrap(), )?; - // Stats are recomputed on-the-fly; no need to persist them + let (group, _stats) = IndexedPeakGroup::new( peaks_vec, group_meta.group_info.cycle_to_rt_ms.clone(), @@ -455,26 +559,65 @@ impl IndexedTimstofPeaks { ms2_window_groups, }) } + + /// Serial loading: optimized for cloud storage where network I/O is the bottleneck + /// + /// Avoids creating multiple concurrent connections which can cause: + /// - Rate limiting from cloud providers + /// - Connection overhead + /// - Inefficient use of network bandwidth + fn load_from_storage_serial( + storage: StorageProvider, + meta: TimscentroidMetadata, + ) -> Result { + // Load MS1 first + let ms1_peaks_vec = + storage.read_parquet_peaks(meta.ms1_peaks.relative_path.to_str().unwrap())?; + let (ms1_peaks, _stats) = IndexedPeakGroup::new( + ms1_peaks_vec, + meta.ms1_peaks.cycle_to_rt_ms.clone(), + meta.ms1_peaks.bucket_size, + ); + + // Load MS2 groups sequentially + let ms2_window_groups: Result, SerializationError> = meta + .ms2_window_groups + .iter() + .map(|group_meta| { + let peaks_vec = storage + .read_parquet_peaks(group_meta.group_info.relative_path.to_str().unwrap())?; + + let (group, _stats) = IndexedPeakGroup::new( + peaks_vec, + group_meta.group_info.cycle_to_rt_ms.clone(), + group_meta.group_info.bucket_size, + ); + Ok((group_meta.quadrupole_isolation.clone(), group)) + }) + .collect(); + + Ok(Self { + ms1_peaks, + ms2_window_groups: ms2_window_groups?, + }) + } } -fn write_peaks_to_parquet( - peaks: &[IndexedPeak], - path: &Path, +// Helper function to write parquet to bytes (used by cloud storage) +fn write_peaks_to_parquet_bytes( + peaks: &[IndexedPeak], config: SerializationConfig, -) -> Result<(), SerializationError> { - let file = File::create(path)?; +) -> Result, SerializationError> 
{ + let mut buffer = Vec::new(); + let cursor = std::io::Cursor::new(&mut buffer); // Build writer properties with provided configuration let props = WriterProperties::builder() .set_compression(config.compression) .set_max_row_group_size(config.row_group_size) - .set_write_batch_size(config.write_batch_size) - // Enable statistics for query optimization .set_statistics_enabled(EnabledStatistics::Page) - // BYTE_STREAM_SPLIT encoding improves compression for float columns - .set_column_encoding(ColumnPath::from("mz"), Encoding::BYTE_STREAM_SPLIT) + .set_column_encoding(ColumnPath::from("mz"), Encoding::RLE) .set_column_encoding(ColumnPath::from("intensity"), Encoding::BYTE_STREAM_SPLIT) - // DELTA_BINARY_PACKED is efficient for sequential integers .set_column_encoding( ColumnPath::from("cycle_index"), Encoding::DELTA_BINARY_PACKED, @@ -482,15 +625,15 @@ fn write_peaks_to_parquet( .build(); let schema = Arc::new(PeakSchema::canonical()); - let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?; + let mut writer = ArrowWriter::try_new(cursor, schema.clone(), Some(props))?; // Write peaks in chunks to control memory usage and row group size for chunk in peaks.chunks(config.row_group_size) { let mz_array = Float32Array::from_iter_values(chunk.iter().map(|p| p.mz)); let intensity_array = Float32Array::from_iter_values(chunk.iter().map(|p| p.intensity)); - // Use native f16 array - no conversion needed! let mobility_array = Float16Array::from_iter_values(chunk.iter().map(|p| p.mobility_ook0)); - let cycle_array = UInt32Array::from_iter_values(chunk.iter().map(|p| p.cycle_index)); + let cycle_array = + UInt32Array::from_iter_values(chunk.iter().map(|p| p.cycle_index.as_u32())); let batch = RecordBatch::try_new( schema.clone(), @@ -506,78 +649,74 @@ fn write_peaks_to_parquet( } writer.close()?; - Ok(()) + Ok(buffer) } -fn read_peaks_from_parquet(path: &Path) -> Result, SerializationError> { - let file = File::open(path)?; - let builder = parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder::try_new(file)?; - - // Validate schema before reading data - let schema = builder.schema(); - let peak_schema = PeakSchema::validate(schema)?; - - let reader = builder.build()?; - let mut peaks = Vec::new(); - - for batch in reader { - let batch = batch?; - - // Use validated column indices - no string lookups in hot loop - let mz = batch - .column(peak_schema.mz_idx) - .as_any() - .downcast_ref::() - .ok_or_else(|| SerializationError::WrongColumnType { - column: "mz".to_string(), - expected: "Float32", - got: format!("{:?}", batch.column(peak_schema.mz_idx).data_type()), - })?; - - let intensity = batch - .column(peak_schema.intensity_idx) - .as_any() - .downcast_ref::() - .ok_or_else(|| SerializationError::WrongColumnType { - column: "intensity".to_string(), - expected: "Float32", - got: format!("{:?}", batch.column(peak_schema.intensity_idx).data_type()), - })?; - - // Read as f16 directly - no conversion! 
- let mobility = batch - .column(peak_schema.mobility_idx) - .as_any() - .downcast_ref::() - .ok_or_else(|| SerializationError::WrongColumnType { - column: "mobility_ook0".to_string(), - expected: "Float16", - got: format!("{:?}", batch.column(peak_schema.mobility_idx).data_type()), - })?; - - let cycle = batch - .column(peak_schema.cycle_idx) - .as_any() - .downcast_ref::() - .ok_or_else(|| SerializationError::WrongColumnType { - column: "cycle_index".to_string(), - expected: "UInt32", - got: format!("{:?}", batch.column(peak_schema.cycle_idx).data_type()), - })?; - - // Pre-allocate for this batch - peaks.reserve(batch.num_rows()); - - // Use iterator-based approach for better optimization - // Arrow guarantees all columns have the same length in a RecordBatch - for i in 0..batch.num_rows() { - peaks.push(IndexedPeak { - mz: mz.value(i), - intensity: intensity.value(i), - mobility_ook0: mobility.value(i), // No conversion needed! - cycle_index: cycle.value(i), - }); - } +/// Convert a RecordBatch to peaks +/// +/// This helper extracts peaks from an Arrow RecordBatch, validating +/// column types and indices. Used by both file-based and cloud-based +/// parquet readers. +pub(crate) fn batch_to_peaks( + batch: &RecordBatch, +) -> Result>, SerializationError> { + // Validate schema and get column indices + let peak_schema = PeakSchema::validate(batch.schema().as_ref())?; + + // Use validated column indices - no string lookups in hot loop + let mz = batch + .column(peak_schema.mz_idx) + .as_any() + .downcast_ref::() + .ok_or_else(|| SerializationError::WrongColumnType { + column: "mz".to_string(), + expected: "Float32", + got: format!("{:?}", batch.column(peak_schema.mz_idx).data_type()), + })?; + + let intensity = batch + .column(peak_schema.intensity_idx) + .as_any() + .downcast_ref::() + .ok_or_else(|| SerializationError::WrongColumnType { + column: "intensity".to_string(), + expected: "Float32", + got: format!("{:?}", batch.column(peak_schema.intensity_idx).data_type()), + })?; + + // Read as f16 directly - no conversion! + let mobility = batch + .column(peak_schema.mobility_idx) + .as_any() + .downcast_ref::() + .ok_or_else(|| SerializationError::WrongColumnType { + column: "mobility_ook0".to_string(), + expected: "Float16", + got: format!("{:?}", batch.column(peak_schema.mobility_idx).data_type()), + })?; + + let cycle = batch + .column(peak_schema.cycle_idx) + .as_any() + .downcast_ref::() + .ok_or_else(|| SerializationError::WrongColumnType { + column: "cycle_index".to_string(), + expected: "UInt32", + got: format!("{:?}", batch.column(peak_schema.cycle_idx).data_type()), + })?; + + // Pre-allocate for this batch + let mut peaks = Vec::with_capacity(batch.num_rows()); + + // Use iterator-based approach for better optimization + // Arrow guarantees all columns have the same length in a RecordBatch + for i in 0..batch.num_rows() { + peaks.push(IndexedPeak { + mz: mz.value(i), + intensity: intensity.value(i), + mobility_ook0: mobility.value(i), // No conversion needed! + cycle_index: T::new(cycle.value(i)), + }); } Ok(peaks) diff --git a/rust/timscentroid/src/storage.rs b/rust/timscentroid/src/storage.rs new file mode 100644 index 0000000..3506367 --- /dev/null +++ b/rust/timscentroid/src/storage.rs @@ -0,0 +1,466 @@ +//! Storage abstraction for local and cloud object stores +//! +//! This module provides a unified interface for accessing files from local +//! filesystem or cloud object storage (S3, GCS, Azure). +//! +//! # Features +//! +//! 
- **Unified API**: Same code works for local files and cloud storage +//! - **Lazy Runtime**: Tokio runtime created only when needed +//! - **Feature Flags**: Enable cloud providers via Cargo features +//! - **Sync API**: All async operations hidden behind synchronous interface +//! +//! # Examples +//! +//! ## Local Filesystem +//! +//! ```no_run +//! use timscentroid::{StorageLocation, StorageProvider}; +//! +//! // Create from local path +//! let location = StorageLocation::from_path("/path/to/data"); +//! let storage = StorageProvider::new(location)?; +//! +//! // Read and write files +//! let data = storage.read_bytes("file.txt")?; +//! storage.write_bytes("output.txt", vec![1, 2, 3])?; +//! # Ok::<(), timscentroid::serialization::SerializationError>(()) +//! ``` +//! +//! ## Cloud Storage (S3) +//! +//! ```no_run +//! use timscentroid::{StorageLocation, StorageProvider}; +//! +//! // Requires "aws" feature flag +//! let location = StorageLocation::from_url("s3://my-bucket/prefix")?; +//! let storage = StorageProvider::new(location)?; +//! +//! // Same API as local filesystem +//! let data = storage.read_bytes("data.parquet")?; +//! # Ok::<(), timscentroid::serialization::SerializationError>(()) +//! ``` +//! +//! # Authentication +//! +//! Cloud providers use default credential chains: +//! - **AWS**: `~/.aws/credentials`, IAM roles, or environment variables +//! - **GCP**: `GOOGLE_APPLICATION_CREDENTIALS` environment variable +//! - **Azure**: `AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_KEY` environment variables + +use bytes::Bytes; +use object_store::ObjectStore; +use object_store::local::LocalFileSystem; +use object_store::path::Path as ObjectPath; +use once_cell::sync::Lazy; +use std::path::Path; +use std::sync::Arc; +use tokio::runtime::Runtime; + +use crate::instrumentation::{ + InstrumentedStore, + StorageMetrics, +}; +use crate::serialization::SerializationError; +use tracing::{ + info, + instrument, +}; + +/// Global tokio runtime for all async operations (created lazily) +/// +/// By default, uses 8 worker threads to enable high concurrency for cloud storage I/O. +/// This is critical for performance when querying multiple parquet files concurrently +/// from S3 or other cloud providers. +/// +/// Can be configured via the `TIMSCENTROID_WORKER_THREADS` environment variable. 
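+///
+/// For example, setting `TIMSCENTROID_WORKER_THREADS=16` in the environment before the
+/// first storage call sizes the runtime with 16 workers; the value is read only once,
+/// when the lazily-created runtime is first used.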
+pub(crate) static RUNTIME: Lazy = Lazy::new(|| { + let worker_threads = std::env::var("TIMSCENTROID_WORKER_THREADS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(8); // Increased from 2 to 8 for better S3 concurrency + + tokio::runtime::Builder::new_multi_thread() + .worker_threads(worker_threads) + .enable_all() + .build() + .expect("Failed to create Tokio runtime") +}); + +/// Helper to run async code from sync context, handling nested runtime calls +fn block_on_or_in_place(future: F) -> F::Output { + match tokio::runtime::Handle::try_current() { + Ok(handle) => { + // We're already in a runtime, use block_in_place to avoid panic + tokio::task::block_in_place(|| handle.block_on(future)) + } + Err(_) => { + // Not in a runtime, use our global runtime + RUNTIME.block_on(future) + } + } +} + +/// Storage location - either a local path or a cloud URL +#[derive(Debug, Clone)] +pub enum StorageLocation { + Local(std::path::PathBuf), + Url(url::Url), +} + +impl StorageLocation { + /// Create from a local filesystem path + pub fn from_path(path: impl AsRef) -> Self { + Self::Local(path.as_ref().to_path_buf()) + } + + /// Create from a URL (s3://, gs://, az://, or file://) + pub fn from_url(url: impl AsRef) -> Result { + let parsed = url::Url::parse(url.as_ref())?; + + // Handle file:// URLs as local paths + if parsed.scheme() == "file" + && let Ok(path) = parsed.to_file_path() + { + return Ok(Self::Local(path)); + } + + Ok(Self::Url(parsed)) + } +} + +/// Storage provider wrapping an ObjectStore +#[derive(Clone, Debug)] +pub struct StorageProvider { + store: Arc, + is_local: bool, + /// Path prefix to prepend to all file accesses (for cloud URLs with paths) + prefix: String, + /// Optional metrics if instrumentation is enabled + metrics: Option>, +} + +impl StorageProvider { + /// Create a new storage provider from a location + pub fn new(location: StorageLocation) -> Result { + let (store, is_local, prefix): (Arc, bool, String) = match location { + StorageLocation::Local(path) => { + // Create directory if it doesn't exist + std::fs::create_dir_all(&path)?; + ( + Arc::new(LocalFileSystem::new_with_prefix(path)?), + true, + String::new(), + ) + } + StorageLocation::Url(url) => { + // Extract path prefix from URL (e.g., s3://bucket/prefix/path -> "prefix/path") + let prefix = url.path().trim_start_matches('/').to_string(); + (block_on_or_in_place(parse_url(&url))?, false, prefix) + } + }; + + Ok(Self { + store, + is_local, + prefix, + metrics: None, + }) + } + + /// Enable instrumentation for this storage provider + /// + /// This wraps the underlying ObjectStore with an InstrumentedStore that tracks: + /// - Number of GET/PUT/HEAD operations + /// - Bytes transferred + /// - Time spent in each operation + /// + /// Returns a new StorageProvider with instrumentation enabled. 
+ pub fn with_instrumentation(self, label: impl Into) -> Self { + let metrics = Arc::new(StorageMetrics::new()); + let instrumented = Arc::new(InstrumentedStore::new( + self.store.clone(), + metrics.clone(), + label.into(), + )); + + Self { + store: instrumented, + is_local: self.is_local, + prefix: self.prefix, + metrics: Some(metrics), + } + } + + /// Get metrics if instrumentation is enabled + pub fn metrics(&self) -> Option<&StorageMetrics> { + self.metrics.as_ref().map(|m| m.as_ref()) + } + + /// Print metrics report if instrumentation is enabled + pub fn print_metrics(&self, label: &str) { + if let Some(metrics) = &self.metrics { + metrics.snapshot().print_report(label); + } + } + + /// Check if this storage provider is backed by local filesystem + /// + /// Returns true for local paths, false for cloud storage (S3, GCS, Azure) + pub fn is_local(&self) -> bool { + self.is_local + } + + /// Build full path by prepending prefix (for cloud storage) + pub fn build_path(&self, path: &str) -> String { + if self.prefix.is_empty() { + path.to_string() + } else { + format!("{}/{}", self.prefix, path) + } + } + + /// Read a file as bytes (async version) + #[instrument(skip(self), fields(path = %path))] + pub async fn read_bytes_async(&self, path: &str) -> Result, SerializationError> { + let full_path = self.build_path(path); + info!("Reading from full path: {}", full_path); + let object_path = ObjectPath::from(full_path.as_str()); + info!("Object path: {:?}", object_path); + let result = match self.store.get(&object_path).await { + Ok(res) => res, + Err(e) => { + // Categorize the error properly based on what it actually is + let error_str = e.to_string(); + info!("Error getting object: {:?}", e); + + // Check if it's an authentication/permission error + let error_kind = if error_str.contains("ExpiredToken") + || error_str.contains("The provided token has expired") + || error_str.contains("Access Denied") + || error_str.contains("InvalidAccessKeyId") + || error_str.contains("SignatureDoesNotMatch") + || error_str.contains("Forbidden") + || error_str.contains("status: 401") + || error_str.contains("status: 403") + { + std::io::ErrorKind::PermissionDenied + } else if error_str.contains("status: 404") || error_str.contains("NotFound") { + std::io::ErrorKind::NotFound + } else { + // For other errors, use Other to indicate an unexpected issue + std::io::ErrorKind::Other + }; + + return Err(SerializationError::Io(std::io::Error::new( + error_kind, + format!("Failed to read object at path {}: {}", full_path, e), + ))); + } + }; + let bytes = result.bytes().await?; + Ok(bytes.to_vec()) + } + + /// Read a file as bytes (blocking version) + pub fn read_bytes(&self, path: &str) -> Result, SerializationError> { + block_on_or_in_place(self.read_bytes_async(path)) + } + + /// Read a file as string (async version) + pub async fn read_to_string_async(&self, path: &str) -> Result { + let bytes = self.read_bytes_async(path).await?; + String::from_utf8(bytes).map_err(|e| { + SerializationError::Io(std::io::Error::new(std::io::ErrorKind::InvalidData, e)) + }) + } + + /// Read a file as string (blocking version) + pub fn read_to_string(&self, path: &str) -> Result { + block_on_or_in_place(self.read_to_string_async(path)) + } + + /// Write bytes to a file + pub fn write_bytes(&self, path: &str, data: Vec) -> Result<(), SerializationError> { + let full_path = self.build_path(path); + let object_path = ObjectPath::from(full_path.as_str()); + block_on_or_in_place(async { + self.store + .put(&object_path, 
Bytes::from(data).into()) + .await?; + Ok(()) + }) + } + + /// Get the underlying ObjectStore + pub(crate) fn as_object_store(&self) -> Arc { + self.store.clone() + } + + /// Ensure directory exists (no-op for object stores, but kept for API consistency) + pub fn ensure_directory(&self, _path: &str) -> Result<(), SerializationError> { + // Object stores don't have directories, so this is a no-op + // Local filesystem handles this via put() creating parent "directories" as needed + Ok(()) + } + + /// Read indexed peaks from a parquet file + /// + /// Uses `ParquetObjectReader` with async streaming for both local and cloud storage. + /// This provides efficient reading without intermediate copies or temp files. + /// + /// # Arguments + /// * `path` - Relative path to the parquet file + /// + /// # Returns + /// Vector of indexed peaks loaded from the parquet file + #[instrument(skip(self), fields(path = %path))] + pub fn read_parquet_peaks( + &self, + path: &str, + ) -> Result>, SerializationError> { + use futures::stream::StreamExt; + use parquet::arrow::ParquetRecordBatchStreamBuilder; + use parquet::arrow::async_reader::ParquetObjectReader; + + let full_path = self.build_path(path); + let object_path = ObjectPath::from(full_path.as_str()); + + block_on_or_in_place(async { + // Create ParquetObjectReader - works for both local and cloud storage + let reader = ParquetObjectReader::new(self.store.clone(), object_path); + + // Build stream + let builder = ParquetRecordBatchStreamBuilder::new(reader).await?; + let mut stream = builder.build()?; + + let mut peaks = Vec::new(); + + // Stream record batches + while let Some(batch_result) = stream.next().await { + let batch = batch_result?; + // Convert batch to peaks (reuse logic from serialization module) + peaks.extend(crate::serialization::batch_to_peaks::(&batch)?); + } + + Ok(peaks) + }) + } +} + +/// Parse a URL into an ObjectStore +/// +/// Supports: +/// - s3://bucket/prefix (requires "aws" feature) +/// - gs://bucket/prefix (requires "gcp" feature) +/// - az://container/prefix or azure://container/prefix (requires "azure" feature) +async fn parse_url(url: &url::Url) -> Result, SerializationError> { + match url.scheme() { + #[cfg(feature = "aws")] + "s3" => { + use aws_config::BehaviorVersion; + use aws_credential_types::provider::ProvideCredentials; + use object_store::aws::AmazonS3Builder; + + let bucket = url.host_str().ok_or_else(|| { + SerializationError::Io(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "Missing bucket in S3 URL", + )) + })?; + info!("Creating S3 ObjectStore for bucket: {}", bucket); + + // 1. Load the AWS configuration from the environment (handles Profile, MFA, SSO, etc.) + let sdk_config = aws_config::load_defaults(BehaviorVersion::latest()).await; + + // 2. Extract the credentials from the resolved config + // (This executes the chain: Env Vars -> Profile -> Web Identity -> IMDS) + let credentials_provider = sdk_config + .credentials_provider() + .expect("No credentials provider found"); + let credentials = credentials_provider + .provide_credentials() + .await + .map_err(|e| { + SerializationError::Io(std::io::Error::new(std::io::ErrorKind::Other, e)) + })?; + + // 3. 
Initialize the builder using the resolved credentials + let mut builder = AmazonS3Builder::new() + .with_bucket_name(bucket) + .with_region( + sdk_config + .region() + .map(|r| r.as_ref()) + .unwrap_or("us-west-2"), + ) + .with_access_key_id(credentials.access_key_id()) + .with_secret_access_key(credentials.secret_access_key()); + + // 4. Important: Attach the session token if it exists (Critical for MFA/SSO) + if let Some(token) = credentials.session_token() { + builder = builder.with_token(token); + } + + Ok(Arc::new(builder.build()?)) + } + + #[cfg(feature = "gcp")] + "gs" => { + use object_store::gcp::GoogleCloudStorageBuilder; + + let bucket = url.host_str().ok_or_else(|| { + SerializationError::Io(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "Missing bucket in GCS URL", + )) + })?; + + Ok(Arc::new( + GoogleCloudStorageBuilder::from_env() + .with_bucket_name(bucket) + .build()?, + )) + } + + #[cfg(feature = "azure")] + "az" | "azure" => { + use object_store::azure::MicrosoftAzureBuilder; + + let container = url.host_str().ok_or_else(|| { + SerializationError::Io(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "Missing container in Azure URL", + )) + })?; + + Ok(Arc::new( + MicrosoftAzureBuilder::from_env() + .with_container_name(container) + .build()?, + )) + } + + scheme => + { + #[allow(unreachable_code)] + Err(SerializationError::Io(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "Unsupported URL scheme: '{}'. Available schemes: {}", + scheme, + { + let schemes = ["file"]; + #[cfg(feature = "aws")] + let schemes = [schemes.as_slice(), &["s3"]].concat(); + #[cfg(feature = "gcp")] + let schemes = [schemes.as_slice(), &["gs"]].concat(); + #[cfg(feature = "azure")] + let schemes = [schemes.as_slice(), &["az/azure"]].concat(); + schemes.join(", ") + } + ), + ))) + } + } +} diff --git a/rust/timscentroid/tests/lazy_loading_tests.rs b/rust/timscentroid/tests/lazy_loading_tests.rs new file mode 100644 index 0000000..cdb7f17 --- /dev/null +++ b/rust/timscentroid/tests/lazy_loading_tests.rs @@ -0,0 +1,493 @@ +use half::f16; +use timscentroid::StorageLocation; +use timscentroid::indexing::{ + IndexedPeak, + IndexedPeakGroup, + IndexedTimstofPeaks, +}; +use timscentroid::lazy::LazyIndexedTimstofPeaks; +use timscentroid::rt_mapping::{ + CycleToRTMapping, + MS1CycleIndex, + RTIndex, +}; +use timscentroid::utils::OptionallyRestricted::*; +use timscentroid::utils::TupleRange; + +// For now, we'll skip MS2 geometry tests since QuadrupoleIsolationScheme +// doesn't have a public constructor. We can test MS1 queries instead. 
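+
+// A hypothetical cloud round-trip (not exercised here: it needs credentials and a real
+// bucket) would mirror the local tests below, roughly:
+//   let location = StorageLocation::from_url("s3://example-bucket/indexed_peaks")?; // bucket name illustrative
+//   index.save_to_storage(location.clone(), Default::default())?;
+//   let _lazy = LazyIndexedTimstofPeaks::load_from_storage(location)?;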
+ +#[test] +fn test_small_dataset_eager_vs_lazy_ms1() { + // Create a small, controlled dataset + let mut peaks = Vec::new(); + + // Add 10 peaks across different m/z ranges + for i in 0..10 { + peaks.push(IndexedPeak { + mz: 400.0 + (i as f32) * 10.0, // 400, 410, 420, ..., 490 + intensity: 100.0 * (i as f32 + 1.0), + mobility_ook0: f16::from_f32(1.0 + (i as f32) * 0.01), + cycle_index: MS1CycleIndex::new(i % 3), // Cycles 0, 1, 2 + }); + } + + let cycle_to_rt_ms = vec![0, 100, 200]; // 3 cycles + let bucket_size = 4; // Small buckets: 4 peaks each + + // Build MS1 group using canonical constructor + let (ms1_group, stats) = + IndexedPeakGroup::testing_new(peaks, CycleToRTMapping::new(cycle_to_rt_ms), bucket_size); + + println!( + "Created MS1 group: {} peaks, {} buckets", + stats.num_peaks, stats.num_buckets + ); + + // Create index with empty MS2 (test MS1 only) + let index = IndexedTimstofPeaks::from_parts(ms1_group, vec![]); + + // Serialize to temp directory + let temp_dir = std::env::temp_dir().join("timscentroid_test_lazy_ms1"); + if temp_dir.exists() { + std::fs::remove_dir_all(&temp_dir).unwrap(); + } + + let mut config = timscentroid::serialization::SerializationConfig::default(); + // Just some weird number ... + config.row_group_size = 3; + + index + .save_to_directory_with_config(&temp_dir, config) + .unwrap(); + + // Load eagerly + let location = StorageLocation::from_path(&temp_dir); + let eager_index = IndexedTimstofPeaks::load_from_storage(location).unwrap(); + + // Load lazily + let location = StorageLocation::from_path(&temp_dir); + let lazy_index = LazyIndexedTimstofPeaks::load_from_storage(location).unwrap(); + + // Query both with same parameters + let mz_range = TupleRange::try_new(420.0, 460.0).unwrap(); // Should match peaks at 420, 430, 440, 450, 460 + + // Original index results + let orig_index_results = index + .query_peaks_ms1(mz_range, Unrestricted, Unrestricted) + .collect::>(); + + assert!(orig_index_results.iter().any(|p| p.mz == 420.0)); + assert!(orig_index_results.iter().any(|p| p.mz == 430.0)); + assert!(orig_index_results.iter().any(|p| p.mz == 440.0)); + assert!(orig_index_results.iter().any(|p| p.mz == 450.0)); + assert!(orig_index_results.iter().any(|p| p.mz == 460.0)); + assert_eq!( + orig_index_results.len(), + 5, + "Should find 5 peaks in original index" + ); + + // Query MS1 with eager + let eager_results: Vec<_> = eager_index + .query_peaks_ms1(mz_range, Unrestricted, Unrestricted) + .collect(); + + // Query MS1 with lazy + let lazy_results: Vec<_> = lazy_index + .query_peaks_ms1(mz_range, Unrestricted, Unrestricted) + .collect(); + + println!("Original index found {} peaks", orig_index_results.len()); + println!("Eager found {} peaks", eager_results.len()); + println!("Lazy found {} peaks", lazy_results.len()); + + // Print details for debugging + println!("\nEager peaks:"); + for peak in &eager_results { + println!( + " mz={:.1}, intensity={:.1}, mobility={:.3}, cycle={:?}", + peak.mz, peak.intensity, peak.mobility_ook0, peak.cycle_index + ); + } + + println!("\nLazy peaks:"); + for peak in &lazy_results { + println!( + " mz={:.1}, intensity={:.1}, mobility={:.3}, cycle={:?}", + peak.mz, peak.intensity, peak.mobility_ook0, peak.cycle_index + ); + } + + // Assert they match + assert_eq!( + eager_results.len(), + lazy_results.len(), + "Eager and lazy should return same number of peaks!" + ); + assert_eq!( + orig_index_results.len(), + lazy_results.len(), + "Lazy results should match original index!" 
+ ); + + // Cleanup + std::fs::remove_dir_all(&temp_dir).unwrap(); +} + +// I could mayube go nuclear and fuzz test it ... just generate random peak sets and ensure +// that the bucket calculations are consistent with each other AND with a naive greedy implementation +// with varying bucket sizes and row group sizes ... + +#[test] +fn test_bucket_boundary_case() { + // This test checks the bucket boundary issue + // If we have 10 peaks and bucket_size=4: + // Bucket 0: peaks 0-3 (mz 400-430) + // Bucket 1: peaks 4-7 (mz 440-470) + // Bucket 2: peaks 8-9 (mz 480-490) + // + // Row group with row_group_size=2M will contain all buckets + // But the bucket calculation might have off-by-one errors + + let peaks: Vec<_> = (0..10) + .map(|i| IndexedPeak { + mz: 400.0 + (i as f32) * 10.0, + intensity: 100.0, + mobility_ook0: f16::from_f32(1.0), + cycle_index: MS1CycleIndex::new(0), + }) + .collect(); + + let cycle_to_rt_ms = vec![0]; + let bucket_size = 4; + + let (_group, stats) = IndexedPeakGroup::testing_new( + peaks, + CycleToRTMapping::::new(cycle_to_rt_ms), + bucket_size, + ); + + // Should have 3 buckets: [0-3], [4-7], [8-9] + assert_eq!(stats.num_buckets, 3, "Should have 3 buckets"); + assert_eq!(stats.num_peaks, 10, "Should have 10 peaks"); + + println!( + "Test passed: {} peaks organized into {} buckets", + stats.num_peaks, stats.num_buckets + ); +} + +#[test] +fn test_storage_abstraction_with_local_filesystem() { + // Test that StorageProvider works with local filesystem + let temp_dir = std::env::temp_dir().join("timscentroid_storage_test"); + if temp_dir.exists() { + std::fs::remove_dir_all(&temp_dir).unwrap(); + } + + // Create test data - small dataset with 5 peaks + let peaks: Vec<_> = (0..5) + .map(|i| IndexedPeak { + mz: 400.0 + (i as f32) * 20.0, // 400, 420, 440, 460, 480 + intensity: 100.0 * (i as f32 + 1.0), + mobility_ook0: f16::from_f32(1.0 + (i as f32) * 0.01), + cycle_index: MS1CycleIndex::new(i % 2), // Cycles 0, 1 + }) + .collect(); + + let cycle_to_rt_ms = vec![0, 100]; + let bucket_size = 2; + + let (ms1_group, stats) = + IndexedPeakGroup::testing_new(peaks, CycleToRTMapping::new(cycle_to_rt_ms), bucket_size); + + println!( + "Created test data: {} peaks, {} buckets", + stats.num_peaks, stats.num_buckets + ); + + let index = IndexedTimstofPeaks::from_parts(ms1_group, vec![]); + + // Test 1: Save using storage abstraction + let location = StorageLocation::from_path(&temp_dir); + index.save_to_storage(location, Default::default()).unwrap(); + + // Verify files were created + assert!(temp_dir.join("metadata.json").exists()); + assert!(temp_dir.join("ms1.parquet").exists()); + + // Test 2: Load using storage abstraction + let location = StorageLocation::from_path(&temp_dir); + let lazy_index = LazyIndexedTimstofPeaks::load_from_storage(location).unwrap(); + + // Test 3: Query and verify results + let mz_range = TupleRange::try_new(420.0, 460.0).unwrap(); + let results: Vec<_> = lazy_index + .query_peaks_ms1(mz_range, Unrestricted, Unrestricted) + .collect(); + + // Should find peaks at 420, 440, 460 + assert_eq!(results.len(), 3, "Should find 3 peaks in range"); + assert!(results.iter().any(|p| (p.mz - 420.0).abs() < 0.01)); + assert!(results.iter().any(|p| (p.mz - 440.0).abs() < 0.01)); + assert!(results.iter().any(|p| (p.mz - 460.0).abs() < 0.01)); + + println!("Storage abstraction test passed!"); + + // Cleanup + std::fs::remove_dir_all(&temp_dir).unwrap(); +} + +#[test] +fn test_url_parsing() { + // Test file:// URL parsing + let location = 
StorageLocation::from_url("file:///tmp/test").unwrap(); + assert!(matches!(location, StorageLocation::Local(_))); + + // Test that invalid URLs are rejected + assert!(StorageLocation::from_url("not-a-url").is_err()); + + println!("URL parsing test passed!"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_async_parquet_querier() { + // Create a small test dataset + let peaks: Vec<_> = (0..10) + .map(|i| IndexedPeak { + mz: 400.0 + (i as f32) * 10.0, + intensity: 100.0, + mobility_ook0: f16::from_f32(1.0), + cycle_index: MS1CycleIndex::new(0), + }) + .collect(); + + let cycle_to_rt_ms = vec![0]; + let bucket_size = 4; + + let (ms1_group, _) = IndexedPeakGroup::testing_new( + peaks, + CycleToRTMapping::::new(cycle_to_rt_ms), + bucket_size, + ); + + let index = IndexedTimstofPeaks::from_parts(ms1_group, vec![]); + + // Serialize to temp directory + let temp_dir = std::env::temp_dir().join("timscentroid_test_async_querier"); + if temp_dir.exists() { + std::fs::remove_dir_all(&temp_dir).unwrap(); + } + + index.save_to_directory(&temp_dir).unwrap(); + + // Test async ParquetQuerier methods + use timscentroid::lazy::query::ParquetQuerier; + use timscentroid::storage::StorageProvider; + + let storage = StorageProvider::new(StorageLocation::from_path(&temp_dir)).unwrap(); + + // Test new_async + let querier = ParquetQuerier::new_async(storage.clone(), "ms1.parquet") + .await + .expect("Failed to create async querier"); + + // Test query_async + let mz_range = 420.0..460.0; + let record_batch = querier + .query_async(mz_range, None) + .await + .expect("Failed to query async"); + + // Verify results + assert!(record_batch.num_rows() > 0, "Should find some peaks"); + println!( + "Async ParquetQuerier test passed: found {} peaks", + record_batch.num_rows() + ); + + // Cleanup + std::fs::remove_dir_all(&temp_dir).unwrap(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_async_ms2_query_vs_sync() { + // Create test data with multiple MS2 groups + let temp_dir = std::env::temp_dir().join("timscentroid_test_async_ms2"); + if temp_dir.exists() { + std::fs::remove_dir_all(&temp_dir).unwrap(); + } + + // Create simple test data (just MS1 for now, as creating MS2 data requires more setup) + let peaks: Vec<_> = (0..50) + .map(|i| IndexedPeak { + mz: 400.0 + (i as f32) * 10.0, + intensity: 100.0 * (i as f32 + 1.0), + mobility_ook0: f16::from_f32(1.0 + (i as f32) * 0.01), + cycle_index: MS1CycleIndex::new(i % 3), + }) + .collect(); + + let cycle_to_rt_ms = vec![0, 100, 200]; + let bucket_size = 4; + + let (ms1_group, _) = + IndexedPeakGroup::testing_new(peaks, CycleToRTMapping::new(cycle_to_rt_ms), bucket_size); + + let index = IndexedTimstofPeaks::from_parts(ms1_group, vec![]); + index.save_to_directory(&temp_dir).unwrap(); + + // Load with lazy index using async version + let location = StorageLocation::from_path(&temp_dir); + let _lazy_index = LazyIndexedTimstofPeaks::load_from_storage_async(location) + .await + .unwrap(); + + // We can't easily test async MS1 queries yet since query_peaks_ms1 returns an iterator + // For now, just verify the index loads correctly in async context + println!("Async lazy loading test passed - index created successfully using async load!"); + + // Note: To fully test async queries, we'd need MS2 data which requires more complex setup + + // Cleanup + std::fs::remove_dir_all(&temp_dir).unwrap(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_concurrent_metadata_caching() { + // This test verifies that metadata is cached and not refetched on 
every query + let temp_dir = std::env::temp_dir().join("timscentroid_test_metadata_cache"); + if temp_dir.exists() { + std::fs::remove_dir_all(&temp_dir).unwrap(); + } + + // Create test data + let peaks: Vec<_> = (0..20) + .map(|i| IndexedPeak { + mz: 400.0 + (i as f32) * 20.0, + intensity: 100.0, + mobility_ook0: f16::from_f32(1.0), + cycle_index: MS1CycleIndex::new(0), + }) + .collect(); + + let (ms1_group, _) = + IndexedPeakGroup::testing_new(peaks, CycleToRTMapping::::new(vec![0]), 4); + + let index = IndexedTimstofPeaks::from_parts(ms1_group, vec![]); + index.save_to_directory(&temp_dir).unwrap(); + + // Test metadata caching + use timscentroid::lazy::query::ParquetQuerier; + use timscentroid::storage::StorageProvider; + + let storage = StorageProvider::new(StorageLocation::from_path(&temp_dir)).unwrap(); + + // Create querier once - metadata fetched here + let querier = ParquetQuerier::new_async(storage.clone(), "ms1.parquet") + .await + .expect("Failed to create querier"); + + // Run 10 queries - metadata should NOT be refetched + for i in 0..10 { + let mz_start = 400.0 + (i as f32) * 40.0; + let mz_end = mz_start + 80.0; + let _ = querier + .query_async(mz_start..mz_end, None) + .await + .expect("Query failed"); + } + + println!("Metadata caching test passed - 10 queries executed with cached metadata!"); + + // Cleanup + std::fs::remove_dir_all(&temp_dir).unwrap(); +} + +#[test] +fn test_nested_path_prefix_handling() { + // This test simulates cloud storage with path prefixes by using nested local directories + // e.g., s3://bucket/prefix/subdir/data -> /tmp/bucket/prefix/subdir/data + + let temp_root = std::env::temp_dir().join("timscentroid_prefix_test"); + let nested_path = temp_root.join("level1/level2/data"); + + // Clean up if exists + if temp_root.exists() { + std::fs::remove_dir_all(&temp_root).unwrap(); + } + + std::fs::create_dir_all(&nested_path).unwrap(); + + // Create test data - small dataset + let peaks: Vec<_> = (0..5) + .map(|i| IndexedPeak { + mz: 600.0 + (i as f32) * 10.0, // 600, 610, 620, 630, 640 + intensity: 100.0 * (i as f32 + 1.0), + mobility_ook0: f16::from_f32(1.0), + cycle_index: MS1CycleIndex::new(0), + }) + .collect(); + + let cycle_to_rt_ms = vec![0]; + let bucket_size = 2; + + let (ms1_group, stats) = + IndexedPeakGroup::testing_new(peaks, CycleToRTMapping::new(cycle_to_rt_ms), bucket_size); + + println!( + "Created test data: {} peaks, {} buckets", + stats.num_peaks, stats.num_buckets + ); + + let index = IndexedTimstofPeaks::from_parts(ms1_group, vec![]); + + // Save to nested path (simulates cloud storage with prefix) + let location = StorageLocation::from_path(&nested_path); + index.save_to_storage(location, Default::default()).unwrap(); + + // Verify files were created in the correct nested location + assert!( + nested_path.join("metadata.json").exists(), + "metadata.json should exist" + ); + assert!( + nested_path.join("ms1.parquet").exists(), + "ms1.parquet should exist" + ); + + // Load from nested path - this tests that prefix handling works correctly + let location = StorageLocation::from_path(&nested_path); + let lazy_index = + LazyIndexedTimstofPeaks::load_from_storage(location).expect("Should load from nested path"); + + // Query peaks - this is where prefix handling is critical for parquet file access + let mz_range = TupleRange::try_new(610.0, 630.0).unwrap(); + let results: Vec<_> = lazy_index + .query_peaks_ms1(mz_range, Unrestricted, Unrestricted) + .collect(); + + // Should find peaks at 610, 620, 630 + assert_eq!(results.len(), 
3, "Should find 3 peaks in range"); + assert!( + results.iter().any(|p| (p.mz - 610.0).abs() < 0.01), + "Should find peak at 610" + ); + assert!( + results.iter().any(|p| (p.mz - 620.0).abs() < 0.01), + "Should find peak at 620" + ); + assert!( + results.iter().any(|p| (p.mz - 630.0).abs() < 0.01), + "Should find peak at 630" + ); + + println!( + "Path prefix test passed! Successfully loaded and queried from nested path: {:?}", + nested_path + ); + + // Cleanup + std::fs::remove_dir_all(&temp_root).unwrap(); +} diff --git a/rust/timsquery/src/errors.rs b/rust/timsquery/src/errors.rs index a6e36cb..2df6064 100644 --- a/rust/timsquery/src/errors.rs +++ b/rust/timsquery/src/errors.rs @@ -1,4 +1,5 @@ use std::fmt::Display; +use timscentroid::serialization::SerializationError; use timsrust::{ TimsRustError, TimsTofPathError, @@ -28,6 +29,7 @@ pub enum DataReadingError { UnsupportedDataError(UnsupportedDataError), TimsTofPathError(TimsTofPathError), TimsRustError(TimsRustError), // Why doesnt timsrust error derive clone? + SerializationError(SerializationError), } impl From for DataReadingError { @@ -36,9 +38,28 @@ impl From for DataReadingError { } } +// Note: Can't implement From due to blanket impl conflict +// Use DataReadingError::SerializationError(e) directly instead + #[derive(Debug)] pub enum UnsupportedDataError { NoMS2DataError, + CloudRawDataNotSupported { url: String, suggestion: String }, +} + +impl Display for UnsupportedDataError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NoMS2DataError => write!(f, "No MS2 data found"), + Self::CloudRawDataNotSupported { url, suggestion } => { + write!( + f, + "Cannot read raw .d files from cloud storage: {}\n\n{}", + url, suggestion + ) + } + } + } } #[derive(Debug)] diff --git a/rust/timsquery/src/models/aggregators/chromatogram_agg.rs b/rust/timsquery/src/models/aggregators/chromatogram_agg.rs index 5ad6256..03a6bc0 100644 --- a/rust/timsquery/src/models/aggregators/chromatogram_agg.rs +++ b/rust/timsquery/src/models/aggregators/chromatogram_agg.rs @@ -11,6 +11,11 @@ use crate::{ TimsElutionGroup, ValueLike, }; +use timscentroid::rt_mapping::{ + CycleToRTMapping, + MS1CycleIndex, + RTIndex, // Trait needed for the index method. 
+}; use timscentroid::utils::TupleRange; #[derive(Debug, Clone, Serialize)] @@ -25,30 +30,31 @@ impl ChromatogramCollector { pub fn new( eg: TimsElutionGroup, rt_range_ms: TupleRange, - ref_rt_ms: &[u32], + ref_rt_ms: &CycleToRTMapping, ) -> Result { // We binary search the start and end rt to calculate the length // and the offset of the reference rt array - let start = ref_rt_ms.partition_point(|&rt| rt < rt_range_ms.start()); - let end = ref_rt_ms.partition_point(|&rt| rt <= rt_range_ms.end()); - let length = end - start; + let start = ref_rt_ms.ms_to_closest_index(rt_range_ms.start()); + let end = ref_rt_ms.ms_to_closest_index(rt_range_ms.end()); + let num_cycles = end.index() - start.index() + 1; let precursor_order: Vec<_> = eg.iter_precursors().collect(); let fragment_order: Vec<_> = eg .iter_fragments_refs() .map(|(k, v)| (k.clone(), *v)) .collect(); - if precursor_order.is_empty() && fragment_order.is_empty() { return Err(DataProcessingError::ExpectedNonEmptyData); } - if length == 0 { + if num_cycles == 0 { return Err(DataProcessingError::ExpectedNonEmptyData); } - let precursors = MzMajorIntensityArray::try_new_empty(precursor_order, length, start) - .expect("Already checked non-empty"); - let fragments = MzMajorIntensityArray::try_new_empty(fragment_order, length, start)?; + let precursors = + MzMajorIntensityArray::try_new_empty(precursor_order, num_cycles, start.index()) + .expect("Already checked non-empty"); + let fragments = + MzMajorIntensityArray::try_new_empty(fragment_order, num_cycles, start.index())?; Ok(Self { eg, precursors, @@ -57,6 +63,39 @@ impl ChromatogramCollector { }) } + pub fn try_reset_with( + &mut self, + eg: TimsElutionGroup, + rt_range_ms: TupleRange, + ref_rt_ms: &CycleToRTMapping, + ) -> Result<(), DataProcessingError> { + // We binary search the start and end rt to calculate the length + // and the offset of the reference rt array + let start = ref_rt_ms.ms_to_closest_index(rt_range_ms.start()); + let end = ref_rt_ms.ms_to_closest_index(rt_range_ms.end()); + let num_cycles = end.index() - start.index() + 1; + + let precursor_order: Vec<_> = eg.iter_precursors().collect(); + let fragment_order: Vec<_> = eg + .iter_fragments_refs() + .map(|(k, v)| (k.clone(), *v)) + .collect(); + if precursor_order.is_empty() && fragment_order.is_empty() { + return Err(DataProcessingError::ExpectedNonEmptyData); + } + if num_cycles == 0 { + return Err(DataProcessingError::ExpectedNonEmptyData); + } + + self.eg = eg; + self.rt_range_ms = rt_range_ms; + self.precursors + .clear_with_order(precursor_order, num_cycles, start.index()); + self.fragments + .clear_with_order(fragment_order, num_cycles, start.index()); + Ok(()) + } + pub fn iter_mut_precursors( &mut self, ) -> impl Iterator)> { @@ -146,7 +185,7 @@ mod tests { .try_build() .expect("I passed valid vec lengths!"); - let rt_ms: Arc<[u32]> = vec![10, 20].into(); + let rt_ms = CycleToRTMapping::new(vec![10, 20]); let mut collector = ChromatogramCollector::::new( eg, TupleRange::try_new(9, 20).unwrap(), diff --git a/rust/timsquery/src/models/aggregators/spectrum_agg.rs b/rust/timsquery/src/models/aggregators/spectrum_agg.rs index f3dc7a9..39d94fa 100644 --- a/rust/timsquery/src/models/aggregators/spectrum_agg.rs +++ b/rust/timsquery/src/models/aggregators/spectrum_agg.rs @@ -1,6 +1,7 @@ use serde::Serialize; use serde::ser::SerializeStruct; use timscentroid::indexing::IndexedPeak; +use timscentroid::rt_mapping::RTIndex; use crate::traits::queriable_data::PeakAddable; use crate::utils::streaming_calculators::{ @@ 
-118,8 +119,8 @@ impl MzMobilityStatsCollector { } } -impl AddAssign for MzMobilityStatsCollector { - fn add_assign(&mut self, other: IndexedPeak) { +impl AddAssign> for MzMobilityStatsCollector { + fn add_assign(&mut self, other: IndexedPeak) { self.add( other.intensity as f64, other.mz as f64, @@ -128,7 +129,7 @@ impl AddAssign for MzMobilityStatsCollector { } } -impl PeakAddable for MzMobilityStatsCollector {} +impl PeakAddable for MzMobilityStatsCollector {} impl Add for MzMobilityStatsCollector { type Output = Self; diff --git a/rust/timsquery/src/models/base/mz_rt_arrays.rs b/rust/timsquery/src/models/base/mz_rt_arrays.rs index deb5efe..7ed190d 100644 --- a/rust/timsquery/src/models/base/mz_rt_arrays.rs +++ b/rust/timsquery/src/models/base/mz_rt_arrays.rs @@ -178,6 +178,20 @@ impl MzMajorIntensityArray { } } + pub fn clear_with_order( + &mut self, + order: Vec<(K, f64)>, + num_cycles: usize, + cycle_offset: usize, + ) { + let minor_dim = num_cycles; + let major_dim = order.len(); + self.arr + .reset_with_value(minor_dim, major_dim, V::default()); + self.mz_order = order; + self.cycle_offset = cycle_offset; + } + /// The main purpose of this function is to preserve the allocation /// of the array but replace the data contained in it. pub fn try_reset_with( diff --git a/rust/timsquery/src/models/elution_group.rs b/rust/timsquery/src/models/elution_group.rs index 9667b42..9ca6e3f 100644 --- a/rust/timsquery/src/models/elution_group.rs +++ b/rust/timsquery/src/models/elution_group.rs @@ -4,7 +4,6 @@ use crate::models::elution_group::tims_elution_group_builder::{ }; use crate::traits::KeyLike; use crate::utils::constants::NEUTRON_MASS; -use bon::builder; use serde::{ Deserialize, Serialize, @@ -20,9 +19,13 @@ use tinyvec::TinyVec; #[builder(finish_fn(vis = "", name = try_build_internal))] pub struct TimsElutionGroup { id: u64, + #[serde(alias = "mobility")] mobility_ook0: f32, rt_seconds: f32, + #[serde(alias = "precursor")] + #[serde(alias = "precursor_mz")] precursor_mono_mz: f64, + #[serde(alias = "charge")] precursor_charge: u8, // The baseline size of TinyVec is 24 bits. (due to the stack size of a Vec) @@ -45,8 +48,10 @@ pub struct TimsElutionGroup { // the concrete type TimsElutionGroup but 408 bytes for TimsElutionGroup // // In theory I can make this lighter if it was a genetic ... 
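+    // As with the precursor fields above, the serde aliases added below let libraries that use
+    // alternative field names (e.g. "fragments", "precursor_isotopes") deserialize into this
+    // struct without a custom Deserialize impl.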
+ #[serde(alias = "fragments")] fragment_mzs: Vec, fragment_labels: TinyVec<[T; 13]>, + #[serde(alias = "precursor_isotopes")] precursor_labels: TinyVec<[i8; 13]>, } @@ -105,6 +110,11 @@ impl TimsElutionGroup { self.precursor_mono_mz } + pub fn set_precursor_labels(&mut self, labels: impl Iterator) { + self.precursor_labels.clear(); + self.precursor_labels.extend(labels); + } + // NOTE: I am thinking about removing this and leave the rest as a trait pub fn set_rt_seconds(&mut self, rt_seconds: f32) { self.rt_seconds = rt_seconds; @@ -205,4 +215,20 @@ impl TimsElutionGroup { ..self } } + + pub fn cast(&self, f: impl Fn(&T) -> U) -> TimsElutionGroup { + let fragment_labels_converted: TinyVec<[U; 13]> = + self.fragment_labels.iter().map(f).collect(); + + TimsElutionGroup { + id: self.id, + mobility_ook0: self.mobility_ook0, + rt_seconds: self.rt_seconds, + precursor_mono_mz: self.precursor_mono_mz, + precursor_charge: self.precursor_charge, + fragment_mzs: self.fragment_mzs.clone(), + fragment_labels: fragment_labels_converted, + precursor_labels: self.precursor_labels.clone(), + } + } } diff --git a/rust/timsquery/src/models/indexed_data.rs b/rust/timsquery/src/models/indexed_data.rs index f0baa6b..5f72b2e 100644 --- a/rust/timsquery/src/models/indexed_data.rs +++ b/rust/timsquery/src/models/indexed_data.rs @@ -68,66 +68,185 @@ use crate::models::aggregators::{ PointIntensityAggregator, SpectralCollector, }; +use crate::serde::IndexedPeaksHandle; use crate::traits::{ PeakAddable, QueriableData, }; use crate::{ KeyLike, + OptionallyRestricted, Tolerance, }; use half::f16; use timscentroid::IndexedTimstofPeaks; +use timscentroid::rt_mapping::{ + MS1CycleIndex, + RTIndex, + WindowCycleIndex, +}; use timscentroid::utils::OptionallyRestricted::{ Restricted, Unrestricted, }; use timscentroid::utils::TupleRange; -impl QueriableData> for IndexedTimstofPeaks { - fn add_query(&self, aggregator: &mut PointIntensityAggregator, tolerance: &Tolerance) { - let prec_mz_limits: TupleRange = - aggregator.eg.get_precursor_mz_limits().try_into().unwrap(); +/// Encapsulates the query ranges computed from an elution group and tolerance. +/// +/// This struct centralizes the tolerance-to-range conversion logic that's used +/// across all aggregator implementations, reducing code duplication. +struct QueryRanges { + quad_range: TupleRange, + im_range: OptionallyRestricted>, + ms1_cycle_range: OptionallyRestricted>, + ms2_cycle_range: OptionallyRestricted>, +} + +impl QueryRanges { + /// Compute query ranges from an elution group and tolerance. + /// + /// This is the standard conversion used by most aggregators. 
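+    ///
+    /// The `rt_ms_to_cycle` closure maps a retention time in milliseconds to the closest
+    /// MS1 cycle index; callers typically pass `|rt| self.rt_ms_to_cycle_index(rt)`.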
+ fn from_elution_group( + aggregator: &A, + tolerance: &Tolerance, + rt_ms_to_cycle: impl Fn(u32) -> MS1CycleIndex, + ) -> Self + where + A: HasElutionGroup, + { + let prec_mz_limits = aggregator.elution_group().get_precursor_mz_limits(); + let quad_range = + tolerance.quad_range_f32((prec_mz_limits.0 as f32, prec_mz_limits.1 as f32)); + let im_range = tolerance.mobility_range_f16(aggregator.elution_group().mobility_ook0()); + let rt_range_milliseconds = + tolerance.rt_range_as_milis(aggregator.elution_group().rt_seconds()); + let ms1_cycle_range = match rt_range_milliseconds { + Restricted(x) => Restricted( + TupleRange::try_new(rt_ms_to_cycle(x.start()), rt_ms_to_cycle(x.end())).unwrap(), + ), + Unrestricted => Unrestricted, + }; + let ms2_cycle_range = ms1_to_ms2_cycle_range(&ms1_cycle_range); + + Self { + quad_range, + im_range, + ms1_cycle_range, + ms2_cycle_range, + } + } + + /// Compute query ranges with RT intersection for ChromatogramCollector. + /// + /// Returns None if the RT ranges don't intersect (early termination case). + fn from_elution_group_with_rt_intersection( + aggregator: &A, + tolerance: &Tolerance, + rt_limits_milis: TupleRange, + rt_ms_to_cycle: impl Fn(u32) -> MS1CycleIndex, + ) -> Option + where + A: HasElutionGroup, + { + let prec_mz_limits = aggregator.elution_group().get_precursor_mz_limits(); let quad_range = - tolerance.quad_range_f32((prec_mz_limits.start() as f32, prec_mz_limits.end() as f32)); - let im_range = tolerance.mobility_range_f16(aggregator.eg.mobility_ook0()); - let rt_range_milliseconds = tolerance.rt_range_as_milis(aggregator.eg.rt_seconds()); - let cycle_range = match rt_range_milliseconds { + tolerance.quad_range_f32((prec_mz_limits.0 as f32, prec_mz_limits.1 as f32)); + let im_range = tolerance.mobility_range_f16(aggregator.elution_group().mobility_ook0()); + + let rt_range_milliseconds = match tolerance + .rt_range_as_milis(aggregator.elution_group().rt_seconds()) + .map(|x| x.try_intercept(rt_limits_milis)) + { + Restricted(Some(x)) => Restricted(x), + Restricted(None) => return None, + Unrestricted => Unrestricted, + }; + + let ms1_cycle_range = match rt_range_milliseconds { Restricted(x) => Restricted( - TupleRange::try_new( - self.rt_ms_to_cycle_index(x.start()), - self.rt_ms_to_cycle_index(x.end()), - ) - .unwrap(), + TupleRange::try_new(rt_ms_to_cycle(x.start()), rt_ms_to_cycle(x.end())).unwrap(), ), Unrestricted => Unrestricted, }; + let ms2_cycle_range = ms1_to_ms2_cycle_range(&ms1_cycle_range); + + Some(Self { + quad_range, + im_range, + ms1_cycle_range, + ms2_cycle_range, + }) + } + + /// Constrain the IM range based on quadrupole isolation geometry. + /// + /// For MS2 queries, the ion mobility range must be further constrained to the + /// intersection of the original IM tolerance and the quadrupole isolation window. + /// This is the repeated pattern in all MS2 query implementations. + fn constrain_im_for_quadrupole( + &self, + quad_info: &timscentroid::geometry::QuadrupoleIsolationScheme, + ) -> OptionallyRestricted> { + self.im_range.map(|x| { + let tmp = quad_info.intersects_ranges( + (self.quad_range.start() as f64, self.quad_range.end() as f64), + (x.start().to_f64(), x.end().to_f64()), + ); + let tmp = + tmp.expect("Since we filtered based on the mz+ims there should always be a match"); + (f16::from_f32(tmp.0 as f32), f16::from_f32(tmp.1 as f32)) + .try_into() + .unwrap() + }) + } +} + +/// Trait to abstract access to the elution group from different aggregator types. 
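+///
+/// Implemented below for `PointIntensityAggregator`, `ChromatogramCollector` and
+/// `SpectralCollector`, so `QueryRanges` can be computed uniformly from any of them.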
+trait HasElutionGroup { + fn elution_group(&self) -> &crate::models::elution_group::TimsElutionGroup; +} + +impl HasElutionGroup for PointIntensityAggregator { + fn elution_group(&self) -> &crate::models::elution_group::TimsElutionGroup { + &self.eg + } +} + +impl HasElutionGroup for ChromatogramCollector { + fn elution_group(&self) -> &crate::models::elution_group::TimsElutionGroup { + &self.eg + } +} + +impl HasElutionGroup + for SpectralCollector +{ + fn elution_group(&self) -> &crate::models::elution_group::TimsElutionGroup { + &self.eg + } +} + +impl QueriableData> for IndexedTimstofPeaks { + fn add_query(&self, aggregator: &mut PointIntensityAggregator, tolerance: &Tolerance) { + let ranges = QueryRanges::from_elution_group(aggregator, tolerance, |rt| { + self.rt_ms_to_cycle_index(rt) + }); + aggregator.eg.iter_precursors().for_each(|(_idx, mz)| { let mz_range = tolerance.mz_range_f32(mz as f32); - self.query_peaks_ms1(mz_range, cycle_range, im_range) + self.query_peaks_ms1(mz_range, ranges.ms1_cycle_range, ranges.im_range) .for_each(|peak| { aggregator.intensity += peak.intensity as f64; }); }); - self.filter_precursor_ranges(quad_range, im_range) + self.filter_precursor_ranges(ranges.quad_range, ranges.im_range) .for_each(|(quad_info, peaks)| { - let constrained_im = im_range.map(|x| { - let tmp = quad_info.intersects_ranges( - (quad_range.start() as f64, quad_range.end() as f64), - (x.start().to_f64(), x.end().to_f64()), - ); - let tmp = tmp.expect( - "Since we filtered based on the mz+ims there should always be a match", - ); - (f16::from_f32(tmp.0 as f32), f16::from_f32(tmp.1 as f32)) - .try_into() - .unwrap() - }); + let constrained_im = ranges.constrain_im_for_quadrupole(quad_info); aggregator.eg.iter_fragments_refs().for_each(|(_idx, mz)| { let mz_range = tolerance.mz_range_f32(*mz as f32); peaks - .query_peaks(mz_range, cycle_range, constrained_im) + .query_peaks(mz_range, ranges.ms2_cycle_range, constrained_im) .for_each(|peak| { aggregator.intensity += peak.intensity as f64; }); @@ -138,52 +257,26 @@ impl QueriableData> for IndexedTimstof impl QueriableData> for IndexedTimstofPeaks { fn add_query(&self, aggregator: &mut ChromatogramCollector, tolerance: &Tolerance) { - let agg_rt_limits_seconds = aggregator.rt_range_milis(); - let prec_mz_limits = aggregator.eg.get_precursor_mz_limits(); - let quad_range = - tolerance.quad_range_f32((prec_mz_limits.0 as f32, prec_mz_limits.1 as f32)); - let im_range: timscentroid::utils::OptionallyRestricted> = - tolerance.mobility_range_f16(aggregator.eg.mobility_ook0()); - let rt_range_milliseconds = match tolerance - .rt_range_as_milis(aggregator.eg.rt_seconds()) - .map(|x| x.try_intercept(agg_rt_limits_seconds)) - { - Restricted(Some(x)) => Restricted(x), - Restricted(None) => return, - Unrestricted => Unrestricted, - }; - let cycle_range = match rt_range_milliseconds { - Restricted(x) => Restricted( - TupleRange::try_new( - self.rt_ms_to_cycle_index(x.start()), - self.rt_ms_to_cycle_index(x.end()), - ) - .unwrap(), - ), - Unrestricted => Unrestricted, + let Some(ranges) = QueryRanges::from_elution_group_with_rt_intersection( + aggregator, + tolerance, + aggregator.rt_range_milis(), + |rt| self.rt_ms_to_cycle_index(rt), + ) else { + return; // No RT intersection, early exit }; + aggregator .iter_mut_precursors() .for_each(|((_idx, mz), mut chr)| { let mz_range = tolerance.mz_range_f32(*mz as f32); - self.query_peaks_ms1(mz_range, cycle_range, im_range) - .for_each(|peak| chr.add_at_index(peak.cycle_index, peak.intensity)); + 
self.query_peaks_ms1(mz_range, ranges.ms1_cycle_range, ranges.im_range) + .for_each(|peak| chr.add_at_index(peak.cycle_index.as_u32(), peak.intensity)); }); - self.filter_precursor_ranges(quad_range, im_range) + self.filter_precursor_ranges(ranges.quad_range, ranges.im_range) .for_each(|(quad_info, peaks)| { - let constrained_im = im_range.map(|x| { - let tmp = quad_info.intersects_ranges( - (quad_range.start() as f64, quad_range.end() as f64), - (x.start().to_f64(), x.end().to_f64()), - ); - let tmp = tmp.expect( - "Since we filtered based on the mz+ims there should always be a match", - ); - (f16::from_f32(tmp.0 as f32), f16::from_f32(tmp.1 as f32)) - .try_into() - .unwrap() - }); + let constrained_im = ranges.constrain_im_for_quadrupole(quad_info); aggregator .iter_mut_fragments() .for_each(|((_idx, mz), mut chr)| { @@ -193,9 +286,9 @@ impl QueriableData> for IndexedTimst let mz_range = tolerance.mz_range_f32(*mz as f32); peaks - .query_peaks(mz_range, cycle_range, constrained_im) + .query_peaks(mz_range, ranges.ms2_cycle_range, constrained_im) .for_each(|x| { - chr.add_at_index(x.cycle_index, x.intensity); + chr.add_at_index(x.cycle_index.as_u32(), x.intensity); }); }); }) @@ -204,45 +297,23 @@ impl QueriableData> for IndexedTimst impl QueriableData> for IndexedTimstofPeaks { fn add_query(&self, aggregator: &mut SpectralCollector, tolerance: &Tolerance) { - let prec_mz_limits = aggregator.eg.get_precursor_mz_limits(); - let quad_range = - tolerance.quad_range_f32((prec_mz_limits.0 as f32, prec_mz_limits.1 as f32)); - let im_range = tolerance.mobility_range_f16(aggregator.eg.mobility_ook0()); - let rt_range_milliseconds = tolerance.rt_range_as_milis(aggregator.eg.rt_seconds()); - let cycle_range = match rt_range_milliseconds { - Restricted(x) => Restricted( - TupleRange::try_new( - self.rt_ms_to_cycle_index(x.start()), - self.rt_ms_to_cycle_index(x.end()), - ) - .unwrap(), - ), - Unrestricted => Unrestricted, - }; + let ranges = QueryRanges::from_elution_group(aggregator, tolerance, |rt| { + self.rt_ms_to_cycle_index(rt) + }); + aggregator .iter_mut_precursors() .for_each(|((_idx, mz), ion)| { let mz_range = tolerance.mz_range_f32(mz as f32); - self.query_peaks_ms1(mz_range, cycle_range, im_range) + self.query_peaks_ms1(mz_range, ranges.ms1_cycle_range, ranges.im_range) .for_each(|peak| { *ion += peak.intensity; }); }); - self.filter_precursor_ranges(quad_range, im_range) + self.filter_precursor_ranges(ranges.quad_range, ranges.im_range) .for_each(|(quad_info, peaks)| { - let constrained_im = im_range.map(|x| { - let tmp = quad_info.intersects_ranges( - (quad_range.start() as f64, quad_range.end() as f64), - (x.start().to_f64(), x.end().to_f64()), - ); - let tmp = tmp.expect( - "Since we filtered based on the mz+ims there should always be a match", - ); - (f16::from_f32(tmp.0 as f32), f16::from_f32(tmp.1 as f32)) - .try_into() - .unwrap() - }); + let constrained_im = ranges.constrain_im_for_quadrupole(quad_info); aggregator .iter_mut_fragments() .for_each(|((_idx, mz), ion)| { @@ -251,7 +322,7 @@ impl QueriableData> for IndexedTimstofPe // TODO: Fix later ... let mz_range = tolerance.mz_range_f32(*mz as f32); peaks - .query_peaks(mz_range, cycle_range, constrained_im) + .query_peaks(mz_range, ranges.ms2_cycle_range, constrained_im) .for_each(|x| { *ion += x.intensity; }); @@ -260,48 +331,27 @@ impl QueriableData> for IndexedTimstofPe } } -// This is some ugly copy-pasted code but I will fix it later if I have to ... 
-impl QueriableData> for IndexedTimstofPeaks { +impl + PeakAddable> + QueriableData> for IndexedTimstofPeaks +{ fn add_query(&self, aggregator: &mut SpectralCollector, tolerance: &Tolerance) { - let prec_mz_limits = aggregator.eg.get_precursor_mz_limits(); - let quad_range = - tolerance.quad_range_f32((prec_mz_limits.0 as f32, prec_mz_limits.1 as f32)); - let im_range = tolerance.mobility_range_f16(aggregator.eg.mobility_ook0()); - let rt_range_milliseconds = tolerance.rt_range_as_milis(aggregator.eg.rt_seconds()); - let cycle_range = match rt_range_milliseconds { - Restricted(x) => Restricted( - TupleRange::try_new( - self.rt_ms_to_cycle_index(x.start()), - self.rt_ms_to_cycle_index(x.end()), - ) - .unwrap(), - ), - Unrestricted => Unrestricted, - }; + let ranges = QueryRanges::from_elution_group(aggregator, tolerance, |rt| { + self.rt_ms_to_cycle_index(rt) + }); + aggregator .iter_mut_precursors() .for_each(|((_idx, mz), ion)| { let mz_range = tolerance.mz_range_f32(mz as f32); - self.query_peaks_ms1(mz_range, cycle_range, im_range) + self.query_peaks_ms1(mz_range, ranges.ms1_cycle_range, ranges.im_range) .for_each(|peak| { *ion += *peak; }); }); - self.filter_precursor_ranges(quad_range, im_range) + self.filter_precursor_ranges(ranges.quad_range, ranges.im_range) .for_each(|(quad_info, peaks)| { - let constrained_im = im_range.map(|x| { - let tmp = quad_info.intersects_ranges( - (quad_range.start() as f64, quad_range.end() as f64), - (x.start().to_f64(), x.end().to_f64()), - ); - let tmp = tmp.expect( - "Since we filtered based on the mz+ims there should always be a match", - ); - (f16::from_f32(tmp.0 as f32), f16::from_f32(tmp.1 as f32)) - .try_into() - .unwrap() - }); + let constrained_im = ranges.constrain_im_for_quadrupole(quad_info); aggregator .iter_mut_fragments() .for_each(|((_idx, mz), ion)| { @@ -310,7 +360,7 @@ impl QueriableData> for In // TODO: Fix later ... let mz_range = tolerance.mz_range_f32(*mz as f32); peaks - .query_peaks(mz_range, cycle_range, constrained_im) + .query_peaks(mz_range, ranges.ms2_cycle_range, constrained_im) .for_each(|x| { *ion += *x; }); @@ -318,3 +368,83 @@ impl QueriableData> for In }); } } + +fn ms1_to_ms2_cycle_range( + cycle_range: &OptionallyRestricted>, +) -> OptionallyRestricted> { + // FOR NOW we are assuming that the cycle for MS1 will be the same as MS2 ... + // This is not true in some acquisition schemes but we can fix it later ... + // This expression is pretty verbose ... 
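+    // In other words, a Restricted MS1 cycle range (a..=b) is reinterpreted verbatim as the
+    // window cycle range (a..=b), and Unrestricted stays Unrestricted.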
+ match cycle_range { + Restricted(x) => Restricted( + TupleRange::try_new( + WindowCycleIndex::new(x.start().as_u32()), + WindowCycleIndex::new(x.end().as_u32()), + ) + .unwrap(), + ), + Unrestricted => Unrestricted, + } +} + +impl QueriableData> for IndexedPeaksHandle { + fn add_query(&self, aggregator: &mut ChromatogramCollector, tolerance: &Tolerance) { + match self { + IndexedPeaksHandle::Eager(eager) => { + // Delegate to existing implementation + eager.add_query(aggregator, tolerance) + } + IndexedPeaksHandle::Lazy(lazy) => { + // Use QueryRanges to compute common tolerances, then convert to u32 for lazy API + let Some(ranges) = QueryRanges::from_elution_group_with_rt_intersection( + aggregator, + tolerance, + aggregator.rt_range_milis(), + |rt| lazy.rt_ms_to_cycle_index(rt), + ) else { + return; // No RT intersection, early exit + }; + + // Convert MS1CycleIndex ranges to u32 for lazy API + let cycle_range_u32 = match ranges.ms1_cycle_range { + Restricted(x) => Restricted( + TupleRange::try_new(x.start().as_u32(), x.end().as_u32()).unwrap(), + ), + Unrestricted => Unrestricted, + }; + + // Query MS1 precursors + aggregator + .iter_mut_precursors() + .for_each(|((_idx, mz), mut chr)| { + let mz_range = tolerance.mz_range_f32(*mz as f32); + lazy.query_peaks_ms1(mz_range, cycle_range_u32, ranges.im_range) + .for_each(|peak| { + chr.add_at_index(peak.cycle_index.as_u32(), peak.intensity) + }); + }); + + // Query MS2 fragments + // LazyIndexedTimstofPeaks::query_peaks_ms2 already handles precursor filtering + aggregator + .iter_mut_fragments() + .for_each(|((_idx, mz), mut chr)| { + let mz_range = tolerance.mz_range_f32(*mz as f32); + let results = lazy.query_peaks_ms2( + ranges.quad_range, + mz_range, + cycle_range_u32, + ranges.im_range, + ); + + // Iterate through all matching window groups and their peaks + for (_isolation_scheme, peaks) in results { + for peak in peaks { + chr.add_at_index(peak.cycle_index.as_u32(), peak.intensity); + } + } + }); + } + } + } +} diff --git a/rust/timsquery/src/models/lazy.rs b/rust/timsquery/src/models/lazy.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/rust/timsquery/src/models/lazy.rs @@ -0,0 +1 @@ + diff --git a/rust/timsquery/src/models/mod.rs b/rust/timsquery/src/models/mod.rs index 8e4c63a..27208d5 100644 --- a/rust/timsquery/src/models/mod.rs +++ b/rust/timsquery/src/models/mod.rs @@ -2,6 +2,7 @@ pub mod aggregators; pub mod base; pub mod elution_group; pub mod indexed_data; +mod lazy; pub mod tolerance; pub use crate::traits::PeakAddable; diff --git a/rust/timsquery/src/models/tolerance.rs b/rust/timsquery/src/models/tolerance.rs index b731145..f7cdb69 100644 --- a/rust/timsquery/src/models/tolerance.rs +++ b/rust/timsquery/src/models/tolerance.rs @@ -34,7 +34,7 @@ use timscentroid::utils::{ /// in terms of positive values. For instance, here a tolerance of (1,1) on a value /// of 10 means a range of (9,11) while in some software the same range would be defined /// as (-1,1). 
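+///
+/// As a worked example under that convention, `MzTolerance::Ppm((20.0, 20.0))` applied to an
+/// m/z of 1000.0 corresponds to the absolute range (999.98, 1000.02).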
-#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Tolerance { pub ms: MzTolerance, #[serde(default)] @@ -43,36 +43,38 @@ pub struct Tolerance { pub quad: QuadTolerance, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum MzTolerance { - #[serde(rename = "da")] + #[serde(alias = "da")] Absolute((f64, f64)), - #[serde(rename = "ppm")] + #[serde(alias = "ppm")] Ppm((f64, f64)), } -#[derive(Debug, Clone, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] pub enum RtTolerance { - #[serde(rename = "minutes")] + #[serde(alias = "minutes")] Minutes((f32, f32)), - #[serde(rename = "percent")] + #[serde(alias = "percent")] Pct((f32, f32)), #[default] Unrestricted, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum MobilityTolerance { - #[serde(rename = "absolute")] + #[serde(alias = "absolute")] Absolute((f32, f32)), - #[serde(rename = "percent")] + #[serde(alias = "percent")] + #[serde(alias = "pct")] Pct((f32, f32)), Unrestricted, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum QuadTolerance { - #[serde(rename = "absolute")] + #[serde(alias = "absolute")] + #[serde(alias = "da")] Absolute((f32, f32)), } diff --git a/rust/timsquery/src/serde/chromatogram_output.rs b/rust/timsquery/src/serde/chromatogram_output.rs index ff27f16..03c8609 100644 --- a/rust/timsquery/src/serde/chromatogram_output.rs +++ b/rust/timsquery/src/serde/chromatogram_output.rs @@ -2,6 +2,11 @@ use crate::{ ChromatogramCollector, KeyLike, }; +use timscentroid::rt_mapping::{ + CycleToRTMapping, + MS1CycleIndex, + RTIndex, // Trait needed for the index method. +}; /// Represents the output format for an aggregated chromatogram /// It is pretty ugly performance-wise but I am keeping it as is because @@ -23,9 +28,15 @@ pub struct ChromatogramOutput { } impl ChromatogramOutput { + /// Create a new ChromatogramOutput from a ChromatogramCollector + /// and a reference CycleToRTMapping + /// + /// In theory the original collector does not need to be mutable, + /// but I have not implemented a nice way to iterate over + /// the internal chromatograms without mutability yet. 
pub fn try_new( - mut collector: ChromatogramCollector, - reference_cycles: &[u32], + collector: &mut ChromatogramCollector, + reference_cycles: &CycleToRTMapping, ) -> Result { let mut local_non_zero_min_idx = collector.num_cycles(); let mut local_non_zero_max_idx = 0usize; @@ -71,7 +82,7 @@ impl ChromatogramOutput { let cycle_offset = collector.cycle_offset(); let (precursor_mzs, precursor_intensities): (Vec, Vec>) = collector .iter_mut_precursors() - .filter_map(|(&(idx, mz), cmg)| { + .filter_map(|(&(_idx, mz), cmg)| { let out_vec = cmg .try_get_slice( local_non_zero_min_idx + cycle_offset, @@ -117,7 +128,12 @@ impl ChromatogramOutput { precursor_intensities, fragment_intensities, fragment_labels, - retention_time_results_seconds: reference_cycles[non_zero_min_idx..=non_zero_max_idx] + retention_time_results_seconds: reference_cycles + .get_inclusive_slice( + MS1CycleIndex::new(non_zero_min_idx as u32) + ..=MS1CycleIndex::new(non_zero_max_idx as u32), + ) + .unwrap() .iter() .map(|&x| x as f32 / 1000.0) .collect(), diff --git a/rust/timsquery/src/serde/diann_io.rs b/rust/timsquery/src/serde/diann_io.rs index e5b8343..5c18cb0 100644 --- a/rust/timsquery/src/serde/diann_io.rs +++ b/rust/timsquery/src/serde/diann_io.rs @@ -8,46 +8,50 @@ use std::path::Path; use tinyvec::tiny_vec; use tracing::{ debug, + error, info, warn, }; #[derive(Debug)] pub enum DiannReadingError { - IoError(std::io::Error), - CsvError(csv::Error), - DiannPrecursorParsingError(DiannPrecursorParsingError), - UnableToParseElutionGroups, + IoError, + CsvError, + DiannPrecursorParsingError, } #[derive(Debug)] pub enum DiannPrecursorParsingError { - IonParsingError(IonParsingError), - IonAnnotParseError(String), - Other(String), + IonParsingError, + IonOverCapacity, + EmptyIonString, + Other, } impl From for DiannPrecursorParsingError { fn from(err: IonParsingError) -> Self { - DiannPrecursorParsingError::IonParsingError(err) + error!("Ion parsing error: {:?}", err); + DiannPrecursorParsingError::IonParsingError } } impl From for DiannReadingError { - fn from(err: DiannPrecursorParsingError) -> Self { - DiannReadingError::DiannPrecursorParsingError(err) + fn from(_err: DiannPrecursorParsingError) -> Self { + DiannReadingError::DiannPrecursorParsingError } } impl From for DiannReadingError { fn from(err: csv::Error) -> Self { - DiannReadingError::CsvError(err) + error!("CSV reading error: {:?}", err); + DiannReadingError::CsvError } } impl From for DiannReadingError { fn from(err: std::io::Error) -> Self { - DiannReadingError::IoError(err) + error!("IO error: {:?}", err); + DiannReadingError::IoError } } @@ -66,6 +70,7 @@ struct DiannLibraryRow { #[serde(rename = "ModifiedPeptide")] modified_peptide: String, #[serde(rename = "StrippedPeptide")] + #[serde(alias = "PeptideSequence")] stripped_peptide: String, #[serde(rename = "PrecursorMz")] precursor_mz: f64, @@ -76,20 +81,25 @@ struct DiannLibraryRow { #[serde(rename = "IonMobility")] ion_mobility: f64, #[serde(rename = "ProteinID")] + #[serde(alias = "ProteinGroup")] protein_id: String, #[serde(rename = "Decoy")] + #[serde(alias = "decoy")] decoy: i32, #[serde(rename = "FragmentMz")] + #[serde(alias = "ProductMz")] fragment_mz: f64, #[serde(rename = "FragmentType")] fragment_type: String, #[serde(rename = "FragmentNumber")] + #[serde(alias = "FragmentSeriesNumber")] fragment_number: i32, #[serde(rename = "FragmentCharge")] fragment_charge: i32, #[serde(rename = "FragmentLossType")] fragment_loss_type: String, #[serde(rename = "RelativeIntensity")] + #[serde(alias = 
"LibraryIntensity")] relative_intensity: f32, } @@ -107,25 +117,6 @@ impl DiannLibraryRow { } } -fn required_columns() -> Vec<&'static str> { - vec![ - "ModifiedPeptide", - "StrippedPeptide", - "PrecursorMz", - "PrecursorCharge", - "Tr_recalibrated", - "IonMobility", - "ProteinID", - "Decoy", - "FragmentMz", - "FragmentType", - "FragmentNumber", - "FragmentCharge", - "FragmentLossType", - "RelativeIntensity", - ] -} - pub fn sniff_diann_library_file>(file: T) -> bool { let file_result = std::fs::File::open(file.as_ref()); let file = match file_result { @@ -149,12 +140,29 @@ pub fn sniff_diann_library_file>(file: T) -> bool { }; let columns: Vec = headers.iter().map(|s| s.to_string()).collect(); - let required = required_columns(); - // Check if all required columns are present - required + // Define required columns with their aliases + let required_with_aliases = vec![ + vec!["ModifiedPeptide"], + vec!["StrippedPeptide", "PeptideSequence"], + vec!["PrecursorMz"], + vec!["PrecursorCharge"], + vec!["Tr_recalibrated"], + vec!["IonMobility"], + vec!["ProteinID", "ProteinGroup"], + vec!["Decoy", "decoy"], + vec!["FragmentMz", "ProductMz"], + vec!["FragmentType"], + vec!["FragmentNumber", "FragmentSeriesNumber"], + vec!["FragmentCharge"], + vec!["FragmentLossType"], + vec!["RelativeIntensity", "LibraryIntensity"], + ]; + + // Check if all required columns (or their aliases) are present + required_with_aliases .iter() - .all(|col| columns.contains(&col.to_string())) + .all(|aliases| aliases.iter().any(|col| columns.contains(&col.to_string()))) } struct ParsingBuffers { @@ -216,9 +224,8 @@ fn parse_precursor_group( buffers: &mut ParsingBuffers, ) -> Result<(TimsElutionGroup, DiannPrecursorExtras), DiannPrecursorParsingError> { if rows.is_empty() { - return Err(DiannPrecursorParsingError::Other( - "Empty precursor group".to_string(), - )); + error!("Empty precursor group encountered on {id}"); + return Err(DiannPrecursorParsingError::Other); } // All rows in this group share the same precursor info, so take first row @@ -234,10 +241,8 @@ fn parse_precursor_group( .precursor_charge .try_into() .map_err(|e: std::num::TryFromIntError| { - DiannPrecursorParsingError::Other(format!( - "Failed to convert PrecursorCharge to u8: {:?}", - e - )) + error!("Failed to convert PrecursorCharge to u8: {:?}", e); + DiannPrecursorParsingError::IonOverCapacity })?; // Extract fragment information from all rows @@ -252,10 +257,8 @@ fn parse_precursor_group( row.fragment_charge .try_into() .map_err(|e: std::num::TryFromIntError| { - DiannPrecursorParsingError::Other(format!( - "Failed to convert FragmentCharge to u8: {:?}", - e - )) + error!("Failed to convert FragmentCharge to u8: {:?}", e); + DiannPrecursorParsingError::IonOverCapacity })?; let rel_intensity = row.relative_intensity; @@ -274,16 +277,20 @@ fn parse_precursor_group( continue; } - let frag_char = - row.fragment_type.chars().next().ok_or_else(|| { - DiannPrecursorParsingError::Other("Empty fragment type".to_string()) - })?; + let frag_char = row.fragment_type.chars().next().ok_or_else(|| { + error!( + "Empty FragmentType at row {}; cannot parse ion annotation", + i + ); + DiannPrecursorParsingError::EmptyIonString + })?; let frag_num = row.fragment_number.try_into().map_err(|_| { - DiannPrecursorParsingError::Other(format!( - "Invalid fragment number: {}", + error!( + "Invalid fragment number (I expect all of then < 255): {}", row.fragment_number - )) + ); + DiannPrecursorParsingError::IonOverCapacity })?; let ion_annot = IonAnnot::try_new(frag_char, 
Some(frag_num), frag_charge as i8, 0)?; @@ -297,10 +304,8 @@ fn parse_precursor_group( 0 => false, 1 => true, other => { - return Err(DiannPrecursorParsingError::Other(format!( - "Unexpected Decoy value: {}", - other - ))); + error!("Unexpected Decoy value: {}", other); + return Err(DiannPrecursorParsingError::Other); } }; @@ -444,4 +449,23 @@ mod tests { let _elution_groups = read_library_file(file_path).expect("Failed to read library"); // TODO ... implement the actual assertions } + + #[test] + fn test_speclib2_tsv_parsing() { + let manifest_dir = env!("CARGO_MANIFEST_DIR"); + let file_path = PathBuf::from(manifest_dir) + .join("tests") + .join("diann_io_files") + .join("sample_lib.tsv"); + + // Check that sniff detects it correctly + let is_diann = sniff_diann_library_file(&file_path); + assert!( + is_diann, + "speclib2 TSV file should be detected as DIA-NN library" + ); + + // This test mainly checks that the name aliases wotk correctly + let _elution_groups = read_library_file(file_path).expect("Failed to read library"); + } } diff --git a/rust/timsquery/src/serde/index_serde.rs b/rust/timsquery/src/serde/index_serde.rs index 7fd4bf3..7df157f 100644 --- a/rust/timsquery/src/serde/index_serde.rs +++ b/rust/timsquery/src/serde/index_serde.rs @@ -1,53 +1,434 @@ +//! Smart index loading with automatic format detection and cloud support +//! +//! This module provides a unified API for loading timsTOF data from any source: +//! local .d files, cached .idx directories, or cloud storage (S3, GCS, Azure). +//! +//! # Quick Start +//! +//! ## Automatic Loading (Detects Format) +//! +//! ```no_run +//! use timsquery::serde::load_index_auto; +//! +//! // Works with any input - auto-detects format and location +//! let index = load_index_auto("data.d", None)?.into_eager()?; // Local raw +//! let index = load_index_auto("data.d.idx", None)?.into_eager()?; // Local cached +//! let index = load_index_auto("s3://bucket/exp.d", None)?.into_eager()?; // Cloud raw +//! let index = load_index_auto("s3://bucket/exp.idx", None)?.into_eager()?; // Cloud cached +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Lazy Loading (For Large Datasets) +//! +//! ```no_run +//! use timsquery::serde::{load_index_auto, IndexLoadConfig}; +//! +//! // Prefer lazy loading when possible (faster initialization, less memory) +//! let config = IndexLoadConfig { +//! prefer_lazy: true, +//! ..Default::default() +//! }; +//! +//! let handle = load_index_auto("experiment.d.idx", Some(config))?; +//! +//! // Query directly on lazy handle (loads data on-demand) +//! if let Some(lazy) = handle.as_lazy() { +//! let peaks = lazy.query_peaks_ms1(mz_range, rt_range, im_range); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Custom Cache Configuration +//! +//! ```no_run +//! use timsquery::serde::{load_index_auto, IndexLoadConfig, CacheLocation}; +//! +//! let config = IndexLoadConfig { +//! cache_location: CacheLocation::Url("s3://my-bucket/cache/".to_string()), +//! ..Default::default() +//! }; +//! +//! // Process raw data and cache to S3 +//! let index = load_index_auto("data.d", Some(config))?.into_eager()?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! # Cache Workflow +//! +//! 1. **First run**: Reads raw .d file, builds index (slow), saves to cache +//! 2. **Subsequent runs**: Loads from cache (fast), skips raw data processing +//! +//! # Configuration +//! +//! ```no_run +//! use timsquery::serde::TimsIndexReader; +//! use timsquery::CentroidingConfig; +//! use timscentroid::serialization::SerializationConfig; +//! 
use parquet::basic::{Compression, ZstdLevel}; +//! +//! let index = TimsIndexReader::new() +//! .with_auto_cache() +//! .with_centroiding_config(CentroidingConfig { +//! max_peaks: 50_000, +//! mz_ppm_tol: 10.0, +//! im_pct_tol: 5.0, +//! early_stop_iterations: 200, +//! }) +//! .with_serialization_config(SerializationConfig { +//! row_group_size: 100_000, +//! compression: Compression::ZSTD(ZstdLevel::try_new(3)?), +//! }) +//! .read_index("data.d")?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! # Use Cases +//! +//! **Team sharing via S3:** +//! ```no_run +//! use timsquery::serde::TimsIndexReader; +//! +//! // First team member: builds and caches to S3 +//! let index = TimsIndexReader::new() +//! .with_cloud_cache("s3://team-bucket/indexes/exp001/") +//! .read_index("/local/exp001.d")?; +//! +//! // Other team members: load from S3 cache (no raw data needed!) +//! let index = TimsIndexReader::from_cache_url( +//! "s3://team-bucket/indexes/exp001/" +//! )?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! **CI/CD pipelines:** +//! ```no_run +//! use timsquery::serde::TimsIndexReader; +//! +//! // Build step: process raw data, cache result +//! let index = TimsIndexReader::new() +//! .with_cloud_cache("s3://pipeline-artifacts/indexes/build-123/") +//! .read_index("raw_data.d")?; +//! +//! // Deploy step: load pre-built index +//! let index = TimsIndexReader::from_cache_url( +//! "s3://pipeline-artifacts/indexes/build-123/" +//! )?; +//! # Ok::<(), Box>(()) +//! ``` + use crate::{ CentroidingConfig, IndexedTimstofPeaks, TimsTofPath, }; -use std::path::Path; +use std::path::{ + Path, + PathBuf, +}; +use timscentroid::StorageLocation; +use timscentroid::lazy::LazyIndexedTimstofPeaks; use timscentroid::serialization::SerializationConfig; use tracing::{ error, info, }; -fn maybe_cache_load_index(index_cache_loc: impl AsRef) -> Option { - info!( - "Attempting to load index from cache at {:?}", - index_cache_loc.as_ref() - ); - match IndexedTimstofPeaks::load_from_directory(index_cache_loc.as_ref()) { - Ok(idx) => { - info!("Loaded index from cache at {:?}", index_cache_loc.as_ref()); - Some(idx) +/// Handle to indexed peaks - can be lazy or materialized (eager) +/// +/// This enum allows applications to work with either lazy-loaded or fully materialized +/// index data. Lazy loading is faster to initialize and uses less memory, while eager +/// loading provides faster queries and is required for some operations. +#[derive(Debug, Clone)] +pub enum IndexedPeaksHandle { + /// Lazy loading - queries fetch data on-demand from parquet files + Lazy(LazyIndexedTimstofPeaks), + /// Eager/materialized - all data loaded in memory + Eager(IndexedTimstofPeaks), +} + +impl IndexedPeaksHandle { + /// Materialize to eager if needed (no-op if already eager) + /// + /// This loads all parquet data into memory if the handle is lazy. 
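+    ///
+    /// Note: materializing a `Lazy` handle is not implemented yet and currently returns an
+    /// error, so prefer eager loading (the default) when you need an `IndexedTimstofPeaks`.
+    /// A minimal sketch, assuming a cached index exists at `data.d.idx`:
+    ///
+    /// ```no_run
+    /// use timsquery::serde::load_index_auto;
+    ///
+    /// // The default config loads cached indexes eagerly, so `into_eager` is a no-op here.
+    /// let index = load_index_auto("data.d.idx", None)?.into_eager()?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```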
+ pub fn into_eager(self) -> Result { + match self { + Self::Eager(peaks) => Ok(peaks), + Self::Lazy(_lazy) => { + // Materialize: load all data from parquet files into memory + // We need to get the storage location from lazy and reload as eager + info!("Materializing lazy index to eager (loading all data into memory)"); + + // TODO: Implement lazy -> eager materialization + // The lazy type needs to expose its storage location so we can reload as eager + Err(crate::errors::DataReadingError::UnsupportedDataError( + crate::errors::UnsupportedDataError::NoMS2DataError, + )) + } } - Err(e) => { - error!( - "Failed to load index from cache at {:?}: {:?}", - index_cache_loc.as_ref(), - e - ); - None + } + + /// Try to convert to lazy loading + /// + /// If already lazy, returns self unchanged. If eager, saves to cache and reloads as lazy. + /// Returns error if caching is disabled or save/load fails. + /// + /// # Arguments + /// * `cache_location` - Where to save the cache (must not be Disabled) + /// * `serialization_config` - Configuration for saving (if needed) + /// + /// # Example + /// ```no_run + /// use timsquery::serde::{load_index_auto, CacheLocation}; + /// use timscentroid::serialization::SerializationConfig; + /// + /// let handle = load_index_auto("data.d", None)?; + /// let lazy = handle.try_into_lazy( + /// CacheLocation::Local("/tmp/cache".into()), + /// SerializationConfig::default() + /// )?; + /// # Ok::<(), Box>(()) + /// ``` + pub fn try_into_lazy( + self, + cache_location: CacheLocation, + serialization_config: SerializationConfig, + ) -> Result { + match self { + Self::Lazy(lazy) => Ok(lazy), + Self::Eager(eager) => { + if matches!(cache_location, CacheLocation::Disabled) { + return Err(crate::errors::DataReadingError::UnsupportedDataError( + crate::errors::UnsupportedDataError::NoMS2DataError, + )); + } + + let storage_location = match cache_location.to_storage_location() { + Some(Ok(loc)) => loc, + Some(Err(_)) | None => { + return Err(crate::errors::DataReadingError::UnsupportedDataError( + crate::errors::UnsupportedDataError::NoMS2DataError, + )); + } + }; + + info!("Saving eager index to cache for lazy loading"); + eager + .save_to_storage(storage_location.clone(), serialization_config) + .map_err(crate::errors::DataReadingError::SerializationError)?; + + info!("Loading saved index as lazy"); + LazyIndexedTimstofPeaks::load_from_storage(storage_location) + .map_err(crate::errors::DataReadingError::SerializationError) + } + } + } + + /// Check if this handle is lazy + pub fn is_lazy(&self) -> bool { + matches!(self, Self::Lazy(_)) + } + + /// Check if this handle is eager + pub fn is_eager(&self) -> bool { + matches!(self, Self::Eager(_)) + } + + pub fn ms1_cycle_mapping( + &self, + ) -> &timscentroid::rt_mapping::CycleToRTMapping { + match self { + Self::Lazy(lazy) => &lazy.ms1_metadata().cycle_to_rt_ms, + Self::Eager(eager) => eager.ms1_cycle_mapping(), + } + } +} + +/// Configuration for smart index loading +#[derive(Debug, Clone)] +pub struct IndexLoadConfig { + /// Cache location (Auto, Local, Url, Disabled) + pub cache_location: CacheLocation, + + /// Centroiding configuration (only used when loading raw .d files) + pub centroiding_config: Option, + + /// Serialization configuration + pub serialization_config: SerializationConfig, + + /// Prefer lazy loading when possible (default: false) + /// + /// When true, cached indexes will be loaded lazily for faster initialization + /// and lower memory usage. Raw .d files still require eager loading. 
+ pub prefer_lazy: bool, + + /// Allow writing cache if missing (default: true) + pub write_missing_cache: bool, +} + +impl Default for IndexLoadConfig { + fn default() -> Self { + Self { + cache_location: CacheLocation::Auto, + centroiding_config: None, + serialization_config: SerializationConfig::default(), + prefer_lazy: false, + write_missing_cache: true, } } } -/// Builder for loading timsTOF indices with caching options +/// Cache location for indexed peaks +/// +/// # Storage Layout +/// +/// Cached indexes are stored as a directory containing: +/// - `metadata.json` - Index metadata (version, timestamps, file mappings) +/// - `ms1.parquet` - MS1 peaks +/// - `ms2/group_0.parquet`, `ms2/group_1.parquet`, ... - MS2 window groups +/// +/// ## Examples +/// +/// **Auto:** Derives location from input path +/// ```text +/// Input: /data/experiment.d +/// Cache: /data/experiment.d.idx/ +/// ├── metadata.json +/// ├── ms1.parquet +/// └── ms2/ +/// ├── group_0.parquet +/// └── group_1.parquet +/// ``` +/// +/// **Local:** Explicit local directory +/// ```text +/// CacheLocation::Local("/cache/my_index") +/// Cache: /cache/my_index/ +/// ├── metadata.json +/// ├── ms1.parquet +/// └── ms2/... +/// ``` +/// +/// **Url:** Cloud storage URL (full path including .idx directory name) +/// ```text +/// CacheLocation::Url("s3://bucket/experiments/exp001.idx") +/// Cache: s3://bucket/experiments/exp001.idx/ +/// ├── metadata.json +/// ├── ms1.parquet +/// └── ms2/... +/// ``` +/// +/// **Note:** Currently cache paths are exact - relative path preservation +/// (e.g., `/local/parent/data.d` → `s3://bucket/parent/data.d.idx`) is not yet +/// implemented but would be valuable for large dataset workflows. +#[derive(Debug, Clone)] +pub enum CacheLocation { + /// Derive cache location from input path (adds .idx suffix) + /// + /// Example: "data.d" → "data.d.idx" + Auto, + + /// Explicit local filesystem directory path + /// + /// This is the full path to the .idx directory (not the parent). + Local(PathBuf), + + /// Cloud storage URL + /// + /// Must be a String (not PathBuf) because it contains URL scheme and authority: + /// - "s3://bucket/path.idx" (scheme: s3, authority: bucket) + /// - "gs://bucket/path.idx" (scheme: gs, authority: bucket) + /// - "az://container/path.idx" (scheme: az, authority: container) + /// + /// PathBuf is for filesystem paths only and doesn't support URLs. + Url(String), + + /// No caching - process raw data every time + Disabled, +} + +impl CacheLocation { + /// Convert to StorageLocation, returning None for Disabled or Auto + fn to_storage_location(&self) -> Option> { + match self { + Self::Local(path) => Some(Ok(StorageLocation::from_path(path))), + Self::Url(url) => Some( + StorageLocation::from_url(url).map_err(|e| format!("Invalid cache URL: {}", e)), + ), + Self::Auto | Self::Disabled => None, + } + } + + /// Derive auto cache location from input path + /// + /// Appends `.idx` to the filename (not replacing extension). 
+ /// Example: "data.d" → "data.d.idx" + fn derive_auto_location(input_path: &Path) -> PathBuf { + // Can't use with_extension("idx") because that would replace ".d" with ".idx" + // We want to append, so "data.d" becomes "data.d.idx" not "data.idx" + let mut index_location = input_path.to_path_buf(); + let current_name = index_location + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or(""); + index_location.set_file_name(format!("{}.idx", current_name)); + index_location + } +} + +/// Builder for loading timsTOF indices with cloud storage and caching support pub struct TimsIndexReader { + cache_location: CacheLocation, write_missing_cache: bool, centroiding_config: Option, serialization_config: SerializationConfig, } impl TimsIndexReader { - /// Create a new index reader with default settings + /// Create a new index reader with automatic caching enabled (default) + /// + /// Cache location is automatically derived by appending `.idx` to the input path. + /// Example: "data.d" → "data.d.idx" + /// + /// Use `without_cache()` if you don't want any caching. pub fn new() -> Self { Self { + cache_location: CacheLocation::Auto, write_missing_cache: true, centroiding_config: None, serialization_config: SerializationConfig::default(), } } + /// Create a new index reader with caching disabled + /// + /// Raw data will be processed every time without saving/loading cache. + pub fn new_without_cache() -> Self { + Self { + cache_location: CacheLocation::Disabled, + write_missing_cache: false, + centroiding_config: None, + serialization_config: SerializationConfig::default(), + } + } + + /// Set a custom local cache directory + pub fn with_local_cache(mut self, path: impl Into) -> Self { + self.cache_location = CacheLocation::Local(path.into()); + self + } + + /// Set a cloud storage cache location (s3://, gs://, az://) + pub fn with_cloud_cache(mut self, url: impl Into) -> Self { + self.cache_location = CacheLocation::Url(url.into()); + self + } + + /// Set custom cache location + pub fn with_cache_location(mut self, location: CacheLocation) -> Self { + self.cache_location = location; + self + } + /// Set whether to write cache when it's missing (default: true) pub fn with_write_missing_cache(mut self, write: bool) -> Self { self.write_missing_cache = write; @@ -66,6 +447,57 @@ impl TimsIndexReader { self } + /// Load directly from a cached index (bypasses centroiding, just loads pre-built index) + /// + /// This is useful when you have a pre-cached index and don't need the original .d file. + /// + /// # Examples + /// + /// ```no_run + /// use timsquery::serde::TimsIndexReader; + /// use timscentroid::StorageLocation; + /// + /// // Load from local cache + /// let index = TimsIndexReader::from_cache( + /// StorageLocation::from_path("/path/to/experiment.d.idx") + /// )?; + /// + /// // Load from S3 cache + /// let index = TimsIndexReader::from_cache( + /// StorageLocation::from_url("s3://bucket/cache/experiment.idx")? 
+ /// )?; + /// # Ok::<(), Box>(()) + /// ``` + pub fn from_cache( + location: StorageLocation, + ) -> Result { + IndexedTimstofPeaks::load_from_storage(location) + .map_err(crate::errors::DataReadingError::SerializationError) + } + + /// Load directly from a cached index at a local path + /// + /// Convenience wrapper for `from_cache(StorageLocation::from_path(...))` + pub fn from_cache_path( + path: impl AsRef, + ) -> Result { + Self::from_cache(StorageLocation::from_path(path)) + } + + /// Load directly from a cached index at a cloud URL + /// + /// Convenience wrapper for `from_cache(StorageLocation::from_url(...))` + pub fn from_cache_url( + url: impl AsRef, + ) -> Result { + let location = StorageLocation::from_url(url.as_ref()).map_err(|e| { + crate::errors::DataReadingError::SerializationError( + timscentroid::serialization::SerializationError::UrlParse(e), + ) + })?; + Self::from_cache(location) + } + /// Load timsTOF index with configured caching behavior /// /// # Arguments @@ -79,6 +511,7 @@ impl TimsIndexReader { file_location: impl AsRef, ) -> Result { let st = std::time::Instant::now(); + let timstofpath = match TimsTofPath::new(file_location.as_ref()) { Ok(x) => x, Err(e) => { @@ -86,23 +519,37 @@ impl TimsIndexReader { } }; - // Create cache directory path by appending .idx to the .d directory - let mut index_location = file_location.as_ref().to_path_buf(); - let new_name = format!( - "{}.idx", - index_location.file_name().unwrap().to_str().unwrap() - ); - index_location.set_file_name(new_name); + // Determine cache location and handle errors early + let cache_storage_location: Option = match &self.cache_location { + CacheLocation::Auto => { + let path = CacheLocation::derive_auto_location(file_location.as_ref()); + Some(StorageLocation::from_path(path)) + } + other => match other.to_storage_location() { + Some(Ok(loc)) => Some(loc), + Some(Err(e)) => { + error!("Invalid cache location: {}", e); + None + } + None => None, + }, + }; - let out = if let Some(idx) = maybe_cache_load_index(&index_location) { - Ok(idx) + // Try to load from cache + let out = if let Some(storage_loc) = &cache_storage_location { + match self.try_load_from_cache(storage_loc) { + Some(idx) => Ok(idx), + None => { + let cache_loc = if self.write_missing_cache { + cache_storage_location + } else { + None + }; + Ok(self.uncached_load_index(&timstofpath, cache_loc)) + } + } } else { - let cache_loc = if self.write_missing_cache { - Some(index_location) - } else { - None - }; - Ok(self.uncached_load_index(&timstofpath, &cache_loc)) + Ok(self.uncached_load_index(&timstofpath, None)) }; let et = st.elapsed(); @@ -110,10 +557,36 @@ impl TimsIndexReader { out } + fn try_load_from_cache( + &self, + storage_location: &StorageLocation, + ) -> Option { + let location_desc = match storage_location { + StorageLocation::Local(p) => format!("{:?}", p), + StorageLocation::Url(u) => u.to_string(), + }; + + info!("Attempting to load index from cache at {}", location_desc); + + match IndexedTimstofPeaks::load_from_storage(storage_location.clone()) { + Ok(idx) => { + info!("Loaded index from cache at {}", location_desc); + Some(idx) + } + Err(e) => { + error!( + "Failed to load index from cache at {}: {:?}", + location_desc, e + ); + None + } + } + } + fn uncached_load_index( &self, timstofpath: &TimsTofPath, - cache_loc: &Option, + cache_loc: Option, ) -> IndexedTimstofPeaks { let centroiding_config = self.centroiding_config.unwrap_or(CentroidingConfig { max_peaks: 50_000, @@ -121,22 +594,30 @@ impl TimsIndexReader { 
im_pct_tol: 5.0, early_stop_iterations: 200, }); + info!("Using centroiding config: {:#?}", centroiding_config); - info!("Starting centroiging + load of the raw data (might take a min)"); + info!("Starting centroiding + load of the raw data (might take a min)"); + let (index, build_stats) = IndexedTimstofPeaks::from_timstof_file(timstofpath, centroiding_config); + info!("Index built with stats: {}", build_stats); // Save to cache - if let Some(idx_path) = cache_loc { - info!("Saving index to cache at {:?}", idx_path); - if let Err(e) = index.save_to_directory_with_config(idx_path, self.serialization_config) - { - error!("Failed to save index to cache: {:?}", e); - } else { - info!("Saved index to cache"); + if let Some(storage_loc) = cache_loc { + let location_desc = match &storage_loc { + StorageLocation::Local(p) => format!("{:?}", p), + StorageLocation::Url(u) => u.to_string(), + }; + + info!("Saving index to cache at {}", location_desc); + + match index.save_to_storage(storage_loc, self.serialization_config) { + Ok(_) => info!("Saved index to cache"), + Err(e) => error!("Failed to save index to cache: {:?}", e), } } + index } } @@ -147,9 +628,201 @@ impl Default for TimsIndexReader { } } -/// Convenience function for loading index with default caching behavior -pub fn load_index_caching( - file_location: impl AsRef, -) -> Result { - TimsIndexReader::new().read_index(file_location) +/// Check if a location contains a cached index by looking for metadata.json +/// +/// This checks the actual directory contents rather than just the path extension. +/// +/// Returns Ok(true) if cached index exists, Ok(false) if not found, +/// Err for permission/auth errors that should be propagated to the user. +fn sniff_cached_index(location: &str) -> Result { + let is_cloud = location.contains("://"); + + // Try to create storage location and check for metadata.json + let storage_result = if is_cloud { + StorageLocation::from_url(location) + } else { + Ok(StorageLocation::from_path(location)) + }; + + let storage_location = match storage_result { + Ok(loc) => loc, + Err(e) => { + error!("Failed to parse storage location for sniffing: {:?}", e); + return Ok(false); // Treat parse errors as "not cached" + } + }; + + // Try to read metadata.json as a quick check + match timscentroid::storage::StorageProvider::new(storage_location) { + Ok(provider) => { + // Just try to read a few bytes - if metadata.json exists, it's likely a cached index + match provider.read_bytes("metadata.json") { + Ok(_) => Ok(true), + Err(e) => { + // Check if it's a permission error - propagate it! + if let timscentroid::serialization::SerializationError::Io(io_err) = &e + && io_err.kind() == std::io::ErrorKind::PermissionDenied + { + error!("Permission denied while checking for cached index: {:?}", e); + return Err(crate::errors::DataReadingError::SerializationError(e)); + } + // For other errors (like NotFound), treat as not cached + error!("metadata.json not found or unreadable: {:?}", e); + Ok(false) + } + } + } + Err(e) => { + error!("Failed to access storage location for sniffing: {:?}", e); + Ok(false) // Treat provider creation errors as "not cached" + } + } +} + +/// Smart index loader - auto-detects input type and loads appropriately +/// +/// This is the unified entry point for loading indexed peaks from any source. 
+/// It automatically detects: +/// - Local vs cloud storage (by checking for "://" in path) +/// - Raw .d files vs cached .idx files (by sniffing directory contents) +/// - Whether to load lazily or eagerly (based on config) +/// +/// # Format Detection +/// +/// The function intelligently detects the input format by: +/// 1. Checking for cached index: Looks for `metadata.json` file +/// 2. Checking for raw data: Looks for `analysis.tdf` and `analysis.tdf_bin` files +/// 3. Falling back to path-based heuristics for cloud URLs +/// +/// This is more robust than just checking file extensions, especially for +/// cloud URLs where paths may not follow standard naming conventions. +/// +/// # Arguments +/// +/// * `path_or_url` - Can be: +/// - "/path/to/experiment.d" - Local raw data +/// - "/path/to/experiment.d.idx" - Local cached index +/// - "s3://bucket/experiment.d" - Cloud raw data +/// - "s3://bucket/experiment.idx" - Cloud cached index +/// - Any directory containing the appropriate files (extension-agnostic) +/// * `config` - Optional configuration (uses defaults if None) +/// +/// # Returns +/// +/// `IndexedPeaksHandle` - Either lazy or eager depending on config and input type +/// +/// # Examples +/// +/// ```no_run +/// use timsquery::serde::load_index_auto; +/// +/// // Simple usage - auto-detects everything +/// let index = load_index_auto("data.d", None)?.into_eager()?; +/// +/// // Load from cloud (works without .idx extension if metadata.json exists) +/// let index = load_index_auto("s3://bucket/my_experiment", None)?.into_eager()?; +/// +/// // Prefer lazy loading +/// use timsquery::serde::IndexLoadConfig; +/// let config = IndexLoadConfig { prefer_lazy: true, ..Default::default() }; +/// let handle = load_index_auto("data.d.idx", Some(config))?; +/// # Ok::<(), Box>(()) +/// ``` +pub fn load_index_auto( + path_or_url: impl AsRef, + config: Option, +) -> Result { + let input = path_or_url.as_ref(); + let config = config.unwrap_or_default(); + + info!("Loading index from: {}", input); + + // Detect input type by actually checking directory contents + // This is more robust than just looking at file extensions + // Propagate permission errors to the user + let is_cached = sniff_cached_index(input)?; + let is_cloud = input.contains("://"); + + info!( + "Detected: cached={}, cloud={}, prefer_lazy={}", + is_cached, is_cloud, config.prefer_lazy + ); + + // Early validation: reject cloud raw .d files with helpful error + if is_cloud && !is_cached { + error!( + "Attempted to load raw .d file from cloud storage: {}", + input + ); + return Err(crate::errors::DataReadingError::UnsupportedDataError( + crate::errors::UnsupportedDataError::CloudRawDataNotSupported { + url: input.to_string(), + suggestion: format!( + "Raw .d files must be processed locally first. Suggested workflow:\n\ + 1. Download the .d file locally\n\ + 2. Process it to create a cached index:\n\ + \n \ + use timsquery::serde::TimsIndexReader;\n \ + let index = TimsIndexReader::new()\n \ + .with_cloud_cache(\"{}\")\n \ + .read_index(\"/local/path/to/data.d\")?;\n\ + \n\ + 3. 
Then load from the cloud cache:\n \ + let index = TimsIndexReader::from_cache_url(\"{}\")?;\n\ + \n\ + Alternatively, use a local cache and upload it manually to your cloud storage.", + input.trim_end_matches(".d").to_string() + ".idx", + input.trim_end_matches(".d").to_string() + ".idx", + ), + }, + )); + } + + match (is_cached, config.prefer_lazy) { + (true, true) => { + // Cached index + prefer lazy = load lazy + info!("Loading as lazy (cached index)"); + let location = if is_cloud { + StorageLocation::from_url(input).map_err(|e| { + crate::errors::DataReadingError::SerializationError( + timscentroid::serialization::SerializationError::UrlParse(e), + ) + })? + } else { + StorageLocation::from_path(input) + }; + let lazy = LazyIndexedTimstofPeaks::load_from_storage(location) + .map_err(crate::errors::DataReadingError::SerializationError)?; + Ok(IndexedPeaksHandle::Lazy(lazy)) + } + (true, false) => { + // Cached index + prefer eager = load eager + info!("Loading as eager (cached index)"); + let eager = if is_cloud { + TimsIndexReader::from_cache_url(input)? + } else { + TimsIndexReader::from_cache_path(input)? + }; + Ok(IndexedPeaksHandle::Eager(eager)) + } + (false, _) => { + // Raw .d file - need to centroid/index (always eager) + info!("Loading as eager (raw .d file - requires centroiding)"); + + let mut reader = TimsIndexReader::new() + .with_cache_location(config.cache_location) + .with_write_missing_cache(config.write_missing_cache) + .with_serialization_config(config.serialization_config); + + if let Some(centroid_cfg) = config.centroiding_config { + reader = reader.with_centroiding_config(centroid_cfg); + } + + let eager = reader.read_index(input)?; + + // Note: Raw .d files always return eager, even if prefer_lazy is true + // (can't lazy-load raw data - it needs to be centroided first) + Ok(IndexedPeaksHandle::Eager(eager)) + } + } } diff --git a/rust/timsquery/src/serde/library_file.rs b/rust/timsquery/src/serde/library_file.rs index 9c5ef5a..9230e9b 100644 --- a/rust/timsquery/src/serde/library_file.rs +++ b/rust/timsquery/src/serde/library_file.rs @@ -1,3 +1,4 @@ +pub use super::diann_io::DiannPrecursorExtras; use super::diann_io::{ read_library_file as read_diann, sniff_diann_library_file, @@ -34,21 +35,26 @@ impl From for LibraryReadingError { } } +#[derive(Debug)] +pub enum FileReadingExtras { + Diann(Vec), +} + #[derive(Debug)] pub enum ElutionGroupCollection { - StringLabels(Vec>), - MzpafLabels(Vec>), - TinyIntLabels(Vec>), - IntLabels(Vec>), + StringLabels(Vec>, Option), + MzpafLabels(Vec>, Option), + TinyIntLabels(Vec>, Option), + IntLabels(Vec>, Option), } impl ElutionGroupCollection { pub fn len(&self) -> usize { match self { - ElutionGroupCollection::StringLabels(egs) => egs.len(), - ElutionGroupCollection::MzpafLabels(egs) => egs.len(), - ElutionGroupCollection::TinyIntLabels(egs) => egs.len(), - ElutionGroupCollection::IntLabels(egs) => egs.len(), + ElutionGroupCollection::StringLabels(egs, _) => egs.len(), + ElutionGroupCollection::MzpafLabels(egs, _) => egs.len(), + ElutionGroupCollection::TinyIntLabels(egs, _) => egs.len(), + ElutionGroupCollection::IntLabels(egs, _) => egs.len(), } } @@ -91,25 +97,25 @@ impl ElutionGroupCollection { .into_iter() .map( as TryInto>>::try_into) .collect(); - return Ok(ElutionGroupCollection::TinyIntLabels(out?)); + return Ok(ElutionGroupCollection::TinyIntLabels(out?, None)); } debug!("Attempting to deserialize elution group inputs with int labels"); if let Ok(eg_inputs) = serde_json::from_str::>>(content) { let out: Result>, 
ElutionGroupInputError> = eg_inputs.into_iter().map(|x| x.try_into()).collect(); - return Ok(ElutionGroupCollection::IntLabels(out?)); + return Ok(ElutionGroupCollection::IntLabels(out?, None)); } debug!("Attempting to deserialize elution group inputs with mzpaf labels"); if let Ok(eg_inputs) = serde_json::from_str::>>(content) { let out: Result>, ElutionGroupInputError> = eg_inputs.into_iter().map(|x| x.try_into()).collect(); - return Ok(ElutionGroupCollection::MzpafLabels(out?)); + return Ok(ElutionGroupCollection::MzpafLabels(out?, None)); } debug!("Attempting to deserialize elution group inputs with string labels"); if let Ok(eg_inputs) = serde_json::from_str::>>(content) { let out: Result>, ElutionGroupInputError> = eg_inputs.into_iter().map(|x| x.try_into()).collect(); - return Ok(ElutionGroupCollection::StringLabels(out?)); + return Ok(ElutionGroupCollection::StringLabels(out?, None)); } Err(LibraryReadingError::UnableToParseElutionGroups) } @@ -121,19 +127,19 @@ impl ElutionGroupCollection { debug!("Attempting direct deserialization of elution groups"); debug!("Attempting to deserialize elution groups with tiny int labels"); if let Ok(egs) = serde_json::from_str::>>(content) { - return Ok(ElutionGroupCollection::TinyIntLabels(egs)); + return Ok(ElutionGroupCollection::TinyIntLabels(egs, None)); } debug!("Attempting to deserialize elution groups with int labels"); if let Ok(egs) = serde_json::from_str::>>(content) { - return Ok(ElutionGroupCollection::IntLabels(egs)); + return Ok(ElutionGroupCollection::IntLabels(egs, None)); } debug!("Attempting to deserialize elution groups with mzpaf labels"); if let Ok(egs) = serde_json::from_str::>>(content) { - return Ok(ElutionGroupCollection::MzpafLabels(egs)); + return Ok(ElutionGroupCollection::MzpafLabels(egs, None)); } debug!("Attempting to deserialize elution groups with string labels"); if let Ok(egs) = serde_json::from_str::>>(content) { - return Ok(ElutionGroupCollection::StringLabels(egs)); + return Ok(ElutionGroupCollection::StringLabels(egs, None)); } Err(LibraryReadingError::UnableToParseElutionGroups) } @@ -149,9 +155,12 @@ impl ElutionGroupCollection { return Err(LibraryReadingError::UnableToParseElutionGroups); } }; - let out = egs.into_iter().map(|x| x.0).collect(); + let (egs, extras): (Vec<_>, Vec<_>) = egs.into_iter().unzip(); info!("Successfully read DIA-NN library file"); - return Ok(ElutionGroupCollection::MzpafLabels(out)); + return Ok(ElutionGroupCollection::MzpafLabels( + egs, + Some(FileReadingExtras::Diann(extras)), + )); } Err(LibraryReadingError::UnableToParseElutionGroups) } diff --git a/rust/timsquery/src/serde/mod.rs b/rust/timsquery/src/serde/mod.rs index 6c74732..a13858e 100644 --- a/rust/timsquery/src/serde/mod.rs +++ b/rust/timsquery/src/serde/mod.rs @@ -7,7 +7,9 @@ mod library_file; pub use chromatogram_output::*; pub use index_serde::*; pub use library_file::{ + DiannPrecursorExtras, ElutionGroupCollection, + FileReadingExtras, LibraryReadingError, read_library_file, }; diff --git a/rust/timsquery/src/traits/queriable_data.rs b/rust/timsquery/src/traits/queriable_data.rs index bf166ca..a20a715 100644 --- a/rust/timsquery/src/traits/queriable_data.rs +++ b/rust/timsquery/src/traits/queriable_data.rs @@ -12,6 +12,7 @@ use crate::{ use rayon::prelude::*; use std::ops::AddAssign; use timscentroid::indexing::IndexedPeak; +use timscentroid::rt_mapping::RTIndex; /// Trait indicating that indexed data can be queried with a specific aggregator type. 
/// @@ -107,7 +108,7 @@ where /// - Implement `Default` for initialization /// /// Common implementations: `f32`, custom statistics collectors. -pub trait PeakAddable: AddAssign + ValueLike + Default {} +pub trait PeakAddable: AddAssign> + ValueLike + Default {} /// Convenience trait for indexed data that supports all common aggregator types. /// diff --git a/rust/timsquery/tests/diann_io_files/sample_lib.tsv b/rust/timsquery/tests/diann_io_files/sample_lib.tsv new file mode 100644 index 0000000..6ecf0d5 --- /dev/null +++ b/rust/timsquery/tests/diann_io_files/sample_lib.tsv @@ -0,0 +1,10 @@ +FileName PrecursorMz ProductMz Tr_recalibrated IonMobility transition_name LibraryIntensity transition_group_id decoy PeptideSequence Proteotypic QValue PGQValue Ms1ProfileCorr ProteinGroup ProteinName Genes FullUniModPeptideName ModifiedPeptide PrecursorCharge PeptideGroupLabel UniprotID NTerm CTerm FragmentType FragmentCharge FragmentSeriesNumber FragmentLossType ExcludeFromAssay +MSR39214_DIA.d 478.78064 672.40503 -14.411617 0.8309375 AAAAAAALQAK2_121_1_0_4 1 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 y 1 7 noloss False +MSR39214_DIA.d 478.78064 601.36786 -14.411617 0.8309375 AAAAAAALQAK2_121_1_0_5 0.81050944 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 y 1 6 noloss False +MSR39214_DIA.d 478.78064 214.11917 -14.411617 0.8309375 AAAAAAALQAK2_98_1_0_3 0.60258061 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 b 1 3 noloss True +MSR39214_DIA.d 478.78064 743.44214 -14.411617 0.8309375 AAAAAAALQAK2_121_1_0_3 0.55991459 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 y 1 8 noloss True +MSR39214_DIA.d 478.78064 530.33075 -14.411617 0.8309375 AAAAAAALQAK2_121_1_0_6 0.42974067 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 y 1 5 noloss True +MSR39214_DIA.d 478.78064 814.47925 -14.411617 0.8309375 AAAAAAALQAK2_121_1_0_2 0.40478963 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 y 1 9 noloss False +MSR39214_DIA.d 478.78064 285.15628 -14.411617 0.8309375 AAAAAAALQAK2_98_1_0_4 0.27873087 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 b 1 4 noloss True +MSR39214_DIA.d 478.78064 459.29367 -14.411617 0.8309375 AAAAAAALQAK2_121_1_0_7 0.23921025 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 y 1 4 noloss True +MSR39214_DIA.d 478.78064 346.20959 -14.411617 0.8309375 AAAAAAALQAK2_121_1_0_8 0.17267415 AAAAAAALQAK2 0 AAAAAAALQAK 1 6.483484e-06 0.00050479558 0.99249279 P36578 RL4_HUMAN RPL4 AAAAAAALQAK AAAAAAALQAK 2 AAAAAAALQAK P36578 0 0 y 1 3 noloss True diff --git a/rust/timsquery_cli/Cargo.toml b/rust/timsquery_cli/Cargo.toml index 2295ec6..36380d2 100644 --- a/rust/timsquery_cli/Cargo.toml +++ b/rust/timsquery_cli/Cargo.toml @@ -20,5 +20,8 @@ rayon = { workspace = true } half = { workspace = true } thiserror = { workspace = true } # User requested +[dev-dependencies] +tempfile = { workspace = true } 
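// ---------------------------------------------------------------------------------
// Editor's example (hedged; not part of the patch). The library-reading changes above
// make every ElutionGroupCollection variant carry an Option<FileReadingExtras>, and
// DIA-NN TSV libraries (such as the sample_lib.tsv fixture added in this diff) fill it
// with per-precursor DiannPrecursorExtras. A consumption sketch, assuming
// `read_library_file` accepts a path reference as it is called elsewhere in this diff;
// the fixture path below is illustrative.
use std::path::PathBuf;
use timsquery::serde::{ElutionGroupCollection, FileReadingExtras, read_library_file};

fn inspect_diann_library() {
    let path = PathBuf::from("rust/timsquery/tests/diann_io_files/sample_lib.tsv");
    match read_library_file(&path) {
        Ok(ElutionGroupCollection::MzpafLabels(egs, Some(FileReadingExtras::Diann(extras)))) => {
            // DIA-NN rows are grouped into mzPAF-labelled elution groups,
            // with one extras record per precursor.
            assert_eq!(egs.len(), extras.len());
        }
        Ok(other) => println!("parsed {} groups without DIA-NN extras", other.len()),
        Err(e) => eprintln!("failed to read library: {e:?}"),
    }
}
// ---------------------------------------------------------------------------------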
+ [target.'cfg(windows)'.dependencies] mimalloc = { workspace = true, features = ["secure"] } diff --git a/rust/timsquery_cli/data_contracts/single_elution_group.json b/rust/timsquery_cli/data_contracts/single_elution_group.json new file mode 100644 index 0000000..e7b506f --- /dev/null +++ b/rust/timsquery_cli/data_contracts/single_elution_group.json @@ -0,0 +1,152 @@ +[ + { + "fragment_labels":[ + "y1", + "y1^2", + "y2", + "y2^2", + "y3", + "y3^2", + "y4", + "y4^2", + "y5", + "y5^2", + "y6", + "y6^2", + "y7", + "y7^2", + "y8", + "y8^2", + "y9", + "y9^2", + "y10", + "y10^2", + "y11", + "y11^2", + "y12", + "y12^2", + "y13", + "y13^2", + "y14", + "y14^2", + "y15", + "y15^2", + "y16", + "y16^2", + "y17", + "y17^2", + "b1", + "b1^2", + "b2", + "b2^2", + "b3", + "b3^2", + "b4", + "b4^2", + "b5", + "b5^2", + "b6", + "b6^2", + "b7", + "b7^2", + "b8", + "b8^2", + "b9", + "b9^2", + "b10", + "b10^2", + "b11", + "b11^2", + "b12", + "b12^2", + "b13", + "b13^2", + "b14", + "b14^2", + "b15", + "b15^2", + "b16", + "b16^2", + "b17", + "b17^2", + "p^2" + ], + "fragments":[ + 147.112804164512, + 74.060040315662, + 248.160482632922, + 124.58387954986699, + 347.22889654591205, + 174.11808650636203, + 418.26601033062207, + 209.63664339871704, + 533.292953354452, + 267.150114910632, + 590.314417075022, + 295.660846770917, + 677.346445479292, + 339.176860973052, + 792.373388503122, + 396.690332484967, + 849.3948522236919, + 425.20106434525195, + 920.4319660084019, + 460.71962123760693, + 977.4534297289719, + 489.2303530978919, + 1034.474893449542, + 517.741084958177, + 1105.512007234252, + 553.259641850532, + 1162.533470954822, + 581.770373710817, + 1233.570584739532, + 617.288930603172, + 1304.607698524242, + 652.8074874955271, + 1375.6448123089522, + 688.3260443878821, + 72.044390251522, + 36.525833359167, + 143.081504036232, + 72.04439025152199, + 214.11861782094198, + 107.56294714387698, + 285.155731605652, + 143.08150403623202, + 342.17719532622203, + 171.59223589651702, + 413.21430911093205, + 207.11079278887203, + 470.23577283150206, + 235.62152464915704, + 527.257236552072, + 264.13225650944196, + 598.2943503367819, + 299.65081340179694, + 655.3158140573519, + 328.1615452620819, + 770.3427570811818, + 385.6750167739969, + 857.3747854854519, + 429.1910309761319, + 914.3962492060218, + 457.7017628364169, + 1029.423192229852, + 515.215234348332, + 1100.460306014562, + 550.7337912406871, + 1199.5287199275522, + 600.2679981971821, + 1300.576398395962, + 650.7918374313871, + 723.844601280237 + ], + "id":0, + "mobility":0.9851410984992981, + "precursor":723.844601280237, + "precursor_charge":2, + "precursor_isotopes":[0,1,2], + "rt_seconds":302.2712230682373 + } +] \ No newline at end of file diff --git a/rust/timsquery_cli/src/cli.rs b/rust/timsquery_cli/src/cli.rs index aada2be..9b1ee45 100644 --- a/rust/timsquery_cli/src/cli.rs +++ b/rust/timsquery_cli/src/cli.rs @@ -21,8 +21,8 @@ pub enum Commands { #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, clap::ValueEnum)] pub enum PossibleAggregator { - #[default] PointIntensityAggregator, + #[default] ChromatogramAggregator, SpectrumAggregator, } @@ -32,7 +32,7 @@ pub enum SerializationFormat { Json, #[default] PrettyJson, - // Ndjson, + Ndjson, } #[derive(Parser, Debug, Clone)] @@ -71,8 +71,4 @@ pub struct WriteTemplateArgs { /// The path to the output files. #[arg(short, long)] pub output_path: PathBuf, - - /// The number of elution groups to generate. 
- #[arg(short, long, default_value_t = 10)] - pub num_elution_groups: usize, } diff --git a/rust/timsquery_cli/src/commands.rs b/rust/timsquery_cli/src/commands.rs index 7f6e3f1..90ea357 100644 --- a/rust/timsquery_cli/src/commands.rs +++ b/rust/timsquery_cli/src/commands.rs @@ -1,33 +1,25 @@ +use serde::Serialize; +use std::fmt::Display; use std::fs::File; -use std::io::BufWriter; +use std::io::{ + self, + BufWriter, + Write, +}; use std::path::{ Path, PathBuf, }; -use std::sync::Arc; use std::time::{ Duration, Instant, }; -use serde::ser::{ - SerializeSeq, - Serializer, -}; -use timscentroid::{ - IndexedTimstofPeaks, - TimsTofPath, -}; +use timscentroid::IndexedTimstofPeaks; +use timsquery::KeyLike; use timsquery::models::elution_group::TimsElutionGroup; -use timsquery::models::tolerance::{ - MobilityTolerance, - MzTolerance, - QuadTolerance, - RtTolerance, - Tolerance, -}; -use timsquery::serde::load_index_caching; -use timsrust::MSLevel; +use timsquery::models::tolerance::Tolerance; +use timsquery::serde::load_index_auto; use tracing::{ info, instrument, @@ -42,6 +34,7 @@ use crate::cli::{ }; use crate::error::CliError; use crate::processing::AggregatorContainer; +use timsquery::serde::ElutionGroupCollection; /// Main function for the 'query-index' subcommand. #[instrument] @@ -58,13 +51,18 @@ pub fn main_query_index(args: QueryIndexArgs) -> Result<(), CliError> { "Loading elution groups from {}", elution_groups_path.display() ); - let elution_groups: Vec> = - read_query_elution_groups(&elution_groups_path)?; + let elution_groups: ElutionGroupCollection = read_query_elution_groups(&elution_groups_path)?; info!("Loaded {} elution groups", elution_groups.len()); - let index = load_index_caching(&raw_file_path) - .map_err(|e| CliError::DataReading(format!("{:?}", e)))?; - let rts = get_ms1_rts_as_millis(&raw_file_path)?; + let index = load_index_auto( + raw_file_path + .to_str() + .ok_or_else(|| CliError::DataReading("Invalid path encoding".to_string()))?, + None, // Use default config + ) + .map_err(|e| CliError::DataReading(format!("{:?}", e)))? + .into_eager() + .map_err(|e| CliError::DataReading(format!("{:?}", e)))?; let output_path = args.output_path; let serialization_format = args.format; @@ -73,27 +71,51 @@ pub fn main_query_index(args: QueryIndexArgs) -> Result<(), CliError> { std::fs::create_dir_all(&output_path)?; let put_path = output_path.join("results.json"); - stream_process_batches( - elution_groups, - aggregator_use, - rts, - &index, - &tolerance_settings, - serialization_format, - &put_path, - batch_size, - )?; + match elution_groups { + ElutionGroupCollection::StringLabels(egs, _) => stream_process_batches( + egs, + aggregator_use, + &index, + &tolerance_settings, + serialization_format, + &put_path, + batch_size, + ), + ElutionGroupCollection::MzpafLabels(egs, _) => stream_process_batches( + egs, + aggregator_use, + &index, + &tolerance_settings, + serialization_format, + &put_path, + batch_size, + ), + ElutionGroupCollection::TinyIntLabels(egs, _) => stream_process_batches( + egs, + aggregator_use, + &index, + &tolerance_settings, + serialization_format, + &put_path, + batch_size, + ), + ElutionGroupCollection::IntLabels(egs, _) => stream_process_batches( + egs, + aggregator_use, + &index, + &tolerance_settings, + serialization_format, + &put_path, + batch_size, + ), + }?; Ok(()) } /// Reads elution groups from a given path, attempting to parse them in several formats. 
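// ---------------------------------------------------------------------------------
// Editor's note (hedged; not part of the patch). The four match arms in
// main_query_index above differ only in which ElutionGroupCollection variant they
// unpack. A small dispatch macro, in the spirit of the viewer's
// `with_elution_collection!`, could collapse them. Sketch only; it assumes every
// variant payload satisfies the bounds `stream_process_batches` requires.
macro_rules! for_each_label_type {
    ($collection:expr, $run:expr) => {
        match $collection {
            ElutionGroupCollection::StringLabels(egs, _) => $run(egs),
            ElutionGroupCollection::MzpafLabels(egs, _) => $run(egs),
            ElutionGroupCollection::TinyIntLabels(egs, _) => $run(egs),
            ElutionGroupCollection::IntLabels(egs, _) => $run(egs),
        }
    };
}
// Illustrative call site (replacing the explicit match above):
// for_each_label_type!(elution_groups, |egs| stream_process_batches(
//     egs, aggregator_use, &index, &tolerance_settings,
//     serialization_format, &put_path, batch_size,
// ))?;
// ---------------------------------------------------------------------------------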
-pub fn read_query_elution_groups(path: &PathBuf) -> Result>, CliError> { +pub fn read_query_elution_groups(path: &PathBuf) -> Result { match timsquery::serde::read_library_file(path) { - Ok(timsquery::serde::ElutionGroupCollection::TinyIntLabels(egs)) => Ok(egs), - Ok(other) => Err(CliError::DataReading(format!( - "Expected elution groups with u8 labels, but got different label type: {:?}", - other - ))), + Ok(egs) => Ok(egs), Err(e) => Err(CliError::DataReading(format!( "Failed to read elution groups from {}: {:?}", path.display(), @@ -102,35 +124,131 @@ pub fn read_query_elution_groups(path: &PathBuf) -> Result Result<(), CliError> { - todo!("Ooops, this feature is not yet implemented."); + let target_dir = args.output_path; + std::fs::create_dir_all(&target_dir)?; + + let narrow_path = target_dir.join("narrow_tolerance_template.json"); + let wide_path = target_dir.join("wide_tolerance_template.json"); + std::fs::write(&narrow_path, NARROW_TOLERANCE_TEMPLATE)?; + std::fs::write(&wide_path, WIDE_TOLERANCE_TEMPLATE)?; + println!( + "Wrote tolerance templates to:\n- {}\n- {}", + narrow_path.display(), + wide_path.display() + ); + + // Elution group template + let elution_group_path = target_dir.join("elution_group_template.json"); + std::fs::write(&elution_group_path, ELUTION_GROUP_TEMPLATE)?; + println!( + "Wrote elution group template to: {}", + elution_group_path.display() + ); + Ok(()) } -/// Retrieves MS1 retention times from a TIMS-TOF file, sorted and deduped. -pub fn get_ms1_rts_as_millis(file: &PathBuf) -> Result, CliError> { - let ttp = TimsTofPath::new(file).map_err(|e| CliError::TimsFileLoad { - path: file.clone(), - source: e, - })?; - let reader = ttp.load_frame_reader()?; - let mut rts: Vec<_> = reader - .frame_metas - .iter() - .filter(|x| x.ms_level == MSLevel::MS1) - .map(|f| (f.rt_in_seconds * 1000.0).round() as u32) - .collect(); - rts.sort_unstable(); - rts.dedup(); - Ok(rts.into()) +pub struct JsonStreamSerializer { + writer: W, + format: SerializationFormat, + is_first: bool, +} + +impl JsonStreamSerializer { + pub fn new(writer: W, format: SerializationFormat) -> Self { + Self { + writer, + format, + is_first: true, + } + } + + /// Serializes an item based on the selected format. + pub fn serialize(&mut self, item: &T) -> io::Result<()> { + match self.format { + SerializationFormat::Ndjson => { + serde_json::to_writer(&mut self.writer, item).map_err(io::Error::other)?; + self.writer.write_all(b"\n")?; + } + SerializationFormat::Json | SerializationFormat::PrettyJson => { + if self.is_first { + self.writer.write_all(b"[")?; + self.is_first = false; + } else { + self.writer.write_all(b",")?; + } + + if matches!(self.format, SerializationFormat::PrettyJson) { + serde_json::to_writer_pretty(&mut self.writer, item) + } else { + serde_json::to_writer(&mut self.writer, item) + } + .map_err(io::Error::other)?; + } + } + Ok(()) + } + + /// Finalizes the output (crucial for closing JSON arrays). + pub fn finish(mut self) -> io::Result<()> { + match self.format { + SerializationFormat::Json | SerializationFormat::PrettyJson => { + if self.is_first { + // Handle case where no items were ever serialized + self.writer.write_all(b"[]")?; + } else { + self.writer.write_all(b"]")?; + } + } + SerializationFormat::Ndjson => {} // No closing tag needed + } + self.writer.flush() + } } /// Streams and processes elution groups in batches, then serializes the results. 
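// ---------------------------------------------------------------------------------
// Editor's example (hedged; not part of the patch). Usage sketch for the
// JsonStreamSerializer defined above. The stripped generic bounds are assumed to be
// `W: Write` on the struct and `T: Serialize` on `serialize`, which is what the method
// bodies imply; `Vec<u8>` stands in for the BufWriter the command actually uses.
fn stream_a_few_records() -> std::io::Result<()> {
    // NDJSON: one JSON object per line, nothing to close at the end.
    let mut ndjson = JsonStreamSerializer::new(Vec::<u8>::new(), SerializationFormat::Ndjson);
    for id in 0..3 {
        ndjson.serialize(&serde_json::json!({ "id": id }))?;
    }
    ndjson.finish()?;

    // JSON: the same calls emit a single array; finish() writes the closing bracket,
    // or "[]" if nothing was ever serialized.
    let mut json = JsonStreamSerializer::new(Vec::<u8>::new(), SerializationFormat::Json);
    json.serialize(&serde_json::json!({ "id": 0 }))?;
    json.finish()
}
// ---------------------------------------------------------------------------------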
#[instrument(skip_all)] -pub fn stream_process_batches( - elution_groups: Vec>, +pub fn stream_process_batches( + elution_groups: Vec>, aggregator_use: PossibleAggregator, - rts: Arc<[u32]>, index: &IndexedTimstofPeaks, tolerance: &Tolerance, serialization_format: SerializationFormat, @@ -151,15 +269,13 @@ pub fn stream_process_batches( match serialization_format { SerializationFormat::PrettyJson => { - let mut ser = serde_json::Serializer::pretty(writer); + let ser = JsonStreamSerializer::new(writer, SerializationFormat::PrettyJson); process_and_serialize( elution_groups, aggregator_use, - rts, index, tolerance, - &mut ser, - total_groups, + ser, total_batches, batch_size, serialization_start, @@ -167,15 +283,27 @@ pub fn stream_process_batches( )?; } SerializationFormat::Json => { - let mut ser = serde_json::Serializer::new(writer); + let ser = JsonStreamSerializer::new(writer, SerializationFormat::Json); + process_and_serialize( + elution_groups, + aggregator_use, + index, + tolerance, + ser, + total_batches, + batch_size, + serialization_start, + output_path, + )?; + } + SerializationFormat::Ndjson => { + let ser = JsonStreamSerializer::new(writer, SerializationFormat::Ndjson); process_and_serialize( elution_groups, aggregator_use, - rts, index, tolerance, - &mut ser, - total_groups, + ser, total_batches, batch_size, serialization_start, @@ -188,24 +316,17 @@ pub fn stream_process_batches( /// Processes batches of elution groups and serializes the aggregated results. #[instrument(skip_all)] -pub fn process_and_serialize( - elution_groups: Vec>, +pub fn process_and_serialize( + elution_groups: Vec>, aggregator_use: PossibleAggregator, - rts: Arc<[u32]>, index: &IndexedTimstofPeaks, tolerance: &Tolerance, - ser: S, - total_groups: usize, + mut ser: JsonStreamSerializer, total_batches: usize, batch_size: usize, serialization_start: Instant, output_path: &Path, -) -> Result<(), CliError> -where - S: Serializer, -{ - let mut seq = ser.serialize_seq(Some(total_groups)).unwrap(); - +) -> Result<(), CliError> { let mut last_progress = Instant::now(); let progress_interval = Duration::from_secs(2); @@ -220,16 +341,18 @@ where last_progress = Instant::now(); } - let mut container = - AggregatorContainer::new(chunk.to_vec(), aggregator_use, rts.clone(), tolerance)?; + let mut container = AggregatorContainer::new( + chunk.to_vec(), + aggregator_use, + index.ms1_cycle_mapping(), + tolerance, + )?; container.add_query(index, tolerance); - - container.serialize_to_seq(&mut seq, &rts)?; + container.serialize_to_seq(&mut ser, index.ms1_cycle_mapping())?; } - seq.end().unwrap(); - + ser.finish()?; let serialization_elapsed = serialization_start.elapsed(); println!("Wrote to {}", output_path.display()); println!( @@ -239,12 +362,57 @@ where Ok(()) } -/// Generates a default set of tolerance settings. 
-pub fn template_tolerance_settings() -> Tolerance {
-    Tolerance {
-        ms: MzTolerance::Ppm((15.0, 15.0)),
-        rt: RtTolerance::Unrestricted,
-        mobility: MobilityTolerance::Pct((10.0, 10.0)),
-        quad: QuadTolerance::Absolute((0.1, 0.1)),
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use timsquery::models::tolerance::{
+        MobilityTolerance,
+        MzTolerance,
+        QuadTolerance,
+        RtTolerance,
+        Tolerance,
+    };
+
+    #[test]
+    fn test_we_can_read_data_contract() {
+        let manifest_path = env!("CARGO_MANIFEST_DIR");
+        let elution_groups_path =
+            PathBuf::from(manifest_path).join("data_contracts/single_elution_group.json");
+
+        let elution_groups = read_query_elution_groups(&elution_groups_path).unwrap();
+        // Do not change that file; it is a data contract test.
+        // AKA, we promised we would be compatible with that file.
+        // If you do want to change it, contact the Carafe developers first.
+        assert!(elution_groups.len() == 1);
+    }
+
+    #[test]
+    fn test_templates_tolerance_deserializable() {
+        let narrow: Tolerance = serde_json::from_str(NARROW_TOLERANCE_TEMPLATE).unwrap();
+        let wide: Tolerance = serde_json::from_str(WIDE_TOLERANCE_TEMPLATE).unwrap();
+
+        assert!(matches!(narrow.ms, MzTolerance::Ppm(_)));
+        assert!(matches!(narrow.rt, RtTolerance::Minutes(_)));
+        assert!(matches!(narrow.mobility, MobilityTolerance::Pct(_)));
+        assert!(matches!(narrow.quad, QuadTolerance::Absolute(_)));
+
+        assert!(matches!(wide.ms, MzTolerance::Absolute(_)));
+        assert!(matches!(wide.rt, RtTolerance::Unrestricted));
+        assert!(matches!(wide.mobility, MobilityTolerance::Pct(_)));
+        assert!(matches!(wide.quad, QuadTolerance::Absolute(_)));
+    }
+
+    #[test]
+    fn test_elution_group_template_deserializable() {
+        use timsquery::IonAnnot;
+        let elution_groups =
+            serde_json::from_str::<Vec<TimsElutionGroup<IonAnnot>>>(ELUTION_GROUP_TEMPLATE)
+                .unwrap();
+        assert!(elution_groups.len() == 2);
+        // Write to a temp file ...
while I implement direct reading api + let tmp_file = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(tmp_file.path(), ELUTION_GROUP_TEMPLATE).unwrap(); + let elution_groups = read_query_elution_groups(&tmp_file.path().to_path_buf()).unwrap(); + assert!(elution_groups.len() == 2); } } diff --git a/rust/timsquery_cli/src/error.rs b/rust/timsquery_cli/src/error.rs index 0ecbcee..af88944 100644 --- a/rust/timsquery_cli/src/error.rs +++ b/rust/timsquery_cli/src/error.rs @@ -1,6 +1,4 @@ -use std::path::PathBuf; use thiserror::Error; -use timsrust::TimsTofPathError; use timsrust::readers::FrameReaderError; #[derive(Error, Debug)] @@ -11,24 +9,12 @@ pub enum CliError { #[error("JSON parsing error: {0}")] Json(#[from] serde_json::Error), - #[error("Failed to load TIMS file at '{path}': {source}")] - TimsFileLoad { - path: PathBuf, - source: TimsTofPathError, - }, - #[error("TIMS frame reader error: {0}")] FrameReader(#[from] FrameReaderError), - #[error("No non-zero intensities found for chromatogram id {0}")] - EmptyChromatogram(u64), - #[error("Data processing error: {0}")] DataProcessing(String), #[error("Data reading error: {0}")] DataReading(String), - - #[error("Non-recoverable internal error: {0}")] - NonRecoverableError(String), } diff --git a/rust/timsquery_cli/src/processing.rs b/rust/timsquery_cli/src/processing.rs index e839126..9bdd5dd 100644 --- a/rust/timsquery_cli/src/processing.rs +++ b/rust/timsquery_cli/src/processing.rs @@ -1,17 +1,21 @@ use crate::cli::PossibleAggregator; +use crate::commands::JsonStreamSerializer; use crate::error::CliError; use rayon::iter::{ ParallelDrainRange, ParallelIterator, }; -use serde::ser::SerializeSeq; use serde::{ Deserialize, Serialize, }; -use std::sync::Arc; +use std::fmt::Display; +use std::io::Write; use timscentroid::IndexedTimstofPeaks; -use timsquery::QueriableData; +use timscentroid::rt_mapping::{ + CycleToRTMapping, + MS1CycleIndex, +}; use timsquery::models::aggregators::{ ChromatogramCollector, PointIntensityAggregator, @@ -20,6 +24,10 @@ use timsquery::models::aggregators::{ use timsquery::models::elution_group::TimsElutionGroup; use timsquery::models::tolerance::Tolerance; use timsquery::serde::ChromatogramOutput; +use timsquery::{ + KeyLike, + QueriableData, +}; use tracing::{ error, warn, @@ -39,8 +47,8 @@ pub struct SpectrumOutput { precursor_labels: Vec, } -impl From<&SpectralCollector> for SpectrumOutput { - fn from(agg: &SpectralCollector) -> Self { +impl From<&SpectralCollector> for SpectrumOutput { + fn from(agg: &SpectralCollector) -> Self { let (fragment_mzs, fragment_intensities) = agg .iter_fragments() .map(|((_idx, mz), inten)| (mz, inten)) @@ -65,17 +73,17 @@ impl From<&SpectralCollector> for SpectrumOutput { } } -pub enum AggregatorContainer { - Point(Vec>), - Chromatogram(Vec>), - Spectrum(Vec>), +pub enum AggregatorContainer { + Point(Vec>), + Chromatogram(Vec>), + Spectrum(Vec>), } -impl AggregatorContainer { +impl AggregatorContainer { pub fn new( - queries: Vec>, + queries: Vec>, aggregator: PossibleAggregator, - ref_rts: Arc<[u32]>, + ref_rts: &CycleToRTMapping, tolerance: &Tolerance, ) -> Result { Ok(match aggregator { @@ -91,15 +99,13 @@ impl AggregatorContainer { .map(|x| { let rt_range = match tolerance.rt_range_as_milis(x.rt_seconds()) { timsquery::OptionallyRestricted::Unrestricted => { - timsquery::TupleRange::try_new( - *ref_rts.first().unwrap(), - *ref_rts.last().unwrap(), - ) - .expect("Reference RTs should be sorted and valid") + let range = ref_rts.range_milis(); + 
timsquery::TupleRange::try_new(range.0, range.1) + .expect("Reference RTs should be sorted and valid") } timsquery::OptionallyRestricted::Restricted(r) => r, }; - ChromatogramCollector::new(x, rt_range, &ref_rts) + ChromatogramCollector::new(x, rt_range, ref_rts) .map_err(|e| CliError::DataProcessing(format!("{:?}", e))) }) .collect::, _>>()?; @@ -125,22 +131,23 @@ impl AggregatorContainer { } } - pub fn serialize_to_seq(&mut self, seq: &mut S, ref_rts: &[u32]) -> Result<(), CliError> - where - S: SerializeSeq, - { + pub fn serialize_to_seq( + &mut self, + seq: &mut JsonStreamSerializer, + ref_rts: &CycleToRTMapping, + ) -> Result<(), CliError> { match self { AggregatorContainer::Point(aggregators) => { for agg in aggregators { - seq.serialize_element(agg).unwrap(); + seq.serialize(agg).unwrap(); } } AggregatorContainer::Chromatogram(aggregators) => { let converted_results: Vec = aggregators .par_drain(..) - .filter_map(|agg| { + .filter_map(|mut agg| { let agg_id = agg.eg.id(); - match ChromatogramOutput::try_new(agg, ref_rts) { + match ChromatogramOutput::try_new(&mut agg, ref_rts) { Ok(output) => Some(output), Err(e) => { // if !matches!(e, CliError::EmptyChromatogram(_)) { @@ -172,13 +179,13 @@ impl AggregatorContainer { .collect(); for ser_agg in converted_results { - seq.serialize_element(&ser_agg).unwrap(); + seq.serialize(&ser_agg).unwrap(); } } AggregatorContainer::Spectrum(aggregators) => { for agg in aggregators.iter() { let ser_agg = SpectrumOutput::from(agg); - seq.serialize_element(&ser_agg).unwrap(); + seq.serialize(&ser_agg).unwrap(); } } } diff --git a/rust/timsquery_viewer/Cargo.toml b/rust/timsquery_viewer/Cargo.toml index 687d0a0..76177eb 100644 --- a/rust/timsquery_viewer/Cargo.toml +++ b/rust/timsquery_viewer/Cargo.toml @@ -12,14 +12,16 @@ path = "src/main.rs" [dependencies] timsquery = { path = "../timsquery" } timscentroid = { path = "../timscentroid" } +timsseek = { path = "../timsseek" } # GUI dependencies -egui = "0.33" -eframe = { version = "0.33", default-features = true, features = ["default_fonts"] } +egui = {version = "0.33", features = ["persistence", "rayon"]} +eframe = { version = "0.33", default-features = true, features = ["default_fonts", "persistence"] } egui_plot = "0.34" egui_extras = { version = "0.33", features = ["serde"] } -egui_dock = "0.18" -rfd = "0.15" +egui_dock = { version = "0.18", features = ["serde"] } +rfd = "0.16" # File dialogs dependency +ron = "0.12.0" # Yet another serialization format ... 
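// ---------------------------------------------------------------------------------
// Editor's example (hedged; not part of the patch). `ron` joins the viewer's
// dependencies because the app now persists its state as RON and falls back to JSON
// for saves written by older builds (see ViewerApp::new further down in this diff).
// The struct and field below are illustrative only.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, PartialEq, Debug)]
struct DemoState {
    selected_index: Option<usize>,
}

fn roundtrip_demo() -> Result<(), Box<dyn std::error::Error>> {
    let state = DemoState { selected_index: Some(3) };
    let as_ron = ron::to_string(&state)?; // e.g. "(selected_index:Some(3))"
    let restored: DemoState = ron::from_str(&as_ron)?;
    assert_eq!(state, restored);
    Ok(())
}
// ---------------------------------------------------------------------------------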
# Workspace-inherited deps serde = { workspace = true } @@ -29,6 +31,10 @@ tracing-subscriber = { workspace = true } timsrust = { workspace = true } rayon = { workspace = true } thiserror = { workspace = true } +clap = { workspace = true, features = ["derive"] } + +[features] +instrumentation = ["timsseek/instrumentation"] [target.'cfg(windows)'.dependencies] mimalloc = { workspace = true, features = ["secure"] } diff --git a/rust/timsquery_viewer/src/app.rs b/rust/timsquery_viewer/src/app.rs index 1daebfc..e0f0366 100644 --- a/rust/timsquery_viewer/src/app.rs +++ b/rust/timsquery_viewer/src/app.rs @@ -1,4 +1,5 @@ use eframe::egui; +use egui::Color32; use egui_dock::{ DockArea, DockState, @@ -7,56 +8,99 @@ use egui_dock::{ }; use std::path::PathBuf; use std::sync::Arc; -use timscentroid::IndexedTimstofPeaks; use timsquery::models::tolerance::Tolerance; +use timsquery::serde::IndexedPeaksHandle; -use crate::chromatogram_processor::{ - self, - ChromatogramOutput, - SmoothingMethod, +use crate::chromatogram_processor::SmoothingMethod; +use crate::cli::Cli; +use crate::computed_state::{ + ChromatogramComputationResult, + ComputedState, }; -use crate::domain::ChromatogramService; use crate::file_loader::{ ElutionGroupData, FileLoader, }; -use crate::plot_renderer::{ - ChromatogramLines, - MS2Spectrum, -}; +use crate::plot_renderer::AutoZoomMode; use crate::ui::panels::{ - LeftPanel, + ConfigPanel, SpectrumPanel, TablePanel, }; -use crate::ui::{ - Panel, - PanelContext, +use std::sync::Arc as StdArc; +use std::sync::atomic::{ + AtomicBool, + Ordering, }; +use std::sync::mpsc::{ + Receiver, + channel, +}; + +/// Result from background chromatogram computation +type ChromatogramComputeResult = Result< + ( + crate::chromatogram_processor::ChromatogramOutput, + crate::chromatogram_processor::ChromatogramCollector, + timsseek::ExpectedIntensities, + u64, // selected_idx as cache key + ), + String, +>; + +/// Handle to cancel a running background computation +#[derive(Clone)] +struct CancellationToken { + cancelled: StdArc, +} + +impl CancellationToken { + fn new() -> Self { + Self { + cancelled: StdArc::new(AtomicBool::new(false)), + } + } + + fn cancel(&self) { + self.cancelled.store(true, Ordering::Relaxed); + } -/// Commands that trigger state changes in the application -#[derive(Debug, Clone)] -pub enum AppCommand { - /// Regenerate the chromatogram for the current selection - RegenerateChromatogram, - /// Select a specific elution group by index - SelectElutionGroup(usize), - /// Update the tolerance settings - UpdateTolerance, - /// Update the smoothing settings - UpdateSmoothing, - /// Query MS2 spectrum at given RT (seconds) - QueryMS2Spectrum(f64), + fn is_cancelled(&self) -> bool { + self.cancelled.load(Ordering::Relaxed) + } } +// UI spacing and layout constants +const SECTION_MARGIN: i8 = 10; +const SECTION_SPACING: f32 = 12.0; +const INTERNAL_SPACING: f32 = 8.0; +const SMALL_SPACING: f32 = 4.0; +const SMALL_SPACING_VERTICAL: f32 = 6.0; +const SEPARATOR_SPACING: f32 = 16.0; +const LOCATION_DISPLAY_MAX_LEN: usize = 40; +const CORNER_RADIUS: u8 = 4; +const LOCATION_FRAME_PADDING_H: i8 = 8; +const LOCATION_FRAME_PADDING_V: i8 = 4; + /// Pane types for the tile layout -#[derive(Debug)] +#[derive(Debug, Clone, Copy, serde::Deserialize, serde::Serialize)] enum Pane { - LeftPanel, + ConfigPanel, TablePanel, MS2Spectrum, PrecursorPlot, FragmentPlot, + ScoresPlot, +} + +/// State to be persisted across restarts +#[derive(serde::Deserialize, serde::Serialize)] +struct PersistentState { + 
file_loader: FileLoader, + ui_state: UiState, + tolerance: Tolerance, + smoothing: SmoothingMethod, + dock_state: DockState, } /// State of indexed raw data loading @@ -65,14 +109,14 @@ pub enum IndexedDataState { /// No data loaded #[default] None, - /// Currently loading data from path - Loading(PathBuf), + /// Currently loading data from location (path or URL) + Loading(String), /// Loading failed with error message - Failed(PathBuf, String), + Failed(String, String), /// Data successfully loaded Loaded { - index: Arc, - ms1_rts: Arc<[u32]>, + index: Arc, + source: String, }, } @@ -81,16 +125,28 @@ pub enum IndexedDataState { pub struct DataState { /// Loaded elution groups pub elution_groups: Option, + /// Path from which the elution groups were loaded + pub elution_groups_source: Option, /// Indexed timsTOF data with loading state pub indexed_data: IndexedDataState, /// Tolerance settings pub tolerance: Tolerance, /// Smoothing method configuration pub smoothing: SmoothingMethod, + /// Auto-zoom mode for plots + pub auto_zoom_mode: AutoZoomMode, +} + +/// Input mode for raw data loading +#[derive(Debug, Clone, Copy, PartialEq, serde::Deserialize, serde::Serialize, Default)] +pub enum RawDataInputMode { + #[default] + Local, + Cloud, } /// UI-specific state - transient UI state that doesn't affect data -#[derive(Debug)] +#[derive(Debug, serde::Deserialize, serde::Serialize, Default)] pub struct UiState { /// Filter text for precursor table pub table_filter: String, @@ -100,35 +156,8 @@ pub struct UiState { pub search_mode: bool, /// Vim mode: search input buffer pub search_input: String, - /// Master toggle for MS2 spectrum feature - pub show_ms2_spectrum: bool, -} - -impl Default for UiState { - fn default() -> Self { - Self { - table_filter: String::new(), - selected_index: None, - search_mode: false, - search_input: String::new(), - show_ms2_spectrum: true, // Enable MS2 by default - } - } -} - -/// Computed/cached state - derived from data and UI state -#[derive(Debug, Default)] -pub struct ComputedState { - /// Computed chromatogram for the selected elution group (plot data) - pub chromatogram: Option, - /// X-axis bounds to apply on next plot render (min_rt, max_rt) - pub chromatogram_x_bounds: Option<(f64, f64)>, - /// Raw chromatogram output data (for MS2 extraction) - pub chromatogram_output: Option, - /// Computed MS2 spectrum at selected RT - pub ms2_spectrum: Option, - /// Whether the chromatogram has been automatically zoomed to data range - pub chromatogram_auto_zoom_applied: bool, + /// Raw data input mode (Local or Cloud) + pub raw_data_input_mode: RawDataInputMode, } /// Main application state @@ -145,23 +174,26 @@ pub struct ViewerApp { /// Computed/cached state computed: ComputedState, - /// Pending commands to be executed - pending_commands: Vec, - /// Dock state for layout management dock_state: DockState, /// UI Panels - left_panel: LeftPanel, + config_panel: ConfigPanel, table_panel: TablePanel, spectrum_panel: SpectrumPanel, + + /// Receiver for background chromatogram computation + chromatogram_receiver: Option>, + /// Token to cancel current background computation + cancellation_token: Option, } impl ViewerApp { - pub fn new(_cc: &eframe::CreationContext<'_>) -> Self { - // Create initial tabs: Settings, Table, Precursors, Fragments, MS2 + /// Create a new test instance without eframe context + #[cfg(test)] + pub fn new_test() -> Self { let tabs = vec![ - Pane::LeftPanel, + Pane::ConfigPanel, Pane::TablePanel, Pane::PrecursorPlot, Pane::FragmentPlot, @@ 
-175,67 +207,94 @@ impl ViewerApp { data: DataState::default(), ui: UiState::default(), computed: ComputedState::default(), - pending_commands: Vec::new(), dock_state, - left_panel: LeftPanel::new(), + config_panel: ConfigPanel::new(), table_panel: TablePanel::new(), spectrum_panel: SpectrumPanel::new(), + chromatogram_receiver: None, + cancellation_token: None, } } - fn handle_commands(&mut self) { - let commands = std::mem::take(&mut self.pending_commands); - - for cmd in commands { - tracing::debug!("Handling command: {:?}", cmd); - - match cmd { - AppCommand::RegenerateChromatogram => { - self.generate_chromatogram(); - } - AppCommand::SelectElutionGroup(idx) => { - self.ui.selected_index = Some(idx); - self.pending_commands - .push(AppCommand::RegenerateChromatogram); - } - AppCommand::UpdateTolerance => { - if self.ui.selected_index.is_some() { - self.pending_commands - .push(AppCommand::RegenerateChromatogram); - } - } - AppCommand::UpdateSmoothing => { - if self.ui.selected_index.is_some() { - self.pending_commands - .push(AppCommand::RegenerateChromatogram); + pub fn new(cc: &eframe::CreationContext<'_>, args: &Cli) -> Self { + // Try to load previous state + if let Some(storage) = cc.storage { + if let Some(state_string) = storage.get_string(eframe::APP_KEY) { + // Try RON first (new format), then fallback to JSON (legacy) + let state = match ron::from_str::(&state_string) { + Ok(state) => { + tracing::info!("Loaded persistent state successfully (RON)."); + Some(state) } - } - AppCommand::QueryMS2Spectrum(rt_seconds) => { - if let Some(chrom_output) = &self.computed.chromatogram_output { - match chromatogram_processor::extract_ms2_spectrum_from_chromatogram( - chrom_output, - rt_seconds, - ) { - Ok(spectrum) => { - let num_peaks = spectrum.mz_values.len(); - self.computed.ms2_spectrum = Some(spectrum); - tracing::info!( - "Extracted MS2 spectrum at RT {:.2}s with {} peaks", - rt_seconds, - num_peaks - ); + Err(ron_err) => { + // Fallback to JSON for backward compatibility + match serde_json::from_str::(&state_string) { + Ok(state) => { + tracing::info!("Loaded persistent state successfully (JSON)."); + Some(state) } - Err(e) => { - tracing::error!("Failed to extract MS2 spectrum: {:?}", e); - self.computed.ms2_spectrum = None; + Err(json_err) => { + tracing::warn!( + "Failed to deserialize persistent state. 
RON: {:?}, JSON: {:?}", + ron_err, + json_err + ); + None } } - } else { - tracing::warn!("No chromatogram data available for MS2 extraction"); } + }; + + if let Some(state) = state { + return Self { + file_loader: state + .file_loader + .with_initial_paths(&args.raw_data_path, &args.elution_groups_path), + data: DataState { + tolerance: state.tolerance, + smoothing: state.smoothing, + ..DataState::default() + }, + ui: state.ui_state, + computed: ComputedState::default(), + dock_state: state.dock_state, + config_panel: ConfigPanel::new(), + table_panel: TablePanel::new(), + spectrum_panel: SpectrumPanel::new(), + chromatogram_receiver: None, + cancellation_token: None, + }; } + } else { + tracing::info!("No persistent state found."); } } + + // Create initial tabs: Settings, Table, Precursors, Fragments, MS2 + let tabs = vec![ + Pane::ConfigPanel, + Pane::TablePanel, + Pane::PrecursorPlot, + Pane::FragmentPlot, + Pane::MS2Spectrum, + Pane::ScoresPlot, + ]; + + let dock_state = DockState::new(tabs); + + Self { + file_loader: FileLoader::new() + .with_initial_paths(&args.raw_data_path, &args.elution_groups_path), + data: DataState::default(), + ui: UiState::default(), + computed: ComputedState::default(), + dock_state, + config_panel: ConfigPanel::new(), + table_panel: TablePanel::new(), + spectrum_panel: SpectrumPanel::new(), + chromatogram_receiver: None, + cancellation_token: None, + } } fn handle_vim_keys(&mut self, ctx: &egui::Context) { @@ -247,12 +306,9 @@ impl ViewerApp { if self.ui.search_mode { if i.key_pressed(egui::Key::Escape) { self.ui.search_mode = false; - self.ui.search_input.clear(); } if i.key_pressed(egui::Key::Enter) { - self.ui.table_filter = self.ui.search_input.clone(); self.ui.search_mode = false; - self.ui.search_input.clear(); } return; } @@ -263,37 +319,46 @@ impl ViewerApp { return; } - let Some(egs) = &self.data.elution_groups else { + if self.data.elution_groups.is_none() { return; - }; + } // Cache filtered indices - computed once and reused - let filtered_indices = egs.matching_indices_for_id_filter(&self.ui.table_filter); - - if filtered_indices.is_empty() { - return; - } + let filtered_indices = &self.table_panel.filtered_indices(); + let cursor = &mut self.ui.selected_index; if i.key_pressed(egui::Key::J) && !i.modifiers.any() { - self.move_selection_down(&filtered_indices); + Self::move_selection_down(cursor, filtered_indices); } if i.key_pressed(egui::Key::K) && !i.modifiers.any() { - self.move_selection_up(&filtered_indices); + Self::move_selection_up(cursor, filtered_indices); } if i.key_pressed(egui::Key::G) && !i.modifiers.any() { - self.move_selection_to_first(&filtered_indices); + Self::move_selection_to_first(cursor, filtered_indices); } if i.key_pressed(egui::Key::G) && i.modifiers.shift_only() { - self.move_selection_to_last(&filtered_indices); + Self::move_selection_to_last(cursor, filtered_indices); } }); } - fn generate_chromatogram(&mut self) { - let IndexedDataState::Loaded { index, ms1_rts } = &self.data.indexed_data else { + /// Generate MS2 spectrum if user clicked on a new RT position + fn generate_ms2_spectrum_if_needed(&mut self) { + let Some(requested_rt) = self.computed.clicked_rt else { + return; + }; + + if self.computed.generate_spectrum_at_rt(requested_rt) { + self.computed + .insert_reference_line("Clicked RT".into(), requested_rt, Color32::GREEN); + } + } + + fn generate_chromatogram(&mut self, ctx: &egui::Context) { + let IndexedDataState::Loaded { index, .. 
} = &self.data.indexed_data else { return; }; @@ -302,43 +367,200 @@ impl ViewerApp { None => return, }; - if let Some(elution_groups) = &self.data.elution_groups { - macro_rules! process_chromatogram { - ($egs:expr) => { - match ChromatogramService::generate( - &$egs[selected_idx], - index, - Arc::clone(ms1_rts), - &self.data.tolerance, - &self.data.smoothing, - ) { - Ok(chrom) => { - let chrom_lines = ChromatogramLines::from_chromatogram(&chrom); - self.computed.chromatogram_x_bounds = - Some(chrom_lines.rt_seconds_range); - self.computed.chromatogram = Some(chrom_lines); - self.computed.chromatogram_output = Some(chrom); - self.computed.chromatogram_auto_zoom_applied = false; - } - Err(e) => { - tracing::error!("Failed to generate chromatogram: {:?}", e); - self.computed.chromatogram = None; - self.computed.chromatogram_output = None; - self.computed.chromatogram_x_bounds = None; - } + let Some(elution_groups) = &self.data.elution_groups else { + return; + }; + + if !self + .computed + .is_cache_valid(selected_idx, &self.data.tolerance, &self.data.smoothing) + { + let is_new_request = self + .computed + .computing_index() + .map(|computing_idx| computing_idx != selected_idx as u64) + .unwrap_or(true); + + if self.computed.is_computing() && !is_new_request { + tracing::trace!( + "Chromatogram computation already in progress for index {}, skipping", + selected_idx + ); + return; + } + + let Ok((elution_group, expected_intensities)) = elution_groups.get_elem(selected_idx) + else { + tracing::error!("Invalid elution group index: {}", selected_idx); + return; + }; + + if self.computed.is_computing() && is_new_request { + tracing::debug!( + "Cancelling previous chromatogram computation (switching to index {})", + selected_idx + ); + if let Some(token) = &self.cancellation_token { + token.cancel(); + } + self.chromatogram_receiver = None; + self.cancellation_token = None; + } + + tracing::debug!( + "Starting new chromatogram computation for index {}", + selected_idx + ); + self.computed.start_computing(selected_idx as u64); + + let cancel_token = CancellationToken::new(); + self.cancellation_token = Some(cancel_token.clone()); + + let (tx, rx) = channel(); + self.chromatogram_receiver = Some(rx); + + let index_owned = index.clone(); + let elution_group_owned = elution_group.clone(); + let expected_intensities_owned = expected_intensities.clone(); + let tolerance_owned = self.data.tolerance.clone(); + let smoothing_owned = self.data.smoothing; + let ctx_clone = ctx.clone(); + match std::thread::Builder::new() + .name(format!("chrom-{}", selected_idx)) + .spawn(move || { + tracing::debug!( + "Starting chromatogram computation for elution group {}", + selected_idx + ); + let result = Self::compute_chromatogram_background( + elution_group_owned, + expected_intensities_owned, + selected_idx, + index_owned, + tolerance_owned, + smoothing_owned, + cancel_token, + ); + if let Err(e) = &result { + tracing::error!("Chromatogram computation failed: {}", e); + } else { + tracing::debug!( + "Chromatogram computation completed for elution group {}", + selected_idx + ); } - }; + let _ = tx.send(result); + ctx_clone.request_repaint(); + }) { + Ok(_) => { + tracing::debug!("Successfully spawned chromatogram computation thread"); + } + Err(e) => { + tracing::error!("Failed to spawn chromatogram thread: {}", e); + self.computed.cancel_computing(); + self.chromatogram_receiver = None; + self.cancellation_token = None; + } } + } + } + + /// Compute chromatogram in background thread + fn 
compute_chromatogram_background( + elution_group: timsquery::models::elution_group::TimsElutionGroup, + expected_intensities: timsseek::ExpectedIntensities, + selected_idx: usize, + index: Arc, + tolerance: Tolerance, + smoothing: SmoothingMethod, + cancel_token: CancellationToken, + ) -> ChromatogramComputeResult { + // Check if cancelled before starting + if cancel_token.is_cancelled() { + tracing::debug!("Chromatogram computation cancelled before starting"); + return Err("Computation cancelled".to_string()); + } + + // Build collector + let mut collector = ComputedState::build_collector(&index, elution_group.clone()) + .map_err(|e| format!("Failed to build collector: {:?}", e))?; + + // Check if cancelled after building collector + if cancel_token.is_cancelled() { + tracing::debug!("Chromatogram computation cancelled after building collector"); + return Err("Computation cancelled".to_string()); + } + + // Generate chromatogram + let output = ComputedState::generate_chromatogram( + &mut collector, + &elution_group, + &index, + &tolerance, + &smoothing, + ) + .map_err(|e| format!("Failed to generate chromatogram: {:?}", e))?; + + // Check if cancelled after generating chromatogram + if cancel_token.is_cancelled() { + tracing::debug!("Chromatogram computation cancelled after generation"); + return Err("Computation cancelled".to_string()); + } - crate::with_elution_collection!(elution_groups, process_chromatogram); + Ok((output, collector, expected_intensities, selected_idx as u64)) + } + + /// Check if background chromatogram computation completed + fn check_chromatogram_completion(&mut self) { + let Some(rx) = &self.chromatogram_receiver else { + return; }; - if self.ui.show_ms2_spectrum - && let Some(chrom_output) = &self.computed.chromatogram_output - { - let reference_rt = chrom_output.rt_seconds as f64; - self.pending_commands - .push(AppCommand::QueryMS2Spectrum(reference_rt)); + // Try to receive result (non-blocking) + if let Ok(result) = rx.try_recv() { + self.chromatogram_receiver = None; + self.cancellation_token = None; + + match result { + Ok((output, collector, expected_intensities, selected_idx)) => { + tracing::debug!( + "Chromatogram computation result received for index {}", + selected_idx + ); + + if let IndexedDataState::Loaded { index, .. 
} = &self.data.indexed_data { + let result = ChromatogramComputationResult { + selected_idx, + output, + collector, + expected_intensities, + }; + + self.computed.complete_chromatogram_computation( + result, + index, + &self.data.tolerance, + self.data.smoothing, + ); + } + + // Add library RT reference line + if let Some(elution_groups) = &self.data.elution_groups + && let Ok((elution_group, _)) = + elution_groups.get_elem(selected_idx as usize) + { + self.computed.insert_reference_line( + "Library RT".into(), + elution_group.rt_seconds() as f64, + Color32::BLUE, + ); + } + } + Err(e) => { + tracing::error!("Chromatogram computation failed: {}", e); + self.computed.cancel_computing(); + } + } } } @@ -347,20 +569,31 @@ impl ViewerApp { file_loader: &mut FileLoader, data: &mut DataState, ) { - ui.label("Elution Groups:"); - if ui.button("Load Elution Groups...").clicked() { - file_loader.open_elution_groups_dialog(); - } + egui::Frame::group(ui.style()) + .inner_margin(egui::Margin::same(SECTION_MARGIN)) + .show(ui, |ui| { + ui.heading("Elution Groups"); + ui.add_space(INTERNAL_SPACING); + + if ui.button("Load Elution Groups...").clicked() { + file_loader.open_elution_groups_dialog(); + } - if let Some(path) = &file_loader.elution_groups_path { - Self::display_filename(ui, path); - } + if let Some(path) = &file_loader.elution_groups_path { + ui.add_space(SMALL_SPACING); + Self::display_filename(ui, path); + } - Self::load_elution_groups_if_needed(ui, file_loader, data); + Self::load_elution_groups_if_needed(ui, file_loader, data); - if let Some(egs) = &data.elution_groups { - ui.label(format!("✓ Loaded: {} elution groups", egs.len())); - } + if let Some(egs) = &data.elution_groups { + ui.add_space(SMALL_SPACING); + ui.label( + egui::RichText::new(format!("✓ {} groups loaded", egs.len())) + .color(egui::Color32::DARK_GREEN), + ); + } + }); } fn render_raw_data_section_static( @@ -368,20 +601,141 @@ impl ViewerApp { file_loader: &mut FileLoader, data: &mut DataState, ui_state: &mut UiState, + computed: &mut ComputedState, ) { - ui.label("Raw Data File (.d):"); - if ui.button("Load Raw Data...").clicked() { - file_loader.open_raw_data_dialog(); - } + egui::Frame::group(ui.style()) + .inner_margin(egui::Margin::same(SECTION_MARGIN)) + .show(ui, |ui| { + ui.heading("Raw Data"); + ui.add_space(INTERNAL_SPACING); - if let Some(path) = &file_loader.raw_data_path { - Self::display_filename(ui, path); - } + // Clearer tab selector with better visual style + ui.horizontal(|ui| { + ui.label("Source:"); + ui.radio_value( + &mut ui_state.raw_data_input_mode, + RawDataInputMode::Local, + "Local File", + ); + ui.radio_value( + &mut ui_state.raw_data_input_mode, + RawDataInputMode::Cloud, + "Cloud URL", + ); + }); + + ui.add_space(INTERNAL_SPACING); + + match ui_state.raw_data_input_mode { + RawDataInputMode::Local => { + // Existing folder picker button + if ui.button("Browse for Raw Data...").clicked() { + file_loader.open_raw_data_dialog(); + } + ui.label( + egui::RichText::new("Select a .d folder or .idx cache") + .small() + .weak(), + ); + } + RawDataInputMode::Cloud => { + // URL input field + ui.vertical(|ui| { + ui.label("Enter cloud storage URL:"); + let mut url_buffer = + file_loader.raw_data_url.clone().unwrap_or_default(); + let response = ui.add( + egui::TextEdit::singleline(&mut url_buffer) + .hint_text("s3://bucket/data.idx or gs://bucket/data.idx") + .desired_width(f32::INFINITY), + ); + + if response.changed() { + if !url_buffer.is_empty() { + file_loader.set_raw_data_url(url_buffer); + } 
else { + file_loader.clear_raw_data(); + } + } + + ui.add_space(SMALL_SPACING); + // Show examples in a collapsible section + ui.collapsing("Examples", |ui| { + ui.label(egui::RichText::new("S3:").strong().small()); + ui.label( + egui::RichText::new(" s3://bucket/experiment.d.idx") + .code() + .small(), + ); + ui.add_space(2.0); + ui.label(egui::RichText::new("Google Cloud:").strong().small()); + ui.label( + egui::RichText::new(" gs://bucket/experiment.d.idx") + .code() + .small(), + ); + }); + }); + } + } + + // Show current location with clear button + if let Some(location) = file_loader.get_raw_data_location() { + ui.add_space(SMALL_SPACING_VERTICAL); + egui::Frame::new() + .fill(ui.visuals().faint_bg_color) + .corner_radius(egui::CornerRadius::same(CORNER_RADIUS)) + .inner_margin(egui::Margin::symmetric( + LOCATION_FRAME_PADDING_H, + LOCATION_FRAME_PADDING_V, + )) + .show(ui, |ui| { + ui.horizontal(|ui| { + let display_text = + Self::truncate_middle(&location, LOCATION_DISPLAY_MAX_LEN); + ui.label(egui::RichText::new(display_text).small()) + .on_hover_text(&location); + + ui.with_layout( + egui::Layout::right_to_left(egui::Align::Center), + |ui| { + if ui.small_button("Clear").clicked() { + file_loader.clear_raw_data(); + } + }, + ); + }); + }); + } - Self::load_raw_data_if_needed(ui, file_loader, data); + Self::load_raw_data_if_needed(ui, file_loader, data, computed); + + if let IndexedDataState::Loaded { source, .. } = &data.indexed_data { + ui.add_space(SMALL_SPACING); + ui.label( + egui::RichText::new("✓ Raw data indexed").color(egui::Color32::DARK_GREEN), + ); - if matches!(data.indexed_data, IndexedDataState::Loaded { .. }) { - ui.label("✓ Raw data indexed"); + // Show data source with middle truncation for long paths + let display_source = Self::truncate_middle(source, 60); + + ui.label( + egui::RichText::new(format!("Source: {}", display_source)) + .small() + .color(egui::Color32::GRAY), + ); + } + }); + } + + /// Helper to truncate long paths/URLs for display + fn truncate_middle(s: &str, max_len: usize) -> String { + if s.len() <= max_len { + s.to_string() + } else { + let start = &s[..max_len / 2 - 2]; + let end = &s[s.len() - max_len / 2 + 2..]; + format!("{}...{}", start, end) } } @@ -390,18 +744,29 @@ impl ViewerApp { file_loader: &mut FileLoader, data: &mut DataState, ) { - ui.label("Tolerance Settings:"); - if ui.button("Load Tolerances...").clicked() { - file_loader.open_tolerance_dialog(); - } + egui::Frame::group(ui.style()) + .inner_margin(egui::Margin::same(SECTION_MARGIN)) + .show(ui, |ui| { + ui.heading("Tolerance Settings"); + ui.add_space(INTERNAL_SPACING); + + if ui.button("Load Tolerances...").clicked() { + file_loader.open_tolerance_dialog(); + } - if let Some(path) = &file_loader.tolerance_path { - Self::display_filename(ui, path); - } + if let Some(path) = &file_loader.tolerance_path { + ui.add_space(SMALL_SPACING); + Self::display_filename(ui, path); + } - Self::load_tolerance_if_needed(file_loader, data); + Self::load_tolerance_if_needed(file_loader, data); - ui.label("Tolerance settings loaded"); + ui.add_space(SMALL_SPACING); + ui.label( + egui::RichText::new("✓ Tolerance settings loaded") + .color(egui::Color32::DARK_GREEN), + ); + }); } fn display_filename(ui: &mut egui::Ui, path: &std::path::Path) { @@ -418,20 +783,26 @@ impl ViewerApp { file_loader: &mut FileLoader, data: &mut DataState, ) { - if let Some(path) = &file_loader.elution_groups_path - && data.elution_groups.is_none() - { - ui.horizontal(|ui| { - ui.spinner(); - ui.label("Loading 
elution groups..."); - }); - match file_loader.load_elution_groups(path) { - Ok(egs) => { - tracing::info!("Loaded {} elution groups", egs.len()); - data.elution_groups = Some(egs); - } - Err(e) => { - tracing::error!("Failed to load elution groups: {:?}", e); + if let Some(path) = &file_loader.elution_groups_path { + let should_load = match &data.elution_groups_source { + Some(current_path) => current_path != path, + None => true, + }; + + if should_load { + ui.horizontal(|ui| { + ui.spinner(); + ui.label("Loading elution groups..."); + }); + match file_loader.load_elution_groups(path) { + Ok(egs) => { + tracing::info!("Loaded {} elution groups", egs.len()); + data.elution_groups = Some(egs); + data.elution_groups_source = Some(path.clone()); + } + Err(e) => { + tracing::error!("Failed to load elution groups: {:?}", e); + } } } } @@ -441,43 +812,55 @@ impl ViewerApp { ui: &mut egui::Ui, file_loader: &mut FileLoader, data: &mut DataState, + computed: &mut ComputedState, ) { - if let Some(path) = &file_loader.raw_data_path { - // Transition from None to Loading - if matches!(data.indexed_data, IndexedDataState::None) { - data.indexed_data = IndexedDataState::Loading(path.clone()); + if let Some(location) = file_loader.get_raw_data_location() { + // Check if we need to load new data + let should_load = match &data.indexed_data { + IndexedDataState::None => true, + IndexedDataState::Loading(current_location) => current_location != &location, + IndexedDataState::Loaded { source, .. } => source != &location, + IndexedDataState::Failed(_, _) => true, + }; + + // Transition to Loading if new location + if should_load { + tracing::info!("Starting to load new raw data from: {}", location); + // Clear computed state to avoid showing stale chromatograms from old index + computed.clear(); + data.indexed_data = IndexedDataState::Loading(location.clone()); ui.ctx().request_repaint(); } } match &data.indexed_data { - IndexedDataState::Loading(path) => { + IndexedDataState::Loading(location) => { ui.horizontal(|ui| { ui.spinner(); ui.label("Indexing raw data... 
(this may take 10-30 seconds)"); }); - match file_loader.load_raw_data(path) { - Ok((index, ms1_rts)) => { - data.indexed_data = IndexedDataState::Loaded { index, ms1_rts }; - file_loader.raw_data_path = None; + match file_loader.load_raw_data_from_location(location) { + Ok(index) => { + data.indexed_data = IndexedDataState::Loaded { + index, + source: location.to_string(), + }; + file_loader.clear_raw_data(); tracing::info!("Raw data indexing completed"); } Err(e) => { let error_msg = format!("{:?}", e); tracing::error!("Failed to load raw data: {}", error_msg); - data.indexed_data = IndexedDataState::Failed(path.clone(), error_msg); - file_loader.raw_data_path = None; + data.indexed_data = IndexedDataState::Failed(location.clone(), error_msg); + file_loader.clear_raw_data(); } } } - IndexedDataState::Failed(path, error) => { + IndexedDataState::Failed(location, error) => { ui.label( - egui::RichText::new(format!( - "Failed to load raw data from {}:", - path.display() - )) - .color(egui::Color32::RED), + egui::RichText::new(format!("Failed to load raw data from {}:", location)) + .color(egui::Color32::RED), ); ui.label(egui::RichText::new(error).color(egui::Color32::RED).small()); if ui.button("Clear Error").clicked() { @@ -501,79 +884,117 @@ impl ViewerApp { } } - fn move_selection_down(&mut self, filtered_indices: &[usize]) { + fn move_selection_down(cursor: &mut Option, filtered_indices: &[usize]) { if filtered_indices.is_empty() { return; } - match self.ui.selected_index { + match cursor { None => { - self.select_elution_group(filtered_indices[0]); + Self::select_elution_group(cursor, filtered_indices[0]); } Some(current) => { - if let Some(pos) = filtered_indices.iter().position(|&idx| idx == current) + if let Some(pos) = filtered_indices.iter().position(|&idx| idx == *current) && pos + 1 < filtered_indices.len() { - self.select_elution_group(filtered_indices[pos + 1]); + Self::select_elution_group(cursor, filtered_indices[pos + 1]); } } } } - fn move_selection_up(&mut self, filtered_indices: &[usize]) { + fn move_selection_up(cursor: &mut Option, filtered_indices: &[usize]) { if filtered_indices.is_empty() { return; } - match self.ui.selected_index { + match cursor { None => { - self.select_elution_group(filtered_indices[0]); + Self::select_elution_group(cursor, filtered_indices[0]); } Some(current) => { - if let Some(pos) = filtered_indices.iter().position(|&idx| idx == current) + if let Some(pos) = filtered_indices.iter().position(|&idx| idx == *current) && pos > 0 { - self.select_elution_group(filtered_indices[pos - 1]); + Self::select_elution_group(cursor, filtered_indices[pos - 1]); } } } } - fn move_selection_to_first(&mut self, filtered_indices: &[usize]) { + fn move_selection_to_first(cursor: &mut Option, filtered_indices: &[usize]) { if !filtered_indices.is_empty() { - self.select_elution_group(filtered_indices[0]); + Self::select_elution_group(cursor, filtered_indices[0]); } } - fn move_selection_to_last(&mut self, filtered_indices: &[usize]) { + fn move_selection_to_last(cursor: &mut Option, filtered_indices: &[usize]) { if !filtered_indices.is_empty() { - self.select_elution_group(filtered_indices[filtered_indices.len() - 1]); + Self::select_elution_group(cursor, filtered_indices[filtered_indices.len() - 1]); } } - fn select_elution_group(&mut self, idx: usize) { - self.pending_commands - .push(AppCommand::SelectElutionGroup(idx)); + fn select_elution_group(cursor: &mut Option, idx: usize) { + *cursor = Some(idx); } } impl eframe::App for ViewerApp { + fn save(&mut self, 
storage: &mut dyn eframe::Storage) { + tracing::info!("Saving application state..."); + let state = PersistentState { + file_loader: FileLoader { + elution_groups_path: self.file_loader.elution_groups_path.clone(), + raw_data_path: self.file_loader.raw_data_path.clone(), + raw_data_url: self.file_loader.raw_data_url.clone(), + tolerance_path: self.file_loader.tolerance_path.clone(), + }, + ui_state: UiState { + table_filter: self.ui.table_filter.clone(), + selected_index: self.ui.selected_index, + search_mode: self.ui.search_mode, + search_input: self.ui.search_input.clone(), + raw_data_input_mode: self.ui.raw_data_input_mode, + }, + tolerance: self.data.tolerance.clone(), + smoothing: self.data.smoothing, + dock_state: self.dock_state.clone(), + }; + + if let Ok(value) = ron::to_string(&state) { + storage.set_string(eframe::APP_KEY, value); + } else { + tracing::error!("Failed to serialize state to RON"); + } + } + + fn auto_save_interval(&self) -> std::time::Duration { + std::time::Duration::from_secs(5) + } + fn update(&mut self, ctx: &egui::Context, _frame: &mut eframe::Frame) { self.handle_vim_keys(ctx); - self.handle_commands(); egui::TopBottomPanel::top("top_panel").show(ctx, |ui| { ui.heading("TimsQuery Viewer"); ui.separator(); }); + // Check if background computation completed + self.check_chromatogram_completion(); + + // Generate MS2 spectrum if RT was clicked + self.generate_ms2_spectrum_if_needed(); + + // Generate chromatogram if needed + self.generate_chromatogram(ctx); + let mut tab_viewer = AppTabViewer { file_loader: &mut self.file_loader, data: &mut self.data, ui: &mut self.ui, computed: &mut self.computed, - pending_commands: &mut self.pending_commands, - left_panel: &mut self.left_panel, + left_panel: &mut self.config_panel, table_panel: &mut self.table_panel, spectrum_panel: &mut self.spectrum_panel, }; @@ -591,36 +1012,41 @@ struct AppTabViewer<'a> { data: &'a mut DataState, ui: &'a mut UiState, computed: &'a mut ComputedState, - pending_commands: &'a mut Vec, - left_panel: &'a mut LeftPanel, + left_panel: &'a mut ConfigPanel, table_panel: &'a mut TablePanel, spectrum_panel: &'a mut SpectrumPanel, } impl<'a> AppTabViewer<'a> { fn render_left_panel(&mut self, ui: &mut egui::Ui) { - ui.heading("Data Loading"); - ui.separator(); + // Data Loading Section + ui.label(egui::RichText::new("DATA LOADING").strong().size(13.0)); + ui.add_space(INTERNAL_SPACING); ViewerApp::render_elution_groups_section_static(ui, self.file_loader, self.data); - ui.add_space(10.0); + ui.add_space(SECTION_SPACING); - ViewerApp::render_raw_data_section_static(ui, self.file_loader, self.data, self.ui); - ui.add_space(10.0); + ViewerApp::render_raw_data_section_static( + ui, + self.file_loader, + self.data, + self.ui, + self.computed, + ); + ui.add_space(SECTION_SPACING); ViewerApp::render_tolerance_loading_section_static(ui, self.file_loader, self.data); - ui.add_space(20.0); + ui.add_space(SEPARATOR_SPACING); ui.separator(); + ui.add_space(INTERNAL_SPACING); - let mut ctx = PanelContext::new( - self.data, - self.ui, - self.computed, - self.file_loader, - self.pending_commands, + self.left_panel.render( + ui, + &mut self.data.tolerance, + &mut self.data.smoothing, + &mut self.data.auto_zoom_mode, ); - self.left_panel.render(ui, &mut ctx); } } @@ -629,17 +1055,27 @@ impl<'a> TabViewer for AppTabViewer<'a> { fn title(&mut self, tab: &mut Self::Tab) -> egui::WidgetText { match tab { - Pane::LeftPanel => self.left_panel.title().into(), + Pane::ConfigPanel => self.left_panel.title().into(), 
Pane::TablePanel => self.table_panel.title().into(), Pane::MS2Spectrum => self.spectrum_panel.title().into(), Pane::PrecursorPlot => "Precursors".into(), Pane::FragmentPlot => "Fragments".into(), + Pane::ScoresPlot => "Scores".into(), } } fn ui(&mut self, ui: &mut egui::Ui, tab: &mut Self::Tab) { + let mode = self.data.auto_zoom_mode; + + // TODO: figure out how to prevent this allocation per frame... + let ref_lines: Vec<(String, f64, Color32)> = self + .computed + .reference_lines() + .iter() + .map(|(k, v)| (k.clone(), v.0, v.1)) + .collect(); match tab { - Pane::LeftPanel => { + Pane::ConfigPanel => { // Wrap settings in a scroll area egui::ScrollArea::vertical() .auto_shrink([false, false]) @@ -648,27 +1084,29 @@ impl<'a> TabViewer for AppTabViewer<'a> { }); } Pane::TablePanel => { - let mut ctx = PanelContext::new( - self.data, - self.ui, - self.computed, - self.file_loader, - self.pending_commands, + self.table_panel.render( + ui, + &self.data.elution_groups, + self.ui.search_mode, + &mut self.ui.search_input, + &mut self.ui.selected_index, ); - self.table_panel.render(ui, &mut ctx); } Pane::MS2Spectrum => { - let mut ctx = PanelContext::new( - self.data, - self.ui, - self.computed, - self.file_loader, - self.pending_commands, + self.spectrum_panel.render( + ui, + &self.computed.ms2_spectrum, + &self.computed.expected_intensities, ); - self.spectrum_panel.render(ui, &mut ctx); } Pane::PrecursorPlot => { - if let Some(chromatogram) = &self.computed.chromatogram { + // Show loading indicator if computing + if self.computed.is_computing() { + ui.centered_and_justified(|ui| { + ui.spinner(); + ui.label("Computing chromatogram..."); + }); + } else if let Some(chromatogram) = &self.computed.chromatogram_lines { // Use shared link_id for synchronized X-axis with Fragments let click_response = crate::plot_renderer::render_chromatogram_plot( ui, @@ -676,11 +1114,13 @@ impl<'a> TabViewer for AppTabViewer<'a> { crate::plot_renderer::PlotMode::PrecursorsOnly, Some("precursor_fragment_x_axis"), true, - &mut self.computed.chromatogram_auto_zoom_applied, + &mut self.computed.auto_zoom_frame_counter, + &mode, + &ref_lines, + self.computed.apex_score.as_ref(), ); if let Some(clicked_rt) = click_response { - self.pending_commands - .push(AppCommand::QueryMS2Spectrum(clicked_rt)); + self.computed.clicked_rt = Some(clicked_rt); } } else if self.ui.selected_index.is_some() { ui.label("Generating chromatogram..."); @@ -689,7 +1129,13 @@ impl<'a> TabViewer for AppTabViewer<'a> { } } Pane::FragmentPlot => { - if let Some(chromatogram) = &self.computed.chromatogram { + // Show loading indicator if computing + if self.computed.is_computing() { + ui.centered_and_justified(|ui| { + ui.spinner(); + ui.label("Computing chromatogram..."); + }); + } else if let Some(chromatogram) = &self.computed.chromatogram_lines { // Use shared link_id for synchronized X-axis with Precursors let response = crate::plot_renderer::render_chromatogram_plot( ui, @@ -697,11 +1143,13 @@ impl<'a> TabViewer for AppTabViewer<'a> { crate::plot_renderer::PlotMode::FragmentsOnly, Some("precursor_fragment_x_axis"), false, - &mut self.computed.chromatogram_auto_zoom_applied, + &mut self.computed.auto_zoom_frame_counter, + &mode, + &ref_lines, + self.computed.apex_score.as_ref(), ); if let Some(clicked_rt) = response { - self.pending_commands - .push(AppCommand::QueryMS2Spectrum(clicked_rt)); + self.computed.clicked_rt = Some(clicked_rt); } } else if self.ui.selected_index.is_some() { ui.label("Generating chromatogram..."); @@ -709,6 +1157,30 
@@ impl<'a> TabViewer for AppTabViewer<'a> { ui.label("Select a precursor to view fragment traces"); } } + Pane::ScoresPlot => { + // Show loading indicator if computing + if self.computed.is_computing() { + ui.centered_and_justified(|ui| { + ui.spinner(); + ui.label("Computing scores..."); + }); + } else if let Some(score_lines) = &self.computed.score_lines { + let response = score_lines.render( + ui, + Some("precursor_fragment_x_axis"), + &mut self.computed.auto_zoom_frame_counter, + &mode, + &ref_lines, + ); + if let Some(clicked_rt) = response { + self.computed.clicked_rt = Some(clicked_rt); + } + } else if self.ui.selected_index.is_some() { + ui.label("Generating score plot..."); + } else { + ui.label("Select a precursor to view score traces"); + } + } } } diff --git a/rust/timsquery_viewer/src/chromatogram_processor.rs b/rust/timsquery_viewer/src/chromatogram_processor.rs index bfb8250..92a0b24 100644 --- a/rust/timsquery_viewer/src/chromatogram_processor.rs +++ b/rust/timsquery_viewer/src/chromatogram_processor.rs @@ -1,12 +1,4 @@ -use std::sync::Arc; -use timscentroid::IndexedTimstofPeaks; pub use timsquery::models::aggregators::ChromatogramCollector; -use timsquery::models::elution_group::TimsElutionGroup; -use timsquery::models::tolerance::Tolerance; -use timsquery::{ - KeyLike, - QueriableData, -}; use crate::error::ViewerError; use crate::plot_renderer::MS2Spectrum; @@ -49,41 +41,6 @@ impl SmoothingMethod { } } -#[instrument(skip_all)] -pub fn generate_chromatogram( - elution_group: &TimsElutionGroup, - index: &IndexedTimstofPeaks, - ms1_rts: Arc<[u32]>, - tolerance: &Tolerance, - smoothing: &SmoothingMethod, -) -> Result { - let rt_range_ms = match tolerance.rt_range_as_milis(elution_group.rt_seconds()) { - timsquery::OptionallyRestricted::Unrestricted => { - timsquery::TupleRange::try_new(*ms1_rts.first().unwrap(), *ms1_rts.last().unwrap()) - .expect("Reference RTs should be sorted and valid") - } - timsquery::OptionallyRestricted::Restricted(r) => r, - }; - let mut collector = ChromatogramCollector::new(elution_group.clone(), rt_range_ms, &ms1_rts) - .map_err(|e| ViewerError::General(format!("Failed to create collector: {:?}", e)))?; - - index.add_query(&mut collector, tolerance); - - let mut output = match ChromatogramOutput::try_new(collector, &ms1_rts) { - Ok(cmg) => cmg, - Err(e) => { - return Err(ViewerError::General(format!( - "Failed to generate chromatogram output: {:?}", - e - ))); - } - }; - - apply_smoothing_chromatogram(&mut output, smoothing); - - Ok(output) -} - #[instrument(skip(chromatogram))] pub fn extract_ms2_spectrum_from_chromatogram( chromatogram: &ChromatogramOutput, @@ -256,21 +213,20 @@ fn savitzky_golay_smooth(data: &[f32], window: usize, polynomial: usize) -> Vec< /// smoothing but not for precise quantitative analysis. 
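+    /// Worked example (illustrative values, not taken from the source data):
+    /// with `window_size = 5` and `polynomial_order = 2`, `half_window` is 2,
+    /// the normalized distances from the center are [1.0, 0.5, 0.0, 0.5, 1.0],
+    /// and the resulting weights are [0.0, 0.75, 1.0, 0.75, 0.0], i.e. a simple
+    /// parabolic taper rather than true least-squares Savitzky-Golay
+    /// coefficients.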
fn compute_savitzky_golay_weights(window_size: usize, polynomial_order: usize) -> Vec { let half_window = window_size / 2; - let mut weights = vec![0.0; window_size]; - - for position in 0..window_size { - let distance_from_center = ((position as isize) - (half_window as isize)).abs() as f32; - let normalized_distance = distance_from_center / (half_window as f32); - - weights[position] = match polynomial_order { - 0 | 1 => 1.0 - normalized_distance, - 2 => (1.0 - normalized_distance * normalized_distance).max(0.0), - 3 => (1.0 - normalized_distance.powi(3)).max(0.0), - _ => (1.0 - normalized_distance.powi(polynomial_order as i32)).max(0.0), - }; - } - weights + (0..window_size) + .map(|position| { + let distance_from_center = ((position as isize) - (half_window as isize)).abs() as f32; + let normalized_distance = distance_from_center / (half_window as f32); + + match polynomial_order { + 0 | 1 => 1.0 - normalized_distance, + 2 => (1.0 - normalized_distance * normalized_distance).max(0.0), + 3 => (1.0 - normalized_distance.powi(3)).max(0.0), + _ => (1.0 - normalized_distance.powi(polynomial_order as i32)).max(0.0), + } + }) + .collect() } #[cfg(test)] diff --git a/rust/timsquery_viewer/src/cli.rs b/rust/timsquery_viewer/src/cli.rs new file mode 100644 index 0000000..4896686 --- /dev/null +++ b/rust/timsquery_viewer/src/cli.rs @@ -0,0 +1,21 @@ +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +pub struct Cli { + #[arg( + long, + value_name = "FILE", + help = "Path to raw .d directory", + short = 'r' + )] + pub raw_data_path: Option, + #[arg( + long, + value_name = "FILE", + help = "Path to elution groups file (.json or .txt)", + short = 'e' + )] + pub elution_groups_path: Option, +} diff --git a/rust/timsquery_viewer/src/computed_state.rs b/rust/timsquery_viewer/src/computed_state.rs new file mode 100644 index 0000000..ff685f3 --- /dev/null +++ b/rust/timsquery_viewer/src/computed_state.rs @@ -0,0 +1,344 @@ +use egui::Color32; +use std::collections::HashMap; +use timsquery::models::elution_group::TimsElutionGroup; +use timsquery::models::tolerance::Tolerance; +use timsquery::{ + QueriableData, + TupleRange, +}; +use timsseek::ExpectedIntensities; +use timsseek::errors::DataProcessingError; +use tracing::instrument; + +use crate::chromatogram_processor; +use crate::chromatogram_processor::{ + ChromatogramCollector, + ChromatogramOutput, + SmoothingMethod, + apply_smoothing_chromatogram, +}; +use crate::error::ViewerError; +use crate::plot_renderer::{ + ChromatogramLines, + MS2Spectrum, + ScoreLines, +}; +use timscentroid::rt_mapping::{ + MS1CycleIndex, + RTIndex, +}; +use timsquery::serde::IndexedPeaksHandle; +use timsseek::scoring::apex_finding::{ + ApexFinder, + ApexScore, + ScoringContext, +}; + +/// Result bundle from background chromatogram computation +pub(crate) struct ChromatogramComputationResult { + pub selected_idx: u64, + pub output: ChromatogramOutput, + pub collector: ChromatogramCollector, + pub expected_intensities: ExpectedIntensities, +} + +/// Computed/cached state - derived from data and UI state +#[derive(Debug, Default)] +pub struct ComputedState { + /// Computed chromatogram for the selected elution group (plot data) + pub chromatogram_lines: Option, + pub score_lines: Option, + pub chromatogram_x_bounds: Option<(f64, f64)>, + pub ms2_spectrum: Option, + pub auto_zoom_frame_counter: u8, + pub clicked_rt: Option, + pub expected_intensities: Option>, + pub apex_score: Option, + + // Internal 
state (private) + is_computing_chromatogram: bool, + computing_index: Option, + chromatogram_output: Option, + chromatogram_collector_buffer: Option>, + apex_finder_buffer: Option, + cache_key: Option<(u64, Tolerance, SmoothingMethod)>, + last_requested_rt: Option, + reference_lines: HashMap, +} + +impl ComputedState { + pub fn reference_lines(&self) -> &HashMap { + &self.reference_lines + } + + pub fn insert_reference_line(&mut self, name: String, rt: f64, color: Color32) { + self.reference_lines.insert(name, (rt, color)); + } + + pub fn is_computing(&self) -> bool { + self.is_computing_chromatogram + } + + pub fn computing_index(&self) -> Option { + self.computing_index + } + + pub fn start_computing(&mut self, index: u64) { + self.is_computing_chromatogram = true; + self.computing_index = Some(index); + } + + pub fn cancel_computing(&mut self) { + self.is_computing_chromatogram = false; + self.computing_index = None; + } + + pub fn is_cache_valid( + &self, + selected_idx: usize, + tolerance: &Tolerance, + smoothing: &SmoothingMethod, + ) -> bool { + if let Some((cached_id, cached_tolerance, cached_smoothing)) = &self.cache_key { + return *cached_id == selected_idx as u64 + && cached_tolerance == tolerance + && cached_smoothing == smoothing; + } + false + } + + /// Resets computed state when data or UI changes significantly + pub fn clear(&mut self) { + self.chromatogram_lines = None; + self.chromatogram_x_bounds = None; + self.chromatogram_output = None; + self.ms2_spectrum = None; + self.auto_zoom_frame_counter = 0; + self.clicked_rt = None; + self.cache_key = None; + self.computing_index = None; + self.last_requested_rt = None; + self.apex_score = None; + self.score_lines = None; + self.expected_intensities = None; + self.reference_lines.clear(); + self.is_computing_chromatogram = false; + } + + pub(crate) fn build_collector( + index: &IndexedPeaksHandle, + elution_group: TimsElutionGroup, + ) -> Result, ViewerError> { + let max_range = index.ms1_cycle_mapping().range_milis(); + // Create collector for this elution group + let collector = ChromatogramCollector::new( + elution_group, + TupleRange::try_new(max_range.0, max_range.1) + .expect("Reference RTs should be sorted and valid"), + index.ms1_cycle_mapping(), + ) + .map_err(|e| ViewerError::General(format!("Failed to create collector: {:?}", e)))?; + Ok(collector) + } + + /// Generate a chromatogram for a single elution group + /// + /// # Arguments + /// * `elution_group` - The elution group to generate a chromatogram for + /// * `index` - Indexed timsTOF peaks data + /// * `ms1_rts` - MS1 retention times in milliseconds + /// * `tolerance` - Tolerance settings for querying + /// * `smoothing` - Smoothing method to apply + /// + /// # Returns + /// A `ChromatogramOutput` containing the generated chromatogram data + #[instrument(skip_all, fields(eg_id = %elution_group.id()))] + pub(crate) fn generate_chromatogram( + collector: &mut ChromatogramCollector, + elution_group: &TimsElutionGroup, + index: &IndexedPeaksHandle, + tolerance: &Tolerance, + smoothing: &SmoothingMethod, + ) -> Result { + // Query the index + index.add_query(collector, tolerance); + + // Convert to output format + let mut output = ChromatogramOutput::try_new(collector, index.ms1_cycle_mapping()) + .map_err(|e| { + ViewerError::General(format!("Failed to generate chromatogram output: {:?}", e)) + })?; + + // Apply smoothing if configured + apply_smoothing_chromatogram(&mut output, smoothing); + + tracing::info!( + "Generated chromatogram for elution group {} with 
{} precursors and {} fragments", + output.id, + output.precursor_mzs.len(), + output.fragment_mzs.len() + ); + + Ok(output) + } + + #[instrument(skip_all, fields(eg_id = %context.query_values.eg.id()))] + fn find_apex( + apex_finder: &mut ApexFinder, + context: &ScoringContext, + index: &IndexedPeaksHandle, + ) -> Result { + apex_finder.find_apex(context, &|idx| { + index + .ms1_cycle_mapping() + .rt_milis_for_index(&MS1CycleIndex::new(idx as u32)) + .unwrap() + }).map_err(|x| { + match x { + DataProcessingError::ExpectedNonEmptyData { context: err_context } => { + tracing::warn!( + "{:#?}", context.query_values.eg, + ); + tracing::warn!( + "Apex finding failed for elution group {}: No valid data found in context {:?}", + context.query_values.eg.id(), + err_context + ); + } + _ => { + tracing::error!( + "Apex finding failed for elution group {}: {:?}", + context.query_values.eg.id(), + x + ); + } + }; + ViewerError::General("Apex finding error".into()) + + }) + } + + /// Generate MS2 spectrum at the given retention time. + /// Returns true if a new spectrum was generated, false if skipped (already generated for this RT). + #[instrument(skip(self), fields(eg_id = %self.cache_key.as_ref().map(|(id, _, _)| *id).unwrap_or(0)))] + pub fn generate_spectrum_at_rt(&mut self, rt_seconds: f64) -> bool { + // Skip if we already generated spectrum for this RT + if let Some(last_rt) = self.last_requested_rt + && (last_rt - rt_seconds).abs() < f64::EPSILON + { + return false; + } + + self.last_requested_rt = Some(rt_seconds); + if let Some(chrom_output) = &self.chromatogram_output { + match chromatogram_processor::extract_ms2_spectrum_from_chromatogram( + chrom_output, + rt_seconds, + ) { + Ok(spectrum) => { + let num_peaks = spectrum.mz_values.len(); + self.ms2_spectrum = Some(spectrum); + tracing::info!( + "Extracted MS2 spectrum at RT {:.2}s with {} peaks", + rt_seconds, + num_peaks + ); + true + } + Err(e) => { + tracing::error!("Failed to extract MS2 spectrum: {:?}", e); + self.ms2_spectrum = None; + false + } + } + } else { + tracing::warn!("No chromatogram data available for MS2 extraction"); + false + } + } + + /// Complete chromatogram computation with results from background thread + pub(crate) fn complete_chromatogram_computation( + &mut self, + result: ChromatogramComputationResult, + index: &IndexedPeaksHandle, + tolerance: &Tolerance, + smoothing: SmoothingMethod, + ) { + // Store chromatogram output and lines + let chrom_lines = ChromatogramLines::from_chromatogram(&result.output); + self.chromatogram_x_bounds = Some(chrom_lines.rt_seconds_range); + self.chromatogram_lines = Some(chrom_lines); + self.chromatogram_output = Some(result.output.clone()); + self.auto_zoom_frame_counter = 5; + + // Update cache key + self.cache_key = Some((result.selected_idx, tolerance.clone(), smoothing)); + + // Store for scoring + self.expected_intensities = Some(result.expected_intensities.clone()); + self.chromatogram_collector_buffer = Some(result.collector.clone()); + + // Compute scores on main thread + self.compute_scores_from_buffers( + index, + &result.output, + &result.expected_intensities, + &result.collector, + ); + + // Clear computing state + self.is_computing_chromatogram = false; + self.computing_index = None; + } + + /// Compute scores from buffers + fn compute_scores_from_buffers( + &mut self, + index: &IndexedPeaksHandle, + output: &ChromatogramOutput, + expected_intensities: &ExpectedIntensities, + collector: &ChromatogramCollector, + ) { + let num_cycles = 
output.retention_time_results_seconds.len(); + + // Prepare apex finder + let apex_finder = if self.apex_finder_buffer.is_none() { + self.apex_finder_buffer = Some(ApexFinder::new(num_cycles)); + self.apex_finder_buffer.as_mut().unwrap() + } else { + self.apex_finder_buffer.as_mut().unwrap() + }; + + // Build scoring context + let scoring_ctx = ScoringContext { + expected_intensities: expected_intensities.clone(), + query_values: collector.clone(), + }; + + // Find apex + let apex_score = match Self::find_apex(apex_finder, &scoring_ctx, index) { + Ok(score) => score, + Err(e) => { + tracing::error!("Failed to compute apex score: {:?}", e); + return; + } + }; + + // Update RT reference (use insert to overwrite any previous apex RT) + self.clicked_rt = Some(apex_score.retention_time_ms as f64 / 1000.0); + self.reference_lines.insert( + "Apex RT".into(), + (apex_score.retention_time_ms as f64 / 1000.0, Color32::RED), + ); + + // Generate score lines + self.score_lines = Some(ScoreLines::from_scores( + apex_score, + &apex_finder.traces, + index.ms1_cycle_mapping(), + collector.cycle_offset(), + )); + self.apex_score = Some(apex_score); + } +} diff --git a/rust/timsquery_viewer/src/domain/chromatogram_service.rs b/rust/timsquery_viewer/src/domain/chromatogram_service.rs deleted file mode 100644 index e9db850..0000000 --- a/rust/timsquery_viewer/src/domain/chromatogram_service.rs +++ /dev/null @@ -1,108 +0,0 @@ -//! Chromatogram generation service - -use std::sync::Arc; -use timscentroid::IndexedTimstofPeaks; -use timsquery::models::elution_group::TimsElutionGroup; -use timsquery::models::tolerance::Tolerance; -use timsquery::{ - KeyLike, - QueriableData, -}; -use tracing::instrument; - -use crate::chromatogram_processor::{ - ChromatogramCollector, - ChromatogramOutput, - SmoothingMethod, - apply_smoothing_chromatogram, -}; -use crate::error::ViewerError; - -/// Service for generating chromatograms from elution groups -pub struct ChromatogramService; - -impl ChromatogramService { - /// Generate a chromatogram for a single elution group - /// - /// # Arguments - /// * `elution_group` - The elution group to generate a chromatogram for - /// * `index` - Indexed timsTOF peaks data - /// * `ms1_rts` - MS1 retention times in milliseconds - /// * `tolerance` - Tolerance settings for querying - /// * `smoothing` - Smoothing method to apply - /// - /// # Returns - /// A `ChromatogramOutput` containing the generated chromatogram data - #[instrument(skip_all, fields(eg_id = %elution_group.id()))] - pub fn generate( - elution_group: &TimsElutionGroup, - index: &IndexedTimstofPeaks, - ms1_rts: Arc<[u32]>, - tolerance: &Tolerance, - smoothing: &SmoothingMethod, - ) -> Result { - // Determine RT range for the query - let rt_range_ms = match tolerance.rt_range_as_milis(elution_group.rt_seconds()) { - timsquery::OptionallyRestricted::Unrestricted => { - timsquery::TupleRange::try_new(*ms1_rts.first().unwrap(), *ms1_rts.last().unwrap()) - .expect("Reference RTs should be sorted and valid") - } - timsquery::OptionallyRestricted::Restricted(r) => r, - }; - - // Create collector for this elution group - let mut collector = - ChromatogramCollector::new(elution_group.clone(), rt_range_ms, &ms1_rts).map_err( - |e| ViewerError::General(format!("Failed to create collector: {:?}", e)), - )?; - - // Query the index - index.add_query(&mut collector, tolerance); - - // Convert to output format - let mut output = ChromatogramOutput::try_new(collector, &ms1_rts).map_err(|e| { - ViewerError::General(format!("Failed to generate 
chromatogram output: {:?}", e)) - })?; - - // Apply smoothing if configured - apply_smoothing_chromatogram(&mut output, smoothing); - - tracing::info!( - "Generated chromatogram for elution group {} with {} precursors and {} fragments", - output.id, - output.precursor_mzs.len(), - output.fragment_mzs.len() - ); - - Ok(output) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // Note: These tests require test data fixtures. - // For now, we're documenting the test structure. - // In a real implementation, you'd use test fixtures or mocks. - - #[test] - fn test_chromatogram_service_structure() { - // This test just verifies the service exists and is well-formed - let _service = ChromatogramService; - // Future tests would: - // 1. Load test data - // 2. Call ChromatogramService::generate - // 3. Assert on the output structure - } - - // Future test ideas: - // #[test] - // fn test_generate_chromatogram_with_smoothing() { ... } - // - // #[test] - // fn test_generate_chromatogram_without_smoothing() { ... } - // - // #[test] - // fn test_generate_chromatogram_invalid_rt_range() { ... } -} diff --git a/rust/timsquery_viewer/src/domain/file_service.rs b/rust/timsquery_viewer/src/domain/file_service.rs index d49e144..fcacada 100644 --- a/rust/timsquery_viewer/src/domain/file_service.rs +++ b/rust/timsquery_viewer/src/domain/file_service.rs @@ -2,19 +2,21 @@ use std::path::Path; use std::sync::Arc; -use timscentroid::{ - IndexedTimstofPeaks, - TimsTofPath, -}; use timsquery::models::tolerance::Tolerance; -use timsquery::serde::load_index_caching; -use timsrust::MSLevel; +use timsquery::serde::{ + IndexLoadConfig, + IndexedPeaksHandle, + load_index_auto, +}; use tracing::info; use crate::error::ViewerError; use crate::file_loader::ElutionGroupData; /// Service for loading files +/// +/// JSPP: NGL ... this file is stupid ... should refactor to just have free functions +/// instead of a struct with no state pub struct FileService; impl FileService { @@ -32,31 +34,54 @@ impl FileService { res.len(), path.display() ); - Ok(ElutionGroupData { inner: res }) + Ok(ElutionGroupData::new(res)) } - /// Load and index raw timsTOF data + /// Load and index raw timsTOF data from a location (path or URL) + /// + /// Supports both local paths and cloud URLs (s3://, gs://, az://). + /// Automatically detects input type and loads appropriately. + /// + /// For cloud cached indexes (.idx), uses lazy loading for faster initialization + /// (loads metadata only, row groups fetched on-demand during queries). /// - /// Note: This operation may take 10-30 seconds for large datasets. + /// Note: This operation may take 10-30 seconds for large datasets when + /// loading from raw .d files. Cached .idx files load much faster. 
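+    ///
+    /// Illustrative usage (the locations below are hypothetical, following the
+    /// two forms the UI hint text suggests):
+    ///
+    /// ```ignore
+    /// let local = FileService::load_raw_data_from_location("/data/experiment.d")?;
+    /// let cloud = FileService::load_raw_data_from_location("s3://bucket/experiment.d.idx")?;
+    /// ```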
/// /// # Arguments - /// * `path` - Path to the .d directory + /// * `location` - Path or URL to the .d directory or .idx cache /// /// # Returns - /// A tuple of (indexed peaks, MS1 retention times in milliseconds) - pub fn load_raw_data( - path: &Path, - ) -> Result<(Arc, Arc<[u32]>), ViewerError> { - let index = load_index_caching(path).map_err(|e| ViewerError::DataLoading { - path: path.to_path_buf(), - source: Box::new(ViewerError::General(format!("{:?}", e))), - })?; + /// Indexed peaks loaded into memory + pub fn load_raw_data_from_location( + location: &str, + ) -> Result, ViewerError> { + // Detect if cloud URL + let is_cloud = location.contains("://") && !location.starts_with("file://"); + // Use lazy loading for cloud cached indexes (fast init, load on query) + let prefer_lazy = is_cloud; + + info!( + "Loading raw data from {}: is_cloud={}, prefer_lazy={}", + location, is_cloud, prefer_lazy + ); + + let config = IndexLoadConfig { + prefer_lazy, + ..Default::default() + }; - let rts = Self::get_ms1_rts_as_millis(path)?; + let index = load_index_auto(location, Some(config)) + .map_err(|e| ViewerError::General(format!("Failed to load index: {:?}", e)))?; - Ok((Arc::new(index), rts)) + Ok(Arc::new(index)) } + /// Load and index raw timsTOF data (legacy method for local paths) + /// + /// Supports both local paths and cloud URLs (s3://, gs://, az://). + /// Automatically detects input type and loads appropriately. + /// /// Load tolerance settings from a JSON file /// /// # Arguments @@ -69,30 +94,6 @@ impl FileService { let tolerance: Tolerance = serde_json::from_str(&file_content)?; Ok(tolerance) } - - /// Retrieves MS1 retention times from a TIMS-TOF file, sorted and deduped - /// - /// # Arguments - /// * `path` - Path to the .d directory - /// - /// # Returns - /// MS1 retention times in milliseconds - fn get_ms1_rts_as_millis(path: &Path) -> Result, ViewerError> { - let ttp = TimsTofPath::new(path).map_err(|e| ViewerError::TimsFileLoad { - path: path.to_path_buf(), - source: e, - })?; - let reader = ttp.load_frame_reader()?; - let mut rts: Vec<_> = reader - .frame_metas - .iter() - .filter(|x| x.ms_level == MSLevel::MS1) - .map(|f| (f.rt_in_seconds * 1000.0).round() as u32) - .collect(); - rts.sort_unstable(); - rts.dedup(); - Ok(rts.into()) - } } #[cfg(test)] diff --git a/rust/timsquery_viewer/src/domain/mod.rs b/rust/timsquery_viewer/src/domain/mod.rs index 297b761..2ab4545 100644 --- a/rust/timsquery_viewer/src/domain/mod.rs +++ b/rust/timsquery_viewer/src/domain/mod.rs @@ -1,10 +1,4 @@ //! Domain layer - business logic -pub mod chromatogram_service; pub mod file_service; - -#[cfg(test)] -mod tests; - -pub use chromatogram_service::ChromatogramService; pub use file_service::FileService; diff --git a/rust/timsquery_viewer/src/domain/tests.rs b/rust/timsquery_viewer/src/domain/tests.rs deleted file mode 100644 index 81117d0..0000000 --- a/rust/timsquery_viewer/src/domain/tests.rs +++ /dev/null @@ -1,21 +0,0 @@ -//! 
Tests for domain layer services - -#[cfg(test)] -mod chromatogram_service_tests { - use super::super::ChromatogramService; - - #[test] - fn test_chromatogram_service_exists() { - let _service = ChromatogramService; - } -} - -#[cfg(test)] -mod file_service_tests { - use super::super::FileService; - - #[test] - fn test_file_service_exists() { - let _service = FileService; - } -} diff --git a/rust/timsquery_viewer/src/error.rs b/rust/timsquery_viewer/src/error.rs index 8c01660..c302d0d 100644 --- a/rust/timsquery_viewer/src/error.rs +++ b/rust/timsquery_viewer/src/error.rs @@ -1,6 +1,4 @@ -use std::path::PathBuf; use thiserror::Error; -use timsrust::TimsTofPathError; use timsrust::readers::FrameReaderError; #[derive(Error, Debug)] @@ -11,22 +9,9 @@ pub enum ViewerError { #[error("JSON parsing error: {0}")] Json(#[from] serde_json::Error), - #[error("Failed to load TIMS file at '{path}': {source}")] - TimsFileLoad { - path: PathBuf, - source: TimsTofPathError, - }, - #[error("TIMS frame reader error: {0}")] FrameReader(#[from] FrameReaderError), - #[error("Failed to load data from {path}: {source}")] - DataLoading { - path: PathBuf, - #[source] - source: Box, - }, - #[error("General error: {0}")] General(String), } diff --git a/rust/timsquery_viewer/src/file_loader.rs b/rust/timsquery_viewer/src/file_loader.rs index c5506ab..3408121 100644 --- a/rust/timsquery_viewer/src/file_loader.rs +++ b/rust/timsquery_viewer/src/file_loader.rs @@ -1,16 +1,40 @@ -use std::path::PathBuf; -use std::sync::Arc; -use timscentroid::IndexedTimstofPeaks; -use timsquery::models::tolerance::Tolerance; -use timsquery::serde::ElutionGroupCollection; - use crate::domain::FileService; use crate::error::ViewerError; +use egui_extras::{ + Table, + TableBuilder, +}; +use std::collections::HashMap; +use std::path::{ + Path, + PathBuf, +}; +use std::sync::Arc; +use timsquery::models::tolerance::Tolerance; +use timsquery::serde::{ + DiannPrecursorExtras, + ElutionGroupCollection, + FileReadingExtras, + IndexedPeaksHandle, +}; +use timsquery::{ + KeyLike, + TimsElutionGroup, +}; +use timsseek::ExpectedIntensities; +use timsseek::fragment_mass::elution_group_converter::isotope_dist_from_seq; +use tracing::{ + info, + instrument, + warn, +}; /// Handles file dialogs and file loading operations +#[derive(Debug, serde::Deserialize, serde::Serialize)] pub struct FileLoader { pub elution_groups_path: Option, pub raw_data_path: Option, + pub raw_data_url: Option, pub tolerance_path: Option, } @@ -19,14 +43,33 @@ impl FileLoader { Self { elution_groups_path: None, raw_data_path: None, + raw_data_url: None, tolerance_path: None, } } + pub fn with_initial_paths( + mut self, + raw_data_path: &Option, + elution_groups_path: &Option, + ) -> Self { + if let Some(raw_data_path) = raw_data_path { + self.raw_data_path = Some(raw_data_path.clone()); + } + if let Some(elution_groups_path) = elution_groups_path { + self.elution_groups_path = Some(elution_groups_path.clone()); + } + + self + } + /// Open a file dialog for elution groups JSON file pub fn open_elution_groups_dialog(&mut self) { if let Some(path) = rfd::FileDialog::new() - .add_filter("Elution Groups File (json/diann txt)", &["json", "txt"]) + .add_filter( + "Elution Groups File (json/diann txt/tsv)", + &["json", "txt", "tsv"], + ) .pick_file() { self.elution_groups_path = Some(path); @@ -36,7 +79,7 @@ impl FileLoader { /// Open a file dialog for raw data .d directory pub fn open_raw_data_dialog(&mut self) { if let Some(path) = rfd::FileDialog::new().pick_folder() { - 
self.raw_data_path = Some(path); + self.set_raw_data_path(path); } } @@ -51,35 +94,77 @@ impl FileLoader { } /// Load elution groups from a JSON file - pub fn load_elution_groups(&self, path: &PathBuf) -> Result { + pub fn load_elution_groups(&self, path: &Path) -> Result { FileService::load_elution_groups(path) } - /// Load and index raw timsTOF data - pub fn load_raw_data( + /// Load and index raw timsTOF data from a location (path or URL) + pub fn load_raw_data_from_location( &self, - path: &PathBuf, - ) -> Result<(Arc, Arc<[u32]>), ViewerError> { - FileService::load_raw_data(path) + location: &str, + ) -> Result, ViewerError> { + FileService::load_raw_data_from_location(location) } /// Load tolerance settings from a JSON file pub fn load_tolerance(&self, path: &PathBuf) -> Result { FileService::load_tolerance(path) } + + /// Set raw data URL for cloud storage + pub fn set_raw_data_url(&mut self, url: String) { + self.raw_data_url = Some(url); + // Clear path when URL is set + self.raw_data_path = None; + } + + /// Set raw data path for local storage + pub fn set_raw_data_path(&mut self, path: PathBuf) { + self.raw_data_path = Some(path); + // Clear URL when path is set + self.raw_data_url = None; + } + + /// Get the current raw data location (path or URL) + pub fn get_raw_data_location(&self) -> Option { + self.raw_data_url + .clone() + .or_else(|| self.raw_data_path.as_ref().map(|p| p.display().to_string())) + } + + /// Clear raw data location (both path and URL) + pub fn clear_raw_data(&mut self) { + self.raw_data_path = None; + self.raw_data_url = None; + } } #[derive(Debug)] pub struct ElutionGroupData { - pub inner: ElutionGroupCollection, + inner: ElutionGroupCollection, } +const BASE_LABELS: [&str; 6] = [ + "ID", + "RT (s)", + "Mobility", + "Precursor m/z", + "Precursor Charge", + "Fragments", +]; + +const DIANN_EXTRA_LABELS: [&str; 3] = ["Modified Peptide", "Protein ID(s)", "Is Decoy"]; impl ElutionGroupData { + pub fn new(inner: ElutionGroupCollection) -> Self { + Self { inner } + } + pub fn len(&self) -> usize { self.inner.len() } #[must_use] + #[allow(dead_code)] pub fn is_empty(&self) -> bool { self.len() == 0 } @@ -88,50 +173,324 @@ impl ElutionGroupData { /// /// If filter is an empty string, returns ALL indices (no filtering applied). /// This allows seamless toggling between filtered and unfiltered views. 
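+    ///
+    /// Illustrative call (the caller-side names are hypothetical), reusing one
+    /// buffer across frames to avoid reallocating:
+    ///
+    /// ```ignore
+    /// let mut idx_buffer: Vec<usize> = Vec::new();
+    /// elution_groups.matching_indices_for_id_filter(&table_filter, &mut idx_buffer);
+    /// ```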
- pub fn matching_indices_for_id_filter(&self, filter: &str) -> Vec { + #[instrument(skip(self, buffer))] + pub fn matching_indices_for_id_filter(&self, filter: &str, buffer: &mut Vec) { + buffer.clear(); if filter.is_empty() { - return (0..self.len()).collect(); + buffer.extend(0..self.len()); + return; + } + + let mut str_buffer = String::new(); + for i in 0..self.len() { + if self.key_onto(i, &mut str_buffer).is_ok() && str_buffer.contains(filter) { + buffer.push(i); + } + } + } + + /// Adds the key contents to the string, the idea here is to avoid allocations + fn key_onto(&self, idx: usize, buffer: &mut String) -> Result<(), ()> { + use std::fmt::Write; + buffer.clear(); + match &self.inner { + ElutionGroupCollection::StringLabels(egs, _) => { + write!(buffer, "{}", egs[idx].id()).map_err(|_| ())?; + } + ElutionGroupCollection::MzpafLabels(egs, _) => { + write!(buffer, "{}", egs[idx].id()).map_err(|_| ())?; + } + ElutionGroupCollection::TinyIntLabels(egs, _) => { + write!(buffer, "{}", egs[idx].id()).map_err(|_| ())?; + } + ElutionGroupCollection::IntLabels(egs, _) => { + write!(buffer, "{}", egs[idx].id()).map_err(|_| ())?; + } + } + let extras = match &self.inner { + ElutionGroupCollection::StringLabels(_, extras) + | ElutionGroupCollection::MzpafLabels(_, extras) + | ElutionGroupCollection::TinyIntLabels(_, extras) + | ElutionGroupCollection::IntLabels(_, extras) => match extras { + Some(FileReadingExtras::Diann(diann_extras)) => Some(&diann_extras[idx]), + _ => None, + }, + }; + if let Some(diann_extra) = extras { + write!( + buffer, + "|{}|{}|{}", + diann_extra.modified_peptide, diann_extra.protein_id, diann_extra.is_decoy + ) + .map_err(|_| ())?; } + Ok(()) + } - macro_rules! get_ids { - ($self:expr) => { - $self + pub fn get_elem( + &self, + index: usize, + ) -> Result<(TimsElutionGroup, ExpectedIntensities), ViewerError> { + let (eg, extras) = match &self.inner { + ElutionGroupCollection::StringLabels(egs, ext) => (egs.get(index).cloned(), ext), + ElutionGroupCollection::MzpafLabels(egs, ext) => { + (egs.get(index).map(|eg| eg.cast(|x| x.to_string())), ext) + } + ElutionGroupCollection::TinyIntLabels(egs, ext) => { + (egs.get(index).map(|eg| eg.cast(|x| x.to_string())), ext) + } + ElutionGroupCollection::IntLabels(egs, ext) => { + (egs.get(index).map(|eg| eg.cast(|x| x.to_string())), ext) + } + }; + let mut eg = eg.ok_or(ViewerError::General(format!( + "Elution group index {} out of bounds", + index + )))?; + + let extra = match extras { + Some(FileReadingExtras::Diann(diann_extras)) => { + let de = diann_extras.get(index).ok_or(ViewerError::General(format!( + "Diann extras index {} out of bounds", + index + )))?; + let fragment_intensities = HashMap::from_iter( + de.relative_intensities + .iter() + .cloned() + .map(|(k, v)| (k.clone().to_string(), v)), + ); + + // TODO: Actually get expected intensities ... I could make + // The simple isotope calculation from number of carbons + sulphur. 
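+                // Assumption (editorial comment): the three precursor labels set
+                // below (0, 1, 2) are taken to mean the monoisotopic, M+1, and
+                // M+2 peaks, so the [1.0, 0.0, 0.0] fallback assigns all expected
+                // precursor intensity to the monoisotopic peak when the isotope
+                // distribution cannot be computed from the sequence.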
+ let isotopes = match isotope_dist_from_seq(&de.stripped_peptide) { + Ok(isotopes) => isotopes, + Err(e) => { + warn!( + "Failed to calculate isotope distribution for sequence {}: {}", + &de.stripped_peptide, e + ); + [1.0, 0.0, 0.0] + } + }; + eg.set_precursor_labels([0, 1, 2].iter().cloned()); + let precursor_intensities: HashMap = isotopes .iter() + .cloned() .enumerate() - .filter(|(_, eg)| eg.id().to_string().contains(filter)) - .map(|(idx, _)| idx) - .collect() + .map(|(i, intensity)| (i as i8, intensity)) + .collect(); + + ExpectedIntensities { + precursor_intensities, + fragment_intensities, + } + } + None => ExpectedIntensities { + precursor_intensities: eg.iter_precursors().map(|(idx, _mz)| (idx, 1.0)).collect(), + fragment_intensities: eg + .iter_fragments() + .map(|(label, _mz)| (label.to_string(), 1.0)) + .collect(), + }, + }; + Ok((eg, extra)) + } + + pub fn render_table( + &self, + ui: &mut egui::Ui, + filtered_eg_idxs: &[usize], + selected_index: &mut Option, + scroll_to_selection: bool, + ) { + let builder = TableBuilder::new(ui) + .striped(true) + .resizable(true) + .cell_layout(egui::Layout::left_to_right(egui::Align::Center)); + let mut builder = self.add_columns(builder); + if let Some(row_index) = selected_index.as_ref() { + // Since the index is the original index, we need to find its position + // in the filtered list first + let local_index = match filtered_eg_idxs.binary_search(row_index) { + Ok(idx) => idx, + Err(insert_idx) => { + info!("Selected index {} not found in filtered indices", row_index); + // Set the selection to the closest match + let clamped_idx = if insert_idx >= filtered_eg_idxs.len() { + filtered_eg_idxs.len().saturating_sub(1) // Prevents underflow + } else { + insert_idx + }; + if !filtered_eg_idxs.is_empty() { + *selected_index = Some(filtered_eg_idxs[clamped_idx]); + }; + clamped_idx + } }; + if scroll_to_selection && !filtered_eg_idxs.is_empty() { + builder = builder.scroll_to_row(local_index, None); + } } + let builder = self.add_headers(builder); - match &self.inner { - ElutionGroupCollection::StringLabels(egs) => get_ids!(egs), - ElutionGroupCollection::MzpafLabels(egs) => get_ids!(egs), - ElutionGroupCollection::TinyIntLabels(egs) => get_ids!(egs), - ElutionGroupCollection::IntLabels(egs) => get_ids!(egs), + builder.body(|body| { + let row_height = 18.0; + body.rows(row_height, filtered_eg_idxs.len(), |mut row| { + let row_idx = row.index(); + let original_idx = filtered_eg_idxs[row_idx]; + self.add_row_content(original_idx, selected_index, &mut row); + }); + }); + } + + fn add_columns<'a>(&self, mut table: TableBuilder<'a>) -> TableBuilder<'a> { + let has_diann_extras = match &self.inner { + ElutionGroupCollection::StringLabels(_, extras) + | ElutionGroupCollection::MzpafLabels(_, extras) + | ElutionGroupCollection::TinyIntLabels(_, extras) + | ElutionGroupCollection::IntLabels(_, extras) => { + matches!(extras, Some(FileReadingExtras::Diann(_))) + } + }; + if has_diann_extras { + for _ in DIANN_EXTRA_LABELS.iter() { + table = table.column(egui_extras::Column::auto().at_least(100.0)); + } } + for _ in BASE_LABELS.iter() { + table = table.column(egui_extras::Column::auto().at_least(80.0)); + } + table } -} -/// Execute a macro with the appropriate elution group collection variant. -/// -/// # Example -/// ```ignore -/// macro_rules! process { -/// ($egs:expr) => {{ -/// ChromatogramService::generate(&$egs[idx], ...) -/// }}; -/// } -/// with_elution_collection!(elution_groups, process) -/// ``` -#[macro_export] -macro_rules! 
with_elution_collection { - ($data:expr, $macro_name:ident) => { - match &$data.inner { - timsquery::serde::ElutionGroupCollection::StringLabels(egs) => $macro_name!(egs), - timsquery::serde::ElutionGroupCollection::MzpafLabels(egs) => $macro_name!(egs), - timsquery::serde::ElutionGroupCollection::TinyIntLabels(egs) => $macro_name!(egs), - timsquery::serde::ElutionGroupCollection::IntLabels(egs) => $macro_name!(egs), + fn add_headers<'a>(&self, builder: TableBuilder<'a>) -> Table<'a> { + let has_diann_extras = match &self.inner { + ElutionGroupCollection::StringLabels(_, extras) + | ElutionGroupCollection::MzpafLabels(_, extras) + | ElutionGroupCollection::TinyIntLabels(_, extras) + | ElutionGroupCollection::IntLabels(_, extras) => { + matches!(extras, Some(FileReadingExtras::Diann(_))) + } + }; + + builder.header(20.0, |mut header| { + if has_diann_extras { + for label in DIANN_EXTRA_LABELS.iter() { + header.col(|ui| { + ui.strong(*label); + }); + } + } + for label in BASE_LABELS.iter() { + header.col(|ui| { + ui.strong(*label); + }); + } + }) + } + + /// Helper function to add row content + /// `is_selected` indicates if the row is currently selected + /// This function adds the appropriate columns based on available extras + /// Returns true if any of the content was clicked (for selection handling) + fn add_row_content_inner( + eg: &TimsElutionGroup, + extras: Option, + table_row: &mut egui_extras::TableRow, + is_selected: bool, + ) -> bool { + let mut clicked = false; + let mut add_col = |ui: &mut egui::Ui, text: &str| { + // Highlight if selected + let maybe_highlighted_text = if is_selected { + egui::RichText::new(text).background_color(ui.visuals().selection.bg_fill) + } else { + egui::RichText::new(text) + }; + let label = ui.selectable_label(is_selected, maybe_highlighted_text); + let label = if is_selected { + label.highlight() + } else { + label + }; + + if label.clicked() { + clicked = true; + } + }; + match extras { + Some(diann_extra) => { + table_row.col(|ui| { + add_col(ui, &diann_extra.modified_peptide); + }); + table_row.col(|ui| { + add_col(ui, &diann_extra.protein_id); + }); + table_row.col(|ui| { + add_col(ui, if diann_extra.is_decoy { "Yes" } else { "No" }); + }); + } + None => { /* No extra columns */ } + } + table_row.col(|ui| { + add_col(ui, &eg.id().to_string()); + }); + table_row.col(|ui| { + let text = format!("{:.2}", eg.rt_seconds()); + add_col(ui, &text); + }); + table_row.col(|ui| { + let text = format!("{:.4}", eg.mobility_ook0()); + add_col(ui, &text); + }); + table_row.col(|ui| { + let display_text = format!("{:.4}", eg.mono_precursor_mz()); + add_col(ui, &display_text); + }); + table_row.col(|ui| { + let text = format!("{}", eg.precursor_charge()); + add_col(ui, &text); + }); + table_row.col(|ui| { + let text = format!("{}", eg.fragment_count()); + add_col(ui, &text); + }); + clicked + } + + fn add_row_content( + &self, + idx: usize, + selected_index: &mut Option, + table_row: &mut egui_extras::TableRow, + ) { + let diann_extra = match &self.inner { + ElutionGroupCollection::StringLabels(_, extras) + | ElutionGroupCollection::MzpafLabels(_, extras) + | ElutionGroupCollection::TinyIntLabels(_, extras) + | ElutionGroupCollection::IntLabels(_, extras) => match extras { + Some(FileReadingExtras::Diann(diann_extras)) => Some(diann_extras[idx].clone()), + _ => None, + }, + }; + let is_selected = Some(idx) == *selected_index; + let clicked = match &self.inner { + ElutionGroupCollection::StringLabels(egs, _) => { + Self::add_row_content_inner(&egs[idx], 
diann_extra, table_row, is_selected) + } + ElutionGroupCollection::MzpafLabels(egs, _) => { + Self::add_row_content_inner(&egs[idx], diann_extra, table_row, is_selected) + } + ElutionGroupCollection::TinyIntLabels(egs, _) => { + Self::add_row_content_inner(&egs[idx], diann_extra, table_row, is_selected) + } + ElutionGroupCollection::IntLabels(egs, _) => { + Self::add_row_content_inner(&egs[idx], diann_extra, table_row, is_selected) + } + }; + if clicked { + *selected_index = Some(idx); } - }; + } } diff --git a/rust/timsquery_viewer/src/main.rs b/rust/timsquery_viewer/src/main.rs index fd4c30a..733c905 100644 --- a/rust/timsquery_viewer/src/main.rs +++ b/rust/timsquery_viewer/src/main.rs @@ -1,5 +1,7 @@ mod app; mod chromatogram_processor; +mod cli; +mod computed_state; mod domain; mod error; mod file_loader; @@ -7,12 +9,15 @@ mod plot_renderer; mod ui; use eframe::egui; -use tracing::subscriber::set_global_default; use tracing_subscriber::EnvFilter; use tracing_subscriber::fmt::format::FmtSpan; -use tracing_subscriber::prelude::*; use tracing_subscriber::registry::Registry; +use clap::Parser; +use std::fmt::Write as FMTWrite; +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::util::SubscriberInitExt; + #[cfg(target_os = "windows")] use mimalloc::MiMalloc; @@ -20,24 +25,45 @@ use mimalloc::MiMalloc; #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; -fn main() -> eframe::Result { - let env_filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); +fn setup_logger() { + let app_level = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string()); + let env_filter = match EnvFilter::builder().parse(&app_level) { + Ok(filter) => filter, + Err(_) => { + let mut warning_msg = String::new(); + let _ = writeln!( + &mut warning_msg, + "Warning: Invalid RUST_LOG value: {}. Falling back to 'info'.", + app_level + ); + eprintln!("{}", warning_msg); + EnvFilter::new("info") + } + }; + + // 5. 
Initialize Subscriber let subscriber = Registry::default() .with(env_filter) .with(tracing_subscriber::fmt::layer().with_span_events(FmtSpan::CLOSE)); - set_global_default(subscriber).expect("Setting default subscriber failed"); + subscriber.init(); // simpler than set_global_default + expect +} + +fn main() -> eframe::Result { + let args = cli::Cli::parse(); + setup_logger(); let options = eframe::NativeOptions { viewport: egui::ViewportBuilder::default() .with_inner_size([1400.0, 800.0]) - .with_min_inner_size([800.0, 600.0]), + .with_min_inner_size([800.0, 600.0]) + .with_app_id("timsquery_viewer"), ..Default::default() }; eframe::run_native( "TimsQuery Viewer", options, - Box::new(|cc| Ok(Box::new(app::ViewerApp::new(cc)))), + Box::new(|cc| Ok(Box::new(app::ViewerApp::new(cc, &args)))), ) } diff --git a/rust/timsquery_viewer/src/plot_renderer.rs b/rust/timsquery_viewer/src/plot_renderer.rs index f08669f..638ffb4 100644 --- a/rust/timsquery_viewer/src/plot_renderer.rs +++ b/rust/timsquery_viewer/src/plot_renderer.rs @@ -1,4 +1,5 @@ use eframe::egui; +use egui::Color32; use egui_plot::{ Legend, Line, @@ -7,9 +8,19 @@ use egui_plot::{ PlotPoints, Polygon, }; +use timscentroid::rt_mapping::{ + CycleToRTMapping, + MS1CycleIndex, + RTIndex, +}; +use timsseek::scoring::apex_finding::{ + ApexScore, + ScoreTraces, +}; use crate::chromatogram_processor::ChromatogramOutput; use tracing::{ + debug, info, instrument, }; @@ -20,6 +31,7 @@ const REFERENCE_RT_BAND_WIDTH_SECONDS: f64 = 10.0; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum PlotMode { /// Show all traces (precursors + fragments) + #[allow(dead_code)] All, /// Show only precursor traces PrecursorsOnly, @@ -27,6 +39,14 @@ pub enum PlotMode { FragmentsOnly, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub enum AutoZoomMode { + Disabled, + #[default] + PeakApex, + QueryRange, +} + #[derive(Debug)] pub struct ChromatogramLines { precursor_lines: Vec, @@ -38,6 +58,220 @@ pub struct ChromatogramLines { pub rt_seconds_range: (f64, f64), } +#[derive(Debug)] +pub struct ScoreLines { + main_score_line: LineData, + lines: Vec, + apex_score: ApexScore, + rt_seconds_range: (f64, f64), +} + +impl ScoreLines { + #[instrument(skip_all)] + pub(crate) fn from_scores( + apex: ApexScore, + scores: &ScoreTraces, + mapper: &CycleToRTMapping, + cycle_offset: usize, + ) -> Self { + let mut lines: Vec<_> = scores + .iter_scores() + .map(|(name, trace)| { + let max_val = trace.iter().cloned().fold(f32::NEG_INFINITY, f32::max); + debug!("Max score for {}: {}", name, max_val); + let norm_factor = max_val.max(1e-6); + let inv_norm_factor = (1.0 / norm_factor) as f64; + let inv_norm_factor = if name == "main_score" { + debug!("Main score trace length: {}", trace.len()); + 1.0 + } else { + inv_norm_factor + }; + let points: Vec = trace + .iter() + .enumerate() + .map(|(cycle_idx, score)| { + let global_idx = cycle_idx + cycle_offset; + mapper + .rt_milis_for_index(&MS1CycleIndex::new(global_idx as u32)) + .map(|rt| { + let rt_seconds = rt as f64 / 1000.0; + PlotPoint::new(rt_seconds, *score as f64 * inv_norm_factor) + }) + .unwrap() + }) + .collect(); + + LineData { + points, + name: name.into(), + stroke: egui::Stroke::new(1.5, egui::Color32::LIGHT_BLUE), + } + }) + .collect(); + + // Check that all lines are the same length + let first_line_len = lines.first().map(|line| line.points.len()).unwrap_or(0); + for line in &lines { + assert_eq!( + line.points.len(), + first_line_len, + "All score lines should have the same number of points" + ); + 
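The scaling in the loop above divides every score trace by its own maximum (floored at 1e-6 to avoid division by zero) so that all traces share a 0..1 y-axis in the top plot, while main_score keeps its raw scale for the dedicated bottom plot. A minimal standalone sketch of that rule, using a hypothetical scale_trace helper that is not part of this diff:

    /// Scale a trace to 0..=1 by its own maximum, unless it is the main score.
    /// Sketch only; the real code also maps cycle indices to retention times.
    fn scale_trace(name: &str, trace: &[f32]) -> Vec<f64> {
        let max_val = trace.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let inv = if name == "main_score" {
            1.0
        } else {
            1.0 / f64::from(max_val.max(1e-6))
        };
        trace.iter().map(|&v| f64::from(v) * inv).collect()
    }

    fn main() {
        let scaled = scale_trace("ms2_lazyscore", &[0.0, 2.0, 4.0]);
        assert_eq!(scaled, vec![0.0, 0.5, 1.0]);
    }
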
} + + let main_score_line = lines + .pop_if(|x| x.name == "main_score") + .expect("There should be a main_score line"); + + let rt_seconds_range = ( + lines + .first() + .and_then(|line| line.points.first()) + .map(|pt| pt.x) + .unwrap_or(0.0), + lines + .last() + .and_then(|line| line.points.last()) + .map(|pt| pt.x) + .unwrap_or(0.0), + ); + Self { + main_score_line, + lines, + apex_score: apex, + rt_seconds_range, + } + } + + pub fn render( + &self, + ui: &mut egui::Ui, + link_group_id: Option<&str>, + auto_zoom_frame_counter: &mut u8, + auto_zoom_mode: &AutoZoomMode, + // Lines to add vertical markers for + label_lines: &[(String, f64, Color32)], + ) -> Option { + let (plot_id_top, plot_id_bot) = match link_group_id { + Some(id) => ( + format!("score_traces_top_{}", id), + format!("score_traces_bot_{}", id), + ), + None => ( + "score_traces_top".to_string(), + "score_traces_bot".to_string(), + ), + }; + let (scroll_delta, _shift_pressed) = + ui.input(|i| (i.smooth_scroll_delta, i.modifiers.shift)); + let half_height = ui.available_height() * 0.5; + let top_plot = Plot::new(plot_id_top) + .legend(Legend::default()) + .height(half_height) + .show_axes([true, true]) + .allow_zoom(false) + .allow_drag(false) + .allow_scroll(false) + .x_axis_label("Retention Time (s)") + .y_axis_label("Normalized Score"); + + let bot_plot = Plot::new(plot_id_bot) + .height(half_height) + .show_axes([true, true]) + .allow_zoom(false) + .allow_drag(false) + .allow_scroll(false) + .x_axis_label("Retention Time (s)") + .y_axis_label("Main Score"); + + let (top_plot, bot_plot) = if let Some(link_id) = link_group_id { + const ONLY_X_AXIS: [bool; 2] = [true, false]; + ( + top_plot.link_axis(link_id.to_string(), ONLY_X_AXIS), + bot_plot.link_axis(link_id.to_string(), ONLY_X_AXIS), + ) + } else { + (top_plot, bot_plot) + }; + + let mut clicked_rt = None; + + top_plot.include_y(0.0).show(ui, |plot_ui| { + for (idx, line) in self.lines.iter().enumerate() { + plot_ui.line(line.to_plot_line().color(get_palette1_colors(idx))); + } + plot_reflines(label_lines, plot_ui, 0.0, 1.0); + zoom_behavior(plot_ui, &scroll_delta); + if *auto_zoom_frame_counter == 0 { + // Since they are normalized the max will be 1.0 + clamp_bounds(plot_ui, 1.0, self.rt_seconds_range); + } + if plot_ui.response().clicked() + && let Some(pointer_pos) = plot_ui.pointer_coordinate() + { + clicked_rt = Some(pointer_pos.x); + info!("Plot clicked at RT: {:.2}s", pointer_pos.x); + } + }); + + bot_plot.include_y(0.0).show(ui, |plot_ui| { + let max_y = self + .main_score_line + .points + .iter() + .map(|pt| pt.y) + .fold(0.0, f64::max); + plot_ui.line(self.main_score_line.to_plot_line()); + plot_reflines(label_lines, plot_ui, 0.0, max_y); + + let apex_rt_seconds = self.apex_score.retention_time_ms as f64 / 1000.0; + let apex_line = Polygon::new( + "Apex RT", + PlotPoints::new(vec![ + [apex_rt_seconds - 0.1, 0.0], + [apex_rt_seconds + 0.1, 0.0], + [apex_rt_seconds + 0.1, max_y], + [apex_rt_seconds - 0.1, max_y], + ]), + ) + .fill_color(egui::Color32::from_rgba_premultiplied(255, 0, 0, 26)) + .stroke(egui::Stroke::NONE); + + plot_ui.polygon(apex_line); + + zoom_behavior(plot_ui, &scroll_delta); + if *auto_zoom_frame_counter > 0 { + // plot_ui.set_plot_bounds_x(self.rt_seconds_range.0..=self.rt_seconds_range.1); + match auto_zoom_mode { + AutoZoomMode::QueryRange => { + plot_ui + .set_plot_bounds_x(self.rt_seconds_range.0..=self.rt_seconds_range.1); + plot_ui.set_plot_bounds_y(0.0..=max_y); + } + AutoZoomMode::PeakApex => { + plot_ui + 
.set_plot_bounds_x((apex_rt_seconds - 30.0)..=(apex_rt_seconds + 30.0)); + plot_ui.set_plot_bounds_y(0.0..=max_y); + } + AutoZoomMode::Disabled => {} + } + *auto_zoom_frame_counter -= 1; + } else { + clamp_bounds(plot_ui, max_y, self.rt_seconds_range); + } + if plot_ui.response().clicked() + && let Some(pointer_pos) = plot_ui.pointer_coordinate() + { + clicked_rt = Some(pointer_pos.x); + info!("Plot clicked at RT: {:.2}s", pointer_pos.x); + } + }); + + clicked_rt + } +} + impl ChromatogramLines { #[instrument(skip(chromatogram))] pub(crate) fn from_chromatogram(chromatogram: &ChromatogramOutput) -> Self { @@ -143,6 +377,36 @@ impl ChromatogramLines { .map(|line| line.intensity_max) .fold(f64::NEG_INFINITY, f64::max) } + + /// Get the maximum intensity within a specific RT range for a given plot mode + fn get_intensity_max_in_range(&self, rt_min: f64, rt_max: f64, mode: PlotMode) -> f64 { + let mut max_intensity = 0.0_f64; + + // Check precursor lines if applicable + if matches!(mode, PlotMode::All | PlotMode::PrecursorsOnly) { + for line in &self.precursor_lines { + for point in &line.data.points { + if point.x >= rt_min && point.x <= rt_max { + max_intensity = max_intensity.max(point.y); + } + } + } + } + + // Check fragment lines if applicable + if matches!(mode, PlotMode::All | PlotMode::FragmentsOnly) { + for line in &self.fragment_lines { + for point in &line.data.points { + if point.x >= rt_min && point.x <= rt_max { + max_intensity = max_intensity.max(point.y); + } + } + } + } + + // Return at least a small value to avoid zero bounds + max_intensity.max(1.0) + } } /// MS2 spectrum data at a specific retention time @@ -159,14 +423,19 @@ pub struct MS2Spectrum { /// /// If `link_group_id` is provided, the X-axis will be linked to other plots with the same ID /// If `show_header` is false, the elution group ID and reference RT/mobility labels are not shown -/// If `reset_bounds_applied` is false, the plot bounds will be reset to show the full data range, and the flag will be set to true +/// If `auto_zoom_frame_counter` is greater than 0, the plot bounds will be reset to show the full data range, and the counter will be decremented pub fn render_chromatogram_plot( ui: &mut egui::Ui, chromatogram: &ChromatogramLines, mode: PlotMode, link_group_id: Option<&str>, show_header: bool, - reset_bounds_applied: &mut bool, + auto_zoom_frame_counter: &mut u8, + auto_zoom_mode: &AutoZoomMode, + // Lines to add vertical markers for + label_lines: &[(String, f64, Color32)], + // Apex score for PeakApex auto-zoom mode + apex_score: Option<&ApexScore>, ) -> Option { let mut clicked_rt = None; @@ -178,7 +447,6 @@ pub fn render_chromatogram_plot( chromatogram.reference_rt_seconds, chromatogram.reference_ook0 )); } - let (scroll_delta, _shift_pressed) = ui.input(|i| (i.smooth_scroll_delta, i.modifiers.shift)); let plot_id = match mode { @@ -196,7 +464,8 @@ pub fn render_chromatogram_plot( .allow_scroll(false); if let Some(link_id) = link_group_id { - plot = plot.link_axis(link_id.to_string(), [true, false]); + const ONLY_X_AXIS: [bool; 2] = [true, false]; + plot = plot.link_axis(link_id.to_string(), ONLY_X_AXIS); } plot.show(ui, |plot_ui| { @@ -244,64 +513,170 @@ pub fn render_chromatogram_plot( } _ => {} } + plot_reflines(label_lines, plot_ui, 0.0, max_polygon_height); + + zoom_behavior(plot_ui, &scroll_delta); + if *auto_zoom_frame_counter > 0 { + match auto_zoom_mode { + AutoZoomMode::QueryRange => { + plot_ui.set_plot_bounds_x( + chromatogram.rt_seconds_range.0..=chromatogram.rt_seconds_range.1, + ); 
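get_intensity_max_in_range above is what lets the PeakApex zoom pick a y-limit from just the points inside the 30 s window around the apex, instead of the global maximum. The same scan over plain (rt, intensity) pairs, as a hypothetical standalone helper (not code from this PR):

    /// Maximum intensity among points whose retention time falls in [rt_min, rt_max].
    /// Returns at least 1.0 so the caller never sets a zero-height y-axis.
    fn max_in_rt_window(points: &[(f64, f64)], rt_min: f64, rt_max: f64) -> f64 {
        points
            .iter()
            .copied()
            .filter(|&(rt, _)| rt >= rt_min && rt <= rt_max)
            .map(|(_, intensity)| intensity)
            .fold(1.0_f64, f64::max)
    }

    fn main() {
        let trace = [(10.0, 5.0), (20.0, 80.0), (30.0, 12.0)];
        assert_eq!(max_in_rt_window(&trace, 15.0, 25.0), 80.0);
        assert_eq!(max_in_rt_window(&trace, 100.0, 200.0), 1.0);
    }
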
+ plot_ui.set_plot_bounds_y(0.0..=max_polygon_height); + } + AutoZoomMode::PeakApex => { + if let Some(apex) = apex_score { + let apex_rt_seconds = apex.retention_time_ms as f64 / 1000.0; + let rt_min = apex_rt_seconds - 30.0; + let rt_max = apex_rt_seconds + 30.0; + plot_ui.set_plot_bounds_x(rt_min..=rt_max); + + // Calculate max intensity in the zoomed region + let zoomed_max_intensity = + chromatogram.get_intensity_max_in_range(rt_min, rt_max, mode); + plot_ui.set_plot_bounds_y(0.0..=zoomed_max_intensity); + } else { + plot_ui.set_plot_bounds_y(0.0..=max_polygon_height); + } + } + AutoZoomMode::Disabled => {} + } + *auto_zoom_frame_counter -= 1; + } else { + clamp_bounds(plot_ui, max_polygon_height, chromatogram.rt_seconds_range); + } - let plot_hovered = plot_ui.response().hovered(); - if plot_hovered && scroll_delta.length_sq() > 0.0 { - let zoom_speed = 0.05; - let scroll_y = scroll_delta.y; - let scroll_x = scroll_delta.x; + if plot_ui.response().clicked() + && let Some(pointer_pos) = plot_ui.pointer_coordinate() + { + clicked_rt = Some(pointer_pos.x); + info!("Plot clicked at RT: {:.2}s", pointer_pos.x); + } + }); - let zoom_amount_y = (scroll_y * zoom_speed / 10.0).exp(); - let zoom_amount_x = (scroll_x * zoom_speed / 10.0).exp(); + clicked_rt +} - let zoom_factor = egui::Vec2::new(zoom_amount_x, zoom_amount_y); +fn zoom_behavior(plot_ui: &mut egui_plot::PlotUi, scroll_delta: &egui::Vec2) { + let plot_hovered = plot_ui.response().hovered(); + if plot_hovered && scroll_delta.length_sq() > 0.0 { + let zoom_speed = 0.05; + let scroll_y = scroll_delta.y; + let scroll_x = scroll_delta.x; - plot_ui.zoom_bounds_around_hovered(zoom_factor); + let zoom_amount_y = (scroll_y * zoom_speed / 10.0).exp(); + let zoom_amount_x = (scroll_x * zoom_speed / 10.0).exp(); + + let zoom_factor = egui::Vec2::new(zoom_amount_x, zoom_amount_y); + + plot_ui.zoom_bounds_around_hovered(zoom_factor); + } + + let shift_pressed = plot_ui.ctx().input(|i| i.modifiers.shift); + + if shift_pressed { + // Shift-drag to zoom to selected x-axis region + if plot_ui.response().drag_started() + && let Some(start_pos) = plot_ui.pointer_coordinate() + { + plot_ui.ctx().data_mut(|d| { + d.insert_temp(egui::Id::new("zoom_drag_start"), start_pos.x); + }); } + if plot_ui.response().dragged() { + // Visual feedback - draw selection rectangle + if let Some(current_pos) = plot_ui.pointer_coordinate() { + let start_x = plot_ui + .ctx() + .data(|d| d.get_temp::(egui::Id::new("zoom_drag_start"))); + + if let Some(start_x) = start_x { + let bounds = plot_ui.plot_bounds(); + let selection_rect = Polygon::new( + "Zoom Selection", + PlotPoints::new(vec![ + [start_x, bounds.min()[1]], + [current_pos.x, bounds.min()[1]], + [current_pos.x, bounds.max()[1]], + [start_x, bounds.max()[1]], + ]), + ) + .fill_color(egui::Color32::from_rgba_premultiplied(100, 150, 255, 50)) + .stroke(egui::Stroke::new( + 1.5, + egui::Color32::from_rgb(100, 150, 255), + )); + + plot_ui.polygon(selection_rect); + } + } + } + + if plot_ui.response().drag_stopped() + && let Some(end_pos) = plot_ui.pointer_coordinate() + { + let start_x = plot_ui + .ctx() + .data(|d| d.get_temp::(egui::Id::new("zoom_drag_start"))); + + if let Some(start_x) = start_x { + let x_min = start_x.min(end_pos.x); + let x_max = start_x.max(end_pos.x); + + if (x_max - x_min).abs() > 1e-6 { + plot_ui.set_plot_bounds_x(x_min..=x_max); + } + } + } + } else { + // Normal pan behavior when shift is not pressed let pointer_drag_delta = plot_ui.pointer_coordinate_drag_delta(); if pointer_drag_delta.x 
!= 0.0 || pointer_drag_delta.y != 0.0 { let pan_delta = egui::Vec2::new(-pointer_drag_delta.x, -pointer_drag_delta.y); plot_ui.translate_bounds(pan_delta); } + } +} - if !*reset_bounds_applied { - plot_ui.set_plot_bounds_x( - chromatogram.rt_seconds_range.0..=chromatogram.rt_seconds_range.1, - ); - plot_ui.set_plot_bounds_y(0.0..=max_polygon_height); - *reset_bounds_applied = true; - } else { - let bounds = plot_ui.plot_bounds(); - - let y_min = bounds.min()[1]; - let y_max = bounds.max()[1]; - let clamped_y_min = 0.0; - let clamped_y_max = y_max.min(max_polygon_height); +fn clamp_bounds(plot_ui: &mut egui_plot::PlotUi, y_max_clamp: f64, x: (f64, f64)) { + let bounds = plot_ui.plot_bounds(); - if y_min != clamped_y_min || y_max != clamped_y_max { - plot_ui.set_plot_bounds_y(clamped_y_min..=clamped_y_max); - } + let y_min = bounds.min()[1]; + let y_max = bounds.max()[1]; + let clamped_y_min = 0.0; + let clamped_y_max = y_max.min(y_max_clamp); - let x_min = bounds.min()[0]; - let x_max = bounds.max()[0]; - let clamped_x_min = x_min.max(chromatogram.rt_seconds_range.0); - let clamped_x_max = x_max.min(chromatogram.rt_seconds_range.1); + if y_min != clamped_y_min || y_max != clamped_y_max { + plot_ui.set_plot_bounds_y(clamped_y_min..=clamped_y_max); + } - if x_min != clamped_x_min || x_max != clamped_x_max { - plot_ui.set_plot_bounds_x(clamped_x_min..=clamped_x_max); - } - } + let x_min = bounds.min()[0]; + let x_max = bounds.max()[0]; + let clamped_x_min = x_min.max(x.0); + let clamped_x_max = x_max.min(x.1); - if plot_ui.response().clicked() - && let Some(pointer_pos) = plot_ui.pointer_coordinate() - { - clicked_rt = Some(pointer_pos.x); - info!("Plot clicked at RT: {:.2}s", pointer_pos.x); - } - }); + if x_min != clamped_x_min || x_max != clamped_x_max { + plot_ui.set_plot_bounds_x(clamped_x_min..=clamped_x_max); + } +} - clicked_rt +fn plot_reflines( + label_lines: &[(String, f64, Color32)], + plot_ui: &mut egui_plot::PlotUi, + min_y: f64, + max_y: f64, +) { + for (label, rt_seconds, color) in label_lines { + let vertical_line = Line::new( + label.as_str(), + PlotPoints::new(vec![[*rt_seconds, min_y], [*rt_seconds, max_y]]), + ) + .color(*color) + .stroke(egui::Stroke::new(1.0, *color)) + .style(egui_plot::LineStyle::dashed_loose()); + plot_ui.line(vertical_line); + } } #[derive(Debug)] @@ -347,3 +722,12 @@ fn get_fragment_color(index: usize) -> egui::Color32 { ]; colors[index % colors.len()] } + +fn get_palette1_colors(idx: usize) -> egui::Color32 { + const COLORS: [&str; 5] = ["eac435", "345995", "03cea4", "fb4d3d", "ca1551"]; + let color = COLORS[idx % COLORS.len()]; + let r = u8::from_str_radix(&color[0..2], 16).unwrap(); + let g = u8::from_str_radix(&color[2..4], 16).unwrap(); + let b = u8::from_str_radix(&color[4..6], 16).unwrap(); + egui::Color32::from_rgb(r, g, b) +} diff --git a/rust/timsquery_viewer/src/ui/components/mod.rs b/rust/timsquery_viewer/src/ui/components/mod.rs deleted file mode 100644 index 4a75474..0000000 --- a/rust/timsquery_viewer/src/ui/components/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -//! Reusable UI components. 
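clamp_bounds above is the shared replacement for the per-plot clamping that used to live inside render_chromatogram_plot: y is clamped to [0, y_max_clamp] and x is intersected with the data's RT range. The x part is just an interval intersection; a tiny standalone sketch with a hypothetical helper, not taken from this PR:

    /// Clamp a view interval so it never extends past the data interval.
    fn clamp_interval(view: (f64, f64), data: (f64, f64)) -> (f64, f64) {
        (view.0.max(data.0), view.1.min(data.1))
    }

    fn main() {
        // A view panned past the left edge of the chromatogram snaps back to the data start.
        assert_eq!(clamp_interval((-5.0, 40.0), (0.0, 120.0)), (0.0, 40.0));
    }
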
- -pub mod precursor_table; diff --git a/rust/timsquery_viewer/src/ui/components/precursor_table.rs b/rust/timsquery_viewer/src/ui/components/precursor_table.rs deleted file mode 100644 index 311b678..0000000 --- a/rust/timsquery_viewer/src/ui/components/precursor_table.rs +++ /dev/null @@ -1,82 +0,0 @@ -use eframe::egui; -use timsquery::KeyLike; -use timsquery::models::elution_group::TimsElutionGroup; - -pub fn render_precursor_table_filtered( - ui: &mut egui::Ui, - filtered_eg_idxs: &[usize], - reference_eg_slice: &[TimsElutionGroup], - selected_index: &mut Option, -) { - use egui_extras::{ - Column, - TableBuilder, - }; - - TableBuilder::new(ui) - .striped(true) - .resizable(true) - .cell_layout(egui::Layout::left_to_right(egui::Align::Center)) - .column(Column::auto().at_least(60.0)) // ID - .column(Column::auto().at_least(80.0)) // RT - .column(Column::auto().at_least(80.0)) // Mobility - .column(Column::auto().at_least(120.0)) // Precursor m/z - .column(Column::auto().at_least(100.0)) // Fragment count - .header(20.0, |mut header| { - header.col(|ui| { - ui.strong("ID"); - }); - header.col(|ui| { - ui.strong("RT (s)"); - }); - header.col(|ui| { - ui.strong("Mobility"); - }); - header.col(|ui| { - ui.strong("Precursor m/z"); - }); - header.col(|ui| { - ui.strong("Fragments"); - }); - }) - .body(|body| { - let row_height = 18.0; - body.rows(row_height, filtered_eg_idxs.len(), |mut row| { - let row_idx = row.index(); - let original_idx = filtered_eg_idxs[row_idx]; - let eg = &reference_eg_slice[original_idx]; - let is_selected = Some(original_idx) == *selected_index; - - row.col(|ui| { - if ui - .selectable_label(is_selected, format!("{}", eg.id())) - .clicked() - { - *selected_index = Some(original_idx); - } - }); - - row.col(|ui| { - let text = format!("{:.2}", eg.rt_seconds()); - ui.label(text); - }); - - row.col(|ui| { - let text = format!("{:.4}", eg.mobility_ook0()); - ui.label(text); - }); - - row.col(|ui| { - let lims = eg.get_precursor_mz_limits(); - let display_text = format!("{:.4} - {:.4}", lims.0, lims.1); - - ui.label(display_text); - }); - - row.col(|ui| { - let text = format!("{}", eg.fragment_count()); - ui.label(text); - }); - }); - }); -} diff --git a/rust/timsquery_viewer/src/ui/mod.rs b/rust/timsquery_viewer/src/ui/mod.rs index 8eb27b3..0175d7a 100644 --- a/rust/timsquery_viewer/src/ui/mod.rs +++ b/rust/timsquery_viewer/src/ui/mod.rs @@ -1,12 +1,4 @@ //! UI module - all user interface components and panels. -pub mod components; -pub mod panel_trait; pub mod panels; pub mod tolerance_editor; - -pub use panel_trait::{ - CommandSink, - Panel, - PanelContext, -}; diff --git a/rust/timsquery_viewer/src/ui/panel_trait.rs b/rust/timsquery_viewer/src/ui/panel_trait.rs deleted file mode 100644 index ade2750..0000000 --- a/rust/timsquery_viewer/src/ui/panel_trait.rs +++ /dev/null @@ -1,81 +0,0 @@ -//! Panel trait and context for consistent UI architecture - -use eframe::egui; - -use crate::app::{ - AppCommand, - ComputedState, - DataState, - UiState, -}; -use crate::file_loader::FileLoader; - -/// Write-only command sink -/// -/// Prevents panels from seeing or modifying commands from other panels. -/// Panels can only push commands, ensuring proper encapsulation. 
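The CommandSink being deleted here was a write-only wrapper over the app's command queue: panels could push AppCommand values but could not read or reorder what other panels had queued. The pattern in isolation, with illustrative names rather than the viewer's real types:

    // A write-only command sink: callers can push, but cannot inspect the queue.
    enum Command {
        Select(usize),
        Refresh,
    }

    struct Sink<'a> {
        queue: &'a mut Vec<Command>,
    }

    impl<'a> Sink<'a> {
        fn push(&mut self, cmd: Command) {
            self.queue.push(cmd);
        }
    }

    fn main() {
        let mut queue = Vec::new();
        let mut sink = Sink { queue: &mut queue };
        sink.push(Command::Select(3));
        sink.push(Command::Refresh);
        // The owner of the queue drains it afterwards; panels never see this step.
        assert_eq!(queue.len(), 2);
    }
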
-pub struct CommandSink<'a> { - queue: &'a mut Vec, -} - -impl<'a> CommandSink<'a> { - pub(crate) fn new(queue: &'a mut Vec) -> Self { - Self { queue } - } - - /// Push a command to be processed by the app - #[inline] - pub fn push(&mut self, cmd: AppCommand) { - self.queue.push(cmd); - } - - /// Push multiple commands at once - #[inline] - pub fn extend(&mut self, cmds: impl IntoIterator) { - self.queue.extend(cmds); - } -} - -/// Context passed to all panels during rendering -/// -/// Provides read access to application state and write-only access to commands. -pub struct PanelContext<'a> { - pub data: &'a mut DataState, - pub ui: &'a mut UiState, - pub computed: &'a ComputedState, - pub file_loader: &'a mut FileLoader, - pub commands: CommandSink<'a>, -} - -impl<'a> PanelContext<'a> { - /// Create a new panel context - pub fn new( - data: &'a mut DataState, - ui: &'a mut UiState, - computed: &'a ComputedState, - file_loader: &'a mut FileLoader, - command_queue: &'a mut Vec, - ) -> Self { - Self { - data, - ui, - computed, - file_loader, - commands: CommandSink::new(command_queue), - } - } -} - -/// Trait that all panels must implement -/// -/// Provides a consistent interface for rendering UI panels and handling events. -pub trait Panel { - /// Render the panel - /// - /// Panels should read state from `ctx`, render UI using `ui`, - /// and emit commands by pushing to `ctx.commands`. - fn render(&mut self, ui: &mut egui::Ui, ctx: &mut PanelContext); - - /// Title displayed in the tab - fn title(&self) -> &str; -} diff --git a/rust/timsquery_viewer/src/ui/panels/config_panel.rs b/rust/timsquery_viewer/src/ui/panels/config_panel.rs new file mode 100644 index 0000000..5be4424 --- /dev/null +++ b/rust/timsquery_viewer/src/ui/panels/config_panel.rs @@ -0,0 +1,176 @@ +use eframe::egui; + +use crate::chromatogram_processor::SmoothingMethod; +use crate::plot_renderer::AutoZoomMode; +use crate::ui::tolerance_editor; +use timsquery::Tolerance; + +// UI spacing constants +const SECTION_MARGIN: i8 = 10; +const SECTION_SPACING: f32 = 12.0; +const INTERNAL_SPACING: f32 = 8.0; + +/// Panel for configuration settings +pub struct ConfigPanel; + +impl ConfigPanel { + pub fn new() -> Self { + Self + } + + /// Renders tolerance editor section + fn render_tolerance_editor(&self, ui: &mut egui::Ui, tolerance: &mut Tolerance) { + egui::Frame::group(ui.style()) + .inner_margin(egui::Margin::same(SECTION_MARGIN)) + .show(ui, |ui| { + ui.heading("Tolerance Settings"); + ui.add_space(INTERNAL_SPACING); + + tolerance_editor::render_tolerance_editor(ui, tolerance); + }); + } + + /// Renders smoothing configuration section + fn render_smoothing(&self, ui: &mut egui::Ui, smoothing_method: &mut SmoothingMethod) { + egui::Frame::group(ui.style()) + .inner_margin(egui::Margin::same(SECTION_MARGIN)) + .show(ui, |ui| { + ui.heading("Smoothing"); + ui.add_space(INTERNAL_SPACING); + + ui.label("Method:"); + let current_method = match smoothing_method { + SmoothingMethod::None => 0, + SmoothingMethod::SavitzkyGolay { .. } => 1, + SmoothingMethod::Gaussian { .. 
} => 2, + }; + + let mut selected = current_method; + egui::ComboBox::from_id_salt("smoothing_method") + .selected_text(match current_method { + 0 => "None", + 1 => "Savitzky-Golay", + 2 => "Gaussian", + _ => "Unknown", + }) + .show_ui(ui, |ui| { + if ui.selectable_value(&mut selected, 0, "None").clicked() { + *smoothing_method = SmoothingMethod::None; + } + if ui + .selectable_value(&mut selected, 1, "Savitzky-Golay") + .clicked() + { + *smoothing_method = SmoothingMethod::default_savitzky_golay(); + } + if ui.selectable_value(&mut selected, 2, "Gaussian").clicked() { + *smoothing_method = SmoothingMethod::default_gaussian(); + } + }); + + ui.add_space(INTERNAL_SPACING); + + match smoothing_method { + SmoothingMethod::None => { + ui.label(egui::RichText::new("No smoothing applied").weak().italics()); + } + SmoothingMethod::SavitzkyGolay { window, polynomial } => { + ui.label(egui::RichText::new("Parameters:").strong().small()); + ui.horizontal(|ui| { + ui.label("Window size:"); + let mut window_val = *window as i32; + ui.add(egui::Slider::new(&mut window_val, 3..=21).step_by(2.0)); + *window = window_val as usize; + }); + ui.horizontal(|ui| { + ui.label("Polynomial:"); + let mut poly_val = *polynomial as i32; + ui.add(egui::Slider::new(&mut poly_val, 0..=5)); + *polynomial = poly_val as usize; + }); + } + SmoothingMethod::Gaussian { sigma } => { + ui.label(egui::RichText::new("Parameters:").strong().small()); + ui.horizontal(|ui| { + ui.label("Sigma:"); + ui.add(egui::Slider::new(sigma, 0.5..=10.0)); + }); + } + } + }); + } + + pub fn render( + &mut self, + ui: &mut egui::Ui, + tolerance: &mut Tolerance, + smoothing_method: &mut SmoothingMethod, + auto_zoom_mode: &mut AutoZoomMode, + ) { + // Configuration Section Header + ui.label(egui::RichText::new("CONFIGURATION").strong().size(13.0)); + ui.add_space(INTERNAL_SPACING); + + self.render_tolerance_editor(ui, tolerance); + ui.add_space(SECTION_SPACING); + + self.render_smoothing(ui, smoothing_method); + ui.add_space(SECTION_SPACING); + + // Auto Zoom section + egui::Frame::group(ui.style()) + .inner_margin(egui::Margin::same(SECTION_MARGIN)) + .show(ui, |ui| { + ui.heading("Auto Zoom"); + ui.add_space(INTERNAL_SPACING); + + ui.horizontal(|ui| { + ui.label("Mode:"); + egui::ComboBox::from_id_salt("auto_zoom_mode") + .selected_text(match auto_zoom_mode { + AutoZoomMode::Disabled => "None", + AutoZoomMode::PeakApex => "Peak Score Apex", + AutoZoomMode::QueryRange => "Query Range", + }) + .show_ui(ui, |ui| { + if ui + .selectable_value(auto_zoom_mode, AutoZoomMode::Disabled, "None") + .clicked() + { + *auto_zoom_mode = AutoZoomMode::Disabled; + } + if ui + .selectable_value( + auto_zoom_mode, + AutoZoomMode::PeakApex, + "Peak Score Apex", + ) + .clicked() + { + *auto_zoom_mode = AutoZoomMode::PeakApex; + } + if ui + .selectable_value( + auto_zoom_mode, + AutoZoomMode::QueryRange, + "Query Range", + ) + .clicked() + { + *auto_zoom_mode = AutoZoomMode::QueryRange; + } + }); + }); + }); + } + + pub fn title(&self) -> &str { + "Settings" + } +} + +impl Default for ConfigPanel { + fn default() -> Self { + Self::new() + } +} diff --git a/rust/timsquery_viewer/src/ui/panels/left_settings_panel.rs b/rust/timsquery_viewer/src/ui/panels/left_settings_panel.rs deleted file mode 100644 index c1fe926..0000000 --- a/rust/timsquery_viewer/src/ui/panels/left_settings_panel.rs +++ /dev/null @@ -1,137 +0,0 @@ -use eframe::egui; - -use crate::app::AppCommand; -use crate::chromatogram_processor::SmoothingMethod; -use crate::ui::{ - Panel, - PanelContext, - 
tolerance_editor, -}; - -/// Panel for data loading and settings on the left side -pub struct LeftPanel; - -impl LeftPanel { - pub fn new() -> Self { - Self - } - - /// Renders tolerance editor section - fn render_tolerance_editor(&self, ui: &mut egui::Ui, ctx: &mut PanelContext) { - ui.heading("Tolerance Settings"); - - let changed = tolerance_editor::render_tolerance_editor(ui, &mut ctx.data.tolerance); - if changed { - ctx.commands.push(AppCommand::UpdateTolerance); - } - } - - /// Renders smoothing configuration section - fn render_smoothing(&self, ui: &mut egui::Ui, ctx: &mut PanelContext) { - ui.heading("Smoothing"); - - let mut changed = false; - - ui.label("Method:"); - let current_method = match ctx.data.smoothing { - SmoothingMethod::None => 0, - SmoothingMethod::SavitzkyGolay { .. } => 1, - SmoothingMethod::Gaussian { .. } => 2, - }; - - let mut selected = current_method; - egui::ComboBox::from_id_salt("smoothing_method") - .selected_text(match current_method { - 0 => "None", - 1 => "Savitzky-Golay", - 2 => "Gaussian", - _ => "Unknown", - }) - .show_ui(ui, |ui| { - if ui.selectable_value(&mut selected, 0, "None").clicked() { - ctx.data.smoothing = SmoothingMethod::None; - changed = true; - } - if ui - .selectable_value(&mut selected, 1, "Savitzky-Golay") - .clicked() - { - ctx.data.smoothing = SmoothingMethod::default_savitzky_golay(); - changed = true; - } - if ui.selectable_value(&mut selected, 2, "Gaussian").clicked() { - ctx.data.smoothing = SmoothingMethod::default_gaussian(); - changed = true; - } - }); - - ui.add_space(10.0); - - match &mut ctx.data.smoothing { - SmoothingMethod::None => { - ui.label("No smoothing applied"); - } - SmoothingMethod::SavitzkyGolay { window, polynomial } => { - ui.label("Parameters:"); - ui.horizontal(|ui| { - ui.label("Window size:"); - let mut window_val = *window as i32; - if ui - .add(egui::Slider::new(&mut window_val, 3..=21).step_by(2.0)) - .changed() - { - *window = window_val as usize; - changed = true; - } - }); - ui.horizontal(|ui| { - ui.label("Polynomial:"); - let mut poly_val = *polynomial as i32; - if ui.add(egui::Slider::new(&mut poly_val, 0..=5)).changed() { - *polynomial = poly_val as usize; - changed = true; - } - }); - } - SmoothingMethod::Gaussian { sigma } => { - ui.label("Parameters:"); - ui.horizontal(|ui| { - ui.label("Sigma:"); - if ui.add(egui::Slider::new(sigma, 0.5..=10.0)).changed() { - changed = true; - } - }); - } - } - - if changed { - ctx.commands.push(AppCommand::UpdateSmoothing); - } - } -} - -impl Panel for LeftPanel { - fn render(&mut self, ui: &mut egui::Ui, ctx: &mut PanelContext) { - self.render_tolerance_editor(ui, ctx); - ui.add_space(20.0); - ui.separator(); - - self.render_smoothing(ui, ctx); - ui.add_space(20.0); - ui.separator(); - - // Plot visibility controls - ui.heading("Plot Settings"); - ui.checkbox(&mut ctx.ui.show_ms2_spectrum, "Show MS2 Spectrum"); - } - - fn title(&self) -> &str { - "Settings" - } -} - -impl Default for LeftPanel { - fn default() -> Self { - Self::new() - } -} diff --git a/rust/timsquery_viewer/src/ui/panels/mod.rs b/rust/timsquery_viewer/src/ui/panels/mod.rs index e4c9ae6..e5f6846 100644 --- a/rust/timsquery_viewer/src/ui/panels/mod.rs +++ b/rust/timsquery_viewer/src/ui/panels/mod.rs @@ -2,10 +2,10 @@ // Each panel is responsible for rendering its portion of the UI // and returning commands for state changes -pub mod left_settings_panel; +pub mod config_panel; pub mod precursor_table_panel; pub mod spectrum_display_panel; -pub use left_settings_panel::LeftPanel; +pub 
use config_panel::ConfigPanel; pub use precursor_table_panel::TablePanel; pub use spectrum_display_panel::SpectrumPanel; diff --git a/rust/timsquery_viewer/src/ui/panels/precursor_table_panel.rs b/rust/timsquery_viewer/src/ui/panels/precursor_table_panel.rs index a692d16..ebd001c 100644 --- a/rust/timsquery_viewer/src/ui/panels/precursor_table_panel.rs +++ b/rust/timsquery_viewer/src/ui/panels/precursor_table_panel.rs @@ -1,44 +1,57 @@ use eframe::egui; -use crate::app::AppCommand; use crate::file_loader::ElutionGroupData; -use crate::ui::components::precursor_table; -use crate::ui::{ - Panel, - PanelContext, -}; + +enum Modes { + Insert, + Normal, +} + /// Panel for displaying and filtering the precursor table -pub struct TablePanel; +pub struct TablePanel { + filtered_indices: Option>, + last_search: Option, + last_displayed_mode: Modes, + last_selected_index: Option, +} impl TablePanel { pub fn new() -> Self { - Self + Self { + filtered_indices: None, + last_search: None, + last_displayed_mode: Modes::Normal, + last_selected_index: None, + } } - fn render_search_ui(&self, ui: &mut egui::Ui, ctx: &mut PanelContext) { + pub fn filtered_indices(&self) -> &[usize] { + match &self.filtered_indices { + Some(indices) => indices, + None => &[], + } + } + + fn render_search_ui(&mut self, ui: &mut egui::Ui, search_line: &mut String, search_mode: bool) { ui.horizontal(|ui| { ui.label("Search:"); - let response = ui.text_edit_singleline(&mut ctx.ui.search_input); - response.request_focus(); - ui.label("(Enter to apply, Esc to cancel)"); + let response = ui.text_edit_singleline(search_line); + // TODO make some color change to indicate mode + + let last_mode = &mut self.last_displayed_mode; + if search_mode { + response.request_focus(); + *last_mode = Modes::Insert; + } else if let Modes::Insert = last_mode { + // Just exited insert mode + *last_mode = Modes::Normal; + } }); ui.separator(); } - fn render_filter_ui(&self, ui: &mut egui::Ui, ctx: &mut PanelContext) { - ui.horizontal(|ui| { - ui.label("Filter by ID:"); - ui.text_edit_singleline(&mut ctx.ui.table_filter); - if ui.button("Clear").clicked() { - ctx.ui.table_filter.clear(); - } - }); - ui.add_space(5.0); - ui.label( - egui::RichText::new("Vim keys: j/k=navigate, /=search, g/G=first/last") - .small() - .italics(), - ); + fn render_keybinding_ui(&self, ui: &mut egui::Ui) { + // TODO: Add ... ui.separator(); } @@ -48,55 +61,52 @@ impl TablePanel { filtered_indices: &[usize], elution_groups: &ElutionGroupData, selected_index: &mut Option, - commands: &mut crate::ui::CommandSink, + scroll_to_selection: bool, ) { - let old_selection = *selected_index; - egui::ScrollArea::vertical() .auto_shrink([false; 2]) .show(ui, |ui| { - macro_rules! 
render_table { - ($egs:expr) => { - precursor_table::render_precursor_table_filtered( - ui, - filtered_indices, - $egs, - selected_index, - ) - }; - } - crate::with_elution_collection!(elution_groups, render_table); + elution_groups.render_table( + ui, + filtered_indices, + selected_index, + scroll_to_selection, + ); }); - - if old_selection != *selected_index - && let Some(new_idx) = *selected_index - { - commands.push(AppCommand::SelectElutionGroup(new_idx)); - } } -} -impl Panel for TablePanel { - fn render(&mut self, ui: &mut egui::Ui, ctx: &mut PanelContext) { + pub fn render( + &mut self, + ui: &mut egui::Ui, + elution_groups: &Option, + search_mode: bool, + search_line: &mut String, + selected_index: &mut Option, + ) { ui.heading("Precursor Table"); ui.separator(); // Check if we have data first - if ctx.data.elution_groups.is_none() { + if elution_groups.is_none() { ui.label("Load elution groups to see the table"); return; } - // Render search/filter UI (needs mutable ctx, doesn't need elution_groups) - if ctx.ui.search_mode { - self.render_search_ui(ui, ctx); - } else { - self.render_filter_ui(ui, ctx); - } + self.render_search_ui(ui, search_line, search_mode); + self.render_keybinding_ui(ui); - // Now borrow elution_groups for the rest - let elution_groups = ctx.data.elution_groups.as_ref().unwrap(); - let filtered_indices = elution_groups.matching_indices_for_id_filter(&ctx.ui.table_filter); + let elution_groups = elution_groups.as_ref().unwrap(); + // Invalidate cache if search line changed + // TODO: I think I can optimize search using the fact that when typing + // letters are added, so I can filter from previous results. + if self.last_search.as_ref() != Some(search_line) { + elution_groups.matching_indices_for_id_filter( + search_line, + self.filtered_indices.get_or_insert_with(Vec::new), + ); + self.last_search = Some(search_line.clone()); + } + let filtered_indices = self.filtered_indices.as_ref().unwrap(); ui.label(format!( "Showing {} of {} precursors", @@ -104,16 +114,26 @@ impl Panel for TablePanel { elution_groups.len() )); + // We scroll to selection only if the selection changed + let scroll_to_selection = match (self.last_selected_index, *selected_index) { + (Some(last), Some(current)) => last != current, + (None, Some(_)) => true, + _ => false, + }; + if scroll_to_selection { + self.last_selected_index = *selected_index; + } + self.render_table( ui, - &filtered_indices, + filtered_indices, elution_groups, - &mut ctx.ui.selected_index, - &mut ctx.commands, + selected_index, + scroll_to_selection, ); } - fn title(&self) -> &str { + pub fn title(&self) -> &str { "Table" } } diff --git a/rust/timsquery_viewer/src/ui/panels/spectrum_display_panel.rs b/rust/timsquery_viewer/src/ui/panels/spectrum_display_panel.rs index 9da2d9d..9c3955c 100644 --- a/rust/timsquery_viewer/src/ui/panels/spectrum_display_panel.rs +++ b/rust/timsquery_viewer/src/ui/panels/spectrum_display_panel.rs @@ -1,14 +1,17 @@ -use eframe::egui; +use eframe::egui::{ + self, + Color32, +}; use egui_plot::{ - Bar, - BarChart, + Line, Plot, + PlotPoint, + PlotPoints, + Text, }; +use timsseek::ExpectedIntensities; -use crate::ui::{ - Panel, - PanelContext, -}; +use crate::plot_renderer::MS2Spectrum; /// Panel for displaying MS2 spectrum pub struct SpectrumPanel; @@ -17,33 +20,89 @@ impl SpectrumPanel { pub fn new() -> Self { Self } -} -impl Panel for SpectrumPanel { - fn render(&mut self, ui: &mut egui::Ui, ctx: &mut PanelContext) { - if let Some(spec) = &ctx.computed.ms2_spectrum { + pub fn title(&self) 
-> &str { + "MS2" + } + + /// Get color based on fragment label prefix + fn get_fragment_color(label: &str) -> Color32 { + match label.chars().next() { + Some('b') | Some('B') => Color32::from_rgb(100, 149, 237), // Blue (Cornflower) + Some('y') | Some('Y') => Color32::from_rgb(220, 20, 60), // Red (Crimson) + Some('p') | Some('P') => Color32::from_rgb(255, 200, 0), // Yellow + _ => Color32::from_rgb(50, 205, 50), // Green (Lime) + } + } + + pub fn render( + &mut self, + ui: &mut egui::Ui, + ms2_spectrum: &Option, + expected_intensities: &Option>, + ) { + if let Some(spec) = ms2_spectrum { + let expected_intensities = expected_intensities + .as_ref() + .expect("If a spectrum is present we should also have expected intensities."); ui.label(format!("RT: {:.2} seconds", spec.rt_seconds)); ui.separator(); + // Calculate label offset based on max intensity + let max_intensity = spec.intensities.iter().cloned().fold(0.0f32, f32::max); + let norm_factor = max_intensity.max(1.0); + let label_offset = 0.03f64; // 3% of max intensity + + let expected_norm_factor = expected_intensities + .fragment_intensities + .values() + .cloned() + .max_by(|a, b| a.partial_cmp(b).unwrap()) + .unwrap_or(1.0); + Plot::new("ms2_spectrum") .height(ui.available_height()) .show_axes([true, true]) .allow_zoom(true) .allow_drag(true) + .x_axis_label("m/z") + .y_axis_label("Intensity") + .include_y(0.0) .show(ui, |plot_ui| { - let bars: Vec = spec - .mz_values - .iter() - .zip(&spec.intensities) - .enumerate() - .map(|(idx, (&mz, &intensity))| { - Bar::new(mz, intensity as f64) - .width(0.5) - .name(&spec.fragment_labels[idx]) - }) - .collect(); + // Draw each peak as a vertical line from 0 to intensity + for (idx, (&mz, &intensity)) in + spec.mz_values.iter().zip(&spec.intensities).enumerate() + { + let label_str = &spec.fragment_labels[idx]; + let color = Self::get_fragment_color(label_str); + let y_value = (intensity / norm_factor) as f64; + + let points = PlotPoints::new(vec![[mz, 0.0], [mz, y_value]]); + let line = Line::new(label_str, points).color(color); + plot_ui.line(line); - plot_ui.bar_chart(BarChart::new("MS2 Spectrum", bars)); + // Add label above the peak with offset + let label = Text::new( + label_str, + PlotPoint::new(mz, y_value + label_offset), + label_str, + ) + .color(color); + plot_ui.text(label); + + // If expected intensities are provided, draw them as dashed lines + // in the negative direction + let ei = expected_intensities.fragment_intensities.get(label_str); + if let Some(&expected_intensity) = ei { + let y_ref_value = (expected_intensity / expected_norm_factor) as f64; + let expected_points = + PlotPoints::new(vec![[mz, 0.0], [mz, -y_ref_value]]); + let expected_line = + Line::new(format!("expected_{}", label_str), expected_points) + .color(color); + plot_ui.line(expected_line); + } + } }); } else { ui.centered_and_justified(|ui| { @@ -51,10 +110,6 @@ impl Panel for SpectrumPanel { }); } } - - fn title(&self) -> &str { - "MS2" - } } impl Default for SpectrumPanel { diff --git a/rust/timsquery_viewer/src/ui/tolerance_editor.rs b/rust/timsquery_viewer/src/ui/tolerance_editor.rs index 12f9d5a..6eeca4d 100644 --- a/rust/timsquery_viewer/src/ui/tolerance_editor.rs +++ b/rust/timsquery_viewer/src/ui/tolerance_editor.rs @@ -8,29 +8,25 @@ use timsquery::models::tolerance::{ }; /// Renders the tolerance editor UI -pub fn render_tolerance_editor(ui: &mut egui::Ui, tolerance: &mut Tolerance) -> bool { - let mut changed = false; - +pub fn render_tolerance_editor(ui: &mut egui::Ui, tolerance: &mut 
Tolerance) { ui.collapsing("M/Z Tolerance", |ui| { - changed |= render_mz_tolerance(ui, &mut tolerance.ms); + render_mz_tolerance(ui, &mut tolerance.ms); }); ui.collapsing("RT Tolerance", |ui| { - changed |= render_rt_tolerance(ui, &mut tolerance.rt); + render_rt_tolerance(ui, &mut tolerance.rt); }); ui.collapsing("Mobility Tolerance", |ui| { - changed |= render_mobility_tolerance(ui, &mut tolerance.mobility); + render_mobility_tolerance(ui, &mut tolerance.mobility); }); ui.collapsing("Quadrupole Tolerance", |ui| { - changed |= render_quad_tolerance(ui, &mut tolerance.quad); + render_quad_tolerance(ui, &mut tolerance.quad); }); - - changed } -fn render_mz_tolerance(ui: &mut egui::Ui, tol: &mut MzTolerance) -> bool { +fn render_mz_tolerance(ui: &mut egui::Ui, tol: &mut MzTolerance) { let mut changed = false; let mut is_ppm = matches!(tol, MzTolerance::Ppm(_)); @@ -73,13 +69,9 @@ fn render_mz_tolerance(ui: &mut egui::Ui, tol: &mut MzTolerance) -> bool { }); } } - - changed } -fn render_rt_tolerance(ui: &mut egui::Ui, tol: &mut RtTolerance) -> bool { - let mut changed = false; - +fn render_rt_tolerance(ui: &mut egui::Ui, tol: &mut RtTolerance) { let mut selected = match tol { RtTolerance::Minutes(_) => 0, RtTolerance::Pct(_) => 1, @@ -101,36 +93,33 @@ fn render_rt_tolerance(ui: &mut egui::Ui, tol: &mut RtTolerance) -> bool { 2 => RtTolerance::Unrestricted, _ => RtTolerance::Unrestricted, }; - changed = true; } match tol { RtTolerance::Minutes((lower, upper)) => { ui.horizontal(|ui| { ui.label("Lower (min):"); - changed |= ui.add(egui::DragValue::new(lower).speed(0.1)).changed(); + ui.add(egui::DragValue::new(lower).speed(0.1)); }); ui.horizontal(|ui| { ui.label("Upper (min):"); - changed |= ui.add(egui::DragValue::new(upper).speed(0.1)).changed(); + ui.add(egui::DragValue::new(upper).speed(0.1)); }); } RtTolerance::Pct((lower, upper)) => { ui.horizontal(|ui| { ui.label("Lower (%):"); - changed |= ui.add(egui::DragValue::new(lower).speed(0.5)).changed(); + ui.add(egui::DragValue::new(lower).speed(0.5)); }); ui.horizontal(|ui| { ui.label("Upper (%):"); - changed |= ui.add(egui::DragValue::new(upper).speed(0.5)).changed(); + ui.add(egui::DragValue::new(upper).speed(0.5)); }); } RtTolerance::Unrestricted => { ui.label("No RT restriction"); } } - - changed } fn render_mobility_tolerance(ui: &mut egui::Ui, tol: &mut MobilityTolerance) -> bool { diff --git a/rust/timsseek/Cargo.toml b/rust/timsseek/Cargo.toml index 320f5d6..da7439f 100644 --- a/rust/timsseek/Cargo.toml +++ b/rust/timsseek/Cargo.toml @@ -6,7 +6,6 @@ license.workspace = true [dependencies] regex = "1.10.6" -parquet_derive = "56.1" rmp-serde = "1.1" zstd = "0.13" @@ -18,6 +17,7 @@ rand = "0.9.2" timsquery = { path = "../timsquery" } calibrt = { path = "../calibrt" } micromzpaf = { path = "../micromzpaf" } +timscentroid = { path = "../timscentroid" } # Workspace-inherited deps rustyms = { workspace = true } @@ -27,6 +27,7 @@ serde = { workspace = true } serde_json = { workspace = true } tracing = { workspace = true } rayon = { workspace = true } +parquet_derive = { workspace = true } parquet = { workspace = true } [features] diff --git a/rust/timsseek/src/fragment_mass/elution_group_converter.rs b/rust/timsseek/src/fragment_mass/elution_group_converter.rs index 07a8d8c..cd42828 100644 --- a/rust/timsseek/src/fragment_mass/elution_group_converter.rs +++ b/rust/timsseek/src/fragment_mass/elution_group_converter.rs @@ -1,29 +1,8 @@ -use super::fragment_mass_builder::FragmentMassBuilder; use crate::isotopes::peptide_isotopes; -use 
crate::models::DigestSlice; -use crate::{ - ExpectedIntensities, - IonAnnot, -}; -use rayon::prelude::*; -use rustyms::error::{ - Context, - CustomError, -}; -use rustyms::{ - LinearPeptide, - MolecularCharge, +use rustyms::prelude::{ + Element, MolecularFormula, - MultiChemical, -}; -use std::collections::HashMap; -use std::ops::RangeInclusive; -use std::sync::Arc; -use timsquery::models::elution_group::TimsElutionGroup; -use timsquery::utils::constants::PROTON_MASS; -use tracing::{ - error, - warn, + Peptidoform, }; /// Super simple 1/k0 prediction. @@ -54,40 +33,17 @@ pub fn supersimpleprediction(mz: f64, charge: i32) -> f64 { + (1.417e-01 * charge as f64) } -#[derive(Debug)] -pub struct SequenceToElutionGroupConverter { - pub precursor_charge_range: RangeInclusive, - pub fragment_buildder: FragmentMassBuilder, - pub max_precursor_mz: f64, - pub min_precursor_mz: f64, - pub max_fragment_mz: f64, - pub min_fragment_mz: f64, -} - -impl Default for SequenceToElutionGroupConverter { - fn default() -> Self { - Self { - precursor_charge_range: 2..=3, - fragment_buildder: FragmentMassBuilder::default(), - max_precursor_mz: 1000., - min_precursor_mz: 400., - max_fragment_mz: 2000., - min_fragment_mz: 200., - } - } -} - fn count_carbon_sulphur(form: &MolecularFormula) -> (u16, u16) { let mut ncarbon = 0; let mut nsulphur = 0; for (elem, count, _extras) in form.elements() { match (elem, count) { - (&rustyms::Element::C, Some(cnt)) => { - ncarbon += *cnt; + (&Element::C, Some(cnt)) => { + ncarbon += cnt.get(); } - (&rustyms::Element::S, Some(cnt)) => { - nsulphur += *cnt; + (&Element::S, Some(cnt)) => { + nsulphur += cnt.get(); } _ => {} } @@ -96,223 +52,31 @@ fn count_carbon_sulphur(form: &MolecularFormula) -> (u16, u16) { (ncarbon, nsulphur) } -impl SequenceToElutionGroupConverter { - pub fn convert_sequence( - &self, - sequence: &str, - id: u64, - ) -> Result< - ( - Vec>>, - Vec, - Vec, - ), - CustomError, - > { - let mut peptide = LinearPeptide::pro_forma(sequence)?; - let pep_formulas = peptide.formulas(); - let (pep_mono_mass, pep_formula) = if pep_formulas.len() > 1 { - return Err(CustomError::error( - "Peptide contains more than one formula.", - "", - Context::none(), +pub fn count_carbon_sulphur_in_sequence(sequence: &str) -> Result<(u16, u16), String> { + let peptide = match Peptidoform::pro_forma(sequence, None) { + Ok(pep) => pep, + Err(e) => { + return Err(format!( + "Error parsing peptide sequence {}: {:?}", + sequence, e )); - } else { - let form = pep_formulas[0].clone(); - let mono_mass = pep_formulas[0].mass(rustyms::MassMode::Monoisotopic); - (mono_mass.value, form) - }; - let (ncarbon, nsulphur) = count_carbon_sulphur(&pep_formula); - let pep_isotope = peptide_isotopes(ncarbon, nsulphur); - let mut expected_prec_inten: HashMap = HashMap::new(); - expected_prec_inten.insert(-1, 0.0); - - for (ii, isot) in pep_isotope.iter().enumerate() { - expected_prec_inten.insert(ii as i8, *isot); } - - let mut out_eg = Vec::new(); - let mut out_exp_int = Vec::new(); - let mut out_charges = Vec::new(); - - for charge in self.precursor_charge_range.clone() { - // Q: Why am I adding the charge here manually instead of using the calculator in the - // Formula? 
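The deleted converter below computes each precursor m/z directly from the peptide's monoisotopic mass instead of going through rustyms' charge carriers, i.e. mz = (M + z * m_proton) / z. A tiny standalone check of that arithmetic; the proton mass constant is written out here as an assumption, while the real code takes PROTON_MASS from timsquery::utils::constants:

    // Assumed value for this sketch; the crate constant is the authoritative one.
    const PROTON_MASS: f64 = 1.007276466812; // Da

    fn precursor_mz(mono_mass: f64, charge: u8) -> f64 {
        (mono_mass + charge as f64 * PROTON_MASS) / charge as f64
    }

    fn main() {
        // An 800 Da (monoisotopic) peptide at charge 2 lands near m/z 401.007.
        let mz = precursor_mz(800.0, 2);
        assert!((mz - 401.007276).abs() < 1e-3);
    }
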
- let precursor_mz = (pep_mono_mass + (charge as f64 * PROTON_MASS)) / charge as f64; - - if precursor_mz < self.min_precursor_mz || precursor_mz > self.max_precursor_mz { - continue; - } - - peptide = peptide.charge_carriers(Some(MolecularCharge::proton(charge.into()))); - - let mut fragment_mzs = self - .fragment_buildder - .fragment_mzs_from_linear_peptide(&peptide) - .unwrap(); - - fragment_mzs - .retain(|(_pos, mz, _)| *mz > self.min_fragment_mz && *mz < self.max_fragment_mz); - fragment_mzs.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); - - let ((fragment_labels, fragment_mzs), fragment_intensities): ( - (Vec<_>, Vec<_>), - Vec<_>, - ) = fragment_mzs - .into_iter() - .map(|(label, mz, intensity)| ((label, mz), intensity)) - .unzip(); - - let mobility = supersimpleprediction(precursor_mz, charge as i32); - let precursor_labels = vec![-1, 0, 1, 2]; - - let fragment_expect_inten = HashMap::from_iter( - fragment_labels - .iter() - .cloned() - .zip(fragment_intensities.iter().cloned()), - ); - let eg = TimsElutionGroup::builder() - .id(id) - .mobility_ook0(mobility as f32) - .rt_seconds(0.0f32) - .precursor_labels(precursor_labels.as_slice().into()) - .precursor(precursor_mz, charge) - .fragment_mzs(fragment_mzs) - .fragment_labels(fragment_labels.as_slice().into()) - .try_build() - .unwrap(); - - let ei = ExpectedIntensities { - fragment_intensities: fragment_expect_inten, - precursor_intensities: expected_prec_inten.clone(), - }; - - out_eg.push(Arc::new(eg)); - out_exp_int.push(ei); - out_charges.push(charge); - } - - Ok((out_eg, out_exp_int, out_charges)) - } - - pub fn convert_sequences<'a>( - &self, - sequences: &'a [DigestSlice], - ) -> Result< - ( - Vec<&'a DigestSlice>, - Vec>>, - Vec, - Vec, - ), - CustomError, - > { - let (seqs, (eg, (ei, crg))) = sequences - .par_iter() - .enumerate() - .flat_map(|(id, dig_slice)| { - let sequence: String = dig_slice.clone().into(); - let tmp = self.convert_sequence(sequence.as_ref(), id as u64); - match tmp { - Ok(x) => { - let expanded_sequence: Vec<&DigestSlice> = - (0..(x.0.len())).map(|_x| dig_slice).collect(); - Some((expanded_sequence, (x.0, (x.1, x.2)))) - } - Err(e) => { - warn!("Error converting sequence {:?}, err: {:?}", sequence, e); - None - } - } - }) - .flatten() - .collect(); - Ok((seqs, eg, ei, crg)) + }; + let peptide = match peptide.as_linear() { + Some(pep) => pep, + None => return Err("Peptide is not linear.".to_string()), } + .clone(); - pub fn convert_enumerated_sequences<'a>( - &self, - enum_sequences: &'a [(usize, DigestSlice)], - ) -> Result< - ( - Vec<&'a DigestSlice>, - Vec>>, - Vec, - Vec, - ), - CustomError, - > { - let (seqs, (eg, (ei, crg))) = enum_sequences - .par_iter() - .flat_map(|(i, s)| { - let sequence: String = s.clone().into(); - let tmp = self.convert_sequence(sequence.as_ref(), *i as u64); - match tmp { - Ok(x) => { - let expanded_sequence: Vec<&DigestSlice> = - (0..(x.0.len())).map(|_x| s).collect(); - Some((expanded_sequence, (x.0, (x.1, x.2)))) - } - Err(e) => { - error!("Error converting sequence {:?}, err: {:?}", s, e); - None - } - } - }) - .flatten() - .collect(); - Ok((seqs, eg, ei, crg)) + let pep_formulas = peptide.formulas(); + if pep_formulas.len() > 1 { + return Err("Peptide contains more than one formula.".to_string()); } + let form = pep_formulas[0].clone(); + Ok(count_carbon_sulphur(&form)) } -#[cfg(test)] -mod tests { - use super::*; - use crate::models::DecoyMarking; - use rustyms::model::{ - Location, - Model, - }; - use rustyms::system::f64::MassOverCharge; - use 
rustyms::system::mass_over_charge::mz; - use rustyms::system::{ - Charge, - e, - }; - use std::sync::Arc; - - #[test] - fn test_converter() { - let converter = SequenceToElutionGroupConverter { - precursor_charge_range: 2..=3, - fragment_buildder: FragmentMassBuilder { - model: Model { - a: (Location::None, Vec::new()), - b: (Location::SkipNC(2, 2), vec![]), - c: (Location::None, Vec::new()), - d: (Location::None, Vec::new()), - v: (Location::None, Vec::new()), - w: (Location::None, Vec::new()), - x: (Location::None, Vec::new()), - y: (Location::SkipNC(2, 2), vec![]), - z: (Location::None, Vec::new()), - precursor: vec![], - ppm: MassOverCharge::new::(20.0), - glycan_fragmentation: None, - }, - max_charge: Charge::new::(2.0), - }, - max_precursor_mz: 1000., - min_precursor_mz: 400., - max_fragment_mz: 2000., - min_fragment_mz: 200., - }; - let seq: Arc = "PEPTIDEPINK".into(); - let range_use: std::ops::Range = 0u16..seq.len() as u16; - let dig_slice = DigestSlice::new(seq, range_use, DecoyMarking::Target, 1); - let seq_slc = vec![dig_slice]; - let out = converter.convert_sequences(&seq_slc).unwrap(); - assert_eq!(out.0.len(), 2); - } +pub fn isotope_dist_from_seq(sequence: &str) -> Result<[f32; 3], String> { + let (ncarbon, nsulphur) = count_carbon_sulphur_in_sequence(sequence)?; + Ok(peptide_isotopes(ncarbon, nsulphur)) } diff --git a/rust/timsseek/src/fragment_mass/fragment_mass_builder.rs b/rust/timsseek/src/fragment_mass/fragment_mass_builder.rs deleted file mode 100644 index ddacb53..0000000 --- a/rust/timsseek/src/fragment_mass/fragment_mass_builder.rs +++ /dev/null @@ -1,82 +0,0 @@ -use rustyms::fragment::FragmentType; -use rustyms::model::Location; -use rustyms::spectrum::MassMode; -use rustyms::system::f64::MassOverCharge; -use rustyms::system::mass_over_charge::mz; -use rustyms::system::{ - Charge, - e, -}; -use rustyms::{ - Fragment, - LinearPeptide, - Model, -}; - -use crate::{ - IonAnnot, - IonParsingError, -}; - -#[derive(Debug)] -pub struct FragmentMassBuilder { - pub model: Model, - pub max_charge: Charge, -} - -impl Default for FragmentMassBuilder { - fn default() -> Self { - let by_ions = Model { - a: (Location::None, Vec::new()), - b: (Location::SkipNC(2, 2), vec![]), - c: (Location::None, Vec::new()), - d: (Location::None, Vec::new()), - v: (Location::None, Vec::new()), - w: (Location::None, Vec::new()), - x: (Location::None, Vec::new()), - y: (Location::SkipNC(2, 2), vec![]), - z: (Location::None, Vec::new()), - precursor: vec![], - // TODO: Fix this hard-coded value - ppm: MassOverCharge::new::(20.0), - glycan_fragmentation: None, - }; - let max_charge: Charge = Charge::new::(2.0); - Self { - model: by_ions, - max_charge, - } - } -} - -impl FragmentMassBuilder { - pub fn fragment_mzs_from_linear_peptide( - &self, - peptide: &LinearPeptide, - ) -> Result, IonParsingError> { - // NOTE: I have to add this retain bc it generates precursor ions even if they are not - // defined. - // TODO: return a different error ... this one is very loaded. - let ions: Vec = peptide - .generate_theoretical_fragments(self.max_charge, &self.model) - .into_iter() - .collect(); - - // Does this generate ions above the charge of the precursor? 
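isotope_dist_from_seq above is the slimmed-down public surface that replaces the old converter for isotope work: parse a ProForma sequence, count carbon and sulphur atoms, and return the first three isotope abundances from peptide_isotopes. A hedged usage sketch, assuming the module path shown in this diff is exported by the crate:

    use timsseek::fragment_mass::elution_group_converter::isotope_dist_from_seq;

    fn main() -> Result<(), String> {
        // Presumably the three values are the relative abundances of M, M+1 and M+2.
        let dist: [f32; 3] = isotope_dist_from_seq("PEPTIDEPINK")?;
        println!("isotope envelope: {:?}", dist);
        Ok(())
    }
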
- ions.into_iter() - .map(|x| { - let intensity = match x.ion { - FragmentType::Y(_) => 1.0, - FragmentType::B(_) => 0.5, - _ => 0.01, - }; - Ok(( - IonAnnot::from_fragment(x.ion.clone(), x.charge.value as i8, 0)?, - // IonAnnot::new(x.ion.clone(), x.charge.abs().value as u8, 0)?, - x.mz(MassMode::Monoisotopic).value, - intensity, - )) - }) - .collect() - } -} diff --git a/rust/timsseek/src/fragment_mass/mod.rs b/rust/timsseek/src/fragment_mass/mod.rs index a636742..8d8e62a 100644 --- a/rust/timsseek/src/fragment_mass/mod.rs +++ b/rust/timsseek/src/fragment_mass/mod.rs @@ -1,2 +1 @@ pub mod elution_group_converter; -pub mod fragment_mass_builder; diff --git a/rust/timsseek/src/models/digest.rs b/rust/timsseek/src/models/digest.rs index 3aa0d6e..98f99f3 100644 --- a/rust/timsseek/src/models/digest.rs +++ b/rust/timsseek/src/models/digest.rs @@ -4,6 +4,7 @@ use super::decoy::{ }; use serde::Serialize; use std::collections::HashSet; +use std::fmt::Display as FmtDisplay; use std::ops::Range; use std::sync::Arc; @@ -85,6 +86,13 @@ impl DigestSlice { } } +impl FmtDisplay for DigestSlice { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let local_str: String = self.clone().into(); + write!(f, "{}", local_str) + } +} + impl From for String { // TODO make this a cow ... maybe ... fn from(x: DigestSlice) -> Self { diff --git a/rust/timsseek/src/models/query_item.rs b/rust/timsseek/src/models/query_item.rs index 7ba3b36..0b73e72 100644 --- a/rust/timsseek/src/models/query_item.rs +++ b/rust/timsseek/src/models/query_item.rs @@ -5,15 +5,18 @@ use serde::{ Serialize, }; use std::collections::HashMap; -use timsquery::TimsElutionGroup; use timsquery::tinyvec::tiny_vec; +use timsquery::{ + KeyLike, + TimsElutionGroup, +}; // TODO: reimplement my own "keyed_vec" (essentially a vec that enforces // unique keys on insertion) to avoid the HashMap overhead here. #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ExpectedIntensities { - pub fragment_intensities: HashMap, +pub struct ExpectedIntensities { + pub fragment_intensities: HashMap, pub precursor_intensities: HashMap, } @@ -22,7 +25,7 @@ pub struct QueryItemToScore { // Kinda hate this pub digest: DigestSlice, pub query: TimsElutionGroup, - pub expected_intensity: ExpectedIntensities, + pub expected_intensity: ExpectedIntensities, } impl QueryItemToScore { diff --git a/rust/timsseek/src/scoring/apex_finding.rs b/rust/timsseek/src/scoring/apex_finding.rs index b578412..ec72550 100644 --- a/rust/timsseek/src/scoring/apex_finding.rs +++ b/rust/timsseek/src/scoring/apex_finding.rs @@ -10,7 +10,7 @@ //! use timsseek::scoring::apex_finding::{ApexFinder, CandidateContext}; //! //! // 1. Create a reusable finder (one per thread) -//! let mut finder = ApexFinder::new(rt_ms_arc.clone()); +//! let mut finder = ApexFinder::new(chromatogram_collector.num_cycles()); //! //! // 2. Create the context for a specific query //! let context = CandidateContext { @@ -21,10 +21,12 @@ //! }; //! //! // 3. Score (reusing the finder's internal buffers) -//! let score = finder.find_apex(&context).unwrap(); +//! let score = finder.find_apex(&context, rt_mapping_fn).unwrap(); //! println!("Found apex at RT: {} ms with score {}", score.retention_time_ms, score.score); //! 
``` +use std::fmt::Display; + use super::{ COELUTION_WINDOW_WIDTH, NUM_MS1_IONS, @@ -40,32 +42,30 @@ use crate::scoring::scores::coelution::coelution_score::coelution_vref_score_fil use crate::scoring::scores::corr_v_ref; use crate::utils::top_n_array::TopNArray; use serde::Serialize; -use std::sync::Arc; use timsquery::models::aggregators::ChromatogramCollector; use timsquery::traits::KeyLike; use timsquery::{ MzMobilityStatsCollector, SpectralCollector, }; -use tracing::{ - trace, - warn, -}; /// Represents a peptide candidate context required for scoring. /// /// This bundles the theoretical peptide info (digest, expected intensities) /// with the raw extracted data (`ChromatogramCollector`). +/// +/// The label is in essence anything that identifies the peptide sequence and modifications, +/// so it can be a string, or a more complex struct. (or a simpler one like a smart pointer) #[derive(Debug)] -pub struct CandidateContext { +pub struct CandidateContext { /// The peptide sequence and modification information. - pub digest: DigestSlice, + pub label: L, /// The precursor charge state. pub charge: u8, /// The expected theoretical intensities of precursor and fragment ions. - pub expected_intensities: ExpectedIntensities, + pub expected_intensities: ExpectedIntensities, /// The observed chromatogram data collected from the instrument. - pub query_values: ChromatogramCollector, + pub query_values: ChromatogramCollector, } /// Immutable peptide metadata that never changes during scoring. @@ -98,12 +98,12 @@ pub struct PeptideMetadata { /// This contains only the data needed for scoring calculations, /// separated from metadata for clarity and efficiency. #[derive(Debug)] -pub struct ScoringContext { +pub struct ScoringContext { /// The expected theoretical intensities of precursor and fragment ions. - pub expected_intensities: ExpectedIntensities, + pub expected_intensities: ExpectedIntensities, /// The observed chromatogram data collected from the instrument. - pub query_values: ChromatogramCollector, + pub query_values: ChromatogramCollector, } /// The result of the apex finding process. @@ -190,16 +190,30 @@ impl ScoreTraces { // but for safety we can resize it too. self.main_score.resize(len, 0.0); } + + pub fn iter_scores(&self) -> impl Iterator + '_ { + vec![ + ("ms1_cosine_ref_sim", &self.ms1_cosine_ref_sim[..]), + ("ms1_coelution_score", &self.ms1_coelution_score[..]), + ("ms1_corr_v_gauss", &self.ms1_corr_v_gauss[..]), + ("ms2_cosine_ref_sim", &self.ms2_cosine_ref_sim[..]), + ("ms2_coelution_score", &self.ms2_coelution_score[..]), + ("ms2_lazyscore", &self.ms2_lazyscore[..]), + ("ms2_corr_v_gauss", &self.ms2_corr_v_gauss[..]), + ("main_score", &self.main_score[..]), + ] + .into_iter() + } } /// The core engine for finding peptide apexes. +#[derive(Debug)] pub struct ApexFinder { pub traces: ScoreTraces, buffers: ApexFinderBuffers, - /// Reference to the global retention time array (for index -> RT mapping). - rt_ms_arc: Arc<[u32]>, } +#[derive(Debug)] struct ApexFinderBuffers { temp_ms2_dot_prod: Vec, temp_ms2_norm_sq_obs: Vec, @@ -237,22 +251,22 @@ impl ApexFinderBuffers { } impl ApexFinder { - pub fn new(rt_ms_arc: Arc<[u32]>) -> Self { + pub fn new(capacity: usize) -> Self { Self { - traces: ScoreTraces::new_with_capacity(rt_ms_arc.len()), - buffers: ApexFinderBuffers::new(rt_ms_arc.len()), - rt_ms_arc, + traces: ScoreTraces::new_with_capacity(capacity), + buffers: ApexFinderBuffers::new(capacity), } } /// Find the peptide apex within the provided scoring context. 
#[cfg_attr( feature = "instrumentation", - tracing::instrument(skip(self, scoring_ctx), level = "trace") + tracing::instrument(skip(self, scoring_ctx, rt_mapper), level = "trace") )] - pub fn find_apex( + pub fn find_apex( &mut self, - scoring_ctx: &ScoringContext, + scoring_ctx: &ScoringContext, + rt_mapper: &dyn Fn(usize) -> u32, ) -> Result { let collector = &scoring_ctx.query_values; let n_cycles = collector.num_cycles(); @@ -274,18 +288,23 @@ impl ApexFinder { self.compute_main_score_trace(); // 5. Find Apex and Extract Features - self.extract_apex_score(scoring_ctx) + self.extract_apex_score(scoring_ctx, &rt_mapper) } /// Pass 1: Scores that depend only on individual ion traces. /// - Lazyscore (Hyperscore approximation) /// - Cosine Similarity vs Expected Intensities /// - Gaussian Shape Correlation + /// + /// Can be an error if no valid ions are found for scoring. #[cfg_attr( feature = "instrumentation", tracing::instrument(skip_all, level = "trace") )] - fn compute_pass_1(&mut self, scoring_ctx: &ScoringContext) -> Result<(), DataProcessingError> { + fn compute_pass_1( + &mut self, + scoring_ctx: &ScoringContext, + ) -> Result<(), DataProcessingError> { let collector = &scoring_ctx.query_values; // --- MS2 (Fragments) --- @@ -406,7 +425,10 @@ impl ApexFinder { feature = "instrumentation", tracing::instrument(skip_all, level = "trace") )] - fn compute_pass_2(&mut self, scoring_ctx: &ScoringContext) -> Result<(), DataProcessingError> { + fn compute_pass_2( + &mut self, + scoring_ctx: &ScoringContext, + ) -> Result<(), DataProcessingError> { let collector = &scoring_ctx.query_values; // Apply smoothing to Lazyscore BEFORE using it as a reference for Coelution @@ -480,9 +502,10 @@ impl ApexFinder { } } - fn extract_apex_score( + fn extract_apex_score( &self, - scoring_ctx: &ScoringContext, + scoring_ctx: &ScoringContext, + rt_mapper: &dyn Fn(usize) -> u32, ) -> Result { let mut peak_picker = PeakPicker::new(&self.traces.main_score); @@ -523,7 +546,7 @@ impl ApexFinder { // Extract features at max_loc let cycle_offset = scoring_ctx.query_values.cycle_offset(); let global_loc = max_loc + cycle_offset; - let retention_time_ms = self.rt_ms_arc.get(global_loc).copied().unwrap_or(0); + let retention_time_ms = rt_mapper(global_loc); let (ms1_summed_intensity, _ms1_npeaks) = self.sum_intensities_at(&scoring_ctx.query_values.precursors, max_loc); diff --git a/rust/timsseek/src/scoring/pipeline.rs b/rust/timsseek/src/scoring/pipeline.rs index e9f5635..1357b02 100644 --- a/rust/timsseek/src/scoring/pipeline.rs +++ b/rust/timsseek/src/scoring/pipeline.rs @@ -35,12 +35,16 @@ use crate::{ ScorerQueriable, }; use rayon::prelude::*; -use std::sync::Arc; use std::time::Instant; +use timscentroid::rt_mapping::{ + MS1CycleIndex, + RTIndex, +}; use timsquery::models::tolerance::MobilityTolerance; use timsquery::utils::TupleRange; use timsquery::{ ChromatogramCollector, + KeyLike, MzMobilityStatsCollector, OptionallyRestricted, SpectralCollector, @@ -96,9 +100,9 @@ impl ToleranceHierarchy { feature = "instrumentation", tracing::instrument(skip_all, level = "trace") )] -fn filter_zero_intensity_ions( - agg: &mut ChromatogramCollector, - expected: &mut crate::ExpectedIntensities, +fn filter_zero_intensity_ions( + agg: &mut ChromatogramCollector, + expected: &mut crate::ExpectedIntensities, ) { // Early-exit predicate: stop at first non-zero value (much faster than summing) let predicate = |chrom: &[f32]| chrom.iter().any(|&x| x > 0.0); @@ -148,8 +152,7 @@ fn compute_secondary_lazyscores( .iter_fragments() 
.map(|((_k, _mz), v)| v.weight() as f32), ); - let iso_lazyscore = - single_lazyscore(isotope.iter_fragments().map(|((_k, _mz), v)| *v)); + let iso_lazyscore = single_lazyscore(isotope.iter_fragments().map(|((_k, _mz), v)| *v)); let ratio = iso_lazyscore / lazyscore.max(1.0); SecondaryLazyScores { lazyscore, @@ -163,9 +166,6 @@ fn compute_secondary_lazyscores( /// Pipeline stages: build context → find apex → refine → finalize. /// Uses progressive tolerance refinement, metadata separation, and buffer reuse for high throughput. pub struct ScoringPipeline { - /// Retention time in milliseconds for each index cycle. - pub index_cycle_rt_ms: Arc<[u32]>, - /// Indexed peak data that implements the required query aggregators. pub index: I, @@ -190,12 +190,16 @@ impl ScoringPipeline { fn build_candidate_context( &self, item: &QueryItemToScore, - ) -> Result<(super::apex_finding::PeptideMetadata, super::apex_finding::ScoringContext), SkippingReason> { - let max_range = TupleRange::try_new( - *self.index_cycle_rt_ms.first().unwrap(), - *self.index_cycle_rt_ms.last().unwrap(), - ) - .expect("Reference RTs should be sorted and valid"); + ) -> Result< + ( + super::apex_finding::PeptideMetadata, + super::apex_finding::ScoringContext, + ), + SkippingReason, + > { + let max_range = self.index.ms1_cycle_mapping().range_milis(); + let max_range = TupleRange::try_new(max_range.0, max_range.1) + .expect("Reference RTs should be sorted and valid"); let rt_range = match self .tolerances .prescore @@ -215,7 +219,7 @@ impl ScoringPipeline { match ChromatogramCollector::new( item.query.clone(), rt_range, - &self.index_cycle_rt_ms, + self.index.ms1_cycle_mapping(), ) { Ok(collector) => collector, Err(e) => { @@ -228,9 +232,11 @@ impl ScoringPipeline { } }); - tracing::span!(tracing::Level::TRACE, "build_candidate_context::add_query").in_scope(|| { - self.index.add_query(&mut agg, &self.tolerances.prescore); - }); + tracing::span!(tracing::Level::TRACE, "build_candidate_context::add_query").in_scope( + || { + self.index.add_query(&mut agg, &self.tolerances.prescore); + }, + ); // Filter out zero-intensity ions and update expected intensities in one pass let mut expected_intensities = item.expected_intensity.clone(); @@ -386,22 +392,26 @@ impl ScoringPipeline { }; timings.prescore += st.elapsed(); - if scoring_ctx.expected_intensities.fragment_intensities.is_empty() { + if scoring_ctx + .expected_intensities + .fragment_intensities + .is_empty() + { return None; } let st = Instant::now(); - let apex_score = match buffer.find_apex(&scoring_ctx) { - Ok(score) => score, - Err(_e) => { - return None; - } - }; + let apex_score = + match buffer.find_apex(&scoring_ctx, &|idx| self.map_rt_index_to_milis(idx)) { + Ok(score) => score, + Err(_e) => { + return None; + } + }; timings.localize += st.elapsed(); let st = Instant::now(); - let (inner_collector, isotope_collector) = - self.execute_secondary_query(&item, &apex_score); + let (inner_collector, isotope_collector) = self.execute_secondary_query(&item, &apex_score); timings.secondary_query += st.elapsed(); let nqueries = scoring_ctx.query_values.fragments.num_ions() as u8; @@ -429,18 +439,18 @@ impl ScoringPipeline { &self, item: QueryItemToScore, ) -> Result { - let mut buffer = ApexFinder::new(self.index_cycle_rt_ms.clone()); + let mut buffer = ApexFinder::new(self.num_cycles()); // Re-implementing logic here because process_query consumes `item` and returns `Option`. // We want intermediate results for `FullQueryResult`. 
- let (metadata, scoring_ctx) = self - .build_candidate_context(&item) - .map_err(|_| DataProcessingError::ExpectedNonEmptyData { + let (metadata, scoring_ctx) = self.build_candidate_context(&item).map_err(|_| { + DataProcessingError::ExpectedNonEmptyData { context: Some("RT out of bounds".into()), - })?; + } + })?; - let apex_score = buffer.find_apex(&scoring_ctx)?; + let apex_score = buffer.find_apex(&scoring_ctx, &|idx| self.map_rt_index_to_milis(idx))?; let (inner_collector, isotope_collector) = self.execute_secondary_query(&item, &apex_score); let nqueries = scoring_ctx.query_values.fragments.num_ions() as u8; @@ -474,9 +484,7 @@ impl ScoringPipeline { let num_input_items = items_to_score.len(); let loc_score_start = Instant::now(); - let init_fn = || { - ApexFinder::new(self.index_cycle_rt_ms.clone()) - }; + let init_fn = || ApexFinder::new(self.num_cycles()); let filter_fn = |x: &&QueryItemToScore| { let tmp = x.query.get_precursor_mz_limits(); @@ -526,4 +534,15 @@ impl ScoringPipeline { (results.res, results.timings) } + + fn map_rt_index_to_milis(&self, rt_index: usize) -> u32 { + self.index + .ms1_cycle_mapping() + .rt_milis_for_index(&MS1CycleIndex::new(rt_index as u32)) + .unwrap_or(0) + } + + fn num_cycles(&self) -> usize { + self.index.ms1_cycle_mapping().len() + } } diff --git a/rust/timsseek/src/scoring/scores/coelution/coelution_score.rs b/rust/timsseek/src/scoring/scores/coelution/coelution_score.rs index 531bec7..f46c24e 100644 --- a/rust/timsseek/src/scoring/scores/coelution/coelution_score.rs +++ b/rust/timsseek/src/scoring/scores/coelution/coelution_score.rs @@ -49,15 +49,16 @@ fn coelution_vref_score_filter_onto( } let num_elems = (0..slices.nrows()).filter(|&i| filter(i)).count(); - let norm_factor = 1f32 / (num_elems as f32).max(1.0); - if norm_factor == 1.0 { + if num_elems == 0 { trace!("No valid slices after filtering"); return Err(DataProcessingError::ExpectedNonEmptyData { context: None }); } + let norm_factor = 1f32 / num_elems as f32; if num_elems > 50 { trace!( "There are too many valid slices after filtering, probably an mz-major and an rt-major array got mixed up" ); + // TODO: make this a more specific error return Err(DataProcessingError::ExpectedNonEmptyData { context: None }); } buffer.clear(); @@ -104,7 +105,7 @@ fn coelution_vref_score_filter_onto( /// * `filter` - A closure that takes a key (of type `K`) from the `mz_order` and returns /// `true` if the corresponding chromatogram should be included in the calculation. /// * `buffer` - A mutable buffer to store the resulting coelution scores. -pub fn coelution_vref_score_filter_into<'a, K: Clone + Ord>( +pub fn coelution_vref_score_filter_into<'a, K: Clone + Eq>( slices: &'a MzMajorIntensityArray, ref_slice: &'a [f32], window: usize, diff --git a/rust/timsseek/src/scoring/scores/corr_v_ref.rs b/rust/timsseek/src/scoring/scores/corr_v_ref.rs index 051d85e..924b2b4 100644 --- a/rust/timsseek/src/scoring/scores/corr_v_ref.rs +++ b/rust/timsseek/src/scoring/scores/corr_v_ref.rs @@ -1,23 +1,8 @@ use crate::errors::DataProcessingError; use crate::utils::correlation::cosine_similarity; -use timsquery::models::{ - MzMajorIntensityArray, - RTMajorIntensityArray, -}; +use timsquery::models::MzMajorIntensityArray; use timsquery::traits::key_like::KeyLike; -// This is used to calculate the correlation with the theoretical intensity. -// of either the isotope pattern or the predicted fragment intensities. 
-pub fn calculate_cosine_with_ref( - slices: &RTMajorIntensityArray, - ref_slice: &[f32], -) -> Result, DataProcessingError> { - slices - .arr - .row_apply(|slice| cosine_similarity(slice, ref_slice)) - .collect() -} - // From https://doi.org/10.1101/2024.11.19.624419 const REF_GAUSSIAN: [f32; 7] = [0.0044, 0.054, 0.242, 0.399, 0.242, 0.054, 0.0044]; const REF_GAUSS_OFFSET: usize = 4; diff --git a/rust/timsseek/src/scoring/search_results.rs b/rust/timsseek/src/scoring/search_results.rs index ceafbcc..a25fd18 100644 --- a/rust/timsseek/src/scoring/search_results.rs +++ b/rust/timsseek/src/scoring/search_results.rs @@ -101,14 +101,17 @@ impl SetField { } impl<'q> SearchResultBuilder<'q> { - pub fn with_candidate_context(mut self, candidate_context: &'q CandidateContext) -> Self { + pub fn with_candidate_context( + mut self, + candidate_context: &'q CandidateContext, + ) -> Self { self.library_id = SetField::Some(candidate_context.query_values.eg.id() as u32); - self.digest_slice = SetField::Some(&candidate_context.digest); + self.digest_slice = SetField::Some(&candidate_context.label); self.ref_eg = SetField::Some(&candidate_context.query_values.eg); self.nqueries = SetField::Some(candidate_context.query_values.fragments.num_ions() as u8); - self.decoy_marking = SetField::Some(candidate_context.digest.decoy); + self.decoy_marking = SetField::Some(candidate_context.label.decoy); self.charge = SetField::Some(candidate_context.charge); - self.decoy_group_id = SetField::Some(candidate_context.digest.decoy_group); + self.decoy_group_id = SetField::Some(candidate_context.label.decoy_group); self } diff --git a/rust/timsseek/src/traits.rs b/rust/timsseek/src/traits.rs index 5c865c4..438726d 100644 --- a/rust/timsseek/src/traits.rs +++ b/rust/timsseek/src/traits.rs @@ -67,6 +67,7 @@ pub trait ScorerQueriable: QueriableData> + QueriableData> + QueriableData> + + MappableRTCycles { } @@ -75,5 +76,20 @@ impl ScorerQueriable for I where I: QueriableData> + QueriableData> + QueriableData> + + MappableRTCycles { } + +pub trait MappableRTCycles { + fn ms1_cycle_mapping( + &self, + ) -> &timscentroid::rt_mapping::CycleToRTMapping; +} + +impl MappableRTCycles for timscentroid::IndexedTimstofPeaks { + fn ms1_cycle_mapping( + &self, + ) -> &timscentroid::rt_mapping::CycleToRTMapping { + self.ms1_cycle_mapping() + } +} diff --git a/rust/timsseek/src/utils/mod.rs b/rust/timsseek/src/utils/mod.rs index 0696e92..1c0a207 100644 --- a/rust/timsseek/src/utils/mod.rs +++ b/rust/timsseek/src/utils/mod.rs @@ -2,5 +2,4 @@ pub mod aligning; pub mod correlation; pub mod elution_group_ops; pub mod math; -pub mod tdf; pub mod top_n_array; diff --git a/rust/timsseek/src/utils/tdf.rs b/rust/timsseek/src/utils/tdf.rs deleted file mode 100644 index 6540d20..0000000 --- a/rust/timsseek/src/utils/tdf.rs +++ /dev/null @@ -1,16 +0,0 @@ -use rusqlite::{ - Connection, - Result, -}; -use std::sync::Arc; - -pub fn get_ms1_frame_times_ms(tdf_path: &str) -> Result> { - let conn = Connection::open(tdf_path)?; - let mut stmt = conn.prepare("SELECT Time FROM Frames WHERE MsMsType == 0")?; - - let times: Vec = stmt - .query_map([], |row| row.get(0))? 
- .collect::>>()?; - let times: Vec = times.iter().map(|&x| (x * 1000.0) as u32).collect(); - Ok(Arc::from(times)) -} diff --git a/rust/timsseek_cli/Cargo.toml b/rust/timsseek_cli/Cargo.toml index 88edd65..7e6dc99 100644 --- a/rust/timsseek_cli/Cargo.toml +++ b/rust/timsseek_cli/Cargo.toml @@ -10,7 +10,7 @@ timsquery = { path = "../timsquery" } regex = "1.11.1" # tracing-profile = { version="0.10.11",features=["perfetto"] } -tracing-profile = { git = "https://github.com/jspaezp/tracing-profile.git" , branch = "feat/aggregate_common" } +tracing-profile = { git = "https://github.com/jspaezp/tracing-profile.git" , branch = "feat/aggregate_common", optional = true } # Workspace-inherited deps clap = { workspace = true, features = ["derive"] } @@ -34,4 +34,4 @@ path = "src/sample_speclib.rs" mimalloc = { workspace = true, features = ["secure"] } [features] -instrumentation = ["timsseek/instrumentation"] +instrumentation = ["timsseek/instrumentation", "tracing-profile"] diff --git a/rust/timsseek_cli/src/config.rs b/rust/timsseek_cli/src/config.rs index 44d4c4a..dd3ed19 100644 --- a/rust/timsseek_cli/src/config.rs +++ b/rust/timsseek_cli/src/config.rs @@ -5,9 +5,6 @@ use serde::{ use std::path::PathBuf; use timsquery::Tolerance; -use crate::cli::Cli; -use crate::errors; - #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Config { pub input: Option, @@ -39,29 +36,3 @@ pub struct AnalysisConfig { pub struct OutputConfig { pub directory: PathBuf, } - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct DigestionConfig { - pub min_length: u32, - pub max_length: u32, - pub max_missed_cleavages: u32, - pub build_decoys: bool, -} - -impl Default for DigestionConfig { - fn default() -> Self { - Self { - min_length: 7, - max_length: 30, - max_missed_cleavages: 2, - build_decoys: true, - } - } -} - -impl Config { - #![allow(dead_code)] - pub fn with_cli_args(_config: Cli) -> Result { - unimplemented!() - } -} diff --git a/rust/timsseek_cli/src/errors.rs b/rust/timsseek_cli/src/errors.rs index b649e05..48c622b 100644 --- a/rust/timsseek_cli/src/errors.rs +++ b/rust/timsseek_cli/src/errors.rs @@ -10,6 +10,9 @@ pub enum CliError { source: String, path: Option, }, + DataReading { + source: String, + }, } impl std::fmt::Display for CliError { @@ -24,6 +27,15 @@ impl std::fmt::Display for CliError { write!(f, "Error reading file: {}", source) } } + CliError::DataReading { source } => write!(f, "Error reading data: {}", source), + } + } +} + +impl From for CliError { + fn from(e: timsquery::DataReadingError) -> Self { + CliError::DataReading { + source: format!("{:?}", e), } } } diff --git a/rust/timsseek_cli/src/main.rs b/rust/timsseek_cli/src/main.rs index 8403675..330b17a 100644 --- a/rust/timsseek_cli/src/main.rs +++ b/rust/timsseek_cli/src/main.rs @@ -6,7 +6,7 @@ mod processing; use clap::Parser; use timsquery::TimsTofPath; use timsquery::models::tolerance::RtTolerance; -use timsquery::serde::load_index_caching; +use timsquery::serde::load_index_auto; use timsquery::utils::TupleRange; use timsseek::scoring::{ ScoringPipeline, @@ -16,10 +16,6 @@ use tracing::{ error, info, }; -use tracing_profile::{ - PrintTreeConfig, - PrintTreeLayer, -}; use tracing_subscriber::filter::EnvFilter; use tracing_subscriber::fmt::format::FmtSpan; use tracing_subscriber::fmt::{ @@ -30,13 +26,18 @@ use tracing_subscriber::{ self, }; +#[cfg(feature = "instrumentation")] +use tracing_profile::{ + PrintTreeConfig, + PrintTreeLayer, +}; + use cli::Cli; use config::{ Config, InputConfig, OutputConfig, }; -use 
std::sync::Arc; // use tracing_profile::PerfettoLayer; #[cfg(target_os = "windows")] @@ -46,21 +47,6 @@ use mimalloc::MiMalloc; #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; -fn get_ms1_rts_as_millis(file: &TimsTofPath) -> Arc<[u32]> { - let reader = file.load_frame_reader().unwrap(); - let mut rts: Vec<_> = reader - .frame_metas - .iter() - .filter_map(|f| match f.ms_level { - timsrust::MSLevel::MS1 => Some((f.rt_in_seconds * 1000.0).round() as u32), - _ => None, - }) - .collect(); - rts.sort_unstable(); - rts.dedup(); - rts.into() -} - fn get_frag_range(file: &TimsTofPath) -> TupleRange { let reader = file.load_frame_reader().unwrap(); let upper_mz = reader @@ -93,6 +79,12 @@ fn get_frag_range(file: &TimsTofPath) -> TupleRange { } fn main() -> std::result::Result<(), errors::CliError> { + let fmt_filter = EnvFilter::builder() + .with_default_directive("info".parse().unwrap()) + .with_env_var("RUST_LOG") + .from_env_lossy(); + + #[cfg(feature = "instrumentation")] let perf_filter = EnvFilter::builder() .with_default_directive("trace".parse().unwrap()) .with_env_var("RUST_PERF_LOG") @@ -100,13 +92,11 @@ fn main() -> std::result::Result<(), errors::CliError> { .add_directive("forust_ml::gradientbooster=warn".parse().unwrap()); // Filter out events but keep spans + #[cfg(feature = "instrumentation")] let events_filter = tracing_subscriber::filter::filter_fn(|metadata| !metadata.is_event()); - let fmt_filter = EnvFilter::builder() - .with_default_directive("info".parse().unwrap()) - .with_env_var("RUST_LOG") - .from_env_lossy(); - + // I am aware that this conditional compilation is ugly ... + #[cfg(feature = "instrumentation")] let (tree_layer, _guard) = PrintTreeLayer::new(PrintTreeConfig { attention_above_percent: 25.0, relevant_above_percent: 2.5, @@ -117,6 +107,10 @@ fn main() -> std::result::Result<(), errors::CliError> { accumulate_events: false, aggregate_similar_siblings: true, }); + #[cfg(feature = "instrumentation")] + let tree_layer = tree_layer + .with_filter(perf_filter) + .with_filter(events_filter); // let (pf_layer, pf_guard) = PerfettoLayer::new_from_env().unwrap(); @@ -124,15 +118,12 @@ fn main() -> std::result::Result<(), errors::CliError> { .with_span_events(FmtSpan::CLOSE) .with_filter(fmt_filter); - tracing_subscriber::registry() - .with(fmt_layer) - .with( - tree_layer - .with_filter(perf_filter) - .with_filter(events_filter), - ) - // .with(pf_layer) - .init(); + let reg = tracing_subscriber::registry().with(fmt_layer); + + #[cfg(feature = "instrumentation")] + let reg = reg.with(tree_layer); + + reg.init(); // Parse command line arguments let args = Cli::parse(); @@ -204,8 +195,15 @@ fn main() -> std::result::Result<(), errors::CliError> { } }; - let index = load_index_caching(file_loc).unwrap(); - let index_cycle_rt_ms = get_ms1_rts_as_millis(&timstofpath); + let index = load_index_auto( + file_loc.to_str().ok_or_else(|| errors::CliError::Io { + source: "Invalid path encoding".to_string(), + path: None, + })?, + None, // Use default config - could add cache_location support in config later + )? 
+ .into_eager()?; + let fragmented_range = get_frag_range(&timstofpath); // Process based on input type @@ -219,7 +217,6 @@ fn main() -> std::result::Result<(), errors::CliError> { // } Some(InputConfig::Speclib { path }) => { let pipeline = ScoringPipeline { - index_cycle_rt_ms, index, tolerances: ToleranceHierarchy { prescore: config.analysis.tolerance.clone(), diff --git a/rust/timsseek_rts/Cargo.toml b/rust/timsseek_rts/Cargo.toml deleted file mode 100644 index e6befba..0000000 --- a/rust/timsseek_rts/Cargo.toml +++ /dev/null @@ -1,28 +0,0 @@ -[package] -name = "timsseek_rts" -version.workspace = true -edition.workspace = true -license.workspace = true - -[dependencies] -timsseek = { path = "../timsseek" } -timsquery = { path = "../timsquery" } - -regex = "1.10.6" - -# Workspace-inherited deps -timsrust = { workspace = true } -tracing = { workspace = true } -tracing-subscriber = { workspace = true, features = [ - "registry", - "env-filter", -] } -serde = { workspace = true } -serde_json = { workspace = true } -rayon = { workspace = true } -clap = { workspace = true, features = ["derive"] } - -[[bin]] -name = "timsseek_rts" -path = "src/main.rs" - diff --git a/rust/timsseek_rts/src/cli.rs b/rust/timsseek_rts/src/cli.rs deleted file mode 100644 index fcc0fd8..0000000 --- a/rust/timsseek_rts/src/cli.rs +++ /dev/null @@ -1,46 +0,0 @@ -use clap::Parser; -use std::path::PathBuf; -use timsquery::Tolerance; -use timsseek::errors::{ - Result, - TimsSeekError, -}; - -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -pub struct Cli { - /// Path to the .d file (will over-write the config file) - #[arg(short, long)] - pub dotd_file: PathBuf, - - /// Path to the speclib file - #[arg(short, long)] - pub config: PathBuf, - - /// Path to the output directory - #[arg(short, long)] - #[clap(default_value("127.0.0.1:3724"))] - pub address: String, -} - -impl Cli { - pub fn read_config(&self) -> Result { - let conf = match std::fs::File::open(&self.config) { - Ok(x) => x, - Err(e) => { - return Err(TimsSeekError::Io { - source: e, - path: None, - }); - } - }; - let config: core::result::Result = serde_json::from_reader(conf); - let config = match config { - Ok(x) => x, - Err(e) => { - return Err(TimsSeekError::ParseError { msg: e.to_string() }); - } - }; - Ok(config) - } -} diff --git a/rust/timsseek_rts/src/index.rs b/rust/timsseek_rts/src/index.rs deleted file mode 100644 index 28848b9..0000000 --- a/rust/timsseek_rts/src/index.rs +++ /dev/null @@ -1,45 +0,0 @@ -use std::time::Instant; - -use timsquery::{ - IndexedTimstofPeaks, - Tolerance, -}; -use timsseek::errors::Result; -use timsseek::utils::tdf::get_ms1_frame_times_ms; -use timsseek::{ - ScoringPipeline, - ToleranceHierarchy, -}; - -pub fn new_index( - dotd_file_location: std::path::PathBuf, - tolerance: Tolerance, -) -> Result> { - let st = Instant::now(); - // Can use centroided for faster queries ... 
- // - let file_loc = dotd_file_location.clone(); - let index = timsquery::serde::load_index_caching(file_loc).unwrap(); - let elap_time = st.elapsed(); - println!( - "Loading index took: {:?} for {}", - elap_time, - dotd_file_location.display() - ); - - let tdf_path = &dotd_file_location.clone().join("analysis.tdf"); - let ref_time_ms = get_ms1_frame_times_ms(tdf_path.to_str().unwrap()).unwrap(); - let fragmented_range = index.fragmented_range(); - - Ok(ScoringPipeline { - index_cycle_rt_ms: ref_time_ms, - index, - tolerances: ToleranceHierarchy { - prescore: tolerance.clone(), - secondary: tolerance.with_rt_tolerance( - timsquery::models::tolerance::RtTolerance::Minutes((0.1, 0.1)), - ), - }, - fragmented_range, - }) -} diff --git a/rust/timsseek_rts/src/main.rs b/rust/timsseek_rts/src/main.rs deleted file mode 100644 index fb5b459..0000000 --- a/rust/timsseek_rts/src/main.rs +++ /dev/null @@ -1,221 +0,0 @@ -use clap::Parser; -use serde_json::{ - Value, - json, -}; -use std::io::{ - BufReader, - Read, - Write, -}; -use std::net::{ - TcpListener, - TcpStream, -}; -use std::sync::Arc; -use std::thread; -use timsquery::IndexedTimstofPeaks; -use timsseek::data_sources::speclib::SerSpeclibElement; -use timsseek::errors::{ - Result, - TimsSeekError, -}; -use timsseek::scoring::ScoringPipeline; - -mod cli; -mod index; - -struct DaemonServer { - index: Arc>, - running: std::sync::atomic::AtomicBool, -} - -impl DaemonServer { - pub fn new(index: ScoringPipeline) -> std::io::Result { - Ok(Self { - index: Arc::new(index), - running: std::sync::atomic::AtomicBool::new(true), - }) - } - - pub fn run(&self, addr: &str) -> std::io::Result<()> { - let listener = TcpListener::bind(addr)?; - println!("Listening on {}", addr); - - while self.running.load(std::sync::atomic::Ordering::Relaxed) { - // listener.set_nonblocking(true)?; - - match listener.accept() { - Ok((stream, _)) => { - let index = Arc::clone(&self.index); - let running = &self.running; - - match handle_connection(stream, index, running) { - Ok(_) => (), - Err(e) => eprintln!("Error handling connection: {}", e), - }; - } - Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => { - thread::sleep(std::time::Duration::from_millis(100)); - continue; - } - Err(e) => eprintln!("Error accepting connection: {}", e), - } - } - - Ok(()) - } -} - -fn handle_connection( - mut stream: TcpStream, - index: Arc>, - _running: &std::sync::atomic::AtomicBool, -) -> std::io::Result<()> { - let mut reader = BufReader::new(stream.try_clone()?); - let mut buffer = String::new(); - - loop { - buffer.clear(); - reader.read_to_string(&mut buffer)?; - - if buffer.is_empty() { - break; - } - println!("read data: {}", buffer); - - let query: Value = match serde_json::from_str(&buffer) { - Ok(q) => q, - Err(e) => { - let response = json!({ - "status": "error", - "data": format!("Invalid JSON format: {}", e) - }); - send_response(&mut stream, &response)?; - continue; - } - }; - - let query: SerSpeclibElement = match serde_json::from_value(query) { - Ok(q) => q, - Err(e) => { - let response = json!({ - "status": "error", - "data": format!("Invalid query format: {}", e) - }); - send_response(&mut stream, &response)?; - continue; - } - }; - - let start = std::time::Instant::now(); - let query_res = index.as_ref().process_query_full(query.into()); - let elap_time = start.elapsed(); - println!("Querying took {:#?} for query", elap_time); - let response = match query_res { - Ok(q) => json!({ - "status": "success", - "data": q - }), - Err(e) => json!({ - "status": "error", 
- "data": format!("{:?}", e) - }), - }; - send_response(&mut stream, &response)?; - } - - Ok(()) -} - -fn send_response(stream: &mut TcpStream, response: &Value) -> std::io::Result<()> { - stream.write_all(response.to_string().as_bytes())?; - stream.write_all(b"\n")?; - Ok(()) -} - -// Example usage -fn main() -> Result<()> { - let conf = cli::Cli::parse(); - // SerSpeclibElement - // let sample = QueryItemToScore::sample(); - let sample = SerSpeclibElement::sample(); - let tol = conf.read_config()?; - let index = index::new_index(conf.dotd_file, tol)?; - - println!("Starting server"); - println!( - "Sample query: \n{}", - serde_json::to_string_pretty(&sample).unwrap() - ); - - let st = std::time::Instant::now(); - match index.process_query_full(sample.into()) { - Ok(_q) => { - println!("Query OK"); - } - Err(e) => { - println!("Query failed: {:?}", e); - // return Err(e.into()); - } - }; - let elap_time = st.elapsed(); - println!("Querying took {:#?} for sample query", elap_time); - - // println!("Query result: \n{}", serde_json::to_string_pretty(&check_out).unwrap()); - - let server = match DaemonServer::new(index) { - Ok(s) => s, - Err(e) => { - return Err(TimsSeekError::Io { - source: e, - path: None, - }); - } - }; - match server.run(&conf.address) { - Ok(_) => Ok(()), - Err(e) => Err(TimsSeekError::Io { - source: e, - path: None, - }), - } -} - -// Example client code -// fn example_client() -> Result<()> { -// let mut stream = TcpStream::connect("127.0.0.1:8080")?; -// -// let query = json!({ -// "type": "search", -// "term": "example" -// }); -// -// match stream.write_all(query.to_string().as_bytes()) { -// Ok(_) => (), -// Err(e) => { -// return Err(TimsSeekError::Io { -// source: e, -// path: None, -// }); -// } -// }; -// match stream.write_all(b"\n") { -// Ok(_) => (), -// Err(e) => { -// return Err(TimsSeekError::Io { -// source: e, -// path: None, -// }); -// } -// }; -// -// let mut reader = BufReader::new(stream); -// let mut response = String::new(); -// reader.read_line(&mut response)?; -// -// let response_json: Value = serde_json::from_str(&response).unwrap(); -// println!("Response: {}", response_json); -// -// Ok(()) -// }