From d6d020f77e0153cea2b193b510908ee1a2d6846c Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:14:14 -0700 Subject: [PATCH 01/41] chore(bench): add bench dependency group and gitignore bench_out/ --- .gitignore | 1 + bench/__init__.py | 0 bench/tests/__init__.py | 0 pyproject.toml | 15 + uv.lock | 737 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 720 insertions(+), 33 deletions(-) create mode 100644 bench/__init__.py create mode 100644 bench/tests/__init__.py diff --git a/.gitignore b/.gitignore index b85c21eb..ec690bc9 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,7 @@ results.json *search_results*/* wandb/* +bench_out/ docs/superpowers/* .plans/* .claude/* diff --git a/bench/__init__.py b/bench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/tests/__init__.py b/bench/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pyproject.toml b/pyproject.toml index 49ab9269..72c1f701 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,21 @@ interactive = [ "matplotlib", "vizta", ] +bench = [ + "wandb[media]", + "loguru", + "pydantic>=2.6", + "pyahocorasick>=2.0", + "pyarrow>=15", + "polars>=1", + "matplotlib>=3.8", + "requests>=2.31", + "responses>=0.25", + "pytest", +] + +[tool.pytest.ini_options] +testpaths = ["bench/tests"] [tool.ruff] target-version = "py312" diff --git a/uv.lock b/uv.lock index 3325b063..01c50faa 100644 --- a/uv.lock +++ b/uv.lock @@ -16,6 +16,35 @@ resolution-markers = [ "python_full_version < '3.12' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'", ] +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size 
= 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "bokeh" +version = "3.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "contourpy" }, + { name = "jinja2" }, + { name = "narwhals" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyyaml" }, + { name = "tornado", marker = "sys_platform != 'emscripten'" }, + { name = "xyzservices" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bf/0d/fabb70707646217e4b0e3943e05730eab8c1f7b7e7485145f8594b52e606/bokeh-3.9.0.tar.gz", hash = "sha256:775219714a8496973ddbae16b1861606ba19fe670a421e4d43267b41148e07a3", size = 5740345, upload-time = "2026-03-11T17:58:34.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/0b/bdf449df87be3f07b23091ceafee8c3ef569cf6d2fb7edec6e3b12b3faa4/bokeh-3.9.0-py3-none-any.whl", hash = "sha256:b252bfb16a505f0e0c57d532d0df308ae1667235bafc622aa9441fe9e7c5ce4a", size = 6396068, upload-time = "2026-03-11T17:58:31.645Z" }, +] + [[package]] name = "bumpver" version = "2025.1131" @@ -31,6 +60,51 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/5b/2d5ea6802495ee4506721977be522804314aa66ad629d9356e3c7e5af4a6/bumpver-2025.1131-py2.py3-none-any.whl", hash = "sha256:c02527f6ed7887afbc06c07630047b24a9f9d02d544a65639e99bf8b92aaa674", size = 65361, upload-time = "2025-07-02T20:36:10.103Z" }, ] +[[package]] +name = "certifi" +version = "2026.4.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = 
"sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077, upload-time = "2026-04-22T11:26:11.191Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707, upload-time = "2026-04-22T11:26:09.372Z" }, +] + +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = "implementation_name != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/4a/3dfd5f7850cbf0d06dc84ba9aa00db766b52ca38d8b86e3a38314d52498c/cffi-2.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe", size = 184344, upload-time = "2025-09-08T23:22:26.456Z" }, + { url = "https://files.pythonhosted.org/packages/4f/8b/f0e4c441227ba756aafbe78f117485b25bb26b1c059d01f137fa6d14896b/cffi-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c", size = 180560, upload-time = "2025-09-08T23:22:28.197Z" }, + { url = "https://files.pythonhosted.org/packages/b1/b7/1200d354378ef52ec227395d95c2576330fd22a869f7a70e88e1447eb234/cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92", size = 209613, upload-time = "2025-09-08T23:22:29.475Z" }, + { url = 
"https://files.pythonhosted.org/packages/b8/56/6033f5e86e8cc9bb629f0077ba71679508bdf54a9a5e112a3c0b91870332/cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93", size = 216476, upload-time = "2025-09-08T23:22:31.063Z" }, + { url = "https://files.pythonhosted.org/packages/dc/7f/55fecd70f7ece178db2f26128ec41430d8720f2d12ca97bf8f0a628207d5/cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5", size = 203374, upload-time = "2025-09-08T23:22:32.507Z" }, + { url = "https://files.pythonhosted.org/packages/84/ef/a7b77c8bdc0f77adc3b46888f1ad54be8f3b7821697a7b89126e829e676a/cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664", size = 202597, upload-time = "2025-09-08T23:22:34.132Z" }, + { url = "https://files.pythonhosted.org/packages/d7/91/500d892b2bf36529a75b77958edfcd5ad8e2ce4064ce2ecfeab2125d72d1/cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26", size = 215574, upload-time = "2025-09-08T23:22:35.443Z" }, + { url = "https://files.pythonhosted.org/packages/44/64/58f6255b62b101093d5df22dcb752596066c7e89dd725e0afaed242a61be/cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9", size = 218971, upload-time = "2025-09-08T23:22:36.805Z" }, + { url = "https://files.pythonhosted.org/packages/ab/49/fa72cebe2fd8a55fbe14956f9970fe8eb1ac59e5df042f603ef7c8ba0adc/cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414", size = 211972, upload-time = "2025-09-08T23:22:38.436Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/28/dd0967a76aab36731b6ebfe64dec4e981aff7e0608f60c2d46b46982607d/cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743", size = 217078, upload-time = "2025-09-08T23:22:39.776Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/015b25184413d7ab0a410775fdb4a50fca20f5589b5dab1dbbfa3baad8ce/cffi-2.0.0-cp311-cp311-win32.whl", hash = "sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5", size = 172076, upload-time = "2025-09-08T23:22:40.95Z" }, + { url = "https://files.pythonhosted.org/packages/ae/8f/dc5531155e7070361eb1b7e4c1a9d896d0cb21c49f807a6c03fd63fc877e/cffi-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5", size = 182820, upload-time = "2025-09-08T23:22:42.463Z" }, + { url = "https://files.pythonhosted.org/packages/95/5c/1b493356429f9aecfd56bc171285a4c4ac8697f76e9bbbbb105e537853a1/cffi-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d", size = 177635, upload-time = "2025-09-08T23:22:43.623Z" }, + { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" }, + { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" }, + { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" }, + { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" }, + { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" }, + { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" }, + { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" }, + { url = 
"https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" }, + { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" }, + { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" }, +] + [[package]] name = "cfgv" version = "3.5.0" @@ -40,6 +114,47 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/3c/33bac158f8ab7f89b2e59426d5fe2e4f63f7ed25df84c036890172b412b5/cfgv-3.5.0-py2.py3-none-any.whl", hash = "sha256:a8dc6b26ad22ff227d2634a65cb388215ce6cc96bbcc5cfde7641ae87e8dacc0", size = 7445, upload-time = "2025-11-19T20:55:50.744Z" }, ] +[[package]] +name = "charset-normalizer" +version = "3.4.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271, upload-time = "2026-04-02T09:28:39.342Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c2/d7/b5b7020a0565c2e9fa8c09f4b5fa6232feb326b8c20081ccded47ea368fd/charset_normalizer-3.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7641bb8895e77f921102f72833904dcd9901df5d6d72a2ab8f31d04b7e51e4e7", size = 309705, upload-time = "2026-04-02T09:26:02.191Z" }, + { url = "https://files.pythonhosted.org/packages/5a/53/58c29116c340e5456724ecd2fff4196d236b98f3da97b404bc5e51ac3493/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:202389074300232baeb53ae2569a60901f7efadd4245cf3a3bf0617d60b439d7", size = 206419, upload-time = "2026-04-02T09:26:03.583Z" }, + { url = "https://files.pythonhosted.org/packages/b2/02/e8146dc6591a37a00e5144c63f29fb7c97a734ea8a111190783c0e60ab63/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:30b8d1d8c52a48c2c5690e152c169b673487a2a58de1ec7393196753063fcd5e", size = 227901, upload-time = "2026-04-02T09:26:04.738Z" }, + { url = "https://files.pythonhosted.org/packages/fb/73/77486c4cd58f1267bf17db420e930c9afa1b3be3fe8c8b8ebbebc9624359/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:532bc9bf33a68613fd7d65e4b1c71a6a38d7d42604ecf239c77392e9b4e8998c", size = 222742, upload-time = "2026-04-02T09:26:06.36Z" }, + { url = "https://files.pythonhosted.org/packages/a1/fa/f74eb381a7d94ded44739e9d94de18dc5edc9c17fb8c11f0a6890696c0a9/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2fe249cb4651fd12605b7288b24751d8bfd46d35f12a20b1ba33dea122e690df", size = 214061, upload-time = "2026-04-02T09:26:08.347Z" }, + { url = "https://files.pythonhosted.org/packages/dc/92/42bd3cefcf7687253fb86694b45f37b733c97f59af3724f356fa92b8c344/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_armv7l.whl", hash = 
"sha256:65bcd23054beab4d166035cabbc868a09c1a49d1efe458fe8e4361215df40265", size = 199239, upload-time = "2026-04-02T09:26:09.823Z" }, + { url = "https://files.pythonhosted.org/packages/4c/3d/069e7184e2aa3b3cddc700e3dd267413dc259854adc3380421c805c6a17d/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:08e721811161356f97b4059a9ba7bafb23ea5ee2255402c42881c214e173c6b4", size = 210173, upload-time = "2026-04-02T09:26:10.953Z" }, + { url = "https://files.pythonhosted.org/packages/62/51/9d56feb5f2e7074c46f93e0ebdbe61f0848ee246e2f0d89f8e20b89ebb8f/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e060d01aec0a910bdccb8be71faf34e7799ce36950f8294c8bf612cba65a2c9e", size = 209841, upload-time = "2026-04-02T09:26:12.142Z" }, + { url = "https://files.pythonhosted.org/packages/d2/59/893d8f99cc4c837dda1fe2f1139079703deb9f321aabcb032355de13b6c7/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:38c0109396c4cfc574d502df99742a45c72c08eff0a36158b6f04000043dbf38", size = 200304, upload-time = "2026-04-02T09:26:13.711Z" }, + { url = "https://files.pythonhosted.org/packages/7d/1d/ee6f3be3464247578d1ed5c46de545ccc3d3ff933695395c402c21fa6b77/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1c2a768fdd44ee4a9339a9b0b130049139b8ce3c01d2ce09f67f5a68048d477c", size = 229455, upload-time = "2026-04-02T09:26:14.941Z" }, + { url = "https://files.pythonhosted.org/packages/54/bb/8fb0a946296ea96a488928bdce8ef99023998c48e4713af533e9bb98ef07/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:1a87ca9d5df6fe460483d9a5bbf2b18f620cbed41b432e2bddb686228282d10b", size = 210036, upload-time = "2026-04-02T09:26:16.478Z" }, + { url = "https://files.pythonhosted.org/packages/9a/bc/015b2387f913749f82afd4fcba07846d05b6d784dd16123cb66860e0237d/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = 
"sha256:d635aab80466bc95771bb78d5370e74d36d1fe31467b6b29b8b57b2a3cd7d22c", size = 224739, upload-time = "2026-04-02T09:26:17.751Z" }, + { url = "https://files.pythonhosted.org/packages/17/ab/63133691f56baae417493cba6b7c641571a2130eb7bceba6773367ab9ec5/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ae196f021b5e7c78e918242d217db021ed2a6ace2bc6ae94c0fc596221c7f58d", size = 216277, upload-time = "2026-04-02T09:26:18.981Z" }, + { url = "https://files.pythonhosted.org/packages/06/6d/3be70e827977f20db77c12a97e6a9f973631a45b8d186c084527e53e77a4/charset_normalizer-3.4.7-cp311-cp311-win32.whl", hash = "sha256:adb2597b428735679446b46c8badf467b4ca5f5056aae4d51a19f9570301b1ad", size = 147819, upload-time = "2026-04-02T09:26:20.295Z" }, + { url = "https://files.pythonhosted.org/packages/20/d9/5f67790f06b735d7c7637171bbfd89882ad67201891b7275e51116ed8207/charset_normalizer-3.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:8e385e4267ab76874ae30db04c627faaaf0b509e1ccc11a95b3fc3e83f855c00", size = 159281, upload-time = "2026-04-02T09:26:21.74Z" }, + { url = "https://files.pythonhosted.org/packages/ca/83/6413f36c5a34afead88ce6f66684d943d91f233d76dd083798f9602b75ae/charset_normalizer-3.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:d4a48e5b3c2a489fae013b7589308a40146ee081f6f509e047e0e096084ceca1", size = 147843, upload-time = "2026-04-02T09:26:22.901Z" }, + { url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328, upload-time = "2026-04-02T09:26:24.331Z" }, + { url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061, upload-time = "2026-04-02T09:26:25.568Z" }, + { url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031, upload-time = "2026-04-02T09:26:26.865Z" }, + { url = "https://files.pythonhosted.org/packages/dc/67/675a46eb016118a2fbde5a277a5d15f4f69d5f3f5f338e5ee2f8948fcf43/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a", size = 225239, upload-time = "2026-04-02T09:26:28.044Z" }, + { url = "https://files.pythonhosted.org/packages/4b/f8/d0118a2f5f23b02cd166fa385c60f9b0d4f9194f574e2b31cef350ad7223/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116", size = 216589, upload-time = "2026-04-02T09:26:29.239Z" }, + { url = "https://files.pythonhosted.org/packages/b1/f1/6d2b0b261b6c4ceef0fcb0d17a01cc5bc53586c2d4796fa04b5c540bc13d/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb", size = 202733, upload-time = "2026-04-02T09:26:30.5Z" }, + { url = "https://files.pythonhosted.org/packages/6f/c0/7b1f943f7e87cc3db9626ba17807d042c38645f0a1d4415c7a14afb5591f/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1", size = 212652, upload-time = "2026-04-02T09:26:31.709Z" }, + { url = 
"https://files.pythonhosted.org/packages/38/dd/5a9ab159fe45c6e72079398f277b7d2b523e7f716acc489726115a910097/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15", size = 211229, upload-time = "2026-04-02T09:26:33.282Z" }, + { url = "https://files.pythonhosted.org/packages/d5/ff/531a1cad5ca855d1c1a8b69cb71abfd6d85c0291580146fda7c82857caa1/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5", size = 203552, upload-time = "2026-04-02T09:26:34.845Z" }, + { url = "https://files.pythonhosted.org/packages/c1/4c/a5fb52d528a8ca41f7598cb619409ece30a169fbdf9cdce592e53b46c3a6/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d", size = 230806, upload-time = "2026-04-02T09:26:36.152Z" }, + { url = "https://files.pythonhosted.org/packages/59/7a/071feed8124111a32b316b33ae4de83d36923039ef8cf48120266844285b/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7", size = 212316, upload-time = "2026-04-02T09:26:37.672Z" }, + { url = "https://files.pythonhosted.org/packages/fd/35/f7dba3994312d7ba508e041eaac39a36b120f32d4c8662b8814dab876431/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464", size = 227274, upload-time = "2026-04-02T09:26:38.93Z" }, + { url = "https://files.pythonhosted.org/packages/8a/2d/a572df5c9204ab7688ec1edc895a73ebded3b023bb07364710b05dd1c9be/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49", size = 218468, upload-time = "2026-04-02T09:26:40.17Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/eb/890922a8b03a568ca2f336c36585a4713c55d4d67bf0f0c78924be6315ca/charset_normalizer-3.4.7-cp312-cp312-win32.whl", hash = "sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c", size = 148460, upload-time = "2026-04-02T09:26:41.416Z" }, + { url = "https://files.pythonhosted.org/packages/35/d9/0e7dffa06c5ab081f75b1b786f0aefc88365825dfcd0ac544bdb7b2b6853/charset_normalizer-3.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6", size = 159330, upload-time = "2026-04-02T09:26:42.554Z" }, + { url = "https://files.pythonhosted.org/packages/9e/5d/481bcc2a7c88ea6b0878c299547843b2521ccbc40980cb406267088bc701/charset_normalizer-3.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d", size = 147828, upload-time = "2026-04-02T09:26:44.075Z" }, + { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" }, +] + [[package]] name = "click" version = "8.3.2" @@ -108,6 +223,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + [[package]] name = "distlib" version = "0.4.0" @@ -151,6 +275,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fd/ba/56147c165442cc5ba7e82ecf301c9a68353cede498185869e6e02b4c264f/fonttools-4.62.1-py3-none-any.whl", hash = "sha256:7487782e2113861f4ddcc07c3436450659e3caa5e470b27dc2177cade2d8e7fd", size = 1152647, upload-time = "2026-03-13T13:54:22.735Z" }, ] +[[package]] +name = "gitdb" +version = "4.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "smmap" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" }, +] + +[[package]] +name = "gitpython" +version = "3.1.50" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gitdb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/f6/354ae6491228b5eb40e10d89c4d13c651fe1cf7556e35ebdded50cff57ce/gitpython-3.1.50.tar.gz", hash = "sha256:80da2d12504d52e1f998772dc5baf6e553f8d2fcfe1fcc226c9d9a2ee3372dcc", size = 219798, upload-time = "2026-05-06T04:01:26.571Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/20/7a/1c6e3562dfd8950adbb11ffbc65d21e7c89d01a6e4f137fa981056de25c5/gitpython-3.1.50-py3-none-any.whl", hash = "sha256:d352abe2908d07355014abdd21ddf798c2a961469239afec4962e9da884858f9", size = 212507, upload-time = "2026-05-06T04:01:23.799Z" }, +] + [[package]] name = "identify" version = "2.6.18" @@ -160,6 +308,42 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/46/33/92ef41c6fad0233e41d3d84ba8e8ad18d1780f1e5d99b3c683e6d7f98b63/identify-2.6.18-py2.py3-none-any.whl", hash = "sha256:8db9d3c8ea9079db92cafb0ebf97abdc09d52e97f4dcf773a2e694048b7cd737", size = 99394, upload-time = "2026-03-15T18:39:48.915Z" }, ] +[[package]] +name = "idna" +version = "3.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ce/cc/762dfb036166873f0059f3b7de4565e1b5bc3d6f28a414c13da27e442f99/idna-3.13.tar.gz", hash = "sha256:585ea8fe5d69b9181ec1afba340451fba6ba764af97026f92a91d4eef164a242", size = 194210, upload-time = "2026-04-22T16:42:42.314Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" }, +] + +[[package]] +name = "imageio" +version = "2.37.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/84/93bcd1300216ea50811cee96873b84a1bebf8d0489ffaf7f2a3756bab866/imageio-2.37.3.tar.gz", hash = "sha256:bbb37efbfc4c400fcd534b367b91fcd66d5da639aaa138034431a1c5e0a41451", size = 389673, upload-time = "2026-03-09T11:31:12.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/fa/391e437a34e55095173dca5f24070d89cbc233ff85bf1c29c93248c6588d/imageio-2.37.3-py3-none-any.whl", hash = 
"sha256:46f5bb8522cd421c0f5ae104d8268f569d856b29eb1a13b92829d1970f32c9f0", size = 317646, upload-time = "2026-03-09T11:31:10.771Z" }, +] + +[[package]] +name = "imageio-ffmpeg" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/bd/c3343c721f2a1b0c9fc71c1aebf1966a3b7f08c2eea8ed5437a2865611d6/imageio_ffmpeg-0.6.0.tar.gz", hash = "sha256:e2556bed8e005564a9f925bb7afa4002d82770d6b08825078b7697ab88ba1755", size = 25210, upload-time = "2025-01-16T21:34:32.747Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/58/87ef68ac83f4c7690961bce288fd8e382bc5f1513860fc7f90a9c1c1c6bf/imageio_ffmpeg-0.6.0-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = "sha256:9d2baaf867088508d4a3458e61eeb30e945c4ad8016025545f66c4b5aaef0a61", size = 24932969, upload-time = "2025-01-16T21:34:20.464Z" }, + { url = "https://files.pythonhosted.org/packages/40/5c/f3d8a657d362cc93b81aab8feda487317da5b5d31c0e1fdfd5e986e55d17/imageio_ffmpeg-0.6.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b1ae3173414b5fc5f538a726c4e48ea97edc0d2cdc11f103afee655c463fa742", size = 21113891, upload-time = "2025-01-16T21:34:00.277Z" }, + { url = "https://files.pythonhosted.org/packages/33/e7/1925bfbc563c39c1d2e82501d8372734a5c725e53ac3b31b4c2d081e895b/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1d47bebd83d2c5fc770720d211855f208af8a596c82d17730aa51e815cdee6dc", size = 25632706, upload-time = "2025-01-16T21:33:53.475Z" }, + { url = "https://files.pythonhosted.org/packages/a0/2d/43c8522a2038e9d0e7dbdf3a61195ecc31ca576fb1527a528c877e87d973/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:c7e46fcec401dd990405049d2e2f475e2b397779df2519b544b8aab515195282", size = 29498237, upload-time = "2025-01-16T21:34:13.726Z" }, + { url = "https://files.pythonhosted.org/packages/a0/13/59da54728351883c3c1d9fca1710ab8eee82c7beba585df8f25ca925f08f/imageio_ffmpeg-0.6.0-py3-none-win32.whl", 
hash = "sha256:196faa79366b4a82f95c0f4053191d2013f4714a715780f0ad2a68ff37483cc2", size = 19652251, upload-time = "2025-01-16T21:34:06.812Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c6/fa760e12a2483469e2bf5058c5faff664acf66cadb4df2ad6205b016a73d/imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02fa47c83703c37df6bfe4896aab339013f62bf02c5ebf2dce6da56af04ffc0a", size = 31246824, upload-time = "2025-01-16T21:34:28.6Z" }, +] + [[package]] name = "iniconfig" version = "2.3.0" @@ -169,6 +353,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + [[package]] name = "kiwisolver" version = "1.5.0" @@ -225,6 +421,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/e3/35764404a4b7e2021be1f88f42264c2e92e0c4720273559a62461ce64a47/lexid-2021.1006-py2.py3-none-any.whl", hash = "sha256:5526bb5606fd74c7add23320da5f02805bddd7c77916f2dc1943e6bada8605ed", size = 7587, upload-time = "2021-04-02T20:18:33.129Z" }, ] +[[package]] +name = "loguru" +version = "0.7.3" 
+source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" }, + { url = 
"https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, + { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, + { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, + { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, + { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, + { url 
= "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" }, + { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" }, + { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" }, + { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" }, + { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, + { url = 
"https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, + { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" }, + { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, + { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, + { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, + { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" }, + { url = 
"https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" }, + { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, +] + [[package]] name = "matplotlib" version = "3.10.8" @@ -276,6 +515,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/30/b6617c74a8234ff60265373ef730eb6378ccdda74042f51f9ac936191664/mizani-0.14.4-py3-none-any.whl", hash = "sha256:ed72bf249e2a18b5dcc65cd54c7eaa5444b2cb09c7e18aafa2ab6f05f1b78620", size = 133471, upload-time = "2026-01-28T14:42:16.328Z" }, ] +[[package]] +name = "moviepy" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "decorator" }, + { name = "imageio" }, + { name = "imageio-ffmpeg" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "proglog" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/de/61/15f9476e270f64c78a834e7459ca045d669f869cec24eed26807b8cd479d/moviepy-2.2.1.tar.gz", hash = "sha256:c80cb56815ece94e5e3e2d361aa40070eeb30a09d23a24c4e684d03e16deacb1", size = 58431438, upload-time = "2025-05-21T19:31:52.601Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/73/7d3b2010baa0b5eb1e4dfa9e4385e89b6716be76f2fa21a6c0fe34b68e5a/moviepy-2.2.1-py3-none-any.whl", hash = "sha256:6b56803fec2ac54b557404126ac1160e65448e03798fa282bd23e8fab3795060", size = 129871, upload-time = "2025-05-21T19:31:50.11Z" }, +] + +[[package]] +name = "narwhals" +version = "2.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/2d/0e/3ad61eb87088cc4932e0d851531fa82f845a6230b68b091a0e298cc7e537/narwhals-2.21.0.tar.gz", hash = "sha256:7c6e7f50528e62b7a967dd864d7e117d2955d38d4f730653ce46a9861358e2dc", size = 633083, upload-time = "2026-05-08T12:29:02.587Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/e1/68c2256b69a314eba133673377ba9118c356f6342a0c02b61de449cf2bf2/narwhals-2.21.0-py3-none-any.whl", hash = "sha256:1e6617d0fca68ae1fda29e5397c4eaacd3ffc9fffe6bcd6ded0c690475e853be", size = 451943, upload-time = "2026-05-08T12:29:01.058Z" }, +] + [[package]] name = "nodeenv" version = "1.10.0" @@ -374,39 +640,39 @@ wheels = [ [[package]] name = "pillow" -version = "12.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819, upload-time = "2026-04-01T14:46:17.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/68/e1/748f5663efe6edcfc4e74b2b93edfb9b8b99b67f21a854c3ae416500a2d9/pillow-12.2.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:8be29e59487a79f173507c30ddf57e733a357f67881430449bb32614075a40ab", size = 5354347, upload-time = "2026-04-01T14:42:44.255Z" }, - { url = "https://files.pythonhosted.org/packages/47/a1/d5ff69e747374c33a3b53b9f98cca7889fce1fd03d79cdc4e1bccc6c5a87/pillow-12.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:71cde9a1e1551df7d34a25462fc60325e8a11a82cc2e2f54578e5e9a1e153d65", size = 4695873, upload-time = "2026-04-01T14:42:46.452Z" }, - { url = "https://files.pythonhosted.org/packages/df/21/e3fbdf54408a973c7f7f89a23b2cb97a7ef30c61ab4142af31eee6aebc88/pillow-12.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f490f9368b6fc026f021db16d7ec2fbf7d89e2edb42e8ec09d2c60505f5729c7", size = 6280168, upload-time = 
"2026-04-01T14:42:49.228Z" }, - { url = "https://files.pythonhosted.org/packages/d3/f1/00b7278c7dd52b17ad4329153748f87b6756ec195ff786c2bdf12518337d/pillow-12.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8bd7903a5f2a4545f6fd5935c90058b89d30045568985a71c79f5fd6edf9b91e", size = 8088188, upload-time = "2026-04-01T14:42:51.735Z" }, - { url = "https://files.pythonhosted.org/packages/ad/cf/220a5994ef1b10e70e85748b75649d77d506499352be135a4989c957b701/pillow-12.2.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3997232e10d2920a68d25191392e3a4487d8183039e1c74c2297f00ed1c50705", size = 6394401, upload-time = "2026-04-01T14:42:54.343Z" }, - { url = "https://files.pythonhosted.org/packages/e9/bd/e51a61b1054f09437acfbc2ff9106c30d1eb76bc1453d428399946781253/pillow-12.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e74473c875d78b8e9d5da2a70f7099549f9eb37ded4e2f6a463e60125bccd176", size = 7079655, upload-time = "2026-04-01T14:42:56.954Z" }, - { url = "https://files.pythonhosted.org/packages/6b/3d/45132c57d5fb4b5744567c3817026480ac7fc3ce5d4c47902bc0e7f6f853/pillow-12.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:56a3f9c60a13133a98ecff6197af34d7824de9b7b38c3654861a725c970c197b", size = 6503105, upload-time = "2026-04-01T14:42:59.847Z" }, - { url = "https://files.pythonhosted.org/packages/7d/2e/9df2fc1e82097b1df3dce58dc43286aa01068e918c07574711fcc53e6fb4/pillow-12.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90e6f81de50ad6b534cab6e5aef77ff6e37722b2f5d908686f4a5c9eba17a909", size = 7203402, upload-time = "2026-04-01T14:43:02.664Z" }, - { url = "https://files.pythonhosted.org/packages/bd/2e/2941e42858ebb67e50ae741473de81c2984e6eff7b397017623c676e2e8d/pillow-12.2.0-cp311-cp311-win32.whl", hash = "sha256:8c984051042858021a54926eb597d6ee3012393ce9c181814115df4c60b9a808", size = 6378149, upload-time = "2026-04-01T14:43:05.274Z" }, - { url = 
"https://files.pythonhosted.org/packages/69/42/836b6f3cd7f3e5fa10a1f1a5420447c17966044c8fbf589cc0452d5502db/pillow-12.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e6b2a0c538fc200b38ff9eb6628228b77908c319a005815f2dde585a0664b60", size = 7082626, upload-time = "2026-04-01T14:43:08.557Z" }, - { url = "https://files.pythonhosted.org/packages/c2/88/549194b5d6f1f494b485e493edc6693c0a16f4ada488e5bd974ed1f42fad/pillow-12.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:9a8a34cc89c67a65ea7437ce257cea81a9dad65b29805f3ecee8c8fe8ff25ffe", size = 2463531, upload-time = "2026-04-01T14:43:10.743Z" }, - { url = "https://files.pythonhosted.org/packages/58/be/7482c8a5ebebbc6470b3eb791812fff7d5e0216c2be3827b30b8bb6603ed/pillow-12.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2d192a155bbcec180f8564f693e6fd9bccff5a7af9b32e2e4bf8c9c69dbad6b5", size = 5308279, upload-time = "2026-04-01T14:43:13.246Z" }, - { url = "https://files.pythonhosted.org/packages/d8/95/0a351b9289c2b5cbde0bacd4a83ebc44023e835490a727b2a3bd60ddc0f4/pillow-12.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3f40b3c5a968281fd507d519e444c35f0ff171237f4fdde090dd60699458421", size = 4695490, upload-time = "2026-04-01T14:43:15.584Z" }, - { url = "https://files.pythonhosted.org/packages/de/af/4e8e6869cbed569d43c416fad3dc4ecb944cb5d9492defaed89ddd6fe871/pillow-12.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:03e7e372d5240cc23e9f07deca4d775c0817bffc641b01e9c3af208dbd300987", size = 6284462, upload-time = "2026-04-01T14:43:18.268Z" }, - { url = "https://files.pythonhosted.org/packages/e9/9e/c05e19657fd57841e476be1ab46c4d501bffbadbafdc31a6d665f8b737b6/pillow-12.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b86024e52a1b269467a802258c25521e6d742349d760728092e1bc2d135b4d76", size = 8094744, upload-time = "2026-04-01T14:43:20.716Z" }, - { url = 
"https://files.pythonhosted.org/packages/2b/54/1789c455ed10176066b6e7e6da1b01e50e36f94ba584dc68d9eebfe9156d/pillow-12.2.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7371b48c4fa448d20d2714c9a1f775a81155050d383333e0a6c15b1123dda005", size = 6398371, upload-time = "2026-04-01T14:43:23.443Z" }, - { url = "https://files.pythonhosted.org/packages/43/e3/fdc657359e919462369869f1c9f0e973f353f9a9ee295a39b1fea8ee1a77/pillow-12.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62f5409336adb0663b7caa0da5c7d9e7bdbaae9ce761d34669420c2a801b2780", size = 7087215, upload-time = "2026-04-01T14:43:26.758Z" }, - { url = "https://files.pythonhosted.org/packages/8b/f8/2f6825e441d5b1959d2ca5adec984210f1ec086435b0ed5f52c19b3b8a6e/pillow-12.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:01afa7cf67f74f09523699b4e88c73fb55c13346d212a59a2db1f86b0a63e8c5", size = 6509783, upload-time = "2026-04-01T14:43:29.56Z" }, - { url = "https://files.pythonhosted.org/packages/67/f9/029a27095ad20f854f9dba026b3ea6428548316e057e6fc3545409e86651/pillow-12.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc3d34d4a8fbec3e88a79b92e5465e0f9b842b628675850d860b8bd300b159f5", size = 7212112, upload-time = "2026-04-01T14:43:32.091Z" }, - { url = "https://files.pythonhosted.org/packages/be/42/025cfe05d1be22dbfdb4f264fe9de1ccda83f66e4fc3aac94748e784af04/pillow-12.2.0-cp312-cp312-win32.whl", hash = "sha256:58f62cc0f00fd29e64b29f4fd923ffdb3859c9f9e6105bfc37ba1d08994e8940", size = 6378489, upload-time = "2026-04-01T14:43:34.601Z" }, - { url = "https://files.pythonhosted.org/packages/5d/7b/25a221d2c761c6a8ae21bfa3874988ff2583e19cf8a27bf2fee358df7942/pillow-12.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7f84204dee22a783350679a0333981df803dac21a0190d706a50475e361c93f5", size = 7084129, upload-time = "2026-04-01T14:43:37.213Z" }, - { url = 
"https://files.pythonhosted.org/packages/10/e1/542a474affab20fd4a0f1836cb234e8493519da6b76899e30bcc5d990b8b/pillow-12.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:af73337013e0b3b46f175e79492d96845b16126ddf79c438d7ea7ff27783a414", size = 2463612, upload-time = "2026-04-01T14:43:39.421Z" }, - { url = "https://files.pythonhosted.org/packages/4e/b7/2437044fb910f499610356d1352e3423753c98e34f915252aafecc64889f/pillow-12.2.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538bd5e05efec03ae613fd89c4ce0368ecd2ba239cc25b9f9be7ed426b0af1f", size = 5273969, upload-time = "2026-04-01T14:45:55.538Z" }, - { url = "https://files.pythonhosted.org/packages/f6/f4/8316e31de11b780f4ac08ef3654a75555e624a98db1056ecb2122d008d5a/pillow-12.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:394167b21da716608eac917c60aa9b969421b5dcbbe02ae7f013e7b85811c69d", size = 4659674, upload-time = "2026-04-01T14:45:58.093Z" }, - { url = "https://files.pythonhosted.org/packages/d4/37/664fca7201f8bb2aa1d20e2c3d5564a62e6ae5111741966c8319ca802361/pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d04bfa02cc2d23b497d1e90a0f927070043f6cbf303e738300532379a4b4e0f", size = 5288479, upload-time = "2026-04-01T14:46:01.141Z" }, - { url = "https://files.pythonhosted.org/packages/49/62/5b0ed78fce87346be7a5cfcfaaad91f6a1f98c26f86bdbafa2066c647ef6/pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0c838a5125cee37e68edec915651521191cef1e6aa336b855f495766e77a366e", size = 7032230, upload-time = "2026-04-01T14:46:03.874Z" }, - { url = "https://files.pythonhosted.org/packages/c3/28/ec0fc38107fc32536908034e990c47914c57cd7c5a3ece4d8d8f7ffd7e27/pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a6c9fa44005fa37a91ebfc95d081e8079757d2e904b27103f4f5fa6f0bf78c0", size = 5355404, upload-time = "2026-04-01T14:46:06.33Z" }, - { url = 
"https://files.pythonhosted.org/packages/5e/8b/51b0eddcfa2180d60e41f06bd6d0a62202b20b59c68f5a132e615b75aecf/pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:25373b66e0dd5905ed63fa3cae13c82fbddf3079f2c8bf15c6fb6a35586324c1", size = 6002215, upload-time = "2026-04-01T14:46:08.83Z" }, - { url = "https://files.pythonhosted.org/packages/bc/60/5382c03e1970de634027cee8e1b7d39776b778b81812aaf45b694dfe9e28/pillow-12.2.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:bfa9c230d2fe991bed5318a5f119bd6780cda2915cca595393649fc118ab895e", size = 7080946, upload-time = "2026-04-01T14:46:11.734Z" }, +version = "11.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069, upload-time = "2025-07-01T09:16:30.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/26/77f8ed17ca4ffd60e1dcd220a6ec6d71210ba398cfa33a13a1cd614c5613/pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722", size = 5316531, upload-time = "2025-07-01T09:13:59.203Z" }, + { url = "https://files.pythonhosted.org/packages/cb/39/ee475903197ce709322a17a866892efb560f57900d9af2e55f86db51b0a5/pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288", size = 4686560, upload-time = "2025-07-01T09:14:01.101Z" }, + { url = "https://files.pythonhosted.org/packages/d5/90/442068a160fd179938ba55ec8c97050a612426fae5ec0a764e345839f76d/pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d", size = 5870978, upload-time = "2025-07-03T13:09:55.638Z" }, + { url = 
"https://files.pythonhosted.org/packages/13/92/dcdd147ab02daf405387f0218dcf792dc6dd5b14d2573d40b4caeef01059/pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494", size = 7641168, upload-time = "2025-07-03T13:10:00.37Z" }, + { url = "https://files.pythonhosted.org/packages/6e/db/839d6ba7fd38b51af641aa904e2960e7a5644d60ec754c046b7d2aee00e5/pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58", size = 5973053, upload-time = "2025-07-01T09:14:04.491Z" }, + { url = "https://files.pythonhosted.org/packages/f2/2f/d7675ecae6c43e9f12aa8d58b6012683b20b6edfbdac7abcb4e6af7a3784/pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f", size = 6640273, upload-time = "2025-07-01T09:14:06.235Z" }, + { url = "https://files.pythonhosted.org/packages/45/ad/931694675ede172e15b2ff03c8144a0ddaea1d87adb72bb07655eaffb654/pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e", size = 6082043, upload-time = "2025-07-01T09:14:07.978Z" }, + { url = "https://files.pythonhosted.org/packages/3a/04/ba8f2b11fc80d2dd462d7abec16351b45ec99cbbaea4387648a44190351a/pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94", size = 6715516, upload-time = "2025-07-01T09:14:10.233Z" }, + { url = "https://files.pythonhosted.org/packages/48/59/8cd06d7f3944cc7d892e8533c56b0acb68399f640786313275faec1e3b6f/pillow-11.3.0-cp311-cp311-win32.whl", hash = "sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0", size = 6274768, upload-time = "2025-07-01T09:14:11.921Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/cc/29c0f5d64ab8eae20f3232da8f8571660aa0ab4b8f1331da5c2f5f9a938e/pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac", size = 6986055, upload-time = "2025-07-01T09:14:13.623Z" }, + { url = "https://files.pythonhosted.org/packages/c6/df/90bd886fabd544c25addd63e5ca6932c86f2b701d5da6c7839387a076b4a/pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd", size = 2423079, upload-time = "2025-07-01T09:14:15.268Z" }, + { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800, upload-time = "2025-07-01T09:14:17.648Z" }, + { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296, upload-time = "2025-07-01T09:14:19.828Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1e/b9e12bbe6e4c2220effebc09ea0923a07a6da1e1f1bfbc8d7d29a01ce32b/pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d", size = 5871726, upload-time = "2025-07-03T13:10:04.448Z" }, + { url = "https://files.pythonhosted.org/packages/8d/33/e9200d2bd7ba00dc3ddb78df1198a6e80d7669cce6c2bdbeb2530a74ec58/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6", size = 7644652, upload-time = "2025-07-03T13:10:10.391Z" }, + { url = 
"https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787, upload-time = "2025-07-01T09:14:21.63Z" }, + { url = "https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236, upload-time = "2025-07-01T09:14:23.321Z" }, + { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950, upload-time = "2025-07-01T09:14:25.237Z" }, + { url = "https://files.pythonhosted.org/packages/0b/1a/7cff92e695a2a29ac1958c2a0fe4c0b2393b60aac13b04a4fe2735cad52d/pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d", size = 6723358, upload-time = "2025-07-01T09:14:27.053Z" }, + { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079, upload-time = "2025-07-01T09:14:30.104Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324, upload-time = "2025-07-01T09:14:31.899Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067, upload-time = "2025-07-01T09:14:33.709Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e3/6fa84033758276fb31da12e5fb66ad747ae83b93c67af17f8c6ff4cc8f34/pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6", size = 5270566, upload-time = "2025-07-01T09:16:19.801Z" }, + { url = "https://files.pythonhosted.org/packages/5b/ee/e8d2e1ab4892970b561e1ba96cbd59c0d28cf66737fc44abb2aec3795a4e/pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438", size = 4654618, upload-time = "2025-07-01T09:16:21.818Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6d/17f80f4e1f0761f02160fc433abd4109fa1548dcfdca46cfdadaf9efa565/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3", size = 4874248, upload-time = "2025-07-03T13:11:20.738Z" }, + { url = "https://files.pythonhosted.org/packages/de/5f/c22340acd61cef960130585bbe2120e2fd8434c214802f07e8c03596b17e/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c", size = 6583963, upload-time = "2025-07-03T13:11:26.283Z" }, + { url = "https://files.pythonhosted.org/packages/31/5e/03966aedfbfcbb4d5f8aa042452d3361f325b963ebbadddac05b122e47dd/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361", size = 4957170, upload-time = "2025-07-01T09:16:23.762Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/2d/e082982aacc927fc2cab48e1e731bdb1643a1406acace8bed0900a61464e/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7", size = 5581505, upload-time = "2025-07-01T09:16:25.593Z" }, + { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598, upload-time = "2025-07-01T09:16:27.732Z" }, ] [[package]] @@ -418,6 +684,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/75/a6/a0a304dc33b49145b21f4808d763822111e67d1c3a32b524a1baf947b6e1/platformdirs-4.9.6-py3-none-any.whl", hash = "sha256:e61adb1d5e5cb3441b4b7710bea7e4c12250ca49439228cc1021c00dcfac0917", size = 21348, upload-time = "2026-04-09T00:04:09.463Z" }, ] +[[package]] +name = "plotly" +version = "6.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "narwhals" }, + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/7f/0f100df1172aadf88a929a9dbb902656b0880ba4b960fe5224867159d8f4/plotly-6.7.0.tar.gz", hash = "sha256:45eea0ff27e2a23ccd62776f77eb43aa1ca03df4192b76036e380bb479b892c6", size = 6911286, upload-time = "2026-04-09T20:36:45.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/ad/cba91b3bcf04073e4d1655a5c1710ef3f457f56f7d1b79dcc3d72f4dd912/plotly-6.7.0-py3-none-any.whl", hash = "sha256:ac8aca1c25c663a59b5b9140a549264a5badde2e057d79b8c772ae2920e32ff0", size = 9898444, upload-time = "2026-04-09T20:36:39.812Z" }, +] + [[package]] name = "plotnine" version = "0.15.3" @@ -488,6 +767,158 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/19/fd3ef348460c80af7bb4669ea7926651d1f95c23ff2df18b9d24bab4f3fa/pre_commit-4.5.1-py2.py3-none-any.whl", hash = 
"sha256:3b3afd891e97337708c1674210f8eba659b52a38ea5f822ff142d10786221f77", size = 226437, upload-time = "2025-12-16T21:14:32.409Z" }, ] +[[package]] +name = "proglog" +version = "0.1.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/af/c108866c452eda1132f3d6b3cb6be2ae8430c97e9309f38ca9dbd430af37/proglog-0.1.12.tar.gz", hash = "sha256:361ee074721c277b89b75c061336cb8c5f287c92b043efa562ccf7866cda931c", size = 8794, upload-time = "2025-05-09T14:36:18.316Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/1b/f7ea6cde25621cd9236541c66ff018f4268012a534ec31032bcb187dc5e7/proglog-0.1.12-py3-none-any.whl", hash = "sha256:ccaafce51e80a81c65dc907a460c07ccb8ec1f78dc660cfd8f9ec3a22f01b84c", size = 6337, upload-time = "2025-05-09T14:36:16.798Z" }, +] + +[[package]] +name = "protobuf" +version = "7.34.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/6b/a0e95cad1ad7cc3f2c6821fcab91671bd5b78bd42afb357bb4765f29bc41/protobuf-7.34.1.tar.gz", hash = "sha256:9ce42245e704cc5027be797c1db1eb93184d44d1cdd71811fb2d9b25ad541280", size = 454708, upload-time = "2026-03-20T17:34:47.036Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/11/3325d41e6ee15bf1125654301211247b042563bcc898784351252549a8ad/protobuf-7.34.1-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:d8b2cc79c4d8f62b293ad9b11ec3aebce9af481fa73e64556969f7345ebf9fc7", size = 429247, upload-time = "2026-03-20T17:34:37.024Z" }, + { url = "https://files.pythonhosted.org/packages/eb/9d/aa69df2724ff63efa6f72307b483ce0827f4347cc6d6df24b59e26659fef/protobuf-7.34.1-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:5185e0e948d07abe94bb76ec9b8416b604cfe5da6f871d67aad30cbf24c3110b", size = 325753, upload-time = "2026-03-20T17:34:38.751Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/e8/d174c91fd48e50101943f042b09af9029064810b734e4160bbe282fa1caa/protobuf-7.34.1-cp310-abi3-manylinux2014_s390x.whl", hash = "sha256:403b093a6e28a960372b44e5eb081775c9b056e816a8029c61231743d63f881a", size = 340198, upload-time = "2026-03-20T17:34:39.871Z" }, + { url = "https://files.pythonhosted.org/packages/53/1b/3b431694a4dc6d37b9f653f0c64b0a0d9ec074ee810710c0c3da21d67ba7/protobuf-7.34.1-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:8ff40ce8cd688f7265326b38d5a1bed9bfdf5e6723d49961432f83e21d5713e4", size = 324267, upload-time = "2026-03-20T17:34:41.1Z" }, + { url = "https://files.pythonhosted.org/packages/85/29/64de04a0ac142fb685fd09999bc3d337943fb386f3a0ec57f92fd8203f97/protobuf-7.34.1-cp310-abi3-win32.whl", hash = "sha256:34b84ce27680df7cca9f231043ada0daa55d0c44a2ddfaa58ec1d0d89d8bf60a", size = 426628, upload-time = "2026-03-20T17:34:42.536Z" }, + { url = "https://files.pythonhosted.org/packages/4d/87/cb5e585192a22b8bd457df5a2c16a75ea0db9674c3a0a39fc9347d84e075/protobuf-7.34.1-cp310-abi3-win_amd64.whl", hash = "sha256:e97b55646e6ce5cbb0954a8c28cd39a5869b59090dfaa7df4598a7fba869468c", size = 437901, upload-time = "2026-03-20T17:34:44.112Z" }, + { url = "https://files.pythonhosted.org/packages/88/95/608f665226bca68b736b79e457fded9a2a38c4f4379a4a7614303d9db3bc/protobuf-7.34.1-py3-none-any.whl", hash = "sha256:bb3812cd53aefea2b028ef42bd780f5b96407247f20c6ef7c679807e9d188f11", size = 170715, upload-time = "2026-03-20T17:34:45.384Z" }, +] + +[[package]] +name = "pyahocorasick" +version = "2.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/3c/dc9e31a0f004eabe2ef5d31456766555a02e2af29e159daa31266934af79/pyahocorasick-2.3.1.tar.gz", hash = "sha256:9d0f6bb522237ed7f111ed59c9e8baea7d1e75813587b6773babd43bda35db9f", size = 105024, upload-time = "2026-04-27T16:30:25.957Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/7c/06/2798edbcff0d50a51f8ef527cb3f861e69f694d80043826529c33fe15aa3/pyahocorasick-2.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3a69041f5fd665ec0edcffd9562dd0f2f23c236bbc950e18ada854e29fc3dd88", size = 59714, upload-time = "2026-04-27T16:31:26.083Z" }, + { url = "https://files.pythonhosted.org/packages/58/00/4b475d2f26240253bc6412c509c1c103844a8eac326a1353d9bc798beb74/pyahocorasick-2.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f9c21fd2bd72c0454ba6df0c7dbdfd7236c5cfd161fc983476fffbde92e18f", size = 33988, upload-time = "2026-04-27T16:31:27.351Z" }, + { url = "https://files.pythonhosted.org/packages/32/9b/5eef7545f3556d8b2ca8ee943938e94a62b659ee6f6978573efd2d597e2a/pyahocorasick-2.3.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0a8bed95da02e7c874818825d65e6e31d5b38c88ecba02a6c7144524074ddade", size = 113162, upload-time = "2026-04-27T16:31:28.704Z" }, + { url = "https://files.pythonhosted.org/packages/bf/55/807c408bd7baaa137643e99b4b642abd850d83c3e80b17e17f62b5842429/pyahocorasick-2.3.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2541c437dc0f04475729076ec36aac72604b767fa347107bcd6945d61d5ba437", size = 113939, upload-time = "2026-04-27T16:31:31.935Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d4/ffe0a07979ed128ed55c9e4ac7007be4d2048c2582de68035bd84c22e585/pyahocorasick-2.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa05c56eaeee2e0242a84f53d9927d795d26002493c69ba8a4af1d86bdca7edb", size = 116159, upload-time = "2026-04-27T16:31:33.662Z" }, + { url = "https://files.pythonhosted.org/packages/1c/97/c5b6962d93d0e7870a8e0e1d76c71cd30133a96c642190531d5fae754de0/pyahocorasick-2.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dfc4749cca4df4327dd2fcbbd49e5148e72840366023429729cf468f28c938a2", size = 116390, upload-time = "2026-04-27T16:31:35.554Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/63/7072ae6d6458518c277b256a14dd1b20726192e880915b4f6d3daeb0700d/pyahocorasick-2.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:cb75c32f73be3f70435e49bbc5518105b54f1320a51e7da18ac989bfe93f6c1c", size = 35152, upload-time = "2026-04-27T16:31:36.828Z" }, + { url = "https://files.pythonhosted.org/packages/29/a6/2ee9301a36c9d6bcd7e745e8a98e72fddf1ff1cd3ae899f498383c3ad1c9/pyahocorasick-2.3.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f0df14cb10ed1e942a30c0f11d242472452e7c567acbf3ac070e5d6912b71ca9", size = 60112, upload-time = "2026-04-27T16:31:38.39Z" }, + { url = "https://files.pythonhosted.org/packages/7c/c6/f242c7966d8207822d7ecb183101522ca03df5f302ee6520fe4412f03fae/pyahocorasick-2.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:873911f1d80acd82ac00aae277a9a2b335a0c0cac0a0ef1c6635b57badc6f7a6", size = 34154, upload-time = "2026-04-27T16:31:39.719Z" }, + { url = "https://files.pythonhosted.org/packages/f7/01/0a7387a6327f4ef9b7dcf3cea84dfea3e4b0e85eb37a52b612985b1f9a9a/pyahocorasick-2.3.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9a4d4f5b05ce9d8af82c40ed39cd6892613e9e8bf1b5e6ea79009c566430adb1", size = 113543, upload-time = "2026-04-27T16:31:41.311Z" }, + { url = "https://files.pythonhosted.org/packages/a1/f2/d13807476195e4ec5999a78f22db592a64da54229c9183438f3165105779/pyahocorasick-2.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9ec1d3465f25a5063c7eaa85ecb106cbe256064669c754e0b13b2483cf613a98", size = 114873, upload-time = "2026-04-27T16:31:42.625Z" }, + { url = "https://files.pythonhosted.org/packages/af/32/d79302845be8629f9aee2a3dbeb9ad089b036f089e99589a08814e7e5910/pyahocorasick-2.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e4e1e90eb2e755c79b9b904fd8adcca61c22b4b48811b9435f0c4b2d718895d6", size = 116455, upload-time = "2026-04-27T16:31:44.366Z" }, + { url = 
"https://files.pythonhosted.org/packages/0e/c9/2e3019eb9f4404dc1fe1309535d1220740cc95275ad1b4a70f7f891cb296/pyahocorasick-2.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e3922f66721b5b777eae758d2a0acffd98ee97dc7e6e452ba533d1c5892e15b7", size = 117863, upload-time = "2026-04-27T16:31:45.831Z" }, + { url = "https://files.pythonhosted.org/packages/3a/6e/5fa2f6fafb7a5bb82cad6e2ef3c8eed7c859ba16242766a5a425e19334b5/pyahocorasick-2.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:f5cc3c021be241fe9317c5991f8efba2b876e3956691322ad9e55c0d9ff7c599", size = 35258, upload-time = "2026-04-27T16:31:47.053Z" }, +] + +[[package]] +name = "pyarrow" +version = "24.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/c9/a47ab7ece0d86cbe6678418a0fbd1ac4bb493b9184a3891dfa0e7f287ae0/pyarrow-24.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b0e131f880cda8d04e076cee175a46fc0e8bc8b65c99c6c09dff6669335fde74", size = 35068898, upload-time = "2026-04-21T10:46:36.599Z" }, + { url = "https://files.pythonhosted.org/packages/d1/bc/8db86617a9a58008acf8913d6fed68ea2a46acb6de928db28d724c891a68/pyarrow-24.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1b2fe7f9a5566401a0ef2571f197eb92358925c1f0c8dba305d6e43ea0871bb3", size = 36679915, upload-time = "2026-04-21T10:46:42.602Z" }, + { url = "https://files.pythonhosted.org/packages/eb/8e/fb178720400ef69db251eb4a9c3ccf4af269bc1feb5055529b8fc87170d1/pyarrow-24.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:0b3537c00fb8d384f15ac1e79b6eb6db04a16514c8c1d22e59a9b95c8ba42868", size = 45697931, upload-time = "2026-04-21T10:46:48.403Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/27/99c42abe8e21b44f4917f62631f3aa31404882a2c41d8a4cd5c110e13d52/pyarrow-24.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:14e31a3c9e35f1ab6356c6378f6f72830e6d2d5f1791df3774a7b097d18a6a1e", size = 48837449, upload-time = "2026-04-21T10:46:55.329Z" }, + { url = "https://files.pythonhosted.org/packages/36/b6/333749e2666e9032891125bf9c691146e92901bece62030ac1430e2e7c88/pyarrow-24.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7d9a514e73bc42711e6a35aaccf3587c520024fe0a25d830a1a8a27c15f4f57", size = 49395949, upload-time = "2026-04-21T10:47:01.869Z" }, + { url = "https://files.pythonhosted.org/packages/17/25/c5201706a2dd374e8ba6ee3fd7a8c89fb7ffc16eed5217a91fd2bd7f7626/pyarrow-24.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b196eb3f931862af3fa84c2a253514d859c08e0d8fe020e07be12e75a5a9780c", size = 51912986, upload-time = "2026-04-21T10:47:09.872Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d2/4d1bbba65320b21a49678d6fbdc6ff7c649251359fdcfc03568c4136231d/pyarrow-24.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:35405aecb474e683fb36af650618fd5340ee5471fc65a21b36076a18bbc6c981", size = 27255371, upload-time = "2026-04-21T10:47:15.943Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" }, + { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" }, + { url = 
"https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" }, + { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" }, +] + +[[package]] +name = "pycparser" +version = "3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, +] + +[[package]] +name = "pydantic" +version = "2.13.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/a5/b60d21ac674192f8ab0ba4e9fd860690f9b4a6e51ca5df118733b487d8d6/pydantic-2.13.4.tar.gz", hash = "sha256:c40756b57adaa8b1efeeced5c196f3f3b7c435f90e84ea7f443901bec8099ef6", size = 844775, upload-time = "2026-05-06T13:43:05.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/7b/122376b1fd3c62c1ed9dc80c931ace4844b3c55407b6fb2d199377c9736f/pydantic-2.13.4-py3-none-any.whl", hash = "sha256:45a282cde31d808236fd7ea9d919b128653c8b38b393d1c4ab335c62924d9aba", size = 472262, upload-time = "2026-05-06T13:43:02.641Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.46.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9d/56/921726b776ace8d8f5db44c4ef961006580d91dc52b803c489fafd1aa249/pydantic_core-2.46.4.tar.gz", hash = "sha256:62f875393d7f270851f20523dd2e29f082bcc82292d66db2b64ea71f64b6e1c1", size = 471464, upload-time = "2026-05-06T13:37:06.98Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/fa/6d7708d2cfc1a832acb6aeb0cd16e801902df8a0f583bb3b4b527fde022e/pydantic_core-2.46.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0e96592440881c74a213e5ad528e2b24d3d4f940de2766bed9010ab1d9e51594", size = 2111872, upload-time = "2026-05-06T13:40:27.596Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/6f/aa064a3e74b5745afbdf250594f38e7ead05e2d651bcb35994b9417a0d4d/pydantic_core-2.46.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0d65b8c354be7fb5f720c3caa8bc940bc2d20ce749c8e06135f07f8ed95dd7c", size = 1948255, upload-time = "2026-05-06T13:39:12.574Z" }, + { url = "https://files.pythonhosted.org/packages/43/3a/41114a9f7569b84b4d84e7a018c57c56347dac30c0d4a872946ec4e36c46/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bfb192b3f4b9e8a89b6277b6ce787564f62cfd272055f6e685726b111dc7826", size = 1972827, upload-time = "2026-05-06T13:38:19.841Z" }, + { url = "https://files.pythonhosted.org/packages/ef/25/1ab42e8048fe551934d9884e8d64daa7e990ad386f310a15981aeb6a5b08/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9037063db01f09b09e237c282b6792bd4da634b5402c4e7f0c61effed7701a04", size = 2041051, upload-time = "2026-05-06T13:38:10.447Z" }, + { url = "https://files.pythonhosted.org/packages/94/c2/1a934597ddf08da410385b3b7aae91956a5a76c635effef456074fad7e88/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc010ab034c8c7452522748bf937df58020d256ccae0874463d1f4d01758af8e", size = 2221314, upload-time = "2026-05-06T13:40:13.089Z" }, + { url = "https://files.pythonhosted.org/packages/02/6d/9e8ad178c9c4df27ad3c8f25d1fe2a7ab0d2ba0559fad4aee5d3d1f16771/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c5dac79fa1614d1e06ca695109c6105923bd9c7d1d6c918d4e637b7e6b32fd3", size = 2285146, upload-time = "2026-05-06T13:38:59.224Z" }, + { url = "https://files.pythonhosted.org/packages/80/50/540cd3aeefc041beb111125c4bff779831a2111fc6b15a9138cda277d32c/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fa868638bf362d3d138ea55829cefb3d5f4b0d7f142234382a15e2485dbec4", size = 2089685, upload-time = 
"2026-05-06T13:38:17.762Z" }, + { url = "https://files.pythonhosted.org/packages/6b/a4/b440ad35f05f6a38f89fa0f149accb3f0e02be94ca5e15f3c449a61b4bc9/pydantic_core-2.46.4-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:17299feefe090f2caa5b8e37222bb5f663e4935a8bfa6931d4102e5df1a9f398", size = 2115420, upload-time = "2026-05-06T13:37:58.195Z" }, + { url = "https://files.pythonhosted.org/packages/99/61/de4f55db8dfd57bfdfa9a12ec90fe1b57c4f41062f7ca86f08586b3e0ac0/pydantic_core-2.46.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4c63ebc82684aa89d9a3bcbd13d515b3be44250dc68dd3bd81526c1cb31286c3", size = 2165122, upload-time = "2026-05-06T13:37:01.167Z" }, + { url = "https://files.pythonhosted.org/packages/f7/52/7c529d7bdb2d1068bd52f51fe32572c8301f9a4febf1948f10639f1436f5/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:aaa2a54443eff1950ba5ddc6b6ccda0d9c84a364276a62f969bdf2a390650848", size = 2182573, upload-time = "2026-05-06T13:38:45.04Z" }, + { url = "https://files.pythonhosted.org/packages/37/b3/7c40325848ba78247f2812dcf9c7274e38cd801820ca6dd9fe63bcfb0eb4/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:18e5ceec2ab67e6d5f1a9085e5a24c9c4e2ac4545730bfe668680bca05e555f3", size = 2317139, upload-time = "2026-05-06T13:37:15.539Z" }, + { url = "https://files.pythonhosted.org/packages/d9/37/f913f81a657c865b75da6c0dbed79876073c2a43b5bd9edbe8da785e4d49/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a0f62d0a58f4e7da165457e995725421e0064f2255d8eccebc49f41bbc23b109", size = 2360433, upload-time = "2026-05-06T13:37:30.099Z" }, + { url = "https://files.pythonhosted.org/packages/c4/67/6acaa1be2567f9256b056d8477158cac7240813956ce86e49deae8e173b4/pydantic_core-2.46.4-cp311-cp311-win32.whl", hash = "sha256:041bde0a48fd37cf71cab1c9d56d3e8625a3793fef1f7dd232b3ff37e978ecda", size = 1985513, upload-time = "2026-05-06T13:38:15.669Z" }, + { url = 
"https://files.pythonhosted.org/packages/aa/e6/c505f83dfeda9a2e5c995cfd872949e4d05e12f7feb3dca72f633daefa94/pydantic_core-2.46.4-cp311-cp311-win_amd64.whl", hash = "sha256:6f2eeda33a839975441c86a4119e1383c50b47faf0cbb5176985565c6bb02c33", size = 2071114, upload-time = "2026-05-06T13:40:35.416Z" }, + { url = "https://files.pythonhosted.org/packages/0f/da/7a263a96d965d9d0df5e8de8a475f33495451117035b09acb110288c381f/pydantic_core-2.46.4-cp311-cp311-win_arm64.whl", hash = "sha256:14f4c5d6db102bd796a627bbb3a17b4cf4574b9ae861d8b7c9a9661c6dd3362d", size = 2044298, upload-time = "2026-05-06T13:38:29.754Z" }, + { url = "https://files.pythonhosted.org/packages/ce/8c/af022f0af448d7747c5154288d46b5f2bc5f17366eaa0e23e9aa04d59f3b/pydantic_core-2.46.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3245406455a5d98187ec35530fd772b1d799b26667980872c8d4614991e2c4a2", size = 2106158, upload-time = "2026-05-06T13:38:57.215Z" }, + { url = "https://files.pythonhosted.org/packages/19/95/6195171e385007300f0f5574592e467c568becce2d937a0b6804f218bc49/pydantic_core-2.46.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:962ccbab7b642487b1d8b7df90ef677e03134cf1fd8880bf698649b22a69371f", size = 1951724, upload-time = "2026-05-06T13:37:02.697Z" }, + { url = "https://files.pythonhosted.org/packages/8e/bc/f47d1ff9cbb1620e1b5b697eef06010035735f07820180e74178226b27b3/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8233f2947cf85404441fd7e0085f53b10c93e0ee78611099b5c7237e36aacbf7", size = 1975742, upload-time = "2026-05-06T13:37:09.448Z" }, + { url = "https://files.pythonhosted.org/packages/5b/11/9b9a5b0306345664a2da6410877af6e8082481b5884b3ddd78d47c6013ce/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a233125ac121aa3ffba9a2b59edfc4a985a76092dc8279586ab4b71390875e7", size = 2052418, upload-time = "2026-05-06T13:37:38.234Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/b7/a65fec226f5d78fc39f4a13c4cc0c768c22b113438f60c14adc9d2865038/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b712b53160b79a5850310b912a5ef8e57e56947c8ad690c227f5c9d7e561712", size = 2232274, upload-time = "2026-05-06T13:38:27.753Z" }, + { url = "https://files.pythonhosted.org/packages/68/f0/92039db98b907ef49269a8271f67db9cb78ae2fc68062ef7e4e77adb5f61/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9401557acd873c3a7f3eb9383edef8ac4968f9510e340f4808d427e75667e7b4", size = 2309940, upload-time = "2026-05-06T13:38:05.353Z" }, + { url = "https://files.pythonhosted.org/packages/5f/97/2aab507d3d00ca626e8e57c1eac6a79e4e5fbcc63eb99733ff55d1717f65/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:926c9541b14b12b1681dca8a0b75feb510b06c6341b70a8e500c2fdcff837cce", size = 2094516, upload-time = "2026-05-06T13:39:10.577Z" }, + { url = "https://files.pythonhosted.org/packages/22/37/a8aca44d40d737dde2bc05b3c6c07dff0de07ce6f82e9f3167aeaf4d5dea/pydantic_core-2.46.4-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:56cb4851bcaf3d117eddcef4fe66afd750a50274b0da8e22be256d10e5611987", size = 2136854, upload-time = "2026-05-06T13:40:22.59Z" }, + { url = "https://files.pythonhosted.org/packages/24/99/fcef1b79238c06a8cbec70819ac722ba76e02bc8ada9b0fd66eba40da01b/pydantic_core-2.46.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c68fcd102d71ea85c5b2dfac3f4f8476eff42a9e078fd5faefff6d145063536b", size = 2180306, upload-time = "2026-05-06T13:40:10.666Z" }, + { url = "https://files.pythonhosted.org/packages/ae/6c/fc44000918855b42779d007ae63b0532794739027b2f417321cddbc44f6a/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b2f69dec1725e79a012d920df1707de5caf7ed5e08f3be4435e25803efc47458", size = 2190044, upload-time = "2026-05-06T13:40:43.231Z" }, + { url 
= "https://files.pythonhosted.org/packages/6b/65/d9cadc9f1920d7a127ad2edba16c1db7916e59719285cd6c94600b0080ba/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:8d0820e8192167f80d88d64038e609c31452eeca865b4e1d9950a27a4609b00b", size = 2329133, upload-time = "2026-05-06T13:39:57.365Z" }, + { url = "https://files.pythonhosted.org/packages/d0/cf/c873d91679f3a30bcf5e7ac280ce5573483e72295307685120d0d5ad3416/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fbdb89b3e1c94a30cc5edfce477c6e6a5dc4d8f84665b455c27582f211a1c72c", size = 2374464, upload-time = "2026-05-06T13:38:06.976Z" }, + { url = "https://files.pythonhosted.org/packages/47/bd/6f2fc8188f31bf10590f1e98e7b306336161fac930a8c514cd7bd828c7dc/pydantic_core-2.46.4-cp312-cp312-win32.whl", hash = "sha256:9aa768456404a8bf48a4406685ac2bec8e72b62c69313734fa3b73cf33b3a894", size = 1974823, upload-time = "2026-05-06T13:40:47.985Z" }, + { url = "https://files.pythonhosted.org/packages/40/8c/985c1d41ea1107c2534abd9870e4ed5c8e7669b5c308297835c001e7a1c4/pydantic_core-2.46.4-cp312-cp312-win_amd64.whl", hash = "sha256:e9c26f834c65f5752f3f06cb08cb86a913ceb7274d0db6e267808a708b46bc89", size = 2072919, upload-time = "2026-05-06T13:39:21.153Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ba/f463d006e0c47373ca7ec5e1a261c59dc01ef4d62b2657af925fb0deee3a/pydantic_core-2.46.4-cp312-cp312-win_arm64.whl", hash = "sha256:4fc73cb559bdb54b1134a706a2802a4cddd27a0633f5abb7e53056268751ac6a", size = 2027604, upload-time = "2026-05-06T13:39:03.753Z" }, + { url = "https://files.pythonhosted.org/packages/ee/a4/73995fd4ebbb46ba0ee51e6fa049b8f02c40daebb762208feda8a6b7894d/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:14d4edf427bdcf950a8a02d7cb44a08614388dd6e1bdcbf4f67504fa7887da9c", size = 2111589, upload-time = "2026-05-06T13:37:10.817Z" }, + { url = 
"https://files.pythonhosted.org/packages/fb/7f/f37d3a5e8bfcc2e403f5c57a730f2d815693fb42119e8ea48b3789335af1/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:0ce40cd7b21210e99342afafbd4d0f76d784eb5b1d60f3bdc566be4983c6c73b", size = 1944552, upload-time = "2026-05-06T13:36:56.717Z" }, + { url = "https://files.pythonhosted.org/packages/15/3c/d7eb777b3ff43e8433a4efb39a17aa8fd98a4ee8561a24a67ef5db07b2d6/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90884113d8b48f760e9587002789ddd741e76ab9f89518cd1e43b1f1a52ec44b", size = 1982984, upload-time = "2026-05-06T13:39:06.207Z" }, + { url = "https://files.pythonhosted.org/packages/63/87/70b9f40170a81afd55ca26c9b2acb25c20d64bcfbf888fafecb3ba077d4c/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66ce7632c22d837c95301830e111ad0128a32b8207533b60896a96c4915192ea", size = 2138417, upload-time = "2026-05-06T13:39:45.476Z" }, + { url = "https://files.pythonhosted.org/packages/9d/1d/8987ad40f65ae1432753072f214fb5c74fe47ffbd0698bb9cbbb585664f8/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:1d8ba486450b14f3b1d63bc521d410ec7565e52f887b9fb671791886436a42f7", size = 2095527, upload-time = "2026-05-06T13:39:52.283Z" }, + { url = "https://files.pythonhosted.org/packages/64/d3/84c282a7eee1d3ac4c0377546ef5a1ea436ce26840d9ac3b7ed54a377507/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:3009f12e4e90b7f88b4f9adb1b0c4a3d58fe7820f3238c190047209d148026df", size = 1936024, upload-time = "2026-05-06T13:40:15.671Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ca/eac61596cdeb4d7e174d3dc0bd8a6238f14f75f97a24e7b7db4c7e7340a0/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ad785e92e6dc634c21555edc8bd6b64957ab844541bcb96a1366c202951ae526", size = 1990696, upload-time = "2026-05-06T13:38:34.717Z" }, + { url = "https://files.pythonhosted.org/packages/fa/c3/7c8b240552251faf6b3a957db200fcfbbcec36763c050428b601e0c9b83b/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00c603d540afdd6b80eb39f078f33ebd46211f02f33e34a32d9f053bba711de0", size = 2147590, upload-time = "2026-05-06T13:39:29.883Z" }, + { url = "https://files.pythonhosted.org/packages/11/cb/428de0385b6c8d44b716feba566abfacfbd23ee3c4439faa789a1456242f/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:0c563b08bca408dc7f65f700633d8442fffb2421fc47b8101377e9fd65051ff0", size = 2112782, upload-time = "2026-05-06T13:37:04.016Z" }, + { url = "https://files.pythonhosted.org/packages/0b/b5/6a17bdadd0fc1f170adfd05a20d37c832f52b117b4d9131da1f41bb097ce/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:db06ffe51636ffe9ca531fe9023dd64bdd794be8754cb5df57c5498ae5b518a7", size = 1952146, upload-time = "2026-05-06T13:39:43.092Z" }, + { url = "https://files.pythonhosted.org/packages/2a/dc/03734d80e362cd43ef65428e9de77c730ce7f2f11c60d2b1e1b39f0fbf99/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:133878133d271ade3d41d1bfb2a45ec38dbdbda40bc065921c6b04e4630127e2", size = 2134492, upload-time = "2026-05-06T13:36:58.124Z" }, + { url = "https://files.pythonhosted.org/packages/de/df/5e5ffc085ed07cc22d298134d3d911c63e91f6a0eb91fe646750a3209910/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9bc519fbf2b7578398853d815009ae5e4d4603d12f4e3f91da8c06852d3da3e9", size = 2156604, upload-time = "2026-05-06T13:37:49.88Z" }, + { url = 
"https://files.pythonhosted.org/packages/81/44/6e112a4253e56f5705467cbab7ab5e91ee7398ba3d56d358635958893d3e/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c7a7bd4e39e8e4c12c39cd480356842b6a8a06e41b23a55a5e3e191718838ddf", size = 2183828, upload-time = "2026-05-06T13:37:43.053Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ad/5565071e937d8e752842ac241463944c9eb14c87e2d269f2658a5bd05e98/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:d396ec2b979760aaf3218e76c24e65bd0aca24983298653b3a9d7a45f9e47b30", size = 2310000, upload-time = "2026-05-06T13:37:56.694Z" }, + { url = "https://files.pythonhosted.org/packages/4f/c3/66883a5cec183e7fba4d024b4cbbe61851a63750ef606b0afecc46d1f2bf/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:86e1a4418c6cd97d60c95c71164158eaf7324fae7b0923264016baa993eba6fc", size = 2361286, upload-time = "2026-05-06T13:40:05.667Z" }, + { url = "https://files.pythonhosted.org/packages/4b/2d/69abac8f838090bbecd5df894befb2c2619e7996a98ddb949db9f3b93225/pydantic_core-2.46.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:d51026d73fcfd93610abc7b27789c26b313920fcfb20e27462d74a7f8b06e983", size = 2193071, upload-time = "2026-05-06T13:38:08.682Z" }, +] + [[package]] name = "pygments" version = "2.20.0" @@ -547,6 +978,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d8/db/795879cc3ddfe338599bddea6388cc5100b088db0a4caf6e6c1af1c27e04/python_discovery-1.2.2-py3-none-any.whl", hash = "sha256:e1ae95d9af875e78f15e19aed0c6137ab1bb49c200f21f5061786490c9585c7a", size = 31894, upload-time = "2026-04-07T17:28:48.09Z" }, ] +[[package]] +name = "python-dotenv" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = 
"sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, +] + [[package]] name = "pyyaml" version = "6.0.3" @@ -574,6 +1014,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, ] +[[package]] +name = "rdkit" +version = "2026.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/54/e4fb88a6bce76424ba76b2aee3941115fc58ff19c9bed0d3a39743171a5a/rdkit-2026.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:41edffacdf49ddb502688e00a7e349cdfe895bfeefc59a4859b9387f7839985a", size = 29858944, upload-time = "2026-04-05T11:26:29.395Z" }, + { url = "https://files.pythonhosted.org/packages/08/5c/b999261bf7419398133abf3f9ba58776d1f0e584f36cc66aed1860bd8835/rdkit-2026.3.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:092cc114ba3fd76c85da0c9a1e8831be53dc70236448b671d70919c2e8c52384", size = 35613411, upload-time = "2026-04-05T11:26:33.852Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a3/890e65ede0231b39aa0f93d03b109977ee0307eb8b5363b306e82c736bc5/rdkit-2026.3.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:97cd3081224629caf169a531d6350c8f5b3fcc16dc85e6dad0209a7a15a909f4", size = 37061286, upload-time = "2026-04-05T11:26:38.103Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/c3/cdf07922c6f7db0026fb08d8e107530b8d4465a4b03ce0e9b3789bd87f7b/rdkit-2026.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:394b15d92df6fb011744502322b3e4de94ef939f628c384fea83aabb224ea8c7", size = 24515977, upload-time = "2026-04-05T11:26:41.375Z" }, + { url = "https://files.pythonhosted.org/packages/49/b8/b63e73f8849e90b673965e581e53a2b2620aa3be25e8e3bfce72f5f6c10f/rdkit-2026.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:14e050ffa5d6c5b6b8c87435888c017dcc9fb1fde763f5a49c78bb331aac8c8b", size = 29903263, upload-time = "2026-04-05T11:26:45.301Z" }, + { url = "https://files.pythonhosted.org/packages/ba/b3/e6b7c7da20fa18c8924fc19c86300431ae5b57892b752aad7289c47f883f/rdkit-2026.3.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7b2480bdbbf8d20cf857a635a40de249a97d619a5ae270d840771cc6bdb4979c", size = 35499474, upload-time = "2026-04-05T11:26:49.328Z" }, + { url = "https://files.pythonhosted.org/packages/ef/db/5b1cddbcf0f44e74d1a7756d0c3551c7507d05656a2ac55e6d0d17f69f04/rdkit-2026.3.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a84112a2f33256789319e1142f005f19333161549b1b3e86a0945d6545ace851", size = 36996740, upload-time = "2026-04-05T11:26:53.113Z" }, + { url = "https://files.pythonhosted.org/packages/e2/bf/6441076139ce0dbcb063f2fe1d89a5a3a2179215c459932865c088bb4668/rdkit-2026.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:8be158d43c7f5a31822a8195e6f036e4227a085d3029636a815a00f6ec3dfa5e", size = 24536311, upload-time = "2026-04-05T11:26:57.109Z" }, +] + +[[package]] +name = "requests" +version = "2.33.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5f/a4/98b9c7c6428a668bf7e42ebb7c79d576a1c3c1e3ae2d47e674b468388871/requests-2.33.1.tar.gz", hash = 
"sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517", size = 134120, upload-time = "2026-03-30T16:09:15.531Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl", hash = "sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a", size = 64947, upload-time = "2026-03-30T16:09:13.83Z" }, +] + +[[package]] +name = "responses" +version = "0.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/b4/b7e040379838cc71bf5aabdb26998dfbe5ee73904c92c1c161faf5de8866/responses-0.26.0.tar.gz", hash = "sha256:c7f6923e6343ef3682816ba421c006626777893cb0d5e1434f674b649bac9eb4", size = 81303, upload-time = "2026-02-19T14:38:05.574Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/04/7f73d05b556da048923e31a0cc878f03be7c5425ed1f268082255c75d872/responses-0.26.0-py3-none-any.whl", hash = "sha256:03ec4409088cd5c66b71ecbbbd27fe2c58ddfad801c66203457b3e6a04868c37", size = 35099, upload-time = "2026-02-19T14:38:03.847Z" }, +] + [[package]] name = "ruff" version = "0.15.10" @@ -644,6 +1132,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" }, ] +[[package]] +name = "sentry-sdk" +version = "2.59.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/65/e0/9bf5e5fc7442b10880f3ec0eff0ef4208b84a099606f343ec4f5445227fb/sentry_sdk-2.59.0.tar.gz", hash = 
"sha256:cd265808ef8bf3f3edf69b527c0a0b2b6b1322762679e55b8987db2e9584aec1", size = 447331, upload-time = "2026-05-04T12:19:06.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/00/b8cc413748fb6383d1582e7cda51314f99743351c462a92dc690d5b5853b/sentry_sdk-2.59.0-py2.py3-none-any.whl", hash = "sha256:abcf65ee9a9d9cdebf9ad369782408ecca9c1c792686ef06ba34f5ab233527fe", size = 468432, upload-time = "2026-05-04T12:19:04.741Z" }, +] + [[package]] name = "six" version = "1.17.0" @@ -653,6 +1154,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "smmap" +version = "5.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1f/ea/49c993d6dfdd7338c9b1000a0f36817ed7ec84577ae2e52f890d1a4ff909/smmap-5.0.3.tar.gz", hash = "sha256:4d9debb8b99007ae47165abc08670bd74cb74b5227dda7f643eccc4e9eb5642c", size = 22506, upload-time = "2026-03-09T03:43:26.1Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/d4/59e74daffcb57a07668852eeeb6035af9f32cbfd7a1d2511f17d2fe6a738/smmap-5.0.3-py3-none-any.whl", hash = "sha256:c106e05d5a61449cf6ba9a1e650227ecfb141590d2a98412103ff35d89fc7b2f", size = 24390, upload-time = "2026-03-09T03:43:24.361Z" }, +] + +[[package]] +name = "soundfile" +version = "0.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" }, + { url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" }, + { url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" }, + { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" }, + { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" }, + { url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" }, +] + [[package]] name = "statsmodels" version = "0.14.6" @@ -686,6 +1215,18 @@ version = "0.31.0" source = { virtual = "." } [package.dev-dependencies] +bench = [ + { name = "loguru" }, + { name = "matplotlib" }, + { name = "polars" }, + { name = "pyahocorasick" }, + { name = "pyarrow" }, + { name = "pydantic" }, + { name = "pytest" }, + { name = "requests" }, + { name = "responses" }, + { name = "wandb", extra = ["media"] }, +] dev = [ { name = "bumpver" }, { name = "plotnine" }, @@ -705,6 +1246,18 @@ interactive = [ [package.metadata] [package.metadata.requires-dev] +bench = [ + { name = "loguru" }, + { name = "matplotlib", specifier = ">=3.8" }, + { name = "polars", specifier = ">=1" }, + { name = "pyahocorasick", specifier = ">=2.0" }, + { name = "pyarrow", specifier = ">=15" }, + { name = "pydantic", specifier = ">=2.6" }, + { name = "pytest" }, + { name = "requests", specifier = ">=2.31" }, + { name = "responses", specifier = ">=0.25" }, + { name = "wandb", extras = ["media"] }, +] dev = [ { name = "bumpver" }, { name = "plotnine", specifier = ">=0.15.2" }, @@ -729,6 +1282,56 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588, upload-time = "2020-11-01T01:40:20.672Z" }, ] +[[package]] +name = "tornado" +version = "6.5.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/f1/3173dfa4a18db4a9b03e5d55325559dab51ee653763bb8745a75af491286/tornado-6.5.5.tar.gz", hash = 
"sha256:192b8f3ea91bd7f1f50c06955416ed76c6b72f96779b962f07f911b91e8d30e9", size = 516006, upload-time = "2026-03-10T21:31:02.067Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/59/8c/77f5097695f4dd8255ecbd08b2a1ed8ba8b953d337804dd7080f199e12bf/tornado-6.5.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:487dc9cc380e29f58c7ab88f9e27cdeef04b2140862e5076a66fb6bb68bb1bfa", size = 445983, upload-time = "2026-03-10T21:30:44.28Z" }, + { url = "https://files.pythonhosted.org/packages/ab/5e/7625b76cd10f98f1516c36ce0346de62061156352353ef2da44e5c21523c/tornado-6.5.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:65a7f1d46d4bb41df1ac99f5fcb685fb25c7e61613742d5108b010975a9a6521", size = 444246, upload-time = "2026-03-10T21:30:46.571Z" }, + { url = "https://files.pythonhosted.org/packages/b2/04/7b5705d5b3c0fab088f434f9c83edac1573830ca49ccf29fb83bf7178eec/tornado-6.5.5-cp39-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e74c92e8e65086b338fd56333fb9a68b9f6f2fe7ad532645a290a464bcf46be5", size = 447229, upload-time = "2026-03-10T21:30:48.273Z" }, + { url = "https://files.pythonhosted.org/packages/34/01/74e034a30ef59afb4097ef8659515e96a39d910b712a89af76f5e4e1f93c/tornado-6.5.5-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:435319e9e340276428bbdb4e7fa732c2d399386d1de5686cb331ec8eee754f07", size = 448192, upload-time = "2026-03-10T21:30:51.22Z" }, + { url = "https://files.pythonhosted.org/packages/be/00/fe9e02c5a96429fce1a1d15a517f5d8444f9c412e0bb9eadfbe3b0fc55bf/tornado-6.5.5-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:3f54aa540bdbfee7b9eb268ead60e7d199de5021facd276819c193c0fb28ea4e", size = 448039, upload-time = "2026-03-10T21:30:53.52Z" }, + { url = "https://files.pythonhosted.org/packages/82/9e/656ee4cec0398b1d18d0f1eb6372c41c6b889722641d84948351ae19556d/tornado-6.5.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = 
"sha256:36abed1754faeb80fbd6e64db2758091e1320f6bba74a4cf8c09cd18ccce8aca", size = 447445, upload-time = "2026-03-10T21:30:55.541Z" }, + { url = "https://files.pythonhosted.org/packages/5a/76/4921c00511f88af86a33de770d64141170f1cfd9c00311aea689949e274e/tornado-6.5.5-cp39-abi3-win32.whl", hash = "sha256:dd3eafaaeec1c7f2f8fdcd5f964e8907ad788fe8a5a32c4426fbbdda621223b7", size = 448582, upload-time = "2026-03-10T21:30:57.142Z" }, + { url = "https://files.pythonhosted.org/packages/2c/23/f6c6112a04d28eed765e374435fb1a9198f73e1ec4b4024184f21faeb1ad/tornado-6.5.5-cp39-abi3-win_amd64.whl", hash = "sha256:6443a794ba961a9f619b1ae926a2e900ac20c34483eea67be4ed8f1e58d3ef7b", size = 448990, upload-time = "2026-03-10T21:30:58.857Z" }, + { url = "https://files.pythonhosted.org/packages/b7/c8/876602cbc96469911f0939f703453c1157b0c826ecb05bdd32e023397d4e/tornado-6.5.5-cp39-abi3-win_arm64.whl", hash = "sha256:2c9a876e094109333f888539ddb2de4361743e5d21eece20688e3e351e4990a6", size = 448016, upload-time = "2026-03-10T21:31:00.43Z" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, +] + [[package]] name = "tzdata" version = "2026.1" @@ -738,6 +1341,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/70/d460bd685a170790ec89317e9bd33047988e4bce507b831f5db771e142de/tzdata-2026.1-py2.py3-none-any.whl", hash = "sha256:4b1d2be7ac37ceafd7327b961aa3a54e467efbdb563a23655fbfe0d39cfc42a9", size = 348952, upload-time = "2026-04-03T11:25:20.313Z" }, ] +[[package]] +name = "urllib3" +version = "2.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, +] + [[package]] name = "uv" version = "0.11.6" @@ -820,3 +1432,62 @@ sdist = { url = "https://files.pythonhosted.org/packages/f6/82/8786e7633cd2ef03c wheels = [ { url = "https://files.pythonhosted.org/packages/48/ee/b1dcfa25f18e6964cf73906c5c39a15654d0a702670ff89e9ed3ebab3e05/vizta-1.1.2-py3-none-any.whl", hash = "sha256:39d66bc7c30256d47a5cd2ca0a0924bd8dc65b5f63ea686da8216b606b95ee3c", size = 8864, upload-time = "2025-09-01T21:08:57.577Z" }, ] + +[[package]] +name = "wandb" +version = "0.26.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "gitpython" }, + { name = "packaging" }, + { name = "platformdirs" }, + { name = "protobuf" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sentry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/a4/72a6640e1f566e81f184a426e3e45298d4c6672664de41adb7eb6f64370a/wandb-0.26.1.tar.gz", hash = "sha256:eef2dbaea06f0b1c0cdc5d76f544ae4c2b8848fc512442a00bd59f0502fc8aa1", size = 42159814, upload-time = "2026-04-23T16:27:34.033Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/09/3296235f3906e904f06f2df29eed4d672fb23c0932c9486e2af64f2f2a66/wandb-0.26.1-py3-none-macosx_12_0_arm64.whl", hash = "sha256:2955fe190c005fb83ee6d73f066c8a33f09f3212a1f2eb53faa6581440e456be", size = 24857204, upload-time = 
"2026-04-23T16:26:58.576Z" }, + { url = "https://files.pythonhosted.org/packages/a1/ad/e39ca3086534129e42208ba00ed2c6247ce425f890219eeec33b4f162864/wandb-0.26.1-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:55d91cabde98162d7116a5e19ddd052bd9848556243f1da4cbb9ffb7ad435bfc", size = 26014649, upload-time = "2026-04-23T16:27:02.559Z" }, + { url = "https://files.pythonhosted.org/packages/56/af/400d84a3bdce0b062b4baa70acb6becd2c8018697f4fbf5af9a9e1e406e5/wandb-0.26.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:7c78bc2454cfe1ffa1c3a256060a387356eed8a4488e024d9d2eba8f2b5bd51d", size = 25421317, upload-time = "2026-04-23T16:27:06.411Z" }, + { url = "https://files.pythonhosted.org/packages/7b/e9/b4bf8f3509dcea1cec52233a38991459654635b5a8e6a494eb912e1b9cfb/wandb-0.26.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:a2c8eeec8706dcd2872e69c3b4d20ec523082fdb4440295491556e219ad2aa67", size = 27192831, upload-time = "2026-04-23T16:27:10.308Z" }, + { url = "https://files.pythonhosted.org/packages/62/cf/4a6dce0c782223ef0eeea7139daee73418a7322befcf083512c31cebaa18/wandb-0.26.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2fa768ee0636a569afb7541cf996e56309c47070566a38916823f94e02afe586", size = 25593326, upload-time = "2026-04-23T16:27:14.259Z" }, + { url = "https://files.pythonhosted.org/packages/df/99/58c3d8c36ae8e2b7d70bf6493eb5daa1cca0231a04b025717b4cd1a78f1e/wandb-0.26.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5854928725cfeff1f284d5c043cd353f810e5da02eead2c120ef5056ad026fea", size = 27535542, upload-time = "2026-04-23T16:27:18.473Z" }, + { url = "https://files.pythonhosted.org/packages/7c/d0/4e846ffc1d0cc435518dfa581ce73ac82cfd0ebbf35f3853c9277f632e5f/wandb-0.26.1-py3-none-win32.whl", hash = "sha256:5c2bd44e575ae9944e2764d1aaa031461178276bf2636d5558399c2816ef5cfe", size = 24968151, upload-time = "2026-04-23T16:27:22.086Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/9b/487413eaccefdb58799a226726e24b486e9192d2671c75a4550c160aba23/wandb-0.26.1-py3-none-win_amd64.whl", hash = "sha256:5817785467d3f1676f1812ec19a89f77f6e56dfe67d9f47080075af95f705d3e", size = 24968155, upload-time = "2026-04-23T16:27:25.731Z" }, + { url = "https://files.pythonhosted.org/packages/04/dc/5baf3e99b3eeb709d6f75124b5bec8cb73d4b38d2b10df7fdcfde4966200/wandb-0.26.1-py3-none-win_arm64.whl", hash = "sha256:f848b7744f896bc04cabbb28360a2814d1551a91fa2c456243e06435729c8a2e", size = 22912416, upload-time = "2026-04-23T16:27:29.456Z" }, +] + +[package.optional-dependencies] +media = [ + { name = "bokeh" }, + { name = "imageio" }, + { name = "moviepy" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "plotly" }, + { name = "rdkit" }, + { name = "soundfile" }, +] + +[[package]] +name = "win32-setctime" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, +] + +[[package]] +name = "xyzservices" +version = "2026.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/08/3cb9f67a8d48021aca2a02292cc26eecd71d949ae70ad66420a8730cc302/xyzservices-2026.3.0.tar.gz", hash = "sha256:d226866a5d8e9fef337034d8da37a8298f0a1d9d1489b4018e69579eb321fea4", size = 1135736, upload-time = "2026-03-30T14:42:25.596Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a8/a9/d23012099dc88ec69a29c6407b41d89681cb674c2043cd5b467c7e299c08/xyzservices-2026.3.0-py3-none-any.whl", hash = "sha256:503183d4b322bfebc3c50cdd21192aa3e81e36c5efbf9133d54ae82143e0576b", size = 94101, upload-time = "2026-03-30T14:42:24.608Z" }, +] From 4c93b7f3a59c768bd098c9cb87a6d94a57802525 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:17:17 -0700 Subject: [PATCH 02/41] feat(bench): pydantic schema and loader for fixture TOMLs --- bench/_fixture_schema.py | 65 ++++++++++++++++++++ bench/tests/test_fixture_schema.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 bench/_fixture_schema.py create mode 100644 bench/tests/test_fixture_schema.py diff --git a/bench/_fixture_schema.py b/bench/_fixture_schema.py new file mode 100644 index 00000000..d3b9a145 --- /dev/null +++ b/bench/_fixture_schema.py @@ -0,0 +1,65 @@ +"""Pydantic schema and loader for fixture TOML files.""" + +from __future__ import annotations + +import sys +from pathlib import Path +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field, field_validator + +if sys.version_info >= (3, 11): + import tomllib +else: + import tomli as tomllib # noqa: F401 (fallback path; we target 3.12) + + +def _require_s3(value: str, field: str) -> str: + if not value.startswith("s3://"): + raise ValueError(f"{field} must be an s3:// URI, got {value!r}") + return value + + +class FixtureInputs(BaseModel): + model_config = ConfigDict(extra="forbid") + + fasta: str + speclib: str + raw: str + entrapment_fasta: str | None = None + calibration_speclib: str | None = None + + @field_validator("fasta", "speclib", "raw") + @classmethod + def _required_s3(cls, v: str, info) -> str: + return _require_s3(v, info.field_name) + + @field_validator("entrapment_fasta", "calibration_speclib") + @classmethod + def _optional_s3(cls, v: str | None, info) -> str | None: + if v is None: + return v + return 
_require_s3(v, info.field_name) + + +class Fixture(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: str + description: str = "" + inputs: FixtureInputs + config: dict[str, Any] = Field(default_factory=dict) + + def has_entrapment(self) -> bool: + return self.inputs.entrapment_fasta is not None + + def has_calibration_speclib(self) -> bool: + return self.inputs.calibration_speclib is not None + + +def load_fixture(path: str | Path) -> Fixture: + """Load a fixture TOML and validate it against the schema.""" + p = Path(path) + with p.open("rb") as f: + raw = tomllib.load(f) + return Fixture.model_validate(raw) diff --git a/bench/tests/test_fixture_schema.py b/bench/tests/test_fixture_schema.py new file mode 100644 index 00000000..4058150c --- /dev/null +++ b/bench/tests/test_fixture_schema.py @@ -0,0 +1,97 @@ +import textwrap + +import pytest + +from bench._fixture_schema import Fixture, load_fixture + + +def _write(tmp_path, content: str): + p = tmp_path / "fix.toml" + p.write_text(textwrap.dedent(content).strip()) + return p + + +def test_minimal_fixture_loads(tmp_path): + p = _write( + tmp_path, + """ + name = "hela" + description = "test" + + [inputs] + fasta = "s3://b/p.fasta" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + f = load_fixture(p) + assert f.name == "hela" + assert f.inputs.fasta == "s3://b/p.fasta" + assert f.inputs.entrapment_fasta is None + assert f.inputs.calibration_speclib is None + assert not f.has_entrapment() + assert f.config["analysis"]["chunk_size"] == 20000 + + +def test_entrapment_and_calib_optional_present(tmp_path): + p = _write( + tmp_path, + """ + name = "hela_entrap" + description = "test" + + [inputs] + fasta = "s3://b/p.fasta" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + entrapment_fasta = "s3://b/entrap.fasta" + calibration_speclib = "s3://b/calib.msgpack.zst" + + [config.analysis] + chunk_size = 20000 + """, + ) + f = 
load_fixture(p) + assert f.has_entrapment() + assert f.inputs.calibration_speclib == "s3://b/calib.msgpack.zst" + + +def test_local_path_in_inputs_rejected(tmp_path): + p = _write( + tmp_path, + """ + name = "bad" + description = "test" + + [inputs] + fasta = "/home/me/p.fasta" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + with pytest.raises(ValueError, match="must be an s3:// URI"): + load_fixture(p) + + +def test_missing_required_input_rejected(tmp_path): + p = _write( + tmp_path, + """ + name = "missing" + description = "test" + + [inputs] + fasta = "s3://b/p.fasta" + + [config.analysis] + chunk_size = 20000 + """, + ) + with pytest.raises(ValueError): + load_fixture(p) From 090bd0d657ddfb729bac30b5430ad72e3873e4a2 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:21:58 -0700 Subject: [PATCH 03/41] fix(bench): clean up fixture schema (dead tomli fallback, validator types, test tightening) --- bench/_fixture_schema.py | 13 ++++--------- bench/tests/test_fixture_schema.py | 5 +++-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/bench/_fixture_schema.py b/bench/_fixture_schema.py index d3b9a145..5038caae 100644 --- a/bench/_fixture_schema.py +++ b/bench/_fixture_schema.py @@ -2,16 +2,11 @@ from __future__ import annotations -import sys +import tomllib from pathlib import Path from typing import Any -from pydantic import BaseModel, ConfigDict, Field, field_validator - -if sys.version_info >= (3, 11): - import tomllib -else: - import tomli as tomllib # noqa: F401 (fallback path; we target 3.12) +from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator def _require_s3(value: str, field: str) -> str: @@ -31,12 +26,12 @@ class FixtureInputs(BaseModel): @field_validator("fasta", "speclib", "raw") @classmethod - def _required_s3(cls, v: str, info) -> str: + def _required_s3(cls, v: str, info: ValidationInfo) -> str: return 
_require_s3(v, info.field_name) @field_validator("entrapment_fasta", "calibration_speclib") @classmethod - def _optional_s3(cls, v: str | None, info) -> str | None: + def _optional_s3(cls, v: str | None, info: ValidationInfo) -> str | None: if v is None: return v return _require_s3(v, info.field_name) diff --git a/bench/tests/test_fixture_schema.py b/bench/tests/test_fixture_schema.py index 4058150c..3d3c0545 100644 --- a/bench/tests/test_fixture_schema.py +++ b/bench/tests/test_fixture_schema.py @@ -2,7 +2,7 @@ import pytest -from bench._fixture_schema import Fixture, load_fixture +from bench._fixture_schema import load_fixture def _write(tmp_path, content: str): @@ -57,6 +57,7 @@ def test_entrapment_and_calib_optional_present(tmp_path): f = load_fixture(p) assert f.has_entrapment() assert f.inputs.calibration_speclib == "s3://b/calib.msgpack.zst" + assert f.has_calibration_speclib() def test_local_path_in_inputs_rejected(tmp_path): @@ -93,5 +94,5 @@ def test_missing_required_input_rejected(tmp_path): chunk_size = 20000 """, ) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="speclib"): load_fixture(p) From 5c52907e0b50a7535859ba61f9b1800ee32136fb Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:24:21 -0700 Subject: [PATCH 04/41] feat(bench): polymorphic --db spec classifier --- bench/_db_resolver.py | 64 +++++++++++++++++++++++++++++++++ bench/tests/test_db_resolver.py | 49 +++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 bench/_db_resolver.py create mode 100644 bench/tests/test_db_resolver.py diff --git a/bench/_db_resolver.py b/bench/_db_resolver.py new file mode 100644 index 00000000..6b07ac56 --- /dev/null +++ b/bench/_db_resolver.py @@ -0,0 +1,64 @@ +"""Polymorphic --db spec parsing. + +Classifies one CLI value into one of: local fasta file, local accession-list +text file, remote s3 fasta, uniprot proteome ID, or uniprot accession. 
+""" + +from __future__ import annotations + +import enum +import os +import re +from dataclasses import dataclass + +_FASTA_EXTS = (".fasta", ".fa", ".fasta.gz", ".fa.gz") +_TXT_EXTS = (".txt",) +_UNIPROT_PROTEOME_RE = re.compile(r"^UP\d{9}$") +_UNIPROT_ACCESSION_RE = re.compile( + r"^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" +) + + +class DbSpecKind(enum.Enum): + LOCAL_FASTA = "local_fasta" + ACCESSION_LIST_FILE = "accession_list_file" + S3_FASTA = "s3_fasta" + UNIPROT_PROTEOME = "uniprot_proteome" + UNIPROT_ACCESSION = "uniprot_accession" + + +@dataclass(frozen=True) +class DbSpec: + kind: DbSpecKind + value: str # original spec string + + +def classify_db_spec(spec: str) -> DbSpec: + """Classify a single --db value. Raises ValueError for unrecognised input.""" + if spec.startswith("s3://"): + return DbSpec(DbSpecKind.S3_FASTA, spec) + + # Local file? Path-shaped strings get checked first so a stray file named + # `UP000005640` on disk still resolves as local. + if os.path.exists(spec): + lower = spec.lower() + if any(lower.endswith(ext) for ext in _FASTA_EXTS): + return DbSpec(DbSpecKind.LOCAL_FASTA, spec) + if any(lower.endswith(ext) for ext in _TXT_EXTS): + return DbSpec(DbSpecKind.ACCESSION_LIST_FILE, spec) + # File exists but unrecognised extension — refuse rather than guess. 
+ raise ValueError( + f"unrecognised --db spec: {spec!r}" + " (file exists but extension is not .fasta/.fa/.txt)" + ) + + if _UNIPROT_PROTEOME_RE.match(spec): + return DbSpec(DbSpecKind.UNIPROT_PROTEOME, spec) + + if _UNIPROT_ACCESSION_RE.match(spec): + return DbSpec(DbSpecKind.UNIPROT_ACCESSION, spec) + + raise ValueError( + f"unrecognised --db spec: {spec!r}" + " (not s3://, not a local .fasta/.txt, not UP..., not an accession)" + ) diff --git a/bench/tests/test_db_resolver.py b/bench/tests/test_db_resolver.py new file mode 100644 index 00000000..4bede78c --- /dev/null +++ b/bench/tests/test_db_resolver.py @@ -0,0 +1,49 @@ +import pytest + +from bench._db_resolver import DbSpec, DbSpecKind, classify_db_spec + + +def test_classify_local_fasta(tmp_path): + p = tmp_path / "x.fasta" + p.write_text(">a\nMK\n") + spec = classify_db_spec(str(p)) + assert spec == DbSpec(kind=DbSpecKind.LOCAL_FASTA, value=str(p)) + + +def test_classify_local_fasta_gz(tmp_path): + p = tmp_path / "x.fasta.gz" + p.write_bytes(b"\x1f\x8b") + spec = classify_db_spec(str(p)) + assert spec.kind == DbSpecKind.LOCAL_FASTA + + +def test_classify_local_accession_list(tmp_path): + p = tmp_path / "ids.txt" + p.write_text("P12345\nQ67890\n") + spec = classify_db_spec(str(p)) + assert spec.kind == DbSpecKind.ACCESSION_LIST_FILE + + +def test_classify_s3(tmp_path): + spec = classify_db_spec("s3://bkt/proteome.fasta") + assert spec.kind == DbSpecKind.S3_FASTA + + +def test_classify_uniprot_proteome(): + spec = classify_db_spec("UP000005640") + assert spec.kind == DbSpecKind.UNIPROT_PROTEOME + + +def test_classify_uniprot_accession(): + spec = classify_db_spec("P12345") + assert spec.kind == DbSpecKind.UNIPROT_ACCESSION + + +def test_classify_unknown_raises(): + with pytest.raises(ValueError, match="unrecognised"): + classify_db_spec("not-a-real-thing") + + +def test_classify_local_missing_file_raises(tmp_path): + with pytest.raises(ValueError, match="unrecognised"): + classify_db_spec(str(tmp_path / 
"nope.fasta")) From d046d4cca3203705a04f5ba40190ffd05dd81d46 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:27:47 -0700 Subject: [PATCH 05/41] fix(bench): drop unused fixture, add wrong-extension test, sort suffix tuple --- bench/_db_resolver.py | 3 ++- bench/tests/test_db_resolver.py | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/bench/_db_resolver.py b/bench/_db_resolver.py index 6b07ac56..5c3451a3 100644 --- a/bench/_db_resolver.py +++ b/bench/_db_resolver.py @@ -11,7 +11,8 @@ import re from dataclasses import dataclass -_FASTA_EXTS = (".fasta", ".fa", ".fasta.gz", ".fa.gz") +# Longest suffixes first so 'foo.fasta.gz' does not short-circuit on '.fasta'. +_FASTA_EXTS = (".fasta.gz", ".fa.gz", ".fasta", ".fa") _TXT_EXTS = (".txt",) _UNIPROT_PROTEOME_RE = re.compile(r"^UP\d{9}$") _UNIPROT_ACCESSION_RE = re.compile( diff --git a/bench/tests/test_db_resolver.py b/bench/tests/test_db_resolver.py index 4bede78c..7913fedf 100644 --- a/bench/tests/test_db_resolver.py +++ b/bench/tests/test_db_resolver.py @@ -24,7 +24,7 @@ def test_classify_local_accession_list(tmp_path): assert spec.kind == DbSpecKind.ACCESSION_LIST_FILE -def test_classify_s3(tmp_path): +def test_classify_s3(): spec = classify_db_spec("s3://bkt/proteome.fasta") assert spec.kind == DbSpecKind.S3_FASTA @@ -47,3 +47,10 @@ def test_classify_unknown_raises(): def test_classify_local_missing_file_raises(tmp_path): with pytest.raises(ValueError, match="unrecognised"): classify_db_spec(str(tmp_path / "nope.fasta")) + + +def test_classify_local_bad_extension_raises(tmp_path): + p = tmp_path / "sequences.parquet" + p.write_bytes(b"PAR1") + with pytest.raises(ValueError, match="unrecognised"): + classify_db_spec(str(p)) From 22902fef7cb1296919b2cc3b73e3bc7d3f0e34df Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 20:29:16 -0700 Subject: [PATCH 06/41] feat(bench): uniprot REST fetch helpers (proteome + accession batch) --- bench/_uniprot.py | 39 ++++++++++++++++++++++++++++ bench/tests/test_uniprot.py | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 bench/_uniprot.py create mode 100644 bench/tests/test_uniprot.py diff --git a/bench/_uniprot.py b/bench/_uniprot.py new file mode 100644 index 00000000..20ee4632 --- /dev/null +++ b/bench/_uniprot.py @@ -0,0 +1,39 @@ +"""Uniprot REST helpers. + +We use the *stream* endpoint which returns full FASTA payloads without +pagination. See https://www.uniprot.org/help/api_queries. +""" + +from __future__ import annotations + +from typing import Iterable + +import requests +from loguru import logger + +_BASE = "https://rest.uniprot.org/uniprotkb/stream" +_TIMEOUT = 120 +_BATCH_SIZE = 100 # accession IDs per query — keeps URL well under the ~8KB limit + + +def _get(params: dict[str, str]) -> str: + logger.info("uniprot GET {} {}", _BASE, params) + r = requests.get(_BASE, params=params, timeout=_TIMEOUT) + r.raise_for_status() + return r.text + + +def fetch_proteome(proteome_id: str) -> str: + """Fetch a full uniprot proteome (e.g. UP000005640) as FASTA text.""" + return _get({"query": f"proteome:{proteome_id}", "format": "fasta"}) + + +def fetch_accession_batch(accessions: Iterable[str]) -> str: + """Fetch FASTA for a list of accessions. 
Chunks under URL-length limits.""" + accs = list(accessions) + out: list[str] = [] + for i in range(0, len(accs), _BATCH_SIZE): + chunk = accs[i : i + _BATCH_SIZE] + query = " OR ".join(f"accession:{a}" for a in chunk) + out.append(_get({"query": query, "format": "fasta"})) + return "".join(out) diff --git a/bench/tests/test_uniprot.py b/bench/tests/test_uniprot.py new file mode 100644 index 00000000..b1c4d237 --- /dev/null +++ b/bench/tests/test_uniprot.py @@ -0,0 +1,51 @@ +import responses + +from bench._uniprot import fetch_accession_batch, fetch_proteome + + +@responses.activate +def test_fetch_proteome(): + responses.add( + responses.GET, + "https://rest.uniprot.org/uniprotkb/stream", + body=">sp|P12345|TEST_HUMAN test\nMKLAA\n", + status=200, + content_type="text/plain", + ) + fasta = fetch_proteome("UP000005640") + assert fasta.startswith(">sp|P12345") + assert "MKLAA" in fasta + + +@responses.activate +def test_fetch_accession_batch(): + responses.add( + responses.GET, + "https://rest.uniprot.org/uniprotkb/stream", + body=">sp|P12345|A\nMK\n>sp|Q67890|B\nLL\n", + status=200, + content_type="text/plain", + ) + fasta = fetch_accession_batch(["P12345", "Q67890"]) + assert "P12345" in fasta and "Q67890" in fasta + # Verify the request used the OR'd accession query + assert len(responses.calls) == 1 + qs = responses.calls[0].request.url + assert "accession%3AP12345" in qs and "accession%3AQ67890" in qs + assert "format=fasta" in qs + + +@responses.activate +def test_fetch_accession_batch_chunks_long_lists(): + """Uniprot stream URLs have practical length limits; verify we batch.""" + # Stub one response per chunk regardless of how many calls happen + responses.add( + responses.GET, + "https://rest.uniprot.org/uniprotkb/stream", + body=">sp|X|x\nM\n", + status=200, + ) + accs = [f"P{n:05d}" for n in range(250)] # 250 → at least 2 chunks at 100/chunk + fasta = fetch_accession_batch(accs) + assert fasta.count(">") == len(responses.calls) + assert 
len(responses.calls) >= 3 # ceil(250/100) From bb6634abd4d246bef2779c55c253b37adc1af161 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:30:44 -0700 Subject: [PATCH 07/41] chore: configure ty to use project .venv --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 72c1f701..ae6a04db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,10 @@ bench = [ [tool.pytest.ini_options] testpaths = ["bench/tests"] +[tool.ty.environment] +python = ".venv" +python-version = "3.12" + [tool.ruff] target-version = "py312" From ca77ef4f0b25b83f428b6597bae0592f1aff51c8 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:33:44 -0700 Subject: [PATCH 08/41] fix(bench): tighten uniprot test (ty-clean url access, exact chunk count) --- bench/tests/test_uniprot.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bench/tests/test_uniprot.py b/bench/tests/test_uniprot.py index b1c4d237..e1791ed0 100644 --- a/bench/tests/test_uniprot.py +++ b/bench/tests/test_uniprot.py @@ -30,7 +30,7 @@ def test_fetch_accession_batch(): assert "P12345" in fasta and "Q67890" in fasta # Verify the request used the OR'd accession query assert len(responses.calls) == 1 - qs = responses.calls[0].request.url + qs = responses.calls[0].request.url or "" assert "accession%3AP12345" in qs and "accession%3AQ67890" in qs assert "format=fasta" in qs @@ -46,6 +46,5 @@ def test_fetch_accession_batch_chunks_long_lists(): status=200, ) accs = [f"P{n:05d}" for n in range(250)] # 250 → at least 2 chunks at 100/chunk - fasta = fetch_accession_batch(accs) - assert fasta.count(">") == len(responses.calls) - assert len(responses.calls) >= 3 # ceil(250/100) + fetch_accession_batch(accs) + assert len(responses.calls) == 3 # ceil(250/100) From 1e303a078cb32cb869941ec0bb8be70c72255692 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 20:36:34 -0700 Subject: [PATCH 09/41] feat(bench): resolve --db specs to concatenated fasta + s3 wrapper Adds resolve_dbs() that turns a list of --db spec strings into one concatenated FASTA on disk (local, S3, UniProt proteome/accession). Individual accession specs are coalesced into a single batched HTTP call. Adds bench/_s3.py wrapping aws s3 cp for download/upload/upload-dir. --- bench/_db_resolver.py | 84 ++++++++++++++++++++++++++++++++- bench/_s3.py | 37 +++++++++++++++ bench/tests/test_db_resolver.py | 59 ++++++++++++++++++++++- 3 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 bench/_s3.py diff --git a/bench/_db_resolver.py b/bench/_db_resolver.py index 5c3451a3..1e01e19e 100644 --- a/bench/_db_resolver.py +++ b/bench/_db_resolver.py @@ -1,15 +1,22 @@ -"""Polymorphic --db spec parsing. +"""Polymorphic --db spec parsing and resolution. Classifies one CLI value into one of: local fasta file, local accession-list text file, remote s3 fasta, uniprot proteome ID, or uniprot accession. +Also provides resolve_dbs() to turn a list of specs into a merged FASTA file. """ from __future__ import annotations import enum +import gzip import os import re +import tempfile from dataclasses import dataclass +from pathlib import Path + +from bench._s3 import s3_download_file +from bench._uniprot import fetch_accession_batch, fetch_proteome # Longest suffixes first so 'foo.fasta.gz' does not short-circuit on '.fasta'. 
_FASTA_EXTS = (".fasta.gz", ".fa.gz", ".fasta", ".fa") @@ -63,3 +70,78 @@ def classify_db_spec(spec: str) -> DbSpec: f"unrecognised --db spec: {spec!r}" " (not s3://, not a local .fasta/.txt, not UP..., not an accession)" ) + + +# --------------------------------------------------------------------------- +# Resolution helpers +# --------------------------------------------------------------------------- + + +def _read_local_fasta_text(path: str) -> str: + p = Path(path) + if str(p).lower().endswith(".gz"): + with gzip.open(p, "rt") as f: + return f.read() + return p.read_text() + + +def _read_accession_list_file(path: str) -> list[str]: + accs: list[str] = [] + for line in Path(path).read_text().splitlines(): + line = line.strip() + if line: + accs.append(line) + return accs + + +def resolve_dbs(specs: list[str], output_path: Path) -> None: + """Resolve a list of --db specs into a concatenated FASTA at output_path. + + Individual UniProt accessions (UNIPROT_ACCESSION kind) are coalesced into + a single batched fetch call (one HTTP round-trip). Accession-list files + each produce their own batch call. Other spec kinds are fetched + independently and appended in CLI order. + """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + classified = [classify_db_spec(raw) for raw in specs] + + # Coalesce all bare UNIPROT_ACCESSION specs into one batch. + bare_accessions: list[str] = [ + s.value for s in classified if s.kind is DbSpecKind.UNIPROT_ACCESSION + ] + + fragments: list[str] = [] + pending_accessions_flushed = False + + for spec in classified: + if spec.kind is DbSpecKind.UNIPROT_ACCESSION: + # Flush the whole accession batch on the first encounter. + if not pending_accessions_flushed: + fragments.append(fetch_accession_batch(bare_accessions)) + pending_accessions_flushed = True + # Subsequent accessions are already included in the batch above. 
+ elif spec.kind is DbSpecKind.LOCAL_FASTA: + fragments.append(_read_local_fasta_text(spec.value)) + elif spec.kind is DbSpecKind.S3_FASTA: + with tempfile.NamedTemporaryFile(suffix=".fasta", delete=False) as tmp: + tmp_path = tmp.name + try: + s3_download_file(spec.value, tmp_path) + fragments.append(_read_local_fasta_text(tmp_path)) + finally: + Path(tmp_path).unlink(missing_ok=True) + elif spec.kind is DbSpecKind.UNIPROT_PROTEOME: + fragments.append(fetch_proteome(spec.value)) + elif spec.kind is DbSpecKind.ACCESSION_LIST_FILE: + accs = _read_accession_list_file(spec.value) + fragments.append(fetch_accession_batch(accs)) + else: + raise AssertionError(f"unhandled kind {spec.kind}") + + with output_path.open("w") as f: + for chunk in fragments: + if not chunk.endswith("\n"): + chunk = chunk + "\n" + f.write(chunk) diff --git a/bench/_s3.py b/bench/_s3.py new file mode 100644 index 00000000..7fdfeeee --- /dev/null +++ b/bench/_s3.py @@ -0,0 +1,37 @@ +"""Tiny shellout wrapper around `aws s3 cp`. 
One module so all subprocess +invocations of the AWS CLI live in one place (easier to mock in tests).""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +from loguru import logger + + +def _aws_cp(src: str, dst: str, recursive: bool = False) -> None: + cmd = ["aws", "s3", "cp", src, dst] + if recursive: + cmd.append("--recursive") + logger.info("$ {}", " ".join(cmd)) + subprocess.run(cmd, check=True) + + +def s3_download_file(s3_uri: str, local_path: str) -> None: + if not s3_uri.startswith("s3://"): + raise ValueError(f"not an s3:// URI: {s3_uri}") + _aws_cp(s3_uri, local_path) + + +def s3_upload_file(local_path: str, s3_uri: str) -> None: + if not s3_uri.startswith("s3://"): + raise ValueError(f"not an s3:// URI: {s3_uri}") + _aws_cp(local_path, s3_uri) + + +def s3_upload_dir(local_dir: str, s3_uri: str) -> None: + if not s3_uri.startswith("s3://"): + raise ValueError(f"not an s3:// URI: {s3_uri}") + if not Path(local_dir).is_dir(): + raise ValueError(f"not a directory: {local_dir}") + _aws_cp(local_dir, s3_uri, recursive=True) diff --git a/bench/tests/test_db_resolver.py b/bench/tests/test_db_resolver.py index 7913fedf..6a658809 100644 --- a/bench/tests/test_db_resolver.py +++ b/bench/tests/test_db_resolver.py @@ -1,6 +1,9 @@ +from pathlib import Path +from unittest.mock import patch + import pytest -from bench._db_resolver import DbSpec, DbSpecKind, classify_db_spec +from bench._db_resolver import DbSpec, DbSpecKind, classify_db_spec, resolve_dbs def test_classify_local_fasta(tmp_path): @@ -54,3 +57,57 @@ def test_classify_local_bad_extension_raises(tmp_path): p.write_bytes(b"PAR1") with pytest.raises(ValueError, match="unrecognised"): classify_db_spec(str(p)) + + +def test_resolve_local_fasta_only(tmp_path): + a = tmp_path / "a.fasta" + a.write_text(">a\nMK\n") + b = tmp_path / "b.fasta" + b.write_text(">b\nLL\n") + out = tmp_path / "merged.fasta" + resolve_dbs([str(a), str(b)], out) + text = out.read_text() + assert ">a" 
in text and ">b" in text + assert text.count(">") == 2 + + +def test_resolve_uniprot_proteome_concat(tmp_path): + out = tmp_path / "merged.fasta" + with patch("bench._db_resolver.fetch_proteome", return_value=">P\nMK\n") as m: + resolve_dbs(["UP000005640"], out) + m.assert_called_once_with("UP000005640") + assert out.read_text() == ">P\nMK\n" + + +def test_resolve_uniprot_accession_batched(tmp_path): + out = tmp_path / "merged.fasta" + with patch( + "bench._db_resolver.fetch_accession_batch", return_value=">A\nM\n>B\nL\n" + ) as m: + resolve_dbs(["P12345", "Q67890"], out) + # Single batch call carries both accessions + m.assert_called_once_with(["P12345", "Q67890"]) + assert ">A" in out.read_text() and ">B" in out.read_text() + + +def test_resolve_accession_list_file(tmp_path): + ids = tmp_path / "ids.txt" + ids.write_text("P12345\nQ67890\n\n") + out = tmp_path / "merged.fasta" + with patch("bench._db_resolver.fetch_accession_batch", return_value=">x\nM\n") as m: + resolve_dbs([str(ids)], out) + m.assert_called_once_with(["P12345", "Q67890"]) + + +def test_resolve_s3_uses_aws_cp(tmp_path): + out = tmp_path / "merged.fasta" + + def fake_s3_download(uri, dst): + Path(dst).write_text(">s3\nMK\n") + + with patch( + "bench._db_resolver.s3_download_file", side_effect=fake_s3_download + ) as m: + resolve_dbs(["s3://bkt/p.fasta"], out) + m.assert_called_once() + assert ">s3" in out.read_text() From 399b4bc407f40114a8ba7600f765760d3c2341ba Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 20:40:21 -0700 Subject: [PATCH 10/41] feat(bench): aho-corasick peptide classification for entrapment --- bench/entrapment.py | 119 +++++++++++++++++++++++++++++++++ bench/tests/test_entrapment.py | 64 ++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 bench/entrapment.py create mode 100644 bench/tests/test_entrapment.py diff --git a/bench/entrapment.py b/bench/entrapment.py new file mode 100644 index 00000000..d457f411 --- /dev/null +++ b/bench/entrapment.py @@ -0,0 +1,119 @@ +"""Entrapment classification + FDR walk + plot. + +The classification half lives here (Task 6). The FDR walk and plot land in +Task 7. The CLI entry-point lands in Task 8. +""" + +from __future__ import annotations + +import enum +import re +from pathlib import Path + +import ahocorasick # ty: ignore[unresolved-import] +import polars as pl + +_MOD_RE = re.compile(r"\[[^\]]*\]|\([^)]*\)|[0-9.]+") +"""Strip bracketed mods (`[U:4]`, `[42]`), parenthesised mods (`(Phospho)`), +and bare numeric mass shifts (`123.45`).""" + + +def strip_mods(seq: str) -> str: + """Strip mod annotations to return a bare AA sequence (alpha chars only).""" + return _MOD_RE.sub("", seq) + + +def parse_fasta(path: str | Path) -> dict[str, str]: + """Parse a FASTA file into {accession: concatenated_sequence}. + + Accession is taken as the full header line minus the leading `>`, + stripped of trailing whitespace. The full header is used so callers can + later parse it however they want; we don't impose uniprot's `sp|...|` + grammar here. 
+ """ + out: dict[str, str] = {} + current_acc: str | None = None + parts: list[str] = [] + with Path(path).open("r") as f: + for raw_line in f: + line = raw_line.rstrip() + if not line: + continue + if line.startswith(">"): + if current_acc is not None: + out[current_acc] = "".join(parts) + current_acc = line[1:].strip() + parts = [] + else: + parts.append(line) + if current_acc is not None: + out[current_acc] = "".join(parts) + return out + + +class PeptideClass(enum.Enum): + TARGET = "target" + ENTRAPMENT = "entrapment" + SHARED_DROPPED = "shared_dropped" + UNKNOWN = "unknown" + + +def _build_hits(patterns: set[str], proteins: dict[str, str]) -> set[str]: + """Return the subset of `patterns` that occurs as a substring of any value + in `proteins`.""" + if not patterns: + return set() + aut = ahocorasick.Automaton() + for pat in patterns: + aut.add_word(pat, pat) + aut.make_automaton() + + hits: set[str] = set() + for seq in proteins.values(): + for _, pat in aut.iter(seq): + hits.add(pat) + if len(hits) == len(patterns): + return hits + return hits + + +def classify_peptides( + results: pl.DataFrame, + target_fasta: str | Path, + entrapment_fasta: str | Path, +) -> pl.DataFrame: + """Add `class` and `is_entrapment` columns to a results DataFrame. + + `results` must have a `sequence` column. Sequences are mod-stripped + before substring matching. Shared peptides (present in both fastas) are + marked SHARED_DROPPED -- callers exclude them from FDR. 
+ """ + if "sequence" not in results.columns: + raise ValueError("results dataframe missing required 'sequence' column") + + target = parse_fasta(target_fasta) + entrap = parse_fasta(entrapment_fasta) + + stripped = results["sequence"].map_elements(strip_mods, return_dtype=pl.Utf8) + patterns = set(stripped.to_list()) + + hits_t = _build_hits(patterns, target) + hits_e = _build_hits(patterns, entrap) + + def _classify(s: str) -> str: + in_t, in_e = s in hits_t, s in hits_e + if in_t and in_e: + return PeptideClass.SHARED_DROPPED.value + if in_t: + return PeptideClass.TARGET.value + if in_e: + return PeptideClass.ENTRAPMENT.value + return PeptideClass.UNKNOWN.value + + classes = stripped.map_elements(_classify, return_dtype=pl.Utf8) + is_entrap = classes == PeptideClass.ENTRAPMENT.value + + return results.with_columns( + classes.alias("class"), + is_entrap.alias("is_entrapment"), + ) diff --git a/bench/tests/test_entrapment.py b/bench/tests/test_entrapment.py new file mode 100644 index 00000000..a32490c5 --- /dev/null +++ b/bench/tests/test_entrapment.py @@ -0,0 +1,64 @@ +import polars as pl + +from bench.entrapment import ( + PeptideClass, + classify_peptides, + parse_fasta, + strip_mods, +) + + +def test_strip_mods(): + assert strip_mods("PEPTIDEK") == "PEPTIDEK" + assert strip_mods("PEPC[U:4]TIDEK") == "PEPCTIDEK" + assert strip_mods("PEP(Phospho)TIDEK") == "PEPTIDEK" + assert strip_mods("123.45PEPTIDEK") == "PEPTIDEK" + assert strip_mods("n[42]PEPTIDEK") == "nPEPTIDEK" # keep alpha n-term marker + + +def test_parse_fasta(tmp_path): + p = tmp_path / "p.fasta" + p.write_text(">sp|P1|A\nMKLAA\nDDDD\n>sp|P2|B\nLLLL\n") + out = parse_fasta(p) + assert out == {"sp|P1|A": "MKLAADDDD", "sp|P2|B": "LLLL"} + + +def test_classify_peptides(tmp_path): + target = tmp_path / "t.fasta" + target.write_text(">T1\nAAAAPEPTIDEKBBBB\n>T2\nMMMMSHAREDXXXX\n") + entrap = tmp_path / "e.fasta" + entrap.write_text(">E1\nQQQQENTRAPEPTKZZZZ\n>E2\nMMMMSHAREDYYYY\n") + + df = pl.DataFrame( + 
{"sequence": ["PEPTIDEK", "ENTRAPEPTK", "SHARED", "GHOSTAA"]} + ) + classified = classify_peptides(df, target, entrap) + + classes = dict(zip(classified["sequence"], classified["class"])) + assert classes["PEPTIDEK"] == PeptideClass.TARGET.value + assert classes["ENTRAPEPTK"] == PeptideClass.ENTRAPMENT.value + assert classes["SHARED"] == PeptideClass.SHARED_DROPPED.value + assert classes["GHOSTAA"] == PeptideClass.UNKNOWN.value + # is_entrapment column: True only for ENTRAPMENT + is_e = dict(zip(classified["sequence"], classified["is_entrapment"])) + assert is_e["ENTRAPEPTK"] is True + assert is_e["PEPTIDEK"] is False + assert is_e["SHARED"] is False + assert is_e["GHOSTAA"] is False + + +def test_classify_peptides_strips_mods_before_match(tmp_path): + target = tmp_path / "t.fasta" + target.write_text(">T1\nAAAAPEPTIDEKBBBB\n") + entrap = tmp_path / "e.fasta" + entrap.write_text(">E1\nQQQQ\n") + + df = pl.DataFrame({"sequence": ["PEPC[U:4]PTIDEK"]}) + # Stripped form is PEPCPTIDEK which is NOT in target. Confirm we end up unknown. + classified = classify_peptides(df, target, entrap) + assert classified["class"][0] == PeptideClass.UNKNOWN.value + + # But a real target match works after stripping + df2 = pl.DataFrame({"sequence": ["PEPT[U:4]IDEK"]}) + classified2 = classify_peptides(df2, target, entrap) + assert classified2["class"][0] == PeptideClass.TARGET.value From ccce2728eccc85b234f54c9a6fd0529d209cd84f Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 20:43:10 -0700 Subject: [PATCH 11/41] feat(bench): empirical FDR curve and PNG plot for entrapment --- bench/entrapment.py | 57 ++++++++++++++++++++++++++++++++++ bench/tests/test_entrapment.py | 49 +++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/bench/entrapment.py b/bench/entrapment.py index d457f411..fdbea524 100644 --- a/bench/entrapment.py +++ b/bench/entrapment.py @@ -117,3 +117,60 @@ def _classify(s: str) -> str: classes.alias("class"), is_entrap.alias("is_entrapment"), ) + + +def compute_fdr_curve(classified: pl.DataFrame) -> pl.DataFrame: + """Sort by qvalue, accumulate target/entrapment counts, return curve. + + Rows whose class is SHARED_DROPPED or UNKNOWN are excluded from both + numerator and denominator. + """ + if "qvalue" not in classified.columns: + raise ValueError("classified dataframe missing 'qvalue' column") + + keep = classified.filter( + pl.col("class").is_in( + [PeptideClass.TARGET.value, PeptideClass.ENTRAPMENT.value] + ) + ).sort("qvalue") + + n_target = ( + (keep["class"] == PeptideClass.TARGET.value).cast(pl.UInt32).cum_sum() + ) + n_entrap = ( + (keep["class"] == PeptideClass.ENTRAPMENT.value).cast(pl.UInt32).cum_sum() + ) + empirical = (n_entrap / (n_target + n_entrap)).fill_nan(0.0) + + return keep.with_columns( + n_target.alias("n_target"), + n_entrap.alias("n_entrap"), + empirical.alias("empirical_fdr"), + ) + + +def plot_fdr_curve( + curve: pl.DataFrame, + output_path: str | Path, + title: str = "Reported q-value vs empirical entrapment FDR", +) -> None: + """Render a FDR-vs-qvalue plot to a PNG file.""" + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(figsize=(5, 5), dpi=150) + ax.plot(curve["qvalue"], curve["empirical_fdr"], lw=1.5, label="empirical") + _max_qv = curve["qvalue"].cast(pl.Float64).max() + lim: float = max(0.05, _max_qv if isinstance(_max_qv, float) else 0.05) + ax.plot([0, lim], [0, lim], 
color="grey", ls="--", lw=1.0, label="y=x") + ax.set_xlabel("reported q-value") + ax.set_ylabel("empirical FDR (n_entrap / (n_target + n_entrap))") + ax.set_xlim(0, lim) + ax.set_ylim(0, lim) + ax.set_title(title) + ax.legend(loc="best") + fig.tight_layout() + fig.savefig(output_path) + plt.close(fig) diff --git a/bench/tests/test_entrapment.py b/bench/tests/test_entrapment.py index a32490c5..9b2eee24 100644 --- a/bench/tests/test_entrapment.py +++ b/bench/tests/test_entrapment.py @@ -3,7 +3,9 @@ from bench.entrapment import ( PeptideClass, classify_peptides, + compute_fdr_curve, parse_fasta, + plot_fdr_curve, strip_mods, ) @@ -62,3 +64,50 @@ def test_classify_peptides_strips_mods_before_match(tmp_path): df2 = pl.DataFrame({"sequence": ["PEPT[U:4]IDEK"]}) classified2 = classify_peptides(df2, target, entrap) assert classified2["class"][0] == PeptideClass.TARGET.value + + +def test_compute_fdr_curve_basic(): + classified = pl.DataFrame( + { + "qvalue": [0.001, 0.005, 0.01, 0.02, 0.05], + "class": ["target", "target", "entrapment", "target", "entrapment"], + } + ) + curve = compute_fdr_curve(classified) + # Sorted ascending by qvalue + assert curve["qvalue"].to_list() == [0.001, 0.005, 0.01, 0.02, 0.05] + # n_target cumulative + assert curve["n_target"].to_list() == [1, 2, 2, 3, 3] + # n_entrap cumulative + assert curve["n_entrap"].to_list() == [0, 0, 1, 1, 2] + # empirical_fdr = n_e / (n_t + n_e) + last = curve.row(-1, named=True) + assert last["empirical_fdr"] == 2 / 5 + + +def test_compute_fdr_curve_excludes_shared_and_unknown(): + classified = pl.DataFrame( + { + "qvalue": [0.01, 0.01, 0.01, 0.01], + "class": ["target", "shared_dropped", "unknown", "entrapment"], + } + ) + curve = compute_fdr_curve(classified) + # Only one target + one entrapment row contribute + assert curve.height == 2 + assert sorted(curve["class"].to_list()) == ["entrapment", "target"] + + +def test_plot_fdr_curve_writes_png(tmp_path): + curve = pl.DataFrame( + { + "qvalue": [0.001, 0.01, 
0.05], + "n_target": [10, 50, 100], + "n_entrap": [0, 1, 5], + "empirical_fdr": [0.0, 1 / 51, 5 / 105], + } + ) + out = tmp_path / "fdr.png" + plot_fdr_curve(curve, out, title="test") + assert out.exists() + assert out.stat().st_size > 1000 # not an empty or stub file From ce7517f55627fd4bd053ffe0025180df96ea7b73 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:45:47 -0700 Subject: [PATCH 12/41] feat(bench): analyse() entry point returning wandb-ready scalars --- bench/entrapment.py | 43 ++++++++++++++++++++++++++++++++++ bench/tests/test_entrapment.py | 39 ++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/bench/entrapment.py b/bench/entrapment.py index fdbea524..cea6f7e7 100644 --- a/bench/entrapment.py +++ b/bench/entrapment.py @@ -174,3 +174,46 @@ def plot_fdr_curve( fig.tight_layout() fig.savefig(output_path) plt.close(fig) + + +def _scalar_at_q( + curve: pl.DataFrame, q_threshold: float, suffix: str +) -> dict[str, float | int]: + """Read off n_target / n_entrap / empirical_fdr at q <= q_threshold.""" + sub = curve.filter(pl.col("qvalue") <= q_threshold) + if sub.height == 0: + return { + f"entrap/n_target_at_{suffix}": 0, + f"entrap/n_entrap_at_{suffix}": 0, + f"entrap/empirical_fdr_at_{suffix}": 0.0, + } + last = sub.row(-1, named=True) + return { + f"entrap/n_target_at_{suffix}": int(last["n_target"]), + f"entrap/n_entrap_at_{suffix}": int(last["n_entrap"]), + f"entrap/empirical_fdr_at_{suffix}": float(last["empirical_fdr"]), + } + + +def analyse( + results_parquet: str | Path, + target_fasta: str | Path, + entrapment_fasta: str | Path, + out_parquet: str | Path, + out_plot: str | Path, + title: str = "Reported q-value vs empirical entrapment FDR", +) -> dict[str, float | int]: + """End-to-end: classify -> FDR walk -> write parquet + plot -> return scalars.""" + results = pl.read_parquet(results_parquet) + classified = classify_peptides(results, target_fasta, entrapment_fasta) + 
Path(out_parquet).parent.mkdir(parents=True, exist_ok=True) + classified.write_parquet(out_parquet) + + curve = compute_fdr_curve(classified) + Path(out_plot).parent.mkdir(parents=True, exist_ok=True) + plot_fdr_curve(curve, out_plot, title=title) + + scalars: dict[str, float | int] = {} + scalars.update(_scalar_at_q(curve, 0.01, "q01")) + scalars.update(_scalar_at_q(curve, 0.05, "q05")) + return scalars diff --git a/bench/tests/test_entrapment.py b/bench/tests/test_entrapment.py index 9b2eee24..b0ffe409 100644 --- a/bench/tests/test_entrapment.py +++ b/bench/tests/test_entrapment.py @@ -2,6 +2,7 @@ from bench.entrapment import ( PeptideClass, + analyse, classify_peptides, compute_fdr_curve, parse_fasta, @@ -111,3 +112,41 @@ def test_plot_fdr_curve_writes_png(tmp_path): plot_fdr_curve(curve, out, title="test") assert out.exists() assert out.stat().st_size > 1000 # not an empty or stub file + + +def test_analyse_end_to_end(tmp_path): + target = tmp_path / "t.fasta" + target.write_text(">T1\nAAAAPEPTIDEKBBBB\n") + entrap = tmp_path / "e.fasta" + entrap.write_text(">E1\nQQQQENTRAPEPTKZZZZ\n") + + results = pl.DataFrame( + { + "sequence": ["PEPTIDEK", "ENTRAPEPTK", "PEPTIDEK", "ENTRAPEPTK"], + "qvalue": [0.001, 0.02, 0.005, 0.04], + } + ) + results_path = tmp_path / "results.parquet" + results.write_parquet(results_path) + + out = analyse( + results_parquet=results_path, + target_fasta=target, + entrapment_fasta=entrap, + out_parquet=tmp_path / "classified.parquet", + out_plot=tmp_path / "fdr.png", + ) + + # Returned scalars + assert out["entrap/n_target_at_q01"] == 2 # both PEPTIDEK rows have q <= 0.01 + assert out["entrap/n_entrap_at_q01"] == 0 + assert out["entrap/empirical_fdr_at_q01"] == 0.0 + assert out["entrap/n_target_at_q05"] == 2 + assert out["entrap/n_entrap_at_q05"] == 2 + + # Outputs + assert (tmp_path / "classified.parquet").exists() + assert (tmp_path / "fdr.png").exists() + + classified = pl.read_parquet(tmp_path / "classified.parquet") + assert "class" 
in classified.columns and "is_entrapment" in classified.columns From cc3a0632e7dbda6fbfa1f72da671f70ca107cc3d Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:48:40 -0700 Subject: [PATCH 13/41] feat(bench): push_fixture.py CLI scaffolding + TOML emitter --- bench/push_fixture.py | 132 +++++++++++++++++++++++++++++++ bench/tests/test_push_fixture.py | 95 ++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 bench/push_fixture.py create mode 100644 bench/tests/test_push_fixture.py diff --git a/bench/push_fixture.py b/bench/push_fixture.py new file mode 100644 index 00000000..b2d94962 --- /dev/null +++ b/bench/push_fixture.py @@ -0,0 +1,132 @@ +"""Build a fixture and push its inputs to S3. + +Resolves polymorphic --db specs into concatenated FASTAs, uploads them and +the raw .d directory, builds the speclib via speclib_build_cli, and writes +the fixture TOML to bench/fixtures/.toml. +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from loguru import logger + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument( + "--name", required=True, help="Fixture name (used as filename + S3 subdir)" + ) + p.add_argument("--bucket", required=True, help="S3 bucket") + p.add_argument("--prefix", required=True, help="S3 prefix under the bucket") + + p.add_argument( + "--db", + action="append", + default=[], + required=True, + metavar="SPEC", + help=( + "Target FASTA source (repeatable). 
Accepts: path/to/*.fasta(.gz)," + " path/to/ids.txt, s3://..., UPxxxxxxxxx, accession" + ), + ) + p.add_argument( + "--raw", required=True, help="Raw .d / .idx (local dir or s3://...)" + ) + p.add_argument( + "--config", required=True, help="Local timsseek config TOML to embed" + ) + + p.add_argument( + "--entrap-db", + action="append", + default=[], + metavar="SPEC", + help="Entrapment FASTA source (repeatable)", + ) + p.add_argument( + "--calib-db", + action="append", + default=[], + metavar="SPEC", + help="Calibration FASTA source (repeatable)", + ) + + p.add_argument( + "--speclib", + help="If set, skip main speclib build and reference this URI", + ) + p.add_argument( + "--calibration-speclib", + help="If set, skip calib speclib build and reference this URI", + ) + + p.add_argument("--koina-url", help="Koina URL passed to speclib_build_cli") + + p.add_argument( + "--dry-run", action="store_true", help="Print the resolved plan and exit" + ) + p.add_argument( + "--overwrite", action="store_true", help="Overwrite existing fixture TOML" + ) + return p.parse_args(argv) + + +def build_fixture_toml( + name: str, + description: str, + config_path: Path, + fasta_uri: str, + speclib_uri: str, + raw_uri: str, + entrapment_fasta_uri: str | None, + calibration_speclib_uri: str | None, +) -> str: + """Emit a fixture TOML body as a string. 
The body is valid against + bench._fixture_schema.Fixture.""" + lines: list[str] = [] + lines.append(f'name = "{name}"') + desc = description.replace('"', '\\"') + lines.append(f'description = "{desc}"') + lines.append("") + lines.append("[inputs]") + lines.append(f'fasta = "{fasta_uri}"') + lines.append(f'speclib = "{speclib_uri}"') + lines.append(f'raw = "{raw_uri}"') + if entrapment_fasta_uri is not None: + lines.append(f'entrapment_fasta = "{entrapment_fasta_uri}"') + if calibration_speclib_uri is not None: + lines.append(f'calibration_speclib = "{calibration_speclib_uri}"') + lines.append("") + config_text = config_path.read_text().strip() + lines.append("# === embedded timsseek config ===") + # Re-emit each non-empty top-level section under `[config.
]`. The + # simplest approach: rewrite section headers `[X]` → `[config.X]` in the + # source. Sub-sections `[X.Y]` → `[config.X.Y]`. This preserves comments + # and ordering of the user's config. + for raw_line in config_text.splitlines(): + is_section = ( + raw_line.startswith("[") + and raw_line.rstrip().endswith("]") + and not raw_line.startswith("[[") + ) + if is_section: + inner = raw_line.strip()[1:-1] + lines.append(f"[config.{inner}]") + else: + lines.append(raw_line) + lines.append("") + return "\n".join(lines) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + logger.info("push_fixture.py invoked: {}", vars(args)) + raise NotImplementedError("upload + build pipeline lands in Task 10") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bench/tests/test_push_fixture.py b/bench/tests/test_push_fixture.py new file mode 100644 index 00000000..a246fadb --- /dev/null +++ b/bench/tests/test_push_fixture.py @@ -0,0 +1,95 @@ +import textwrap +from pathlib import Path + +from bench.push_fixture import build_fixture_toml, parse_args + + +def test_parse_args_minimal(): + args = parse_args( + [ + "--name", "hela", + "--bucket", "bk", + "--prefix", "fx", + "--db", "UP000005640", + "--raw", "/tmp/sample.d", + "--config", "/tmp/cfg.toml", + ] + ) + assert args.name == "hela" + assert args.bucket == "bk" + assert args.prefix == "fx" + assert args.db == ["UP000005640"] + assert args.entrap_db == [] + assert args.calib_db == [] + assert args.dry_run is False + assert args.overwrite is False + + +def test_parse_args_multiple_db_and_entrap(): + args = parse_args( + [ + "--name", "hy", + "--bucket", "bk", + "--prefix", "fx", + "--db", "UP000005640", + "--db", "/tmp/extra.fasta", + "--entrap-db", "UP000002311", + "--raw", "/tmp/sample.d", + "--config", "/tmp/cfg.toml", + "--dry-run", + ] + ) + assert args.db == ["UP000005640", "/tmp/extra.fasta"] + assert args.entrap_db == ["UP000002311"] + assert args.dry_run is True + + +def 
test_build_fixture_toml(tmp_path: Path): + cfg = tmp_path / "cfg.toml" + cfg.write_text( + textwrap.dedent( + """ + [analysis] + chunk_size = 20000 + """ + ).strip() + ) + out = build_fixture_toml( + name="hela", + description="200ng HeLa", + config_path=cfg, + fasta_uri="s3://bk/fx/hela/proteome.fasta", + speclib_uri="s3://bk/fx/hela/lib.msgpack.zst", + raw_uri="s3://bk/fx/hela/sample.d", + entrapment_fasta_uri=None, + calibration_speclib_uri=None, + ) + # Round-trip via the schema loader to verify validity + target_path = tmp_path / "fx.toml" + target_path.write_text(out) + from bench._fixture_schema import load_fixture + fx = load_fixture(target_path) + assert fx.name == "hela" + assert fx.inputs.entrapment_fasta is None + assert fx.config["analysis"]["chunk_size"] == 20000 + + +def test_build_fixture_toml_with_entrap_and_calib(tmp_path: Path): + cfg = tmp_path / "cfg.toml" + cfg.write_text("[analysis]\nchunk_size = 1\n") + out = build_fixture_toml( + name="x", + description="x", + config_path=cfg, + fasta_uri="s3://b/x/proteome.fasta", + speclib_uri="s3://b/x/lib.msgpack.zst", + raw_uri="s3://b/x/sample.d", + entrapment_fasta_uri="s3://b/x/entrap.fasta", + calibration_speclib_uri="s3://b/x/calib.msgpack.zst", + ) + p = tmp_path / "fx.toml" + p.write_text(out) + from bench._fixture_schema import load_fixture + fx = load_fixture(p) + assert fx.has_entrapment() + assert fx.has_calibration_speclib() From 5118703babbc9b99a992065416a6d93a7bbfd0e8 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 20:52:42 -0700 Subject: [PATCH 14/41] feat(bench): push_fixture.py upload + speclib build pipeline --- bench/push_fixture.py | 222 ++++++++++++++++++++++++------- bench/tests/test_push_fixture.py | 188 ++++++++++++++++++++++++++ 2 files changed, 359 insertions(+), 51 deletions(-) diff --git a/bench/push_fixture.py b/bench/push_fixture.py index b2d94962..26338f55 100644 --- a/bench/push_fixture.py +++ b/bench/push_fixture.py @@ -8,11 +8,16 @@ from __future__ import annotations import argparse +import subprocess import sys +import tempfile from pathlib import Path from loguru import logger +from bench._db_resolver import resolve_dbs +from bench._s3 import s3_upload_dir, s3_upload_file + def parse_args(argv: list[str] | None = None) -> argparse.Namespace: p = argparse.ArgumentParser(description=__doc__) @@ -21,58 +26,50 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: ) p.add_argument("--bucket", required=True, help="S3 bucket") p.add_argument("--prefix", required=True, help="S3 prefix under the bucket") - - p.add_argument( - "--db", - action="append", - default=[], - required=True, - metavar="SPEC", - help=( - "Target FASTA source (repeatable). 
Accepts: path/to/*.fasta(.gz)," - " path/to/ids.txt, s3://..., UPxxxxxxxxx, accession" - ), - ) p.add_argument( - "--raw", required=True, help="Raw .d / .idx (local dir or s3://...)" + "--db", action="append", default=[], required=True, metavar="SPEC", + help="Target FASTA source (repeatable)", ) + p.add_argument("--raw", required=True, help="Raw .d / .idx (local dir or s3://...)") p.add_argument( "--config", required=True, help="Local timsseek config TOML to embed" ) - - p.add_argument( - "--entrap-db", - action="append", - default=[], - metavar="SPEC", - help="Entrapment FASTA source (repeatable)", - ) + p.add_argument("--entrap-db", action="append", default=[], metavar="SPEC") + p.add_argument("--calib-db", action="append", default=[], metavar="SPEC") p.add_argument( - "--calib-db", - action="append", - default=[], - metavar="SPEC", - help="Calibration FASTA source (repeatable)", - ) - - p.add_argument( - "--speclib", - help="If set, skip main speclib build and reference this URI", + "--speclib", dest="speclib_uri", + help="Skip main speclib build, reference this URI", ) p.add_argument( "--calibration-speclib", - help="If set, skip calib speclib build and reference this URI", + dest="calibration_speclib_uri", + help="Skip calib speclib build, reference this URI", ) + p.add_argument("--koina-url") + p.add_argument("--dry-run", action="store_true") + p.add_argument("--overwrite", action="store_true") + return p.parse_args(argv) - p.add_argument("--koina-url", help="Koina URL passed to speclib_build_cli") - p.add_argument( - "--dry-run", action="store_true", help="Print the resolved plan and exit" - ) - p.add_argument( - "--overwrite", action="store_true", help="Overwrite existing fixture TOML" - ) - return p.parse_args(argv) +def run_speclib_build( + fasta_s3: str, + speclib_s3: str, + koina_url: str | None, +) -> None: + cmd = [ + "cargo", "run", "--release", + "-p", "speclib_build_cli", "--", + "--fasta", fasta_s3, + "--fixed-mod", "C[U:4]", + "--max-ions", 
"10", + "-o", speclib_s3, + ] + if koina_url: + cmd.extend(["--koina-url", koina_url]) + else: + cmd.extend(["--request-delay-ms", "500"]) + logger.info("$ {}", " ".join(cmd)) + subprocess.run(cmd, check=True) def build_fixture_toml( @@ -85,8 +82,6 @@ def build_fixture_toml( entrapment_fasta_uri: str | None, calibration_speclib_uri: str | None, ) -> str: - """Emit a fixture TOML body as a string. The body is valid against - bench._fixture_schema.Fixture.""" lines: list[str] = [] lines.append(f'name = "{name}"') desc = description.replace('"', '\\"') @@ -101,19 +96,15 @@ def build_fixture_toml( if calibration_speclib_uri is not None: lines.append(f'calibration_speclib = "{calibration_speclib_uri}"') lines.append("") - config_text = config_path.read_text().strip() lines.append("# === embedded timsseek config ===") - # Re-emit each non-empty top-level section under `[config.
]`. The - # simplest approach: rewrite section headers `[X]` → `[config.X]` in the - # source. Sub-sections `[X.Y]` → `[config.X.Y]`. This preserves comments - # and ordering of the user's config. + config_text = config_path.read_text().strip() for raw_line in config_text.splitlines(): - is_section = ( + is_section_header = ( raw_line.startswith("[") and raw_line.rstrip().endswith("]") and not raw_line.startswith("[[") ) - if is_section: + if is_section_header: inner = raw_line.strip()[1:-1] lines.append(f"[config.{inner}]") else: @@ -122,10 +113,139 @@ def build_fixture_toml( return "\n".join(lines) +def _resolve_and_upload_fasta( + specs: list[str], + s3_dest: str, + label: str, + workdir: Path, +) -> None: + local = workdir / f"{label}.fasta" + resolve_dbs(specs, local) + s3_upload_file(str(local), s3_dest) + + +def run_pipeline( + *, + name: str, + bucket: str, + prefix: str, + db: list[str], + raw: str, + config: str, + entrap_db: list[str], + calib_db: list[str], + speclib_uri: str | None, + calibration_speclib_uri: str | None, + koina_url: str | None, + fixture_target: Path, + overwrite: bool, + dry_run: bool, +) -> None: + """Execute the full upload + build + write-toml flow.""" + dest_prefix = f"s3://{bucket}/{prefix.rstrip('/')}/{name}" + target_fasta_uri = f"{dest_prefix}/proteome.fasta" + entrap_fasta_uri = f"{dest_prefix}/entrap.fasta" if entrap_db else None + calib_fasta_uri = f"{dest_prefix}/calib.fasta" if calib_db else None + + main_speclib_uri = speclib_uri or f"{dest_prefix}/lib.msgpack.zst" + final_calib_speclib_uri: str | None = calibration_speclib_uri + if final_calib_speclib_uri is None and calib_db: + final_calib_speclib_uri = f"{dest_prefix}/calib_lib.msgpack.zst" + + # Raw is either a local dir we upload or an existing s3 URI we just reference + if raw.startswith("s3://"): + raw_uri = raw + else: + raw_uri = f"{dest_prefix}/sample.d" + + if fixture_target.exists() and not overwrite and not dry_run: + raise FileExistsError( + f"fixture TOML 
already exists: {fixture_target}" + " (pass --overwrite to replace)" + ) + + plan = { + "name": name, + "dest_prefix": dest_prefix, + "target_fasta_uri": target_fasta_uri, + "entrap_fasta_uri": entrap_fasta_uri, + "calib_fasta_uri": calib_fasta_uri, + "raw_uri": raw_uri, + "main_speclib_uri": main_speclib_uri, + "calib_speclib_uri": final_calib_speclib_uri, + "build_main_speclib": speclib_uri is None, + "build_calib_speclib": (calib_db != [] and calibration_speclib_uri is None), + } + logger.info("plan: {}", plan) + if dry_run: + logger.info("--dry-run: stopping before any side effects") + return + + with tempfile.TemporaryDirectory() as td: + workdir = Path(td) + + # 1. Resolve and upload target FASTA + _resolve_and_upload_fasta(db, target_fasta_uri, "proteome", workdir) + + # 2. Optional entrapment FASTA + if entrap_db: + assert entrap_fasta_uri is not None + _resolve_and_upload_fasta(entrap_db, entrap_fasta_uri, "entrap", workdir) + + # 3. Optional calibration FASTA + if calib_db: + assert calib_fasta_uri is not None + _resolve_and_upload_fasta(calib_db, calib_fasta_uri, "calib", workdir) + + # 4. Upload raw dir if local + if not raw.startswith("s3://"): + s3_upload_dir(raw, raw_uri) + + # 5. Build speclib(s) if not user-provided + if speclib_uri is None: + run_speclib_build(target_fasta_uri, main_speclib_uri, koina_url) + if calib_db and calibration_speclib_uri is None: + assert calib_fasta_uri is not None + assert final_calib_speclib_uri is not None + run_speclib_build(calib_fasta_uri, final_calib_speclib_uri, koina_url) + + # 6. 
Emit fixture TOML + body = build_fixture_toml( + name=name, + description="", + config_path=Path(config), + fasta_uri=target_fasta_uri, + speclib_uri=main_speclib_uri, + raw_uri=raw_uri, + entrapment_fasta_uri=entrap_fasta_uri, + calibration_speclib_uri=final_calib_speclib_uri, + ) + fixture_target.parent.mkdir(parents=True, exist_ok=True) + fixture_target.write_text(body) + logger.info("wrote fixture: {}", fixture_target) + logger.info("remember to: git add {}", fixture_target) + + def main(argv: list[str] | None = None) -> int: args = parse_args(argv) - logger.info("push_fixture.py invoked: {}", vars(args)) - raise NotImplementedError("upload + build pipeline lands in Task 10") + fixture_target = Path("bench/fixtures") / f"{args.name}.toml" + run_pipeline( + name=args.name, + bucket=args.bucket, + prefix=args.prefix, + db=args.db, + raw=args.raw, + config=args.config, + entrap_db=args.entrap_db, + calib_db=args.calib_db, + speclib_uri=args.speclib_uri, + calibration_speclib_uri=args.calibration_speclib_uri, + koina_url=args.koina_url, + fixture_target=fixture_target, + overwrite=args.overwrite, + dry_run=args.dry_run, + ) + return 0 if __name__ == "__main__": diff --git a/bench/tests/test_push_fixture.py b/bench/tests/test_push_fixture.py index a246fadb..f556474c 100644 --- a/bench/tests/test_push_fixture.py +++ b/bench/tests/test_push_fixture.py @@ -1,5 +1,8 @@ import textwrap from pathlib import Path +from unittest.mock import ANY, patch + +import pytest from bench.push_fixture import build_fixture_toml, parse_args @@ -93,3 +96,188 @@ def test_build_fixture_toml_with_entrap_and_calib(tmp_path: Path): fx = load_fixture(p) assert fx.has_entrapment() assert fx.has_calibration_speclib() + + +def _common_args(tmp_path): + cfg = tmp_path / "cfg.toml" + cfg.write_text("[analysis]\nchunk_size = 20000\n") + raw = tmp_path / "sample.d" + raw.mkdir() + (raw / "metadata").write_bytes(b"x") + return cfg, raw + + +@pytest.fixture +def fake_runtime(tmp_path): + """Patch S3 + 
speclib_build_cli + resolve_dbs for the run_pipeline tests.""" + fx_dir = tmp_path / "bench_fixtures" + fx_dir.mkdir() + + with ( + patch("bench.push_fixture.s3_upload_file") as up_file, + patch("bench.push_fixture.s3_upload_dir") as up_dir, + patch("bench.push_fixture.run_speclib_build") as build, + patch("bench.push_fixture.resolve_dbs") as res, + ): + # resolve_dbs writes a stub fasta to the requested output path + def _resolve(specs, out): + out.write_text(">x\nMK\n") + + res.side_effect = _resolve + + yield { + "up_file": up_file, + "up_dir": up_dir, + "build": build, + "res": res, + "fx_dir": fx_dir, + } + + +def test_run_pipeline_minimal(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + from bench.push_fixture import run_pipeline + + output_toml = fake_runtime["fx_dir"] / "hela.toml" + run_pipeline( + name="hela", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=[], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + ) + + # Resolved + uploaded the target fasta + assert fake_runtime["res"].call_count == 1 + fake_runtime["up_file"].assert_any_call(ANY, "s3://bk/fx/hela/proteome.fasta") + # Uploaded the raw directory + fake_runtime["up_dir"].assert_called_once() + args = fake_runtime["up_dir"].call_args.args + assert args[0] == str(raw) + assert args[1] == "s3://bk/fx/hela/sample.d" + # Built the speclib + fake_runtime["build"].assert_called_once() + # Wrote fixture TOML + assert output_toml.exists() + body = output_toml.read_text() + assert "s3://bk/fx/hela/lib.msgpack.zst" in body + + +def test_run_pipeline_skip_build_when_speclib_provided(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + from bench.push_fixture import run_pipeline + + output_toml = fake_runtime["fx_dir"] / "hela.toml" + run_pipeline( + name="hela", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + 
config=str(cfg), + entrap_db=[], + calib_db=[], + speclib_uri="s3://other/lib.msgpack.zst", + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + ) + # No speclib build + fake_runtime["build"].assert_not_called() + # Fixture TOML references the user-provided URI + assert "s3://other/lib.msgpack.zst" in output_toml.read_text() + + +def test_run_pipeline_with_entrap_and_calib_db(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + from bench.push_fixture import run_pipeline + + output_toml = fake_runtime["fx_dir"] / "x.toml" + run_pipeline( + name="x", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=["UP000002311"], + calib_db=["P12345"], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + ) + # resolve_dbs called for db, entrap_db, calib_db = 3 times + assert fake_runtime["res"].call_count == 3 + # speclib_build called twice: main + calibration + assert fake_runtime["build"].call_count == 2 + body = output_toml.read_text() + assert "entrapment_fasta" in body and "calibration_speclib" in body + + +def test_run_pipeline_refuses_overwrite(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + from bench.push_fixture import run_pipeline + + target = fake_runtime["fx_dir"] / "hela.toml" + target.write_text("# pre-existing") + + with pytest.raises(FileExistsError): + run_pipeline( + name="hela", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=[], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=target, + overwrite=False, + dry_run=False, + ) + + +def test_run_pipeline_dry_run(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + from bench.push_fixture import run_pipeline + + target = fake_runtime["fx_dir"] / "hela.toml" + run_pipeline( + 
name="hela", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=[], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=target, + overwrite=False, + dry_run=True, + ) + # No side effects + fake_runtime["res"].assert_not_called() + fake_runtime["up_file"].assert_not_called() + fake_runtime["up_dir"].assert_not_called() + fake_runtime["build"].assert_not_called() + assert not target.exists() From 108e720469d726064120fdaf1aa5a8f53a142c31 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 20:55:37 -0700 Subject: [PATCH 15/41] feat(bench): rewrite bench runner skeleton with fixture selection --- bench/tests/test_wandb_bench.py | 79 ++++++++ bench/wandb_bench.py | 318 ++++++-------------------------- 2 files changed, 139 insertions(+), 258 deletions(-) create mode 100644 bench/tests/test_wandb_bench.py diff --git a/bench/tests/test_wandb_bench.py b/bench/tests/test_wandb_bench.py new file mode 100644 index 00000000..e8355841 --- /dev/null +++ b/bench/tests/test_wandb_bench.py @@ -0,0 +1,79 @@ +import textwrap +from pathlib import Path + +import pytest + + +def _write_fx(dir: Path, name: str) -> Path: + p = dir / f"{name}.toml" + p.write_text( + textwrap.dedent( + f""" + name = "{name}" + description = "x" + + [inputs] + fasta = "s3://b/p.fasta" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """ + ).strip() + ) + return p + + +def test_select_positional(tmp_path): + _write_fx(tmp_path, "hela") + _write_fx(tmp_path, "yeast") + from bench.wandb_bench import select_fixtures + out = select_fixtures(["hela"], all_=False, match=None, fixtures_dir=tmp_path) + assert [f.name for f in out] == ["hela"] + + +def test_select_all(tmp_path): + _write_fx(tmp_path, "a") + _write_fx(tmp_path, "b") + from bench.wandb_bench import select_fixtures + out = select_fixtures([], all_=True, match=None, 
fixtures_dir=tmp_path) + assert sorted(f.name for f in out) == ["a", "b"] + + +def test_select_match_glob(tmp_path): + _write_fx(tmp_path, "hela_a") + _write_fx(tmp_path, "hela_b") + _write_fx(tmp_path, "yeast_c") + from bench.wandb_bench import select_fixtures + out = select_fixtures([], all_=False, match="hela*", fixtures_dir=tmp_path) + assert sorted(f.name for f in out) == ["hela_a", "hela_b"] + + +def test_select_unknown_name_errors(tmp_path): + _write_fx(tmp_path, "hela") + from bench.wandb_bench import select_fixtures + with pytest.raises(SystemExit) as exc: + select_fixtures(["nope"], all_=False, match=None, fixtures_dir=tmp_path) + assert "nope" in str(exc.value) + + +def test_select_combinations_error(tmp_path): + _write_fx(tmp_path, "hela") + from bench.wandb_bench import select_fixtures + with pytest.raises(SystemExit): + select_fixtures(["hela"], all_=True, match=None, fixtures_dir=tmp_path) + with pytest.raises(SystemExit): + select_fixtures(["hela"], all_=False, match="*", fixtures_dir=tmp_path) + with pytest.raises(SystemExit): + select_fixtures([], all_=True, match="*", fixtures_dir=tmp_path) + + +def test_select_no_args_lists_available(tmp_path, capsys): + _write_fx(tmp_path, "hela") + _write_fx(tmp_path, "yeast") + from bench.wandb_bench import select_fixtures + with pytest.raises(SystemExit): + select_fixtures([], all_=False, match=None, fixtures_dir=tmp_path) + err = capsys.readouterr().err + assert "hela" in err and "yeast" in err diff --git a/bench/wandb_bench.py b/bench/wandb_bench.py index cf445c4b..2687c357 100644 --- a/bench/wandb_bench.py +++ b/bench/wandb_bench.py @@ -1,276 +1,78 @@ -# /// script -# requires-python = ">=3.12" -# dependencies = [ -# "wandb[media]", -# "loguru", -# ] -# /// +"""Bench runner: load named fixtures, run timsseek, log to wandb.""" + +from __future__ import annotations import argparse -import json -import subprocess -import tempfile -from contextlib import contextmanager -from dataclasses import dataclass 
+import fnmatch +import sys from pathlib import Path -from typing import Any from loguru import logger -import wandb +from bench._fixture_schema import Fixture, load_fixture +DEFAULT_FIXTURES_DIR = Path("bench/fixtures") ENTITY = "jspaezp" PROJECT = "timsseek" -@dataclass -class TimsseekRunner: - fasta_file_location: Path - speclib_location: Path - raw_file_location: Path - config_dict: dict[str, Any] | None = None - koina_url: str | None = None - - def build_speclib(self): - if self.speclib_location.exists(): - logger.info("Skipping speclib build bc already exists") - return - - logger.info("Building speclib") - args = [ - "cargo", - "run", - "--release", - "-p", - "speclib_build_cli", - "--", - "--fasta", - str(self.fasta_file_location), - "--fixed-mod", - "C[U:4]", - "--max-ions", - "10", - "-o", - str(self.speclib_location), - ] - if self.koina_url: - args.extend(["--koina-url", self.koina_url]) - else: - # Public Koina: use delay to avoid rate limiting - args.extend(["--request-delay-ms", "500"]) - res = subprocess.run(args, check=True) - return res - - def setup_run(self): - if self.config_dict is None: - self.config_dict = self.default_timsseek_config() - - logger.info("Building release versions") - subprocess.run(["cargo", "b", "--release"], check=True) - - def loggable_config_dict(self) -> dict[str, Any]: - out = {} - out.update(self.config_dict) - out["raw_file"] = self.raw_file_location.name - out["speclib"] = self.speclib_location.name - return out - - def run( - self, - output_loc: Path | None = None, - wandb_kwargs: dict | None = None, - ): - self.setup_run() - - with tempfile.TemporaryDirectory() as temp_dir: - tmpdir = Path(temp_dir) - if output_loc is None: - output_loc = tmpdir - - results_path = output_loc / "res" - summary_dir = output_loc / "summ" - results_path.mkdir(exist_ok=True, parents=True) - summary_dir.mkdir(exist_ok=True, parents=True) - config_path = results_path / "config.json" - - with open(config_path, "w") as f: - 
f.write(json.dumps(self.config_dict)) - - with wandb_context( - self.loggable_config_dict(), - wandb_kwargs=wandb_kwargs, - ) as wandb_experiment: - self._run( - config_path=config_path, - speclib_path=self.speclib_location, - output_path=results_path, - raw_file=self.raw_file_location, - ) - # Results are now in a subdirectory named after the raw file - raw_file_stem = self.raw_file_location.stem - self.log_results(wandb_experiment, output_loc, raw_file_stem) - - @staticmethod - def _run(config_path, speclib_path, output_path, raw_file): - if not raw_file.exists(): - raise FileNotFoundError(f"Raw file {raw_file} does not exist") - if not speclib_path.exists(): - raise FileNotFoundError(f"Speclib file {speclib_path} does not exist") - if not config_path.exists(): - raise FileNotFoundError(f"Config file {config_path} does not exist") - - logger.info(f"Running timsseek on {raw_file.name}") - args = [ - "cargo", - "run", - "--release", - "--bin", - "timsseek", - "--", - "--overwrite", - "--config", - str(config_path), - "--speclib-file", - str(speclib_path), - "--output-dir", - str(output_path), - "--dotd-files", - str(raw_file), - ] - logger.info(f"Running command: {' '.join(args)}") - stdout_file = output_path / "timsseek_stdout.log" - stderr_file = output_path / "timsseek_stderr.log" - - logger.info(f"Starting timsseek, logging to {stdout_file} and {stderr_file}") - try: - res = subprocess.run( - args, - stdout=open(stdout_file, "w"), - stderr=open(stderr_file, "w"), - check=True, - ) - finally: - # Log stdout and stderr - logger.info(stdout_file.read_text()) - logger.error(stderr_file.read_text()) - logger.info(f"Timsseek completed with return code {res.returncode}") - return res +def _list_fixtures(fixtures_dir: Path) -> list[str]: + return sorted(p.stem for p in fixtures_dir.glob("*.toml")) - def log_results(self, wandb_experiment, results_loc, raw_file_stem): - metrics = self.crunch_metrics(results_loc, raw_file_stem) - with open("latest_metrics.json", "w") as 
f: - serializable_metrics = { - k: v - for k, v in metrics.items() - if isinstance(v, (int, float, str, bool, list, dict)) - } - assert serializable_metrics - json.dump(serializable_metrics, f, indent=4) - wandb_experiment.log(metrics) - @staticmethod - def default_timsseek_config(): - config = { - "analysis": { - "chunk_size": 20000, - "tolerance": { - "ms": {"ppm": [15.0, 15.0]}, - "mobility": {"percent": [10.0, 10.0]}, - "quad": {"absolute": [0.1, 0.1]}, - }, - } - } - return config - - @staticmethod - def crunch_metrics(output_dir: Path, raw_file_stem: str) -> dict[str, Any]: - metrics = {} - performance_report_path = ( - output_dir / "res" / raw_file_stem / "performance_report.json" +def select_fixtures( + names: list[str], + all_: bool, + match: str | None, + fixtures_dir: Path = DEFAULT_FIXTURES_DIR, +) -> list[Fixture]: + """Resolve CLI selection flags into a list of loaded fixtures.""" + selectors = sum([bool(names), all_, match is not None]) + if selectors == 0: + avail = _list_fixtures(fixtures_dir) + sys.stderr.write( + "no fixture selected. available: " + ", ".join(avail or ["(none)"]) + "\n" ) - if performance_report_path.exists(): - with open(performance_report_path, "r") as f: - metrics.update(json.load(f)) - else: - logger.warning( - f"Performance report {performance_report_path} does not exist" - ) - return metrics - - -@contextmanager -def wandb_context(config_dict: dict[str, Any], wandb_kwargs=None): - if wandb_kwargs is None: - wandb_kwargs = {} - # Start a new wandb run to track this script. - run = wandb.init( - # Set the wandb entity where your project will be logged (generally your team name). - entity=ENTITY, - # Set the wandb project where this run will be logged. - project=PROJECT, - # Track hyperparameters and run metadata. 
- config=config_dict, - **wandb_kwargs, - ) - try: - yield run - except KeyboardInterrupt as e: - logger.warning("Keyboard interrupt, finishing wandb run") - run.finish(1) - raise e - finally: - run.finish() - - -def main(wandb_kwargs: dict | None = None, koina_url: str | None = None): - - fasta_file = Path.home() / "fasta/hela_gt20peps.fasta" - speclib_path = Path.home() / "fasta/asdad.msgpack.zstd" - - prefix = Path.home() / "data/decompressed_timstof/" - dotd_files = [ - # prefix / "MSR28858_EXP80_Plate3_G08_DMSO_DIA_S5-G8_1_7079.d", - # prefix / "MSR28893_EXP80_Plate4_B07_DMSO_DIA_S6-B7_1_7115.d", - # prefix / "250225_Desnaux_200ng_Hela_ICC_on_DIA.d", - prefix / "250225_Desnaux_200ng_Hela_ICC_off_DIA.d", - ] - - for file in dotd_files: - runner = TimsseekRunner( - fasta_file_location=fasta_file, - speclib_location=speclib_path, - raw_file_location=file, - koina_url=koina_url, + raise SystemExit(2) + if selectors > 1: + sys.stderr.write( + "--all, --match, and positional names are mutually exclusive\n" ) - runner.build_speclib() - runner.run(wandb_kwargs=wandb_kwargs) - - -def build_parser(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--notes", - type=str, - help="The notes to add to the wandb run", - ) - parser.add_argument( - "--koina-url", - type=str, - default=None, - help="Koina server URL (e.g. 
http://localhost:8501/v2/models for local)", - ) - return parser + raise SystemExit(2) + + if all_: + chosen = _list_fixtures(fixtures_dir) + elif match is not None: + chosen = [n for n in _list_fixtures(fixtures_dir) if fnmatch.fnmatch(n, match)] + else: + chosen = list(names) + + out: list[Fixture] = [] + avail = set(_list_fixtures(fixtures_dir)) + for n in chosen: + if n not in avail: + raise SystemExit(f"fixture not found: {n!r} (in {fixtures_dir})") + out.append(load_fixture(fixtures_dir / f"{n}.toml")) + return out + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("fixtures", nargs="*", help="Fixture names to run") + p.add_argument("--all", dest="all_", action="store_true") + p.add_argument("--match", help="Glob pattern over fixture names") + p.add_argument("--notes", help="Free-form note added to wandb run") + p.add_argument("--dry-run", action="store_true", help="Print plan and stop") + return p.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + fixtures = select_fixtures(args.fixtures, args.all_, args.match) + for fx in fixtures: + logger.info("would run fixture: {}", fx.name) + raise NotImplementedError("timsseek invocation lands in Task 12") if __name__ == "__main__": - parser = build_parser() - args, unkargs = parser.parse_known_args() - if unkargs: - raise ValueError(f"Unknown arguments: {unkargs}") - - wandb_kwargs = {} - - if args.notes is not None: - wandb_kwargs["notes"] = args.notes - - main(wandb_kwargs=wandb_kwargs, koina_url=args.koina_url) + sys.exit(main()) From b85310890ec54afe07a4fbbca1d26a501dd51ea6 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 21:01:53 -0700 Subject: [PATCH 16/41] feat(bench): timsseek invocation + wandb logging + entrapment hook --- bench/tests/test_wandb_bench.py | 127 +++++++++++++++++++++++++ bench/wandb_bench.py | 159 +++++++++++++++++++++++++++++++- 2 files changed, 282 insertions(+), 4 deletions(-) diff --git a/bench/tests/test_wandb_bench.py b/bench/tests/test_wandb_bench.py index e8355841..8878d6fa 100644 --- a/bench/tests/test_wandb_bench.py +++ b/bench/tests/test_wandb_bench.py @@ -1,8 +1,12 @@ +import json import textwrap from pathlib import Path +from unittest.mock import MagicMock, patch import pytest +from bench._fixture_schema import load_fixture + def _write_fx(dir: Path, name: str) -> Path: p = dir / f"{name}.toml" @@ -77,3 +81,126 @@ def test_select_no_args_lists_available(tmp_path, capsys): select_fixtures([], all_=False, match=None, fixtures_dir=tmp_path) err = capsys.readouterr().err assert "hela" in err and "yeast" in err + + +def _write_perf_report(out_dir: Path, raw_stem: str, payload: dict) -> None: + sub = out_dir / "res" / raw_stem + sub.mkdir(parents=True, exist_ok=True) + (sub / "performance_report.json").write_text(json.dumps(payload)) + + +@pytest.fixture +def fake_wandb(): + with patch("bench.wandb_bench.wandb") as w: + run = MagicMock() + w.init.return_value = run + yield {"wandb": w, "run": run} + + +def test_run_one_fixture_logs_perf_report(tmp_path, fake_wandb): + """timsseek subprocess is mocked; we drop a fake performance_report.json + where the runner expects it, then assert wandb.run.log got the payload.""" + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + fixture = _write_fx(fx_dir, "hela") + out_root = tmp_path / "out" + + def fake_subprocess(cmd, *a, **kw): + # The runner passes --output-dir /res; locate it + idx = cmd.index("--output-dir") + out = Path(cmd[idx + 1]) + # Raw file basename is also passed via --dotd-files + raw_idx = cmd.index("--dotd-files") + raw_stem = Path(cmd[raw_idx + 1]).stem + 
_write_perf_report(out, raw_stem, {"runtime_s": 12.3, "n_targets_q01": 1000}) + return MagicMock(returncode=0) + + from bench.wandb_bench import run_one + with patch("bench.wandb_bench.subprocess.run", side_effect=fake_subprocess): + run_one( + load_fixture(fixture), + out_root=out_root, + notes="hi", + dry_run=False, + ) + + fake_wandb["wandb"].init.assert_called_once() + # log should have been invoked with the payload that came from the json file + fake_wandb["run"].log.assert_called() + logged = fake_wandb["run"].log.call_args.args[0] + assert logged["runtime_s"] == 12.3 + assert logged["n_targets_q01"] == 1000 + + +def test_run_one_fixture_runs_entrapment_when_field_present(tmp_path, fake_wandb): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + p = fx_dir / "hy.toml" + p.write_text( + textwrap.dedent( + """ + name = "hy" + description = "x" + + [inputs] + fasta = "s3://b/p.fasta" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + entrapment_fasta = "s3://b/entrap.fasta" + + [config.analysis] + chunk_size = 20000 + """ + ).strip() + ) + out_root = tmp_path / "out" + + def fake_subprocess(cmd, *a, **kw): + idx = cmd.index("--output-dir") + out = Path(cmd[idx + 1]) + raw_idx = cmd.index("--dotd-files") + raw_stem = Path(cmd[raw_idx + 1]).stem + _write_perf_report(out, raw_stem, {"runtime_s": 1.0}) + # Also drop a results.parquet so analyse() can read it + import polars as pl + pl.DataFrame({"sequence": ["MK"], "qvalue": [0.001]}).write_parquet( + out / "res" / raw_stem / "results.parquet" + ) + return MagicMock(returncode=0) + + from bench.wandb_bench import run_one + with ( + patch("bench.wandb_bench.subprocess.run", side_effect=fake_subprocess), + patch("bench.wandb_bench.analyse") as analyse_mock, + patch("bench.wandb_bench.s3_download_file") as s3_dl, + ): + # entrapment.analyse returns scalars; s3_download fetches the two fastas locally + analyse_mock.return_value = {"entrap/empirical_fdr_at_q01": 0.012} + + def _dl(uri, dst): + 
Path(dst).write_text(">x\nMK\n") + s3_dl.side_effect = _dl + + run_one(load_fixture(p), out_root=out_root, notes=None, dry_run=False) + + analyse_mock.assert_called_once() + # wandb.run.log should have been called with the entrapment scalars at some point + log_payloads = [c.args[0] for c in fake_wandb["run"].log.call_args_list] + assert any("entrap/empirical_fdr_at_q01" in p for p in log_payloads) + + +def test_run_one_dry_run_no_subprocess(tmp_path, fake_wandb): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + fixture = _write_fx(fx_dir, "hela") + + from bench.wandb_bench import run_one + with patch("bench.wandb_bench.subprocess.run") as sp: + run_one( + load_fixture(fixture), + out_root=tmp_path / "out", + notes=None, + dry_run=True, + ) + sp.assert_not_called() + fake_wandb["wandb"].init.assert_not_called() diff --git a/bench/wandb_bench.py b/bench/wandb_bench.py index 2687c357..85097404 100644 --- a/bench/wandb_bench.py +++ b/bench/wandb_bench.py @@ -4,14 +4,23 @@ import argparse import fnmatch +import json +import os +import subprocess import sys +import tempfile +from datetime import datetime from pathlib import Path +import wandb from loguru import logger from bench._fixture_schema import Fixture, load_fixture +from bench._s3 import s3_download_file +from bench.entrapment import analyse DEFAULT_FIXTURES_DIR = Path("bench/fixtures") +DEFAULT_OUT_ROOT = Path("bench_out") ENTITY = "jspaezp" PROJECT = "timsseek" @@ -26,7 +35,6 @@ def select_fixtures( match: str | None, fixtures_dir: Path = DEFAULT_FIXTURES_DIR, ) -> list[Fixture]: - """Resolve CLI selection flags into a list of loaded fixtures.""" selectors = sum([bool(names), all_, match is not None]) if selectors == 0: avail = _list_fixtures(fixtures_dir) @@ -56,13 +64,156 @@ def select_fixtures( return out +def _git_short_sha() -> str: + try: + return subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], text=True + ).strip() + except Exception: + return "unknown" + + +def _git_branch() -> str: + 
try: + return subprocess.check_output( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], text=True + ).strip() + except Exception: + return "unknown" + + +def _flatten_config(d: dict, prefix: str = "config") -> dict: + out = {} + for k, v in d.items(): + key = f"{prefix}.{k}" + if isinstance(v, dict): + out.update(_flatten_config(v, key)) + else: + out[key] = v + return out + + +def _materialize_timsseek_config(config_dict: dict, target_path: Path) -> None: + """Write the [config] subtree to a JSON file (timsseek accepts both + TOML and JSON — JSON is simpler to write here).""" + target_path.parent.mkdir(parents=True, exist_ok=True) + target_path.write_text(json.dumps(config_dict)) + + +def run_one( + fx: Fixture, + out_root: Path = DEFAULT_OUT_ROOT, + notes: str | None = None, + dry_run: bool = False, +) -> None: + """Execute one fixture: timsseek + wandb logging + optional entrapment.""" + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + run_dir = out_root / "logs" / f"{fx.name}-{ts}" + res_dir = run_dir / "res" + + # The raw stem is what timsseek uses for its results subdir. 
+ raw_stem = Path(fx.inputs.raw).name + if raw_stem.endswith(".d") or raw_stem.endswith(".idx"): + raw_stem = Path(raw_stem).stem + elif raw_stem.endswith(".d.tar"): + raw_stem = raw_stem[: -len(".d.tar")] + + plan_msg = f"fixture={fx.name} run_dir={run_dir} raw_stem={raw_stem}" + logger.info("plan: {}", plan_msg) + if dry_run: + return + + run_dir.mkdir(parents=True, exist_ok=True) + res_dir.mkdir(parents=True, exist_ok=True) + config_path = run_dir / "config.json" + _materialize_timsseek_config(fx.config, config_path) + + sha = _git_short_sha() + branch = _git_branch() + tags = [fx.name, branch] + if fx.has_entrapment(): + tags.append("entrapment") + + wandb_config = { + "fixture": fx.name, + "git.sha": sha, + "git.branch": branch, + "host": os.uname().nodename, + **{ + f"inputs.{k}": v + for k, v in fx.inputs.model_dump().items() + if v is not None + }, + **_flatten_config(fx.config), + } + run = wandb.init( + entity=ENTITY, + project=PROJECT, + name=f"{fx.name}-{sha}", + tags=tags, + notes=notes, + config=wandb_config, + ) + try: + cmd = [ + "cargo", "run", "--release", "--bin", "timsseek", "--", + "--overwrite", + "--config", str(config_path), + "--speclib-file", fx.inputs.speclib, + "--output-dir", str(run_dir), + "--dotd-files", fx.inputs.raw, + ] + if fx.has_calibration_speclib(): + assert fx.inputs.calibration_speclib is not None + cmd.extend(["--calib-lib", fx.inputs.calibration_speclib]) + stdout_log = run_dir / "timsseek_stdout.log" + stderr_log = run_dir / "timsseek_stderr.log" + logger.info("$ {}", " ".join(cmd)) + with stdout_log.open("w") as so, stderr_log.open("w") as se: + subprocess.run(cmd, stdout=so, stderr=se, check=True) + + perf = res_dir / raw_stem / "performance_report.json" + if perf.exists(): + run.log(json.loads(perf.read_text())) + else: + logger.warning("performance_report.json missing at {}", perf) + + if fx.has_entrapment(): + assert fx.inputs.entrapment_fasta is not None + with tempfile.TemporaryDirectory() as td: + target_local = 
Path(td) / "target.fasta" + entrap_local = Path(td) / "entrap.fasta" + s3_download_file(fx.inputs.fasta, str(target_local)) + s3_download_file(fx.inputs.entrapment_fasta, str(entrap_local)) + results_parquet = res_dir / raw_stem / "results.parquet" + out_parquet = ( + out_root / "parquets" / f"{fx.name}-{ts}-classified.parquet" + ) + out_plot = out_root / "plots" / f"{fx.name}-fdr_curve-{ts}.png" + scalars = analyse( + results_parquet=results_parquet, + target_fasta=target_local, + entrapment_fasta=entrap_local, + out_parquet=out_parquet, + out_plot=out_plot, + title=f"{fx.name} entrapment FDR", + ) + run.log(scalars) + run.log({"entrap/fdr_curve": wandb.Image(str(out_plot))}) + artifact = wandb.Artifact(f"{fx.name}-classified", type="dataset") + artifact.add_file(str(out_parquet)) + run.log_artifact(artifact) + finally: + run.finish() + + def parse_args(argv: list[str] | None = None) -> argparse.Namespace: p = argparse.ArgumentParser(description=__doc__) p.add_argument("fixtures", nargs="*", help="Fixture names to run") p.add_argument("--all", dest="all_", action="store_true") p.add_argument("--match", help="Glob pattern over fixture names") p.add_argument("--notes", help="Free-form note added to wandb run") - p.add_argument("--dry-run", action="store_true", help="Print plan and stop") + p.add_argument("--dry-run", action="store_true") return p.parse_args(argv) @@ -70,8 +221,8 @@ def main(argv: list[str] | None = None) -> int: args = parse_args(argv) fixtures = select_fixtures(args.fixtures, args.all_, args.match) for fx in fixtures: - logger.info("would run fixture: {}", fx.name) - raise NotImplementedError("timsseek invocation lands in Task 12") + run_one(fx, notes=args.notes, dry_run=args.dry_run) + return 0 if __name__ == "__main__": From 1451eeb20efdc4fc420a95e265a3d37b897a9957 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 21:03:22 -0700 Subject: [PATCH 17/41] fix(bench): pass res_dir as timsseek --output-dir (matches its native layout) --- bench/tests/test_wandb_bench.py | 6 ++++-- bench/wandb_bench.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bench/tests/test_wandb_bench.py b/bench/tests/test_wandb_bench.py index 8878d6fa..fdc526a3 100644 --- a/bench/tests/test_wandb_bench.py +++ b/bench/tests/test_wandb_bench.py @@ -84,7 +84,9 @@ def test_select_no_args_lists_available(tmp_path, capsys): def _write_perf_report(out_dir: Path, raw_stem: str, payload: dict) -> None: - sub = out_dir / "res" / raw_stem + """`out_dir` is timsseek's `--output-dir` (i.e., the runner's `res_dir`). + Timsseek writes its outputs under `//`.""" + sub = out_dir / raw_stem sub.mkdir(parents=True, exist_ok=True) (sub / "performance_report.json").write_text(json.dumps(payload)) @@ -164,7 +166,7 @@ def fake_subprocess(cmd, *a, **kw): # Also drop a results.parquet so analyse() can read it import polars as pl pl.DataFrame({"sequence": ["MK"], "qvalue": [0.001]}).write_parquet( - out / "res" / raw_stem / "results.parquet" + out / raw_stem / "results.parquet" ) return MagicMock(returncode=0) diff --git a/bench/wandb_bench.py b/bench/wandb_bench.py index 85097404..01d77b93 100644 --- a/bench/wandb_bench.py +++ b/bench/wandb_bench.py @@ -160,7 +160,7 @@ def run_one( "--overwrite", "--config", str(config_path), "--speclib-file", fx.inputs.speclib, - "--output-dir", str(run_dir), + "--output-dir", str(res_dir), "--dotd-files", fx.inputs.raw, ] if fx.has_calibration_speclib(): From 89a3c0eb4bf33a62e901100ed169b54e566bfec3 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 21:04:15 -0700 Subject: [PATCH 18/41] docs(bench): add default config and README for fixture harness --- bench/README.md | 31 +++++++++++++++++++++++++++++++ bench/configs/default.toml | 7 +++++++ 2 files changed, 38 insertions(+) create mode 100644 bench/README.md create mode 100644 bench/configs/default.toml diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 00000000..9a890fa3 --- /dev/null +++ b/bench/README.md @@ -0,0 +1,31 @@ +# bench + +Fixture-driven bench harness for `timsseek`. Each fixture is a TOML in `bench/fixtures/` pointing at S3 URIs. + +## Run a fixture + + uv run --group bench python -m bench.wandb_bench hela_iccoff_gt20peps + uv run --group bench python -m bench.wandb_bench --all + uv run --group bench python -m bench.wandb_bench --match 'hela*' + +Outputs land under `bench_out/` (gitignored): `logs/-/`, `parquets/--classified.parquet`, `plots/-fdr_curve-.png`. Wandb runs go to `jspaezp/timsseek`. + +Fixtures with `entrapment_fasta` set automatically run the entrapment classification + FDR-curve step. + +## Push a new fixture + +Requires `aws` CLI (auth via env / profile). + + uv run --group bench python -m bench.push_fixture \ + --name hela_iccoff_gt20peps \ + --bucket timsbukto-bench --prefix fixtures \ + --db ~/fasta/hela_gt20peps.fasta \ + --raw ~/data/decompressed_timstof/250225_Desnaux_200ng_Hela_ICC_off_DIA.d \ + --config bench/configs/default.toml \ + --koina-url http://localhost:8501/v2/models # omit for public Koina + +`--db` (and `--entrap-db`, `--calib-db`) are repeatable and accept any of: local `*.fasta(.gz)` path, local `*.txt` accession list, `s3://...` URI, `UPxxxxxxxxx` proteome ID, bare uniprot accession. After upload, hand-edit the generated `bench/fixtures/.toml` to add a description, then `git add bench/fixtures/.toml`. + +## Schema + +See `bench/_fixture_schema.py` for the canonical TOML schema. 
Inputs must be `s3://` URIs (project rule: bench data lives in S3 only). diff --git a/bench/configs/default.toml b/bench/configs/default.toml new file mode 100644 index 00000000..2570446a --- /dev/null +++ b/bench/configs/default.toml @@ -0,0 +1,7 @@ +[analysis] +chunk_size = 20000 + +[analysis.tolerance] +ms = { ppm = [15.0, 15.0] } +mobility = { percent = [10.0, 10.0] } +quad = { absolute = [0.1, 0.1] } From 602860e808a4a2ec8b2a5331be25f4b290ce5665 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 21:07:51 -0700 Subject: [PATCH 19/41] style(bench): apply task fmt ruff reformatted 6 bench/ files (magic trailing comma, line-length normalisation). --- bench/entrapment.py | 11 +++--- bench/push_fixture.py | 29 +++++++++++----- bench/tests/test_entrapment.py | 48 +++++++++++--------------- bench/tests/test_push_fixture.py | 58 +++++++++++++++++++------------- bench/tests/test_wandb_bench.py | 11 ++++++ bench/wandb_bench.py | 23 ++++++++----- 6 files changed, 106 insertions(+), 74 deletions(-) diff --git a/bench/entrapment.py b/bench/entrapment.py index cea6f7e7..d6fe9092 100644 --- a/bench/entrapment.py +++ b/bench/entrapment.py @@ -129,14 +129,13 @@ def compute_fdr_curve(classified: pl.DataFrame) -> pl.DataFrame: raise ValueError("classified dataframe missing 'qvalue' column") keep = classified.filter( - pl.col("class").is_in( - [PeptideClass.TARGET.value, PeptideClass.ENTRAPMENT.value] - ) + pl.col("class").is_in([ + PeptideClass.TARGET.value, + PeptideClass.ENTRAPMENT.value, + ]) ).sort("qvalue") - n_target = ( - (keep["class"] == PeptideClass.TARGET.value).cast(pl.UInt32).cum_sum() - ) + n_target = (keep["class"] == PeptideClass.TARGET.value).cast(pl.UInt32).cum_sum() n_entrap = ( (keep["class"] == PeptideClass.ENTRAPMENT.value).cast(pl.UInt32).cum_sum() ) diff --git a/bench/push_fixture.py b/bench/push_fixture.py index 26338f55..fb75f942 100644 --- a/bench/push_fixture.py +++ b/bench/push_fixture.py @@ -27,7 +27,11 @@ def parse_args(argv: 
list[str] | None = None) -> argparse.Namespace: p.add_argument("--bucket", required=True, help="S3 bucket") p.add_argument("--prefix", required=True, help="S3 prefix under the bucket") p.add_argument( - "--db", action="append", default=[], required=True, metavar="SPEC", + "--db", + action="append", + default=[], + required=True, + metavar="SPEC", help="Target FASTA source (repeatable)", ) p.add_argument("--raw", required=True, help="Raw .d / .idx (local dir or s3://...)") @@ -37,7 +41,8 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: p.add_argument("--entrap-db", action="append", default=[], metavar="SPEC") p.add_argument("--calib-db", action="append", default=[], metavar="SPEC") p.add_argument( - "--speclib", dest="speclib_uri", + "--speclib", + dest="speclib_uri", help="Skip main speclib build, reference this URI", ) p.add_argument( @@ -57,12 +62,20 @@ def run_speclib_build( koina_url: str | None, ) -> None: cmd = [ - "cargo", "run", "--release", - "-p", "speclib_build_cli", "--", - "--fasta", fasta_s3, - "--fixed-mod", "C[U:4]", - "--max-ions", "10", - "-o", speclib_s3, + "cargo", + "run", + "--release", + "-p", + "speclib_build_cli", + "--", + "--fasta", + fasta_s3, + "--fixed-mod", + "C[U:4]", + "--max-ions", + "10", + "-o", + speclib_s3, ] if koina_url: cmd.extend(["--koina-url", koina_url]) diff --git a/bench/tests/test_entrapment.py b/bench/tests/test_entrapment.py index b0ffe409..b57ae72c 100644 --- a/bench/tests/test_entrapment.py +++ b/bench/tests/test_entrapment.py @@ -32,9 +32,7 @@ def test_classify_peptides(tmp_path): entrap = tmp_path / "e.fasta" entrap.write_text(">E1\nQQQQENTRAPEPTKZZZZ\n>E2\nMMMMSHAREDYYYY\n") - df = pl.DataFrame( - {"sequence": ["PEPTIDEK", "ENTRAPEPTK", "SHARED", "GHOSTAA"]} - ) + df = pl.DataFrame({"sequence": ["PEPTIDEK", "ENTRAPEPTK", "SHARED", "GHOSTAA"]}) classified = classify_peptides(df, target, entrap) classes = dict(zip(classified["sequence"], classified["class"])) @@ -68,12 +66,10 @@ def 
test_classify_peptides_strips_mods_before_match(tmp_path): def test_compute_fdr_curve_basic(): - classified = pl.DataFrame( - { - "qvalue": [0.001, 0.005, 0.01, 0.02, 0.05], - "class": ["target", "target", "entrapment", "target", "entrapment"], - } - ) + classified = pl.DataFrame({ + "qvalue": [0.001, 0.005, 0.01, 0.02, 0.05], + "class": ["target", "target", "entrapment", "target", "entrapment"], + }) curve = compute_fdr_curve(classified) # Sorted ascending by qvalue assert curve["qvalue"].to_list() == [0.001, 0.005, 0.01, 0.02, 0.05] @@ -87,12 +83,10 @@ def test_compute_fdr_curve_basic(): def test_compute_fdr_curve_excludes_shared_and_unknown(): - classified = pl.DataFrame( - { - "qvalue": [0.01, 0.01, 0.01, 0.01], - "class": ["target", "shared_dropped", "unknown", "entrapment"], - } - ) + classified = pl.DataFrame({ + "qvalue": [0.01, 0.01, 0.01, 0.01], + "class": ["target", "shared_dropped", "unknown", "entrapment"], + }) curve = compute_fdr_curve(classified) # Only one target + one entrapment row contribute assert curve.height == 2 @@ -100,14 +94,12 @@ def test_compute_fdr_curve_excludes_shared_and_unknown(): def test_plot_fdr_curve_writes_png(tmp_path): - curve = pl.DataFrame( - { - "qvalue": [0.001, 0.01, 0.05], - "n_target": [10, 50, 100], - "n_entrap": [0, 1, 5], - "empirical_fdr": [0.0, 1 / 51, 5 / 105], - } - ) + curve = pl.DataFrame({ + "qvalue": [0.001, 0.01, 0.05], + "n_target": [10, 50, 100], + "n_entrap": [0, 1, 5], + "empirical_fdr": [0.0, 1 / 51, 5 / 105], + }) out = tmp_path / "fdr.png" plot_fdr_curve(curve, out, title="test") assert out.exists() @@ -120,12 +112,10 @@ def test_analyse_end_to_end(tmp_path): entrap = tmp_path / "e.fasta" entrap.write_text(">E1\nQQQQENTRAPEPTKZZZZ\n") - results = pl.DataFrame( - { - "sequence": ["PEPTIDEK", "ENTRAPEPTK", "PEPTIDEK", "ENTRAPEPTK"], - "qvalue": [0.001, 0.02, 0.005, 0.04], - } - ) + results = pl.DataFrame({ + "sequence": ["PEPTIDEK", "ENTRAPEPTK", "PEPTIDEK", "ENTRAPEPTK"], + "qvalue": [0.001, 0.02, 
0.005, 0.04], + }) results_path = tmp_path / "results.parquet" results.write_parquet(results_path) diff --git a/bench/tests/test_push_fixture.py b/bench/tests/test_push_fixture.py index f556474c..46bbb230 100644 --- a/bench/tests/test_push_fixture.py +++ b/bench/tests/test_push_fixture.py @@ -8,16 +8,20 @@ def test_parse_args_minimal(): - args = parse_args( - [ - "--name", "hela", - "--bucket", "bk", - "--prefix", "fx", - "--db", "UP000005640", - "--raw", "/tmp/sample.d", - "--config", "/tmp/cfg.toml", - ] - ) + args = parse_args([ + "--name", + "hela", + "--bucket", + "bk", + "--prefix", + "fx", + "--db", + "UP000005640", + "--raw", + "/tmp/sample.d", + "--config", + "/tmp/cfg.toml", + ]) assert args.name == "hela" assert args.bucket == "bk" assert args.prefix == "fx" @@ -29,19 +33,25 @@ def test_parse_args_minimal(): def test_parse_args_multiple_db_and_entrap(): - args = parse_args( - [ - "--name", "hy", - "--bucket", "bk", - "--prefix", "fx", - "--db", "UP000005640", - "--db", "/tmp/extra.fasta", - "--entrap-db", "UP000002311", - "--raw", "/tmp/sample.d", - "--config", "/tmp/cfg.toml", - "--dry-run", - ] - ) + args = parse_args([ + "--name", + "hy", + "--bucket", + "bk", + "--prefix", + "fx", + "--db", + "UP000005640", + "--db", + "/tmp/extra.fasta", + "--entrap-db", + "UP000002311", + "--raw", + "/tmp/sample.d", + "--config", + "/tmp/cfg.toml", + "--dry-run", + ]) assert args.db == ["UP000005640", "/tmp/extra.fasta"] assert args.entrap_db == ["UP000002311"] assert args.dry_run is True @@ -71,6 +81,7 @@ def test_build_fixture_toml(tmp_path: Path): target_path = tmp_path / "fx.toml" target_path.write_text(out) from bench._fixture_schema import load_fixture + fx = load_fixture(target_path) assert fx.name == "hela" assert fx.inputs.entrapment_fasta is None @@ -93,6 +104,7 @@ def test_build_fixture_toml_with_entrap_and_calib(tmp_path: Path): p = tmp_path / "fx.toml" p.write_text(out) from bench._fixture_schema import load_fixture + fx = load_fixture(p) assert 
fx.has_entrapment() assert fx.has_calibration_speclib() diff --git a/bench/tests/test_wandb_bench.py b/bench/tests/test_wandb_bench.py index fdc526a3..dac08b6f 100644 --- a/bench/tests/test_wandb_bench.py +++ b/bench/tests/test_wandb_bench.py @@ -33,6 +33,7 @@ def test_select_positional(tmp_path): _write_fx(tmp_path, "hela") _write_fx(tmp_path, "yeast") from bench.wandb_bench import select_fixtures + out = select_fixtures(["hela"], all_=False, match=None, fixtures_dir=tmp_path) assert [f.name for f in out] == ["hela"] @@ -41,6 +42,7 @@ def test_select_all(tmp_path): _write_fx(tmp_path, "a") _write_fx(tmp_path, "b") from bench.wandb_bench import select_fixtures + out = select_fixtures([], all_=True, match=None, fixtures_dir=tmp_path) assert sorted(f.name for f in out) == ["a", "b"] @@ -50,6 +52,7 @@ def test_select_match_glob(tmp_path): _write_fx(tmp_path, "hela_b") _write_fx(tmp_path, "yeast_c") from bench.wandb_bench import select_fixtures + out = select_fixtures([], all_=False, match="hela*", fixtures_dir=tmp_path) assert sorted(f.name for f in out) == ["hela_a", "hela_b"] @@ -57,6 +60,7 @@ def test_select_match_glob(tmp_path): def test_select_unknown_name_errors(tmp_path): _write_fx(tmp_path, "hela") from bench.wandb_bench import select_fixtures + with pytest.raises(SystemExit) as exc: select_fixtures(["nope"], all_=False, match=None, fixtures_dir=tmp_path) assert "nope" in str(exc.value) @@ -65,6 +69,7 @@ def test_select_unknown_name_errors(tmp_path): def test_select_combinations_error(tmp_path): _write_fx(tmp_path, "hela") from bench.wandb_bench import select_fixtures + with pytest.raises(SystemExit): select_fixtures(["hela"], all_=True, match=None, fixtures_dir=tmp_path) with pytest.raises(SystemExit): @@ -77,6 +82,7 @@ def test_select_no_args_lists_available(tmp_path, capsys): _write_fx(tmp_path, "hela") _write_fx(tmp_path, "yeast") from bench.wandb_bench import select_fixtures + with pytest.raises(SystemExit): select_fixtures([], all_=False, match=None, 
fixtures_dir=tmp_path) err = capsys.readouterr().err @@ -118,6 +124,7 @@ def fake_subprocess(cmd, *a, **kw): return MagicMock(returncode=0) from bench.wandb_bench import run_one + with patch("bench.wandb_bench.subprocess.run", side_effect=fake_subprocess): run_one( load_fixture(fixture), @@ -165,12 +172,14 @@ def fake_subprocess(cmd, *a, **kw): _write_perf_report(out, raw_stem, {"runtime_s": 1.0}) # Also drop a results.parquet so analyse() can read it import polars as pl + pl.DataFrame({"sequence": ["MK"], "qvalue": [0.001]}).write_parquet( out / raw_stem / "results.parquet" ) return MagicMock(returncode=0) from bench.wandb_bench import run_one + with ( patch("bench.wandb_bench.subprocess.run", side_effect=fake_subprocess), patch("bench.wandb_bench.analyse") as analyse_mock, @@ -181,6 +190,7 @@ def fake_subprocess(cmd, *a, **kw): def _dl(uri, dst): Path(dst).write_text(">x\nMK\n") + s3_dl.side_effect = _dl run_one(load_fixture(p), out_root=out_root, notes=None, dry_run=False) @@ -197,6 +207,7 @@ def test_run_one_dry_run_no_subprocess(tmp_path, fake_wandb): fixture = _write_fx(fx_dir, "hela") from bench.wandb_bench import run_one + with patch("bench.wandb_bench.subprocess.run") as sp: run_one( load_fixture(fixture), diff --git a/bench/wandb_bench.py b/bench/wandb_bench.py index 01d77b93..4d3f3fe9 100644 --- a/bench/wandb_bench.py +++ b/bench/wandb_bench.py @@ -140,9 +140,7 @@ def run_one( "git.branch": branch, "host": os.uname().nodename, **{ - f"inputs.{k}": v - for k, v in fx.inputs.model_dump().items() - if v is not None + f"inputs.{k}": v for k, v in fx.inputs.model_dump().items() if v is not None }, **_flatten_config(fx.config), } @@ -156,12 +154,21 @@ def run_one( ) try: cmd = [ - "cargo", "run", "--release", "--bin", "timsseek", "--", + "cargo", + "run", + "--release", + "--bin", + "timsseek", + "--", "--overwrite", - "--config", str(config_path), - "--speclib-file", fx.inputs.speclib, - "--output-dir", str(res_dir), - "--dotd-files", fx.inputs.raw, + 
"--config", + str(config_path), + "--speclib-file", + fx.inputs.speclib, + "--output-dir", + str(res_dir), + "--dotd-files", + fx.inputs.raw, ] if fx.has_calibration_speclib(): assert fx.inputs.calibration_speclib is not None From 7ba9e25f44fd766d97aaf731e60ec8ca0d902f01 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 21:10:57 -0700 Subject: [PATCH 20/41] fix(bench): ty-clean field_name access; use real S3 prefix in README --- bench/README.md | 2 +- bench/_fixture_schema.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bench/README.md b/bench/README.md index 9a890fa3..c82b1b24 100644 --- a/bench/README.md +++ b/bench/README.md @@ -18,7 +18,7 @@ Requires `aws` CLI (auth via env / profile). uv run --group bench python -m bench.push_fixture \ --name hela_iccoff_gt20peps \ - --bucket timsbukto-bench --prefix fixtures \ + --bucket terraform-workstations-bucket --prefix jspaezp/timsseek_fixtures \ --db ~/fasta/hela_gt20peps.fasta \ --raw ~/data/decompressed_timstof/250225_Desnaux_200ng_Hela_ICC_off_DIA.d \ --config bench/configs/default.toml \ diff --git a/bench/_fixture_schema.py b/bench/_fixture_schema.py index 5038caae..5cd598fb 100644 --- a/bench/_fixture_schema.py +++ b/bench/_fixture_schema.py @@ -27,14 +27,14 @@ class FixtureInputs(BaseModel): @field_validator("fasta", "speclib", "raw") @classmethod def _required_s3(cls, v: str, info: ValidationInfo) -> str: - return _require_s3(v, info.field_name) + return _require_s3(v, info.field_name or "") @field_validator("entrapment_fasta", "calibration_speclib") @classmethod def _optional_s3(cls, v: str | None, info: ValidationInfo) -> str | None: if v is None: return v - return _require_s3(v, info.field_name) + return _require_s3(v, info.field_name or "") class Fixture(BaseModel): From 6cf54386f037d99fd39e5723cd658361fb71f8eb Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 21:25:00 -0700 Subject: [PATCH 21/41] feat(bench): fixture inputs accept absolute local paths (for staged fixtures) --- Cargo.toml | 5 +-- bench/_fixture_schema.py | 27 +++++++++++----- bench/tests/test_fixture_schema.py | 49 ++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 13 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d8ca2ce4..f9333792 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,8 +81,9 @@ tempfile = "3.23.0" # rustyms into target/patch/ via timsseek_cli's build.rs. On a clean checkout # run `cargo patch-crate` once to populate target/patch/ before building, or # trigger a first build of timsseek_cli to let its build.rs handle it. -[patch.crates-io] -rustyms = { path = "./target/patch/rustyms-0.11.0" } +## DISABLING FOR NOW +## [patch.crates-io] +## rustyms = { path = "./target/patch/rustyms-0.11.0" } [workspace.lints.clippy] len_without_is_empty = "allow" diff --git a/bench/_fixture_schema.py b/bench/_fixture_schema.py index 5cd598fb..082926fd 100644 --- a/bench/_fixture_schema.py +++ b/bench/_fixture_schema.py @@ -9,10 +9,21 @@ from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator -def _require_s3(value: str, field: str) -> str: - if not value.startswith("s3://"): - raise ValueError(f"{field} must be an s3:// URI, got {value!r}") - return value +def _require_uri(value: str, field: str) -> str: + """Accept s3:// URIs or absolute local paths. Expands `~` to home. + + Relative paths and other URI schemes (file://, http://, etc.) are + rejected. Existence is NOT checked here — the runner will surface a + clearer error at use time. 
+ """ + if value.startswith("s3://"): + return value + expanded = str(Path(value).expanduser()) + if Path(expanded).is_absolute(): + return expanded + raise ValueError( + f"{field} must be an s3:// URI or an absolute local path, got {value!r}" + ) class FixtureInputs(BaseModel): @@ -26,15 +37,15 @@ class FixtureInputs(BaseModel): @field_validator("fasta", "speclib", "raw") @classmethod - def _required_s3(cls, v: str, info: ValidationInfo) -> str: - return _require_s3(v, info.field_name or "") + def _required_uri(cls, v: str, info: ValidationInfo) -> str: + return _require_uri(v, info.field_name or "") @field_validator("entrapment_fasta", "calibration_speclib") @classmethod - def _optional_s3(cls, v: str | None, info: ValidationInfo) -> str | None: + def _optional_uri(cls, v: str | None, info: ValidationInfo) -> str | None: if v is None: return v - return _require_s3(v, info.field_name or "") + return _require_uri(v, info.field_name or "") class Fixture(BaseModel): diff --git a/bench/tests/test_fixture_schema.py b/bench/tests/test_fixture_schema.py index 3d3c0545..5f4fe6d9 100644 --- a/bench/tests/test_fixture_schema.py +++ b/bench/tests/test_fixture_schema.py @@ -60,7 +60,7 @@ def test_entrapment_and_calib_optional_present(tmp_path): assert f.has_calibration_speclib() -def test_local_path_in_inputs_rejected(tmp_path): +def test_relative_path_in_inputs_rejected(tmp_path): p = _write( tmp_path, """ @@ -68,7 +68,7 @@ def test_local_path_in_inputs_rejected(tmp_path): description = "test" [inputs] - fasta = "/home/me/p.fasta" + fasta = "relative/path.fasta" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" @@ -76,10 +76,53 @@ def test_local_path_in_inputs_rejected(tmp_path): chunk_size = 20000 """, ) - with pytest.raises(ValueError, match="must be an s3:// URI"): + with pytest.raises(ValueError, match="absolute local path"): load_fixture(p) +def test_absolute_local_path_in_inputs_accepted(tmp_path): + p = _write( + tmp_path, + """ + name = "ok" + description = 
"test" + + [inputs] + fasta = "/abs/path/proteome.fasta" + speclib = "/abs/path/lib.msgpack.zst" + raw = "/abs/path/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + f = load_fixture(p) + assert f.inputs.fasta == "/abs/path/proteome.fasta" + + +def test_tilde_path_expanded(tmp_path): + import os + home = os.path.expanduser("~") + p = _write( + tmp_path, + f""" + name = "ok" + description = "test" + + [inputs] + fasta = "~/proteome.fasta" + speclib = "{home}/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + f = load_fixture(p) + assert f.inputs.fasta == f"{home}/proteome.fasta" + assert f.inputs.speclib == f"{home}/lib.msgpack.zst" + + def test_missing_required_input_rejected(tmp_path): p = _write( tmp_path, From d3b4539312f062d491d87b8025237da3ec824954 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 21:26:08 -0700 Subject: [PATCH 22/41] feat(bench): --fixtures-dir flag on wandb_bench (for staged fixtures) --- bench/tests/test_wandb_bench.py | 16 ++++++++++++++++ bench/wandb_bench.py | 8 +++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/bench/tests/test_wandb_bench.py b/bench/tests/test_wandb_bench.py index dac08b6f..263a411f 100644 --- a/bench/tests/test_wandb_bench.py +++ b/bench/tests/test_wandb_bench.py @@ -217,3 +217,19 @@ def test_run_one_dry_run_no_subprocess(tmp_path, fake_wandb): ) sp.assert_not_called() fake_wandb["wandb"].init.assert_not_called() + + +def test_parse_args_fixtures_dir_flag(): + from bench.wandb_bench import parse_args + args = parse_args([ + "--fixtures-dir", "bench_out/staged", + "myfix", + ]) + assert args.fixtures_dir == Path("bench_out/staged") + assert args.fixtures == ["myfix"] + + +def test_parse_args_fixtures_dir_default(): + from bench.wandb_bench import DEFAULT_FIXTURES_DIR, parse_args + args = parse_args(["myfix"]) + assert args.fixtures_dir == DEFAULT_FIXTURES_DIR diff --git a/bench/wandb_bench.py 
b/bench/wandb_bench.py index 4d3f3fe9..48e35ddf 100644 --- a/bench/wandb_bench.py +++ b/bench/wandb_bench.py @@ -221,12 +221,18 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: p.add_argument("--match", help="Glob pattern over fixture names") p.add_argument("--notes", help="Free-form note added to wandb run") p.add_argument("--dry-run", action="store_true") + p.add_argument( + "--fixtures-dir", + type=Path, + default=DEFAULT_FIXTURES_DIR, + help="Directory containing fixture TOMLs (default: bench/fixtures/)", + ) return p.parse_args(argv) def main(argv: list[str] | None = None) -> int: args = parse_args(argv) - fixtures = select_fixtures(args.fixtures, args.all_, args.match) + fixtures = select_fixtures(args.fixtures, args.all_, args.match, args.fixtures_dir) for fx in fixtures: run_one(fx, notes=args.notes, dry_run=args.dry_run) return 0 From 85cf7debed976884b187351228e1b4d6ceac86aa Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 21:26:35 -0700 Subject: [PATCH 23/41] chore: disable patching --- Cargo.lock | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 68033636..cbebf754 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5321,6 +5321,8 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyms" version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "011d3d672ae44d5e07db0488d855f2b5ed178e3d6bb7ef5b18c6415c20bbd61e" dependencies = [ "bincode", "context_error", From 3eac4ffacac6cc151393c79eab46cff3b1d259f3 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 21:29:55 -0700 Subject: [PATCH 24/41] feat(bench): idempotent push_fixture uploads (skip existing s3 objects, --force to override) --- bench/_s3.py | 40 +++++++++++++++++--- bench/push_fixture.py | 24 +++++++++--- bench/tests/test_push_fixture.py | 64 +++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 11 deletions(-) diff --git a/bench/_s3.py b/bench/_s3.py index 7fdfeeee..33944b9b 100644 --- a/bench/_s3.py +++ b/bench/_s3.py @@ -1,5 +1,4 @@ -"""Tiny shellout wrapper around `aws s3 cp`. One module so all subprocess -invocations of the AWS CLI live in one place (easier to mock in tests).""" +"""Tiny shellout wrapper around `aws s3 cp` / `aws s3 ls` / `aws s3 sync`.""" from __future__ import annotations @@ -17,21 +16,52 @@ def _aws_cp(src: str, dst: str, recursive: bool = False) -> None: subprocess.run(cmd, check=True) +def _aws_sync(src: str, dst: str) -> None: + cmd = ["aws", "s3", "sync", src, dst] + logger.info("$ {}", " ".join(cmd)) + subprocess.run(cmd, check=True) + + +def s3_object_exists(s3_uri: str) -> bool: + """Return True if the S3 object at `s3_uri` already exists.""" + if not s3_uri.startswith("s3://"): + raise ValueError(f"not an s3:// URI: {s3_uri}") + cmd = ["aws", "s3", "ls", s3_uri] + logger.info("$ {}", " ".join(cmd)) + result = subprocess.run(cmd, capture_output=True, text=True) + # `aws s3 ls` exits 0 with empty output when the object isn't there + # (some configurations) or non-zero in others. Accept either signal: + # object exists iff exit code 0 AND output non-empty. 
+ return result.returncode == 0 and bool(result.stdout.strip()) + + def s3_download_file(s3_uri: str, local_path: str) -> None: if not s3_uri.startswith("s3://"): raise ValueError(f"not an s3:// URI: {s3_uri}") _aws_cp(s3_uri, local_path) -def s3_upload_file(local_path: str, s3_uri: str) -> None: +def s3_upload_file(local_path: str, s3_uri: str, skip_if_exists: bool = False) -> None: if not s3_uri.startswith("s3://"): raise ValueError(f"not an s3:// URI: {s3_uri}") + if skip_if_exists and s3_object_exists(s3_uri): + logger.info("s3 upload: skipping {} (already exists)", s3_uri) + return _aws_cp(local_path, s3_uri) -def s3_upload_dir(local_dir: str, s3_uri: str) -> None: +def s3_upload_dir(local_dir: str, s3_uri: str, idempotent: bool = False) -> None: + """Upload a directory tree to S3. + + `idempotent=True` uses `aws s3 sync` which only transfers files whose + size or modified-time differ. `idempotent=False` (default) does a fresh + recursive copy, overwriting whatever is at the destination. 
+ """ if not s3_uri.startswith("s3://"): raise ValueError(f"not an s3:// URI: {s3_uri}") if not Path(local_dir).is_dir(): raise ValueError(f"not a directory: {local_dir}") - _aws_cp(local_dir, s3_uri, recursive=True) + if idempotent: + _aws_sync(local_dir, s3_uri) + else: + _aws_cp(local_dir, s3_uri, recursive=True) diff --git a/bench/push_fixture.py b/bench/push_fixture.py index fb75f942..94dfdb0d 100644 --- a/bench/push_fixture.py +++ b/bench/push_fixture.py @@ -53,6 +53,11 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: p.add_argument("--koina-url") p.add_argument("--dry-run", action="store_true") p.add_argument("--overwrite", action="store_true") + p.add_argument( + "--force", + action="store_true", + help="Re-upload S3 objects even if they already exist", + ) return p.parse_args(argv) @@ -131,10 +136,11 @@ def _resolve_and_upload_fasta( s3_dest: str, label: str, workdir: Path, + skip_if_exists: bool = False, ) -> None: local = workdir / f"{label}.fasta" resolve_dbs(specs, local) - s3_upload_file(str(local), s3_dest) + s3_upload_file(str(local), s3_dest, skip_if_exists=skip_if_exists) def run_pipeline( @@ -153,6 +159,7 @@ def run_pipeline( fixture_target: Path, overwrite: bool, dry_run: bool, + force: bool = False, ) -> None: """Execute the full upload + build + write-toml flow.""" dest_prefix = f"s3://{bucket}/{prefix.rstrip('/')}/{name}" @@ -198,21 +205,27 @@ def run_pipeline( workdir = Path(td) # 1. Resolve and upload target FASTA - _resolve_and_upload_fasta(db, target_fasta_uri, "proteome", workdir) + _resolve_and_upload_fasta( + db, target_fasta_uri, "proteome", workdir, skip_if_exists=not force + ) # 2. Optional entrapment FASTA if entrap_db: assert entrap_fasta_uri is not None - _resolve_and_upload_fasta(entrap_db, entrap_fasta_uri, "entrap", workdir) + _resolve_and_upload_fasta( + entrap_db, entrap_fasta_uri, "entrap", workdir, skip_if_exists=not force + ) # 3. 
Optional calibration FASTA if calib_db: assert calib_fasta_uri is not None - _resolve_and_upload_fasta(calib_db, calib_fasta_uri, "calib", workdir) + _resolve_and_upload_fasta( + calib_db, calib_fasta_uri, "calib", workdir, skip_if_exists=not force + ) # 4. Upload raw dir if local if not raw.startswith("s3://"): - s3_upload_dir(raw, raw_uri) + s3_upload_dir(raw, raw_uri, idempotent=not force) # 5. Build speclib(s) if not user-provided if speclib_uri is None: @@ -257,6 +270,7 @@ def main(argv: list[str] | None = None) -> int: fixture_target=fixture_target, overwrite=args.overwrite, dry_run=args.dry_run, + force=args.force, ) return 0 diff --git a/bench/tests/test_push_fixture.py b/bench/tests/test_push_fixture.py index 46bbb230..105ad01d 100644 --- a/bench/tests/test_push_fixture.py +++ b/bench/tests/test_push_fixture.py @@ -170,12 +170,15 @@ def test_run_pipeline_minimal(tmp_path, fake_runtime): # Resolved + uploaded the target fasta assert fake_runtime["res"].call_count == 1 - fake_runtime["up_file"].assert_any_call(ANY, "s3://bk/fx/hela/proteome.fasta") + fake_runtime["up_file"].assert_any_call( + ANY, "s3://bk/fx/hela/proteome.fasta", skip_if_exists=True + ) # Uploaded the raw directory fake_runtime["up_dir"].assert_called_once() args = fake_runtime["up_dir"].call_args.args assert args[0] == str(raw) assert args[1] == "s3://bk/fx/hela/sample.d" + assert fake_runtime["up_dir"].call_args.kwargs.get("idempotent") is True # Built the speclib fake_runtime["build"].assert_called_once() # Wrote fixture TOML @@ -293,3 +296,62 @@ def test_run_pipeline_dry_run(tmp_path, fake_runtime): fake_runtime["up_dir"].assert_not_called() fake_runtime["build"].assert_not_called() assert not target.exists() + + +def test_run_pipeline_default_skips_existing_uploads(tmp_path, fake_runtime): + """By default (no --force), uploads are idempotent: existing S3 objects skipped.""" + cfg, raw = _common_args(tmp_path) + from bench.push_fixture import run_pipeline + + output_toml = 
fake_runtime["fx_dir"] / "hela.toml" + run_pipeline( + name="hela", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=[], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + # force NOT passed — defaults to False + ) + # s3_upload_file got called with skip_if_exists=True + fasta_call = fake_runtime["up_file"].call_args + assert fasta_call.kwargs.get("skip_if_exists") is True + # s3_upload_dir got called with idempotent=True + raw_call = fake_runtime["up_dir"].call_args + assert raw_call.kwargs.get("idempotent") is True + + +def test_run_pipeline_force_overrides_skip(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + from bench.push_fixture import run_pipeline + + output_toml = fake_runtime["fx_dir"] / "hela.toml" + run_pipeline( + name="hela", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=[], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + force=True, + ) + fasta_call = fake_runtime["up_file"].call_args + assert fasta_call.kwargs.get("skip_if_exists") is False + raw_call = fake_runtime["up_dir"].call_args + assert raw_call.kwargs.get("idempotent") is False From af12aad573c3b59a80e356207a9e79b1e7fd699b Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 21:33:29 -0700 Subject: [PATCH 25/41] feat(bench): stage_fixture CLI for local cache + offline reruns --- bench/_s3.py | 8 + bench/stage_fixture.py | 231 +++++++++++++++++++++++++++ bench/tests/test_stage_fixture.py | 257 ++++++++++++++++++++++++++++++ 3 files changed, 496 insertions(+) create mode 100644 bench/stage_fixture.py create mode 100644 bench/tests/test_stage_fixture.py diff --git a/bench/_s3.py b/bench/_s3.py index 33944b9b..12d93324 100644 --- a/bench/_s3.py +++ b/bench/_s3.py @@ -65,3 +65,11 @@ def s3_upload_dir(local_dir: str, s3_uri: str, idempotent: bool = False) -> None _aws_sync(local_dir, s3_uri) else: _aws_cp(local_dir, s3_uri, recursive=True) + + +def s3_sync_dir(s3_uri: str, local_dir: str) -> None: + """Download (sync) a directory tree from S3 to a local path. Idempotent.""" + if not s3_uri.startswith("s3://"): + raise ValueError(f"not an s3:// URI: {s3_uri}") + Path(local_dir).mkdir(parents=True, exist_ok=True) + _aws_sync(s3_uri, local_dir) diff --git a/bench/stage_fixture.py b/bench/stage_fixture.py new file mode 100644 index 00000000..a4eb22cf --- /dev/null +++ b/bench/stage_fixture.py @@ -0,0 +1,231 @@ +"""Stage a fixture for offline / repeated runs. + +Reads `bench/fixtures/.toml`, downloads its S3 inputs to a local cache, +and writes a new fixture TOML pointing at those local paths. The new TOML +can be run via `python -m bench.wandb_bench --fixtures-dir `. 
+""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +from loguru import logger + +from bench._fixture_schema import Fixture, load_fixture +from bench._s3 import s3_download_file, s3_sync_dir +from bench.push_fixture import build_fixture_toml + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("name", help="Fixture name (matches a TOML in --fixtures-dir)") + p.add_argument( + "--fixtures-dir", + type=Path, + default=Path("bench/fixtures"), + help="Directory containing the source fixture (default: bench/fixtures/)", + ) + p.add_argument( + "--cache-dir", + type=Path, + default=Path(os.environ.get("BENCH_CACHE_DIR", "bench_out/cache")), + help="Local cache root (env: BENCH_CACHE_DIR; default: bench_out/cache/)", + ) + p.add_argument( + "--out", + type=Path, + help="Output TOML path (default: bench_out/staged/.toml)", + ) + p.add_argument( + "--overwrite", action="store_true", help="Replace existing output TOML" + ) + p.add_argument( + "--force", action="store_true", help="Re-download files even if cached" + ) + args = p.parse_args(argv) + if args.out is None: + args.out = Path("bench_out/staged") / f"{args.name}.toml" + return args + + +def _stage_one_file(uri: str, dst: Path, force: bool) -> str: + """Resolve `uri` to a local path; return path string for the staged TOML. + + - If `uri` is already an absolute local path, return it unchanged (no copy). + - If `uri` is `s3://...`, download to `dst` (skip if exists, unless `force`). + """ + if not uri.startswith("s3://"): + return uri # already local; reference as-is + if dst.exists() and not force: + logger.info("stage: cached {} (skip)", dst) + return str(dst) + s3_download_file(uri, str(dst)) + return str(dst) + + +def _stage_one_dir(uri: str, dst: Path, force: bool) -> str: # noqa: ARG001 + """Sync `uri` (s3 prefix) into `dst`. 
Returns the path string.""" + if not uri.startswith("s3://"): + return uri + # `aws s3 sync` is itself idempotent; --force just forces a re-sync + # which has the same observable result, so we always call it. + s3_sync_dir(uri, str(dst)) + return str(dst) + + +def stage( + *, + name: str, + fixtures_dir: Path, + cache_dir: Path, + out: Path, + overwrite: bool, + force: bool, +) -> None: + """Stage one fixture for offline use.""" + src_toml = fixtures_dir / f"{name}.toml" + fx: Fixture = load_fixture(src_toml) + + if out.exists() and not overwrite: + raise FileExistsError(f"staged TOML already exists: {out} (pass --overwrite)") + + cache_root = cache_dir / name + cache_root.mkdir(parents=True, exist_ok=True) + + fasta_local = _stage_one_file( + fx.inputs.fasta, cache_root / "proteome.fasta", force + ) + speclib_local = _stage_one_file( + fx.inputs.speclib, cache_root / "lib.msgpack.zst", force + ) + raw_local = _stage_one_dir(fx.inputs.raw, cache_root / "sample.d", force) + + entrap_local: str | None = None + if fx.inputs.entrapment_fasta is not None: + entrap_local = _stage_one_file( + fx.inputs.entrapment_fasta, cache_root / "entrap.fasta", force + ) + + calib_local: str | None = None + if fx.inputs.calibration_speclib is not None: + calib_local = _stage_one_file( + fx.inputs.calibration_speclib, cache_root / "calib.msgpack.zst", force + ) + + body = _build_staged_toml( + name=name, + description=fx.description, + config=fx.config, + fasta_uri=fasta_local, + speclib_uri=speclib_local, + raw_uri=raw_local, + entrapment_fasta_uri=entrap_local, + calibration_speclib_uri=calib_local, + ) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(body) + logger.info("staged fixture written: {}", out) + logger.info( + "run with: python -m bench.wandb_bench --fixtures-dir {} {}", + out.parent, name, + ) + + +def _build_staged_toml( + *, + name: str, + description: str, + config: dict, + fasta_uri: str, + speclib_uri: str, + raw_uri: str, + entrapment_fasta_uri: str 
| None, + calibration_speclib_uri: str | None, +) -> str: + """Emit a staged-fixture TOML body. Mirrors push_fixture.build_fixture_toml's + layout but takes the [config] table as a dict (already loaded) instead of a + file path.""" + lines: list[str] = [] + lines.append(f'name = "{name}"') + desc = description.replace('"', '\\"') + lines.append(f'description = "{desc}"') + lines.append("") + lines.append("[inputs]") + lines.append(f'fasta = "{fasta_uri}"') + lines.append(f'speclib = "{speclib_uri}"') + lines.append(f'raw = "{raw_uri}"') + if entrapment_fasta_uri is not None: + lines.append(f'entrapment_fasta = "{entrapment_fasta_uri}"') + if calibration_speclib_uri is not None: + lines.append(f'calibration_speclib = "{calibration_speclib_uri}"') + lines.append("") + lines.append("# === embedded timsseek config ===") + lines.extend(_emit_config(config)) + lines.append("") + return "\n".join(lines) + + +def _emit_config(config: dict, prefix: str = "config") -> list[str]: + """Render a nested dict as TOML lines under `[]` (and sub-tables).""" + lines: list[str] = [] + scalars: dict = {} + sub_tables: dict = {} + for k, v in config.items(): + if isinstance(v, dict): + sub_tables[k] = v + else: + scalars[k] = v + if scalars: + lines.append(f"[{prefix}]") + for k, v in scalars.items(): + lines.append(f"{k} = {_toml_value(v)}") + lines.append("") + for k, v in sub_tables.items(): + lines.extend(_emit_config(v, f"{prefix}.{k}")) + return lines + + +def _toml_value(v: object) -> str: + """Minimal TOML serializer for scalars + simple lists + inline tables.""" + if isinstance(v, bool): + return "true" if v else "false" + if isinstance(v, (int, float)): + return str(v) + if isinstance(v, str): + escaped = v.replace('"', '\\"') + return f'"{escaped}"' + if isinstance(v, list): + return "[" + ", ".join(_toml_value(x) for x in v) + "]" + if isinstance(v, dict): + items = ", ".join(f"{k} = {_toml_value(val)}" for k, val in v.items()) + return "{" + items + "}" + raise 
ValueError(f"unsupported TOML value: {v!r}") + + +# Re-export so the name is reachable from tests that import from this module; +# suppresses any "imported but unused" lint on the import above. +__all__ = [ + "parse_args", + "stage", + "build_fixture_toml", +] + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + stage( + name=args.name, + fixtures_dir=args.fixtures_dir, + cache_dir=args.cache_dir, + out=args.out, + overwrite=args.overwrite, + force=args.force, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bench/tests/test_stage_fixture.py b/bench/tests/test_stage_fixture.py new file mode 100644 index 00000000..4b76fdc8 --- /dev/null +++ b/bench/tests/test_stage_fixture.py @@ -0,0 +1,257 @@ +import textwrap +from pathlib import Path +from unittest.mock import patch + +import pytest + + +def _write_fixture( + dir: Path, name: str, *, with_entrap: bool = False, with_calib: bool = False +) -> Path: + extras = "" + if with_entrap: + extras += '\nentrapment_fasta = "s3://b/entrap.fasta"' + if with_calib: + extras += '\ncalibration_speclib = "s3://b/calib.msgpack.zst"' + p = dir / f"{name}.toml" + p.write_text( + textwrap.dedent( + f""" + name = "{name}" + description = "x" + + [inputs] + fasta = "s3://b/p.fasta" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d"{extras} + + [config.analysis] + chunk_size = 20000 + """ + ).strip() + ) + return p + + +@pytest.fixture +def fake_s3(tmp_path): + """Patch S3 ops; downloads create stub files at the requested local paths.""" + def fake_download_file(uri, dst): + Path(dst).parent.mkdir(parents=True, exist_ok=True) + Path(dst).write_text(f"# stub of {uri}\n") + + def fake_sync(uri, dst): + Path(dst).mkdir(parents=True, exist_ok=True) + (Path(dst) / "metadata").write_text(f"# stub of {uri}\n") + + with ( + patch( + "bench.stage_fixture.s3_download_file", side_effect=fake_download_file + ) as dl, + patch("bench.stage_fixture.s3_sync_dir", side_effect=fake_sync) 
as sync, + ): + yield {"download": dl, "sync": sync} + + +def test_stage_minimal(tmp_path, fake_s3): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + _write_fixture(fx_dir, "hela") + cache = tmp_path / "cache" + out = tmp_path / "staged" / "hela.toml" + + from bench.stage_fixture import stage + + stage( + name="hela", + fixtures_dir=fx_dir, + cache_dir=cache, + out=out, + overwrite=False, + force=False, + ) + + # Files were "downloaded" + assert (cache / "hela" / "proteome.fasta").exists() + assert (cache / "hela" / "lib.msgpack.zst").exists() + assert (cache / "hela" / "sample.d" / "metadata").exists() + + # download_file called for fasta + speclib (2x); sync called for raw (1x) + assert fake_s3["download"].call_count == 2 + assert fake_s3["sync"].call_count == 1 + + # Output TOML exists and is valid against schema + assert out.exists() + from bench._fixture_schema import load_fixture + fx = load_fixture(out) + assert fx.name == "hela" + assert fx.inputs.fasta == str(cache / "hela" / "proteome.fasta") + assert fx.inputs.speclib == str(cache / "hela" / "lib.msgpack.zst") + assert fx.inputs.raw == str(cache / "hela" / "sample.d") + + +def test_stage_with_entrap_and_calib(tmp_path, fake_s3): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + _write_fixture(fx_dir, "hy", with_entrap=True, with_calib=True) + cache = tmp_path / "cache" + out = tmp_path / "staged" / "hy.toml" + + from bench.stage_fixture import stage + + stage( + name="hy", + fixtures_dir=fx_dir, + cache_dir=cache, + out=out, + overwrite=False, + force=False, + ) + # 4 file downloads (fasta + speclib + entrap_fasta + calib_speclib); 1 sync (raw) + assert fake_s3["download"].call_count == 4 + assert fake_s3["sync"].call_count == 1 + + from bench._fixture_schema import load_fixture + fx = load_fixture(out) + assert fx.has_entrapment() + assert fx.has_calibration_speclib() + assert fx.inputs.entrapment_fasta == str(cache / "hy" / "entrap.fasta") + assert fx.inputs.calibration_speclib == str(cache / "hy" / 
"calib.msgpack.zst") + + +def test_stage_skips_existing_local_files(tmp_path, fake_s3): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + _write_fixture(fx_dir, "hela") + cache = tmp_path / "cache" + # Pre-create the fasta + speclib (raw is always synced — sync is itself idempotent) + (cache / "hela").mkdir(parents=True) + (cache / "hela" / "proteome.fasta").write_text("preexisting") + (cache / "hela" / "lib.msgpack.zst").write_text("preexisting") + out = tmp_path / "staged" / "hela.toml" + + from bench.stage_fixture import stage + + stage( + name="hela", + fixtures_dir=fx_dir, + cache_dir=cache, + out=out, + overwrite=False, + force=False, + ) + # No download calls because local files already exist + assert fake_s3["download"].call_count == 0 + # Sync still runs (raw .d) — sync itself is idempotent + assert fake_s3["sync"].call_count == 1 + # Local fasta content was NOT overwritten + assert (cache / "hela" / "proteome.fasta").read_text() == "preexisting" + + +def test_stage_force_redownloads(tmp_path, fake_s3): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + _write_fixture(fx_dir, "hela") + cache = tmp_path / "cache" + (cache / "hela").mkdir(parents=True) + (cache / "hela" / "proteome.fasta").write_text("preexisting") + out = tmp_path / "staged" / "hela.toml" + + from bench.stage_fixture import stage + + stage( + name="hela", + fixtures_dir=fx_dir, + cache_dir=cache, + out=out, + overwrite=False, + force=True, + ) + # Force re-downloads even if local exists + assert fake_s3["download"].call_count == 2 + # Stub overwrites the preexisting file + assert (cache / "hela" / "proteome.fasta").read_text().startswith("# stub") + + +def test_stage_preserves_local_paths(tmp_path, fake_s3): + """Inputs that are already absolute local paths are referenced as-is, no copy.""" + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + p = fx_dir / "lo.toml" + local_fasta = tmp_path / "abs_p.fasta" + local_fasta.write_text(">x\nMK\n") + p.write_text( + textwrap.dedent( + f""" + name = "lo" + 
description = "x" + + [inputs] + fasta = "{local_fasta}" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 1 + """ + ).strip() + ) + cache = tmp_path / "cache" + out = tmp_path / "staged" / "lo.toml" + + from bench.stage_fixture import stage + + stage( + name="lo", + fixtures_dir=fx_dir, + cache_dir=cache, + out=out, + overwrite=False, + force=False, + ) + # Only the speclib gets downloaded; fasta is referenced as-is + assert fake_s3["download"].call_count == 1 + from bench._fixture_schema import load_fixture + fx = load_fixture(out) + assert fx.inputs.fasta == str(local_fasta) + + +def test_stage_refuses_overwrite(tmp_path, fake_s3): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + _write_fixture(fx_dir, "hela") + cache = tmp_path / "cache" + out = tmp_path / "staged" / "hela.toml" + out.parent.mkdir(parents=True) + out.write_text("# preexisting") + + from bench.stage_fixture import stage + + with pytest.raises(FileExistsError): + stage( + name="hela", + fixtures_dir=fx_dir, + cache_dir=cache, + out=out, + overwrite=False, + force=False, + ) + + +def test_parse_args_defaults(monkeypatch, tmp_path): + monkeypatch.delenv("BENCH_CACHE_DIR", raising=False) + from bench.stage_fixture import parse_args + args = parse_args(["hela"]) + assert args.name == "hela" + assert args.fixtures_dir == Path("bench/fixtures") + assert args.cache_dir == Path("bench_out/cache") + assert args.out == Path("bench_out/staged/hela.toml") + assert args.overwrite is False + assert args.force is False + + +def test_parse_args_env_var_for_cache(monkeypatch): + monkeypatch.setenv("BENCH_CACHE_DIR", "/tmp/my_cache") + from bench.stage_fixture import parse_args + args = parse_args(["hela"]) + assert args.cache_dir == Path("/tmp/my_cache") From 3f9c1543bba94eb20e4e0308f9f4fe81ffd6bef4 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Fri, 8 May 2026 21:33:57 -0700 Subject: [PATCH 26/41] docs(bench): add staging + idempotent uploads sections --- bench/README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bench/README.md b/bench/README.md index c82b1b24..77984eda 100644 --- a/bench/README.md +++ b/bench/README.md @@ -26,6 +26,17 @@ Requires `aws` CLI (auth via env / profile). `--db` (and `--entrap-db`, `--calib-db`) are repeatable and accept any of: local `*.fasta(.gz)` path, local `*.txt` accession list, `s3://...` URI, `UPxxxxxxxxx` proteome ID, bare uniprot accession. After upload, hand-edit the generated `bench/fixtures/<name>.toml` to add a description, then `git add bench/fixtures/<name>.toml`. +Re-running `push_fixture` is idempotent by default: existing S3 objects are skipped (single files via `aws s3 ls` check; `.d` directory via `aws s3 sync`). Pass `--force` to re-upload everything. + +## Stage a fixture for offline / repeated runs + +When iterating on a fixture, pull its inputs to a local cache once, then run against the staged copy: + + uv run --group bench python -m bench.stage_fixture hela_iccoff_gt20peps + uv run --group bench python -m bench.wandb_bench --fixtures-dir bench_out/staged hela_iccoff_gt20peps + +`stage_fixture` defaults: cache root `bench_out/cache/<name>/` (override via `--cache-dir` or `BENCH_CACHE_DIR` env), output TOML `bench_out/staged/<name>.toml` (override via `--out`). Already-cached files are skipped on re-stage; pass `--force` to re-download. Inputs that are already absolute local paths are referenced as-is (no copy). + ## Schema -See `bench/_fixture_schema.py` for the canonical TOML schema. Inputs must be `s3://` URIs (project rule: bench data lives in S3 only). +See `bench/_fixture_schema.py` for the canonical TOML schema. Inputs accept `s3://` URIs or absolute local paths (the latter for staged fixtures only — `push_fixture` always emits `s3://`). 
From 5070f11b9309b43da6bb8ecc62f994df2ca79141 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 21:54:33 -0700 Subject: [PATCH 27/41] feat(bench): k-mer-based search-space normalization for entrapment FDR --- bench/entrapment.py | 102 +++++++++++++++++++++++++------ bench/tests/test_entrapment.py | 108 ++++++++++++++++++++++++++++++++- 2 files changed, 190 insertions(+), 20 deletions(-) diff --git a/bench/entrapment.py b/bench/entrapment.py index d6fe9092..8ecbe8c7 100644 --- a/bench/entrapment.py +++ b/bench/entrapment.py @@ -51,6 +51,41 @@ def parse_fasta(path: str | Path) -> dict[str, str]: return out +def count_kmers(fasta_path: str | Path, k: int = 7) -> set[str]: + """Return the set of unique k-mers across all proteins in a FASTA. + + Window slides over each protein sequence; sequences shorter than `k` + contribute nothing. + """ + proteins = parse_fasta(fasta_path) + out: set[str] = set() + for seq in proteins.values(): + if len(seq) < k: + continue + for i in range(len(seq) - k + 1): + out.add(seq[i : i + k]) + return out + + +def kmer_normalization_factor( + target_fasta: str | Path, + entrapment_fasta: str | Path, + k: int = 7, +) -> float: + """Compute |T_unique_kmers| / |E_unique_kmers| after dropping shared k-mers. + + Used to rescale entrapment hit counts so empirical FDR is comparable + across proteomes of unequal search-space size. Clamped at 1.0 in the + denominator to avoid div-by-zero on tiny / empty entrapment fastas. 
+ """ + t = count_kmers(target_fasta, k=k) + e = count_kmers(entrapment_fasta, k=k) + shared = t & e + t_only = t - shared + e_only = e - shared + return len(t_only) / max(1, len(e_only)) + + class PeptideClass(enum.Enum): TARGET = "target" ENTRAPMENT = "entrapment" @@ -119,32 +154,47 @@ def _classify(s: str) -> str: ) -def compute_fdr_curve(classified: pl.DataFrame) -> pl.DataFrame: +def compute_fdr_curve( + classified: pl.DataFrame, + normalization_factor: float = 1.0, +) -> pl.DataFrame: """Sort by qvalue, accumulate target/entrapment counts, return curve. Rows whose class is SHARED_DROPPED or UNKNOWN are excluded from both numerator and denominator. + + `normalization_factor` rescales entrapment counts to compensate for + differences in target vs entrapment search-space size (e.g., from + `kmer_normalization_factor`). With factor=1.0 (default), `empirical_fdr_norm` + equals `empirical_fdr_raw`. """ if "qvalue" not in classified.columns: raise ValueError("classified dataframe missing 'qvalue' column") keep = classified.filter( - pl.col("class").is_in([ - PeptideClass.TARGET.value, - PeptideClass.ENTRAPMENT.value, - ]) + pl.col("class").is_in( + [PeptideClass.TARGET.value, PeptideClass.ENTRAPMENT.value] + ) ).sort("qvalue") - n_target = (keep["class"] == PeptideClass.TARGET.value).cast(pl.UInt32).cum_sum() + n_target = ( + (keep["class"] == PeptideClass.TARGET.value).cast(pl.UInt32).cum_sum() + ) n_entrap = ( (keep["class"] == PeptideClass.ENTRAPMENT.value).cast(pl.UInt32).cum_sum() ) - empirical = (n_entrap / (n_target + n_entrap)).fill_nan(0.0) + raw_fdr = (n_entrap.cast(pl.Float64) / (n_target + n_entrap)).fill_nan(0.0) + n_entrap_norm = n_entrap.cast(pl.Float64) * normalization_factor + norm_fdr = ( + n_entrap_norm / (n_target.cast(pl.Float64) + n_entrap_norm) + ).fill_nan(0.0) return keep.with_columns( n_target.alias("n_target"), n_entrap.alias("n_entrap"), - empirical.alias("empirical_fdr"), + n_entrap_norm.alias("n_entrap_norm"), + 
raw_fdr.alias("empirical_fdr_raw"), + norm_fdr.alias("empirical_fdr_norm"), ) @@ -153,19 +203,33 @@ def plot_fdr_curve( output_path: str | Path, title: str = "Reported q-value vs empirical entrapment FDR", ) -> None: - """Render a FDR-vs-qvalue plot to a PNG file.""" + """Render a FDR-vs-qvalue plot to a PNG file. Plots the kmer-normalized + curve (primary) and the raw curve (faded) for comparison.""" import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(5, 5), dpi=150) - ax.plot(curve["qvalue"], curve["empirical_fdr"], lw=1.5, label="empirical") + ax.plot( + curve["qvalue"], + curve["empirical_fdr_norm"], + lw=1.5, + label="empirical (kmer-normalized)", + ) + ax.plot( + curve["qvalue"], + curve["empirical_fdr_raw"], + lw=1.0, + ls=":", + alpha=0.6, + label="empirical (raw, unnormalized)", + ) _max_qv = curve["qvalue"].cast(pl.Float64).max() lim: float = max(0.05, _max_qv if isinstance(_max_qv, float) else 0.05) ax.plot([0, lim], [0, lim], color="grey", ls="--", lw=1.0, label="y=x") ax.set_xlabel("reported q-value") - ax.set_ylabel("empirical FDR (n_entrap / (n_target + n_entrap))") + ax.set_ylabel("empirical FDR") ax.set_xlim(0, lim) ax.set_ylim(0, lim) ax.set_title(title) @@ -178,19 +242,21 @@ def plot_fdr_curve( def _scalar_at_q( curve: pl.DataFrame, q_threshold: float, suffix: str ) -> dict[str, float | int]: - """Read off n_target / n_entrap / empirical_fdr at q <= q_threshold.""" + """Read off n_target / n_entrap / empirical FDR (raw + norm) at q <= threshold.""" sub = curve.filter(pl.col("qvalue") <= q_threshold) if sub.height == 0: return { f"entrap/n_target_at_{suffix}": 0, f"entrap/n_entrap_at_{suffix}": 0, - f"entrap/empirical_fdr_at_{suffix}": 0.0, + f"entrap/empirical_fdr_raw_at_{suffix}": 0.0, + f"entrap/empirical_fdr_norm_at_{suffix}": 0.0, } last = sub.row(-1, named=True) return { f"entrap/n_target_at_{suffix}": int(last["n_target"]), f"entrap/n_entrap_at_{suffix}": int(last["n_entrap"]), - 
f"entrap/empirical_fdr_at_{suffix}": float(last["empirical_fdr"]), + f"entrap/empirical_fdr_raw_at_{suffix}": float(last["empirical_fdr_raw"]), + f"entrap/empirical_fdr_norm_at_{suffix}": float(last["empirical_fdr_norm"]), } @@ -201,18 +267,20 @@ def analyse( out_parquet: str | Path, out_plot: str | Path, title: str = "Reported q-value vs empirical entrapment FDR", + kmer_k: int = 7, ) -> dict[str, float | int]: - """End-to-end: classify -> FDR walk -> write parquet + plot -> return scalars.""" + """End-to-end: classify → kmer norm → FDR walk → write parquet+plot → scalars.""" results = pl.read_parquet(results_parquet) classified = classify_peptides(results, target_fasta, entrapment_fasta) Path(out_parquet).parent.mkdir(parents=True, exist_ok=True) classified.write_parquet(out_parquet) - curve = compute_fdr_curve(classified) + factor = kmer_normalization_factor(target_fasta, entrapment_fasta, k=kmer_k) + curve = compute_fdr_curve(classified, normalization_factor=factor) Path(out_plot).parent.mkdir(parents=True, exist_ok=True) plot_fdr_curve(curve, out_plot, title=title) - scalars: dict[str, float | int] = {} + scalars: dict[str, float | int] = {"entrap/normalization_factor": float(factor)} scalars.update(_scalar_at_q(curve, 0.01, "q01")) scalars.update(_scalar_at_q(curve, 0.05, "q05")) return scalars diff --git a/bench/tests/test_entrapment.py b/bench/tests/test_entrapment.py index b57ae72c..daa79532 100644 --- a/bench/tests/test_entrapment.py +++ b/bench/tests/test_entrapment.py @@ -5,6 +5,8 @@ analyse, classify_peptides, compute_fdr_curve, + count_kmers, + kmer_normalization_factor, parse_fasta, plot_fdr_curve, strip_mods, @@ -79,7 +81,8 @@ def test_compute_fdr_curve_basic(): assert curve["n_entrap"].to_list() == [0, 0, 1, 1, 2] # empirical_fdr = n_e / (n_t + n_e) last = curve.row(-1, named=True) - assert last["empirical_fdr"] == 2 / 5 + assert last["empirical_fdr_raw"] == 2 / 5 + assert last["empirical_fdr_norm"] == 2 / 5 # factor defaults to 1.0 def 
test_compute_fdr_curve_excludes_shared_and_unknown(): @@ -98,7 +101,8 @@ def test_plot_fdr_curve_writes_png(tmp_path): "qvalue": [0.001, 0.01, 0.05], "n_target": [10, 50, 100], "n_entrap": [0, 1, 5], - "empirical_fdr": [0.0, 1 / 51, 5 / 105], + "empirical_fdr_raw": [0.0, 1 / 51, 5 / 105], + "empirical_fdr_norm": [0.0, 1 / 51, 5 / 105], }) out = tmp_path / "fdr.png" plot_fdr_curve(curve, out, title="test") @@ -130,7 +134,7 @@ def test_analyse_end_to_end(tmp_path): # Returned scalars assert out["entrap/n_target_at_q01"] == 2 # both PEPTIDEK rows have q <= 0.01 assert out["entrap/n_entrap_at_q01"] == 0 - assert out["entrap/empirical_fdr_at_q01"] == 0.0 + assert out["entrap/empirical_fdr_raw_at_q01"] == 0.0 assert out["entrap/n_target_at_q05"] == 2 assert out["entrap/n_entrap_at_q05"] == 2 @@ -140,3 +144,101 @@ def test_analyse_end_to_end(tmp_path): classified = pl.read_parquet(tmp_path / "classified.parquet") assert "class" in classified.columns and "is_entrapment" in classified.columns + + +def test_count_kmers_basic(tmp_path): + p = tmp_path / "p.fasta" + p.write_text(">A\nABCDEFG\n>B\nABCDEFGH\n") + kmers = count_kmers(p, k=7) + # First protein contributes "ABCDEFG" exactly (length 7 → one kmer). + # Second protein contributes "ABCDEFG" and "BCDEFGH". + # Set dedupes the shared kmer. + assert kmers == {"ABCDEFG", "BCDEFGH"} + + +def test_count_kmers_skips_too_short(tmp_path): + p = tmp_path / "p.fasta" + p.write_text(">A\nABCDEF\n") # length 6, smaller than k=7 + assert count_kmers(p, k=7) == set() + + +def test_kmer_normalization_factor(tmp_path): + target = tmp_path / "t.fasta" + # 4 kmers of length 7: ABCDEFG, BCDEFGH, CDEFGHI, DEFGHIJ + target.write_text(">T\nABCDEFGHIJ\n") + entrap = tmp_path / "e.fasta" + entrap.write_text(">E\nABCDEFG\n") # 1 kmer: ABCDEFG (shared with target) + + # After dropping shared kmers: target has {BCDEFGH, CDEFGHI, DEFGHIJ} (3); + # entrap has {} (0). Factor = target / max(1, entrap) → 3/1 = 3.0. 
+ f = kmer_normalization_factor(target, entrap, k=7) + assert f == 3.0 + + +def test_kmer_normalization_factor_balanced(tmp_path): + target = tmp_path / "t.fasta" + target.write_text(">T\nAAAAAAAA\n") # 2 kmers: AAAAAAA, AAAAAAA → set: {AAAAAAA} + entrap = tmp_path / "e.fasta" + entrap.write_text(">E\nBBBBBBBB\n") # 2 kmers: BBBBBBB, BBBBBBB → set: {BBBBBBB} + # No shared kmers; both have 1 → factor = 1.0 + f = kmer_normalization_factor(target, entrap, k=7) + assert f == 1.0 + + +def test_compute_fdr_curve_with_normalization(): + """Plain raw FDR uses n_e / (n_e + n_t); normalized scales n_e by factor.""" + classified = pl.DataFrame( + { + "qvalue": [0.001, 0.005, 0.01, 0.02, 0.05], + "class": ["target", "target", "entrapment", "target", "entrapment"], + } + ) + curve = compute_fdr_curve(classified, normalization_factor=3.0) + + # Raw counts unchanged + assert curve["n_target"].to_list() == [1, 2, 2, 3, 3] + assert curve["n_entrap"].to_list() == [0, 0, 1, 1, 2] + # Raw fdr unchanged + assert curve["empirical_fdr_raw"].to_list() == [0.0, 0.0, 1 / 3, 1 / 4, 2 / 5] + # Normalized fdr at last row: n_e * 3 / (n_t + n_e * 3) = 6 / (3 + 6) = 2/3 + last = curve.row(-1, named=True) + assert last["empirical_fdr_norm"] == 2 / 3 + assert last["n_entrap_norm"] == 6.0 # 2 * 3.0 + + +def test_compute_fdr_curve_default_factor_is_one(): + """Default factor=1.0 keeps backward-compatible raw == norm behavior.""" + classified = pl.DataFrame( + { + "qvalue": [0.001, 0.01], + "class": ["target", "entrapment"], + } + ) + curve = compute_fdr_curve(classified) + assert curve["empirical_fdr_norm"].to_list() == curve["empirical_fdr_raw"].to_list() + + +def test_analyse_includes_normalization_factor(tmp_path): + """analyse() computes and applies kmer normalization automatically.""" + target = tmp_path / "t.fasta" + # homopolymer of length 50 → exactly 1 unique kmer "AAAAAAA" + target.write_text(">T1\n" + "A" * 50 + "\n") + entrap = tmp_path / "e.fasta" + entrap.write_text(">E1\n" + "B" * 50 + 
"\n") + + results = pl.DataFrame( + {"sequence": ["AAAAAAA", "BBBBBBB"], "qvalue": [0.001, 0.001]} + ) + results_path = tmp_path / "r.parquet" + results.write_parquet(results_path) + out = analyse( + results_parquet=results_path, + target_fasta=target, + entrapment_fasta=entrap, + out_parquet=tmp_path / "c.parquet", + out_plot=tmp_path / "f.png", + ) + # Both proteins are pure homopolymer of length 50 → 1 unique kmer each → factor=1.0 + # Returned scalars include the factor + assert "entrap/normalization_factor" in out + assert out["entrap/normalization_factor"] == 1.0 From c82d3e3d9e5a76917ce3aabeb5fa6d74574c9f1d Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 21:55:27 -0700 Subject: [PATCH 28/41] feat(bench): --request-delay-ms flag for speclib_build_cli throttle --- bench/push_fixture.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/bench/push_fixture.py b/bench/push_fixture.py index 94dfdb0d..975c8e05 100644 --- a/bench/push_fixture.py +++ b/bench/push_fixture.py @@ -51,6 +51,12 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: help="Skip calib speclib build, reference this URI", ) p.add_argument("--koina-url") + p.add_argument( + "--request-delay-ms", + type=int, + default=500, + help="Per-request delay passed to speclib_build_cli (ms; default 500)", + ) p.add_argument("--dry-run", action="store_true") p.add_argument("--overwrite", action="store_true") p.add_argument( @@ -65,6 +71,7 @@ def run_speclib_build( fasta_s3: str, speclib_s3: str, koina_url: str | None, + request_delay_ms: int = 500, ) -> None: cmd = [ "cargo", @@ -84,8 +91,7 @@ def run_speclib_build( ] if koina_url: cmd.extend(["--koina-url", koina_url]) - else: - cmd.extend(["--request-delay-ms", "500"]) + cmd.extend(["--request-delay-ms", str(request_delay_ms)]) logger.info("$ {}", " ".join(cmd)) subprocess.run(cmd, check=True) @@ -160,6 +166,7 @@ def run_pipeline( overwrite: bool, dry_run: bool, force: bool 
= False, + request_delay_ms: int = 500, ) -> None: """Execute the full upload + build + write-toml flow.""" dest_prefix = f"s3://{bucket}/{prefix.rstrip('/')}/{name}" @@ -229,11 +236,18 @@ def run_pipeline( # 5. Build speclib(s) if not user-provided if speclib_uri is None: - run_speclib_build(target_fasta_uri, main_speclib_uri, koina_url) + run_speclib_build( + target_fasta_uri, main_speclib_uri, koina_url, request_delay_ms + ) if calib_db and calibration_speclib_uri is None: assert calib_fasta_uri is not None assert final_calib_speclib_uri is not None - run_speclib_build(calib_fasta_uri, final_calib_speclib_uri, koina_url) + run_speclib_build( + calib_fasta_uri, + final_calib_speclib_uri, + koina_url, + request_delay_ms, + ) # 6. Emit fixture TOML body = build_fixture_toml( @@ -271,6 +285,7 @@ def main(argv: list[str] | None = None) -> int: overwrite=args.overwrite, dry_run=args.dry_run, force=args.force, + request_delay_ms=args.request_delay_ms, ) return 0 From a0c03922445767830d9e588ea7b6e972d0121ce0 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 22:02:26 -0700 Subject: [PATCH 29/41] fix(bench): build speclib from merged target+entrap fasta when entrap_db set --- bench/push_fixture.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/bench/push_fixture.py b/bench/push_fixture.py index 975c8e05..a148df01 100644 --- a/bench/push_fixture.py +++ b/bench/push_fixture.py @@ -175,6 +175,14 @@ def run_pipeline( calib_fasta_uri = f"{dest_prefix}/calib.fasta" if calib_db else None main_speclib_uri = speclib_uri or f"{dest_prefix}/lib.msgpack.zst" + # When entrap_db is present, the speclib must cover both target+entrap so + # the search can score entrapment peptides. We upload a concatenated fasta + # to a separate URI and point speclib_build_cli at it. The per-fasta + # target/entrap files are still uploaded separately so analyse() can + # classify hits by source. 
+ speclib_input_fasta_uri = ( + f"{dest_prefix}/speclib_input.fasta" if entrap_db else None + ) final_calib_speclib_uri: str | None = calibration_speclib_uri if final_calib_speclib_uri is None and calib_db: final_calib_speclib_uri = f"{dest_prefix}/calib_lib.msgpack.zst" @@ -234,10 +242,28 @@ def run_pipeline( if not raw.startswith("s3://"): s3_upload_dir(raw, raw_uri, idempotent=not force) + # 4b. If entrap_db present, build a merged target+entrap fasta locally + # and upload it as the speclib build input. + if speclib_input_fasta_uri is not None: + merged_local = workdir / "speclib_input.fasta" + target_local = workdir / "proteome.fasta" + entrap_local = workdir / "entrap.fasta" + with merged_local.open("wb") as out: + for src in (target_local, entrap_local): + out.write(src.read_bytes()) + if not src.read_bytes().endswith(b"\n"): + out.write(b"\n") + s3_upload_file( + str(merged_local), + speclib_input_fasta_uri, + skip_if_exists=not force, + ) + # 5. Build speclib(s) if not user-provided if speclib_uri is None: + speclib_input_uri = speclib_input_fasta_uri or target_fasta_uri run_speclib_build( - target_fasta_uri, main_speclib_uri, koina_url, request_delay_ms + speclib_input_uri, main_speclib_uri, koina_url, request_delay_ms ) if calib_db and calibration_speclib_uri is None: assert calib_fasta_uri is not None From bc12861b83ef40a64edd3ad80619fe758f2cb96a Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Fri, 8 May 2026 22:10:41 -0700 Subject: [PATCH 30/41] feat(bench): default fetch_proteome to reviewed-only (Swiss-Prot) --- bench/_uniprot.py | 15 ++++++++++++--- bench/tests/test_uniprot.py | 17 ++++++++++++++++- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/bench/_uniprot.py b/bench/_uniprot.py index 20ee4632..547826f3 100644 --- a/bench/_uniprot.py +++ b/bench/_uniprot.py @@ -23,9 +23,18 @@ def _get(params: dict[str, str]) -> str: return r.text -def fetch_proteome(proteome_id: str) -> str: - """Fetch a full uniprot proteome (e.g. 
UP000005640) as FASTA text.""" - return _get({"query": f"proteome:{proteome_id}", "format": "fasta"}) +def fetch_proteome(proteome_id: str, reviewed_only: bool = True) -> str: + """Fetch a uniprot proteome (e.g. UP000005640) as FASTA text. + + Defaults to Swiss-Prot only (`reviewed:true`) to keep search spaces + tractable; full proteome (incl. TrEMBL) is rarely what bench fixtures + actually want and is much slower to build a speclib over. Pass + `reviewed_only=False` for the unfiltered set. + """ + query = f"proteome:{proteome_id}" + if reviewed_only: + query += " AND reviewed:true" + return _get({"query": query, "format": "fasta"}) def fetch_accession_batch(accessions: Iterable[str]) -> str: diff --git a/bench/tests/test_uniprot.py b/bench/tests/test_uniprot.py index e1791ed0..94c1b2e2 100644 --- a/bench/tests/test_uniprot.py +++ b/bench/tests/test_uniprot.py @@ -4,7 +4,7 @@ @responses.activate -def test_fetch_proteome(): +def test_fetch_proteome_default_reviewed(): responses.add( responses.GET, "https://rest.uniprot.org/uniprotkb/stream", @@ -15,6 +15,21 @@ def test_fetch_proteome(): fasta = fetch_proteome("UP000005640") assert fasta.startswith(">sp|P12345") assert "MKLAA" in fasta + qs = responses.calls[0].request.url or "" + assert "reviewed%3Atrue" in qs + + +@responses.activate +def test_fetch_proteome_unreviewed_off(): + responses.add( + responses.GET, + "https://rest.uniprot.org/uniprotkb/stream", + body=">tr|Q11111|X\nMK\n", + status=200, + ) + fetch_proteome("UP000005640", reviewed_only=False) + qs = responses.calls[0].request.url or "" + assert "reviewed%3Atrue" not in qs @responses.activate From 6589605feea15869db78b29b5cf5ea27f14ed2c4 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sat, 9 May 2026 10:01:48 -0700 Subject: [PATCH 31/41] feat(bench): trypsin digest + algorithm-1 shuffled-entrapment helpers --- bench/_digest.py | 63 +++++++++++++++++++++++++++++++++++ bench/_shuffle.py | 60 ++++++++++++++++++++++++++++++++++ bench/tests/test_digest.py | 45 +++++++++++++++++++++++++ bench/tests/test_shuffle.py | 65 +++++++++++++++++++++++++++++++++++++ 4 files changed, 233 insertions(+) create mode 100644 bench/_digest.py create mode 100644 bench/_shuffle.py create mode 100644 bench/tests/test_digest.py create mode 100644 bench/tests/test_shuffle.py diff --git a/bench/_digest.py b/bench/_digest.py new file mode 100644 index 00000000..0eded49c --- /dev/null +++ b/bench/_digest.py @@ -0,0 +1,63 @@ +"""Trypsin digestion + length filter helpers. + +Trypsin rule: cleave C-terminal to K or R, NOT when followed by P. +Missed cleavages are concatenations of N+1 contiguous base segments. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +# Cleavage site: after K or R that is NOT followed by P. +# Use a regex split that emits peptides ending in K|R (or the final tail). +_CLEAVE = re.compile(r"(?<=[KR])(?!P)") + + +def parse_fasta(path: str | Path) -> dict[str, str]: + """Parse a FASTA into {accession: concatenated_sequence} (header line minus `>`).""" + out: dict[str, str] = {} + cur_acc: str | None = None + parts: list[str] = [] + with Path(path).open("r") as f: + for raw in f: + line = raw.rstrip() + if not line: + continue + if line.startswith(">"): + if cur_acc is not None: + out[cur_acc] = "".join(parts) + cur_acc = line[1:].strip() + parts = [] + else: + parts.append(line) + if cur_acc is not None: + out[cur_acc] = "".join(parts) + return out + + +def digest_protein(sequence: str, missed_cleavages: int = 1) -> list[str]: + """Digest one protein into peptides. 
Returns base segments plus all + contiguous merges of up to (missed_cleavages+1) segments.""" + base = [s for s in _CLEAVE.split(sequence) if s] + out: list[str] = [] + n = len(base) + for i in range(n): + for j in range(i + 1, min(i + 2 + missed_cleavages, n + 1)): + out.append("".join(base[i:j])) + return out + + +def digest_proteins( + proteins: dict[str, str], missed_cleavages: int = 1 +) -> set[str]: + """Digest a {accession: sequence} dict; return the deduplicated peptide set.""" + out: set[str] = set() + for seq in proteins.values(): + out.update(digest_protein(seq, missed_cleavages=missed_cleavages)) + return out + + +def length_filter(peptides: set[str], min_len: int = 7, max_len: int = 30) -> set[str]: + """Keep peptides with length in [min_len, max_len].""" + return {p for p in peptides if min_len <= len(p) <= max_len} diff --git a/bench/_shuffle.py b/bench/_shuffle.py new file mode 100644 index 00000000..9689f5f5 --- /dev/null +++ b/bench/_shuffle.py @@ -0,0 +1,60 @@ +"""Shuffled-entrapment peptide generation (Algorithm 1 of Noble et al, FDRBench paper). + +`shuffle_keeping_c_terminal` permutes the interior residues of a peptide and +keeps the C-terminal residue fixed. `generate_shuffled_entrapment` produces +the (target, shuffle) pairs per Algorithm 1, with deduplication and +max-attempt fallback that drops targets unable to produce r distinct shuffles. +""" + +from __future__ import annotations + +import random + + +def shuffle_keeping_c_terminal(peptide: str, seed: int | None = None) -> str: + """Shuffle the interior residues, keep the C-terminal residue fixed. + + For peptides of length <= 2 returns the input unchanged (no interior). 
+ """ + if len(peptide) <= 2: + return peptide + rng = random.Random(seed) + interior = list(peptide[:-1]) + rng.shuffle(interior) + return "".join(interior) + peptide[-1] + + +def generate_shuffled_entrapment( + targets: set[str], + r: int = 1, + seed: int = 42, +) -> list[tuple[str, str]]: + """Algorithm 1: for each target, generate r distinct shuffles. + + Uses `max_attempts = 20 + r` attempts per target. Targets that cannot + produce r unique shuffles (e.g., homopolymers) are dropped — i.e., they + contribute zero pairs to the output. + + Returns a list of (target_peptide, shuffle_peptide) pairs. Pairs are + sorted by target for determinism. RNG is seeded once at the start so + re-runs with the same `seed` produce the same pairs. + """ + rng = random.Random(seed) + max_attempts = 20 + r + out: list[tuple[str, str]] = [] + for p_target in sorted(targets): # determinism via sorted iteration + shuffles: set[str] = set() + for _ in range(max_attempts): + if len(shuffles) >= r: + break + # New shuffle drawn from the seeded shared RNG + interior = list(p_target[:-1]) + rng.shuffle(interior) + cand = "".join(interior) + p_target[-1] + if cand != p_target and cand not in shuffles and cand not in targets: + shuffles.add(cand) + if len(shuffles) >= r: + for s in sorted(shuffles): + out.append((p_target, s)) + # else: drop p_target entirely (no entries appended) + return out diff --git a/bench/tests/test_digest.py b/bench/tests/test_digest.py new file mode 100644 index 00000000..88e964e5 --- /dev/null +++ b/bench/tests/test_digest.py @@ -0,0 +1,45 @@ +from bench._digest import digest_protein, digest_proteins, length_filter, parse_fasta + + +def test_parse_fasta_basic(tmp_path): + p = tmp_path / "p.fasta" + p.write_text(">A\nMKL\nAAR\n>B\nPPP\n") + out = parse_fasta(p) + assert out == {"A": "MKLAAR", "B": "PPP"} + + +def test_digest_protein_no_missed_cleavage(): + """Trypsin: cleave after K/R, never before P. 
missed_cleavages=0.""" + # MK|R|TPK|GCD - cleave after K, R, K (KP rule: K-P would not cleave) + pep = digest_protein("MKRTPKGCD", missed_cleavages=0) + assert pep == ["MK", "R", "TPK", "GCD"] + + +def test_digest_protein_kp_rp_rule(): + """K-P and R-P bonds are NOT cleaved.""" + # AAKPBBR - the K-P bond stays, R at end cleaves + pep = digest_protein("AAKPBBR", missed_cleavages=0) + assert pep == ["AAKPBBR"] + + +def test_digest_protein_one_missed_cleavage(): + pep = digest_protein("MKRTPKGCD", missed_cleavages=1) + # Base segments: ["MK", "R", "TPK", "GCD"] + # Plus contiguous merges of length 2: ["MKR", "RTPK", "TPKGCD"] + assert "MK" in pep and "R" in pep and "TPK" in pep and "GCD" in pep + assert "MKR" in pep and "RTPK" in pep and "TPKGCD" in pep + + +def test_digest_proteins_dedupes(): + # Two proteins sharing a peptide + pep = digest_proteins({"A": "MKR", "B": "MKR"}, missed_cleavages=0) + # set ordering doesn't matter; check membership + assert "MK" in pep + assert "R" in pep + + +def test_length_filter(): + peps = {"A", "AA", "AAAAAAA", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "AAAAAA"} + out = length_filter(peps, min_len=7, max_len=30) + # 7-mer kept; 6-mer dropped; 31-mer dropped; 1-mer/2-mer dropped + assert out == {"AAAAAAA"} diff --git a/bench/tests/test_shuffle.py b/bench/tests/test_shuffle.py new file mode 100644 index 00000000..0c150bf0 --- /dev/null +++ b/bench/tests/test_shuffle.py @@ -0,0 +1,65 @@ +from bench._shuffle import generate_shuffled_entrapment, shuffle_keeping_c_terminal + + +def test_shuffle_keeps_c_terminal(): + """C-terminal residue stays put; interior is rearranged.""" + rng_seed = 42 + p = "PEPTIDEK" + out = shuffle_keeping_c_terminal(p, seed=rng_seed) + assert out[-1] == "K" + # Same residues, possibly reordered + assert sorted(out) == sorted(p) + + +def test_shuffle_short_peptide_returns_input(): + """A 2-AA peptide has no interior to shuffle; returns as-is.""" + out = shuffle_keeping_c_terminal("MK", seed=1) + assert out == "MK" + + 
+def test_generate_shuffled_basic(): + targets = {"PEPTIDEK", "ANOTHERR"} + pairs = generate_shuffled_entrapment(targets, r=1, seed=7) + # One shuffle per target → 2 pairs + assert len(pairs) == 2 + by_target = {t: s for (t, s) in pairs} + assert "PEPTIDEK" in by_target + assert "ANOTHERR" in by_target + # Each shuffle preserves C-term + AA composition + for t, s in pairs: + assert s[-1] == t[-1] + assert sorted(s) == sorted(t) + # Shuffle is unique vs target peptides AND its own previously-generated shuffles + assert s != t # for non-trivially-permutable peptides + assert s not in targets + + +def test_generate_shuffled_drops_target_when_no_unique_shuffle(): + """Target with fixed-point interior is dropped when r shuffles can't be found.""" + # 'AAAAK' interior 'AAAA' only permutes to itself → no unique shuffle exists. + # 'AAK' interior 'AA' has only one distinct permutation. + targets = {"AAAAK", "PEPTIDEK"} + pairs = generate_shuffled_entrapment(targets, r=1, seed=42) + matched_targets = {t for (t, _) in pairs} + assert "AAAAK" not in matched_targets + assert "PEPTIDEK" in matched_targets + + +def test_generate_shuffled_r_greater_than_one(): + targets = {"PEPTIDEK"} + pairs = generate_shuffled_entrapment(targets, r=3, seed=11) + # 3 shuffles for the one target + assert len(pairs) == 3 + shuffles = [s for (_, s) in pairs] + assert len(set(shuffles)) == 3 # all distinct + for s in shuffles: + assert s[-1] == "K" + assert sorted(s) == sorted("PEPTIDEK") + assert s != "PEPTIDEK" + + +def test_generate_shuffled_deterministic_with_seed(): + targets = {"PEPTIDEK", "ANOTHERR", "GREATSEQK"} + a = generate_shuffled_entrapment(targets, r=2, seed=99) + b = generate_shuffled_entrapment(targets, r=2, seed=99) + assert a == b From 5b4f1c5f03afe8708505f09af28e90e3b8e6edb1 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sat, 9 May 2026 10:04:08 -0700 Subject: [PATCH 32/41] feat(bench)!: migrate fixture schema to peptide-list inputs + entrapment metadata Replace fasta/entrapment_fasta with target_peptides/entrapment_peptides plus entrapment_ratio, entrapment_mode (foreign|shuffled), and pairing fields. Adds cross-field consistency validation and has_pairing() helper on Fixture. --- bench/_fixture_schema.py | 52 ++++++-- bench/tests/test_fixture_schema.py | 194 +++++++++++++++++++++++------ 2 files changed, 204 insertions(+), 42 deletions(-) diff --git a/bench/_fixture_schema.py b/bench/_fixture_schema.py index 082926fd..613a5f35 100644 --- a/bench/_fixture_schema.py +++ b/bench/_fixture_schema.py @@ -4,9 +4,16 @@ import tomllib from pathlib import Path -from typing import Any +from typing import Any, Literal -from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + ValidationInfo, + field_validator, + model_validator, +) def _require_uri(value: str, field: str) -> str: @@ -29,24 +36,52 @@ def _require_uri(value: str, field: str) -> str: class FixtureInputs(BaseModel): model_config = ConfigDict(extra="forbid") - fasta: str + target_peptides: str speclib: str raw: str - entrapment_fasta: str | None = None + entrapment_peptides: str | None = None + entrapment_ratio: float | None = None + entrapment_mode: Literal["foreign", "shuffled"] | None = None + pairing: str | None = None calibration_speclib: str | None = None - @field_validator("fasta", "speclib", "raw") + @field_validator("target_peptides", "speclib", "raw") @classmethod def _required_uri(cls, v: str, info: ValidationInfo) -> str: return _require_uri(v, info.field_name or "") - @field_validator("entrapment_fasta", "calibration_speclib") + @field_validator( + "entrapment_peptides", "calibration_speclib", "pairing" + ) @classmethod def _optional_uri(cls, v: str | None, info: ValidationInfo) -> str | None: if v is None: 
return v return _require_uri(v, info.field_name or "") + @model_validator(mode="after") + def _entrap_consistency(self) -> "FixtureInputs": + has_pep = self.entrapment_peptides is not None + has_ratio = self.entrapment_ratio is not None + has_mode = self.entrapment_mode is not None + if has_pep and not (has_ratio and has_mode): + raise ValueError( + "entrapment_peptides requires both entrapment_ratio and entrapment_mode" + ) + if (has_ratio or has_mode) and not has_pep: + raise ValueError( + "entrapment_ratio / entrapment_mode set without entrapment_peptides" + ) + if self.pairing is not None and self.entrapment_mode != "shuffled": + raise ValueError( + "pairing field only valid when entrapment_mode == 'shuffled'" + ) + if self.entrapment_ratio is not None and self.entrapment_ratio < 1.0: + raise ValueError( + f"entrapment_ratio must be >= 1.0, got {self.entrapment_ratio}" + ) + return self + class Fixture(BaseModel): model_config = ConfigDict(extra="forbid") @@ -57,7 +92,10 @@ class Fixture(BaseModel): config: dict[str, Any] = Field(default_factory=dict) def has_entrapment(self) -> bool: - return self.inputs.entrapment_fasta is not None + return self.inputs.entrapment_peptides is not None + + def has_pairing(self) -> bool: + return self.inputs.pairing is not None def has_calibration_speclib(self) -> bool: return self.inputs.calibration_speclib is not None diff --git a/bench/tests/test_fixture_schema.py b/bench/tests/test_fixture_schema.py index 5f4fe6d9..2ad377ed 100644 --- a/bench/tests/test_fixture_schema.py +++ b/bench/tests/test_fixture_schema.py @@ -19,7 +19,7 @@ def test_minimal_fixture_loads(tmp_path): description = "test" [inputs] - fasta = "s3://b/p.fasta" + target_peptides = "s3://b/peps.txt" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" @@ -29,26 +29,31 @@ def test_minimal_fixture_loads(tmp_path): ) f = load_fixture(p) assert f.name == "hela" - assert f.inputs.fasta == "s3://b/p.fasta" - assert f.inputs.entrapment_fasta is None + assert 
f.inputs.target_peptides == "s3://b/peps.txt" + assert f.inputs.entrapment_peptides is None + assert f.inputs.entrapment_ratio is None + assert f.inputs.entrapment_mode is None + assert f.inputs.pairing is None assert f.inputs.calibration_speclib is None assert not f.has_entrapment() - assert f.config["analysis"]["chunk_size"] == 20000 + assert not f.has_pairing() -def test_entrapment_and_calib_optional_present(tmp_path): +def test_full_entrapment_fixture(tmp_path): p = _write( tmp_path, """ - name = "hela_entrap" + name = "h_y" description = "test" [inputs] - fasta = "s3://b/p.fasta" + target_peptides = "s3://b/t.txt" + entrapment_peptides = "s3://b/e.txt" + entrapment_ratio = 1.0 + entrapment_mode = "shuffled" + pairing = "s3://b/pairs.tsv" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" - entrapment_fasta = "s3://b/entrap.fasta" - calibration_speclib = "s3://b/calib.msgpack.zst" [config.analysis] chunk_size = 20000 @@ -56,11 +61,37 @@ def test_entrapment_and_calib_optional_present(tmp_path): ) f = load_fixture(p) assert f.has_entrapment() - assert f.inputs.calibration_speclib == "s3://b/calib.msgpack.zst" - assert f.has_calibration_speclib() + assert f.has_pairing() + assert f.inputs.entrapment_ratio == 1.0 + assert f.inputs.entrapment_mode == "shuffled" -def test_relative_path_in_inputs_rejected(tmp_path): +def test_foreign_mode_no_pairing(tmp_path): + p = _write( + tmp_path, + """ + name = "h_y" + description = "test" + + [inputs] + target_peptides = "s3://b/t.txt" + entrapment_peptides = "s3://b/e.txt" + entrapment_ratio = 1.0 + entrapment_mode = "foreign" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + f = load_fixture(p) + assert f.has_entrapment() + assert not f.has_pairing() + assert f.inputs.entrapment_mode == "foreign" + + +def test_entrapment_peptides_without_ratio_rejected(tmp_path): p = _write( tmp_path, """ @@ -68,7 +99,9 @@ def 
test_relative_path_in_inputs_rejected(tmp_path): description = "test" [inputs] - fasta = "relative/path.fasta" + target_peptides = "s3://b/t.txt" + entrapment_peptides = "s3://b/e.txt" + entrapment_mode = "foreign" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" @@ -76,66 +109,157 @@ def test_relative_path_in_inputs_rejected(tmp_path): chunk_size = 20000 """, ) - with pytest.raises(ValueError, match="absolute local path"): + with pytest.raises(ValueError, match="entrapment_ratio"): load_fixture(p) -def test_absolute_local_path_in_inputs_accepted(tmp_path): +def test_entrapment_peptides_without_mode_rejected(tmp_path): p = _write( tmp_path, """ - name = "ok" + name = "bad" description = "test" [inputs] - fasta = "/abs/path/proteome.fasta" - speclib = "/abs/path/lib.msgpack.zst" - raw = "/abs/path/sample.d" + target_peptides = "s3://b/t.txt" + entrapment_peptides = "s3://b/e.txt" + entrapment_ratio = 1.0 + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" [config.analysis] chunk_size = 20000 """, ) - f = load_fixture(p) - assert f.inputs.fasta == "/abs/path/proteome.fasta" + with pytest.raises(ValueError, match="entrapment_mode"): + load_fixture(p) -def test_tilde_path_expanded(tmp_path): - import os - home = os.path.expanduser("~") +def test_orphan_ratio_rejected(tmp_path): p = _write( tmp_path, - f""" - name = "ok" + """ + name = "bad" description = "test" [inputs] - fasta = "~/proteome.fasta" - speclib = "{home}/lib.msgpack.zst" + target_peptides = "s3://b/t.txt" + entrapment_ratio = 1.0 + speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" [config.analysis] chunk_size = 20000 """, ) - f = load_fixture(p) - assert f.inputs.fasta == f"{home}/proteome.fasta" - assert f.inputs.speclib == f"{home}/lib.msgpack.zst" + with pytest.raises(ValueError, match="without entrapment_peptides"): + load_fixture(p) + + +def test_pairing_only_for_shuffled_mode(tmp_path): + p = _write( + tmp_path, + """ + name = "bad" + description = "test" + + [inputs] + 
target_peptides = "s3://b/t.txt" + entrapment_peptides = "s3://b/e.txt" + entrapment_ratio = 1.0 + entrapment_mode = "foreign" + pairing = "s3://b/pairs.tsv" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + with pytest.raises(ValueError, match="pairing"): + load_fixture(p) + + +def test_ratio_below_one_rejected(tmp_path): + p = _write( + tmp_path, + """ + name = "bad" + description = "test" + + [inputs] + target_peptides = "s3://b/t.txt" + entrapment_peptides = "s3://b/e.txt" + entrapment_ratio = 0.5 + entrapment_mode = "foreign" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + with pytest.raises(ValueError, match="entrapment_ratio must be >= 1.0"): + load_fixture(p) -def test_missing_required_input_rejected(tmp_path): +def test_relative_path_in_inputs_rejected(tmp_path): p = _write( tmp_path, """ - name = "missing" + name = "bad" description = "test" [inputs] - fasta = "s3://b/p.fasta" + target_peptides = "relative/path.txt" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" [config.analysis] chunk_size = 20000 """, ) - with pytest.raises(ValueError, match="speclib"): + with pytest.raises(ValueError, match="absolute local path"): load_fixture(p) + + +def test_absolute_local_path_accepted(tmp_path): + p = _write( + tmp_path, + """ + name = "ok" + description = "test" + + [inputs] + target_peptides = "/abs/path/peps.txt" + speclib = "/abs/path/lib.msgpack.zst" + raw = "/abs/path/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + f = load_fixture(p) + assert f.inputs.target_peptides == "/abs/path/peps.txt" + + +def test_tilde_path_expanded(tmp_path): + import os + home = os.path.expanduser("~") + p = _write( + tmp_path, + f""" + name = "ok" + description = "test" + + [inputs] + target_peptides = "~/peps.txt" + speclib = "{home}/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + 
chunk_size = 20000 + """, + ) + f = load_fixture(p) + assert f.inputs.target_peptides == f"{home}/peps.txt" From 84e4b7e3fc70c8cd66b1cdee09365fb47d253655 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Sat, 9 May 2026 10:10:27 -0700 Subject: [PATCH 33/41] feat(bench)!: rewrite push_fixture for peptide-list pipeline + SHUFFLED entrapment --- bench/_db_resolver.py | 11 + bench/push_fixture.py | 340 +++++++++++++----- bench/tests/test_db_resolver.py | 11 + bench/tests/test_push_fixture.py | 571 ++++++++++++++++++++++++------- 4 files changed, 736 insertions(+), 197 deletions(-) diff --git a/bench/_db_resolver.py b/bench/_db_resolver.py index 1e01e19e..f1134235 100644 --- a/bench/_db_resolver.py +++ b/bench/_db_resolver.py @@ -33,6 +33,7 @@ class DbSpecKind(enum.Enum): S3_FASTA = "s3_fasta" UNIPROT_PROTEOME = "uniprot_proteome" UNIPROT_ACCESSION = "uniprot_accession" + SHUFFLED = "shuffled" @dataclass(frozen=True) @@ -43,6 +44,9 @@ class DbSpec: def classify_db_spec(spec: str) -> DbSpec: """Classify a single --db value. Raises ValueError for unrecognised input.""" + if spec == "SHUFFLED": + return DbSpec(DbSpecKind.SHUFFLED, spec) + if spec.startswith("s3://"): return DbSpec(DbSpecKind.S3_FASTA, spec) @@ -107,6 +111,13 @@ def resolve_dbs(specs: list[str], output_path: Path) -> None: classified = [classify_db_spec(raw) for raw in specs] + for spec in classified: + if spec.kind is DbSpecKind.SHUFFLED: + raise ValueError( + "SHUFFLED is not resolvable to a fasta" + " — push_fixture handles it directly" + ) + # Coalesce all bare UNIPROT_ACCESSION specs into one batch. bare_accessions: list[str] = [ s.value for s in classified if s.kind is DbSpecKind.UNIPROT_ACCESSION diff --git a/bench/push_fixture.py b/bench/push_fixture.py index a148df01..2dcd2aa4 100644 --- a/bench/push_fixture.py +++ b/bench/push_fixture.py @@ -1,22 +1,26 @@ """Build a fixture and push its inputs to S3. 
-Resolves polymorphic --db specs into concatenated FASTAs, uploads them and -the raw .d directory, builds the speclib via speclib_build_cli, and writes +Resolves polymorphic --db specs into peptide lists, uploads them and the raw +.d directory, builds the speclib via speclib_build_cli --peptides, and writes the fixture TOML to bench/fixtures/.toml. """ from __future__ import annotations import argparse +import random import subprocess import sys import tempfile +import warnings from pathlib import Path from loguru import logger from bench._db_resolver import resolve_dbs +from bench._digest import digest_proteins, length_filter, parse_fasta from bench._s3 import s3_upload_dir, s3_upload_file +from bench._shuffle import generate_shuffled_entrapment def parse_args(argv: list[str] | None = None) -> argparse.Namespace: @@ -32,13 +36,19 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: default=[], required=True, metavar="SPEC", - help="Target FASTA source (repeatable)", + help="Target proteome source (repeatable)", ) p.add_argument("--raw", required=True, help="Raw .d / .idx (local dir or s3://...)") p.add_argument( "--config", required=True, help="Local timsseek config TOML to embed" ) - p.add_argument("--entrap-db", action="append", default=[], metavar="SPEC") + p.add_argument( + "--entrap-db", + action="append", + default=[], + metavar="SPEC", + help="Foreign entrapment db specs, OR exactly 'SHUFFLED' for Algorithm 1", + ) p.add_argument("--calib-db", action="append", default=[], metavar="SPEC") p.add_argument( "--speclib", @@ -57,6 +67,36 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: default=500, help="Per-request delay passed to speclib_build_cli (ms; default 500)", ) + p.add_argument( + "--entrap-ratio", + type=float, + default=1.0, + help="Entrapment ratio r >= 1.0 (default 1.0)", + ) + p.add_argument( + "--peptide-min-len", + type=int, + default=7, + help="Minimum peptide length after digestion (default 7)", + ) + 
p.add_argument( + "--peptide-max-len", + type=int, + default=30, + help="Maximum peptide length after digestion (default 30)", + ) + p.add_argument( + "--missed-cleavages", + type=int, + default=1, + help="Number of missed cleavages for trypsin digestion (default 1)", + ) + p.add_argument( + "--seed", + type=int, + default=42, + help="RNG seed for shuffle / subsample (default 42)", + ) p.add_argument("--dry-run", action="store_true") p.add_argument("--overwrite", action="store_true") p.add_argument( @@ -64,12 +104,60 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: action="store_true", help="Re-upload S3 objects even if they already exist", ) - return p.parse_args(argv) + + args = p.parse_args(argv) + + if args.entrap_ratio < 1.0: + p.error("--entrap-ratio must be >= 1.0") + + if len(args.entrap_db) > 1 and "SHUFFLED" in args.entrap_db: + p.error("SHUFFLED cannot be mixed with other --entrap-db specs") + + return args + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _subsample_set(s: set[str], k: int, seed: int) -> set[str]: + rng = random.Random(seed) + if k >= len(s): + return set(s) + return set(rng.sample(sorted(s), k)) + + +def _write_peptides(peptides: set[str], path: Path) -> None: + path.write_text("\n".join(sorted(peptides)) + "\n") + + +def _write_pairing(pairs: list[tuple[str, str]], path: Path) -> None: + lines = ["target_peptide\tentrap_peptide"] + for t, s in pairs: + lines.append(f"{t}\t{s}") + path.write_text("\n".join(lines) + "\n") + + +def _digest_fasta( + fasta_path: Path, + missed_cleavages: int, + min_len: int, + max_len: int, +) -> set[str]: + proteins = parse_fasta(fasta_path) + raw = digest_proteins(proteins, missed_cleavages=missed_cleavages) + return length_filter(raw, min_len=min_len, max_len=max_len) + + +# --------------------------------------------------------------------------- 
+# Speclib build +# --------------------------------------------------------------------------- def run_speclib_build( - fasta_s3: str, - speclib_s3: str, + peptides_uri: str, + speclib_uri: str, koina_url: str | None, request_delay_ms: int = 500, ) -> None: @@ -80,14 +168,14 @@ def run_speclib_build( "-p", "speclib_build_cli", "--", - "--fasta", - fasta_s3, + "--peptides", + peptides_uri, "--fixed-mod", "C[U:4]", "--max-ions", "10", "-o", - speclib_s3, + speclib_uri, ] if koina_url: cmd.extend(["--koina-url", koina_url]) @@ -96,15 +184,23 @@ def run_speclib_build( subprocess.run(cmd, check=True) +# --------------------------------------------------------------------------- +# Fixture TOML builder +# --------------------------------------------------------------------------- + + def build_fixture_toml( name: str, description: str, config_path: Path, - fasta_uri: str, + target_peptides_uri: str, speclib_uri: str, raw_uri: str, - entrapment_fasta_uri: str | None, - calibration_speclib_uri: str | None, + entrapment_peptides_uri: str | None = None, + entrapment_ratio: float | None = None, + entrapment_mode: str | None = None, + pairing_uri: str | None = None, + calibration_speclib_uri: str | None = None, ) -> str: lines: list[str] = [] lines.append(f'name = "{name}"') @@ -112,11 +208,17 @@ def build_fixture_toml( lines.append(f'description = "{desc}"') lines.append("") lines.append("[inputs]") - lines.append(f'fasta = "{fasta_uri}"') + lines.append(f'target_peptides = "{target_peptides_uri}"') lines.append(f'speclib = "{speclib_uri}"') lines.append(f'raw = "{raw_uri}"') - if entrapment_fasta_uri is not None: - lines.append(f'entrapment_fasta = "{entrapment_fasta_uri}"') + if entrapment_peptides_uri is not None: + lines.append(f'entrapment_peptides = "{entrapment_peptides_uri}"') + if entrapment_ratio is not None: + lines.append(f"entrapment_ratio = {entrapment_ratio}") + if entrapment_mode is not None: + lines.append(f'entrapment_mode = "{entrapment_mode}"') + if 
pairing_uri is not None: + lines.append(f'pairing = "{pairing_uri}"') if calibration_speclib_uri is not None: lines.append(f'calibration_speclib = "{calibration_speclib_uri}"') lines.append("") @@ -137,16 +239,9 @@ def build_fixture_toml( return "\n".join(lines) -def _resolve_and_upload_fasta( - specs: list[str], - s3_dest: str, - label: str, - workdir: Path, - skip_if_exists: bool = False, -) -> None: - local = workdir / f"{label}.fasta" - resolve_dbs(specs, local) - s3_upload_file(str(local), s3_dest, skip_if_exists=skip_if_exists) +# --------------------------------------------------------------------------- +# Main pipeline +# --------------------------------------------------------------------------- def run_pipeline( @@ -167,31 +262,27 @@ def run_pipeline( dry_run: bool, force: bool = False, request_delay_ms: int = 500, + entrap_ratio: float = 1.0, + peptide_min_len: int = 7, + peptide_max_len: int = 30, + missed_cleavages: int = 1, + seed: int = 42, ) -> None: """Execute the full upload + build + write-toml flow.""" + # Validate SHUFFLED mixing up-front (defensive; parse_args also checks) + if len(entrap_db) > 1 and "SHUFFLED" in entrap_db: + raise ValueError("SHUFFLED cannot be mixed with other --entrap-db specs") + if entrap_ratio < 1.0: + raise ValueError("entrap_ratio must be >= 1.0") + dest_prefix = f"s3://{bucket}/{prefix.rstrip('/')}/{name}" - target_fasta_uri = f"{dest_prefix}/proteome.fasta" - entrap_fasta_uri = f"{dest_prefix}/entrap.fasta" if entrap_db else None - calib_fasta_uri = f"{dest_prefix}/calib.fasta" if calib_db else None main_speclib_uri = speclib_uri or f"{dest_prefix}/lib.msgpack.zst" - # When entrap_db is present, the speclib must cover both target+entrap so - # the search can score entrapment peptides. We upload a concatenated fasta - # to a separate URI and point speclib_build_cli at it. The per-fasta - # target/entrap files are still uploaded separately so analyse() can - # classify hits by source. 
- speclib_input_fasta_uri = ( - f"{dest_prefix}/speclib_input.fasta" if entrap_db else None - ) final_calib_speclib_uri: str | None = calibration_speclib_uri if final_calib_speclib_uri is None and calib_db: final_calib_speclib_uri = f"{dest_prefix}/calib_lib.msgpack.zst" - # Raw is either a local dir we upload or an existing s3 URI we just reference - if raw.startswith("s3://"): - raw_uri = raw - else: - raw_uri = f"{dest_prefix}/sample.d" + raw_uri = raw if raw.startswith("s3://") else f"{dest_prefix}/sample.d" if fixture_target.exists() and not overwrite and not dry_run: raise FileExistsError( @@ -199,17 +290,20 @@ def run_pipeline( " (pass --overwrite to replace)" ) + use_shuffled = entrap_db == ["SHUFFLED"] + use_foreign = bool(entrap_db) and not use_shuffled + plan = { "name": name, "dest_prefix": dest_prefix, - "target_fasta_uri": target_fasta_uri, - "entrap_fasta_uri": entrap_fasta_uri, - "calib_fasta_uri": calib_fasta_uri, + "use_shuffled": use_shuffled, + "use_foreign": use_foreign, + "entrap_ratio": entrap_ratio, "raw_uri": raw_uri, "main_speclib_uri": main_speclib_uri, "calib_speclib_uri": final_calib_speclib_uri, "build_main_speclib": speclib_uri is None, - "build_calib_speclib": (calib_db != [] and calibration_speclib_uri is None), + "build_calib_speclib": (bool(calib_db) and calibration_speclib_uri is None), } logger.info("plan: {}", plan) if dry_run: @@ -219,71 +313,148 @@ def run_pipeline( with tempfile.TemporaryDirectory() as td: workdir = Path(td) - # 1. Resolve and upload target FASTA - _resolve_and_upload_fasta( - db, target_fasta_uri, "proteome", workdir, skip_if_exists=not force + # 1. Resolve target proteome → target peptides + target_fasta = workdir / "target.fasta" + resolve_dbs(db, target_fasta) + p_target = _digest_fasta( + target_fasta, + missed_cleavages=missed_cleavages, + min_len=peptide_min_len, + max_len=peptide_max_len, ) - # 2. 
Optional entrapment FASTA - if entrap_db: - assert entrap_fasta_uri is not None - _resolve_and_upload_fasta( - entrap_db, entrap_fasta_uri, "entrap", workdir, skip_if_exists=not force + # 2. Entrapment handling + p_foreign: set[str] = set() + pairs: list[tuple[str, str]] = [] + actual_r: float | None = None + mode: str | None = None + emit_pairing = False + + if use_shuffled: + # Algorithm 1: shuffled entrapment + r_int = int(entrap_ratio) + pairs = generate_shuffled_entrapment(p_target, r=r_int, seed=seed) + kept_targets = {t for (t, _) in pairs} + p_target = kept_targets + p_foreign = {s for (_, s) in pairs} + actual_r = float(r_int) + mode = "shuffled" + emit_pairing = r_int == 1 + + elif use_foreign: + # Algorithm 2: foreign entrapment + entrap_fasta = workdir / "entrap.fasta" + resolve_dbs(entrap_db, entrap_fasta) + all_foreign = _digest_fasta( + entrap_fasta, + missed_cleavages=missed_cleavages, + min_len=peptide_min_len, + max_len=peptide_max_len, ) + # Remove any peptides that appear in the target + all_foreign -= p_target + n_needed = int(entrap_ratio * len(p_target)) + if n_needed <= len(all_foreign): + p_foreign = _subsample_set(all_foreign, n_needed, seed) + actual_r = entrap_ratio + else: + p_foreign = all_foreign + actual_r = len(p_foreign) / len(p_target) if p_target else 0.0 + warnings.warn( + f"Not enough foreign peptides: needed {n_needed}, " + f"got {len(p_foreign)}. Actual r = {actual_r:.4f}", + stacklevel=2, + ) + pairs = [] + mode = "foreign" + emit_pairing = False - # 3. Optional calibration FASTA - if calib_db: - assert calib_fasta_uri is not None - _resolve_and_upload_fasta( - calib_db, calib_fasta_uri, "calib", workdir, skip_if_exists=not force + # 3. Build database peptide list (target ∪ entrap) + p_database = p_target | p_foreign + + # 4. 
Write local peptide files + target_pep_local = workdir / "target.peptides.txt" + database_pep_local = workdir / "database.peptides.txt" + _write_peptides(p_target, target_pep_local) + _write_peptides(p_database, database_pep_local) + + entrap_pep_local: Path | None = None + pairing_local: Path | None = None + if entrap_db: + entrap_pep_local = workdir / "entrap.peptides.txt" + _write_peptides(p_foreign, entrap_pep_local) + if emit_pairing: + pairing_local = workdir / "pairing.tsv" + _write_pairing(pairs, pairing_local) + + # 5. Upload peptide files + target_pep_uri = f"{dest_prefix}/target.peptides.txt" + database_pep_uri = f"{dest_prefix}/database.peptides.txt" + s3_upload_file(str(target_pep_local), target_pep_uri, skip_if_exists=not force) + s3_upload_file( + str(database_pep_local), database_pep_uri, skip_if_exists=not force + ) + + entrap_pep_uri: str | None = None + pairing_uri: str | None = None + if entrap_pep_local is not None: + entrap_pep_uri = f"{dest_prefix}/entrap.peptides.txt" + s3_upload_file( + str(entrap_pep_local), entrap_pep_uri, skip_if_exists=not force ) + if pairing_local is not None: + pairing_uri = f"{dest_prefix}/pairing.tsv" + s3_upload_file(str(pairing_local), pairing_uri, skip_if_exists=not force) - # 4. Upload raw dir if local + # 6. Upload raw if local if not raw.startswith("s3://"): s3_upload_dir(raw, raw_uri, idempotent=not force) - # 4b. If entrap_db present, build a merged target+entrap fasta locally - # and upload it as the speclib build input. - if speclib_input_fasta_uri is not None: - merged_local = workdir / "speclib_input.fasta" - target_local = workdir / "proteome.fasta" - entrap_local = workdir / "entrap.fasta" - with merged_local.open("wb") as out: - for src in (target_local, entrap_local): - out.write(src.read_bytes()) - if not src.read_bytes().endswith(b"\n"): - out.write(b"\n") + # 7. 
Calibration db peptides (Algorithm 2 path, no entrapment subtract) + calib_pep_uri: str | None = None + if calib_db: + calib_fasta = workdir / "calib.fasta" + resolve_dbs(calib_db, calib_fasta) + p_calib = _digest_fasta( + calib_fasta, + missed_cleavages=missed_cleavages, + min_len=peptide_min_len, + max_len=peptide_max_len, + ) + calib_pep_local = workdir / "calib.peptides.txt" + _write_peptides(p_calib, calib_pep_local) + calib_pep_uri = f"{dest_prefix}/calib.peptides.txt" s3_upload_file( - str(merged_local), - speclib_input_fasta_uri, - skip_if_exists=not force, + str(calib_pep_local), calib_pep_uri, skip_if_exists=not force ) - # 5. Build speclib(s) if not user-provided + # 8. Build speclib(s) if speclib_uri is None: - speclib_input_uri = speclib_input_fasta_uri or target_fasta_uri run_speclib_build( - speclib_input_uri, main_speclib_uri, koina_url, request_delay_ms + database_pep_uri, main_speclib_uri, koina_url, request_delay_ms ) if calib_db and calibration_speclib_uri is None: - assert calib_fasta_uri is not None + assert calib_pep_uri is not None assert final_calib_speclib_uri is not None run_speclib_build( - calib_fasta_uri, + calib_pep_uri, final_calib_speclib_uri, koina_url, request_delay_ms, ) - # 6. Emit fixture TOML + # 9. 
Emit fixture TOML body = build_fixture_toml( name=name, description="", config_path=Path(config), - fasta_uri=target_fasta_uri, + target_peptides_uri=target_pep_uri, speclib_uri=main_speclib_uri, raw_uri=raw_uri, - entrapment_fasta_uri=entrap_fasta_uri, + entrapment_peptides_uri=entrap_pep_uri, + entrapment_ratio=actual_r, + entrapment_mode=mode, + pairing_uri=pairing_uri, calibration_speclib_uri=final_calib_speclib_uri, ) fixture_target.parent.mkdir(parents=True, exist_ok=True) @@ -312,6 +483,11 @@ def main(argv: list[str] | None = None) -> int: dry_run=args.dry_run, force=args.force, request_delay_ms=args.request_delay_ms, + entrap_ratio=args.entrap_ratio, + peptide_min_len=args.peptide_min_len, + peptide_max_len=args.peptide_max_len, + missed_cleavages=args.missed_cleavages, + seed=args.seed, ) return 0 diff --git a/bench/tests/test_db_resolver.py b/bench/tests/test_db_resolver.py index 6a658809..e1a1fff3 100644 --- a/bench/tests/test_db_resolver.py +++ b/bench/tests/test_db_resolver.py @@ -111,3 +111,14 @@ def fake_s3_download(uri, dst): resolve_dbs(["s3://bkt/p.fasta"], out) m.assert_called_once() assert ">s3" in out.read_text() + + +def test_classify_shuffled_sentinel(): + spec = classify_db_spec("SHUFFLED") + assert spec.kind == DbSpecKind.SHUFFLED + + +def test_resolve_dbs_rejects_shuffled(tmp_path): + out = tmp_path / "merged.fasta" + with pytest.raises(ValueError, match="SHUFFLED"): + resolve_dbs(["SHUFFLED"], out) diff --git a/bench/tests/test_push_fixture.py b/bench/tests/test_push_fixture.py index 105ad01d..a1169e9d 100644 --- a/bench/tests/test_push_fixture.py +++ b/bench/tests/test_push_fixture.py @@ -1,26 +1,39 @@ +"""Tests for bench/push_fixture.py. + +S3 + subprocess + resolve_dbs are always mocked via the fake_runtime fixture. +Digest + shuffle helpers run for real so algorithm correctness is verified. 
+""" + +from __future__ import annotations + import textwrap +import tomllib +import warnings from pathlib import Path -from unittest.mock import ANY, patch +from unittest.mock import patch import pytest -from bench.push_fixture import build_fixture_toml, parse_args +from bench._fixture_schema import load_fixture +from bench.push_fixture import ( + build_fixture_toml, + parse_args, + run_pipeline, +) + +# --------------------------------------------------------------------------- +# parse_args +# --------------------------------------------------------------------------- def test_parse_args_minimal(): args = parse_args([ - "--name", - "hela", - "--bucket", - "bk", - "--prefix", - "fx", - "--db", - "UP000005640", - "--raw", - "/tmp/sample.d", - "--config", - "/tmp/cfg.toml", + "--name", "hela", + "--bucket", "bk", + "--prefix", "fx", + "--db", "UP000005640", + "--raw", "/tmp/sample.d", + "--config", "/tmp/cfg.toml", ]) assert args.name == "hela" assert args.bucket == "bk" @@ -30,113 +43,197 @@ def test_parse_args_minimal(): assert args.calib_db == [] assert args.dry_run is False assert args.overwrite is False + assert args.entrap_ratio == 1.0 + assert args.peptide_min_len == 7 + assert args.peptide_max_len == 30 + assert args.missed_cleavages == 1 + assert args.seed == 42 -def test_parse_args_multiple_db_and_entrap(): +def test_parse_args_all_flags(): args = parse_args([ - "--name", - "hy", - "--bucket", - "bk", - "--prefix", - "fx", - "--db", - "UP000005640", - "--db", - "/tmp/extra.fasta", - "--entrap-db", - "UP000002311", - "--raw", - "/tmp/sample.d", - "--config", - "/tmp/cfg.toml", + "--name", "hy", + "--bucket", "bk", + "--prefix", "fx", + "--db", "UP000005640", + "--db", "s3://bkt/extra.fasta", + "--entrap-db", "UP000002311", + "--calib-db", "P12345", + "--raw", "/tmp/sample.d", + "--config", "/tmp/cfg.toml", + "--entrap-ratio", "2.0", + "--peptide-min-len", "6", + "--peptide-max-len", "25", + "--missed-cleavages", "2", + "--seed", "7", + 
"--request-delay-ms", "200", "--dry-run", + "--overwrite", + "--force", ]) - assert args.db == ["UP000005640", "/tmp/extra.fasta"] + assert args.db == ["UP000005640", "s3://bkt/extra.fasta"] assert args.entrap_db == ["UP000002311"] + assert args.calib_db == ["P12345"] + assert args.entrap_ratio == 2.0 + assert args.peptide_min_len == 6 + assert args.peptide_max_len == 25 + assert args.missed_cleavages == 2 + assert args.seed == 7 + assert args.request_delay_ms == 200 assert args.dry_run is True + assert args.overwrite is True + assert args.force is True -def test_build_fixture_toml(tmp_path: Path): +def test_parse_args_shuffled_entrap_db(): + args = parse_args([ + "--name", "s", + "--bucket", "bk", + "--prefix", "fx", + "--db", "UP000005640", + "--raw", "/tmp/sample.d", + "--config", "/tmp/cfg.toml", + "--entrap-db", "SHUFFLED", + ]) + assert args.entrap_db == ["SHUFFLED"] + + +def test_parse_args_ratio_lt_1_rejected(): + with pytest.raises(SystemExit): + parse_args([ + "--name", "x", + "--bucket", "bk", + "--prefix", "fx", + "--db", "UP000005640", + "--raw", "/tmp/s.d", + "--config", "/tmp/cfg.toml", + "--entrap-ratio", "0.5", + ]) + + +def test_parse_args_shuffled_mixed_with_foreign_rejected(): + with pytest.raises(SystemExit): + parse_args([ + "--name", "x", + "--bucket", "bk", + "--prefix", "fx", + "--db", "UP000005640", + "--raw", "/tmp/s.d", + "--config", "/tmp/cfg.toml", + "--entrap-db", "SHUFFLED", + "--entrap-db", "UP000002311", + ]) + + +# --------------------------------------------------------------------------- +# build_fixture_toml +# --------------------------------------------------------------------------- + + +def test_build_fixture_toml_minimal(tmp_path: Path): cfg = tmp_path / "cfg.toml" - cfg.write_text( - textwrap.dedent( - """ - [analysis] - chunk_size = 20000 - """ - ).strip() - ) + cfg.write_text("[analysis]\nchunk_size = 20000\n") out = build_fixture_toml( name="hela", description="200ng HeLa", config_path=cfg, - 
fasta_uri="s3://bk/fx/hela/proteome.fasta", + target_peptides_uri="s3://bk/fx/hela/target.peptides.txt", speclib_uri="s3://bk/fx/hela/lib.msgpack.zst", raw_uri="s3://bk/fx/hela/sample.d", - entrapment_fasta_uri=None, - calibration_speclib_uri=None, ) - # Round-trip via the schema loader to verify validity target_path = tmp_path / "fx.toml" target_path.write_text(out) - from bench._fixture_schema import load_fixture - fx = load_fixture(target_path) assert fx.name == "hela" - assert fx.inputs.entrapment_fasta is None + assert fx.inputs.entrapment_peptides is None + assert fx.inputs.entrapment_ratio is None + assert fx.inputs.entrapment_mode is None + assert fx.inputs.pairing is None assert fx.config["analysis"]["chunk_size"] == 20000 -def test_build_fixture_toml_with_entrap_and_calib(tmp_path: Path): +def test_build_fixture_toml_shuffled_with_pairing(tmp_path: Path): + cfg = tmp_path / "cfg.toml" + cfg.write_text("[analysis]\nchunk_size = 1\n") + out = build_fixture_toml( + name="s", + description="", + config_path=cfg, + target_peptides_uri="s3://b/s/target.peptides.txt", + speclib_uri="s3://b/s/lib.msgpack.zst", + raw_uri="s3://b/s/sample.d", + entrapment_peptides_uri="s3://b/s/entrap.peptides.txt", + entrapment_ratio=1.0, + entrapment_mode="shuffled", + pairing_uri="s3://b/s/pairing.tsv", + ) + p = tmp_path / "fx.toml" + p.write_text(out) + fx = load_fixture(p) + assert fx.has_entrapment() + assert fx.has_pairing() + assert fx.inputs.entrapment_mode == "shuffled" + assert fx.inputs.entrapment_ratio == 1.0 + + +def test_build_fixture_toml_foreign_with_calib(tmp_path: Path): cfg = tmp_path / "cfg.toml" cfg.write_text("[analysis]\nchunk_size = 1\n") out = build_fixture_toml( name="x", description="x", config_path=cfg, - fasta_uri="s3://b/x/proteome.fasta", + target_peptides_uri="s3://b/x/target.peptides.txt", speclib_uri="s3://b/x/lib.msgpack.zst", raw_uri="s3://b/x/sample.d", - entrapment_fasta_uri="s3://b/x/entrap.fasta", - 
calibration_speclib_uri="s3://b/x/calib.msgpack.zst", + entrapment_peptides_uri="s3://b/x/entrap.peptides.txt", + entrapment_ratio=2.0, + entrapment_mode="foreign", + calibration_speclib_uri="s3://b/x/calib_lib.msgpack.zst", ) p = tmp_path / "fx.toml" p.write_text(out) - from bench._fixture_schema import load_fixture - fx = load_fixture(p) assert fx.has_entrapment() + assert not fx.has_pairing() + assert fx.inputs.entrapment_mode == "foreign" assert fx.has_calibration_speclib() -def _common_args(tmp_path): - cfg = tmp_path / "cfg.toml" - cfg.write_text("[analysis]\nchunk_size = 20000\n") - raw = tmp_path / "sample.d" - raw.mkdir() - (raw / "metadata").write_bytes(b"x") - return cfg, raw +# --------------------------------------------------------------------------- +# fake_runtime fixture +# --------------------------------------------------------------------------- + + +def _stub_fasta_content() -> str: + """Three proteins that digest into distinct tryptic peptides.""" + return textwrap.dedent("""\ + >sp|P1|PROT1 + MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEK + >sp|P2|PROT2 + MAKQADSVSVKAEQYLSAELREQNLAKMSAAEERNRIAESQRQLAEQQKQLEQLKQKLEQLKQKLEQLK + >sp|P3|PROT3 + MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSY + """) @pytest.fixture def fake_runtime(tmp_path): - """Patch S3 + speclib_build_cli + resolve_dbs for the run_pipeline tests.""" + """Patch S3 + speclib_build_cli. resolve_dbs writes a real stub fasta so + digest helpers run for real. 
Also patches generate_shuffled_entrapment + is NOT mocked — the real implementation runs with deterministic seed.""" fx_dir = tmp_path / "bench_fixtures" fx_dir.mkdir() + def _resolve(specs, out_path): + out_path.write_text(_stub_fasta_content()) + with ( patch("bench.push_fixture.s3_upload_file") as up_file, patch("bench.push_fixture.s3_upload_dir") as up_dir, patch("bench.push_fixture.run_speclib_build") as build, - patch("bench.push_fixture.resolve_dbs") as res, + patch("bench.push_fixture.resolve_dbs", side_effect=_resolve) as res, ): - # resolve_dbs writes a stub fasta to the requested output path - def _resolve(specs, out): - out.write_text(">x\nMK\n") - - res.side_effect = _resolve - yield { "up_file": up_file, "up_dir": up_dir, @@ -146,10 +243,22 @@ def _resolve(specs, out): } +def _common_args(tmp_path): + cfg = tmp_path / "cfg.toml" + cfg.write_text("[analysis]\nchunk_size = 20000\n") + raw = tmp_path / "sample.d" + raw.mkdir() + (raw / "metadata").write_bytes(b"x") + return cfg, raw + + +# --------------------------------------------------------------------------- +# run_pipeline — no entrapment +# --------------------------------------------------------------------------- + + def test_run_pipeline_minimal(tmp_path, fake_runtime): cfg, raw = _common_args(tmp_path) - from bench.push_fixture import run_pipeline - output_toml = fake_runtime["fx_dir"] / "hela.toml" run_pipeline( name="hela", @@ -167,30 +276,31 @@ def test_run_pipeline_minimal(tmp_path, fake_runtime): overwrite=False, dry_run=False, ) - - # Resolved + uploaded the target fasta + # resolve_dbs called once for target assert fake_runtime["res"].call_count == 1 - fake_runtime["up_file"].assert_any_call( - ANY, "s3://bk/fx/hela/proteome.fasta", skip_if_exists=True - ) - # Uploaded the raw directory + # target.peptides.txt + database.peptides.txt uploaded + uploaded_uris = [c.args[1] for c in fake_runtime["up_file"].call_args_list] + assert any("target.peptides.txt" in u for u in uploaded_uris) + 
assert any("database.peptides.txt" in u for u in uploaded_uris) + # entrap.peptides.txt NOT uploaded + assert not any("entrap.peptides.txt" in u for u in uploaded_uris) + # Raw dir uploaded fake_runtime["up_dir"].assert_called_once() - args = fake_runtime["up_dir"].call_args.args - assert args[0] == str(raw) - assert args[1] == "s3://bk/fx/hela/sample.d" - assert fake_runtime["up_dir"].call_args.kwargs.get("idempotent") is True - # Built the speclib + assert fake_runtime["up_dir"].call_args.args[1] == "s3://bk/fx/hela/sample.d" + # speclib build called once fake_runtime["build"].assert_called_once() - # Wrote fixture TOML + # speclib built from database.peptides.txt + build_call_args = fake_runtime["build"].call_args.args + assert "database.peptides.txt" in build_call_args[0] + # fixture written assert output_toml.exists() - body = output_toml.read_text() - assert "s3://bk/fx/hela/lib.msgpack.zst" in body + fx = load_fixture(output_toml) + assert fx.name == "hela" + assert fx.inputs.entrapment_peptides is None def test_run_pipeline_skip_build_when_speclib_provided(tmp_path, fake_runtime): cfg, raw = _common_args(tmp_path) - from bench.push_fixture import run_pipeline - output_toml = fake_runtime["fx_dir"] / "hela.toml" run_pipeline( name="hela", @@ -208,16 +318,103 @@ def test_run_pipeline_skip_build_when_speclib_provided(tmp_path, fake_runtime): overwrite=False, dry_run=False, ) - # No speclib build fake_runtime["build"].assert_not_called() - # Fixture TOML references the user-provided URI assert "s3://other/lib.msgpack.zst" in output_toml.read_text() -def test_run_pipeline_with_entrap_and_calib_db(tmp_path, fake_runtime): +# --------------------------------------------------------------------------- +# run_pipeline — foreign entrapment (Algorithm 2) +# --------------------------------------------------------------------------- + + +def test_run_pipeline_foreign_entrap(tmp_path, fake_runtime): cfg, raw = _common_args(tmp_path) - from bench.push_fixture import 
run_pipeline + output_toml = fake_runtime["fx_dir"] / "x.toml" + + # Use distinct proteins so entrap peptides survive the target-subtraction step + # and actual_r stays >= 1.0 (passes the schema validator). + target_fasta = textwrap.dedent("""\ + >sp|T1|TARGET + MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFK + """) + entrap_fasta = textwrap.dedent("""\ + >sp|E1|ENTRAP + MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSY + >sp|E2|ENTRAP2 + MAKQADSVSVKAEQYLSAELREQNLAKMSAAEERNRIAESQR + >sp|E3|ENTRAP3 + MALPVTALLLPLALLLHAARPSFSLVKRGELKPAPKALPEPKPAPKALPEPKPAPKALPEPKPVSKMAPP + """) + + call_count = [0] + + def _resolve_distinct(specs, out_path): + call_count[0] += 1 + if call_count[0] == 1: + out_path.write_text(target_fasta) + else: + out_path.write_text(entrap_fasta) + + fake_runtime["res"].side_effect = _resolve_distinct + + run_pipeline( + name="x", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=["UP000002311"], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + ) + # resolve_dbs called for target + entrap = 2 + assert fake_runtime["res"].call_count == 2 + uploaded_uris = [c.args[1] for c in fake_runtime["up_file"].call_args_list] + assert any("entrap.peptides.txt" in u for u in uploaded_uris) + # pairing.tsv NOT uploaded (foreign mode, not shuffled) + assert not any("pairing.tsv" in u for u in uploaded_uris) + fx = load_fixture(output_toml) + assert fx.inputs.entrapment_mode == "foreign" + assert fx.inputs.pairing is None + + +def test_run_pipeline_with_calib_db(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + output_toml = fake_runtime["fx_dir"] / "x.toml" + run_pipeline( + name="x", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=[], + calib_db=["P12345"], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + 
overwrite=False, + dry_run=False, + ) + # resolve_dbs: target + calib = 2 + assert fake_runtime["res"].call_count == 2 + # build called twice: main + calib + assert fake_runtime["build"].call_count == 2 + uploaded_uris = [c.args[1] for c in fake_runtime["up_file"].call_args_list] + assert any("calib.peptides.txt" in u for u in uploaded_uris) + fx = load_fixture(output_toml) + assert fx.has_calibration_speclib() + +def test_run_pipeline_foreign_and_calib(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) output_toml = fake_runtime["fx_dir"] / "x.toml" run_pipeline( name="x", @@ -235,18 +432,110 @@ def test_run_pipeline_with_entrap_and_calib_db(tmp_path, fake_runtime): overwrite=False, dry_run=False, ) - # resolve_dbs called for db, entrap_db, calib_db = 3 times + # resolve_dbs: target + entrap + calib = 3 assert fake_runtime["res"].call_count == 3 - # speclib_build called twice: main + calibration + # build called twice assert fake_runtime["build"].call_count == 2 body = output_toml.read_text() - assert "entrapment_fasta" in body and "calibration_speclib" in body + assert "entrap" in body and "calibration_speclib" in body -def test_run_pipeline_refuses_overwrite(tmp_path, fake_runtime): +# --------------------------------------------------------------------------- +# run_pipeline — shuffled entrapment (Algorithm 1) +# --------------------------------------------------------------------------- + + +def test_run_pipeline_shuffled(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) + output_toml = fake_runtime["fx_dir"] / "s.toml" + run_pipeline( + name="s", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=["SHUFFLED"], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + entrap_ratio=1.0, + seed=42, + ) + # resolve_dbs called only for target (SHUFFLED is not resolved via resolve_dbs) + assert 
fake_runtime["res"].call_count == 1 + uploaded_uris = [c.args[1] for c in fake_runtime["up_file"].call_args_list] + # pairing.tsv uploaded (r=1, shuffled) + assert any("pairing.tsv" in u for u in uploaded_uris) + assert any("entrap.peptides.txt" in u for u in uploaded_uris) + fx = load_fixture(output_toml) + assert fx.inputs.entrapment_mode == "shuffled" + assert fx.inputs.entrapment_ratio == 1.0 + assert fx.has_pairing() + + +def test_run_pipeline_shuffled_r2_no_pairing(tmp_path, fake_runtime): + """r=2 shuffled: two shuffles per target; pairing.tsv NOT emitted.""" + cfg, raw = _common_args(tmp_path) + output_toml = fake_runtime["fx_dir"] / "s2.toml" + run_pipeline( + name="s2", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=["SHUFFLED"], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + entrap_ratio=2.0, + seed=42, + ) + uploaded_uris = [c.args[1] for c in fake_runtime["up_file"].call_args_list] + assert not any("pairing.tsv" in u for u in uploaded_uris) + fx = load_fixture(output_toml) + assert fx.inputs.entrapment_mode == "shuffled" + assert not fx.has_pairing() + + +def test_run_pipeline_shuffled_mixing_foreign_rejected(tmp_path, fake_runtime): cfg, raw = _common_args(tmp_path) - from bench.push_fixture import run_pipeline + output_toml = fake_runtime["fx_dir"] / "bad.toml" + with pytest.raises(ValueError, match="SHUFFLED"): + run_pipeline( + name="bad", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=["SHUFFLED", "UP000002311"], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + ) + + +# --------------------------------------------------------------------------- +# run_pipeline — overwrite / dry_run / force guards +# 
--------------------------------------------------------------------------- + +def test_run_pipeline_refuses_overwrite(tmp_path, fake_runtime): + cfg, raw = _common_args(tmp_path) target = fake_runtime["fx_dir"] / "hela.toml" target.write_text("# pre-existing") @@ -271,8 +560,6 @@ def test_run_pipeline_refuses_overwrite(tmp_path, fake_runtime): def test_run_pipeline_dry_run(tmp_path, fake_runtime): cfg, raw = _common_args(tmp_path) - from bench.push_fixture import run_pipeline - target = fake_runtime["fx_dir"] / "hela.toml" run_pipeline( name="hela", @@ -290,7 +577,6 @@ def test_run_pipeline_dry_run(tmp_path, fake_runtime): overwrite=False, dry_run=True, ) - # No side effects fake_runtime["res"].assert_not_called() fake_runtime["up_file"].assert_not_called() fake_runtime["up_dir"].assert_not_called() @@ -299,10 +585,7 @@ def test_run_pipeline_dry_run(tmp_path, fake_runtime): def test_run_pipeline_default_skips_existing_uploads(tmp_path, fake_runtime): - """By default (no --force), uploads are idempotent: existing S3 objects skipped.""" cfg, raw = _common_args(tmp_path) - from bench.push_fixture import run_pipeline - output_toml = fake_runtime["fx_dir"] / "hela.toml" run_pipeline( name="hela", @@ -319,20 +602,14 @@ def test_run_pipeline_default_skips_existing_uploads(tmp_path, fake_runtime): fixture_target=output_toml, overwrite=False, dry_run=False, - # force NOT passed — defaults to False ) - # s3_upload_file got called with skip_if_exists=True - fasta_call = fake_runtime["up_file"].call_args - assert fasta_call.kwargs.get("skip_if_exists") is True - # s3_upload_dir got called with idempotent=True - raw_call = fake_runtime["up_dir"].call_args - assert raw_call.kwargs.get("idempotent") is True + for c in fake_runtime["up_file"].call_args_list: + assert c.kwargs.get("skip_if_exists") is True + assert fake_runtime["up_dir"].call_args.kwargs.get("idempotent") is True def test_run_pipeline_force_overrides_skip(tmp_path, fake_runtime): cfg, raw = _common_args(tmp_path) 
- from bench.push_fixture import run_pipeline - output_toml = fake_runtime["fx_dir"] / "hela.toml" run_pipeline( name="hela", @@ -351,7 +628,71 @@ def test_run_pipeline_force_overrides_skip(tmp_path, fake_runtime): dry_run=False, force=True, ) - fasta_call = fake_runtime["up_file"].call_args - assert fasta_call.kwargs.get("skip_if_exists") is False - raw_call = fake_runtime["up_dir"].call_args - assert raw_call.kwargs.get("idempotent") is False + for c in fake_runtime["up_file"].call_args_list: + assert c.kwargs.get("skip_if_exists") is False + assert fake_runtime["up_dir"].call_args.kwargs.get("idempotent") is False + + +# --------------------------------------------------------------------------- +# Algorithm verification — actual r recorded correctly +# --------------------------------------------------------------------------- + + +def test_run_pipeline_foreign_insufficient_warns(tmp_path, fake_runtime): + """When foreign peptides are fewer than needed, warn and record actual r. + + Note: if actual_r < 1.0 the schema validator rejects it, so we check the + raw TOML text rather than loading through the schema. 
+ """ + cfg, raw = _common_args(tmp_path) + output_toml = fake_runtime["fx_dir"] / "short.toml" + + # target gets a long protein with many tryptic peptides + target_fasta = textwrap.dedent("""\ + >sp|T1|PROT + MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEK + """) + # Tiny entrap — too short for 7-mer tryptic peptide → 0 foreign after filter + entrap_fasta = textwrap.dedent("""\ + >sp|E1|ENTRAP + MKWVTFISLLL + """) + + call_count = [0] + + def _resolve_custom(specs, out_path): + call_count[0] += 1 + if call_count[0] == 1: + out_path.write_text(target_fasta) + else: + out_path.write_text(entrap_fasta) + + fake_runtime["res"].side_effect = _resolve_custom + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + run_pipeline( + name="short", + bucket="bk", + prefix="fx", + db=["UP000005640"], + raw=str(raw), + config=str(cfg), + entrap_db=["UP000002311"], + calib_db=[], + speclib_uri=None, + calibration_speclib_uri=None, + koina_url=None, + fixture_target=output_toml, + overwrite=False, + dry_run=False, + entrap_ratio=5.0, + ) + # Should have warned about insufficient peptides + assert any("Not enough foreign" in str(warning.message) for warning in w) + # actual_r in the raw TOML should be < 5.0 + toml_text = output_toml.read_text() + assert "entrapment_ratio" in toml_text + data = tomllib.loads(toml_text) + actual_r = data["inputs"]["entrapment_ratio"] + assert actual_r < 5.0 From 41a01d3e7aa8cec7f1235995207276e1182aa108 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sat, 9 May 2026 10:17:19 -0700 Subject: [PATCH 34/41] feat(bench)!: rewrite entrapment.py with FDRBench Method 1/1B + score histogram --- bench/entrapment.py | 443 +++++++++++++++++++++------------ bench/tests/test_entrapment.py | 373 +++++++++++++++------------ 2 files changed, 489 insertions(+), 327 deletions(-) diff --git a/bench/entrapment.py b/bench/entrapment.py index 8ecbe8c7..f168d02b 100644 --- a/bench/entrapment.py +++ b/bench/entrapment.py @@ -1,7 +1,22 @@ -"""Entrapment classification + FDR walk + plot. +"""Entrapment classification + FDR estimators (Noble et al, FDRBench Table S2). -The classification half lives here (Task 6). The FDR walk and plot land in -Task 7. The CLI entry-point lands in Task 8. +The fixture's peptide-list inputs (target_peptides, entrapment_peptides) feed +sequence-level set membership classification of result PSMs. Three FDP +estimators are emitted from compute_fdr_curve: + +- empirical_fdr_lower = n_e / (n_e + n_t) # lower bound +- empirical_fdr_combined = n_e × (1 + 1/r) / (n_e + n_t) # avg upper bound +- empirical_fdr_matched = (n_e + n_p_s_t + 2·n_p_t_s) / (n_e + n_t) + # matched estimator, k=1; only when pairing supplied + +Where: +- r is the entrapment_ratio recorded on the fixture +- n_p_t_s = entrap hits whose paired target also scored ≥ threshold + AND entrap_score > paired_target_score +- n_p_s_t = entrap hits whose paired target scored < threshold + +The walk filters to is_target=True before accumulating counts (post-competition +target winners only; decoy wins are TDA-style FPs, not entrapment FPs). 
""" from __future__ import annotations @@ -10,82 +25,44 @@ import re from pathlib import Path -import ahocorasick # ty: ignore[unresolved-import] import polars as pl _MOD_RE = re.compile(r"\[[^\]]*\]|\([^)]*\)|[0-9.]+") -"""Strip bracketed mods (`[U:4]`, `[42]`), parenthesised mods (`(Phospho)`), -and bare numeric mass shifts (`123.45`).""" +"""Strip [...], (...), and bare numeric mass shifts from peptide sequences.""" def strip_mods(seq: str) -> str: - """Strip mod annotations to return a bare AA sequence (alpha chars only).""" return _MOD_RE.sub("", seq) -def parse_fasta(path: str | Path) -> dict[str, str]: - """Parse a FASTA file into {accession: concatenated_sequence}. - - Accession is taken as the full header line minus the leading `>`, - stripped of trailing whitespace. The full header is used so callers can - later parse it however they want; we don't impose uniprot's `sp|...|` - grammar here. - """ - out: dict[str, str] = {} - current_acc: str | None = None - parts: list[str] = [] - with Path(path).open("r") as f: - for raw_line in f: - line = raw_line.rstrip() - if not line: - continue - if line.startswith(">"): - if current_acc is not None: - out[current_acc] = "".join(parts) - current_acc = line[1:].strip() - parts = [] - else: - parts.append(line) - if current_acc is not None: - out[current_acc] = "".join(parts) +def load_peptide_set(path: str | Path) -> set[str]: + """Load a peptide list .txt file; returns a set of bare AA sequences.""" + out: set[str] = set() + for line in Path(path).read_text().splitlines(): + s = line.strip() + if s: + out.add(s) return out -def count_kmers(fasta_path: str | Path, k: int = 7) -> set[str]: - """Return the set of unique k-mers across all proteins in a FASTA. - - Window slides over each protein sequence; sequences shorter than `k` - contribute nothing. 
- """ - proteins = parse_fasta(fasta_path) - out: set[str] = set() - for seq in proteins.values(): - if len(seq) < k: +def load_pairing(path: str | Path) -> dict[str, str]: + """Load a pairing.tsv with header `target_peptide\\tentrap_peptide`.""" + lines = Path(path).read_text().splitlines() + if not lines: + return {} + out: dict[str, str] = {} + # Skip header + for raw in lines[1:]: + s = raw.strip() + if not s: continue - for i in range(len(seq) - k + 1): - out.add(seq[i : i + k]) + parts = s.split("\t") + if len(parts) != 2: + raise ValueError(f"bad pairing row: {raw!r}") + out[parts[0]] = parts[1] return out -def kmer_normalization_factor( - target_fasta: str | Path, - entrapment_fasta: str | Path, - k: int = 7, -) -> float: - """Compute |T_unique_kmers| / |E_unique_kmers| after dropping shared k-mers. - - Used to rescale entrapment hit counts so empirical FDR is comparable - across proteomes of unequal search-space size. Clamped at 1.0 in the - denominator to avoid div-by-zero on tiny / empty entrapment fastas. 
- """ - t = count_kmers(target_fasta, k=k) - e = count_kmers(entrapment_fasta, k=k) - shared = t & e - t_only = t - shared - e_only = e - shared - return len(t_only) / max(1, len(e_only)) - - class PeptideClass(enum.Enum): TARGET = "target" ENTRAPMENT = "entrapment" @@ -93,86 +70,119 @@ class PeptideClass(enum.Enum): UNKNOWN = "unknown" -def _build_hits(patterns: set[str], proteins: dict[str, str]) -> set[str]: - """Return the subset of `patterns` that occurs as a substring of any value - in `proteins`.""" - if not patterns: - return set() - aut = ahocorasick.Automaton() - for pat in patterns: - aut.add_word(pat, pat) - aut.make_automaton() - - hits: set[str] = set() - for seq in proteins.values(): - for _, pat in aut.iter(seq): - hits.add(pat) - if len(hits) == len(patterns): - return hits - return hits - - def classify_peptides( results: pl.DataFrame, - target_fasta: str | Path, - entrapment_fasta: str | Path, + target_peptides: set[str], + entrapment_peptides: set[str], ) -> pl.DataFrame: - """Add `class` and `is_entrapment` columns to a results DataFrame. - - `results` must have a `sequence` column. Sequences are mod-stripped - before substring matching. Shared peptides (present in both fastas) are - marked SHARED_DROPPED -- callers exclude them from FDR. 
- """ + """Add `class` and `is_entrapment` columns via set membership.""" if "sequence" not in results.columns: - raise ValueError("results dataframe missing required 'sequence' column") - - target = parse_fasta(target_fasta) - entrap = parse_fasta(entrapment_fasta) - - stripped = results["sequence"].map_elements(strip_mods, return_dtype=pl.Utf8) - patterns = set(stripped.to_list()) - - hits_t = _build_hits(patterns, target) - hits_e = _build_hits(patterns, entrap) - - def _classify(s: str) -> str: - in_t, in_e = s in hits_t, s in hits_e - if in_t and in_e: - return PeptideClass.SHARED_DROPPED.value - if in_t: + raise ValueError("results dataframe missing 'sequence' column") + + shared = target_peptides & entrapment_peptides + target_only = target_peptides - shared + entrap_only = entrapment_peptides - shared + + def _classify(seq: str) -> str: + s = strip_mods(seq) + # Prefer the stripped form; fall back to original for sets that store + # sequences without any mod annotations (e.g., peptide-list .txt files + # whose entries were never annotated, so strip_mods is a no-op on them + # but may modify the query sequence if it contains numeric characters). + key = s if (s in target_only or s in entrap_only or s in shared) else seq + if key in target_only: return PeptideClass.TARGET.value - if in_e: + if key in entrap_only: return PeptideClass.ENTRAPMENT.value + if key in shared: + return PeptideClass.SHARED_DROPPED.value return PeptideClass.UNKNOWN.value - classes = stripped.map_elements(_classify, return_dtype=pl.Utf8) + classes = results["sequence"].map_elements(_classify, return_dtype=pl.Utf8) is_entrap = classes == PeptideClass.ENTRAPMENT.value - return results.with_columns( classes.alias("class"), is_entrap.alias("is_entrapment"), ) +def _walk_matched_counts( + keep: pl.DataFrame, + pairing: dict[str, str], +) -> tuple[list[int], list[int]]: + """Walk rows in qvalue order, accumulate (n_p_t_s, n_p_s_t) at each row. 
+ + n_p_t_s = entrap hits whose paired_target ALSO entered the keep set + before this row AND entrap_score > paired_target_score. + n_p_s_t = entrap hits whose paired_target has NOT entered the keep set + by this row (paired target failed the threshold). + + Walks rows in the order they appear in `keep` (already sorted by qvalue). + """ + # Per-row maps for lookup + seq_col = keep["sequence"].to_list() + cls_col = keep["class"].to_list() + score_col = keep["main_score"].to_list() + # target peptide -> (row_index, score) when discovered; otherwise absent + discovered_target_score: dict[str, float] = {} + pts = 0 # n_p_t_s cumulative + pst = 0 # n_p_s_t cumulative + pts_col: list[int] = [] + pst_col: list[int] = [] + # Build target → entrap reverse index from pairing (we look up paired target + # of an entrap hit; pairing is target -> entrap by convention). + entrap_to_target = {e: t for t, e in pairing.items()} + for i in range(len(seq_col)): + seq = seq_col[i] + cls = cls_col[i] + score = float(score_col[i]) + if cls == PeptideClass.TARGET.value: + discovered_target_score[seq] = score + elif cls == PeptideClass.ENTRAPMENT.value: + paired_target = entrap_to_target.get(seq) + if paired_target is None: + # Entrap not in pairing dict → treat as if its paired target + # was not discovered; counts as n_p_s_t per the "paired target + # didn't reach threshold" branch. 
+ pst += 1 + elif paired_target in discovered_target_score: + # Paired target already in keep set; compare scores + tscore = discovered_target_score[paired_target] + if score > tscore: + pts += 1 + # else: entrap < target; not an upper-bound contribution + else: + # Paired target not discovered yet → entrap above s, target below + pst += 1 + pts_col.append(pts) + pst_col.append(pst) + return pts_col, pst_col + + def compute_fdr_curve( classified: pl.DataFrame, - normalization_factor: float = 1.0, + ratio: float, + pairing: dict[str, str] | None = None, ) -> pl.DataFrame: - """Sort by qvalue, accumulate target/entrapment counts, return curve. + """Sort by qvalue (ascending), accumulate target/entrap counts, return curve. - Rows whose class is SHARED_DROPPED or UNKNOWN are excluded from both - numerator and denominator. + Filters to is_target=True before walking (decoys are TDA FPs, not entrapment FPs). + Only rows whose class is TARGET or ENTRAPMENT contribute. - `normalization_factor` rescales entrapment counts to compensate for - differences in target vs entrapment search-space size (e.g., from - `kmer_normalization_factor`). With factor=1.0 (default), `empirical_fdr_norm` - equals `empirical_fdr_raw`. + Emits columns: n_target, n_entrap, empirical_fdr_lower, empirical_fdr_combined. + If `pairing` provided AND `main_score` column present, also emits + empirical_fdr_matched. 
""" if "qvalue" not in classified.columns: raise ValueError("classified dataframe missing 'qvalue' column") + if "is_target" not in classified.columns: + raise ValueError("classified dataframe missing 'is_target' column") + if ratio < 1.0: + raise ValueError(f"ratio must be >= 1.0, got {ratio}") keep = classified.filter( - pl.col("class").is_in( + pl.col("is_target") + & pl.col("class").is_in( [PeptideClass.TARGET.value, PeptideClass.ENTRAPMENT.value] ) ).sort("qvalue") @@ -183,50 +193,70 @@ def compute_fdr_curve( n_entrap = ( (keep["class"] == PeptideClass.ENTRAPMENT.value).cast(pl.UInt32).cum_sum() ) - raw_fdr = (n_entrap.cast(pl.Float64) / (n_target + n_entrap)).fill_nan(0.0) - n_entrap_norm = n_entrap.cast(pl.Float64) * normalization_factor - norm_fdr = ( - n_entrap_norm / (n_target.cast(pl.Float64) + n_entrap_norm) - ).fill_nan(0.0) + n_total = (n_target + n_entrap).cast(pl.Float64) + lower = (n_entrap.cast(pl.Float64) / n_total).fill_nan(0.0) + combined = (n_entrap.cast(pl.Float64) * (1.0 + 1.0 / ratio) / n_total).fill_nan( + 0.0 + ) - return keep.with_columns( + out = keep.with_columns( n_target.alias("n_target"), n_entrap.alias("n_entrap"), - n_entrap_norm.alias("n_entrap_norm"), - raw_fdr.alias("empirical_fdr_raw"), - norm_fdr.alias("empirical_fdr_norm"), + lower.alias("empirical_fdr_lower"), + combined.alias("empirical_fdr_combined"), ) + if pairing is not None and "main_score" in keep.columns: + pts_col, pst_col = _walk_matched_counts(out, pairing) + out = out.with_columns( + pl.Series("n_p_t_s", pts_col, dtype=pl.UInt32), + pl.Series("n_p_s_t", pst_col, dtype=pl.UInt32), + ) + matched_num = ( + n_entrap.cast(pl.Float64) + + pl.Series("n_p_s_t", pst_col).cast(pl.Float64) + + 2.0 * pl.Series("n_p_t_s", pts_col).cast(pl.Float64) + ) + matched = (matched_num / n_total).fill_nan(0.0) + out = out.with_columns(matched.alias("empirical_fdr_matched")) + + return out + def plot_fdr_curve( curve: pl.DataFrame, output_path: str | Path, title: str = "Reported 
q-value vs empirical entrapment FDR", + xlim: float | None = None, ) -> None: - """Render a FDR-vs-qvalue plot to a PNG file. Plots the kmer-normalized - curve (primary) and the raw curve (faded) for comparison.""" + """Render curves to a PNG. Plots all available estimators.""" import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(5, 5), dpi=150) - ax.plot( - curve["qvalue"], - curve["empirical_fdr_norm"], - lw=1.5, - label="empirical (kmer-normalized)", - ) - ax.plot( - curve["qvalue"], - curve["empirical_fdr_raw"], - lw=1.0, - ls=":", - alpha=0.6, - label="empirical (raw, unnormalized)", - ) - _max_qv = curve["qvalue"].cast(pl.Float64).max() - lim: float = max(0.05, _max_qv if isinstance(_max_qv, float) else 0.05) + if "empirical_fdr_combined" in curve.columns: + ax.plot( + curve["qvalue"], curve["empirical_fdr_combined"], lw=1.5, + label="combined (avg upper bound)", + ) + if "empirical_fdr_matched" in curve.columns: + ax.plot( + curve["qvalue"], curve["empirical_fdr_matched"], lw=1.5, + label="matched (k=1, avg upper bound)", + ) + if "empirical_fdr_lower" in curve.columns: + ax.plot( + curve["qvalue"], curve["empirical_fdr_lower"], lw=1.0, + ls=":", alpha=0.7, label="lower bound", + ) + + if xlim is None: + _max_qv = curve["qvalue"].cast(pl.Float64).max() + lim: float = max(0.05, _max_qv if isinstance(_max_qv, float) else 0.05) + else: + lim = float(xlim) ax.plot([0, lim], [0, lim], color="grey", ls="--", lw=1.0, label="y=x") ax.set_xlabel("reported q-value") ax.set_ylabel("empirical FDR") @@ -239,48 +269,135 @@ def plot_fdr_curve( plt.close(fig) +def plot_score_histogram( + classified: pl.DataFrame, + output_path: str | Path, + title: str = "main_score by class × is_target", +) -> None: + """Render a 4-group main_score histogram (log-x, log-y count).""" + import matplotlib + import numpy as np + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + if "main_score" not in classified.columns: + raise 
ValueError("classified dataframe missing 'main_score' column") + + df = classified + groups = { + "target × class=target": df.filter( + pl.col("is_target") & (pl.col("class") == "target") + ), + "target × class=entrap": df.filter( + pl.col("is_target") & (pl.col("class") == "entrapment") + ), + "decoy × class=target": df.filter( + ~pl.col("is_target") & (pl.col("class") == "target") + ), + "decoy × class=entrap": df.filter( + ~pl.col("is_target") & (pl.col("class") == "entrapment") + ), + } + all_scores = df["main_score"].to_numpy() + all_scores = all_scores[np.isfinite(all_scores) & (all_scores > 0)] + if len(all_scores) == 0: + # Empty plot rather than crash + fig, ax = plt.subplots(figsize=(9, 5), dpi=150) + ax.set_title(title + " (no positive scores)") + fig.savefig(output_path) + plt.close(fig) + return + bins = np.logspace(np.log10(all_scores.min()), np.log10(all_scores.max()), 80) + + fig, ax = plt.subplots(figsize=(9, 5), dpi=150) + colors = { + "target × class=target": "C0", + "target × class=entrap": "C1", + "decoy × class=target": "C2", + "decoy × class=entrap": "C3", + } + for label, sub in groups.items(): + s = sub["main_score"].to_numpy() + s = s[np.isfinite(s) & (s > 0)] + if len(s) == 0: + continue + ax.hist( + s, bins=bins, alpha=0.6, label=f"{label} (n={len(s)})", + color=colors[label], histtype="step", linewidth=1.5, + ) + ax.set_xscale("log") + ax.set_yscale("log") + ax.set_xlabel("main_score (log)") + ax.set_ylabel("count (log)") + ax.legend(loc="best", fontsize=9) + ax.set_title(title) + fig.tight_layout() + fig.savefig(output_path) + plt.close(fig) + + def _scalar_at_q( curve: pl.DataFrame, q_threshold: float, suffix: str ) -> dict[str, float | int]: - """Read off n_target / n_entrap / empirical FDR (raw + norm) at q <= threshold.""" sub = curve.filter(pl.col("qvalue") <= q_threshold) if sub.height == 0: - return { + out: dict[str, float | int] = { f"entrap/n_target_at_{suffix}": 0, f"entrap/n_entrap_at_{suffix}": 0, - 
f"entrap/empirical_fdr_raw_at_{suffix}": 0.0, - f"entrap/empirical_fdr_norm_at_{suffix}": 0.0, + f"entrap/empirical_fdr_lower_at_{suffix}": 0.0, + f"entrap/empirical_fdr_combined_at_{suffix}": 0.0, } + if "empirical_fdr_matched" in curve.columns: + out[f"entrap/empirical_fdr_matched_at_{suffix}"] = 0.0 + return out last = sub.row(-1, named=True) - return { + out = { f"entrap/n_target_at_{suffix}": int(last["n_target"]), f"entrap/n_entrap_at_{suffix}": int(last["n_entrap"]), - f"entrap/empirical_fdr_raw_at_{suffix}": float(last["empirical_fdr_raw"]), - f"entrap/empirical_fdr_norm_at_{suffix}": float(last["empirical_fdr_norm"]), + f"entrap/empirical_fdr_lower_at_{suffix}": float(last["empirical_fdr_lower"]), + f"entrap/empirical_fdr_combined_at_{suffix}": float( + last["empirical_fdr_combined"] + ), } + if "empirical_fdr_matched" in curve.columns: + out[f"entrap/empirical_fdr_matched_at_{suffix}"] = float( + last["empirical_fdr_matched"] + ) + return out def analyse( results_parquet: str | Path, - target_fasta: str | Path, - entrapment_fasta: str | Path, + target_peptides: str | Path, + entrapment_peptides: str | Path, + ratio: float, + pairing_path: str | Path | None, out_parquet: str | Path, - out_plot: str | Path, + out_fdr_plot: str | Path, + out_hist_plot: str | Path, title: str = "Reported q-value vs empirical entrapment FDR", - kmer_k: int = 7, ) -> dict[str, float | int]: - """End-to-end: classify → kmer norm → FDR walk → write parquet+plot → scalars.""" + """End-to-end: classify → curve → write parquet + 2 plots → return scalars.""" results = pl.read_parquet(results_parquet) - classified = classify_peptides(results, target_fasta, entrapment_fasta) + target_set = load_peptide_set(target_peptides) + entrap_set = load_peptide_set(entrapment_peptides) + classified = classify_peptides(results, target_set, entrap_set) Path(out_parquet).parent.mkdir(parents=True, exist_ok=True) classified.write_parquet(out_parquet) - factor = kmer_normalization_factor(target_fasta, 
entrapment_fasta, k=kmer_k) - curve = compute_fdr_curve(classified, normalization_factor=factor) - Path(out_plot).parent.mkdir(parents=True, exist_ok=True) - plot_fdr_curve(curve, out_plot, title=title) + pairing = None + if pairing_path is not None: + pairing = load_pairing(pairing_path) + + curve = compute_fdr_curve(classified, ratio=ratio, pairing=pairing) + Path(out_fdr_plot).parent.mkdir(parents=True, exist_ok=True) + plot_fdr_curve(curve, out_fdr_plot, title=title) + + Path(out_hist_plot).parent.mkdir(parents=True, exist_ok=True) + plot_score_histogram(classified, out_hist_plot, title=f"{title} (score hist)") - scalars: dict[str, float | int] = {"entrap/normalization_factor": float(factor)} + scalars: dict[str, float | int] = {"entrap/ratio": float(ratio)} scalars.update(_scalar_at_q(curve, 0.01, "q01")) scalars.update(_scalar_at_q(curve, 0.05, "q05")) return scalars diff --git a/bench/tests/test_entrapment.py b/bench/tests/test_entrapment.py index daa79532..f622f557 100644 --- a/bench/tests/test_entrapment.py +++ b/bench/tests/test_entrapment.py @@ -5,10 +5,10 @@ analyse, classify_peptides, compute_fdr_curve, - count_kmers, - kmer_normalization_factor, - parse_fasta, + load_pairing, + load_peptide_set, plot_fdr_curve, + plot_score_histogram, strip_mods, ) @@ -18,21 +18,26 @@ def test_strip_mods(): assert strip_mods("PEPC[U:4]TIDEK") == "PEPCTIDEK" assert strip_mods("PEP(Phospho)TIDEK") == "PEPTIDEK" assert strip_mods("123.45PEPTIDEK") == "PEPTIDEK" - assert strip_mods("n[42]PEPTIDEK") == "nPEPTIDEK" # keep alpha n-term marker + assert strip_mods("n[42]PEPTIDEK") == "nPEPTIDEK" -def test_parse_fasta(tmp_path): - p = tmp_path / "p.fasta" - p.write_text(">sp|P1|A\nMKLAA\nDDDD\n>sp|P2|B\nLLLL\n") - out = parse_fasta(p) - assert out == {"sp|P1|A": "MKLAADDDD", "sp|P2|B": "LLLL"} +def test_load_peptide_set(tmp_path): + p = tmp_path / "peps.txt" + p.write_text("PEPTIDEK\nLAGEPRVK\n\nMRSEQGLAR\n") + out = load_peptide_set(p) + assert out == {"PEPTIDEK", "LAGEPRVK", 
"MRSEQGLAR"} -def test_classify_peptides(tmp_path): - target = tmp_path / "t.fasta" - target.write_text(">T1\nAAAAPEPTIDEKBBBB\n>T2\nMMMMSHAREDXXXX\n") - entrap = tmp_path / "e.fasta" - entrap.write_text(">E1\nQQQQENTRAPEPTKZZZZ\n>E2\nMMMMSHAREDYYYY\n") +def test_load_pairing(tmp_path): + p = tmp_path / "pairs.tsv" + p.write_text("target_peptide\tentrap_peptide\nPEPTIDEK\tEDPEKTIK\nLAGEPRVK\tGAEPLRVK\n") + out = load_pairing(p) + assert out == {"PEPTIDEK": "EDPEKTIK", "LAGEPRVK": "GAEPLRVK"} + + +def test_classify_peptides_set_membership(tmp_path): + target = {"PEPTIDEK", "SHARED", "ANOTHER"} + entrap = {"ENTRAPEPTK", "SHARED"} df = pl.DataFrame({"sequence": ["PEPTIDEK", "ENTRAPEPTK", "SHARED", "GHOSTAA"]}) classified = classify_peptides(df, target, entrap) @@ -42,7 +47,7 @@ def test_classify_peptides(tmp_path): assert classes["ENTRAPEPTK"] == PeptideClass.ENTRAPMENT.value assert classes["SHARED"] == PeptideClass.SHARED_DROPPED.value assert classes["GHOSTAA"] == PeptideClass.UNKNOWN.value - # is_entrapment column: True only for ENTRAPMENT + is_e = dict(zip(classified["sequence"], classified["is_entrapment"])) assert is_e["ENTRAPEPTK"] is True assert is_e["PEPTIDEK"] is False @@ -50,195 +55,235 @@ def test_classify_peptides(tmp_path): assert is_e["GHOSTAA"] is False -def test_classify_peptides_strips_mods_before_match(tmp_path): - target = tmp_path / "t.fasta" - target.write_text(">T1\nAAAAPEPTIDEKBBBB\n") - entrap = tmp_path / "e.fasta" - entrap.write_text(">E1\nQQQQ\n") - - df = pl.DataFrame({"sequence": ["PEPC[U:4]PTIDEK"]}) - # Stripped form is PEPCPTIDEK which is NOT in target. Confirm we end up unknown. 
+def test_classify_strips_mods_before_match(): + target = {"PEPTIDEK"} + entrap = {"AAAAAAAA"} + df = pl.DataFrame({"sequence": ["PEPT[U:4]IDEK", "PEPC[U:4]PTIDEK"]}) classified = classify_peptides(df, target, entrap) - assert classified["class"][0] == PeptideClass.UNKNOWN.value - - # But a real target match works after stripping - df2 = pl.DataFrame({"sequence": ["PEPT[U:4]IDEK"]}) - classified2 = classify_peptides(df2, target, entrap) - assert classified2["class"][0] == PeptideClass.TARGET.value + classes = classified["class"].to_list() + # stripped → "PEPTIDEK" ∈ target + assert classes[0] == PeptideClass.TARGET.value + # stripped → "PEPCPTIDEK" not in either set + assert classes[1] == PeptideClass.UNKNOWN.value + + +def test_classify_filters_to_targets(): + """is_target=False rows are excluded from FDR walk regardless of class column. + + With the new design, classify_peptides classifies ALL rows by sequence + membership; the FDR walk in compute_fdr_curve filters is_target=True. + """ + target = {"PEP1", "PEP2", "PEP3"} + entrap = {"ENT1", "ENT2"} + df = pl.DataFrame({ + "sequence": ["PEP1", "ENT1", "PEP2", "ENT2"], + "qvalue": [0.001, 0.005, 0.01, 0.02], + "is_target":[True, True, False, True], + }) + classified = classify_peptides(df, target, entrap) + curve = compute_fdr_curve(classified, ratio=1.0) + # Only the 3 is_target=True rows enter the curve + assert curve.height == 3 + # is_target=True: PEP1 (target, q=.001), ENT1 (entrap, q=.005), ENT2 (entrap, q=.02) + last = curve.row(-1, named=True) + assert last["n_target"] == 1 + assert last["n_entrap"] == 2 -def test_compute_fdr_curve_basic(): +def test_compute_fdr_curve_lower_combined(tmp_path): classified = pl.DataFrame({ - "qvalue": [0.001, 0.005, 0.01, 0.02, 0.05], - "class": ["target", "target", "entrapment", "target", "entrapment"], + "qvalue": [0.001, 0.005, 0.01, 0.02, 0.05], + "class": ["target", "target", "entrapment", "target", "entrapment"], + "is_target": [True, True, True, True, True], + 
"main_score":[100.0, 90.0, 80.0, 70.0, 60.0], }) - curve = compute_fdr_curve(classified) - # Sorted ascending by qvalue - assert curve["qvalue"].to_list() == [0.001, 0.005, 0.01, 0.02, 0.05] + curve = compute_fdr_curve(classified, ratio=1.0) # n_target cumulative assert curve["n_target"].to_list() == [1, 2, 2, 3, 3] # n_entrap cumulative assert curve["n_entrap"].to_list() == [0, 0, 1, 1, 2] - # empirical_fdr = n_e / (n_t + n_e) + # Lower at last: 2/(3+2) = 0.4 last = curve.row(-1, named=True) - assert last["empirical_fdr_raw"] == 2 / 5 - assert last["empirical_fdr_norm"] == 2 / 5 # factor defaults to 1.0 + assert abs(last["empirical_fdr_lower"] - 2/5) < 1e-9 + # Combined at last: 2 * (1+1/1) / (3+2) = 0.8 (factor 2 for r=1) + assert abs(last["empirical_fdr_combined"] - 4/5) < 1e-9 + + +def test_compute_fdr_curve_combined_with_ratio_above_one(): + classified = pl.DataFrame({ + "qvalue": [0.01, 0.02], + "class": ["target", "entrapment"], + "is_target": [True, True], + "main_score":[100.0, 90.0], + }) + curve = compute_fdr_curve(classified, ratio=2.0) + # Combined: n_e * (1 + 1/2) / (n_e + n_t) = 1 * 1.5 / 2 = 0.75 + last = curve.row(-1, named=True) + assert abs(last["empirical_fdr_combined"] - 0.75) < 1e-9 + + +def test_compute_fdr_curve_matched_with_pairing(): + """Matched estimator requires pairing dict and main_score column. + + Construct: 2 targets, both at q≤s; their paired entrap peptides also score + well (one beats its target, one loses). 
+ """ + classified = pl.DataFrame({ + "sequence": ["PEP1", "ENT1", "PEP2", "ENT2"], + "qvalue": [0.001, 0.002, 0.003, 0.004], # all under threshold + "class": ["target", "entrapment", "target", "entrapment"], + "is_target":[True, True, True, True], + "main_score":[100.0, 200.0, 150.0, 50.0], # ENT1 beats PEP1; ENT2 loses to PEP2 + }) + pairing = {"PEP1": "ENT1", "PEP2": "ENT2"} + curve = compute_fdr_curve(classified, ratio=1.0, pairing=pairing) + # n_e = 2, n_t = 2 at the end + # n_p_t_s (entrap > paired_target ≥ s): ENT1 wins over PEP1, both above s → 1 + # n_p_s_t (entrap ≥ s > paired_target): both targets are above s, so this is 0 + last = curve.row(-1, named=True) + expected = (2 + 0 + 2*1) / (2 + 2) # = 1.0 + assert abs(last["empirical_fdr_matched"] - expected) < 1e-9 + + +def test_compute_fdr_curve_matched_target_below_threshold(): + """Entrap discovered, paired target NOT under the chosen threshold. + + Walk only includes rows that ARE discovered (q ≤ threshold). To exercise + the n_p_s_t branch, place the entrap row in the curve and the paired target + OUT (e.g., qvalue > threshold). compute_fdr_curve walks ALL rows; we read + off the row at the entrap's qvalue. 
+ """ + classified = pl.DataFrame({ + "sequence": ["PEP1", "ENT1"], + "qvalue": [0.5, 0.001], # PEP1 has high qvalue, ENT1 low + "class": ["target", "entrapment"], + "is_target": [True, True], + "main_score":[10.0, 100.0], + }) + pairing = {"PEP1": "ENT1"} + curve = compute_fdr_curve(classified, ratio=1.0, pairing=pairing) + # Sorted by qvalue: ENT1 (q=0.001), PEP1 (q=0.5) + # At ENT1's row (first row): n_t=0, n_e=1 + # ENT1's paired target PEP1 has q=0.5 > 0.001 (not yet discovered) + # → n_p_s_t = 1 (entrap discovered, paired target NOT yet) + # → n_p_t_s = 0 + first = curve.row(0, named=True) + assert first["n_target"] == 0 + assert first["n_entrap"] == 1 + expected = (1 + 1 + 0) / (0 + 1) + assert abs(first["empirical_fdr_matched"] - expected) < 1e-9 + + +def test_compute_fdr_curve_no_pairing_no_matched_column(): + classified = pl.DataFrame({ + "qvalue": [0.01, 0.02], + "class": ["target", "entrapment"], + "is_target": [True, True], + "main_score":[100.0, 90.0], + }) + curve = compute_fdr_curve(classified, ratio=1.0) + assert "empirical_fdr_matched" not in curve.columns + assert "empirical_fdr_lower" in curve.columns + assert "empirical_fdr_combined" in curve.columns def test_compute_fdr_curve_excludes_shared_and_unknown(): classified = pl.DataFrame({ - "qvalue": [0.01, 0.01, 0.01, 0.01], - "class": ["target", "shared_dropped", "unknown", "entrapment"], + "qvalue": [0.01, 0.01, 0.01, 0.01], + "class": ["target", "shared_dropped", "unknown", "entrapment"], + "is_target": [True, True, True, True], + "main_score":[100.0, 90.0, 80.0, 70.0], }) - curve = compute_fdr_curve(classified) - # Only one target + one entrapment row contribute - assert curve.height == 2 + curve = compute_fdr_curve(classified, ratio=1.0) + assert curve.height == 2 # only target + entrap survive assert sorted(curve["class"].to_list()) == ["entrapment", "target"] def test_plot_fdr_curve_writes_png(tmp_path): curve = pl.DataFrame({ - "qvalue": [0.001, 0.01, 0.05], - "n_target": [10, 50, 100], - 
"n_entrap": [0, 1, 5], - "empirical_fdr_raw": [0.0, 1 / 51, 5 / 105], - "empirical_fdr_norm": [0.0, 1 / 51, 5 / 105], + "qvalue": [0.001, 0.01, 0.05], + "n_target": [10, 50, 100], + "n_entrap": [0, 1, 5], + "empirical_fdr_lower": [0.0, 1/51, 5/105], + "empirical_fdr_combined": [0.0, 2/51, 10/105], }) out = tmp_path / "fdr.png" plot_fdr_curve(curve, out, title="test") - assert out.exists() - assert out.stat().st_size > 1000 # not an empty or stub file + assert out.exists() and out.stat().st_size > 1000 + + +def test_plot_score_histogram_writes_png(tmp_path): + classified = pl.DataFrame({ + "main_score": [1e3, 1e4, 1e5, 1e6, 1e7] * 5, + "class": ["target","entrapment","target","entrapment","target"] * 5, + "is_target": [True, True, False, False, True] * 5, + }) + out = tmp_path / "hist.png" + plot_score_histogram(classified, out, title="test") + assert out.exists() and out.stat().st_size > 1000 def test_analyse_end_to_end(tmp_path): - target = tmp_path / "t.fasta" - target.write_text(">T1\nAAAAPEPTIDEKBBBB\n") - entrap = tmp_path / "e.fasta" - entrap.write_text(">E1\nQQQQENTRAPEPTKZZZZ\n") + target_peps = tmp_path / "t.txt" + target_peps.write_text("PEP1\nPEP2\nPEP3\n") + entrap_peps = tmp_path / "e.txt" + entrap_peps.write_text("ENT1\nENT2\n") results = pl.DataFrame({ - "sequence": ["PEPTIDEK", "ENTRAPEPTK", "PEPTIDEK", "ENTRAPEPTK"], - "qvalue": [0.001, 0.02, 0.005, 0.04], + "sequence": ["PEP1", "ENT1", "PEP2", "ENT2"], + "qvalue": [0.001, 0.02, 0.005, 0.04], + "is_target": [True, True, True, True], + "main_score":[100.0, 80.0, 95.0, 70.0], }) - results_path = tmp_path / "results.parquet" + results_path = tmp_path / "r.parquet" results.write_parquet(results_path) - out = analyse( results_parquet=results_path, - target_fasta=target, - entrapment_fasta=entrap, - out_parquet=tmp_path / "classified.parquet", - out_plot=tmp_path / "fdr.png", + target_peptides=target_peps, + entrapment_peptides=entrap_peps, + ratio=1.0, + pairing_path=None, + out_parquet=tmp_path / 
"c.parquet", + out_fdr_plot=tmp_path / "fdr.png", + out_hist_plot=tmp_path / "hist.png", ) - - # Returned scalars - assert out["entrap/n_target_at_q01"] == 2 # both PEPTIDEK rows have q <= 0.01 - assert out["entrap/n_entrap_at_q01"] == 0 - assert out["entrap/empirical_fdr_raw_at_q01"] == 0.0 - assert out["entrap/n_target_at_q05"] == 2 - assert out["entrap/n_entrap_at_q05"] == 2 - - # Outputs - assert (tmp_path / "classified.parquet").exists() + # Required scalars + assert "entrap/n_target_at_q01" in out + assert "entrap/n_entrap_at_q01" in out + assert "entrap/empirical_fdr_lower_at_q01" in out + assert "entrap/empirical_fdr_combined_at_q01" in out + assert "entrap/ratio" in out + assert out["entrap/ratio"] == 1.0 + # Files exist + assert (tmp_path / "c.parquet").exists() assert (tmp_path / "fdr.png").exists() + assert (tmp_path / "hist.png").exists() - classified = pl.read_parquet(tmp_path / "classified.parquet") - assert "class" in classified.columns and "is_entrapment" in classified.columns - - -def test_count_kmers_basic(tmp_path): - p = tmp_path / "p.fasta" - p.write_text(">A\nABCDEFG\n>B\nABCDEFGH\n") - kmers = count_kmers(p, k=7) - # First protein contributes "ABCDEFG" exactly (length 7 → one kmer). - # Second protein contributes "ABCDEFG" and "BCDEFGH". - # Set dedupes the shared kmer. - assert kmers == {"ABCDEFG", "BCDEFGH"} - - -def test_count_kmers_skips_too_short(tmp_path): - p = tmp_path / "p.fasta" - p.write_text(">A\nABCDEF\n") # length 6, smaller than k=7 - assert count_kmers(p, k=7) == set() - - -def test_kmer_normalization_factor(tmp_path): - target = tmp_path / "t.fasta" - # 4 kmers of length 7: ABCDEFG, BCDEFGH, CDEFGHI, DEFGHIJ - target.write_text(">T\nABCDEFGHIJ\n") - entrap = tmp_path / "e.fasta" - entrap.write_text(">E\nABCDEFG\n") # 1 kmer: ABCDEFG (shared with target) - - # After dropping shared kmers: target has {BCDEFGH, CDEFGHI, DEFGHIJ} (3); - # entrap has {} (0). Factor = target / max(1, entrap) → 3/1 = 3.0. 
- f = kmer_normalization_factor(target, entrap, k=7) - assert f == 3.0 - - -def test_kmer_normalization_factor_balanced(tmp_path): - target = tmp_path / "t.fasta" - target.write_text(">T\nAAAAAAAA\n") # 2 kmers: AAAAAAA, AAAAAAA → set: {AAAAAAA} - entrap = tmp_path / "e.fasta" - entrap.write_text(">E\nBBBBBBBB\n") # 2 kmers: BBBBBBB, BBBBBBB → set: {BBBBBBB} - # No shared kmers; both have 1 → factor = 1.0 - f = kmer_normalization_factor(target, entrap, k=7) - assert f == 1.0 +def test_analyse_with_pairing_emits_matched(tmp_path): + target_peps = tmp_path / "t.txt" + target_peps.write_text("PEP1\nPEP2\n") + entrap_peps = tmp_path / "e.txt" + entrap_peps.write_text("ENT1\nENT2\n") + pairs_path = tmp_path / "pairs.tsv" + pairs_path.write_text("target_peptide\tentrap_peptide\nPEP1\tENT1\nPEP2\tENT2\n") -def test_compute_fdr_curve_with_normalization(): - """Plain raw FDR uses n_e / (n_e + n_t); normalized scales n_e by factor.""" - classified = pl.DataFrame( - { - "qvalue": [0.001, 0.005, 0.01, 0.02, 0.05], - "class": ["target", "target", "entrapment", "target", "entrapment"], - } - ) - curve = compute_fdr_curve(classified, normalization_factor=3.0) - - # Raw counts unchanged - assert curve["n_target"].to_list() == [1, 2, 2, 3, 3] - assert curve["n_entrap"].to_list() == [0, 0, 1, 1, 2] - # Raw fdr unchanged - assert curve["empirical_fdr_raw"].to_list() == [0.0, 0.0, 1 / 3, 1 / 4, 2 / 5] - # Normalized fdr at last row: n_e * 3 / (n_t + n_e * 3) = 6 / (3 + 6) = 2/3 - last = curve.row(-1, named=True) - assert last["empirical_fdr_norm"] == 2 / 3 - assert last["n_entrap_norm"] == 6.0 # 2 * 3.0 - - -def test_compute_fdr_curve_default_factor_is_one(): - """Default factor=1.0 keeps backward-compatible raw == norm behavior.""" - classified = pl.DataFrame( - { - "qvalue": [0.001, 0.01], - "class": ["target", "entrapment"], - } - ) - curve = compute_fdr_curve(classified) - assert curve["empirical_fdr_norm"].to_list() == curve["empirical_fdr_raw"].to_list() - - -def 
test_analyse_includes_normalization_factor(tmp_path): - """analyse() computes and applies kmer normalization automatically.""" - target = tmp_path / "t.fasta" - # homopolymer of length 50 → exactly 1 unique kmer "AAAAAAA" - target.write_text(">T1\n" + "A" * 50 + "\n") - entrap = tmp_path / "e.fasta" - entrap.write_text(">E1\n" + "B" * 50 + "\n") - - results = pl.DataFrame( - {"sequence": ["AAAAAAA", "BBBBBBB"], "qvalue": [0.001, 0.001]} - ) + results = pl.DataFrame({ + "sequence": ["PEP1", "ENT1", "PEP2", "ENT2"], + "qvalue": [0.001, 0.002, 0.003, 0.004], + "is_target": [True, True, True, True], + "main_score":[100.0, 200.0, 150.0, 50.0], + }) results_path = tmp_path / "r.parquet" results.write_parquet(results_path) out = analyse( results_parquet=results_path, - target_fasta=target, - entrapment_fasta=entrap, + target_peptides=target_peps, + entrapment_peptides=entrap_peps, + ratio=1.0, + pairing_path=pairs_path, out_parquet=tmp_path / "c.parquet", - out_plot=tmp_path / "f.png", + out_fdr_plot=tmp_path / "fdr.png", + out_hist_plot=tmp_path / "hist.png", ) - # Both proteins are pure homopolymer of length 50 → 1 unique kmer each → factor=1.0 - # Returned scalars include the factor - assert "entrap/normalization_factor" in out - assert out["entrap/normalization_factor"] == 1.0 + assert "entrap/empirical_fdr_matched_at_q01" in out From e667afdd97a8b11bff3da682f631fc7f94b64d25 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sat, 9 May 2026 10:21:50 -0700 Subject: [PATCH 35/41] feat(bench)!: thread peptide-list schema through wandb_bench + stage_fixture --- bench/stage_fixture.py | 44 +++++++++---- bench/tests/test_stage_fixture.py | 79 +++++++++++++++++------ bench/tests/test_wandb_bench.py | 100 ++++++++++++++++++++++++++---- bench/wandb_bench.py | 37 +++++++---- 4 files changed, 204 insertions(+), 56 deletions(-) diff --git a/bench/stage_fixture.py b/bench/stage_fixture.py index a4eb22cf..3c8b606a 100644 --- a/bench/stage_fixture.py +++ b/bench/stage_fixture.py @@ -95,18 +95,24 @@ def stage( cache_root = cache_dir / name cache_root.mkdir(parents=True, exist_ok=True) - fasta_local = _stage_one_file( - fx.inputs.fasta, cache_root / "proteome.fasta", force + target_pep_local = _stage_one_file( + fx.inputs.target_peptides, cache_root / "target.peptides.txt", force ) speclib_local = _stage_one_file( fx.inputs.speclib, cache_root / "lib.msgpack.zst", force ) raw_local = _stage_one_dir(fx.inputs.raw, cache_root / "sample.d", force) - entrap_local: str | None = None - if fx.inputs.entrapment_fasta is not None: - entrap_local = _stage_one_file( - fx.inputs.entrapment_fasta, cache_root / "entrap.fasta", force + entrap_pep_local: str | None = None + if fx.inputs.entrapment_peptides is not None: + entrap_pep_local = _stage_one_file( + fx.inputs.entrapment_peptides, cache_root / "entrap.peptides.txt", force + ) + + pairing_local: str | None = None + if fx.inputs.pairing is not None: + pairing_local = _stage_one_file( + fx.inputs.pairing, cache_root / "pairing.tsv", force ) calib_local: str | None = None @@ -119,10 +125,13 @@ def stage( name=name, description=fx.description, config=fx.config, - fasta_uri=fasta_local, + target_peptides_uri=target_pep_local, speclib_uri=speclib_local, raw_uri=raw_local, - entrapment_fasta_uri=entrap_local, + entrapment_peptides_uri=entrap_pep_local, + entrapment_ratio=fx.inputs.entrapment_ratio, + entrapment_mode=fx.inputs.entrapment_mode, + 
pairing_uri=pairing_local, calibration_speclib_uri=calib_local, ) out.parent.mkdir(parents=True, exist_ok=True) @@ -139,10 +148,13 @@ def _build_staged_toml( name: str, description: str, config: dict, - fasta_uri: str, + target_peptides_uri: str, speclib_uri: str, raw_uri: str, - entrapment_fasta_uri: str | None, + entrapment_peptides_uri: str | None, + entrapment_ratio: float | None, + entrapment_mode: str | None, + pairing_uri: str | None, calibration_speclib_uri: str | None, ) -> str: """Emit a staged-fixture TOML body. Mirrors push_fixture.build_fixture_toml's @@ -154,11 +166,17 @@ def _build_staged_toml( lines.append(f'description = "{desc}"') lines.append("") lines.append("[inputs]") - lines.append(f'fasta = "{fasta_uri}"') + lines.append(f'target_peptides = "{target_peptides_uri}"') lines.append(f'speclib = "{speclib_uri}"') lines.append(f'raw = "{raw_uri}"') - if entrapment_fasta_uri is not None: - lines.append(f'entrapment_fasta = "{entrapment_fasta_uri}"') + if entrapment_peptides_uri is not None: + lines.append(f'entrapment_peptides = "{entrapment_peptides_uri}"') + if entrapment_ratio is not None: + lines.append(f"entrapment_ratio = {_toml_value(entrapment_ratio)}") + if entrapment_mode is not None: + lines.append(f'entrapment_mode = "{entrapment_mode}"') + if pairing_uri is not None: + lines.append(f'pairing = "{pairing_uri}"') if calibration_speclib_uri is not None: lines.append(f'calibration_speclib = "{calibration_speclib_uri}"') lines.append("") diff --git a/bench/tests/test_stage_fixture.py b/bench/tests/test_stage_fixture.py index 4b76fdc8..e236a36b 100644 --- a/bench/tests/test_stage_fixture.py +++ b/bench/tests/test_stage_fixture.py @@ -6,11 +6,21 @@ def _write_fixture( - dir: Path, name: str, *, with_entrap: bool = False, with_calib: bool = False + dir: Path, + name: str, + *, + with_entrap: bool = False, + with_pairing: bool = False, + with_calib: bool = False, ) -> Path: extras = "" if with_entrap: - extras += '\nentrapment_fasta = 
"s3://b/entrap.fasta"' + extras += '\nentrapment_peptides = "s3://b/entrap.peptides.txt"' + extras += '\nentrapment_ratio = 1.0' + mode = "shuffled" if with_pairing else "foreign" + extras += f'\nentrapment_mode = "{mode}"' + if with_pairing: + extras += '\npairing = "s3://b/pairs.tsv"' if with_calib: extras += '\ncalibration_speclib = "s3://b/calib.msgpack.zst"' p = dir / f"{name}.toml" @@ -21,7 +31,7 @@ def _write_fixture( description = "x" [inputs] - fasta = "s3://b/p.fasta" + target_peptides = "s3://b/target.peptides.txt" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d"{extras} @@ -72,11 +82,11 @@ def test_stage_minimal(tmp_path, fake_s3): ) # Files were "downloaded" - assert (cache / "hela" / "proteome.fasta").exists() + assert (cache / "hela" / "target.peptides.txt").exists() assert (cache / "hela" / "lib.msgpack.zst").exists() assert (cache / "hela" / "sample.d" / "metadata").exists() - # download_file called for fasta + speclib (2x); sync called for raw (1x) + # download_file called for target_peptides + speclib (2x); sync called for raw (1x) assert fake_s3["download"].call_count == 2 assert fake_s3["sync"].call_count == 1 @@ -85,7 +95,7 @@ def test_stage_minimal(tmp_path, fake_s3): from bench._fixture_schema import load_fixture fx = load_fixture(out) assert fx.name == "hela" - assert fx.inputs.fasta == str(cache / "hela" / "proteome.fasta") + assert fx.inputs.target_peptides == str(cache / "hela" / "target.peptides.txt") assert fx.inputs.speclib == str(cache / "hela" / "lib.msgpack.zst") assert fx.inputs.raw == str(cache / "hela" / "sample.d") @@ -107,7 +117,7 @@ def test_stage_with_entrap_and_calib(tmp_path, fake_s3): overwrite=False, force=False, ) - # 4 file downloads (fasta + speclib + entrap_fasta + calib_speclib); 1 sync (raw) + # 4 downloads (target_peptides, speclib, entrap_peptides, calib); 1 sync (raw) assert fake_s3["download"].call_count == 4 assert fake_s3["sync"].call_count == 1 @@ -115,18 +125,49 @@ def 
test_stage_with_entrap_and_calib(tmp_path, fake_s3): fx = load_fixture(out) assert fx.has_entrapment() assert fx.has_calibration_speclib() - assert fx.inputs.entrapment_fasta == str(cache / "hy" / "entrap.fasta") + assert fx.inputs.entrapment_peptides == str(cache / "hy" / "entrap.peptides.txt") + assert fx.inputs.entrapment_ratio == 1.0 + assert fx.inputs.entrapment_mode == "foreign" assert fx.inputs.calibration_speclib == str(cache / "hy" / "calib.msgpack.zst") +def test_stage_with_pairing(tmp_path, fake_s3): + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + _write_fixture(fx_dir, "sh", with_entrap=True, with_pairing=True) + cache = tmp_path / "cache" + out = tmp_path / "staged" / "sh.toml" + + from bench.stage_fixture import stage + + stage( + name="sh", + fixtures_dir=fx_dir, + cache_dir=cache, + out=out, + overwrite=False, + force=False, + ) + # 4 file downloads: target_peptides, entrap_peptides, pairing, speclib; 1 sync (raw) + assert fake_s3["download"].call_count == 4 + assert fake_s3["sync"].call_count == 1 + + from bench._fixture_schema import load_fixture + fx = load_fixture(out) + assert fx.has_entrapment() + assert fx.has_pairing() + assert fx.inputs.entrapment_mode == "shuffled" + assert fx.inputs.pairing == str(cache / "sh" / "pairing.tsv") + + def test_stage_skips_existing_local_files(tmp_path, fake_s3): fx_dir = tmp_path / "fx" fx_dir.mkdir() _write_fixture(fx_dir, "hela") cache = tmp_path / "cache" - # Pre-create the fasta + speclib (raw is always synced — sync is itself idempotent) + # Pre-create target_peptides + speclib (raw is always synced — sync is idempotent) (cache / "hela").mkdir(parents=True) - (cache / "hela" / "proteome.fasta").write_text("preexisting") + (cache / "hela" / "target.peptides.txt").write_text("preexisting") (cache / "hela" / "lib.msgpack.zst").write_text("preexisting") out = tmp_path / "staged" / "hela.toml" @@ -144,8 +185,8 @@ def test_stage_skips_existing_local_files(tmp_path, fake_s3): assert fake_s3["download"].call_count 
== 0 # Sync still runs (raw .d) — sync itself is idempotent assert fake_s3["sync"].call_count == 1 - # Local fasta content was NOT overwritten - assert (cache / "hela" / "proteome.fasta").read_text() == "preexisting" + # Local target_peptides content was NOT overwritten + assert (cache / "hela" / "target.peptides.txt").read_text() == "preexisting" def test_stage_force_redownloads(tmp_path, fake_s3): @@ -154,7 +195,7 @@ def test_stage_force_redownloads(tmp_path, fake_s3): _write_fixture(fx_dir, "hela") cache = tmp_path / "cache" (cache / "hela").mkdir(parents=True) - (cache / "hela" / "proteome.fasta").write_text("preexisting") + (cache / "hela" / "target.peptides.txt").write_text("preexisting") out = tmp_path / "staged" / "hela.toml" from bench.stage_fixture import stage @@ -170,7 +211,7 @@ def test_stage_force_redownloads(tmp_path, fake_s3): # Force re-downloads even if local exists assert fake_s3["download"].call_count == 2 # Stub overwrites the preexisting file - assert (cache / "hela" / "proteome.fasta").read_text().startswith("# stub") + assert (cache / "hela" / "target.peptides.txt").read_text().startswith("# stub") def test_stage_preserves_local_paths(tmp_path, fake_s3): @@ -178,8 +219,8 @@ def test_stage_preserves_local_paths(tmp_path, fake_s3): fx_dir = tmp_path / "fx" fx_dir.mkdir() p = fx_dir / "lo.toml" - local_fasta = tmp_path / "abs_p.fasta" - local_fasta.write_text(">x\nMK\n") + local_peptides = tmp_path / "abs_target.peptides.txt" + local_peptides.write_text("MK\nLL\n") p.write_text( textwrap.dedent( f""" @@ -187,7 +228,7 @@ def test_stage_preserves_local_paths(tmp_path, fake_s3): description = "x" [inputs] - fasta = "{local_fasta}" + target_peptides = "{local_peptides}" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" @@ -209,11 +250,11 @@ def test_stage_preserves_local_paths(tmp_path, fake_s3): overwrite=False, force=False, ) - # Only the speclib gets downloaded; fasta is referenced as-is + # Only the speclib gets downloaded; 
target_peptides is referenced as-is assert fake_s3["download"].call_count == 1 from bench._fixture_schema import load_fixture fx = load_fixture(out) - assert fx.inputs.fasta == str(local_fasta) + assert fx.inputs.target_peptides == str(local_peptides) def test_stage_refuses_overwrite(tmp_path, fake_s3): diff --git a/bench/tests/test_wandb_bench.py b/bench/tests/test_wandb_bench.py index 263a411f..3f40a1b6 100644 --- a/bench/tests/test_wandb_bench.py +++ b/bench/tests/test_wandb_bench.py @@ -17,7 +17,7 @@ def _write_fx(dir: Path, name: str) -> Path: description = "x" [inputs] - fasta = "s3://b/p.fasta" + target_peptides = "s3://b/target.peptides.txt" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" @@ -152,10 +152,12 @@ def test_run_one_fixture_runs_entrapment_when_field_present(tmp_path, fake_wandb description = "x" [inputs] - fasta = "s3://b/p.fasta" + target_peptides = "s3://b/t.peptides.txt" + entrapment_peptides = "s3://b/e.peptides.txt" + entrapment_ratio = 1.0 + entrapment_mode = "foreign" speclib = "s3://b/lib.msgpack.zst" raw = "s3://b/sample.d" - entrapment_fasta = "s3://b/entrap.fasta" [config.analysis] chunk_size = 20000 @@ -169,13 +171,17 @@ def fake_subprocess(cmd, *a, **kw): out = Path(cmd[idx + 1]) raw_idx = cmd.index("--dotd-files") raw_stem = Path(cmd[raw_idx + 1]).stem - _write_perf_report(out, raw_stem, {"runtime_s": 1.0}) - # Also drop a results.parquet so analyse() can read it + sub = out / raw_stem + sub.mkdir(parents=True, exist_ok=True) + (sub / "performance_report.json").write_text(json.dumps({"runtime_s": 1.0})) import polars as pl - pl.DataFrame({"sequence": ["MK"], "qvalue": [0.001]}).write_parquet( - out / raw_stem / "results.parquet" - ) + pl.DataFrame({ + "sequence": ["MK", "LL"], + "qvalue": [0.001, 0.002], + "is_target": [True, True], + "main_score": [100.0, 90.0], + }).write_parquet(sub / "results.parquet") return MagicMock(returncode=0) from bench.wandb_bench import run_one @@ -185,20 +191,88 @@ def fake_subprocess(cmd, 
*a, **kw): patch("bench.wandb_bench.analyse") as analyse_mock, patch("bench.wandb_bench.s3_download_file") as s3_dl, ): - # entrapment.analyse returns scalars; s3_download fetches the two fastas locally - analyse_mock.return_value = {"entrap/empirical_fdr_at_q01": 0.012} + analyse_mock.return_value = {"entrap/empirical_fdr_combined_at_q01": 0.012} def _dl(uri, dst): - Path(dst).write_text(">x\nMK\n") + Path(dst).write_text("MK\nLL\n") s3_dl.side_effect = _dl run_one(load_fixture(p), out_root=out_root, notes=None, dry_run=False) analyse_mock.assert_called_once() - # wandb.run.log should have been called with the entrapment scalars at some point + # Verify analyse was called with peptide paths (not fasta) AND ratio + kwargs = analyse_mock.call_args.kwargs + assert "target_peptides" in kwargs + assert "entrapment_peptides" in kwargs + assert kwargs["ratio"] == 1.0 + assert kwargs["pairing_path"] is None + assert "out_hist_plot" in kwargs + # wandb.run.log was called with the entrap scalars + with the histogram image log_payloads = [c.args[0] for c in fake_wandb["run"].log.call_args_list] - assert any("entrap/empirical_fdr_at_q01" in p for p in log_payloads) + assert any("entrap/empirical_fdr_combined_at_q01" in pp for pp in log_payloads) + assert any("entrap/mainscore_hist" in pp for pp in log_payloads) + + +def test_run_one_fixture_with_pairing(tmp_path, fake_wandb): + """When pairing is set, the runner downloads it and forwards path to analyse.""" + fx_dir = tmp_path / "fx" + fx_dir.mkdir() + p = fx_dir / "shuffle.toml" + p.write_text( + textwrap.dedent( + """ + name = "shuffle" + description = "x" + + [inputs] + target_peptides = "s3://b/t.peptides.txt" + entrapment_peptides = "s3://b/e.peptides.txt" + entrapment_ratio = 1.0 + entrapment_mode = "shuffled" + pairing = "s3://b/pairs.tsv" + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """ + ).strip() + ) + + def fake_subprocess(cmd, *a, **kw): + idx = 
cmd.index("--output-dir") + out = Path(cmd[idx + 1]) + raw_idx = cmd.index("--dotd-files") + raw_stem = Path(cmd[raw_idx + 1]).stem + sub = out / raw_stem + sub.mkdir(parents=True, exist_ok=True) + (sub / "performance_report.json").write_text(json.dumps({"runtime_s": 1.0})) + import polars as pl + + pl.DataFrame({ + "sequence": ["MK"], + "qvalue": [0.001], + "is_target": [True], + "main_score": [100.0], + }).write_parquet(sub / "results.parquet") + return MagicMock(returncode=0) + + from bench.wandb_bench import run_one + + with ( + patch("bench.wandb_bench.subprocess.run", side_effect=fake_subprocess), + patch("bench.wandb_bench.analyse") as analyse_mock, + patch("bench.wandb_bench.s3_download_file") as s3_dl, + ): + analyse_mock.return_value = {"entrap/empirical_fdr_matched_at_q01": 0.005} + s3_dl.side_effect = lambda uri, dst: Path(dst).write_text("MK\n") + run_one(load_fixture(p), out_root=tmp_path / "out", notes=None, dry_run=False) + + # 3 downloads: target, entrap, pairing + assert s3_dl.call_count == 3 + # analyse received a non-None pairing_path + assert analyse_mock.call_args.kwargs["pairing_path"] is not None def test_run_one_dry_run_no_subprocess(tmp_path, fake_wandb): diff --git a/bench/wandb_bench.py b/bench/wandb_bench.py index 48e35ddf..e9ccbb58 100644 --- a/bench/wandb_bench.py +++ b/bench/wandb_bench.py @@ -12,9 +12,9 @@ from datetime import datetime from pathlib import Path -import wandb from loguru import logger +import wandb from bench._fixture_schema import Fixture, load_fixture from bench._s3 import s3_download_file from bench.entrapment import analyse @@ -186,27 +186,42 @@ def run_one( logger.warning("performance_report.json missing at {}", perf) if fx.has_entrapment(): - assert fx.inputs.entrapment_fasta is not None + assert fx.inputs.entrapment_peptides is not None + assert fx.inputs.entrapment_ratio is not None with tempfile.TemporaryDirectory() as td: - target_local = Path(td) / "target.fasta" - entrap_local = Path(td) / "entrap.fasta" - 
s3_download_file(fx.inputs.fasta, str(target_local)) - s3_download_file(fx.inputs.entrapment_fasta, str(entrap_local)) + target_local = Path(td) / "target.peptides.txt" + entrap_local = Path(td) / "entrap.peptides.txt" + s3_download_file(fx.inputs.target_peptides, str(target_local)) + s3_download_file(fx.inputs.entrapment_peptides, str(entrap_local)) + + pairing_local: Path | None = None + if fx.has_pairing(): + assert fx.inputs.pairing is not None + pairing_local = Path(td) / "pairing.tsv" + s3_download_file(fx.inputs.pairing, str(pairing_local)) + results_parquet = res_dir / raw_stem / "results.parquet" out_parquet = ( out_root / "parquets" / f"{fx.name}-{ts}-classified.parquet" ) - out_plot = out_root / "plots" / f"{fx.name}-fdr_curve-{ts}.png" + out_fdr_plot = out_root / "plots" / f"{fx.name}-fdr_curve-{ts}.png" + out_hist_plot = ( + out_root / "plots" / f"{fx.name}-mainscore_hist-{ts}.png" + ) scalars = analyse( results_parquet=results_parquet, - target_fasta=target_local, - entrapment_fasta=entrap_local, + target_peptides=target_local, + entrapment_peptides=entrap_local, + ratio=fx.inputs.entrapment_ratio, + pairing_path=pairing_local, out_parquet=out_parquet, - out_plot=out_plot, + out_fdr_plot=out_fdr_plot, + out_hist_plot=out_hist_plot, title=f"{fx.name} entrapment FDR", ) run.log(scalars) - run.log({"entrap/fdr_curve": wandb.Image(str(out_plot))}) + run.log({"entrap/fdr_curve": wandb.Image(str(out_fdr_plot))}) + run.log({"entrap/mainscore_hist": wandb.Image(str(out_hist_plot))}) artifact = wandb.Artifact(f"{fx.name}-classified", type="dataset") artifact.add_file(str(out_parquet)) run.log_artifact(artifact) From 30bb2cb5ce6a2ad50c572e38cd31097acf550420 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sat, 9 May 2026 10:22:43 -0700 Subject: [PATCH 36/41] docs(bench): peptide-list pipeline + Algorithm 1/2 + FDRBench estimators --- bench/README.md | 68 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/bench/README.md b/bench/README.md index 77984eda..841aa8dc 100644 --- a/bench/README.md +++ b/bench/README.md @@ -8,35 +8,81 @@ Fixture-driven bench harness for `timsseek`. Each fixture is a TOML in `bench/fi uv run --group bench python -m bench.wandb_bench --all uv run --group bench python -m bench.wandb_bench --match 'hela*' -Outputs land under `bench_out/` (gitignored): `logs/-/`, `parquets/--classified.parquet`, `plots/-fdr_curve-.png`. Wandb runs go to `jspaezp/timsseek`. +Outputs land under `bench_out/` (gitignored): `logs/-/`, `parquets/--classified.parquet`, `plots/-fdr_curve-.png`, `plots/-mainscore_hist-.png`. Wandb runs go to `jspaezp/timsseek`. -Fixtures with `entrapment_fasta` set automatically run the entrapment classification + FDR-curve step. +Fixtures with `entrapment_peptides` set automatically run entrapment classification, emit lower-bound + combined FDR estimators (and matched FDR if `pairing` is set), and a 4-group `main_score` histogram (target/decoy × class=target/entrap). ## Push a new fixture Requires `aws` CLI (auth via env / profile). 
+### Foreign-species entrapment (Algorithm 2 of Noble et al, FDRBench paper) + + uv run --group bench python -m bench.push_fixture \ + --name hela_iccoff_human_yeast \ + --bucket terraform-workstations-bucket --prefix jspaezp/timsseek_fixtures \ + --db UP000005640 \ + --entrap-db UP000002311 \ + --raw ~/data/decompressed_timstof/250225_Desnaux_200ng_Hela_ICC_off_DIA.d \ + --config bench/configs/default.toml \ + --entrap-ratio 1.0 \ + --request-delay-ms 250 + +Pipeline: digest target + entrap (trypsin, 1 missed cleavage), filter to length 7-30, drop entrap peptides that also appear in target, randomly subsample entrap to `r × |target|` (seed=42; override with `--seed`). Uploads `target.peptides.txt`, `entrap.peptides.txt`, `database.peptides.txt` (union) and builds the speclib via `speclib_build_cli --peptides s3://.../database.peptides.txt`. + +Records `entrapment_mode = "foreign"` and the actual achieved `entrapment_ratio` on the fixture. + +### Shuffled entrapment (Algorithm 1 — paired estimator) + uv run --group bench python -m bench.push_fixture \ - --name hela_iccoff_gt20peps \ + --name hela_iccoff_shuffled \ --bucket terraform-workstations-bucket --prefix jspaezp/timsseek_fixtures \ - --db ~/fasta/hela_gt20peps.fasta \ + --db UP000005640 \ + --entrap-db SHUFFLED \ --raw ~/data/decompressed_timstof/250225_Desnaux_200ng_Hela_ICC_off_DIA.d \ --config bench/configs/default.toml \ - --koina-url http://localhost:8501/v2/models # omit for public Koina + --entrap-ratio 1.0 \ + --request-delay-ms 250 + +Pipeline: digest target, length-filter, then for each surviving target peptide generate r distinct shuffles (interior permuted, C-term residue fixed). Targets that can't produce r unique shuffles are dropped. With `--entrap-ratio 1.0` the runner also emits `pairing.tsv` enabling the matched FDP estimator. + +Records `entrapment_mode = "shuffled"` plus `pairing` URI when r=1. 
+ +### Common flags + +`--db` (and `--entrap-db`, `--calib-db`) accept: local `*.fasta(.gz)`, local `*.txt` accession list, `s3://...` URI, `UPxxxxxxxxx` proteome ID, bare UniProt accession, or the literal `SHUFFLED` for `--entrap-db` only. + +`fetch_proteome` defaults to `reviewed:true` (Swiss-Prot only). Pass full proteomes via fasta or accession list if you need TrEMBL. + +Other flags: `--peptide-min-len 7`, `--peptide-max-len 30`, `--missed-cleavages 1`, `--seed 42`. `--request-delay-ms` throttles speclib_build_cli's koina calls (default 500). + +After upload, hand-edit the generated `bench/fixtures/.toml` to add a description, then `git add bench/fixtures/.toml`. + +Re-running `push_fixture` is idempotent by default: existing S3 objects are skipped (single files via `aws s3 ls`; `.d` directory via `aws s3 sync`). Pass `--force` to re-upload everything. + +## FDP estimators + +Per Noble et al, FDRBench paper (Table S2). When entrapment is configured: + +| Estimator | Formula | Available when | +|---|---|---| +| Lower bound | `n_e / (n_e + n_t)` | always | +| Combined (avg upper bound) | `n_e × (1 + 1/r) / (n_e + n_t)` | always | +| Matched (k=1, avg upper bound) | `(n_e + n_p_s_t + 2·n_p_t_s) / (n_e + n_t)` | shuffled mode + r=1 | -`--db` (and `--entrap-db`, `--calib-db`) are repeatable and accept any of: local `*.fasta(.gz)` path, local `*.txt` accession list, `s3://...` URI, `UPxxxxxxxxx` proteome ID, bare uniprot accession. After upload, hand-edit the generated `bench/fixtures/.toml` to add a description, then `git add bench/fixtures/.toml`. +`r = entrapment_ratio` is recorded on the fixture from the actual ratio achieved at push time. Counts are walked over `is_target=True` rows only (post-competition target winners; decoy wins are TDA-style FPs, separate from entrapment FPs). -Re-running `push_fixture` is idempotent by default: existing S3 objects are skipped (single files via `aws s3 ls` check; `.d` directory via `aws s3 sync`). 
Pass `--force` to re-upload everything. +`compute_fdr_curve` emits all available estimator columns; `plot_fdr_curve` overlays them. ## Stage a fixture for offline / repeated runs When iterating on a fixture, pull its inputs to a local cache once, then run against the staged copy: - uv run --group bench python -m bench.stage_fixture hela_iccoff_gt20peps - uv run --group bench python -m bench.wandb_bench --fixtures-dir bench_out/staged hela_iccoff_gt20peps + uv run --group bench python -m bench.stage_fixture hela_iccoff_human_yeast + uv run --group bench python -m bench.wandb_bench --fixtures-dir bench_out/staged hela_iccoff_human_yeast -`stage_fixture` defaults: cache root `bench_out/cache//` (override via `--cache-dir` or `BENCH_CACHE_DIR` env), output TOML `bench_out/staged/.toml` (override via `--out`). Already-cached files are skipped on re-stage; pass `--force` to re-download. Inputs that are already absolute local paths are referenced as-is (no copy). +Defaults: cache root `bench_out/cache//` (override via `--cache-dir` or `BENCH_CACHE_DIR` env), output TOML `bench_out/staged/.toml` (override via `--out`). Already-cached files are skipped on re-stage; `--force` re-downloads. Inputs that are already absolute local paths are referenced as-is (no copy). ## Schema -See `bench/_fixture_schema.py` for the canonical TOML schema. Inputs accept `s3://` URIs or absolute local paths (the latter for staged fixtures only — `push_fixture` always emits `s3://`). +See `bench/_fixture_schema.py` for the canonical TOML. Required: `inputs.target_peptides`, `inputs.speclib`, `inputs.raw`. Optional: `inputs.entrapment_peptides` (with `entrapment_ratio` and `entrapment_mode`), `inputs.pairing` (only valid when `entrapment_mode = "shuffled"`), `inputs.calibration_speclib`. URIs accept `s3://` or absolute local paths (the latter for staged fixtures; `push_fixture` always emits `s3://`). From e6339f4fd077e8f5f7bb8fc8619750e71a5fb972 Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Sat, 9 May 2026 10:37:57 -0700 Subject: [PATCH 37/41] feat(bench): target_peptides optional (only required for entrapment); update gt20peps fixture --- bench/_fixture_schema.py | 10 ++++-- bench/fixtures/hela_iccoff_gt20peps.toml | 15 ++++++++ bench/tests/test_fixture_schema.py | 44 ++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 bench/fixtures/hela_iccoff_gt20peps.toml diff --git a/bench/_fixture_schema.py b/bench/_fixture_schema.py index 613a5f35..36550638 100644 --- a/bench/_fixture_schema.py +++ b/bench/_fixture_schema.py @@ -36,7 +36,7 @@ def _require_uri(value: str, field: str) -> str: class FixtureInputs(BaseModel): model_config = ConfigDict(extra="forbid") - target_peptides: str + target_peptides: str | None = None speclib: str raw: str entrapment_peptides: str | None = None @@ -45,13 +45,13 @@ class FixtureInputs(BaseModel): pairing: str | None = None calibration_speclib: str | None = None - @field_validator("target_peptides", "speclib", "raw") + @field_validator("speclib", "raw") @classmethod def _required_uri(cls, v: str, info: ValidationInfo) -> str: return _require_uri(v, info.field_name or "") @field_validator( - "entrapment_peptides", "calibration_speclib", "pairing" + "target_peptides", "entrapment_peptides", "calibration_speclib", "pairing" ) @classmethod def _optional_uri(cls, v: str | None, info: ValidationInfo) -> str | None: @@ -64,6 +64,10 @@ def _entrap_consistency(self) -> "FixtureInputs": has_pep = self.entrapment_peptides is not None has_ratio = self.entrapment_ratio is not None has_mode = self.entrapment_mode is not None + if has_pep and self.target_peptides is None: + raise ValueError( + "entrapment_peptides requires target_peptides to also be set" + ) if has_pep and not (has_ratio and has_mode): raise ValueError( "entrapment_peptides requires both entrapment_ratio and entrapment_mode" diff --git a/bench/fixtures/hela_iccoff_gt20peps.toml 
b/bench/fixtures/hela_iccoff_gt20peps.toml new file mode 100644 index 00000000..489d39b0 --- /dev/null +++ b/bench/fixtures/hela_iccoff_gt20peps.toml @@ -0,0 +1,15 @@ +name = "hela_iccoff_gt20peps" +description = "" + +[inputs] +speclib = "s3://terraform-workstations-bucket/jspaezp/timsseek_fixtures/hela_iccoff_gt20peps/lib.msgpack.zst" +raw = "s3://terraform-workstations-bucket/jspaezp/timsseek_fixtures/hela_iccoff_gt20peps/sample.d" + +# === embedded timsseek config === +[config.analysis] +chunk_size = 20000 + +[config.analysis.tolerance] +ms = { ppm = [15.0, 15.0] } +mobility = { percent = [10.0, 10.0] } +quad = { absolute = [0.1, 0.1] } diff --git a/bench/tests/test_fixture_schema.py b/bench/tests/test_fixture_schema.py index 2ad377ed..a1b74480 100644 --- a/bench/tests/test_fixture_schema.py +++ b/bench/tests/test_fixture_schema.py @@ -263,3 +263,47 @@ def test_tilde_path_expanded(tmp_path): ) f = load_fixture(p) assert f.inputs.target_peptides == f"{home}/peps.txt" + + +def test_minimal_fixture_without_target_peptides(tmp_path): + """Non-entrap fixtures don't need target_peptides.""" + p = _write( + tmp_path, + """ + name = "hela" + description = "test" + + [inputs] + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + + [config.analysis] + chunk_size = 20000 + """, + ) + f = load_fixture(p) + assert f.inputs.target_peptides is None + assert not f.has_entrapment() + + +def test_entrapment_without_target_peptides_rejected(tmp_path): + """If entrapment_peptides is set, target_peptides must be too.""" + p = _write( + tmp_path, + """ + name = "bad" + description = "test" + + [inputs] + speclib = "s3://b/lib.msgpack.zst" + raw = "s3://b/sample.d" + entrapment_peptides = "s3://b/e.txt" + entrapment_ratio = 1.0 + entrapment_mode = "foreign" + + [config.analysis] + chunk_size = 1 + """, + ) + with pytest.raises(ValueError, match="target_peptides"): + load_fixture(p) From a4c5ef6670cf9539eaab54e755183182b6c8d46a Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Mon, 11 May 2026 09:06:28 -0700 Subject: [PATCH 38/41] fix(bench): speclib_build_cli flag is --peptide-list, not --peptides --- bench/push_fixture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/push_fixture.py b/bench/push_fixture.py index 2dcd2aa4..c2f342d6 100644 --- a/bench/push_fixture.py +++ b/bench/push_fixture.py @@ -168,7 +168,7 @@ def run_speclib_build( "-p", "speclib_build_cli", "--", - "--peptides", + "--peptide-list", peptides_uri, "--fixed-mod", "C[U:4]", From 276d1e8219e178752be9a86fb107b9c1847db5d1 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Wed, 13 May 2026 09:46:08 -0700 Subject: [PATCH 39/41] fix(timsseek): correct obs_mobility unit + refit supersimpleprediction obs_mobility was computed by adding the percent-of-ref mobility error back to ref_mobility, giving values biased ~50% high. avg_delta_mobs now converts pct -> absolute via ref_mobility/100 before feeding the collector, so obs_mob = ref + abs_delta is dimensionally correct. delta_ms1_ms2_mobility likewise switched to absolute 1/k0 units. Side cleanups in offsets.rs: - weighted_ms1 NaN gate was checking mz_error_ppm only, poisoning the mobility accumulator (or dropping valid mobility) on mismatched NaNs. Now gates each dimension independently. - magic .take(3) on FRAGMENT_TOP_N=7 named as FRAGMENT_OBS_MOB_TOP_N. - renamed shadow mz/mob in with_sorted_offsets to ms1_mob/ms2_mob. supersimpleprediction refit on hela_iccoff_gt20peps (34k IDs, holdout MAPE 1.36% vs prior 1.83% claim) using scripts/refit_mobility.py. 
--- .../fragment_mass/elution_group_converter.rs | 20 +-- rust/timsseek/src/scoring/offsets.rs | 42 ++++-- rust/timsseek/src/scoring/results.rs | 8 +- scripts/refit_mobility.py | 132 ++++++++++++++++++ 4 files changed, 174 insertions(+), 28 deletions(-) create mode 100755 scripts/refit_mobility.py diff --git a/rust/timsseek/src/fragment_mass/elution_group_converter.rs b/rust/timsseek/src/fragment_mass/elution_group_converter.rs index cd428283..02bf67ba 100644 --- a/rust/timsseek/src/fragment_mass/elution_group_converter.rs +++ b/rust/timsseek/src/fragment_mass/elution_group_converter.rs @@ -7,9 +7,9 @@ use rustyms::prelude::{ /// Super simple 1/k0 prediction. /// -/// This is a simple prediction of the retention time based on the m/z and charge. -/// On my data it gets MAPE 1.82802 so, this prediction + 10% error is a pretty solid way -/// to set an extraction window for mobility if you dont know anything for the peptide. +/// Refit on `hela_iccoff_gt20peps` (34k IDs, holdout MAPE 1.36%) after the +/// mobility unit-bug fix. Use `scripts/refit_mobility.py` against a fresh +/// `results.parquet` to refit on a different dataset. 
/// /// Example: /// ``` @@ -17,20 +17,20 @@ use rustyms::prelude::{ /// let mass = 1810.917339999999; /// let charge = 2; /// let out = supersimpleprediction(mass / charge as f64, charge); -/// assert!((out - 1.105151).abs() < 0.001 ); +/// assert!((out - 1.144405).abs() < 0.001); /// ``` pub fn supersimpleprediction(mz: f64, charge: i32) -> f64 { - let intercept_ = -1.660e+00; + let intercept_ = -1.319388e+00; let log1p_mz = (mz + 1.).ln(); let sq_mz_over_charge = mz.powi(2) / charge as f64; let log1p_sq_mz_over_charge = (sq_mz_over_charge + 1.).ln(); intercept_ - + (-3.798e-01 * log1p_mz) - + (-2.389e-04 * mz) - + (3.957e-01 * log1p_sq_mz_over_charge) - + (4.157e-07 * sq_mz_over_charge) - + (1.417e-01 * charge as f64) + + (-2.954677e-01 * log1p_mz) + + (-9.277763e-05 * mz) + + (3.219103e-01 * log1p_sq_mz_over_charge) + + (4.005229e-07 * sq_mz_over_charge) + + (1.176651e-01 * charge as f64) } fn count_carbon_sulphur(form: &MolecularFormula) -> (u16, u16) { diff --git a/rust/timsseek/src/scoring/offsets.rs b/rust/timsseek/src/scoring/offsets.rs index adf3f2e8..19f29912 100644 --- a/rust/timsseek/src/scoring/offsets.rs +++ b/rust/timsseek/src/scoring/offsets.rs @@ -14,6 +14,10 @@ const PRECURSOR_TOP_N: usize = 3; /// Balances statistical power with outlier resistance. const FRAGMENT_TOP_N: usize = 7; +/// Number of top fragments contributing to the obs-mobility estimate. +/// Subset of FRAGMENT_TOP_N to bias toward the highest-confidence ions. +const FRAGMENT_OBS_MOB_TOP_N: usize = 3; + /// Container for measured m/z and mobility offsets from top ions. /// /// Tracks the highest-intensity precursors and fragments to calculate @@ -118,17 +122,23 @@ impl MzMobilityOffsets { /// Used by Phase-2 calibration to estimate population-level offsets; /// rescoring uses the per-ion arrays directly. 
pub fn weighted_ms1(&self) -> Option<(f32, f32)> { - let (mut w, mut mz, mut mob) = (0.0f64, 0.0f64, 0.0f64); + let (mut w_mz, mut mz) = (0.0f64, 0.0f64); + let (mut w_mob, mut mob) = (0.0f64, 0.0f64); for v in self.ms1.get_values() { - if v.intensity <= 0.0 || v.mz_error_ppm.is_nan() { + if v.intensity <= 0.0 { continue; } - w += v.intensity; - mz += v.intensity * v.mz_error_ppm as f64; - mob += v.intensity * v.mobility_error_pct as f64; + if !v.mz_error_ppm.is_nan() { + w_mz += v.intensity; + mz += v.intensity * v.mz_error_ppm as f64; + } + if !v.mobility_error_pct.is_nan() { + w_mob += v.intensity; + mob += v.intensity * v.mobility_error_pct as f64; + } } - if w > 0.0 { - Some(((mz / w) as f32, (mob / w) as f32)) + if w_mz > 0.0 && w_mob > 0.0 { + Some(((mz / w_mz) as f32, (mob / w_mob) as f32)) } else { None } @@ -170,30 +180,34 @@ impl MzMobilityOffsets { out } + /// Intensity-weighted absolute mobility deltas (1/k0 units) for MS1 and MS2. + /// Converts the stored percent error back to absolute via `ref_mobility`. + /// The collector's "mz" slot carries the ppm error (unused by current + /// consumers); only `mean_mobility()` is meaningful here. 
pub fn avg_delta_mobs(&self) -> (MzMobilityStatsCollector, MzMobilityStatsCollector) { - let mut ms2 = MzMobilityStatsCollector::default(); let mut ms1 = MzMobilityStatsCollector::default(); - let vals = self.ms2.get_values(); - for v in vals.iter().take(3) { + let mut ms2 = MzMobilityStatsCollector::default(); + let pct_to_abs = self.ref_mobility / 100.0; + + for v in self.ms2.get_values().iter().take(FRAGMENT_OBS_MOB_TOP_N) { if v.mobility_error_pct.is_nan() { continue; } ms2.add( v.intensity, v.mz_error_ppm as f64, - v.mobility_error_pct as f64, + v.mobility_error_pct as f64 * pct_to_abs, ); } - let vals = self.ms1.get_values(); - for v in vals.iter() { + for v in self.ms1.get_values().iter() { if v.mobility_error_pct.is_nan() { continue; } ms1.add( v.intensity, v.mz_error_ppm as f64, - v.mobility_error_pct as f64, + v.mobility_error_pct as f64 * pct_to_abs, ); } diff --git a/rust/timsseek/src/scoring/results.rs b/rust/timsseek/src/scoring/results.rs index 4a0a4064..3d56c345 100644 --- a/rust/timsseek/src/scoring/results.rs +++ b/rust/timsseek/src/scoring/results.rs @@ -299,11 +299,11 @@ impl ScoredCandidateBuilder { self.ms2_mz_errors = SetField::Some(offsets.ms2_mz_errors()); self.ms2_mobility_errors = SetField::Some(offsets.ms2_mobility_errors()); - let mob_errors = offsets.avg_delta_mobs(); - let cum_err = mob_errors.0 + mob_errors.1; + let (ms1_err, ms2_err) = offsets.avg_delta_mobs(); + let cum_err = ms1_err.clone() + ms2_err.clone(); let obs_mob = (offsets.ref_mobility + cum_err.mean_mobility().unwrap_or(f64::NAN)) as f32; - let d_err = match (mob_errors.0.mean_mobility(), mob_errors.1.mean_mobility()) { - (Ok(mz), Ok(mob)) => mz - mob, + let d_err = match (ms1_err.mean_mobility(), ms2_err.mean_mobility()) { + (Ok(ms1_mob), Ok(ms2_mob)) => ms1_mob - ms2_mob, _ => f64::NAN, }; self.delta_ms1_ms2_mobility = SetField::Some(d_err as f32); diff --git a/scripts/refit_mobility.py b/scripts/refit_mobility.py new file mode 100755 index 00000000..25699e42 --- 
/dev/null +++ b/scripts/refit_mobility.py @@ -0,0 +1,132 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = ["polars", "numpy", "scikit-learn"] +# /// +"""Refit `supersimpleprediction` 1/k0 model from a timsseek results.parquet. + +Mirrors the feature set in +rust/timsseek/src/fragment_mass/elution_group_converter.rs:supersimpleprediction. +Prints the new intercept + coefs ready to paste back into the Rust source. +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import numpy as np +import polars as pl +from sklearn.linear_model import HuberRegressor, LinearRegression + + +def build_features(mz: np.ndarray, z: np.ndarray) -> np.ndarray: + sq_mz_over_z = mz**2 / z + return np.column_stack( + [ + np.log1p(mz), + mz, + np.log1p(sq_mz_over_z), + sq_mz_over_z, + z, + ] + ) + + +def main() -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("parquet", type=Path, help="path to results.parquet") + p.add_argument( + "--min-main-score", + type=float, + default=0.0, + help="filter rows with main_score < threshold (default: 0.0)", + ) + p.add_argument( + "--robust", + action="store_true", + help="use HuberRegressor instead of OLS", + ) + p.add_argument( + "--holdout", + type=float, + default=0.2, + help="fraction of rows reserved for holdout MAPE (default: 0.2)", + ) + p.add_argument( + "--seed", + type=int, + default=42, + ) + args = p.parse_args() + + if not args.parquet.exists(): + print(f"parquet not found: {args.parquet}", file=sys.stderr) + return 1 + + df = pl.read_parquet(args.parquet).filter( + pl.col("obs_mobility").is_finite() + & pl.col("precursor_mz").is_finite() + & (pl.col("main_score") > args.min_main_score) + ) + print(f"rows after filter: {df.height}") + if df.height < 100: + print("not enough rows to fit", file=sys.stderr) + return 1 + + mz = df["precursor_mz"].to_numpy().astype(np.float64) + z = 
df["precursor_charge"].cast(pl.Float64).to_numpy() + y = df["obs_mobility"].to_numpy().astype(np.float64) + + X = build_features(mz, z) + + rng = np.random.default_rng(args.seed) + idx = rng.permutation(len(y)) + cut = int(len(y) * (1 - args.holdout)) + tr, ho = idx[:cut], idx[cut:] + + model = HuberRegressor(max_iter=500) if args.robust else LinearRegression() + model.fit(X[tr], y[tr]) + + def mape(a: np.ndarray, b: np.ndarray) -> float: + return float(np.mean(np.abs((a - b) / b)) * 100) + + tr_mape = mape(model.predict(X[tr]), y[tr]) + ho_mape = mape(model.predict(X[ho]), y[ho]) + + feats = [ + "log1p_mz", + "mz", + "log1p_sq_mz_over_charge", + "sq_mz_over_charge", + "charge", + ] + print() + print(f"intercept: {model.intercept_:.6e}") + for name, coef in zip(feats, model.coef_): + print(f" {name:>24s}: {coef:+.6e}") + print() + print(f"train MAPE: {tr_mape:.4f}%") + print(f"holdout MAPE: {ho_mape:.4f}%") + + print() + print("--- Rust paste block ---") + print(f" let intercept_ = {model.intercept_:.3e};") + print(" let log1p_mz = (mz + 1.).ln();") + print(" let sq_mz_over_charge = mz.powi(2) / charge as f64;") + print(" let log1p_sq_mz_over_charge = (sq_mz_over_charge + 1.).ln();") + print() + print(" intercept_") + c = model.coef_ + print(f" + ({c[0]:+.3e} * log1p_mz)") + print(f" + ({c[1]:+.3e} * mz)") + print(f" + ({c[2]:+.3e} * log1p_sq_mz_over_charge)") + print(f" + ({c[3]:+.3e} * sq_mz_over_charge)") + print(f" + ({c[4]:+.3e} * charge as f64)") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 26de8017cd3281f0a64c47ce88d3f30f78960940 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Wed, 13 May 2026 09:46:14 -0700 Subject: [PATCH 40/41] fix(bench): accept absolute local paths in staged fixtures stage_fixture emitted relative paths (bench_out/cache/...) into the staged TOML, which then failed the schema validator that requires s3:// or absolute local paths. 
Now resolves to absolute via dst.resolve() on both _stage_one_file and _stage_one_dir. Also guards target_peptides for the gt20peps fixture (target_peptides is optional since e6339f4 but stage still unconditionally tried to download it). wandb_bench grew a _materialize_uri helper used in the entrapment block so target_peptides / entrapment_peptides / pairing pass through when already local instead of erroring out of s3_download_file. scripts/stage_gt20.sh: one-shot wrapper for the gt20peps fixture. --- bench/stage_fixture.py | 31 +++++++++++++++++-------------- bench/wandb_bench.py | 24 ++++++++++++++++++------ scripts/stage_gt20.sh | 21 +++++++++++++++++++++ 3 files changed, 56 insertions(+), 20 deletions(-) create mode 100755 scripts/stage_gt20.sh diff --git a/bench/stage_fixture.py b/bench/stage_fixture.py index 3c8b606a..50234cc4 100644 --- a/bench/stage_fixture.py +++ b/bench/stage_fixture.py @@ -52,28 +52,28 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: def _stage_one_file(uri: str, dst: Path, force: bool) -> str: - """Resolve `uri` to a local path; return path string for the staged TOML. + """Resolve `uri` to an absolute local path string for the staged TOML. - - If `uri` is already an absolute local path, return it unchanged (no copy). + - If `uri` is already an absolute local path, return it unchanged. - If `uri` is `s3://...`, download to `dst` (skip if exists, unless `force`). """ if not uri.startswith("s3://"): - return uri # already local; reference as-is - if dst.exists() and not force: + return uri # already absolute (schema validator enforces this) + if not (dst.exists() and not force): + s3_download_file(uri, str(dst)) + else: logger.info("stage: cached {} (skip)", dst) - return str(dst) - s3_download_file(uri, str(dst)) - return str(dst) + return str(dst.resolve()) def _stage_one_dir(uri: str, dst: Path, force: bool) -> str: # noqa: ARG001 - """Sync `uri` (s3 prefix) into `dst`. 
Returns the path string.""" + """Sync `uri` (s3 prefix) into `dst`. Returns the absolute path string.""" if not uri.startswith("s3://"): return uri # `aws s3 sync` is itself idempotent; --force just forces a re-sync # which has the same observable result, so we always call it. s3_sync_dir(uri, str(dst)) - return str(dst) + return str(dst.resolve()) def stage( @@ -95,9 +95,11 @@ def stage( cache_root = cache_dir / name cache_root.mkdir(parents=True, exist_ok=True) - target_pep_local = _stage_one_file( - fx.inputs.target_peptides, cache_root / "target.peptides.txt", force - ) + target_pep_local: str | None = None + if fx.inputs.target_peptides is not None: + target_pep_local = _stage_one_file( + fx.inputs.target_peptides, cache_root / "target.peptides.txt", force + ) speclib_local = _stage_one_file( fx.inputs.speclib, cache_root / "lib.msgpack.zst", force ) @@ -148,7 +150,7 @@ def _build_staged_toml( name: str, description: str, config: dict, - target_peptides_uri: str, + target_peptides_uri: str | None, speclib_uri: str, raw_uri: str, entrapment_peptides_uri: str | None, @@ -166,7 +168,8 @@ def _build_staged_toml( lines.append(f'description = "{desc}"') lines.append("") lines.append("[inputs]") - lines.append(f'target_peptides = "{target_peptides_uri}"') + if target_peptides_uri is not None: + lines.append(f'target_peptides = "{target_peptides_uri}"') lines.append(f'speclib = "{speclib_uri}"') lines.append(f'raw = "{raw_uri}"') if entrapment_peptides_uri is not None: diff --git a/bench/wandb_bench.py b/bench/wandb_bench.py index e9ccbb58..35f215af 100644 --- a/bench/wandb_bench.py +++ b/bench/wandb_bench.py @@ -29,6 +29,14 @@ def _list_fixtures(fixtures_dir: Path) -> list[str]: return sorted(p.stem for p in fixtures_dir.glob("*.toml")) +def _materialize_uri(uri: str, dst: Path) -> Path: + """Return a local Path for `uri`. 
Downloads if s3, passes through if local.""" + if uri.startswith("s3://"): + s3_download_file(uri, str(dst)) + return dst + return Path(uri) + + def select_fixtures( names: list[str], all_: bool, @@ -188,17 +196,21 @@ def run_one( if fx.has_entrapment(): assert fx.inputs.entrapment_peptides is not None assert fx.inputs.entrapment_ratio is not None + assert fx.inputs.target_peptides is not None with tempfile.TemporaryDirectory() as td: - target_local = Path(td) / "target.peptides.txt" - entrap_local = Path(td) / "entrap.peptides.txt" - s3_download_file(fx.inputs.target_peptides, str(target_local)) - s3_download_file(fx.inputs.entrapment_peptides, str(entrap_local)) + target_local = _materialize_uri( + fx.inputs.target_peptides, Path(td) / "target.peptides.txt" + ) + entrap_local = _materialize_uri( + fx.inputs.entrapment_peptides, Path(td) / "entrap.peptides.txt" + ) pairing_local: Path | None = None if fx.has_pairing(): assert fx.inputs.pairing is not None - pairing_local = Path(td) / "pairing.tsv" - s3_download_file(fx.inputs.pairing, str(pairing_local)) + pairing_local = _materialize_uri( + fx.inputs.pairing, Path(td) / "pairing.tsv" + ) results_parquet = res_dir / raw_stem / "results.parquet" out_parquet = ( diff --git a/scripts/stage_gt20.sh b/scripts/stage_gt20.sh new file mode 100755 index 00000000..a8411c0c --- /dev/null +++ b/scripts/stage_gt20.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -euo pipefail + +FIXTURE="hela_iccoff_gt20peps" +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +LOG_DIR="${REPO_ROOT}/bench_out/logs" +LOG_FILE="${LOG_DIR}/stage_${FIXTURE}.log" + +mkdir -p "${LOG_DIR}" + +cd "${REPO_ROOT}" +echo "Staging ${FIXTURE} -> bench_out/staged" +echo "Log: ${LOG_FILE}" + +uv run --group bench python -m bench.stage_fixture "${FIXTURE}" \ + > "${LOG_FILE}" 2>&1 + +echo "Done. 
Staged inputs under bench_out/staged/${FIXTURE}/" +echo "Run with:" +echo " uv run --group bench python -m bench.wandb_bench \\" +echo " --fixtures-dir bench_out/staged ${FIXTURE}" From 9564f56187dd91c296319258796c77f90fb15f17 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Mon, 18 May 2026 21:27:26 -0700 Subject: [PATCH 41/41] ci: disable crate patch step (rustyms gnome.dat stub) --- .github/workflows/rust.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 49011395..a94f3cb3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -21,10 +21,10 @@ jobs: - "--no-default-features" # serial steps: - uses: actions/checkout@v6 - - name: Apply crate patches (rustyms gnome.dat stub) - run: | - cargo install --git https://github.com/jspaezp/cargo-patch-crate patch-crate --locked - cargo patch-crate + # - name: Apply crate patches (rustyms gnome.dat stub) + # run: | + # cargo install --git https://github.com/jspaezp/cargo-patch-crate patch-crate --locked + # cargo patch-crate - name: Build run: cargo build --verbose -p timsseek ${{ matrix.features }} - name: Run tests