From bef3eaf12bbe978fc7f688ec1f0aee65de254d0d Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Tue, 5 May 2026 16:47:30 -0400
Subject: [PATCH 1/9] Tests + dependencies for benchmarks

---
 pyproject.toml                 |   1 +
 tests/benchmarks/test_index.py | 114 +++++++++++++++++++++++++++++++++
 tests/benchmarks/test_query.py |  88 +++++++++++++++++++++++++
 uv.lock                        |  60 +++++++++++++++++
 4 files changed, 263 insertions(+)
 create mode 100644 tests/benchmarks/test_index.py
 create mode 100644 tests/benchmarks/test_query.py

diff --git a/pyproject.toml b/pyproject.toml
index ec2ab72..6622b2b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ s3 = [
 pybids = ["pandas>=2.0.0"]
 
 [dependency-groups]
+benchmark = ["polars>=1.40.1", "pytest>=9.0.3", "pytest-benchmark>=5.2.3"]
 dev = [
     "pdoc>=16.0.0",
     "pre-commit>=4.6.0",
diff --git a/tests/benchmarks/test_index.py b/tests/benchmarks/test_index.py
new file mode 100644
index 0000000..9894b58
--- /dev/null
+++ b/tests/benchmarks/test_index.py
@@ -0,0 +1,114 @@
+"""Indexing benchmarks."""
+
+import os
+import shutil
+from pathlib import Path
+from typing import Callable
+
+import pyarrow.parquet as pq
+import pytest
+from pytest_benchmark.fixture import BenchmarkFixture
+
+import bids2table as b2t2
+
+
+def du(path: Path) -> float:
+    """Compute directory size in mb."""
+    total = 0
+    stack = [path]
+    while stack:
+        for entry in os.scandir(stack.pop()):
+            try:
+                st = entry.stat(follow_symlinks=False)
+                if entry.is_dir(follow_symlinks=False):
+                    stack.append(Path(entry.path))
+                else:
+                    total += st.st_size
+            except OSError:
+                continue
+    return total / 1_024**2
+
+
+def _run_benchmark(
+    benchmark: BenchmarkFixture,
+    func: Callable,
+    index_fpath: Path,
+    version: str,
+    workers: int,
+    *args,
+    **kwargs,
+) -> None:
+    """Benchmark ``func`` for 11 rounds, recording the index size after each round."""
+    sizes = []
+
+    def _teardown(*_args, **_kwargs) -> None:
+        sizes.append(du(index_fpath.parent))
+        shutil.rmtree(index_fpath.parent)
+        index_fpath.parent.mkdir(parents=True)  # next round needs the directory
+
+    # Pass the hook itself; calling it here would run it once, before any round
+    benchmark.pedantic(
+        func,
+        teardown=_teardown,
+        args=args,
+        kwargs=kwargs,
+        iterations=1,
+        rounds=11,  # Include an additional round for warmup
+    )
+
+    # Additional info
+    benchmark.extra_info.update(
+        {
+            "size_mb": sizes,
+            "version": version or "Unknown",
+            "workers": workers or "Unknown",
+        }
+    )
+
+
+@pytest.mark.benchmark
+def test_openneuro(benchmark: BenchmarkFixture, tmp_path: Path) -> None:
+    """Benchmark b2t2 with a subset of datasets on OpenNeuro."""
+    workers = 4
+    index_fpath = tmp_path / "index.parquet"
+
+    def index() -> None:
+        path = b2t2._pathlib.as_path("s3://openneuro.org/ds002*")
+        paths = list(path.parent.glob(path.name))
+        schema = b2t2.get_arrow_schema()
+        assert len(paths) > 1, "1 or less datasets found...check the path provided"
+        with pq.ParquetWriter(index_fpath, schema) as writer:
+            for table in b2t2.batch_index_dataset(
+                paths,  # type: ignore[arg-type]
+                max_workers=workers,
+                show_progress=False,
+            ):
+                writer.write_table(table)
+
+    _run_benchmark(
+        benchmark,
+        index,
+        index_fpath=index_fpath,
+        version=b2t2.__version__,
+        workers=workers,
+    )
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize("workers", (1, 4))
+def test_local(benchmark: BenchmarkFixture, tmp_path: Path, workers: int) -> None:
+    """Bids2Table v2 benchmarking on local dataset."""
+    index_fpath = tmp_path / "index.parquet"
+    data_dir = Path("bids-examples/ds000117")
+
+    def index() -> None:
+        table = b2t2.index_dataset(data_dir, max_workers=workers, show_progress=False)
+        pq.write_table(table, index_fpath)
+
+    _run_benchmark(
+        benchmark,
+        index,
+        index_fpath=index_fpath,
+        version=b2t2.__version__,
+        workers=workers,
+    )
diff --git a/tests/benchmarks/test_query.py b/tests/benchmarks/test_query.py
new file mode 100644
index 0000000..c93d19b
--- /dev/null
+++ b/tests/benchmarks/test_query.py
@@ -0,0 +1,88 @@
+"""Querying benchmarks."""
+
+import datetime
+from pathlib import Path
+from typing import Callable
+
+import polars as pl
+import pytest
+from pytest_benchmark.fixture import BenchmarkFixture
+
+import bids2table as b2t2
+
+SUBJECTS = ["01", "10"]
+NUM_VOLS = 184
+TARGET_TE = 0.00875
+TARGET_TIME = datetime.time(10).strftime("%H:%M:%S.%f")
+
+
+def _run_benchmark(
+    benchmark: BenchmarkFixture,
+    func: Callable,
+    version: str,
+    *args,
+    **kwargs,
+) -> None:
+    benchmark.pedantic(func, args=args, kwargs=kwargs, iterations=1, rounds=11)
+    benchmark.extra_info.update({"version": version or "Unknown"})
+
+
+@pytest.mark.benchmark
+class TestB2TQuery:
+    """Benchmark different b2t queries."""
+
+    @pytest.fixture
+    def index(self) -> tuple:
+        """Index dataset with b2t."""
+        data_dir = Path("bids-examples/ds000117")
+        table = b2t2.index_dataset(data_dir, show_progress=False)
+        df = pl.from_arrow(table)
+        df = df.with_columns(
+            pl.format("{}/{}", pl.col("root"), pl.col("path")).alias("fpath")
+        )
+        df = df.with_columns(
+            pl.col("fpath")
+            .map_elements(b2t2.load_bids_metadata, return_dtype=pl.Object)
+            .alias("json")
+        )
+        version = b2t2.__version__
+        return df, version
+
+    def test_subject_query(self, benchmark: BenchmarkFixture, index: tuple) -> None:
+        """Benchmark subject queries."""
+        table, version = index
+
+        def query() -> None:
+            table.get_column("sub").unique()
+
+        _run_benchmark(benchmark, query, version=version)
+
+    def test_bold_query(self, benchmark: BenchmarkFixture, index: tuple) -> None:
+        """Benchmark queries for bold images."""
+        table, version = index
+        table = table.with_columns(
+            [pl.col("ext").cast(pl.Categorical), pl.col("suffix").cast(pl.Categorical)]
+        )
+
+        def query() -> None:
+            table.select(["ext", "suffix", "fpath"]).filter(
+                (pl.col("ext") == ".nii.gz") & (pl.col("suffix") == "bold")
+            ).get_column("fpath")
+
+        _run_benchmark(benchmark, query, version=version)
+
+    def test_metadata_query(self, benchmark: BenchmarkFixture, index: tuple) -> None:
+        """Benchmark query via metadata."""
+        table, version = index
+        table = table.with_columns(
+            pl.col("json")
+            .map_elements(lambda x: x.get("EchoTime"), return_dtype=pl.Float64)
+            .alias("echo_time")
+        )
+
+        def query() -> None:
+            table.select(["sub", "echo_time", "fpath"]).filter(
+                (pl.col("sub").is_in(SUBJECTS)) & (pl.col("echo_time") == TARGET_TE)
+            ).get_column("fpath")
+
+        _run_benchmark(benchmark, query, version=version)
diff --git a/uv.lock b/uv.lock
index f112a16..e4bcc26 100644
--- a/uv.lock
+++ b/uv.lock
@@ -43,6 +43,11 @@ s3 = [
 ]
 
 [package.dev-dependencies]
+benchmark = [
+    { name = "polars" },
+    { name = "pytest" },
+    { name = "pytest-benchmark" },
+]
 dev = [
     { name = "pdoc" },
     { name = "pre-commit" },
@@ -63,6 +68,11 @@ requires-dist = [
 provides-extras = ["cloud", "s3", "pybids"]
 
 [package.metadata.requires-dev]
+benchmark = [
+    { name = "polars", specifier = ">=1.40.1" },
+    { name = "pytest", specifier = ">=9.0.3" },
+    { name = "pytest-benchmark", specifier = ">=5.2.3" },
+]
 dev = [
     { name = "pdoc", specifier = ">=16.0.0" },
     { name = "pre-commit", specifier = ">=4.6.0" },
@@ -943,6 +953,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
 ]
 
+[[package]]
+name = "polars"
+version = "1.40.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "polars-runtime-32" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b3/8c/bc9bc948058348ed43117cecc3007cd608f395915dae8a00974579a5dab1/polars-1.40.1.tar.gz", hash = "sha256:ab2694134b137596b5a59bfd7b4c54ebbc9b59f9403127f18e32d363777552e8", size = 733574, upload-time = "2026-04-22T19:15:55.507Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ea/91/74fc60d94488685a92ac9d49d7ec55f3e91fe9b77942a6235a5fa7f249c3/polars-1.40.1-py3-none-any.whl", hash = "sha256:c0f861219d1319cdea45c4ce4d30355a47176b8f98dcedf95ea8269f131b8abd", size = 828723, upload-time = "2026-04-22T19:14:25.452Z" },
+]
+
+[[package]]
+name = "polars-runtime-32"
+version = "1.40.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/ba/26d40f039be9f552b5fd7365a621bdfc0f8e912ef77094ae4693491b0bae/polars_runtime_32-1.40.1.tar.gz", hash = "sha256:37f3065615d1bf90d03b5326222df4c5c1f8a5d33e50470aa588e3465e6eb814", size = 2935843, upload-time = "2026-04-22T19:15:57.26Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7d/46/22c8af5eed68ac2eeb556e0fa3ca8a7b798e984ceff4450888f3b5ac61fd/polars_runtime_32-1.40.1-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b748ef652270cc49e9e69f99a035e0eb4d5f856d42bcd6ac4d9d80a40142aa1e", size = 52098755, upload-time = "2026-04-22T19:14:28.555Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/3e/48599a38009ca60ff82a6f38c8a621ce3c0286aa7397c7d79e741bd9060e/polars_runtime_32-1.40.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d249b3743e05986060cec0a7aaa542d020df6c6b876e556023a310efd581f9be", size = 46367542, upload-time = "2026-04-22T19:14:32.433Z" },
+    { url = "https://files.pythonhosted.org/packages/43/e9/384bc069367a1a36ee31c13782c178dbd039b2b873b772d4a0fc23a2373d/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5987b30e7aa1059d069498496e8dda35afd592b0ac3d46ed87e3ff8df1ad652c", size = 50252104, upload-time = "2026-04-22T19:14:35.945Z" },
+    { url = "https://files.pythonhosted.org/packages/15/ef/7d57ceb0651af74194e97ed6583e148d352f03d696090221b8059cdfc90b/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d7f42a8b3f16fc66002cc0f6516f7dd7653396886ae0ed362ab95c0b3408b59", size = 56250788, upload-time = "2026-04-22T19:14:39.743Z" },
+    { url = "https://files.pythonhosted.org/packages/10/0f/e4b3ffc748827a14a474ec9c42e45c066050e440fec57e914091d9adda75/polars_runtime_32-1.40.1-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e5f7becc237a7ec9d9a10878dc8e54b73bbf4e2d94a2991c37d7a0b38590d8f9", size = 50432590, upload-time = "2026-04-22T19:14:43.388Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/0b/b8d95fbed869fa4caabe9c400e4210374913b376e925e96fdcfa9be6416b/polars_runtime_32-1.40.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:992d14cf191dde043d36fbdbc98a65e43fbc7e9a5024cecd45f838ac4988c1ee", size = 54155564, upload-time = "2026-04-22T19:14:47.239Z" },
+    { url = "https://files.pythonhosted.org/packages/06/d9/d091d8fb5cbed5e9536adfed955c4c89987a4cc3b8e73ae4532402b91c74/polars_runtime_32-1.40.1-cp310-abi3-win_amd64.whl", hash = "sha256:f78bb2abd00101cbb23cc0cb068f7e36e081057a15d2ec2dde3dda280709f030", size = 51829755, upload-time = "2026-04-22T19:14:50.85Z" },
+    { url = "https://files.pythonhosted.org/packages/65/ad/b33c3022a394f3eb55c3310597cec615412a8a33880055eee191d154a628/polars_runtime_32-1.40.1-cp310-abi3-win_arm64.whl", hash = "sha256:b5cbfaf6b085b420b4bfcbe24e8f665076d1cccfdb80c0484c02a023ce205537", size = 45822104, upload-time = "2026-04-22T19:14:54.192Z" },
+]
+
 [[package]]
 name = "pre-commit"
 version = "4.6.0"
@@ -986,6 +1024,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b8/ef/50433d346c56657a70d27f156c7b349ac59a068b01de4eb796e747eecc43/protobuf-7.35.0-py3-none-any.whl", hash = "sha256:c13f325cf242bad135c350629eeb5d54b24228eb472fb3e2e9ebbd4c5dc20ca0", size = 171659, upload-time = "2026-05-19T23:02:27.842Z" },
 ]
 
+[[package]]
+name = "py-cpuinfo"
+version = "9.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/37/a8/d832f7293ebb21690860d2e01d8115e5ff6f2ae8bbdc953f0eb0fa4bd2c7/py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690", size = 104716, upload-time = "2022-10-25T20:38:06.303Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" },
+]
+
 [[package]]
 name = "pyarrow"
 version = "24.0.0"
@@ -1091,6 +1138,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
 ]
 
+[[package]]
+name = "pytest-benchmark"
+version = "5.2.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "py-cpuinfo" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/34/9f732b76456d64faffbef6232f1f9dbec7a7c4999ff46282fa418bd1af66/pytest_benchmark-5.2.3.tar.gz", hash = "sha256:deb7317998a23c650fd4ff76e1230066a76cb45dcece0aca5607143c619e7779", size = 341340, upload-time = "2025-11-09T18:48:43.215Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/33/29/e756e715a48959f1c0045342088d7ca9762a2f509b945f362a316e9412b7/pytest_benchmark-5.2.3-py3-none-any.whl", hash = "sha256:bc839726ad20e99aaa0d11a127445457b4219bdb9e80a1afc4b51da7f96b0803", size = 45255, upload-time = "2025-11-09T18:48:39.765Z" },
+]
+
 [[package]]
 name = "pytest-cov"
 version = "7.1.0"

From fcad52b0f61342aad2f649260432ff113323b9e1 Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Wed, 6 May 2026 13:15:28 -0400
Subject: [PATCH 2/9] Add scripts for benchmarking

---
 .github/scripts/compare_benchmarks.py | 161 ++++++++++++++++++++++++++
 .github/scripts/run_benchmarks.py     |  27 +++++
 2 files changed, 188 insertions(+)
 create mode 100644 .github/scripts/compare_benchmarks.py
 create mode 100644 .github/scripts/run_benchmarks.py

diff --git a/.github/scripts/compare_benchmarks.py b/.github/scripts/compare_benchmarks.py
new file mode 100644
index 0000000..531ccd9
--- /dev/null
+++ b/.github/scripts/compare_benchmarks.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+"""Compare benchmark results across PR, main, and tag and output a markdown table."""
+
+import json
+import statistics
+from pathlib import Path
+from typing import Literal, NamedTuple
+
+
+class BenchmarkResult(NamedTuple):
+    fullname: str
+    kind: Literal["index", "query"]
+    locality: Literal["local", "remote"] | None = None
+    workers: int | None = None
+    median: float = 0.0
+    mean: float = 0.0
+    stddev: float = 0.0
+
+
+def parse_file(path: Path) -> dict[str, BenchmarkResult]:
+    data = json.loads(path.read_text())
+    results = {}
+    for benchmark in data["benchmarks"]:
+        fullname: str = benchmark["fullname"]
+        data_trimmed = benchmark["stats"]["data"][1:]
+        median = statistics.median(data_trimmed)
+        mean = statistics.mean(data_trimmed)
+        stddev = statistics.stdev(data_trimmed)
+
+        if "query" in fullname:
+            result = BenchmarkResult(
+                fullname=fullname, kind="query", median=median, mean=mean, stddev=stddev
+            )
+        else:
+            locality: Literal["local", "remote"] = (
+                "remote" if "openneuro" in fullname or "s3" in fullname else "local"
+            )
+            workers = benchmark["extra_info"].get("workers", "Unknown")
+            result = BenchmarkResult(
+                fullname=fullname,
+                kind="index",
+                locality=locality,
+                workers=workers,
+                median=median,
+                mean=mean,
+                stddev=stddev,
+            )
+        results[fullname] = result
+    return results
+
+
+def _scale(val: float) -> float:
+    return val * 1000
+
+
+def _fmt(res: BenchmarkResult) -> str:
+    median = _scale(res.median)
+    mean = _scale(res.mean)
+    stddev = _scale(res.stddev)
+    return f"{median:.3f} ({mean:.3f} ± {stddev:.3f}) ms"
+
+
+def _delta(pr: BenchmarkResult, ref: BenchmarkResult) -> str:
+    if ref == 0:
+        return "N/A"
+    diff = _scale(pr.median - ref.median)
+    pct = (pr.median / ref.median - 1) * 100
+    icon = "🔴" if pct > 5 else "🟢" if pct < -5 else "⚪"
+    return f"{icon} {diff:+.3f} ms ({pct:+.1f}%)"
+
+
+def _label(result: BenchmarkResult) -> str:
+    if result.kind == "query":
+        return (
+            result.fullname.split("::")[-1]
+            .replace("test_", "")
+            .replace("_", " ")
+            .capitalize()
+        )
+    return f"{result.locality.capitalize()} index ({result.workers} workers)"
+
+
+def build_table(
+    pr: dict[str, BenchmarkResult],
+    main: dict[str, BenchmarkResult],
+    tag: dict[str, BenchmarkResult],
+    tag_name: str,
+) -> str:
+    all_keys = set(pr) | set(main) | set(tag)
+    labels = [_label((pr.get(k) or main.get(k) or tag.get(k))) for k in all_keys]
+
+    col_sep = " | "
+    header = "| |" + col_sep.join(f" **{label}** " for label in labels) + " |"
+    divider = "|-|" + "|".join("---" for _ in all_keys) + "|"
+
+    def row(name: str, results: dict[str, BenchmarkResult]) -> str:
+        cells = [_fmt(results[k]) if k in results else "—" for k in all_keys]
+        return "| **" + name + "** |" + col_sep.join(f" {c} " for c in cells) + " |"
+
+    def delta_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
+        cells = [
+            _delta(pr[k], ref[k]) if k in pr and k in ref else "—" for k in all_keys
+        ]
+        return "| *" + label + "* |" + col_sep.join(f" {c} " for c in cells) + " |"
+
+    lines = [
+        "## Benchmark Results",
+        "",
+        header,
+        divider,
+        row("PR", pr),
+        row("main", main),
+        row(tag_name, tag),
+        divider.replace("-", ""),
+        delta_row("PR vs main", main),
+        delta_row(f"PR vs {tag_name}", tag),
+        "",
+        "> `median (mean ± std)`",
+        "> ",
+        "🔴 >5% slower &nbsp; ⚪ within 5% &nbsp; 🟢 >5% faster",
+    ]
+    return "\n".join(lines)
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pattern",
+        default="benchmark-*.json",
+        help="Glob pattern for benchmark JSON files",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output markdown filepath containing benchmark comparisons",
+    )
+    args = parser.parse_args()
+
+    files = sorted(Path(".").glob(args.pattern))
+    assert len(files) == 3, f"Expected 3 files, found {len(files)}: {files}"
+
+    # Infer pr/main/tag from directory name
+    parsed: dict[str, BenchmarkResult] = {}
+    tag = None
+    for f in files:
+        stem = f.parent.name  # e.g. "benchmark-pr"
+        key = stem.split("-")[-1]  # "pr", "main", tag
+        if key not in ("pr", "main"):
+            tag = key
+        parsed[key] = parse_file(f)
+    if tag is None:
+        raise ValueError("Unknown tag")
+    table = build_table(parsed["pr"], parsed["main"], parsed[tag], tag_name=tag)
+    args.output.write_text(table)
+    print(table)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/run_benchmarks.py b/.github/scripts/run_benchmarks.py
new file mode 100644
index 0000000..64ce1e5
--- /dev/null
+++ b/.github/scripts/run_benchmarks.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+"""Perform benchmarks across PR commit, main, and previous tag."""
+
+import argparse
+
+import pytest
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-o", "--output", required=True, help="Output JSON file path")
+    args = parser.parse_args()
+
+    pytest.main(
+        [
+            "-m",
+            "benchmark",
+            "--benchmark-save-data",
+            f"--benchmark-json={args.output}",
+            "--benchmark-time-unit=ms",
+            "--benchmark-warmup=on",
+        ]
+    )
+
+
+if __name__ == "__main__":
+    main()

From 72411332f2381de84f74e1496c34613718fd6179 Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Wed, 6 May 2026 13:15:40 -0400
Subject: [PATCH 3/9] Add benchmark CI

---
 .github/workflows/benchmark.yaml | 62 ++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 .github/workflows/benchmark.yaml

diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
new file mode 100644
index 0000000..9b15465
--- /dev/null
+++ b/.github/workflows/benchmark.yaml
@@ -0,0 +1,62 @@
+name: Benchmark
+
+on:
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  get-tag:
+    runs-on: ubuntu-latest
+    outputs:
+      tag: ${{ steps.last_tag.outputs.tag }}
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-tags: true
+          fetch-depth: 0
+      - id: last_tag
+        run: echo ="tag=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
+
+  benchmark:
+    needs: get-tag
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        target:
+          - name: pr
+            ref: ${{ github.sha }}
+          - name: main
+            ref: main
+          - name: ${{ needs.get_tag.outputs.tag }}
+            ref: ${{ needs.get_tag.outputs.tag }}
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          ref: ${{ matrix.target.ref }}
+          submodules: true
+      - uses: astral-sh/setup-uv@v8.1.0
+      - run: uv sync --group "benchmark" --extra "cloud"
+      - name: Run benchmarks
+        run: |
+          uv run .github/scripts/run_benchmarks.py \ 
+            --output benchmark-${{matrix.target.name }}.json
+      - uses: actions/upload-artifact@v7
+        with:
+          name: benchmark-${{ matrix.target.name }}
+          path: benchmark-${{ matrix.target.name }}.json
+
+  report:
+    needs: [ get-tag, benchmark ]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: astral-sh/setup-uv@v8.1.0
+      - run: uv sync --group "benchmark"
+      - uses: actions/download-artifact@v8
+        with:
+          pattern: benchmark-*
+      - name: Generate report
+        run: |
+          uv run .github/scripts/compare_benchmarks.py \
+            --output benchmarks.md \
+            --pattern benchmark-*.json

From 93f191400c32642bbf2f9cfdde4910bc04c338ec Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Wed, 6 May 2026 13:45:30 -0400
Subject: [PATCH 4/9] Register benchmark pytest marker

---
 pyproject.toml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6622b2b..a9f7380 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,4 +69,7 @@ lint.extend-select = ["I"]
 [tool.pytest.ini_options]
 log_cli = true
 log_cli_level = "INFO"
-markers = ["cloud: Tests requiring cloud group dependencies"]
+markers = [
+    "benchmark: Tests used for benchmarking",
+    "cloud: Tests requiring cloud group dependencies",
+]

From 4afac6dbb548392c76a5f66d5e64bfa078b863c8 Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Wed, 6 May 2026 13:51:31 -0400
Subject: [PATCH 5/9] Fix CI workflow bugs

- Mark tests with "cloud" and/or "benchmark" as needed
- Combine the "dev" and "benchmark" dependency groups; keeping them separate caused import errors when running pytest (alternatively, use a `try-except` block for the optional dependency import)
- Replace pandas with polars in the dev dependencies (for benchmarking)
---
 .github/scripts/run_benchmarks.py |  2 +-
 .github/workflows/benchmark.yaml  |  3 +--
 .github/workflows/ci.yaml         | 10 +++++-----
 pyproject.toml                    |  2 +-
 tests/benchmarks/test_index.py    |  1 +
 uv.lock                           | 18 +++++-------------
 6 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/.github/scripts/run_benchmarks.py b/.github/scripts/run_benchmarks.py
index 64ce1e5..cf870c0 100644
--- a/.github/scripts/run_benchmarks.py
+++ b/.github/scripts/run_benchmarks.py
@@ -14,7 +14,7 @@ def main():
     pytest.main(
         [
             "-m",
-            "benchmark",
+            "benchmark and not cloud",
             "--benchmark-save-data",
             f"--benchmark-json={args.output}",
             "--benchmark-time-unit=ms",
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
index 9b15465..cd91f8f 100644
--- a/.github/workflows/benchmark.yaml
+++ b/.github/workflows/benchmark.yaml
@@ -35,7 +35,7 @@ jobs:
           ref: ${{ matrix.target.ref }}
           submodules: true
       - uses: astral-sh/setup-uv@v8.1.0
-      - run: uv sync --group "benchmark" --extra "cloud"
+      - run: uv sync --extra "cloud"
       - name: Run benchmarks
         run: |
           uv run .github/scripts/run_benchmarks.py \ 
@@ -51,7 +51,6 @@ jobs:
     steps:
       - uses: actions/checkout@v6
       - uses: astral-sh/setup-uv@v8.1.0
-      - run: uv sync --group "benchmark"
       - uses: actions/download-artifact@v8
         with:
           pattern: benchmark-*
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 773ee8a..4117030 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -42,19 +42,19 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - run: uv sync --all-extras
 
-      - name: Run tests without cloudpathlib
+      - name: Run non-cloud tests
         run: |
           uv run pytest \
-            -m "not cloud" \
+            -m "not cloud and not benchmark" \
             --junitxml=pytest-cloudless.xml \
             --cov-report=xml:coverage.xml \
-            --cov bids2table \
+            --cov=bids2table \
             tests
 
-      - name: Run tests with cloudpathlib
+      - name: Run cloud tests
         run: |
           uv run pytest \
-            -m "cloud" \
+            -m "cloud and not benchmark" \
             --junitxml=pytest-cloud.xml \
             --cov-report=xml:coverage.xml \
             --cov=bids2table \
diff --git a/pyproject.toml b/pyproject.toml
index a9f7380..92f6514 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,11 +35,11 @@ s3 = [
 pybids = ["pandas>=2.0.0"]
 
 [dependency-groups]
-benchmark = ["polars>=1.40.1", "pytest>=9.0.3", "pytest-benchmark>=5.2.3"]
 dev = [
     "pdoc>=16.0.0",
     "pre-commit>=4.6.0",
     "pytest>=9.0.3",
+    "pytest-benchmark>=5.2.3",
     "pytest-cov>=7.1.0",
     "ruff>=0.15.13",
 ]
diff --git a/tests/benchmarks/test_index.py b/tests/benchmarks/test_index.py
index 9894b58..8c3e37c 100644
--- a/tests/benchmarks/test_index.py
+++ b/tests/benchmarks/test_index.py
@@ -67,6 +67,7 @@ def _teardown(index_fpath: Path):
 
 
 @pytest.mark.benchmark
+@pytest.mark.cloud
 def test_openneuro(benchmark: BenchmarkFixture, tmp_path: Path) -> None:
     """Benchmark b2t2 with a subset of datasets on OpenNeuro."""
     workers = 4
diff --git a/uv.lock b/uv.lock
index e4bcc26..8b75f83 100644
--- a/uv.lock
+++ b/uv.lock
@@ -43,15 +43,12 @@ s3 = [
 ]
 
 [package.dev-dependencies]
-benchmark = [
-    { name = "polars" },
-    { name = "pytest" },
-    { name = "pytest-benchmark" },
-]
 dev = [
     { name = "pdoc" },
+    { name = "polars" },
     { name = "pre-commit" },
     { name = "pytest" },
+    { name = "pytest-benchmark" },
     { name = "pytest-cov" },
     { name = "ruff" },
 ]
@@ -68,15 +65,12 @@ requires-dist = [
 provides-extras = ["cloud", "s3", "pybids"]
 
 [package.metadata.requires-dev]
-benchmark = [
-    { name = "polars", specifier = ">=1.40.1" },
-    { name = "pytest", specifier = ">=9.0.3" },
-    { name = "pytest-benchmark", specifier = ">=5.2.3" },
-]
 dev = [
     { name = "pdoc", specifier = ">=16.0.0" },
+    { name = "polars", specifier = ">=1.40.1" },
     { name = "pre-commit", specifier = ">=4.6.0" },
     { name = "pytest", specifier = ">=9.0.3" },
+    { name = "pytest-benchmark", specifier = ">=5.2.3" },
     { name = "pytest-cov", specifier = ">=7.1.0" },
     { name = "ruff", specifier = ">=0.15.13" },
 ]
@@ -105,9 +99,7 @@ dependencies = [
     { name = "s3transfer" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/67/2f/c4159fa45079b41f11ad17d8c5df8e1d10169b94d1e4240df5be116d3f0a/boto3-1.43.12.tar.gz", hash = "sha256:4a60cdf02c52cb0a60f8dbc986142ce2c31e87e3df1438ffe6755b83008f3e4e", size = 113142, upload-time = "2026-05-20T19:38:13.163Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a2/35/b7ab4b6977811f9887405e24460640033c22f4515cf1e904480710bd6296/boto3-1.43.12-py3-none-any.whl", hash = "sha256:685c3e6093455623bfc22dac55b4946ea243095252f7f9c11a99d84b38033bcf", size = 140537, upload-time = "2026-05-20T19:38:09.995Z" },
-]
+
 
 [[package]]
 name = "botocore"

From 2b2dbcf5732160efb79f7bfa9094630c097cc7a1 Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Wed, 6 May 2026 14:12:48 -0400
Subject: [PATCH 6/9] Fix benchmark workflow bugs

- Switch to shortened SHA for PR
- Add PR for unique output file artifact
- Disable comparison against tag due to lack of dependency group
- Add step to comment on PR
- Sort labels for comment
---
 .github/scripts/compare_benchmarks.py | 46 +++++++++++++++--------
 .github/scripts/run_benchmarks.py     |  2 +-
 .github/workflows/benchmark.yaml      | 54 ++++++++++++++++++++-------
 3 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/.github/scripts/compare_benchmarks.py b/.github/scripts/compare_benchmarks.py
index 531ccd9..2a8a46c 100644
--- a/.github/scripts/compare_benchmarks.py
+++ b/.github/scripts/compare_benchmarks.py
@@ -2,10 +2,17 @@
 """Compare benchmark results across PR, main, and tag and output a markdown table."""
 
 import json
+import logging
+import re
 import statistics
 from pathlib import Path
 from typing import Literal, NamedTuple
 
+_logger = logging.getLogger(__name__)
+
+
+ALERT = 250  # Value (arbitrary; in ms) to indicate difference between benchmarks
+
 
 class BenchmarkResult(NamedTuple):
     fullname: str
@@ -64,9 +71,9 @@ def _delta(pr: BenchmarkResult, ref: BenchmarkResult) -> str:
     if ref == 0:
         return "N/A"
     diff = _scale(pr.median - ref.median)
-    pct = (pr.median / ref.median - 1) * 100
-    icon = "🔴" if pct > 5 else "🟢" if pct < -5 else "⚪"
-    return f"{icon} {diff:+.3f} ms ({pct:+.1f}%)"
+    # Indicator for 250ms absolute diff (arbitrary)
+    icon = "🔴" if diff > ALERT else "🟢" if diff < -ALERT else "⚪"
+    return f"{icon} {diff:+.3f}ms"
 
 
 def _label(result: BenchmarkResult) -> str:
@@ -83,10 +90,13 @@ def _label(result: BenchmarkResult) -> str:
 def build_table(
     pr: dict[str, BenchmarkResult],
     main: dict[str, BenchmarkResult],
-    tag: dict[str, BenchmarkResult],
-    tag_name: str,
+    tag: dict[str, BenchmarkResult] = {},
+    tag_name: str | None = None,
 ) -> str:
     all_keys = set(pr) | set(main) | set(tag)
+    all_keys = sorted(
+        all_keys, key=lambda x: (0 if "index" in x else 1 if "query" in x else 2, x)
+    )
     labels = [_label((pr.get(k) or main.get(k) or tag.get(k))) for k in all_keys]
 
     col_sep = " | "
@@ -110,14 +120,14 @@ def delta_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
         divider,
         row("PR", pr),
         row("main", main),
-        row(tag_name, tag),
+        # row(tag_name, tag),
         divider.replace("-", ""),
         delta_row("PR vs main", main),
-        delta_row(f"PR vs {tag_name}", tag),
+        # delta_row(f"PR vs {tag_name}", tag),
         "",
         "> `median (mean ± std)`",
         "> ",
-        "🔴 >5% slower &nbsp; ⚪ within 5% &nbsp; 🟢 >5% faster",
+        f"> 🔴 >{ALERT}ms slower &nbsp; ⚪ within {ALERT}ms &nbsp; 🟢 >{ALERT}ms faster",
     ]
     return "\n".join(lines)
 
@@ -134,27 +144,33 @@ def main():
     parser.add_argument(
         "-o",
         "--output",
+        type=Path,
         help="Output markdown filepath containing benchmark comparisons",
     )
     args = parser.parse_args()
 
     files = sorted(Path(".").glob(args.pattern))
-    assert len(files) == 3, f"Expected 3 files, found {len(files)}: {files}"
+    assert len(files) > 1, "Expected more than 1 file for benchmark comparison."
 
     # Infer pr/main/tag from directory name
     parsed: dict[str, BenchmarkResult] = {}
     tag = None
     for f in files:
-        stem = f.parent.name  # e.g. "benchmark-pr"
-        key = stem.split("-")[-1]  # "pr", "main", tag
-        if key not in ("pr", "main"):
+        stem = f.name  # e.g. "benchmark-pr-PR-#"
+        key = stem.split("-")[1]  # commit-sha, "main", tag
+
+        # Special cases
+        if re.match(r"^v\d+\.\d+.\d+$", key):
             tag = key
+        elif key != "main":
+            key = "pr"
+
         parsed[key] = parse_file(f)
     if tag is None:
-        raise ValueError("Unknown tag")
-    table = build_table(parsed["pr"], parsed["main"], parsed[tag], tag_name=tag)
+        _logger.warning("Tag not found")
+    table = build_table(parsed["pr"], parsed["main"], parsed.get(tag, {}), tag_name=tag)
     args.output.write_text(table)
-    print(table)
+    _logger.info(table)
 
 
 if __name__ == "__main__":
diff --git a/.github/scripts/run_benchmarks.py b/.github/scripts/run_benchmarks.py
index cf870c0..64ce1e5 100644
--- a/.github/scripts/run_benchmarks.py
+++ b/.github/scripts/run_benchmarks.py
@@ -14,7 +14,7 @@ def main():
     pytest.main(
         [
             "-m",
-            "benchmark and not cloud",
+            "benchmark",
             "--benchmark-save-data",
             f"--benchmark-json={args.output}",
             "--benchmark-time-unit=ms",
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
index cd91f8f..d919a6a 100644
--- a/.github/workflows/benchmark.yaml
+++ b/.github/workflows/benchmark.yaml
@@ -4,31 +4,38 @@ on:
   pull_request:
     branches: [ "main" ]
 
+permissions:
+  pull-requests: write
 jobs:
-  get-tag:
+  prep:
     runs-on: ubuntu-latest
     outputs:
       tag: ${{ steps.last_tag.outputs.tag }}
+      short_sha: ${{ steps.short.outputs.sha }}
     steps:
       - uses: actions/checkout@v6
         with:
           fetch-tags: true
           fetch-depth: 0
       - id: last_tag
-        run: echo ="tag=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
+        run: echo "tag=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
+      - id: short
+        run: echo "sha=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT
 
   benchmark:
-    needs: get-tag
+    needs: prep
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         target:
-          - name: pr
+          - name: ${{ needs.prep.outputs.short_sha }}
             ref: ${{ github.sha }}
           - name: main
             ref: main
-          - name: ${{ needs.get_tag.outputs.tag }}
-            ref: ${{ needs.get_tag.outputs.tag }}
+          # Tag comparison disabled until next release (missing benchmark dependencies)
+          # - name: ${{ needs.prep.outputs.tag }}
+          #   ref: ${{ needs.prep.outputs.tag }}
     steps:
       - uses: actions/checkout@v6
         with:
@@ -38,15 +45,19 @@ jobs:
       - run: uv sync --extra "cloud"
       - name: Run benchmarks
         run: |
-          uv run .github/scripts/run_benchmarks.py \ 
-            --output benchmark-${{matrix.target.name }}.json
+          FILENAME="benchmark-${{ matrix.target.name }}-PR-${{ github.event.pull_request.number }}.json"
+          uv run .github/scripts/run_benchmarks.py --output "$FILENAME"
+          echo "REPORT_PATH=$FILENAME" >> $GITHUB_ENV
       - uses: actions/upload-artifact@v7
         with:
-          name: benchmark-${{ matrix.target.name }}
-          path: benchmark-${{ matrix.target.name }}.json
+          name: benchmark-${{ matrix.target.name }}-PR-${{
+            github.event.pull_request.number }}
+          path: ${{ env.REPORT_PATH }}
+          retention-days: 1
+          overwrite: true
 
   report:
-    needs: [ get-tag, benchmark ]
+    needs: [ prep, benchmark ]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
@@ -54,8 +65,25 @@ jobs:
       - uses: actions/download-artifact@v8
         with:
           pattern: benchmark-*
+          merge-multiple: true
+          path: benchmark-results
       - name: Generate report
         run: |
           uv run .github/scripts/compare_benchmarks.py \
-            --output benchmarks.md \
-            --pattern benchmark-*.json
+            --output "benchmarks.md" \
+            --pattern "benchmark-results/benchmark-*-PR-${{ github.event.pull_request.number }}.json"
+      - name: Find Comment
+        uses: peter-evans/find-comment@v3
+        id: fc
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          comment-author: "github-actions[bot]"
+          body-includes: "Benchmark Results"
+
+      - name: Create / update comment
+        uses: peter-evans/create-or-update-comment@v5
+        with:
+          comment-id: ${{ steps.fc.outputs.comment-id }}
+          issue-number: ${{ github.event.pull_request.number }}
+          body-path: "benchmarks.md"
+          edit-mode: replace

From cea0a0bbb8040bde52ae49493656ff5b863b3d16 Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Wed, 13 May 2026 17:25:57 -0400
Subject: [PATCH 7/9] Add benchmarking script

- Fold CI scripts into local benchmark script
- Remove CI workflow
- Use pytest's importlib import mode to allow identical file names across different test modules
---
 .github/scripts/compare_benchmarks.py | 177 -------------
 .github/scripts/run_benchmarks.py     |  27 --
 .github/workflows/benchmark.yaml      |  82 +-----
 .gitignore                            |   1 +
 pyproject.toml                        |   2 +
 scripts/benchmark.py                  | 368 ++++++++++++++++++++++++++
 uv.lock                               |   4 +-
 7 files changed, 384 insertions(+), 277 deletions(-)
 delete mode 100644 .github/scripts/compare_benchmarks.py
 delete mode 100644 .github/scripts/run_benchmarks.py
 create mode 100644 scripts/benchmark.py

diff --git a/.github/scripts/compare_benchmarks.py b/.github/scripts/compare_benchmarks.py
deleted file mode 100644
index 2a8a46c..0000000
--- a/.github/scripts/compare_benchmarks.py
+++ /dev/null
@@ -1,177 +0,0 @@
-#!/usr/bin/env python
-"""Compare benchmark results across PR, main, and tag and output a markdown table."""
-
-import json
-import logging
-import re
-import statistics
-from pathlib import Path
-from typing import Literal, NamedTuple
-
-_logger = logging.getLogger(__name__)
-
-
-ALERT = 250  # Value (arbitrary; in ms) to indicate difference between benchmarks
-
-
-class BenchmarkResult(NamedTuple):
-    fullname: str
-    kind: Literal["index", "query"]
-    locality: Literal["local", "remote"] | None = None
-    workers: int | None = None
-    median: float = 0.0
-    mean: float = 0.0
-    stddev: float = 0.0
-
-
-def parse_file(path: Path) -> dict[str, BenchmarkResult]:
-    data = json.loads(path.read_text())
-    results = {}
-    for benchmark in data["benchmarks"]:
-        fullname: str = benchmark["fullname"]
-        data_trimmed = benchmark["stats"]["data"][1:]
-        median = statistics.median(data_trimmed)
-        mean = statistics.mean(data_trimmed)
-        stddev = statistics.stdev(data_trimmed)
-
-        if "query" in fullname:
-            result = BenchmarkResult(
-                fullname=fullname, kind="query", median=median, mean=mean, stddev=stddev
-            )
-        else:
-            locality: Literal["local", "remote"] = (
-                "remote" if "openneuro" in fullname or "s3" in fullname else "local"
-            )
-            workers = benchmark["extra_info"].get("workers", "Unknown")
-            result = BenchmarkResult(
-                fullname=fullname,
-                kind="index",
-                locality=locality,
-                workers=workers,
-                median=median,
-                mean=mean,
-                stddev=stddev,
-            )
-        results[fullname] = result
-    return results
-
-
-def _scale(val: float) -> float:
-    return val * 1000
-
-
-def _fmt(res: BenchmarkResult) -> str:
-    median = _scale(res.median)
-    mean = _scale(res.mean)
-    stddev = _scale(res.stddev)
-    return f"{median:.3f} ({mean:.3f} ± {stddev:.3f}) ms"
-
-
-def _delta(pr: BenchmarkResult, ref: BenchmarkResult) -> str:
-    if ref == 0:
-        return "N/A"
-    diff = _scale(pr.median - ref.median)
-    # Indicator for 250ms absolute diff (arbitrary)
-    icon = "🔴" if diff > ALERT else "🟢" if diff < -ALERT else "⚪"
-    return f"{icon} {diff:+.3f}ms"
-
-
-def _label(result: BenchmarkResult) -> str:
-    if result.kind == "query":
-        return (
-            result.fullname.split("::")[-1]
-            .replace("test_", "")
-            .replace("_", " ")
-            .capitalize()
-        )
-    return f"{result.locality.capitalize()} index ({result.workers} workers)"
-
-
-def build_table(
-    pr: dict[str, BenchmarkResult],
-    main: dict[str, BenchmarkResult],
-    tag: dict[str, BenchmarkResult] = {},
-    tag_name: str | None = None,
-) -> str:
-    all_keys = set(pr) | set(main) | set(tag)
-    all_keys = sorted(
-        all_keys, key=lambda x: (0 if "index" in x else 1 if "query" in x else 2, x)
-    )
-    labels = [_label((pr.get(k) or main.get(k) or tag.get(k))) for k in all_keys]
-
-    col_sep = " | "
-    header = "| |" + col_sep.join(f" **{label}** " for label in labels) + " |"
-    divider = "|-|" + "|".join("---" for _ in all_keys) + "|"
-
-    def row(name: str, results: dict[str, BenchmarkResult]) -> str:
-        cells = [_fmt(results[k]) if k in results else "—" for k in all_keys]
-        return "| **" + name + "** |" + col_sep.join(f" {c} " for c in cells) + " |"
-
-    def delta_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
-        cells = [
-            _delta(pr[k], ref[k]) if k in pr and k in ref else "—" for k in all_keys
-        ]
-        return "| *" + label + "* |" + col_sep.join(f" {c} " for c in cells) + " |"
-
-    lines = [
-        "## Benchmark Results",
-        "",
-        header,
-        divider,
-        row("PR", pr),
-        row("main", main),
-        # row(tag_name, tag),
-        divider.replace("-", ""),
-        delta_row("PR vs main", main),
-        # delta_row(f"PR vs {tag_name}", tag),
-        "",
-        "> `median (mean ± std)`",
-        "> ",
-        f"> 🔴 >{ALERT}ms slower &nbsp; ⚪ within {ALERT}ms &nbsp; 🟢 >{ALERT}ms faster",
-    ]
-    return "\n".join(lines)
-
-
-def main():
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--pattern",
-        default="benchmark-*.json",
-        help="Glob pattern for benchmark JSON files",
-    )
-    parser.add_argument(
-        "-o",
-        "--output",
-        type=Path,
-        help="Output markdown filepath containing benchmark comparisons",
-    )
-    args = parser.parse_args()
-
-    files = sorted(Path(".").glob(args.pattern))
-    assert len(files) > 1, "Expected more than 1 file for benchmark comparison."
-
-    # Infer pr/main/tag from directory name
-    parsed: dict[str, BenchmarkResult] = {}
-    tag = None
-    for f in files:
-        stem = f.name  # e.g. "benchmark-pr-PR-#"
-        key = stem.split("-")[1]  # commit-sha, "main", tag
-
-        # Special cases
-        if re.match(r"^v\d+\.\d+.\d+$", key):
-            tag = key
-        elif key != "main":
-            key = "pr"
-
-        parsed[key] = parse_file(f)
-    if tag is None:
-        _logger.warning("Tag not found")
-    table = build_table(parsed["pr"], parsed["main"], parsed.get(tag, {}), tag_name=tag)
-    args.output.write_text(table)
-    _logger.info(table)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/.github/scripts/run_benchmarks.py b/.github/scripts/run_benchmarks.py
deleted file mode 100644
index 64ce1e5..0000000
--- a/.github/scripts/run_benchmarks.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python
-"""Perform benchmarks across PR commit, main, and previous tag."""
-
-import argparse
-
-import pytest
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output", required=True, help="Output JSON file path")
-    args = parser.parse_args()
-
-    pytest.main(
-        [
-            "-m",
-            "benchmark",
-            "--benchmark-save-data",
-            f"--benchmark-json={args.output}",
-            "--benchmark-time-unit=ms",
-            "--benchmark-warmup=on",
-        ]
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
index d919a6a..5649216 100644
--- a/.github/workflows/benchmark.yaml
+++ b/.github/workflows/benchmark.yaml
@@ -2,88 +2,26 @@ name: Benchmark
 
 on:
   pull_request:
-    branches: [ "main" ]
+    branches: ["main"]
 
-permissions:
-  pull-requests: write
 jobs:
-  prep:
-    runs-on: ubuntu-latest
-    outputs:
-      tag: ${{ steps.last_tag.outputs.tag }}
-      short_sha: ${{ steps.short.outputs.sha }}
-    steps:
-      - uses: actions/checkout@v6
-        with:
-          fetch-tags: true
-          fetch-depth: 0
-      - id: last_tag
-        run: echo "tag=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
-      - id: short
-        run: echo "sha=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT
-
   benchmark:
-    needs: prep
     runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        target:
-          - name: ${{ needs.prep.outputs.short_sha }}
-            ref: ${{ github.sha }}
-          - name: main
-            ref: main
-          # Tag comparison disabled until next release (missing benchmark dependencies)
-          # - name: ${{ needs.prep.outputs.tag }}
-          #   ref: ${{ needs.prep.outputs.tag }}
+    env:
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
     steps:
       - uses: actions/checkout@v6
         with:
-          ref: ${{ matrix.target.ref }}
+          fetch-depth: 0
           submodules: true
+          ref: ${{ github.head_ref }}
       - uses: astral-sh/setup-uv@v8.1.0
-      - run: uv sync --extra "cloud"
+      - run: uv sync --frozen --all-extras
       - name: Run benchmarks
+        id: run-benchmarks
         run: |
-          FILENAME="benchmark-${{ matrix.target.name }}-PR-${{ github.event.pull_request.number }}.json"
-          uv run .github/scripts/run_benchmarks.py --output "$FILENAME"
-          echo "REPORT_PATH=$FILENAME" >> $GITHUB_ENV
+          uv run python scripts/benchmark.py --branch $BRANCH_NAME
       - uses: actions/upload-artifact@v7
         with:
-          name: benchmark-${{ matrix.target.name }}-PR-${{
-            github.event.pull_request.number }}
-          path: ${{ env.REPORT_PATH }}
-          retention-days: 1
-          overwrite: true
-
-  report:
-    needs: [ prep, benchmark ]
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v6
-      - uses: astral-sh/setup-uv@v8.1.0
-      - uses: actions/download-artifact@v8
-        with:
-          pattern: benchmark-*
-          merge-multiple: true
-          path: benchmark-results
-      - name: Generate report
-        run: |
-          uv run .github/scripts/compare_benchmarks.py \
-            --output "benchmarks.md" \
-            --pattern "benchmark-results/benchmark-*-PR-${{ github.event.pull_request.number }}.json"
-      - name: Find Comment
-        uses: peter-evans/find-comment@v3
-        id: fc
-        with:
-          issue-number: ${{ github.event.pull_request.number }}
-          comment-author: "github-actions[bot]"
-          body-includes: "Benchmark Results"
-
-      - name: Create / update comment
-        uses: peter-evans/create-or-update-comment@v5
-        with:
-          comment-id: ${{ steps.fc.outputs.comment-id }}
-          issue-number: ${{ github.event.pull_request.number }}
-          body-path: "benchmarks.md"
-          edit-mode: replace
+          name: benchmark-${{ matrix.target.name }}
+          path: ${{ steps.run-benchmarks.outputs.report_file }}
diff --git a/.gitignore b/.gitignore
index 748e2f9..2d1a0f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ htmlcov
 
 # Local data and scratch
 .scratch
+benchmarks/
 
 # Local virtual environment
 .venv
diff --git a/pyproject.toml b/pyproject.toml
index 92f6514..17e249e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ pybids = ["pandas>=2.0.0"]
 [dependency-groups]
 dev = [
     "pdoc>=16.0.0",
+    "polars>=1.40.1",
     "pre-commit>=4.6.0",
     "pytest>=9.0.3",
     "pytest-benchmark>=5.2.3",
@@ -69,6 +70,7 @@ lint.extend-select = ["I"]
 [tool.pytest.ini_options]
 log_cli = true
 log_cli_level = "INFO"
+addopts = "--import-mode=importlib"
 markers = [
     "benchmark: Tests used for benchmarking",
     "cloud: Tests requiring cloud group dependencies",
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
new file mode 100644
index 0000000..2ae13b3
--- /dev/null
+++ b/scripts/benchmark.py
@@ -0,0 +1,368 @@
+# /// script
+# requires-python = ">=3.13"
+# dependencies = []
+# ///
+"""Perform benchmarking of bids2table against last tag, main and feature branches.
+
+Run with:
+    uv run --with <repo> scripts/benchmark.py -b <feature_branch> [-o <output_dir>]
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import statistics
+import subprocess
+import sys
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Literal, NamedTuple
+
+import pytest
+
+logging.basicConfig(level=logging.INFO)
+_logger = logging.getLogger("bids2table.benchmark")
+
+
+# Suppression and resetting (after checkout) necessary due to streaming of outputs
+@contextmanager
+def _suppress_log_exceptions():
+    logging.raiseExceptions = False
+    try:
+        yield
+    finally:
+        logging.raiseExceptions = True
+
+
+def _reset_logger():
+    """Rebind root logging to the current ``sys.stderr``."""
+    # force=True drops the existing root handlers; without it basicConfig is a
+    # no-op once the root logger has handlers (as it does after module import).
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO, force=True)
+
+
+class Git:
+    """Class to simplify git calls via subprocess."""
+
+    def __init__(self):
+        """Record the repository root and the ref checked out at construction."""
+        self.repo_path = self._root()
+        self._head_ref = self._run("rev-parse", "--abbrev-ref", "HEAD")
+
+    def __enter__(self):
+        if bool(self._run("status", "--porcelain")):
+            _logger.error("Please stash or commit changes before benchmarking.")
+            sys.exit(1)
+        self.pull()
+        self.submodule_update()
+        return self
+
+    def __exit__(self, *_):
+        """On context closure, check out the ref recorded at construction."""
+        self.checkout(self._head_ref)
+
+    @staticmethod
+    def _root() -> Path:
+        result = subprocess.run(
+            ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True
+        )
+        return Path(result.stdout.strip())
+
+    def _run(self, *args: str) -> str:
+        result = subprocess.run(
+            ["git", "-C", str(self.repo_path), *args], capture_output=True, text=True
+        )
+        if result.returncode != 0:
+            _logger.error(result.stderr.strip())
+            sys.exit(result.returncode)
+        return result.stdout.strip()
+
+    def checkout(self, ref: str) -> None:
+        """Checkout reference.
+
+        Args:
+            ref: Reference to checkout (e.g. branch, SHA, tag)
+        """
+        self._run("checkout", ref)
+
+    def pull(self) -> None:
+        """Pull from the remote repository."""
+        self._run("pull")
+
+    def submodule_update(self) -> None:
+        """Update submodules of the repo, initializing if necessary."""
+        self._run("submodule", "update", "--init", "--recursive")
+
+    def last_tag(self) -> str:
+        """Get last tag.
+
+        Returns:
+            A string value of the last tag
+        """
+        return self._run("describe", "--tags", "--abbrev=0")
+
+
+class BenchmarkResult(NamedTuple):
+    fullname: str
+    kind: Literal["index", "query"]
+    locality: Literal["local", "remote"] | None = None
+    workers: int = 1
+    median: float = 0.0
+    mean: float = 0.0
+    stddev: float = 0.0
+
+
+def parse_file(path: Path) -> dict[str, BenchmarkResult]:
+    data = json.loads(path.read_text())
+    results = {}
+    for benchmark in data["benchmarks"]:
+        fullname: str = benchmark["fullname"]
+        data_trimmed = benchmark["stats"]["data"][1:]
+        median = statistics.median(data_trimmed)
+        mean = statistics.mean(data_trimmed)
+        stddev = statistics.stdev(data_trimmed)
+
+        if "query" in fullname:
+            result = BenchmarkResult(
+                fullname=fullname, kind="query", median=median, mean=mean, stddev=stddev
+            )
+        else:
+            locality: Literal["local", "remote"] = (
+                "remote" if "openneuro" in fullname or "s3" in fullname else "local"
+            )
+            workers = benchmark["extra_info"].get("workers", "Unknown")
+            result = BenchmarkResult(
+                fullname=fullname,
+                kind="index",
+                locality=locality,
+                workers=workers,
+                median=median,
+                mean=mean,
+                stddev=stddev,
+            )
+        results[fullname] = result
+    return results
+
+
+# Values are always provided in seconds in the json outputs.
+# Need to scale appropriately (also noting factor and unit to pass along for
+# formatting).
+class Value(NamedTuple):
+    value: float
+    factor: float
+    unit: str
+
+
+def _scale(val: float) -> Value:
+    if val >= 1.0:
+        return Value(value=val, factor=1, unit="s")
+    elif val >= 1e-3:
+        return Value(value=val * 1e3, factor=1e3, unit="ms")
+    else:
+        return Value(value=val * 1e6, factor=1e6, unit="µs")
+
+
+def _fmt(res: BenchmarkResult) -> str:
+    median = _scale(res.median)
+    mean = res.mean * median.factor
+    stddev = res.stddev * median.factor
+    return f"{median.value:.3f} ({mean:.3f} ± {stddev:.3f}) {median.unit}"
+
+
+def _ratio(pr: BenchmarkResult, ref: BenchmarkResult) -> str:
+    ratio = pr.median / ref.median
+    icon = "🔴" if ratio > 1 else "🟢" if ratio < 1 else "⚪"
+    return f"{icon} {ratio:.2f}"
+
+
+def _label(result: BenchmarkResult) -> str:
+    if result.kind == "query":
+        return (
+            result.fullname.split("::")[-1]
+            .replace("test_", "")
+            .replace("_", " ")
+            .capitalize()
+        )
+    return f"{result.locality.capitalize()} index ({result.workers} workers)"
+
+
+def build_table(
+    branch_name: str,
+    branch: dict[str, BenchmarkResult],
+    main: dict[str, BenchmarkResult],
+    tag: dict[str, BenchmarkResult] | None = None,
+) -> str:
+    tag = tag or {}
+    all_keys = sorted(
+        set(branch) | set(main) | set(tag),
+        key=lambda x: (0 if "index" in x else 1 if "query" in x else 2, x),
+    )
+    labels = [_label(branch.get(k) or main.get(k) or tag.get(k)) for k in all_keys]
+
+    col_sep = " | "
+    header = "| |" + col_sep.join(f" **{label}** " for label in labels) + " |"
+    divider = "|-|" + "|".join("---" for _ in all_keys) + "|"
+
+    def row(name: str, results: dict[str, BenchmarkResult]) -> str:
+        cells = [_fmt(results[k]) if k in results else "—" for k in all_keys]
+        return "| **" + name + "** |" + col_sep.join(f" {c} " for c in cells) + " |"
+
+    def ratio_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
+        cells = [
+            _ratio(branch[k], ref[k]) if k in branch and k in ref else "—"
+            for k in all_keys
+        ]
+        return "| *" + label + "* |" + col_sep.join(f" {c} " for c in cells) + " |"
+
+    lines = [
+        "## Benchmark Results",
+        "",
+        header,
+        divider,
+        row(branch_name, branch),
+        row("main", main),
+        divider.replace("-", ""),
+        ratio_row(f"{branch_name} vs main ratio", main),
+        "",
+        "> `median (mean ± std)`",
+        "> ",
+        "> 🔴 Slower &nbsp; ⚪ No change &nbsp; 🟢 Faster",
+    ]
+    return "\n".join(lines)
+
+
+def _parser() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-b", "--branch", required=True, help="PR branch to benchmark")
+    parser.add_argument(
+        "-o",
+        "--output-dir",
+        default="benchmarks",
+        type=Path,
+        help="Output directory to save benchmarks to",
+    )
+    return parser.parse_args()
+
+
+def _sanitize(s: str) -> str:
+    return s.replace("/", "-")
+
+
+def run_benchmark(git: Git, branch: str, out_dir: Path) -> None:
+    """Benchmark each target ref, writing one pytest-benchmark JSON file per ref.
+
+    Args:
+        git: Representation of current git repository for benchmarking
+        branch: Feature branch to benchmark
+        out_dir: Output directory to save benchmarks to
+    """
+    tag = git.last_tag()
+    targets = {branch: branch, "main": "main", tag: None}
+
+    with _suppress_log_exceptions():
+        for name, ref in targets.items():
+            # Skip if the reference is not provided
+            if ref is None:
+                continue
+            git.checkout(ref)
+            _reset_logger()
+            _logger.info("Running benchmarks for '%s'", name)
+
+            safe_name = _sanitize(name)
+            fname = out_dir / f"benchmark-{safe_name}.json"
+            if fname.exists():
+                _logger.warning(
+                    "Existing benchmarks found for %s. File will be overwritten.", fname
+                )
+
+            # Use a fresh interpreter per ref: in-process pytest.main() calls reuse
+            # cached imports, so later refs would re-time the first checkout.
+            pytest_cmd = [sys.executable, "-m", "pytest", "-m", "benchmark"]
+            subprocess.run(
+                [
+                    *pytest_cmd,
+                    "--benchmark-save-data",
+                    f"--benchmark-json={fname}",
+                    "--benchmark-time-unit=ms",
+                    "--benchmark-warmup=on",
+                    f"{git.repo_path}/tests",
+                ]
+            )
+
+
+def generate_report(git: Git, branch: str, out_dir: Path) -> Path:
+    """Generate markdown report from benchmarks.
+
+    Args:
+        git: Representation of current git repository for benchmarking
+        branch: Feature branch benchmarked
+        out_dir: Directory benchmarks are saved to / output report to
+
+    Returns:
+        Path to file containing benchmark comparison table
+
+    Raises:
+        AssertionError: if less than 2 benchmark files found
+    """
+    with _suppress_log_exceptions():
+        git.checkout(branch)
+        _reset_logger()
+        _logger.info("Generating benchmark report")
+
+        files = sorted(out_dir.glob("benchmark-*.json"))
+        if len(files) < 2:
+            raise AssertionError(
+                "Expected 2 or more benchmark files to perform comparisons."
+            )
+
+        tag = git.last_tag()
+        parsed: dict[str, dict[str, BenchmarkResult]] = {}
+        for f in files:
+            if not f.exists():
+                _logger.warning("File %s does not exist - skipping", f)
+                continue
+            key = f.stem.split("-")[1]
+            if key == tag:
+                pass  # keep as tag name
+            elif key != "main":
+                key = branch
+            parsed[key] = parse_file(f)
+
+        if tag not in parsed:
+            _logger.warning("Tag '%s' not found in benchmark files.", tag)
+
+        report_contents = build_table(
+            branch,
+            parsed[branch],
+            parsed["main"],
+            None,  # parsed.get(tag)
+        )
+        dt = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M")
+        report_file = out_dir / f"benchmark-{_sanitize(branch)}-{dt}.md"
+        report_file.write_text(report_contents)
+        _logger.info("Report written to %s", report_file)
+
+        return report_file
+
+
+def main() -> None:
+    args = _parser()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    with Git() as git:
+        run_benchmark(git=git, branch=args.branch, out_dir=args.output_dir)
+        report_file = generate_report(
+            git=git, branch=args.branch, out_dir=args.output_dir
+        )
+
+        if "GITHUB_OUTPUT" in os.environ:
+            with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+                f.write(f"report_file={report_file}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/uv.lock b/uv.lock
index 8b75f83..1a52de3 100644
--- a/uv.lock
+++ b/uv.lock
@@ -99,7 +99,9 @@ dependencies = [
     { name = "s3transfer" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/67/2f/c4159fa45079b41f11ad17d8c5df8e1d10169b94d1e4240df5be116d3f0a/boto3-1.43.12.tar.gz", hash = "sha256:4a60cdf02c52cb0a60f8dbc986142ce2c31e87e3df1438ffe6755b83008f3e4e", size = 113142, upload-time = "2026-05-20T19:38:13.163Z" }
-
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a2/35/b7ab4b6977811f9887405e24460640033c22f4515cf1e904480710bd6296/boto3-1.43.12-py3-none-any.whl", hash = "sha256:685c3e6093455623bfc22dac55b4946ea243095252f7f9c11a99d84b38033bcf", size = 140537, upload-time = "2026-05-20T19:38:09.995Z" },
+]
 
 [[package]]
 name = "botocore"

From d72905367c25915e4de3f34578dfe49900c660b4 Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Fri, 29 May 2026 11:02:33 -0400
Subject: [PATCH 8/9] Re-add benchmark CI reporting

- Rename coverage.yaml -> report.yaml and expand to handle both CI (coverage)
  and Benchmark results in a single workflow
- Benchmark workflow: fix undefined matrix reference in artifact name, record
  the PR number, and upload it with the report file as artifacts
- scripts/benchmark.py: add -f/--output-file flag to allow specifying a
  fixed output filename for CI artifact consumption
- Report workflow conditionally downloads the correct artifact based on
  which upstream workflow triggered it, posts coverage comment for CI
  and creates/updates comment for benchmarks
---
 .github/workflows/benchmark.yaml | 15 ++++++--
 .github/workflows/coverage.yaml  | 33 ----------------
 .github/workflows/report.yaml    | 64 ++++++++++++++++++++++++++++++++
 scripts/benchmark.py             | 22 +++++++++--
 4 files changed, 94 insertions(+), 40 deletions(-)
 delete mode 100644 .github/workflows/coverage.yaml
 create mode 100644 .github/workflows/report.yaml

diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
index 5649216..3b4635d 100644
--- a/.github/workflows/benchmark.yaml
+++ b/.github/workflows/benchmark.yaml
@@ -1,5 +1,8 @@
 name: Benchmark
 
+permissions:
+  contents: read
+
 on:
   pull_request:
     branches: ["main"]
@@ -20,8 +23,14 @@ jobs:
       - name: Run benchmarks
         id: run-benchmarks
         run: |
-          uv run python scripts/benchmark.py --branch $BRANCH_NAME
+          uv run python scripts/benchmark.py -b $BRANCH_NAME -o . -f benchmarks.md
+      - name: Record pr number
+        run: |
+          echo "${{ github.event.number }}" > pr-number.txt
       - uses: actions/upload-artifact@v7
         with:
-          name: benchmark-${{ matrix.target.name }}
-          path: ${{ steps.run-benchmarks.outputs.report_file }}
+          name: benchmark
+          retention-days: 7
+          path: |
+            benchmarks.md
+            pr-number.txt
diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml
deleted file mode 100644
index 5083e10..0000000
--- a/.github/workflows/coverage.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Coverage report
-
-on:
-  workflow_run:
-    workflows: ["CI"]
-    types: [completed]
-
-permissions:
-  pull-requests: write
-  actions: read
-
-jobs:
-  coverage:
-    if: >-
-      github.event.workflow_run.event == 'pull_request' &&
-      github.event.workflow_run.conclusion == 'success'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/download-artifact@v8
-        with:
-          name: coverage
-          run-id: ${{ github.event.workflow_run.id }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-
-      - id: pr
-        run: echo "number=$(cat pr-number.txt)" >> "$GITHUB_OUTPUT"
-
-      - name: Pytest coverage comment
-        uses: MishaKav/pytest-coverage-comment@v1
-        with:
-          issue-number: ${{ steps.pr.outputs.number }}
-          pytest-xml-coverage-path: ./coverage.xml
-          junitxml-path: ./pytest.xml
diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml
new file mode 100644
index 0000000..49eee9e
--- /dev/null
+++ b/.github/workflows/report.yaml
@@ -0,0 +1,64 @@
+name: Report
+
+on:
+  workflow_run:
+    workflows: ["CI", "Benchmark"]
+    types: [completed]
+
+permissions:
+  pull-requests: write
+  actions: read
+
+jobs:
+  report:
+    if: >-
+      github.event.workflow_run.event == 'pull_request' &&
+      github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download CI artifact
+        if: github.event.workflow_run.name == 'CI'
+        continue-on-error: true
+        uses: actions/download-artifact@v8
+        with:
+          name: coverage
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Download Benchmark artifact
+        if: github.event.workflow_run.name == 'Benchmark'
+        continue-on-error: true
+        uses: actions/download-artifact@v8
+        with:
+          name: benchmark
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - id: pr
+        run: echo "number=$(cat pr-number.txt)" >> "$GITHUB_OUTPUT"
+
+      - name: Pytest coverage comment
+        if: github.event.workflow_run.name == 'CI'
+        uses: MishaKav/pytest-coverage-comment@v1
+        with:
+          issue-number: ${{ steps.pr.outputs.number }}
+          pytest-xml-coverage-path: ./coverage.xml
+          junitxml-path: ./pytest.xml
+
+      - name: Benchmark find comment
+        if: github.event.workflow_run.name == 'Benchmark'
+        uses: peter-evans/find-comment@v3
+        id: fc
+        with:
+          issue-number: ${{ steps.pr.outputs.number }}
+          comment-author: "github-actions[bot]"
+          body-includes: "Benchmark Results"
+
+      - name: Benchmark create / update comment
+        if: github.event.workflow_run.name == 'Benchmark'
+        uses: peter-evans/create-or-update-comment@v5
+        with:
+          comment-id: ${{ steps.fc.outputs.comment-id }}
+          issue-number: ${{ steps.pr.outputs.number }}
+          body-path: "benchmarks.md"
+          edit-mode: replace
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 2ae13b3..a92e049 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -245,6 +245,13 @@ def _parser() -> argparse.Namespace:
         type=Path,
         help="Output directory to save benchmarks to",
     )
+    parser.add_argument(
+        "-f",
+        "--output-file",
+        required=False,
+        type=str,
+        help="Output file name",
+    )
     return parser.parse_args()
 
 
@@ -294,7 +301,9 @@ def run_benchmark(git: Git, branch: str, out_dir: Path) -> None:
             )
 
 
-def generate_report(git: Git, branch: str, out_dir: Path) -> Path:
+def generate_report(
+    git: Git, branch: str, out_dir: Path, out_fname: str | None = None
+) -> Path:
     """Generate markdown report from benchmarks.
 
     Args:
@@ -341,8 +350,10 @@ def generate_report(git: Git, branch: str, out_dir: Path) -> Path:
             parsed["main"],
             None,  # parsed.get(tag)
         )
-        dt = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M")
-        report_file = out_dir / f"benchmark-{_sanitize(branch)}-{dt}.md"
+        if out_fname is None:
+            dt = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M")
+            out_fname = f"benchmark-{_sanitize(branch)}-{dt}.md"
+        report_file = out_dir / out_fname
         report_file.write_text(report_contents)
         _logger.info("Report written to %s", report_file)
 
@@ -356,7 +367,10 @@ def main() -> None:
     with Git() as git:
         run_benchmark(git=git, branch=args.branch, out_dir=args.output_dir)
         report_file = generate_report(
-            git=git, branch=args.branch, out_dir=args.output_dir
+            git=git,
+            branch=args.branch,
+            out_dir=args.output_dir,
+            out_fname=args.output_file,
         )
 
         if "GITHUB_OUTPUT" in os.environ:

From d98efc8cf8cd24a351f04f2bfd879606abe0ae84 Mon Sep 17 00:00:00 2001
From: Jason Kai <21226986+kaitj@users.noreply.github.com>
Date: Fri, 29 May 2026 13:26:16 -0400
Subject: [PATCH 9/9] Add configurable threshold for benchmark ratio

- Add --threshold/-t flag to benchmark script (default 0.05) to define
  the minimum ratio difference before marking a change as slower/faster
- Rewrite _ratio to compare % change against threshold
- Display 3 decimal places in ratio output to surface small differences
---
 .github/workflows/benchmark.yaml |  2 +-
 scripts/benchmark.py             | 34 +++++++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
index 3b4635d..8ef00bb 100644
--- a/.github/workflows/benchmark.yaml
+++ b/.github/workflows/benchmark.yaml
@@ -23,7 +23,7 @@ jobs:
       - name: Run benchmarks
         id: run-benchmarks
         run: |
-          uv run python scripts/benchmark.py -b $BRANCH_NAME -o . -f benchmarks.md
+          uv run python scripts/benchmark.py -b $BRANCH_NAME -o . -f benchmarks.md -t 0.05
       - name: Record pr number
         run: |
           echo "${{ github.event.number }}" > pr-number.txt
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index a92e049..b324ee0 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -5,7 +5,8 @@
 """Perform benchmarking of bids2table against last tag, main and feature branches.
 
 Run with:
-    uv run --with <repo> scripts/benchmark.py -b <feature_branch> [-o <output_dir>]
+    uv run --with <repo> scripts/benchmark.py \
+        -b <feature_branch> [-o <output_dir>] [-f <output_file>] [-t <threshold>]
 """
 
 from __future__ import annotations
@@ -173,10 +174,15 @@ def _fmt(res: BenchmarkResult) -> str:
     return f"{median.value:.3f} ({mean:.3f} ± {stddev:.3f}) {median.unit}"
 
 
-def _ratio(pr: BenchmarkResult, ref: BenchmarkResult) -> str:
+def _ratio(pr: BenchmarkResult, ref: BenchmarkResult, threshold: float) -> str:
     ratio = pr.median / ref.median
-    icon = "🔴" if ratio > 1 else "🟢" if ratio < 1 else "⚪"
-    return f"{icon} {ratio:.2f}"
+    if abs(1 - ratio) <= threshold:
+        icon = "⚪"
+    elif ratio > 1:
+        icon = "🔴"
+    else:
+        icon = "🟢"
+    return f"{icon} {ratio:.3f}"
 
 
 def _label(result: BenchmarkResult) -> str:
@@ -191,6 +197,7 @@ def _label(result: BenchmarkResult) -> str:
 
 
 def build_table(
+    threshold: float,
     branch_name: str,
     branch: dict[str, BenchmarkResult],
     main: dict[str, BenchmarkResult],
@@ -213,7 +220,7 @@ def row(name: str, results: dict[str, BenchmarkResult]) -> str:
 
     def ratio_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
         cells = [
-            _ratio(branch[k], ref[k]) if k in branch and k in ref else "—"
+            _ratio(branch[k], ref[k], threshold) if k in branch and k in ref else "—"
             for k in all_keys
         ]
         return "| *" + label + "* |" + col_sep.join(f" {c} " for c in cells) + " |"
@@ -230,7 +237,7 @@ def ratio_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
         "",
         "> `median (mean ± std)`",
         "> ",
-        "> 🔴 Slower &nbsp; ⚪ No change &nbsp; 🟢 Faster",
+        f"> 🔴 Slower &nbsp; ⚪ No change (<{threshold * 100:.0f} %) &nbsp; 🟢 Faster",
     ]
     return "\n".join(lines)
 
@@ -252,6 +259,13 @@ def _parser() -> argparse.Namespace:
         type=str,
         help="Output file name",
     )
+    parser.add_argument(
+        "-t",
+        "--threshold",
+        default=0.05,
+        type=float,
+        help="Threshold for performance to be considered unchanged",
+    )
     return parser.parse_args()
 
 
@@ -302,14 +316,16 @@ def run_benchmark(git: Git, branch: str, out_dir: Path) -> None:
 
 
 def generate_report(
-    git: Git, branch: str, out_dir: Path, out_fname: str | None = None
+    git: Git, branch: str, threshold: float, out_dir: Path, out_fname: str | None = None
 ) -> Path:
     """Generate markdown report from benchmarks.
 
     Args:
         git: Representation of current git repository for benchmarking
         branch: Feature branch benchmarked
+        threshold: Threshold for performance to be considered unchanged
         out_dir: Directory benchmarks are saved to / output report to
+        out_fname: Benchmark output file name
 
     Returns:
         Path to file containing benchmark comparison table
@@ -345,6 +361,7 @@ def generate_report(
             _logger.warning("Tag '%s' not found in benchmark files.", tag)
 
         report_contents = build_table(
+            threshold,
             branch,
             parsed[branch],
             parsed["main"],
@@ -362,6 +379,8 @@ def generate_report(
 
 def main() -> None:
     args = _parser()
+    if abs(args.threshold) > 1:
+        raise ValueError(f"Threshold should be between 0 and 1, got: {args.threshold}")
     args.output_dir.mkdir(parents=True, exist_ok=True)
 
     with Git() as git:
@@ -369,6 +388,7 @@ def main() -> None:
         report_file = generate_report(
             git=git,
             branch=args.branch,
+            threshold=args.threshold,
             out_dir=args.output_dir,
             out_fname=args.output_file,
         )