diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7e79a6d..a6df1e3 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -211,6 +211,79 @@ jobs:
         cd build
         ctest --output-on-failure --parallel
 
+  # ── GCC 16 Reflection-specific validation ────────────────────────
+  reflection-linux:
+    name: Reflection (Linux, GCC 16, C++26)
+    runs-on: ubuntu-24.04
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Add toolchain PPA
+      run: |
+        sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+        sudo apt-get update
+
+    - name: Install dependencies
+      run: |
+        sudo apt-get install -y cmake ninja-build gcc-16 g++-16
+
+    - name: Configure CMake
+      env:
+        CC: gcc-16
+        CXX: g++-16
+      run: |
+        cmake -B build -G Ninja \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_CXX_STANDARD=26 \
+          -DTHREADSCHEDULE_BUILD_EXAMPLES=OFF \
+          -DTHREADSCHEDULE_BUILD_TESTS=ON \
+          -DTHREADSCHEDULE_ENABLE_REFLECTION=ON
+
+    - name: Build
+      run: cmake --build build --parallel
+
+    - name: Run reflection-focused tests  # --no-tests=error: fail instead of passing vacuously when the regex matches nothing
+      run: |
+        ctest --test-dir build \
+          --output-on-failure --no-tests=error \
+          --tests-regex 'ReflectionApiTest|RegistryQueryTest\.Reflection'
+
+  reflection-modules-linux:
+    name: Reflection Modules (Linux, GCC 16, C++26)
+    runs-on: ubuntu-24.04
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Add toolchain PPA
+      run: |
+        sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+        sudo apt-get update
+
+    - name: Install dependencies
+      run: |
+        sudo apt-get install -y cmake ninja-build gcc-16 g++-16
+
+    - name: Configure CMake
+      env:
+        CC: gcc-16
+        CXX: g++-16
+      run: |
+        cmake -B build -G Ninja \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_CXX_STANDARD=26 \
+          -DTHREADSCHEDULE_MODULE=ON \
+          -DTHREADSCHEDULE_ENABLE_REFLECTION=ON
+
+    - name: Build module
+      run: cmake --build build --parallel
+
+    - name: Verify reflection-enabled module artifacts  # fails when no *.a / *.gcm / *.pcm file was produced
+      run: |
+        artifacts="$(find build \( -name '*.a' -o -name '*.gcm' -o -name '*.pcm' \) | head -20)"
+        echo "Reflection module artifacts:"; echo "${artifacts}"; test -n "${artifacts}"
+
   # ── C++20 Module build verification ────────────────────────────────
   modules-linux:
     name: Modules (Linux, C++${{ matrix.cpp_standard }}, ${{ matrix.compiler }})
diff --git a/.gitignore b/.gitignore
index 2ebeb08..cb75fc3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,5 +46,6 @@ compile_commands.json
 .cache/
 build/
 build_*/
+build-*/
 install/
 build_runtime/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffdc9ed..38c64cb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,72 @@
 # Changelog
 
+## v2.3.0
+
+> **No intended API/ABI breaking changes for existing non-reflection users.**
+> This release adds an optional GCC-16/C++26 reflection surface and uses it to
+> expose faster registry projection/filter paths without changing the existing
+> query API.
+
+### New Features
+
+- **Optional GCC 16.1+ reflection API** -- when building with C++26,
+  `THREADSCHEDULE_ENABLE_REFLECTION=ON`, and working `-freflection` support,
+  the library now exports `threadschedule::reflect::*` for field metadata,
+  field visitation, compile-time projection, and type/field naming.
+  (`reflection.hpp`, `threadschedule.cppm`, `CMakeLists.txt`)
+
+- **Reflection-backed registry selectors** -- `ThreadRegistry` and
+  `QueryView` now expose field-oriented helpers such as
+  `where<registered_thread_fields::componentTag()>(...)`,
+  `where_if<registered_thread_fields::alive()>(...)`,
+  `find_by<registered_thread_fields::name()>(...)`,
+  `contains<...>(...)`, and `project<...>()` when reflection is enabled.
+  (`thread_registry.hpp`)
+
+### Performance
+
+- **Lower-overhead registry projections on reflection builds** -- direct
+  field-projection and field-filter paths now run under the registry's shared
+  lock and can skip the older `filter(...).map(...)` layering when callers opt
+  into the new reflection APIs. This reduces intermediate traversal and avoids
+  some full-entry transformation work for hot query paths. (`thread_registry.hpp`)
+
+- **More metadata is now promoted at compile time** -- reflection field names
+  and type display names are now stabilized via `std::define_static_string(...)`
+  and reused through `consteval` helpers such as `field_names<T>()`, reducing
+  repeated compile-time reconstruction of the same metadata. (`reflection.hpp`)
+
+### Documentation
+
+- **README examples for reflection queries** -- the top-level README now shows
+  how to combine `threadschedule::reflect` with field-based registry queries
+  and projections. (`README.md`)
+
+- **New CMake reference entry for reflection** -- the reference now documents
+  `THREADSCHEDULE_ENABLE_REFLECTION` and the GCC 16.1+/C++26 activation path.
+  (`docs/CMAKE_REFERENCE.md`)
+
+### Tests & Benchmarks
+
+- **New reflection unit coverage** -- dedicated tests now validate reflection
+  metadata for core public structs and reflection-backed registry queries.
+  (`tests/reflection_test.cpp`, `tests/registry_query_test.cpp`,
+  `tests/CMakeLists.txt`)
+
+- **New reflection registry benchmark** -- `reflection_registry_benchmarks`
+  compares classic `filter/map/find_if` usage against the new field-oriented
+  query helpers on synthetic registry snapshots. (`benchmarks/CMakeLists.txt`,
+  `benchmarks/reflection_registry_benchmarks.cpp`)
+
+### CI / Infrastructure
+
+- **Dedicated GCC 16 reflection CI jobs** -- the main test workflow now
+  includes explicit `ubuntu-24.04` jobs for reflection-enabled GCC 16/C++26
+  validation: one job builds and runs the reflection-focused test cases, and a
+  second job verifies the reflection-enabled module build path. This makes the
+  new `THREADSCHEDULE_ENABLE_REFLECTION` surface visible in CI instead of
+  relying only on the generic C++26 matrix entry. (`.github/workflows/tests.yml`)
+
 ## v2.2.0
 
 > **No intended API/ABI breaking changes.** This release extends thread-control
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 114ceb5..88e708f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,7 @@ option(THREADSCHEDULE_INSTALL "Generate install target" ${THREADSCHEDULE_IS_TOPL
 option(THREADSCHEDULE_RUNTIME "Build shared runtime for global registry (non header-only)" OFF)
 option(THREADSCHEDULE_MODULE "Build C++20 module target (requires CMake >= 3.28 and C++20+)" OFF)
 option(THREADSCHEDULE_BUILD_DOCS "Build API documentation with Doxygen" ${THREADSCHEDULE_IS_TOPLEVEL_PROJECT})
+option(THREADSCHEDULE_ENABLE_REFLECTION "Enable GCC 16 C++26 reflection APIs when supported" ON)
 
 # CPM support (optional, download if building tests or benchmarks)
 if(THREADSCHEDULE_BUILD_TESTS OR THREADSCHEDULE_BUILD_BENCHMARKS)
@@ -119,6 +120,34 @@ endif()
 # Platform-specific requirements
 find_package(Threads REQUIRED)
 
+set(THREADSCHEDULE_HAS_REFLECTION OFF)  # stays OFF unless every gate below and the compile probe succeed
+if(THREADSCHEDULE_ENABLE_REFLECTION
+   AND CMAKE_CXX_STANDARD GREATER_EQUAL 26  # only the global CMAKE_CXX_STANDARD is consulted here
+   AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
+   AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "16.1")  # i.e. GCC >= 16.1
+    include(CheckCXXSourceCompiles)
+    set(_threadschedule_saved_required_flags "${CMAKE_REQUIRED_FLAGS}")  # saved so the probe flags do not leak
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++26 -freflection")  # flags used only for the probe
+    check_cxx_source_compiles(
+        "
+        #include <meta>
+        using namespace std::meta;
+        struct probe_type { int value; bool ready; };
+        consteval bool probe() {
+            auto fields = std::define_static_array(nonstatic_data_members_of(^^probe_type, access_context::current()));
+            return fields.size() == 2 && identifier_of(fields[0]) == \"value\";
+        }
+        static_assert(probe());
+        int main() { return 0; }
+        "
+        THREADSCHEDULE_REFLECTION_PROBE_OK  # result variable: true when the snippet above compiles
+    )
+    set(CMAKE_REQUIRED_FLAGS "${_threadschedule_saved_required_flags}")  # restore the caller's flags
+    if(THREADSCHEDULE_REFLECTION_PROBE_OK)
+        set(THREADSCHEDULE_HAS_REFLECTION ON)  # read by the target_compile_* blocks further down
+    endif()
+endif()
+
 # Create the interface library target (header-only API)
 add_library(ThreadSchedule INTERFACE)
 add_library(ThreadSchedule::ThreadSchedule ALIAS ThreadSchedule)
@@ -151,6 +180,11 @@ target_include_directories(ThreadSchedule INTERFACE
 # Link libraries
 target_link_libraries(ThreadSchedule INTERFACE Threads::Threads)
 
+if(THREADSCHEDULE_HAS_REFLECTION)
+    target_compile_definitions(ThreadSchedule INTERFACE THREADSCHEDULE_HAS_REFLECTION=1)
+    target_compile_options(ThreadSchedule INTERFACE $<$<COMPILE_LANGUAGE:CXX>:-freflection>)
+endif()
+
 # Windows: ensure modern API availability macros
 if(WIN32)
     target_compile_definitions(ThreadSchedule INTERFACE
@@ -217,6 +251,10 @@ if(THREADSCHEDULE_RUNTIME)
         src/runtime_registry.cpp
     )
     target_compile_definitions(ThreadScheduleRuntime PRIVATE THREADSCHEDULE_EXPORTS THREADSCHEDULE_RUNTIME)
+    if(THREADSCHEDULE_HAS_REFLECTION)
+        target_compile_definitions(ThreadScheduleRuntime PUBLIC THREADSCHEDULE_HAS_REFLECTION=1)
+        target_compile_options(ThreadScheduleRuntime PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-freflection>)
+    endif()
     # Propagate the THREADSCHEDULE_RUNTIME define to consumers so headers call into the DLL
     target_compile_definitions(ThreadScheduleRuntime INTERFACE THREADSCHEDULE_RUNTIME)
     target_include_directories(ThreadScheduleRuntime
@@ -271,6 +309,11 @@ if(THREADSCHEDULE_MODULE)
     else()
         target_compile_features(ThreadScheduleModule PUBLIC cxx_std_20)
     endif()
+
+    if(THREADSCHEDULE_HAS_REFLECTION)
+        target_compile_definitions(ThreadScheduleModule PUBLIC THREADSCHEDULE_HAS_REFLECTION=1)
+        target_compile_options(ThreadScheduleModule PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-freflection>)
+    endif()
 endif()
 
 # Documentation (Doxygen + Awesome theme)
diff --git a/README.md b/README.md
index 8f1d881..6f706f4 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,9 @@ or with optional **shared runtime** for multi-DSO applications.
 - **Modern Callable Paths**: Newer standard libraries can use
   `std::move_only_function` / `std::copyable_function` internally for lower
   adaptation overhead while keeping the public API source-compatible
+- **GCC 16 Reflection APIs**: Optional C++26 reflection utilities and
+  reflection-backed registry queries when building with GCC 16.1+ and
+  `-freflection`
 - **Scheduled Tasks**: Run tasks at specific times, after delays, or
   periodically
 - **Error Handling**: Comprehensive exception handling with error callbacks and
@@ -158,6 +161,11 @@ are not regularly tested in CI.
 > **C++26**: Requires GCC 14+ or Clang 19+. MSVC does not yet expose
 > `cxx_std_26` to CMake; C++26 on Windows is not tested.
 >
+> **Reflection APIs**: The optional `threadschedule::reflect` API and
+> reflection-backed registry queries require GCC 16.1+ with
+> `THREADSCHEDULE_ENABLE_REFLECTION=ON`. These APIs are not built on other
+> toolchains or standards.
+>
 > **GCC 15**: Installed via `ppa:ubuntu-toolchain-r/test` on Ubuntu 24.04.
 >
 > **GCC 16**: Installed via `ppa:ubuntu-toolchain-r/test` on Ubuntu 24.04.
@@ -419,6 +427,47 @@ Notes:
 - Use `*Reg` wrappers (e.g., `ThreadWrapperReg`) or `AutoRegisterCurrentThread`
   for automatic control block creation and registration.
 
+### Reflection-powered registry queries (GCC 16.1+ / C++26)
+
+When `THREADSCHEDULE_ENABLE_REFLECTION=ON` is active on GCC 16.1+ with
+`-std=c++26`, ThreadSchedule exposes field metadata and field-oriented
+registry queries.
+
+```cpp
+#include <threadschedule/threadschedule.hpp>
+using namespace threadschedule;
+
+auto io_names =
+    registry()
+        .where<registered_thread_fields::componentTag()>("io")
+        .project<registered_thread_fields::name()>();
+
+auto live_compute =
+    registry()
+        .where<registered_thread_fields::componentTag()>("compute")
+        .where_if<registered_thread_fields::alive()>([](bool alive) {
+            return alive;
+        })
+        .project<registered_thread_fields::tid(), registered_thread_fields::name()>();
+
+bool has_scheduler = registry().contains<registered_thread_fields::name()>("sched_main");
+```
+
+You can also inspect reflected library types directly:
+
+```cpp
+#include <threadschedule/threadschedule.hpp>
+using namespace threadschedule;
+
+static_assert(reflect::field_count<RegisteredThreadInfo>() == 6);
+static_assert(reflect::field_name<RegisteredThreadInfo, 2>() == "name");
+
+ThreadProfile profile = profiles::throughput();
+reflect::visit_fields(profile, [](std::string_view field, auto const& value) {
+    // inspect compile-time-described fields at runtime
+});
+```
+
 Find by name (Linux):
 
 ```cpp
@@ -584,8 +633,162 @@ worker.set_affinity(affinity);
 ### Benchmark Results
 
 Performance varies by system configuration, workload characteristics, and task
-complexity. See [benchmarks/](benchmarks/) for detailed performance analysis,
-real-world scenario testing, and optimization recommendations.
+complexity. The charts below were captured in a single environment; reproduce
+them on your own machine with `./run_benchmark_graphs.sh` (HTML report) or
+regenerate the SVGs with `benchmarks/generate_readme_graphs.py`.
+
+<details>
+<summary><strong>Benchmark environment & build flags</strong></summary>
+
+| Setting          | Value                                                                 |
+| ---------------- | --------------------------------------------------------------------- |
+| CPU              | AMD Ryzen 5 5600X (6 cores / 12 threads, 32 MiB L3, up to ~4.65 GHz)   |
+| OS / kernel      | Fedora 44, Linux 7.0.4-200.fc44.x86_64                                 |
+| Compiler         | GCC 16.1.1 (`-std=c++23` for the pool charts; `-std=c++26 -freflection` for the reflection charts; C++17/20/23/26 for the callable charts) |
+| Build type       | `Release` (`-O3 -DNDEBUG`)                                             |
+| Extra flags      | `-march=native -ffast-math -fno-omit-frame-pointer`                   |
+| Google Benchmark | v1.9.4                                                                 |
+| Threads          | 4 worker threads unless noted                                         |
+
+The exact compile flags used for every benchmark target (see
+[`benchmarks/CMakeLists.txt`](benchmarks/CMakeLists.txt)):
+
+```bash
+# GCC / Clang
+-O3 -DNDEBUG -fno-omit-frame-pointer -march=native -ffast-math
+# plus the C++ standard: -std=c++23 (pool charts), -std=c++26 -freflection
+#                         (reflection charts), -std=c++17 / 20 / 23 / 26 (callable charts)
+```
+
+> Absolute numbers are only meaningful relative to each other on the **same**
+> machine and build. `-march=native` and `-ffast-math` in particular mean results
+> are not comparable across CPUs. Re-run the benchmarks locally before drawing
+> conclusions for your hardware.
+
+</details>
+
+**Throughput scales with batch size.** For tiny tasks the
+fire-and-forget `LightweightPool` consistently leads, while the work-stealing
+`HighPerformancePool` pays for its extra machinery and only shines on larger,
+unbalanced workloads:
+
+![Thread pool throughput by batch size](docs/benchmarks/pool_throughput.svg)
+
+**Pick the right pool for the workload.** Running 100,000 trivial tasks, the
+`LightweightPool` finishes ~1.9x faster than the baseline `ThreadPool`, whereas
+the work-stealing pool is slower because the tasks are too small to benefit from
+stealing:
+
+![Thread pool comparison for a light workload](docs/benchmarks/pool_comparison.svg)
+
+**The gap depends heavily on how much work each task does.** With the pool built
+once and the per-task work swept from `tiny` to `heavy`, the picture changes: for
+tiny/medium tasks submission overhead dominates and `LightweightPool` wins by
+~2-3x, but as the per-task work grows the field converges to within ~20% and the
+pool choice stops mattering much. The work-stealing `HighPerformancePool` climbs
+from last place (tiny) to nearly the front (heavy):
+
+![Pool comparison across workload weights](docs/benchmarks/pool_workload.svg)
+
+**Skip the future when you do not need it.** `post()` reuses the same queue path
+as `submit()` but avoids the `packaged_task` / `std::future` overhead, which is
+dramatic for very short tasks:
+
+![post() versus submit() submission overhead](docs/benchmarks/post_vs_submit.svg)
+
+> These numbers measure submission/scheduling overhead with light tasks, so they
+> represent a worst case for pool overhead. As the "workload weights" chart
+> shows, real workloads with heavier per-task work narrow these gaps
+> considerably.
+
+#### Reflection-backed registry queries (GCC 16.1+ / C++26)
+
+With `THREADSCHEDULE_ENABLE_REFLECTION=ON` the registry exposes ergonomic,
+field-oriented queries (`where` / `project` / `find_by`). These trade a little
+performance for readability and compile-time field checking: against
+hand-written STL-style lambdas over 16,384 registered threads they currently run
+slightly slower, so reach for them when expressiveness matters more than the last
+few percent of throughput.
+
+![Reflection query: project a field versus hand-written filter + map](docs/benchmarks/reflection_query.svg)
+
+![Reflection query: find by field versus hand-written find_if](docs/benchmarks/reflection_lookup.svg)
+
+#### Task storage: `std::move_only_function` and SBO callables
+
+The pools store type-erased tasks in one of two ways: `ThreadPool` /
+`FastThreadPool` / `HighPerformancePool` use `detail::move_callable`
+(`std::function` on C++17/20, `std::move_only_function` on C++23+), while
+`LightweightPool` uses a custom small-buffer callable (`SboCallable<64>`). The
+`callable_std_benchmarks` target isolates the build + invoke cost of these
+wrappers (away from thread-scheduling noise) and is compiled under every standard.
+
+**Does replacing `std::function` help?** For small captures, switching to
+`std::move_only_function` on C++23+ cuts the per-task wrapper cost by ~30%
+(~4.6 ns to ~3.1 ns). For larger captures the heap allocation dominates and the
+wrapper choice barely matters:
+
+![move_callable cost across C++ standards](docs/benchmarks/callable_standards.svg)
+
+**Do the SBO callables help?** Yes — and this is the bigger effect. A 48-byte
+capture fits `LightweightPool`'s 56-byte inline buffer but overflows the
+standard-library callables' small buffer, so the latter heap-allocate. The SBO
+path is then ~6x faster (~3.4 ns vs ~21 ns per task). Once a capture is too big
+for any inline buffer (128 B), both allocate and the advantage disappears:
+
+![SBO callable versus standard-library callable](docs/benchmarks/callable_sbo.svg)
+
+<details>
+<summary><strong>How big is a task, really? (capture sizes &amp; inline buffers)</strong></summary>
+
+A task is usually a lambda, and **a lambda's size is the sum of what it captures**
+(plus alignment padding). A capture-less lambda is effectively free; each captured
+pointer or reference adds 8 bytes, and capturing objects *by value* adds their
+full size. Concrete sizes on this platform (GCC 16 / libstdc++, x86_64):
+
+| What the task captures                              | Example                                  | Size   |
+| --------------------------------------------------- | ---------------------------------------- | ------ |
+| nothing (stateless)                                 | `pool.post([]{ tick(); });`              | ~1 B   |
+| one pointer / reference / `this`                    | `pool.post([&q]{ q.drain(); });`         | 8 B    |
+| two pointers / references                           | `pool.post([&a, &b]{ join(a, b); });`    | 16 B   |
+| a `std::shared_ptr` by value                        | `pool.post([h]{ h->run(); });`           | 16 B   |
+| a `std::vector` by value                            | `pool.post([data]{ process(data); });`   | 24 B   |
+| a `std::string` by value                            | `pool.post([name]{ log(name); });`       | 32 B   |
+| ~6 small values / handles (the chart's "medium")    | `pool.post([id,a,b,c,d,e]{ ... });`      | 48 B   |
+| a big array / struct by value (the chart's "large") | `pool.post([frame]{ encode(frame); });`  | 128 B  |
+
+Each storage type keeps small callables **inline** (no allocation) up to a fixed
+buffer size, and falls back to a heap allocation above it:
+
+| Storage                       | Inline buffer | Used by                                   |
+| ----------------------------- | ------------- | ----------------------------------------- |
+| `std::function`               | ≤ 16 B        | `ThreadPool` family on C++17/20           |
+| `std::move_only_function`     | ≤ 24 B        | `ThreadPool` family on C++23+             |
+| `SboCallable<64>`             | ≤ 56 B        | `LightweightPool` (`= LightweightPoolT<64>`) |
+
+`SboCallable<TaskSize>` lays each task out as one cache line:
+
+```
+  |<------------- TaskSize = 64 B ------------->|
+  [ vtable* (8 B) | inline capture buffer (56 B) ]
+```
+
+**Typical real tasks capture a few pointers/handles plus maybe a small value, so
+they land in the ~8-48 B range.** That fits `LightweightPool`'s 56 B buffer with
+no allocation, but overflows `std::function`'s 16 B buffer (one allocation per
+task). If you capture large objects by value you blow past every inline buffer -
+capture a pointer/handle to the data instead, or bump the buffer with
+`LightweightPoolT<128>`.
+
+</details>
+
+> Takeaway: keep task captures small. They stay inline (no allocation) in
+> `LightweightPool`, and on C++23+ the other pools also benefit from the
+> move-only wrapper. This is exactly why `post()` and `LightweightPool` are the
+> recommended low-overhead paths.
+
+See [benchmarks/](benchmarks/) for detailed performance analysis, real-world
+scenario testing, and optimization recommendations.
 
 ## Platform-Specific Features
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 4feb43b..102accc 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -7,6 +7,10 @@ add_executable(threadpool_throughput_benchmarks throughput_benchmarks.cpp)
 add_executable(threadpool_memory_benchmarks memory_benchmarks.cpp)
 add_executable(threadpool_resampling_benchmarks resampling_benchmarks.cpp)
 add_executable(callable_benchmarks callable_benchmarks.cpp)
+add_executable(callable_std_benchmarks callable_std_benchmarks.cpp)
+if(THREADSCHEDULE_HAS_REFLECTION)
+    add_executable(reflection_registry_benchmarks reflection_registry_benchmarks.cpp)
+endif()
 
 # Real-world scenario benchmarks
 add_executable(web_server_benchmarks web_server_benchmarks.cpp)
@@ -20,10 +24,14 @@ set(ALL_BENCHMARK_TARGETS
     threadpool_memory_benchmarks
     threadpool_resampling_benchmarks
     callable_benchmarks
+    callable_std_benchmarks
     web_server_benchmarks
     database_benchmarks
     audio_video_benchmarks
 )
+if(THREADSCHEDULE_HAS_REFLECTION)
+    list(APPEND ALL_BENCHMARK_TARGETS reflection_registry_benchmarks)
+endif()
 
 # Link libraries for all benchmarks
 foreach(target ${ALL_BENCHMARK_TARGETS})
@@ -59,6 +67,9 @@ add_test(NAME ThreadPoolThroughputBenchmarks COMMAND threadpool_throughput_bench
 add_test(NAME ThreadPoolMemoryBenchmarks COMMAND threadpool_memory_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3)
 add_test(NAME ThreadPoolResamplingBenchmarks COMMAND threadpool_resampling_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3)
 add_test(NAME CallableBenchmarks COMMAND callable_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3)
+if(THREADSCHEDULE_HAS_REFLECTION)
+    add_test(NAME ReflectionRegistryBenchmarks COMMAND reflection_registry_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3)
+endif()
 add_test(NAME WebServerBenchmarks COMMAND web_server_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3)
 add_test(NAME DatabaseBenchmarks COMMAND database_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3)
 add_test(NAME AudioVideoBenchmarks COMMAND audio_video_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3)
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 36be4b5..12fdbfe 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -59,6 +59,10 @@ cmake --build build --target run_quick_benchmarks
 # Run all core benchmarks (2s per test, 3 repetitions) - use the run_benchmarks.sh script
 ./run_benchmarks.sh
 
+# Generate an HTML report with graphs + speedups for comparison benchmarks
+./run_benchmark_graphs.sh
+./run_benchmark_graphs.sh --quick
+
 # Or run specific benchmark suites with custom settings
 ./build/benchmarks/threadpool_basic_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3
 ./build/benchmarks/web_server_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3
@@ -228,6 +232,111 @@ This shows:
 
 # Generate JSON for analysis
 ./database_benchmarks --benchmark_format=json --benchmark_out=results.json
+
+# Turn one or more Google Benchmark JSON files into a local HTML report
+python3 benchmarks/generate_benchmark_report.py \
+  --output build/benchmark-report.html \
+  --title "Local benchmark comparison" \
+  build/benchmarks/threadpool_comparisons.json \
+  build/benchmarks/reflection_registry.json
+```
+
+## Graphs and Speedups
+
+The repository now includes a local report generator that turns Google
+Benchmark JSON output into a standalone HTML report with:
+
+- Absolute timing bar charts
+- Relative speedup annotations (for known comparison families)
+- Automatically collected machine information
+- Side-by-side tables for comparison-oriented benchmark groups
+
+The current heuristics explicitly understand:
+
+- `BM_ComparePoolTypes_LightWorkload`
+- `BM_PostVsSubmit`
+- `BM_QueryView_FilterMapName`
+- `BM_QueryView_ReflectionWhereProjectName`
+- `BM_QueryView_FindIf`
+- `BM_QueryView_ReflectionFindBy`
+
+This is enough to visualize both classic pool comparisons and the new
+reflection registry comparisons without extra dependencies such as matplotlib.
+
+### Standalone SVG charts for the README
+
+`generate_readme_graphs.py` turns the same Google Benchmark JSON into a few
+self-contained SVG files (light background, dark text) that embed cleanly into
+Markdown and render in both light and dark GitHub themes:
+
+```bash
+# Produce JSON from the comparison benchmarks
+./build/benchmarks/threadpool_basic_benchmarks \
+  --benchmark_filter="BM_ComparePoolTypes_LightWorkload|BM_ComparePoolWorkload|BM_PostVsSubmit" \
+  --benchmark_format=json \
+  --benchmark_out=build/threadpool_comparisons.json
+
+# Optional: reflection query benchmarks (needs a C++26 + reflection build)
+./build-reflection/benchmarks/reflection_registry_benchmarks \
+  --benchmark_filter="BM_QueryView_.*" \
+  --benchmark_format=json \
+  --benchmark_out=build/reflection_registry.json
+
+# Render the README charts (no matplotlib required)
+python3 benchmarks/generate_readme_graphs.py \
+  --output-dir docs/benchmarks \
+  build/threadpool_comparisons.json \
+  build/reflection_registry.json
+```
+
+The generator accepts any number of JSON files and emits the charts it can build
+from the data it finds:
+
+| SVG file                          | Source benchmark                                   |
+| --------------------------------- | -------------------------------------------------- |
+| `pool_throughput.svg`             | `BM_ComparePoolTypes_LightWorkload`                |
+| `pool_comparison.svg`             | `BM_ComparePoolTypes_LightWorkload`                |
+| `pool_workload.svg`               | `BM_ComparePoolWorkload`                           |
+| `post_vs_submit.svg`              | `BM_PostVsSubmit`                                  |
+| `reflection_query.svg`            | `BM_QueryView_FilterMapName` vs `...WhereProject`  |
+| `reflection_lookup.svg`           | `BM_QueryView_FindIf` vs `BM_QueryView_ReflectionFindBy` |
+| `callable_standards.svg`          | `callable_std_benchmarks` (`BM_MoveCallable_*`, one JSON per standard) |
+| `callable_sbo.svg`                | `callable_std_benchmarks` (`BM_MoveCallable_*` vs `BM_Sbo_*`) |
+
+All files are written into `docs/benchmarks/` and referenced from the top-level
+`README.md`. The reflection charts require building `reflection_registry_benchmarks`,
+which is only available on GCC 16.1+ with `-DCMAKE_CXX_STANDARD=26 -DTHREADSCHEDULE_ENABLE_REFLECTION=ON`:
+
+```bash
+cmake -S . -B build-reflection -G Ninja -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_STANDARD=26 -DTHREADSCHEDULE_BUILD_BENCHMARKS=ON \
+  -DTHREADSCHEDULE_ENABLE_REFLECTION=ON
+cmake --build build-reflection --target reflection_registry_benchmarks
+```
+
+#### Cross-standard callable charts
+
+`callable_std_benchmarks` isolates the cost of ThreadSchedule's task storage
+(`detail::move_callable`, which is `std::function` on C++17/20 and
+`std::move_only_function` on C++23+, versus the `SboCallable` small-buffer
+callable used by `LightweightPool`). To compare standards, build the same source
+under each one and feed the per-standard JSON to the generator (name each file
+`callable_cxx<NN>.json`; the generator reads the standard from the file name):
+
+```bash
+for std in 17 20 23 26; do
+  g++ -std=c++$std -O3 -DNDEBUG -march=native -ffast-math -fno-omit-frame-pointer \
+    -Iinclude -Ibuild/_deps/benchmark-src/include \
+    benchmarks/callable_std_benchmarks.cpp \
+    build/_deps/benchmark-build/src/libbenchmark.a -lpthread -o /tmp/callable_c$std
+  /tmp/callable_c$std --benchmark_min_time=0.5s --benchmark_repetitions=3 \
+    --benchmark_report_aggregates_only=true --benchmark_format=json \
+    --benchmark_out=build/callable_cxx$std.json
+done
+
+python3 benchmarks/generate_readme_graphs.py --output-dir docs/benchmarks \
+  build/callable_cxx17.json build/callable_cxx20.json \
+  build/callable_cxx23.json build/callable_cxx26.json
 ```
 
 ### Performance Regression Testing
diff --git a/benchmarks/callable_std_benchmarks.cpp b/benchmarks/callable_std_benchmarks.cpp
new file mode 100644
index 0000000..50f40c4
--- /dev/null
+++ b/benchmarks/callable_std_benchmarks.cpp
@@ -0,0 +1,101 @@
+// Cross-standard callable storage micro-benchmark.
+//
+// ThreadSchedule stores type-erased tasks in one of two ways:
+//
+//   - detail::move_callable<Signature>  -- the hot-path storage used by
+//     ThreadPool / FastThreadPool / HighPerformancePool. It is an alias for
+//     std::function on C++17/20 and for std::move_only_function on C++23+.
+//   - detail::SboCallable<TaskSize>     -- the small-buffer callable used by
+//     LightweightPool. It stores callables up to TaskSize-8 bytes inline and is
+//     identical across every C++ standard.
+//
+// This benchmark isolates the construction (including any heap allocation) and
+// invocation cost of those two storage types, away from thread scheduling noise,
+// so the same binary can be compiled under C++17/20/23/26 and compared. It
+// answers two questions directly:
+//
+//   1. Does replacing std::function with std::move_only_function help?
+//      -> compare BM_MoveCallable_* across standards.
+//   2. Do the SBO callables help?
+//      -> compare BM_Sbo_* against BM_MoveCallable_* for the same capture.
+//
+// Written to compile as C++17 (no concepts / requires).
+
+#include <array>
+#include <benchmark/benchmark.h>
+#include <cstdint>
+#include <memory>
+#include <vector>
+#include <threadschedule/callable.hpp>
+#include <threadschedule/thread_pool.hpp>
+
+using namespace threadschedule;
+
+namespace
+{
+
+// Build kBatch callables (NWords * 8 bytes of payload plus a reference to the
+// sink) into a reused vector, then invoke them all, amortizing timer overhead.
+// The sink update is spelled out because `volatile +=` is deprecated in C++20.
+template <typename Storage, std::size_t NWords>
+void bench_storage(benchmark::State& state)
+{
+    constexpr std::size_t kBatch = 256;
+    std::vector<Storage> store;
+    store.reserve(kBatch); // clear() keeps the capacity, so the vector never regrows
+    volatile std::uint64_t sink = 0; // volatile: each invocation's write must really happen
+
+    for (auto _ : state)
+    {
+        store.clear();
+        for (std::size_t i = 0; i < kBatch; ++i)
+        {
+            std::array<std::uint64_t, NWords> payload{};
+            payload[0] = i;
+            store.emplace_back([payload, &sink]() mutable { sink = sink + payload[0] + 1; });
+        }
+        for (auto& callable : store)
+            callable();
+        benchmark::ClobberMemory();
+    }
+    state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations() * kBatch));
+}
+
+} // namespace
+
+// move_callable == std::function (C++17/20) or std::move_only_function (C++23+)
+static void BM_MoveCallable_Small(benchmark::State& state)
+{
+    bench_storage<detail::move_callable<void()>, 1>(state); // 8 B capture (fits all)
+}
+static void BM_MoveCallable_Medium(benchmark::State& state)
+{
+    bench_storage<detail::move_callable<void()>, 6>(state); // 48 B capture (heap in std lib callables)
+}
+static void BM_MoveCallable_Large(benchmark::State& state)
+{
+    bench_storage<detail::move_callable<void()>, 16>(state); // 128 B capture (heap everywhere)
+}
+
+// SboCallable<64> == LightweightPool storage (56 B inline buffer)
+static void BM_Sbo_Small(benchmark::State& state)
+{
+    bench_storage<detail::SboCallable<64>, 1>(state);
+}
+static void BM_Sbo_Medium(benchmark::State& state)
+{
+    bench_storage<detail::SboCallable<64>, 6>(state);
+}
+static void BM_Sbo_Large(benchmark::State& state)
+{
+    bench_storage<detail::SboCallable<64>, 16>(state);
+}
+
+BENCHMARK(BM_MoveCallable_Small)->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_MoveCallable_Medium)->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_MoveCallable_Large)->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_Sbo_Small)->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_Sbo_Medium)->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_Sbo_Large)->Unit(benchmark::kNanosecond);
+
+BENCHMARK_MAIN();
diff --git a/benchmarks/generate_benchmark_report.py b/benchmarks/generate_benchmark_report.py
new file mode 100755
index 0000000..bf6274e
--- /dev/null
+++ b/benchmarks/generate_benchmark_report.py
@@ -0,0 +1,428 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+import html
+import json
+import os
+import platform
+import re
+import subprocess
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Iterable
+
+
+TIME_TO_NS = {
+    "ns": 1.0,
+    "us": 1_000.0,
+    "ms": 1_000_000.0,
+    "s": 1_000_000_000.0,
+}
+
+POOL_NAMES = ("ThreadPool", "FastThreadPool", "HighPerformancePool", "LightweightPool")
+
+EXPLICIT_GROUPS: dict[str, tuple[str, str, str]] = {
+    "BM_QueryView_FilterMapName": ("Reflection query: name projection", "filter + map", "filter + map"),
+    "BM_QueryView_ReflectionWhereProjectName": (
+        "Reflection query: name projection",
+        "reflection where + project",
+        "filter + map",
+    ),
+    "BM_QueryView_FindIf": ("Reflection query: lookup", "find_if", "find_if"),
+    "BM_QueryView_ReflectionFindBy": ("Reflection query: lookup", "reflection find_by", "find_if"),
+}
+
+
+@dataclass
+class Run:  # one non-aggregate benchmark row loaded from a Google Benchmark JSON file
+    source: str  # file name (without directories) of the JSON the row came from
+    family: str  # benchmark name up to the first "/"
+    full_name: str  # complete benchmark name, including the "/"-separated arguments
+    args: tuple[str, ...]  # the "/"-separated name segments that follow the family
+    label: str  # the row's "label" field, "" when absent
+    time_ns: float  # real_time (cpu_time when real_time is missing) converted to nanoseconds
+    original_unit: str  # the row's "time_unit" before conversion, "ns" when absent
+    context: dict[str, object] = field(default_factory=dict)  # the file's "context" object; build_comparisons() adds "variant_name"
+
+
+@dataclass
+class ComparisonGroup:  # runs that are charted against each other under one heading
+    title: str  # heading; also the key the runs were grouped by
+    baseline: str  # variant name the other runs are measured against
+    runs: list[Run]  # member runs in the order they were encountered
+
+
+def run_command(command: list[str]) -> str:  # returns the command's stripped stdout, or "" on any failure
+    try:
+        completed = subprocess.run(command, capture_output=True, text=True, check=True)  # check=True raises on a non-zero exit status
+        return completed.stdout.strip()
+    except Exception:  # missing executable, non-zero exit, decoding error: all degrade to an empty result
+        return ""
+
+
+def collect_system_info() -> dict[str, str]:
+    """Collect best-effort host details (OS, CPU, memory, GPU, disks, git) for the report.
+
+    External commands go through run_command(); keys whose value ends up empty are dropped.
+    """
+    info: dict[str, str] = {
+        "Timestamp": dt.datetime.now().isoformat(timespec="seconds"),
+        "Hostname": platform.node(),
+        "Platform": platform.platform(),
+        "Kernel": run_command(["uname", "-a"]),
+    }
+
+    lscpu = run_command(["lscpu"])
+    if lscpu:
+        def extract(pattern: str) -> str:
+            match = re.search(pattern, lscpu, re.MULTILINE)
+            return match.group(1).strip() if match else ""
+
+        info["CPU"] = extract(r"^Model name:\s+(.+)$")
+        info["CPU cores"] = extract(r"^Core\(s\) per socket:\s+(.+)$")
+        info["CPU threads"] = extract(r"^CPU\(s\):\s+(.+)$")
+        info["Max MHz"] = extract(r"^CPU max MHz:\s+(.+)$")
+        info["L3 cache"] = extract(r"^L3 cache:\s+(.+)$")
+
+    mem = run_command(["free", "-h"])
+    if mem:
+        lines = mem.splitlines()
+        if len(lines) >= 2:
+            parts = lines[1].split()
+            if len(parts) >= 7:
+                info["Memory total"] = parts[1]
+                info["Memory available"] = parts[6]
+        if len(lines) >= 3:
+            parts = lines[2].split()
+            if len(parts) >= 3:
+                info["Swap total"] = parts[1]
+
+    # grep -E is specified by POSIX; ripgrep (rg) is an optional tool whose absence hid the GPU row.
+    gpu = run_command(["sh", "-lc", "lspci | grep -E 'VGA|3D|Display'"])
+    if gpu:
+        info["GPU"] = gpu.splitlines()[0].strip()
+
+    disks = run_command(["lsblk", "-d", "-o", "NAME,SIZE,MODEL"])
+    if disks:
+        disk_lines = [line.strip() for line in disks.splitlines()[1:] if line.strip()]
+        if disk_lines:
+            info["Storage"] = "; ".join(disk_lines[:4])
+
+    info["Git commit"] = run_command(["git", "rev-parse", "--short", "HEAD"])
+    info["Git branch"] = run_command(["git", "branch", "--show-current"])
+
+    return {key: value for key, value in info.items() if value}
+
+
+def load_runs(path: Path) -> list[Run]:
+    """Parse one Google Benchmark JSON file into Run records, skipping aggregate rows."""
+    payload = json.loads(path.read_text(encoding="utf-8"))  # decode as UTF-8 instead of the locale default
+    context = payload.get("context", {})
+    runs: list[Run] = []
+    for bench in payload.get("benchmarks", []):
+        if bench.get("aggregate_name") or bench.get("run_type") == "aggregate":
+            continue
+        # Rows without either timing field carry nothing to plot.
+        if "real_time" not in bench and "cpu_time" not in bench:
+            continue
+        full_name = str(bench["name"])
+        family, *arg_parts = full_name.split("/")  # "BM_Foo/8/1" -> family "BM_Foo", args ("8", "1")
+        unit = str(bench.get("time_unit", "ns"))
+        raw_value = float(bench.get("real_time", bench.get("cpu_time")))
+        time_ns = raw_value * TIME_TO_NS.get(unit, 1.0)  # unknown units are treated as nanoseconds
+        runs.append(
+            Run(
+                source=path.name,
+                family=family,
+                full_name=full_name,
+                args=tuple(arg_parts),
+                label=str(bench.get("label", "")),
+                time_ns=time_ns,
+                original_unit=unit,
+                context=context,
+            )
+        )
+    return runs
+
+
+def detect_group(run: Run) -> tuple[str, str, str] | None:
+    """Return (group title, variant name, baseline variant) for a known comparison run, else None."""
+    if run.family in EXPLICIT_GROUPS:
+        title, variant, baseline = EXPLICIT_GROUPS[run.family]
+        suffix = ", ".join(run.args) if run.args else "default"
+        return (f"{title} ({suffix})", variant, baseline)
+    if run.family == "BM_ComparePoolTypes_LightWorkload" and run.label:
+        task_match = re.search(r"tasks=(\d+)", run.label)
+        # Longest match wins: "ThreadPool" is a substring of "FastThreadPool" and must not shadow it.
+        pool_name = max((name for name in POOL_NAMES if name in run.label), key=len, default=run.label)
+        tasks = task_match.group(1) if task_match else (run.args[0] if run.args else "unknown")
+        return (f"Pool comparison: light workload ({tasks} tasks)", pool_name, "ThreadPool")
+
+    if run.family == "BM_PostVsSubmit":
+        tasks = run.args[0] if run.args else "unknown"
+        variant = run.label or ("submit(future)" if run.args[-1:] == ("0",) else "post(fire-forget)")
+        return (f"Post vs submit ({tasks} tasks)", variant, "submit(future)")
+    return None
+
+
+def build_comparisons(runs: Iterable[Run]) -> list[ComparisonGroup]:
+    """Bucket runs by detect_group() title and return the groups holding at least two runs.
+
+    Groups come back sorted by title; each grouped run gets "variant_name" in its context.
+    """
+    grouped: dict[str, tuple[str, list[Run]]] = {}
+
+    for run in runs:
+        detected = detect_group(run)
+        if not detected:
+            continue
+        title, variant, baseline = detected
+        grouped.setdefault(title, (baseline, []))[1].append(run)
+        # Copy before writing: load_runs() hands the same context dict to every run of a file.
+        run.context = dict(run.context)
+        run.context["variant_name"] = variant
+
+    groups = [
+        ComparisonGroup(title=title, baseline=baseline, runs=values)
+        for title, (baseline, values) in grouped.items()
+        if len(values) >= 2
+    ]
+    groups.sort(key=lambda group: group.title)
+    return groups
+
+
+
+def format_time_ns(time_ns: float) -> str:
+    """Render a nanosecond duration in the largest unit (s, ms, us) it reaches, else whole ns."""
+    # Units are tried largest first; anything below one microsecond falls through to ns.
+    units = ((1_000_000_000.0, "s"), (1_000_000.0, "ms"), (1_000.0, "us"))
+    for scale, suffix in units:
+        if time_ns >= scale:
+            return f"{time_ns / scale:.3f} {suffix}"
+    return f"{time_ns:.0f} ns"
+
+
+def speedup_label(baseline_ns: float, run_ns: float) -> str:
+    """Describe run_ns relative to baseline_ns; "n/a" when either time is not positive."""
+    if run_ns <= 0 or baseline_ns <= 0:  # a zero baseline would make 1.0 / ratio divide by zero
+        return "n/a"
+    # ratio > 1 means the run took less time than the baseline.
+    ratio = baseline_ns / run_ns
+    if abs(ratio - 1.0) < 0.02:  # within 2% is reported as equal
+        return "same speed"
+    return f"{ratio:.2f}x faster" if ratio > 1.0 else f"{1.0 / ratio:.2f}x slower"
+
+
+def render_bar_chart(items: list[tuple[str, float, str]], width: int = 920, bar_height: int = 30) -> str:  # items: (label, value, annotation); returns inline SVG, "" for no items
+    if not items:
+        return ""
+    max_value = max(value for _, value, _ in items) or 1.0  # `or 1.0` avoids dividing by zero when every value is 0
+    label_width = 280
+    chart_width = width - label_width - 120  # the longest bar ends 120 units before the right edge
+    height = len(items) * (bar_height + 18) + 24
+    bars = []
+    for index, (label, value, annotation) in enumerate(items):
+        y = 20 + index * (bar_height + 18)
+        bar_width = max(2.0, chart_width * (value / max_value))  # never narrower than 2 units
+        bars.append(
+            f'<text x="8" y="{y + 19}" fill="#122033" font-size="13">{html.escape(label)}</text>'
+            f'<rect x="{label_width}" y="{y}" width="{bar_width:.1f}" height="{bar_height}" rx="6" fill="#2a7fff" />'
+            f'<text x="{label_width + bar_width + 10:.1f}" y="{y + 19}" fill="#122033" font-size="13">{html.escape(annotation)}</text>'
+        )
+    return (
+        f'<svg viewBox="0 0 {width} {height}" width="100%" height="{height}" role="img">'
+        + "".join(bars)
+        + "</svg>"
+    )
+
+
+def comparison_table(group: ComparisonGroup) -> str:  # renders one group as a card: bar chart plus table
+    variants: list[tuple[str, Run]] = []
+    for run in group.runs:
+        variant = str(run.context.get("variant_name", run.label or run.family))  # fall back to label, then family
+        variants.append((variant, run))
+
+    baseline_run = next((run for variant, run in variants if variant == group.baseline), variants[0][1])  # first run when no variant matches the baseline name
+    rows = []
+    chart_items = []
+
+    for variant, run in sorted(variants, key=lambda item: item[1].time_ns):  # ascending time: fastest first
+        speedup = speedup_label(baseline_run.time_ns, run.time_ns)
+        rows.append(
+            "<tr>"
+            f"<td>{html.escape(variant)}</td>"
+            f"<td>{html.escape(format_time_ns(run.time_ns))}</td>"
+            f"<td>{html.escape(speedup)}</td>"
+            f"<td>{html.escape(run.source)}</td>"
+            "</tr>"
+        )
+        chart_items.append((variant, run.time_ns, f"{format_time_ns(run.time_ns)} | {speedup}"))
+
+    return (
+        f"<section class='card'><h3>{html.escape(group.title)}</h3>"
+        + render_bar_chart(chart_items)
+        + "<table><thead><tr><th>Variant</th><th>Time</th><th>Relative to baseline</th><th>Source</th></tr></thead><tbody>"
+        + "".join(rows)
+        + "</tbody></table></section>"
+    )
+
+
+def overall_section(runs: list[Run]) -> str:  # card charting the runs with the smallest absolute time
+    top = sorted(runs, key=lambda run: run.time_ns)[:14]  # keep the 14 fastest runs
+    chart_items = []
+    for run in top:
+        label = run.label or run.full_name  # prefer the benchmark's label when it has one
+        chart_items.append((label[:42], run.time_ns, format_time_ns(run.time_ns)))  # labels are cut to 42 characters
+    return (
+        "<section class='card'><h3>Fastest benchmark runs</h3>"
+        "<p>Absolute timings across the provided JSON files. Lower is better.</p>"
+        + render_bar_chart(chart_items)
+        + "</section>"
+    )
+
+
+def system_info_section(system_info: dict[str, str]) -> str:  # card with one table row per key/value pair
+    rows = "".join(
+        f"<tr><th>{html.escape(key)}</th><td>{html.escape(value)}</td></tr>" for key, value in system_info.items()  # both sides are HTML-escaped
+    )
+    return (
+        "<section class='card'><h3>System information</h3>"
+        "<table><tbody>"
+        + rows
+        + "</tbody></table></section>"
+    )
+
+
+def build_html(title: str, runs: list[Run], groups: list[ComparisonGroup], system_info: dict[str, str]) -> str:
+    comparison_sections = "".join(comparison_table(group) for group in groups)
+    if not comparison_sections:
+        comparison_sections = "<section class='card'><h3>No comparison groups detected</h3><p>The input JSON did not match any known comparison patterns yet.</p></section>"
+
+    return f"""<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1" />
+  <title>{html.escape(title)}</title>
+  <style>
+    :root {{
+      --bg: #f3f6fb;
+      --card: #ffffff;
+      --ink: #122033;
+      --muted: #5b6b82;
+      --line: #dbe3ef;
+      --accent: #2a7fff;
+      --accent-soft: #d9e8ff;
+    }}
+    * {{ box-sizing: border-box; }}
+    body {{
+      margin: 0;
+      padding: 32px;
+      background: radial-gradient(circle at top left, #ffffff 0%, var(--bg) 55%);
+      color: var(--ink);
+      font: 15px/1.45 "IBM Plex Sans", "Segoe UI", sans-serif;
+    }}
+    h1, h2, h3 {{ margin: 0 0 12px; }}
+    p {{ margin: 0 0 12px; color: var(--muted); }}
+    .layout {{
+      display: grid;
+      gap: 20px;
+      max-width: 1320px;
+      margin: 0 auto;
+    }}
+    .hero {{
+      padding: 28px;
+      border-radius: 18px;
+      background: linear-gradient(135deg, #0e1b2d 0%, #163f73 100%);
+      color: #ffffff;
+      box-shadow: 0 24px 60px rgba(11, 31, 57, 0.16);
+    }}
+    .hero p {{ color: rgba(255,255,255,0.82); }}
+    .grid {{
+      display: grid;
+      gap: 20px;
+      grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
+    }}
+    .card {{
+      background: var(--card);
+      border: 1px solid var(--line);
+      border-radius: 18px;
+      padding: 22px;
+      box-shadow: 0 10px 30px rgba(17, 34, 51, 0.06);
+    }}
+    table {{
+      width: 100%;
+      border-collapse: collapse;
+      margin-top: 12px;
+    }}
+    th, td {{
+      text-align: left;
+      border-top: 1px solid var(--line);
+      padding: 10px 8px;
+      vertical-align: top;
+    }}
+    th {{ width: 220px; color: var(--muted); font-weight: 600; }}
+    .section-title {{
+      margin-top: 10px;
+      padding-left: 6px;
+      border-left: 4px solid var(--accent);
+    }}
+    @media (max-width: 720px) {{
+      body {{ padding: 18px; }}
+      th {{ width: 140px; }}
+    }}
+  </style>
+</head>
+<body>
+  <main class="layout">
+    <section class="hero">
+      <h1>{html.escape(title)}</h1>
+      <p>Google Benchmark comparison report with automatically collected machine data and relative speedups.</p>
+      <p>Loaded benchmark runs: {len(runs)} | Comparison groups: {len(groups)}</p>
+    </section>
+    <div class="grid">
+      {system_info_section(system_info)}
+      {overall_section(runs)}
+    </div>
+    <h2 class="section-title">Relative speedups</h2>
+    {comparison_sections}
+  </main>
+</body>
+</html>
+"""
+
+
+def parse_args() -> argparse.Namespace:  # command line: one or more JSON files, --output (required), --title
+    parser = argparse.ArgumentParser(description="Generate a local HTML benchmark report with graphs and speedups.")
+    parser.add_argument("json_files", nargs="+", help="Google Benchmark JSON files")  # nargs="+": at least one file
+    parser.add_argument("--output", required=True, help="Output HTML file")
+    parser.add_argument("--title", default="ThreadSchedule benchmark report", help="Report title")
+    return parser.parse_args()
+
+
+def main() -> int:  # CLI entry point: load the JSON files, build the report, write it; returns 0
+    args = parse_args()
+    json_paths = [Path(value) for value in args.json_files]
+    runs: list[Run] = []
+    for path in json_paths:
+        runs.extend(load_runs(path))
+
+    if not runs:
+        raise SystemExit("No benchmark runs found in the provided JSON files.")  # stop before writing an empty report
+
+    system_info = collect_system_info()
+    groups = build_comparisons(runs)
+    html_payload = build_html(args.title, runs, groups, system_info)
+
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)  # create missing output directories
+    output_path.write_text(html_payload, encoding="utf-8")
+    print(f"Wrote benchmark report to {output_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/generate_readme_graphs.py b/benchmarks/generate_readme_graphs.py
new file mode 100644
index 0000000..bbe5f2a
--- /dev/null
+++ b/benchmarks/generate_readme_graphs.py
@@ -0,0 +1,538 @@
+#!/usr/bin/env python3
+"""Generate standalone SVG charts from Google Benchmark JSON for README embedding.
+
+This intentionally has no third-party dependencies (no matplotlib): it emits
+self-contained SVG files with an explicit light background so they render well
+in both light and dark GitHub themes.
+"""
+
+from __future__ import annotations
+
+import argparse
+import html
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+TIME_TO_NS = {"ns": 1.0, "us": 1_000.0, "ms": 1_000_000.0, "s": 1_000_000_000.0}
+
+POOL_ORDER = ("ThreadPool", "FastThreadPool", "HighPerformancePool", "LightweightPool")
+POOL_COLORS = {
+    "ThreadPool": "#2a7fff",
+    "FastThreadPool": "#16a34a",
+    "HighPerformancePool": "#f59e0b",
+    "LightweightPool": "#db2777",
+}
+VARIANT_COLORS = {
+    "submit(future)": "#2a7fff",
+    "post(fire-forget)": "#16a34a",
+}
+WORKLOAD_ORDER = ("tiny", "medium", "heavy", "imbalanced")
+
+CXX_COLORS = {"C++17": "#94a3b8", "C++20": "#2a7fff", "C++23": "#16a34a", "C++26": "#db2777"}
+CALLABLE_BATCH = 256  # kBatch in callable_std_benchmarks.cpp
+CALLABLE_CAPTURES = (("Small", "small (8 B)"), ("Medium", "medium (48 B)"), ("Large", "large (128 B)"))
+
+INK = "#122033"
+MUTED = "#5b6b82"
+LINE = "#dbe3ef"
+BG = "#ffffff"
+
+
+@dataclass
+class Entry:  # one non-aggregate benchmark row from a Google Benchmark JSON file
+    family: str  # benchmark name up to the first "/"
+    args: tuple[str, ...]  # the "/"-separated name segments that follow the family
+    label: str  # the row's "label" field, "" when absent
+    time_ns: float  # real_time (cpu_time when real_time is missing) converted to nanoseconds
+    items_per_second: float  # the row's "items_per_second" counter, 0.0 when absent
+
+
+def load_entries(path: Path) -> list[Entry]:
+    """Parse one Google Benchmark JSON file into Entry records, skipping aggregate rows."""
+    payload = json.loads(path.read_text(encoding="utf-8"))  # decode as UTF-8 instead of the locale default
+    entries: list[Entry] = []
+    for bench in payload.get("benchmarks", []):
+        if bench.get("run_type") == "aggregate" or bench.get("aggregate_name"):
+            continue
+        if "real_time" not in bench and "cpu_time" not in bench:
+            continue
+        family, *arg_parts = str(bench["name"]).split("/")  # "BM_Foo/8/1" -> family "BM_Foo", args ("8", "1")
+        raw = float(bench.get("real_time", bench.get("cpu_time")))
+        entries.append(
+            Entry(
+                family=family,
+                args=tuple(arg_parts),
+                label=str(bench.get("label", "")),
+                time_ns=raw * TIME_TO_NS.get(str(bench.get("time_unit", "ns")), 1.0),
+                items_per_second=float(bench.get("items_per_second", 0.0)),
+            )
+        )
+    return entries
+
+
+def fmt_time(time_ns: float) -> str:
+    """Render a nanosecond duration in the largest unit (s, ms, us) it reaches, else whole ns."""
+    # Units are tried largest first; anything below one microsecond falls through to ns.
+    units = ((1_000_000_000.0, "s"), (1_000_000.0, "ms"), (1_000.0, "us"))
+    for scale, suffix in units:
+        if time_ns >= scale:
+            return f"{time_ns / scale:.2f} {suffix}"
+    return f"{time_ns:.0f} ns"
+
+
+def svg_header(width: int, height: int, title: str, subtitle: str) -> list[str]:  # opening <svg> tag, background, title and subtitle; the caller appends "</svg>"
+    return [
+        f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {width} {height}" '
+        f'width="{width}" height="{height}" font-family="\'Segoe UI\', Helvetica, Arial, sans-serif">',
+        f'<rect x="0" y="0" width="{width}" height="{height}" rx="14" fill="{BG}" stroke="{LINE}" />',  # explicit background rectangle behind the whole chart
+        f'<text x="24" y="34" fill="{INK}" font-size="18" font-weight="700">{html.escape(title)}</text>',
+        f'<text x="24" y="56" fill="{MUTED}" font-size="13">{html.escape(subtitle)}</text>',
+    ]
+
+
+def horizontal_bar_chart(
+    title: str,
+    subtitle: str,
+    items: list[tuple[str, float, str, str]],
+    value_suffix: str = "",  # accepted but never read in this body
+) -> str:
+    """Return a standalone SVG of horizontal bars; items: (label, value, annotation, color)."""
+    width = 760
+    top = 80  # first bar starts below the title and subtitle drawn by svg_header()
+    bar_h = 34
+    gap = 22
+    label_w = 200
+    right_pad = 150
+    chart_w = width - label_w - right_pad - 24
+    height = top + len(items) * (bar_h + gap) + 16
+    max_value = max(v for _, v, _, _ in items) or 1.0  # raises ValueError on empty items; `or 1.0` guards an all-zero chart
+
+    parts = svg_header(width, height, title, subtitle)
+    for i, (label, value, annotation, color) in enumerate(items):
+        y = top + i * (bar_h + gap)
+        bar_w = max(3.0, chart_w * (value / max_value))  # never narrower than 3 units
+        parts.append(
+            f'<text x="24" y="{y + bar_h / 2 + 5:.0f}" fill="{INK}" font-size="13" font-weight="600">{html.escape(label)}</text>'
+        )
+        parts.append(
+            f'<rect x="{label_w}" y="{y}" width="{bar_w:.1f}" height="{bar_h}" rx="6" fill="{color}" />'
+        )
+        parts.append(
+            f'<text x="{label_w + bar_w + 10:.1f}" y="{y + bar_h / 2 + 5:.0f}" fill="{INK}" '
+            f'font-size="13" font-weight="600">{html.escape(annotation)}</text>'
+        )
+    parts.append("</svg>")
+    return "\n".join(parts)
+
+
+def grouped_bar_chart(
+    title: str,
+    subtitle: str,
+    group_labels: list[str],
+    series: list[tuple[str, list[float], str]],
+    y_axis_label: str,
+) -> str:
+    """series: (name, values_per_group, color)."""
+    width = 820
+    top = 96
+    plot_h = 300
+    left_pad = 64
+    right_pad = 24
+    plot_w = width - left_pad - right_pad
+    height = top + plot_h + 86
+
+    max_value = max((max(vals) for _, vals, _ in series), default=1.0) or 1.0
+    # round max up to a nice number
+    import math
+
+    magnitude = 10 ** math.floor(math.log10(max_value)) if max_value > 0 else 1
+    nice_max = math.ceil(max_value / magnitude) * magnitude
+    if nice_max == 0:
+        nice_max = 1
+
+    parts = svg_header(width, height, title, subtitle)
+
+    baseline_y = top + plot_h
+    # gridlines + y ticks
+    ticks = 5
+    for t in range(ticks + 1):
+        gy = baseline_y - plot_h * t / ticks
+        val = nice_max * t / ticks
+        parts.append(
+            f'<line x1="{left_pad}" y1="{gy:.1f}" x2="{width - right_pad}" y2="{gy:.1f}" stroke="{LINE}" stroke-width="1" />'
+        )
+        parts.append(
+            f'<text x="{left_pad - 8}" y="{gy + 4:.1f}" fill="{MUTED}" font-size="11" text-anchor="end">{val:.1f}</text>'
+        )
+    parts.append(
+        f'<text x="16" y="{top - 8}" fill="{MUTED}" font-size="12" font-weight="600">{html.escape(y_axis_label)}</text>'
+    )
+
+    n_groups = len(group_labels)
+    n_series = len(series)
+    group_w = plot_w / n_groups
+    inner_pad = group_w * 0.16
+    bar_w = (group_w - 2 * inner_pad) / n_series
+
+    for g in range(n_groups):
+        gx = left_pad + g * group_w
+        for s, (_, vals, color) in enumerate(series):
+            value = vals[g]
+            bh = plot_h * (value / nice_max)
+            bx = gx + inner_pad + s * bar_w
+            parts.append(
+                f'<rect x="{bx:.1f}" y="{baseline_y - bh:.1f}" width="{bar_w - 2:.1f}" height="{bh:.1f}" rx="3" fill="{color}" />'
+            )
+        parts.append(
+            f'<text x="{gx + group_w / 2:.1f}" y="{baseline_y + 18:.1f}" fill="{INK}" '
+            f'font-size="12" font-weight="600" text-anchor="middle">{html.escape(group_labels[g])}</text>'
+        )
+
+    # legend
+    legend_y = baseline_y + 44
+    lx = left_pad
+    for name, _, color in series:
+        parts.append(f'<rect x="{lx}" y="{legend_y}" width="14" height="14" rx="3" fill="{color}" />')
+        parts.append(
+            f'<text x="{lx + 20}" y="{legend_y + 12}" fill="{INK}" font-size="12">{html.escape(name)}</text>'
+        )
+        lx += 30 + len(name) * 7.2
+    parts.append("</svg>")
+    return "\n".join(parts)
+
+
+def pool_runs_by_tasks(entries: list[Entry]) -> dict[str, dict[str, Entry]]:
+    """tasks -> pool_name -> Entry."""
+    out: dict[str, dict[str, Entry]] = {}
+    for e in entries:
+        if e.family != "BM_ComparePoolTypes_LightWorkload":
+            continue
+        # Label format is "<PoolName> tasks=N"; match the leading token so that
+        # "ThreadPool" does not shadow "FastThreadPool" via substring matching.
+        first_token = e.label.split()[0] if e.label else ""
+        pool = first_token if first_token in POOL_ORDER else None
+        if not pool:
+            continue
+        tasks = e.args[0] if e.args else "?"
+        out.setdefault(tasks, {})[pool] = e
+    return out
+
+
+def build_pool_comparison(entries: list[Entry], out_dir: Path) -> Path | None:
+    """Write pool_comparison.svg for the largest task count; return its path, or None without data."""
+    by_tasks = pool_runs_by_tasks(entries)
+    if not by_tasks:
+        return None
+    tasks = max(by_tasks, key=lambda t: int(t) if t.isdigit() else 0)
+    # pool_runs_by_tasks() keys runs without a size argument as "?", which int() cannot parse.
+    task_text = f"{int(tasks):,}" if tasks.isdigit() else tasks
+    pools = by_tasks[tasks]
+    baseline = pools.get("ThreadPool")
+    items: list[tuple[str, float, str, str]] = []
+    ordered = sorted(pools.items(), key=lambda kv: kv[1].time_ns)
+    for pool, e in ordered:
+        if baseline and baseline.time_ns > 0 and e.time_ns > 0:
+            ratio = baseline.time_ns / e.time_ns
+            rel = "baseline" if abs(ratio - 1.0) < 0.02 else (
+                f"{ratio:.2f}x faster" if ratio >= 1.0 else f"{1.0 / ratio:.2f}x slower"
+            )
+        else:
+            rel = ""
+        annotation = f"{fmt_time(e.time_ns)}  ({rel})" if rel else fmt_time(e.time_ns)
+        items.append((pool, e.time_ns, annotation, POOL_COLORS.get(pool, "#2a7fff")))
+    svg = horizontal_bar_chart(
+        "Thread pool comparison \u2014 light workload",
+        f"Wall-clock time to run {task_text} tiny tasks (lower is better, relative to ThreadPool)",
+        items,
+    )
+    path = out_dir / "pool_comparison.svg"
+    path.write_text(svg, encoding="utf-8")
+    return path
+
+
+def build_pool_throughput(entries: list[Entry], out_dir: Path) -> Path | None:
+    by_tasks = pool_runs_by_tasks(entries)
+    if not by_tasks:
+        return None
+    task_keys = sorted((t for t in by_tasks if t.isdigit()), key=lambda t: int(t))
+    group_labels = [f"{int(t):,}" for t in task_keys]
+    series: list[tuple[str, list[float], str]] = []
+    for pool in POOL_ORDER:
+        vals = []
+        for t in task_keys:
+            e = by_tasks[t].get(pool)
+            vals.append((e.items_per_second / 1_000_000.0) if e else 0.0)
+        if any(v > 0 for v in vals):
+            series.append((pool, vals, POOL_COLORS.get(pool, "#2a7fff")))
+    if not series:
+        return None
+    svg = grouped_bar_chart(
+        "Thread pool throughput by batch size",
+        "Tasks processed per second for the light workload (higher is better)",
+        group_labels,
+        series,
+        "M tasks / second",
+    )
+    path = out_dir / "pool_throughput.svg"
+    path.write_text(svg, encoding="utf-8")
+    return path
+
+
+def build_post_vs_submit(entries: list[Entry], out_dir: Path) -> Path | None:
+    """Write post_vs_submit.svg for the largest task count; return its path, or None without data."""
+    by_tasks: dict[str, dict[str, Entry]] = {}
+    for e in entries:
+        if e.family == "BM_PostVsSubmit":
+            by_tasks.setdefault(e.args[0] if e.args else "?", {})[e.label] = e
+    if not by_tasks:
+        return None
+    tasks = max(by_tasks, key=lambda t: int(t) if t.isdigit() else 0)
+    task_text = f"{int(tasks):,}" if tasks.isdigit() else tasks  # the "?" placeholder cannot go through int()
+    variants = by_tasks[tasks]
+    submit = variants.get("submit(future)")
+    items: list[tuple[str, float, str, str]] = []
+    for name, e in sorted(variants.items(), key=lambda kv: kv[1].time_ns):
+        if submit and submit.time_ns > 0 and e.time_ns > 0:
+            ratio = submit.time_ns / e.time_ns
+            rel = "baseline" if abs(ratio - 1.0) < 0.02 else (
+                f"{ratio:.2f}x faster" if ratio > 1.0 else f"{1.0 / ratio:.2f}x slower"
+            )
+            annotation = f"{fmt_time(e.time_ns)}  ({rel})"
+        else:
+            annotation = fmt_time(e.time_ns)
+        items.append((name, e.time_ns, annotation, VARIANT_COLORS.get(name, "#2a7fff")))
+    svg = horizontal_bar_chart(
+        "post() vs submit()",
+        f"Submission overhead for {task_text} tasks: post() skips the future/packaged_task path (lower is better)",
+        items,
+    )
+    path = out_dir / "post_vs_submit.svg"
+    path.write_text(svg, encoding="utf-8")
+    return path
+
+
+def build_pool_workload(entries: list[Entry], out_dir: Path) -> Path | None:
+    by_wl: dict[str, dict[str, Entry]] = {}
+    for e in entries:
+        if e.family != "BM_ComparePoolWorkload" or not e.label:
+            continue
+        tokens = e.label.split()
+        if len(tokens) < 2:
+            continue
+        pool, wl = tokens[0], tokens[1]
+        if pool not in POOL_ORDER:
+            continue
+        by_wl.setdefault(wl, {})[pool] = e
+    if not by_wl:
+        return None
+
+    group_labels = [wl for wl in WORKLOAD_ORDER if wl in by_wl]
+    series: list[tuple[str, list[float], str]] = []
+    for pool in POOL_ORDER:
+        vals: list[float] = []
+        for wl in group_labels:
+            row = by_wl[wl]
+            best = min((r.time_ns for r in row.values()), default=0.0) or 1.0
+            e = row.get(pool)
+            vals.append((e.time_ns / best) if e else 0.0)
+        if any(v > 0 for v in vals):
+            series.append((pool, vals, POOL_COLORS.get(pool, "#2a7fff")))
+    if not series:
+        return None
+
+    svg = grouped_bar_chart(
+        "Which pool wins depends on the workload",
+        "Time relative to the fastest pool per workload (1.0 = winner, shorter is better; pool built once, 4 threads)",
+        group_labels,
+        series,
+        "relative time (1.0 = fastest)",
+    )
+    path = out_dir / "pool_workload.svg"
+    path.write_text(svg, encoding="utf-8")
+    return path
+
+
+def _reflection_pair(  # Two-bar chart comparing a hand-written benchmark family with its reflection-based counterpart.
+    entries: list[Entry],
+    manual_family: str,  # benchmark family used as the baseline
+    reflect_family: str,  # benchmark family compared against the baseline
+    title: str,
+    subtitle: str,  # formatted with a `size` keyword argument before rendering
+    manual_label: str,
+    reflect_label: str,
+    out_name: str,  # file name created inside out_dir
+    out_dir: Path,
+) -> Path | None:  # path of the written SVG, or None when the two families share no args
+    manual = {e.args: e for e in entries if e.family == manual_family}  # keyed by benchmark args
+    reflect = {e.args: e for e in entries if e.family == reflect_family}
+    common = sorted(set(manual) & set(reflect), key=lambda a: int(a[0]) if a and a[0].isdigit() else 0)  # non-numeric or empty args sort as 0
+    if not common:
+        return None
+    args = common[-1]  # largest registry size
+    m = manual[args]
+    r = reflect[args]
+    size = args[0] if args else "?"
+    items: list[tuple[str, float, str, str]] = []  # (label, time in ns, annotation text, bar color)
+    for label, e, color in (
+        (manual_label, m, "#2a7fff"),
+        (reflect_label, r, "#f59e0b"),
+    ):
+        if m.time_ns > 0 and e.time_ns > 0:  # a ratio is only shown when both times are positive
+            ratio = e.time_ns / m.time_ns  # relative to the manual entry, so the manual bar itself gets 1.0
+            rel = "baseline" if abs(ratio - 1.0) < 0.02 else (  # within 2% of the manual time is labelled "baseline"
+                f"{ratio:.2f}x slower" if ratio > 1.0 else f"{1.0 / ratio:.2f}x faster"
+            )
+            annotation = f"{fmt_time(e.time_ns)}  ({rel})"
+        else:
+            annotation = fmt_time(e.time_ns)
+        items.append((label, e.time_ns, annotation, color))
+    svg = horizontal_bar_chart(title, subtitle.format(size=f"{int(size):,}" if size.isdigit() else size), items)  # numeric sizes get thousands separators
+    path = out_dir / out_name
+    path.write_text(svg, encoding="utf-8")
+    return path
+
+
+def build_reflection_query(entries: list[Entry], out_dir: Path) -> Path | None:  # Write reflection_query.svg via _reflection_pair; None when that helper returns None.
+    return _reflection_pair(
+        entries,
+        "BM_QueryView_FilterMapName",  # manual (baseline) family
+        "BM_QueryView_ReflectionWhereProjectName",  # reflection family
+        "Reflection registry query: project a field",
+        "Selecting + projecting one field over {size} registered threads (lower is better)",
+        "filter + map (hand-written)",
+        "where + project (reflection)",
+        "reflection_query.svg",
+        out_dir,
+    )
+
+
+def build_reflection_lookup(entries: list[Entry], out_dir: Path) -> Path | None:  # Write reflection_lookup.svg via _reflection_pair; None when that helper returns None.
+    return _reflection_pair(
+        entries,
+        "BM_QueryView_FindIf",  # manual (baseline) family
+        "BM_QueryView_ReflectionFindBy",  # reflection family
+        "Reflection registry query: find by field",
+        "Locating a single entry by name over {size} registered threads (lower is better)",
+        "find_if (hand-written)",
+        "find_by (reflection)",
+        "reflection_lookup.svg",
+        out_dir,
+    )
+
+
+def load_callable_medians(path: Path) -> dict[str, float]:
+    """Return family -> per-task time (ns) from an aggregate-only callable JSON."""
+    payload = json.loads(path.read_text())
+    out: dict[str, float] = {}
+    for bench in payload.get("benchmarks", []):
+        if bench.get("aggregate_name") != "median":
+            continue
+        family = str(bench.get("run_name", bench.get("name", ""))).split("/")[0]
+        unit = str(bench.get("time_unit", "ns"))
+        ns = float(bench.get("real_time", 0.0)) * TIME_TO_NS.get(unit, 1.0)
+        out[family] = ns / CALLABLE_BATCH
+    return out
+
+
+def standard_from_filename(path: Path) -> str | None:  # Map a file name containing "cxxNN" to "C++NN"; None when no known token is present.
+    name = path.name  # only the final path component is inspected
+    for token in ("cxx17", "cxx20", "cxx23", "cxx26"):
+        if token in name:  # substring match; the first token in tuple order wins
+            return "C++" + token[3:]  # e.g. "cxx17" -> "C++17"
+    return None
+
+
+def build_callable_charts(std_medians: dict[str, dict[str, float]], out_dir: Path) -> list[Path]:  # Write the callable charts from per-standard medians; returns the paths actually written.
+    std_order = [s for s in ("C++17", "C++20", "C++23", "C++26") if s in std_medians]  # fixed oldest-to-newest order, limited to standards with data
+    if not std_order:
+        return []
+    group_labels = [label for _, label in CALLABLE_CAPTURES]  # CALLABLE_CAPTURES is iterated as (key, label) pairs
+    paths: list[Path] = []
+
+    # Chart A: move_callable cost across standards (std::function vs move_only_function).
+    series_a: list[tuple[str, list[float], str]] = []
+    for std in std_order:
+        vals = [std_medians[std].get(f"BM_MoveCallable_{key}", 0.0) for key, _ in CALLABLE_CAPTURES]  # missing families count as 0.0
+        if any(v > 0 for v in vals):  # a standard with no positive value gets no series
+            series_a.append((std, vals, CXX_COLORS[std]))
+    if series_a:
+        svg = grouped_bar_chart(
+            "Does replacing std::function help? (ThreadPool task storage)",
+            "Build + invoke cost per task for detail::move_callable "
+            "(std::function on C++17/20, std::move_only_function on C++23+); lower is better",
+            group_labels,
+            series_a,
+            "ns per task",
+        )
+        path = out_dir / "callable_standards.svg"
+        path.write_text(svg, encoding="utf-8")
+        paths.append(path)
+
+    # Chart B: SBO callable vs std-library callable at the newest available standard.
+    newest = std_order[-1]  # last element of the oldest-to-newest list
+    medians = std_medians[newest]
+    move_vals = [medians.get(f"BM_MoveCallable_{key}", 0.0) for key, _ in CALLABLE_CAPTURES]
+    sbo_vals = [medians.get(f"BM_Sbo_{key}", 0.0) for key, _ in CALLABLE_CAPTURES]
+    if any(v > 0 for v in move_vals) and any(v > 0 for v in sbo_vals):  # both series need at least one positive value
+        series_b = [
+            ("move_callable (ThreadPool / std lib)", move_vals, "#2a7fff"),
+            ("SboCallable (LightweightPool)", sbo_vals, "#db2777"),
+        ]
+        svg = grouped_bar_chart(
+            f"Do the SBO callables help? ({newest})",
+            "Per-task cost; the 48 B capture fits the SBO buffer but spills the std-library "
+            "callable to the heap (lower is better)",
+            group_labels,
+            series_b,
+            "ns per task",
+        )
+        path = out_dir / "callable_sbo.svg"
+        path.write_text(svg, encoding="utf-8")
+        paths.append(path)
+
+    return paths
+
+
+def parse_args() -> argparse.Namespace:  # Parse the command line: one or more JSON inputs plus a mandatory --output-dir.
+    parser = argparse.ArgumentParser(description=__doc__)  # the module docstring doubles as the help description
+    parser.add_argument("json_files", nargs="+", help="Google Benchmark JSON files")
+    parser.add_argument("--output-dir", required=True, help="Directory for generated SVG files")
+    return parser.parse_args()
+
+
+def main() -> int:  # Load every input file, build all charts into --output-dir, print what was written; returns 0.
+    args = parse_args()
+    entries: list[Entry] = []
+    std_medians: dict[str, dict[str, float]] = {}  # "C++NN" -> family -> per-task ns
+    for value in args.json_files:
+        path = Path(value)
+        standard = standard_from_filename(path)
+        if standard:  # a "cxxNN" token in the file name routes the file to the callable-median loader
+            std_medians[standard] = load_callable_medians(path)  # a second file for the same standard replaces the first
+        else:
+            entries.extend(load_entries(path))
+
+    if not entries and not std_medians:
+        raise SystemExit("No benchmark entries found in the provided JSON files.")
+
+    out_dir = Path(args.output_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    generated: list[Path | None] = [  # each builder yields a path, or None when it produced no chart
+        build_pool_throughput(entries, out_dir),
+        build_pool_comparison(entries, out_dir),
+        build_pool_workload(entries, out_dir),
+        build_post_vs_submit(entries, out_dir),
+        build_reflection_query(entries, out_dir),
+        build_reflection_lookup(entries, out_dir),
+    ]
+    generated.extend(build_callable_charts(std_medians, out_dir))
+    for path in generated:
+        if path:  # skip the None placeholders
+            print(f"Wrote {path}")
+    return 0
+
+
+if __name__ == "__main__":  # run main() only when executed as a script
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/benchmarks/reflection_registry_benchmarks.cpp b/benchmarks/reflection_registry_benchmarks.cpp
new file mode 100644
index 0000000..b465bae
--- /dev/null
+++ b/benchmarks/reflection_registry_benchmarks.cpp
@@ -0,0 +1,80 @@
+#include <benchmark/benchmark.h>
+#include <threadschedule/thread_registry.hpp>
+#include <string>
+#include <vector>
+
+#if !defined(THREADSCHEDULE_HAS_REFLECTION) || !THREADSCHEDULE_HAS_REFLECTION
+#error "reflection_registry_benchmarks.cpp requires THREADSCHEDULE_HAS_REFLECTION"
+#endif
+
+using namespace threadschedule;
+
+namespace
+{
+
+auto make_entries(std::size_t count) -> std::vector<RegisteredThreadInfo> // Build `count` synthetic registry entries for the benchmarks below.
+{
+    std::vector<RegisteredThreadInfo> entries;
+    entries.reserve(count); // one allocation up front
+    for (std::size_t index = 0; index < count; ++index)
+    {
+        RegisteredThreadInfo info{};
+        info.tid = static_cast<Tid>(index + 1); // tids start at 1
+        info.name = "worker-" + std::to_string(index); // names are "worker-0", "worker-1", ...
+        info.componentTag = (index % 3 == 0) ? "io" : ((index % 3 == 1) ? "compute" : "scheduler"); // tags cycle io / compute / scheduler
+        info.alive = (index % 5) != 0; // every fifth entry (index 0, 5, 10, ...) is marked not alive
+        entries.push_back(std::move(info));
+    }
+    return entries;
+}
+
+} // namespace
+
+static void BM_QueryView_FilterMapName(benchmark::State& state) // Hand-written variant: lambda filter on componentTag, then lambda map to name.
+{
+    ThreadRegistry::QueryView view(make_entries(static_cast<std::size_t>(state.range(0)))); // built once, outside the timed loop; range(0) = entry count
+    for (auto _ : state)
+    {
+        auto names = view.filter([](RegisteredThreadInfo const& entry) { return entry.componentTag == "io"; })
+                         .map([](RegisteredThreadInfo const& entry) { return entry.name; });
+        benchmark::DoNotOptimize(names); // keep the result observable to the optimizer
+    }
+}
+
+static void BM_QueryView_ReflectionWhereProjectName(benchmark::State& state) // Reflection variant of BM_QueryView_FilterMapName: where<componentTag>("io") then project<name>.
+{
+    ThreadRegistry::QueryView view(make_entries(static_cast<std::size_t>(state.range(0)))); // built once, outside the timed loop; range(0) = entry count
+    for (auto _ : state)
+    {
+        auto names =
+            view.where<registered_thread_fields::componentTag()>("io").project<registered_thread_fields::name()>();
+        benchmark::DoNotOptimize(names); // keep the result observable to the optimizer
+    }
+}
+
+static void BM_QueryView_FindIf(benchmark::State& state) // Hand-written variant: find_if with a lambda comparing entry.name to "worker-42".
+{
+    ThreadRegistry::QueryView view(make_entries(static_cast<std::size_t>(state.range(0)))); // built once, outside the timed loop; range(0) = entry count
+    for (auto _ : state)
+    {
+        auto found = view.find_if([](RegisteredThreadInfo const& entry) { return entry.name == "worker-42"; }); // make_entries gives this name to index 42
+        benchmark::DoNotOptimize(found); // keep the result observable to the optimizer
+    }
+}
+
+static void BM_QueryView_ReflectionFindBy(benchmark::State& state) // Reflection variant of BM_QueryView_FindIf: find_by<name> with the value "worker-42".
+{
+    ThreadRegistry::QueryView view(make_entries(static_cast<std::size_t>(state.range(0)))); // built once, outside the timed loop; range(0) = entry count
+    for (auto _ : state)
+    {
+        auto found = view.find_by<registered_thread_fields::name()>(std::string("worker-42")); // NOTE(review): the std::string temporary is built inside the timed loop, unlike the find_if variant — confirm this is intended
+        benchmark::DoNotOptimize(found); // keep the result observable to the optimizer
+    }
+}
+
+BENCHMARK(BM_QueryView_FilterMapName)->Arg(256)->Arg(4096)->Arg(16384); // Arg = number of entries passed to make_entries
+BENCHMARK(BM_QueryView_ReflectionWhereProjectName)->Arg(256)->Arg(4096)->Arg(16384); // same sizes as the hand-written counterpart
+BENCHMARK(BM_QueryView_FindIf)->Arg(256)->Arg(4096)->Arg(16384);
+BENCHMARK(BM_QueryView_ReflectionFindBy)->Arg(256)->Arg(4096)->Arg(16384); // same sizes as the hand-written counterpart
+
+BENCHMARK_MAIN(); // Google Benchmark supplies main() for this translation unit
diff --git a/benchmarks/threadpool_benchmarks.cpp b/benchmarks/threadpool_benchmarks.cpp
index 71ceb53..e9b7383 100644
--- a/benchmarks/threadpool_benchmarks.cpp
+++ b/benchmarks/threadpool_benchmarks.cpp
@@ -456,6 +456,103 @@ static void BM_ComparePoolTypes_LightWorkload(benchmark::State& state)
     state.SetLabel(pool_names[pool_type] + " tasks=" + std::to_string(num_tasks));
 }
 
+// =============================================================================
+// Pool comparison across workload weights (pool constructed once, not per-iter)
+// =============================================================================
+// Unlike BM_ComparePoolTypes_LightWorkload (which rebuilds the pool every
+// iteration and only runs a light task), this benchmark builds the pool once and
+// sweeps the per-task work. It shows how the best pool changes with workload:
+//   - tiny     : submission overhead dominates -> LightweightPool wins
+//   - heavy    : execution dominates -> the field converges
+//   - imbalanced: a few tasks are far heavier than the rest -> the work-stealing
+//                 HighPerformancePool balances the load and pulls ahead
+static void bench_busy_work(int iters) // Burn CPU for `iters` multiply-add steps; the volatile sink keeps the loop from being optimized away.
+{
+    volatile unsigned long long sum = 0; // at least 64 bits on every target; `long` is 32-bit on LLP64 and the sum of squares would overflow it (UB)
+    for (int i = 0; i < iters; ++i)
+        sum = sum + static_cast<unsigned long long>(i) * static_cast<unsigned long long>(i); // explicit read-modify-write: `+=` on a volatile is deprecated in C++20
+}
+
+// Busy-loop iteration count for task `task_index` under `workload` (0 = tiny, 1 = medium, 2 = heavy, any other value = imbalanced).
+static int bench_work_iters(int workload, size_t task_index)
+{
+    switch (workload)
+    {
+    case 0: // tiny: pure scheduling overhead
+        return 50;
+    case 1: // medium: a few microseconds of work each
+        return 2000;
+    case 2: // heavy: uniform, execution-bound
+        return 30000;
+    default: // imbalanced: every 16th task is very heavy, the rest are tiny
+        return (task_index % 16 == 0) ? 120000 : 50; // task_index is only consulted in this branch
+    }
+}
+
+static void BM_ComparePoolWorkload(benchmark::State& state) // Times 4000 tasks on a 4-thread pool built once per benchmark run; range(0) picks the pool, range(1) the workload.
+{
+    size_t const num_threads = 4;
+    size_t const num_tasks = 4000;
+    int const pool_type = static_cast<int>(state.range(0)); // index into pool_names
+    int const workload = static_cast<int>(state.range(1)); // index into workload_names, also passed to bench_work_iters
+
+    char const* const workload_names[] = {"tiny", "medium", "heavy", "imbalanced"};
+    char const* const pool_names[] = {"ThreadPool", "FastThreadPool", "HighPerformancePool", "LightweightPool"};
+
+    auto submit_loop = [&](auto& pool) { // shared timed loop for the pools driven through submit()
+        for (auto _ : state)
+        {
+            std::vector<std::future<void>> futures;
+            futures.reserve(num_tasks);
+            for (size_t i = 0; i < num_tasks; ++i)
+                futures.push_back(pool.submit([workload, i]() { bench_busy_work(bench_work_iters(workload, i)); }));
+            for (auto& f : futures)
+                f.wait(); // an iteration ends only after every task has finished
+        }
+    };
+
+    if (pool_type == 0)
+    {
+        ThreadPool pool(num_threads);
+        pool.configure_threads("bench");
+        submit_loop(pool);
+    }
+    else if (pool_type == 1)
+    {
+        FastThreadPool pool(num_threads);
+        pool.configure_threads("bench");
+        submit_loop(pool);
+    }
+    else if (pool_type == 2)
+    {
+        HighPerformancePool pool(num_threads);
+        pool.configure_threads("bench");
+        pool.distribute_across_cpus(); // only this pool gets the extra CPU-distribution call
+        submit_loop(pool);
+    }
+    else // any other pool_type; the registered args only use 3 here
+    {
+        LightweightPool pool(num_threads);
+        pool.configure_threads("bench");
+        for (auto _ : state)
+        {
+            std::atomic<size_t> counter{0}; // completion counter, fresh for every iteration
+            for (size_t i = 0; i < num_tasks; ++i)
+            {
+                pool.post([&counter, workload, i]() { // post() instead of submit(): completion is tracked via the counter, not futures
+                    bench_busy_work(bench_work_iters(workload, i));
+                    counter.fetch_add(1, std::memory_order_relaxed);
+                });
+            }
+            while (counter.load(std::memory_order_acquire) < num_tasks) // wait until all tasks have counted in, so `counter` outlives every reference to it
+                std::this_thread::yield();
+        }
+    }
+
+    state.SetItemsProcessed(state.iterations() * num_tasks); // items = tasks executed over all iterations
+    state.SetLabel(std::string(pool_names[pool_type]) + " " + workload_names[workload]); // label format: "<pool> <workload>"
+}
+
 // =============================================================================
 // Post vs Submit comparison (fire-and-forget overhead on pools that support both)
 // =============================================================================
@@ -663,6 +760,27 @@ BENCHMARK(BM_ComparePoolTypes_LightWorkload)
     ->Args({100000, 3})
     ->Unit(benchmark::kMillisecond);
 
+// Pool comparison across workload weights (pool built once)
+// Args: {pool_type 0..3, workload 0=tiny 1=medium 2=heavy 3=imbalanced}
+BENCHMARK(BM_ComparePoolWorkload)
+    ->Args({0, 0}) // workload 0 (tiny), pools 0..3
+    ->Args({1, 0})
+    ->Args({2, 0})
+    ->Args({3, 0})
+    ->Args({0, 1}) // workload 1 (medium), pools 0..3
+    ->Args({1, 1})
+    ->Args({2, 1})
+    ->Args({3, 1})
+    ->Args({0, 2}) // workload 2 (heavy), pools 0..3
+    ->Args({1, 2})
+    ->Args({2, 2})
+    ->Args({3, 2})
+    ->Args({0, 3}) // workload 3 (imbalanced), pools 0..3
+    ->Args({1, 3})
+    ->Args({2, 3})
+    ->Args({3, 3})
+    ->Unit(benchmark::kMillisecond);
+
 // Post vs Submit overhead comparison
 BENCHMARK(BM_PostVsSubmit)
     ->Args({1000, 0})
diff --git a/docs/CMAKE_REFERENCE.md b/docs/CMAKE_REFERENCE.md
index d90ee83..396d619 100644
--- a/docs/CMAKE_REFERENCE.md
+++ b/docs/CMAKE_REFERENCE.md
@@ -8,6 +8,7 @@
 | `THREADSCHEDULE_BUILD_TESTS` | BOOL | OFF | Build unit tests |
 | `THREADSCHEDULE_BUILD_BENCHMARKS` | BOOL | OFF | Build benchmarks (downloads Google Benchmark) |
 | `THREADSCHEDULE_RUNTIME` | BOOL | OFF | Build shared runtime library for process-wide registry |
+| `THREADSCHEDULE_ENABLE_REFLECTION` | BOOL | ON | Enable GCC 16.1+ C++26 reflection APIs and reflection-backed registry queries when supported |
 | `THREADSCHEDULE_INSTALL` | BOOL | ON (main project)<br>OFF (subdirectory) | Generate install targets |
 
 ## CMake Variables
@@ -43,6 +44,16 @@ add_subdirectory(ThreadSchedule)
 ```
 Features: All features + latest language enhancements
 
+### C++26 + GCC Reflection
+```cmake
+set(CMAKE_CXX_STANDARD 26)
+set(THREADSCHEDULE_ENABLE_REFLECTION ON)
+add_subdirectory(ThreadSchedule)
+```
+Features: All regular C++26 features plus `threadschedule::reflect` and
+reflection-backed registry APIs when using GCC 16.1+ with working
+`-freflection` support.
+
 ## Usage Examples
 
 ### Minimal Integration (Default)
diff --git a/docs/benchmarks/callable_sbo.svg b/docs/benchmarks/callable_sbo.svg
new file mode 100644
index 0000000..923540c
--- /dev/null
+++ b/docs/benchmarks/callable_sbo.svg
@@ -0,0 +1,31 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 820 482" width="820" height="482" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="820" height="482" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">Do the SBO callables help? (C++26)</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Per-task cost; the 48 B capture fits the SBO buffer but spills the std-library callable to the heap (lower is better)</text>
+<line x1="64" y1="396.0" x2="796" y2="396.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="400.0" fill="#5b6b82" font-size="11" text-anchor="end">0.0</text>
+<line x1="64" y1="336.0" x2="796" y2="336.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="340.0" fill="#5b6b82" font-size="11" text-anchor="end">6.0</text>
+<line x1="64" y1="276.0" x2="796" y2="276.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="280.0" fill="#5b6b82" font-size="11" text-anchor="end">12.0</text>
+<line x1="64" y1="216.0" x2="796" y2="216.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="220.0" fill="#5b6b82" font-size="11" text-anchor="end">18.0</text>
+<line x1="64" y1="156.0" x2="796" y2="156.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="160.0" fill="#5b6b82" font-size="11" text-anchor="end">24.0</text>
+<line x1="64" y1="96.0" x2="796" y2="96.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="100.0" fill="#5b6b82" font-size="11" text-anchor="end">30.0</text>
+<text x="16" y="88" fill="#5b6b82" font-size="12" font-weight="600">ns per task</text>
+<rect x="103.0" y="363.5" width="81.0" height="32.5" rx="3" fill="#2a7fff" />
+<rect x="186.0" y="364.9" width="81.0" height="31.1" rx="3" fill="#db2777" />
+<text x="186.0" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">small (8 B)</text>
+<rect x="347.0" y="178.7" width="81.0" height="217.3" rx="3" fill="#2a7fff" />
+<rect x="430.0" y="356.8" width="81.0" height="39.2" rx="3" fill="#db2777" />
+<text x="430.0" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">medium (48 B)</text>
+<rect x="591.0" y="192.8" width="81.0" height="203.2" rx="3" fill="#2a7fff" />
+<rect x="674.0" y="162.2" width="81.0" height="233.8" rx="3" fill="#db2777" />
+<text x="674.0" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">large (128 B)</text>
+<rect x="64" y="440" width="14" height="14" rx="3" fill="#2a7fff" />
+<text x="84" y="452" fill="#122033" font-size="12">move_callable (ThreadPool / std lib)</text>
+<rect x="353.2" y="440" width="14" height="14" rx="3" fill="#db2777" />
+<text x="373.2" y="452" fill="#122033" font-size="12">SboCallable (LightweightPool)</text>
+</svg>
\ No newline at end of file
diff --git a/docs/benchmarks/callable_standards.svg b/docs/benchmarks/callable_standards.svg
new file mode 100644
index 0000000..e8c8c2f
--- /dev/null
+++ b/docs/benchmarks/callable_standards.svg
@@ -0,0 +1,41 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 820 482" width="820" height="482" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="820" height="482" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">Does replacing std::function help? (ThreadPool task storage)</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Build + invoke cost per task for detail::move_callable (std::function on C++17/20, std::move_only_function on C++23+); lower is better</text>
+<line x1="64" y1="396.0" x2="796" y2="396.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="400.0" fill="#5b6b82" font-size="11" text-anchor="end">0.0</text>
+<line x1="64" y1="336.0" x2="796" y2="336.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="340.0" fill="#5b6b82" font-size="11" text-anchor="end">6.0</text>
+<line x1="64" y1="276.0" x2="796" y2="276.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="280.0" fill="#5b6b82" font-size="11" text-anchor="end">12.0</text>
+<line x1="64" y1="216.0" x2="796" y2="216.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="220.0" fill="#5b6b82" font-size="11" text-anchor="end">18.0</text>
+<line x1="64" y1="156.0" x2="796" y2="156.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="160.0" fill="#5b6b82" font-size="11" text-anchor="end">24.0</text>
+<line x1="64" y1="96.0" x2="796" y2="96.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="100.0" fill="#5b6b82" font-size="11" text-anchor="end">30.0</text>
+<text x="16" y="88" fill="#5b6b82" font-size="12" font-weight="600">ns per task</text>
+<rect x="103.0" y="346.9" width="39.5" height="49.1" rx="3" fill="#94a3b8" />
+<rect x="144.5" y="350.5" width="39.5" height="45.5" rx="3" fill="#2a7fff" />
+<rect x="186.0" y="364.6" width="39.5" height="31.4" rx="3" fill="#16a34a" />
+<rect x="227.5" y="363.5" width="39.5" height="32.5" rx="3" fill="#db2777" />
+<text x="186.0" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">small (8 B)</text>
+<rect x="347.0" y="181.2" width="39.5" height="214.8" rx="3" fill="#94a3b8" />
+<rect x="388.5" y="181.8" width="39.5" height="214.2" rx="3" fill="#2a7fff" />
+<rect x="430.0" y="198.1" width="39.5" height="197.9" rx="3" fill="#16a34a" />
+<rect x="471.5" y="178.7" width="39.5" height="217.3" rx="3" fill="#db2777" />
+<text x="430.0" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">medium (48 B)</text>
+<rect x="591.0" y="177.7" width="39.5" height="218.3" rx="3" fill="#94a3b8" />
+<rect x="632.5" y="181.9" width="39.5" height="214.1" rx="3" fill="#2a7fff" />
+<rect x="674.0" y="189.0" width="39.5" height="207.0" rx="3" fill="#16a34a" />
+<rect x="715.5" y="192.8" width="39.5" height="203.2" rx="3" fill="#db2777" />
+<text x="674.0" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">large (128 B)</text>
+<rect x="64" y="440" width="14" height="14" rx="3" fill="#94a3b8" />
+<text x="84" y="452" fill="#122033" font-size="12">C++17</text>
+<rect x="130.0" y="440" width="14" height="14" rx="3" fill="#2a7fff" />
+<text x="150.0" y="452" fill="#122033" font-size="12">C++20</text>
+<rect x="196.0" y="440" width="14" height="14" rx="3" fill="#16a34a" />
+<text x="216.0" y="452" fill="#122033" font-size="12">C++23</text>
+<rect x="262.0" y="440" width="14" height="14" rx="3" fill="#db2777" />
+<text x="282.0" y="452" fill="#122033" font-size="12">C++26</text>
+</svg>
\ No newline at end of file
diff --git a/docs/benchmarks/pool_comparison.svg b/docs/benchmarks/pool_comparison.svg
new file mode 100644
index 0000000..c8480ba
--- /dev/null
+++ b/docs/benchmarks/pool_comparison.svg
@@ -0,0 +1,17 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 760 320" width="760" height="320" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="760" height="320" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">Thread pool comparison — light workload</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Wall-clock time to run 100,000 tiny tasks (lower is better, relative to ThreadPool)</text>
+<text x="24" y="102" fill="#122033" font-size="13" font-weight="600">LightweightPool</text>
+<rect x="200" y="80" width="123.4" height="34" rx="6" fill="#db2777" />
+<text x="333.4" y="102" fill="#122033" font-size="13" font-weight="600">38.18 ms  (1.86x faster)</text>
+<text x="24" y="158" fill="#122033" font-size="13" font-weight="600">ThreadPool</text>
+<rect x="200" y="136" width="229.9" height="34" rx="6" fill="#2a7fff" />
+<text x="439.9" y="158" fill="#122033" font-size="13" font-weight="600">71.13 ms  (baseline)</text>
+<text x="24" y="214" fill="#122033" font-size="13" font-weight="600">FastThreadPool</text>
+<rect x="200" y="192" width="252.9" height="34" rx="6" fill="#16a34a" />
+<text x="462.9" y="214" fill="#122033" font-size="13" font-weight="600">78.26 ms  (1.10x slower)</text>
+<text x="24" y="270" fill="#122033" font-size="13" font-weight="600">HighPerformancePool</text>
+<rect x="200" y="248" width="386.0" height="34" rx="6" fill="#f59e0b" />
+<text x="596.0" y="270" fill="#122033" font-size="13" font-weight="600">119.45 ms  (1.68x slower)</text>
+</svg>
\ No newline at end of file
diff --git a/docs/benchmarks/pool_throughput.svg b/docs/benchmarks/pool_throughput.svg
new file mode 100644
index 0000000..2e57694
--- /dev/null
+++ b/docs/benchmarks/pool_throughput.svg
@@ -0,0 +1,51 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 820 482" width="820" height="482" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="820" height="482" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">Thread pool throughput by batch size</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Tasks processed per second for the light workload (higher is better)</text>
+<line x1="64" y1="396.0" x2="796" y2="396.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="400.0" fill="#5b6b82" font-size="11" text-anchor="end">0.0</text>
+<line x1="64" y1="336.0" x2="796" y2="336.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="340.0" fill="#5b6b82" font-size="11" text-anchor="end">0.8</text>
+<line x1="64" y1="276.0" x2="796" y2="276.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="280.0" fill="#5b6b82" font-size="11" text-anchor="end">1.6</text>
+<line x1="64" y1="216.0" x2="796" y2="216.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="220.0" fill="#5b6b82" font-size="11" text-anchor="end">2.4</text>
+<line x1="64" y1="156.0" x2="796" y2="156.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="160.0" fill="#5b6b82" font-size="11" text-anchor="end">3.2</text>
+<line x1="64" y1="96.0" x2="796" y2="96.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="100.0" fill="#5b6b82" font-size="11" text-anchor="end">4.0</text>
+<text x="16" y="88" fill="#5b6b82" font-size="12" font-weight="600">M tasks / second</text>
+<rect x="87.4" y="388.5" width="22.9" height="7.5" rx="3" fill="#2a7fff" />
+<rect x="112.3" y="388.6" width="22.9" height="7.4" rx="3" fill="#16a34a" />
+<rect x="137.2" y="391.9" width="22.9" height="4.1" rx="3" fill="#f59e0b" />
+<rect x="162.1" y="388.6" width="22.9" height="7.4" rx="3" fill="#db2777" />
+<text x="137.2" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">10</text>
+<rect x="233.8" y="349.0" width="22.9" height="47.0" rx="3" fill="#2a7fff" />
+<rect x="258.7" y="349.5" width="22.9" height="46.5" rx="3" fill="#16a34a" />
+<rect x="283.6" y="365.0" width="22.9" height="31.0" rx="3" fill="#f59e0b" />
+<rect x="308.5" y="336.2" width="22.9" height="59.8" rx="3" fill="#db2777" />
+<text x="283.6" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">100</text>
+<rect x="380.2" y="294.2" width="22.9" height="101.8" rx="3" fill="#2a7fff" />
+<rect x="405.1" y="302.0" width="22.9" height="94.0" rx="3" fill="#16a34a" />
+<rect x="430.0" y="322.6" width="22.9" height="73.4" rx="3" fill="#f59e0b" />
+<rect x="454.9" y="165.1" width="22.9" height="230.9" rx="3" fill="#db2777" />
+<text x="430.0" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">1,000</text>
+<rect x="526.6" y="267.9" width="22.9" height="128.1" rx="3" fill="#2a7fff" />
+<rect x="551.5" y="278.6" width="22.9" height="117.4" rx="3" fill="#16a34a" />
+<rect x="576.4" y="311.0" width="22.9" height="85.0" rx="3" fill="#f59e0b" />
+<rect x="601.3" y="151.8" width="22.9" height="244.2" rx="3" fill="#db2777" />
+<text x="576.4" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">10,000</text>
+<rect x="673.0" y="277.8" width="22.9" height="118.2" rx="3" fill="#2a7fff" />
+<rect x="697.9" y="287.7" width="22.9" height="108.3" rx="3" fill="#16a34a" />
+<rect x="722.8" y="309.0" width="22.9" height="87.0" rx="3" fill="#f59e0b" />
+<rect x="747.7" y="161.5" width="22.9" height="234.5" rx="3" fill="#db2777" />
+<text x="722.8" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">100,000</text>
+<rect x="64" y="440" width="14" height="14" rx="3" fill="#2a7fff" />
+<text x="84" y="452" fill="#122033" font-size="12">ThreadPool</text>
+<rect x="166.0" y="440" width="14" height="14" rx="3" fill="#16a34a" />
+<text x="186.0" y="452" fill="#122033" font-size="12">FastThreadPool</text>
+<rect x="296.8" y="440" width="14" height="14" rx="3" fill="#f59e0b" />
+<text x="316.8" y="452" fill="#122033" font-size="12">HighPerformancePool</text>
+<rect x="463.6" y="440" width="14" height="14" rx="3" fill="#db2777" />
+<text x="483.6" y="452" fill="#122033" font-size="12">LightweightPool</text>
+</svg>
\ No newline at end of file
diff --git a/docs/benchmarks/pool_workload.svg b/docs/benchmarks/pool_workload.svg
new file mode 100644
index 0000000..86c7eac
--- /dev/null
+++ b/docs/benchmarks/pool_workload.svg
@@ -0,0 +1,46 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 820 482" width="820" height="482" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="820" height="482" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">Which pool wins depends on the workload</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Time relative to the fastest pool per workload (1.0 = winner, shorter is better; pool built once, 4 threads)</text>
+<line x1="64" y1="396.0" x2="796" y2="396.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="400.0" fill="#5b6b82" font-size="11" text-anchor="end">0.0</text>
+<line x1="64" y1="336.0" x2="796" y2="336.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="340.0" fill="#5b6b82" font-size="11" text-anchor="end">0.6</text>
+<line x1="64" y1="276.0" x2="796" y2="276.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="280.0" fill="#5b6b82" font-size="11" text-anchor="end">1.2</text>
+<line x1="64" y1="216.0" x2="796" y2="216.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="220.0" fill="#5b6b82" font-size="11" text-anchor="end">1.8</text>
+<line x1="64" y1="156.0" x2="796" y2="156.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="160.0" fill="#5b6b82" font-size="11" text-anchor="end">2.4</text>
+<line x1="64" y1="96.0" x2="796" y2="96.0" stroke="#dbe3ef" stroke-width="1" />
+<text x="56" y="100.0" fill="#5b6b82" font-size="11" text-anchor="end">3.0</text>
+<text x="16" y="88" fill="#5b6b82" font-size="12" font-weight="600">relative time (1.0 = fastest)</text>
+<rect x="93.3" y="177.8" width="29.1" height="218.2" rx="3" fill="#2a7fff" />
+<rect x="124.4" y="178.8" width="29.1" height="217.2" rx="3" fill="#16a34a" />
+<rect x="155.5" y="102.1" width="29.1" height="293.9" rx="3" fill="#f59e0b" />
+<rect x="186.6" y="296.0" width="29.1" height="100.0" rx="3" fill="#db2777" />
+<text x="155.5" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">tiny</text>
+<rect x="276.3" y="202.9" width="29.1" height="193.1" rx="3" fill="#2a7fff" />
+<rect x="307.4" y="203.1" width="29.1" height="192.9" rx="3" fill="#16a34a" />
+<rect x="338.5" y="153.1" width="29.1" height="242.9" rx="3" fill="#f59e0b" />
+<rect x="369.6" y="296.0" width="29.1" height="100.0" rx="3" fill="#db2777" />
+<text x="338.5" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">medium</text>
+<rect x="459.3" y="274.9" width="29.1" height="121.1" rx="3" fill="#2a7fff" />
+<rect x="490.4" y="277.0" width="29.1" height="119.0" rx="3" fill="#16a34a" />
+<rect x="521.5" y="292.0" width="29.1" height="104.0" rx="3" fill="#f59e0b" />
+<rect x="552.6" y="296.0" width="29.1" height="100.0" rx="3" fill="#db2777" />
+<text x="521.5" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">heavy</text>
+<rect x="642.3" y="260.0" width="29.1" height="136.0" rx="3" fill="#2a7fff" />
+<rect x="673.4" y="258.3" width="29.1" height="137.7" rx="3" fill="#16a34a" />
+<rect x="704.5" y="219.3" width="29.1" height="176.7" rx="3" fill="#f59e0b" />
+<rect x="735.6" y="296.0" width="29.1" height="100.0" rx="3" fill="#db2777" />
+<text x="704.5" y="414.0" fill="#122033" font-size="12" font-weight="600" text-anchor="middle">imbalanced</text>
+<rect x="64" y="440" width="14" height="14" rx="3" fill="#2a7fff" />
+<text x="84" y="452" fill="#122033" font-size="12">ThreadPool</text>
+<rect x="166.0" y="440" width="14" height="14" rx="3" fill="#16a34a" />
+<text x="186.0" y="452" fill="#122033" font-size="12">FastThreadPool</text>
+<rect x="296.8" y="440" width="14" height="14" rx="3" fill="#f59e0b" />
+<text x="316.8" y="452" fill="#122033" font-size="12">HighPerformancePool</text>
+<rect x="463.6" y="440" width="14" height="14" rx="3" fill="#db2777" />
+<text x="483.6" y="452" fill="#122033" font-size="12">LightweightPool</text>
+</svg>
\ No newline at end of file
diff --git a/docs/benchmarks/post_vs_submit.svg b/docs/benchmarks/post_vs_submit.svg
new file mode 100644
index 0000000..8e503fb
--- /dev/null
+++ b/docs/benchmarks/post_vs_submit.svg
@@ -0,0 +1,11 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 760 208" width="760" height="208" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="760" height="208" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">post() vs submit()</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Submission overhead for 100,000 tasks: post() skips the future/packaged_task path (lower is better)</text>
+<text x="24" y="102" fill="#122033" font-size="13" font-weight="600">post(fire-forget)</text>
+<rect x="200" y="80" width="44.7" height="34" rx="6" fill="#16a34a" />
+<text x="254.7" y="102" fill="#122033" font-size="13" font-weight="600">13.33 ms  (8.63x faster)</text>
+<text x="24" y="158" fill="#122033" font-size="13" font-weight="600">submit(future)</text>
+<rect x="200" y="136" width="386.0" height="34" rx="6" fill="#2a7fff" />
+<text x="596.0" y="158" fill="#122033" font-size="13" font-weight="600">115.05 ms  (baseline)</text>
+</svg>
\ No newline at end of file
diff --git a/docs/benchmarks/reflection_lookup.svg b/docs/benchmarks/reflection_lookup.svg
new file mode 100644
index 0000000..ff0ca43
--- /dev/null
+++ b/docs/benchmarks/reflection_lookup.svg
@@ -0,0 +1,11 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 760 208" width="760" height="208" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="760" height="208" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">Reflection registry query: find by field</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Locating a single entry by name over 16,384 registered threads (lower is better)</text>
+<text x="24" y="102" fill="#122033" font-size="13" font-weight="600">find_if (hand-written)</text>
+<rect x="200" y="80" width="95.4" height="34" rx="6" fill="#2a7fff" />
+<text x="305.4" y="102" fill="#122033" font-size="13" font-weight="600">22 ns  (baseline)</text>
+<text x="24" y="158" fill="#122033" font-size="13" font-weight="600">find_by (reflection)</text>
+<rect x="200" y="136" width="386.0" height="34" rx="6" fill="#f59e0b" />
+<text x="596.0" y="158" fill="#122033" font-size="13" font-weight="600">88 ns  (4.04x slower)</text>
+</svg>
\ No newline at end of file
diff --git a/docs/benchmarks/reflection_query.svg b/docs/benchmarks/reflection_query.svg
new file mode 100644
index 0000000..e5a7324
--- /dev/null
+++ b/docs/benchmarks/reflection_query.svg
@@ -0,0 +1,11 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 760 208" width="760" height="208" font-family="'Segoe UI', Helvetica, Arial, sans-serif">
+<rect x="0" y="0" width="760" height="208" rx="14" fill="#ffffff" stroke="#dbe3ef" />
+<text x="24" y="34" fill="#122033" font-size="18" font-weight="700">Reflection registry query: project a field</text>
+<text x="24" y="56" fill="#5b6b82" font-size="13">Selecting + projecting one field over 16,384 registered threads (lower is better)</text>
+<text x="24" y="102" fill="#122033" font-size="13" font-weight="600">filter + map (hand-written)</text>
+<rect x="200" y="80" width="313.1" height="34" rx="6" fill="#2a7fff" />
+<text x="523.1" y="102" fill="#122033" font-size="13" font-weight="600">93.55 us  (baseline)</text>
+<text x="24" y="158" fill="#122033" font-size="13" font-weight="600">where + project (reflection)</text>
+<rect x="200" y="136" width="386.0" height="34" rx="6" fill="#f59e0b" />
+<text x="596.0" y="158" fill="#122033" font-size="13" font-weight="600">115.34 us  (1.23x slower)</text>
+</svg>
\ No newline at end of file
diff --git a/include/threadschedule/reflection.hpp b/include/threadschedule/reflection.hpp
new file mode 100644
index 0000000..da98549
--- /dev/null
+++ b/include/threadschedule/reflection.hpp
@@ -0,0 +1,161 @@
+#pragma once
+
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+
+#include <cstddef>
+#include <array>
+#include <meta>
+#include <span>
+#include <string_view>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace threadschedule::reflect
+{
+
+using info = std::meta::info;
+inline constexpr bool enabled = true;
+
+template <typename T>
+consteval auto fields() -> std::span<info const> // Reflections of T's non-static data members, one per member.
+{
+    static_assert(std::meta::is_class_type(^^T) || std::meta::is_union_type(^^T),
+                  "threadschedule::reflect::fields<T>() requires a class or union type");
+    return std::define_static_array(std::meta::nonstatic_data_members_of(^^T, std::meta::access_context::current())); // NOTE(review): the access context is this function's scope, so members not accessible from here are presumably omitted - confirm
+}
+
+template <typename T>
+consteval auto field_count() -> std::size_t
+{
+    return fields<T>().size();
+}
+
+template <typename T, std::size_t I>
+consteval auto field_info() -> info
+{
+    static_assert(I < field_count<T>(), "reflection field index out of range");
+    return fields<T>()[I];
+}
+
+template <info Field>
+consteval auto field_name() -> std::string_view
+{
+    return std::string_view(std::define_static_string(std::meta::identifier_of(Field)));
+}
+
+template <typename T, std::size_t I>
+consteval auto field_name() -> std::string_view
+{
+    return field_name<field_info<T, I>()>();
+}
+
+template <typename T>
+consteval auto type_name() -> std::string_view
+{
+    return std::string_view(std::define_static_string(std::meta::display_string_of(^^T)));
+}
+
+template <info Field>
+using field_type_t = [: std::meta::type_of(Field) :];
+
+template <info Field, typename T>
+constexpr decltype(auto) get(T&& obj) // Returns a reference to the member designated by Field, carrying obj's constness and value category (T& for lvalues, T&& for rvalues - do not keep it past a temporary's lifetime).
+{
+    return (std::forward<T>(obj).[:Field:]); // The parentheses matter: for an unparenthesized member access decltype(auto) deduces the member's declared type, which would copy the field on every call.
+}
+
+template <info Field, typename Owner>
+inline constexpr bool is_field_of_v = std::meta::is_same_type(std::meta::parent_of(Field), ^^Owner);
+
+template <typename T>
+consteval auto field_names() -> std::span<char const* const>;
+
+namespace detail
+{
+
+template <info... Fields>
+struct projection_type;
+
+template <info Field>
+struct projection_type<Field>
+{
+    using type = field_type_t<Field>;
+};
+
+template <info First, info Second, info... Rest>
+struct projection_type<First, Second, Rest...>
+{
+    using type = std::tuple<field_type_t<First>, field_type_t<Second>, field_type_t<Rest>...>;
+};
+
+template <typename T, typename F, std::size_t... I>
+constexpr void visit_fields_impl(T&& obj, F&& fn, std::index_sequence<I...>)
+{
+    using object_type = std::remove_cv_t<std::remove_reference_t<T>>;
+    constexpr auto names = field_names<object_type>();
+    (fn(names[I], get<field_info<object_type, I>()>(std::forward<T>(obj))), ...);
+}
+
+template <typename T, std::size_t... I>
+consteval auto field_names_impl(std::index_sequence<I...>) -> std::span<char const* const>
+{
+    return std::define_static_array(
+        std::array<char const*, sizeof...(I)>{std::define_static_string(std::meta::identifier_of(field_info<T, I>()))...});
+}
+
+template <info First>
+consteval info first_field() noexcept
+{
+    return First;
+}
+
+template <info Field, typename Owner>
+consteval void require_field_owner()
+{
+    static_assert(std::meta::is_same_type(std::meta::parent_of(Field), ^^Owner),
+                  "Reflection field does not belong to the requested owner type");
+}
+
+} // namespace detail
+
+template <typename T>
+consteval auto field_names() -> std::span<char const* const>
+{
+    return detail::field_names_impl<T>(std::make_index_sequence<field_count<T>()>{});
+}
+
+template <typename T, typename F>
+constexpr void visit_fields(T&& obj, F&& fn)
+{
+    using object_type = std::remove_cv_t<std::remove_reference_t<T>>;
+    detail::visit_fields_impl(std::forward<T>(obj), std::forward<F>(fn),
+                              std::make_index_sequence<field_count<object_type>()>{});
+}
+
+template <info... Fields>
+using projection_t = typename detail::projection_type<Fields...>::type;
+
+template <info... Fields, typename T>
+constexpr auto project_value(T&& obj) -> projection_t<Fields...> // One field: that field's value. Several fields: a std::tuple of their values in the order given.
+{
+    static_assert(sizeof...(Fields) > 0, "project_value requires at least one field");
+    if constexpr (sizeof...(Fields) == 1)
+    {
+        return get<detail::first_field<Fields...>()>(std::forward<T>(obj)); // first_field unwraps the single-element pack into one reflection
+    }
+    else
+    {
+        return projection_t<Fields...>{get<Fields>(std::forward<T>(obj))...}; // obj is forwarded once per requested field
+    }
+}
+
+template <info Field, typename Owner>
+consteval void require_field_owner()
+{
+    detail::require_field_owner<Field, Owner>();
+}
+
+} // namespace threadschedule::reflect
+
+#endif
diff --git a/include/threadschedule/thread_registry.hpp b/include/threadschedule/thread_registry.hpp
index 5dd1b36..b06415b 100644
--- a/include/threadschedule/thread_registry.hpp
+++ b/include/threadschedule/thread_registry.hpp
@@ -7,6 +7,9 @@
 
 #include "callable.hpp"
 #include "expected.hpp"
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+#include "reflection.hpp"
+#endif
 #include "scheduler_policy.hpp"
 #include "thread_wrapper.hpp" // for ThreadInfo, ThreadAffinity
 #include <functional>
@@ -88,6 +91,18 @@ struct RegisteredThreadInfo
     std::shared_ptr<class ThreadControlBlock> control;
 };
 
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+namespace registered_thread_fields // Named compile-time accessors for the reflections of RegisteredThreadInfo's data members.
+{ // NOTE(review): each accessor is bound by positional index into reflect::fields<RegisteredThreadInfo>(); reordering the struct's members would silently re-point them - confirm the indices stay in sync.
+consteval auto tid() -> reflect::info { return reflect::field_info<RegisteredThreadInfo, 0>(); }
+consteval auto stdId() -> reflect::info { return reflect::field_info<RegisteredThreadInfo, 1>(); }
+consteval auto name() -> reflect::info { return reflect::field_info<RegisteredThreadInfo, 2>(); }
+consteval auto componentTag() -> reflect::info { return reflect::field_info<RegisteredThreadInfo, 3>(); }
+consteval auto alive() -> reflect::info { return reflect::field_info<RegisteredThreadInfo, 4>(); }
+consteval auto control() -> reflect::info { return reflect::field_info<RegisteredThreadInfo, 5>(); }
+} // namespace registered_thread_fields
+#endif
+
 using RegistryCallback = detail::copyable_callable<void(RegisteredThreadInfo const&)>;
 
 /**
@@ -217,6 +232,14 @@ class ThreadControlBlock
 namespace detail
 {
 
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+template <reflect::info Field, typename Owner>
+consteval void validate_reflected_field()
+{
+    reflect::require_field_owner<Field, Owner>();
+}
+#endif
+
 /**
  * @brief CRTP mixin that provides functional-style query facade methods.
  *
@@ -290,6 +313,38 @@ class QueryFacadeMixin
     [[nodiscard]] auto take(size_t n) const { return self().query().take(n); }
 
     [[nodiscard]] auto skip(size_t n) const { return self().query().skip(n); }
+
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+    template <reflect::info Field, typename Value>
+    [[nodiscard]] auto where(Value const& value) const
+    {
+        return self().query().template where<Field>(value);
+    }
+
+    template <reflect::info Field, typename Predicate>
+    [[nodiscard]] auto where_if(Predicate&& pred) const
+    {
+        return self().query().template where_if<Field>(std::forward<Predicate>(pred));
+    }
+
+    template <reflect::info Field, typename Value>
+    [[nodiscard]] auto find_by(Value const& value) const
+    {
+        return self().query().template find_by<Field>(value);
+    }
+
+    template <reflect::info Field, typename Value>
+    [[nodiscard]] auto contains(Value const& value) const -> bool
+    {
+        return self().query().template contains<Field>(value);
+    }
+
+    template <reflect::info... Fields>
+    [[nodiscard]] auto project() const
+    {
+        return self().query().template project<Fields...>();
+    }
+#endif
 };
 
 } // namespace detail
@@ -551,6 +606,71 @@ class ThreadRegistry : public detail::QueryFacadeMixin<ThreadRegistry>
             return QueryView(std::move(result));
         }
 
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+        template <reflect::info Field, typename Value>
+        [[nodiscard]] auto where(Value const& value) const -> QueryView
+        {
+            // Reject, at compile time, any Field that is not declared in RegisteredThreadInfo.
+            detail::validate_reflected_field<Field, RegisteredThreadInfo>();
+            std::vector<RegisteredThreadInfo> matches;
+            matches.reserve(entries_.size()); // upper bound: every entry could match
+            // Keep a copy of each entry whose Field compares equal to value.
+            for (auto const& candidate : entries_)
+                if (reflect::get<Field>(candidate) == value)
+                    matches.push_back(candidate);
+            return QueryView(std::move(matches));
+        }
+
+        template <reflect::info Field, typename Predicate>
+        [[nodiscard]] auto where_if(Predicate&& pred) const -> QueryView
+        {
+            detail::validate_reflected_field<Field, RegisteredThreadInfo>();
+            static_assert(std::is_invocable_r_v<bool, Predicate&, reflect::field_type_t<Field> const&>,
+                          "Reflection predicate must accept the selected field type");
+            // Collect copies of the entries for which pred accepts the selected field.
+            std::vector<RegisteredThreadInfo> accepted;
+            accepted.reserve(entries_.size()); // upper bound: every entry could pass
+            // pred is invoked as an lvalue once per entry, in storage order.
+            for (auto const& candidate : entries_)
+                if (pred(reflect::get<Field>(candidate)))
+                    accepted.push_back(candidate);
+            return QueryView(std::move(accepted));
+        }
+
+        template <reflect::info Field, typename Value>
+        [[nodiscard]] auto find_by(Value const& value) const -> std::optional<RegisteredThreadInfo>
+        {
+            // Compile-time check that Field is declared in RegisteredThreadInfo.
+            detail::validate_reflected_field<Field, RegisteredThreadInfo>();
+            // First entry (in storage order) whose Field equals value wins; it is returned as a copy.
+            for (auto const& candidate : entries_)
+                if (reflect::get<Field>(candidate) == value)
+                    return candidate;
+            return std::nullopt;
+        }
+
+        template <reflect::info Field, typename Value>
+        [[nodiscard]] auto contains(Value const& value) const -> bool // True when some entry's Field compares equal to value.
+        {
+            return find_by<Field>(value).has_value(); // NOTE(review): find_by returns the match by value, so a hit copies the whole entry only to discard it.
+        }
+
+        template <reflect::info... Fields>
+        [[nodiscard]] auto project() const -> std::vector<reflect::projection_t<Fields...>>
+        {
+            static_assert(sizeof...(Fields) > 0, "project requires at least one field");
+            (detail::validate_reflected_field<Fields, RegisteredThreadInfo>(), ...);
+
+            // One output row per entry: a bare value for a single field, a tuple for several.
+            std::vector<reflect::projection_t<Fields...>> rows;
+            rows.reserve(entries_.size());
+            // Rows keep the storage order of the entries.
+            for (auto const& candidate : entries_)
+                rows.push_back(reflect::project_value<Fields...>(candidate));
+            return rows;
+        }
+#endif
+
       private:
         std::vector<RegisteredThreadInfo> entries_;
     };
@@ -568,6 +688,87 @@ class ThreadRegistry : public detail::QueryFacadeMixin<ThreadRegistry>
         return QueryView(std::move(snapshot));
     }
 
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+    template <reflect::info Field, typename Value>
+    [[nodiscard]] auto where(Value const& value) const -> QueryView
+    {
+        detail::validate_reflected_field<Field, RegisteredThreadInfo>();
+        std::vector<RegisteredThreadInfo> matches;
+        std::shared_lock<std::shared_mutex> guard(mutex_); // shared lock held for the whole scan
+        matches.reserve(threads_.size());
+        // Copy out every registered entry whose Field compares equal to value; the key is not needed.
+        for ([[maybe_unused]] auto const& [key, record] : threads_)
+        {
+            if (reflect::get<Field>(record) == value)
+                matches.push_back(record);
+        }
+        return QueryView(std::move(matches));
+    }
+
+    template <reflect::info Field, typename Predicate>
+    [[nodiscard]] auto where_if(Predicate&& pred) const -> QueryView
+    {
+        detail::validate_reflected_field<Field, RegisteredThreadInfo>();
+        static_assert(std::is_invocable_r_v<bool, Predicate&, reflect::field_type_t<Field> const&>,
+                      "Reflection predicate must accept the selected field type");
+        std::vector<RegisteredThreadInfo> accepted;
+        std::shared_lock<std::shared_mutex> guard(mutex_); // pred runs while this shared lock is held
+        accepted.reserve(threads_.size());
+        // Copy out every registered entry whose selected field satisfies pred; the key is not needed.
+        for ([[maybe_unused]] auto const& [key, record] : threads_)
+        {
+            if (pred(reflect::get<Field>(record)))
+                accepted.push_back(record);
+        }
+        return QueryView(std::move(accepted));
+    }
+
+    template <reflect::info Field, typename Value>
+    [[nodiscard]] auto find_by(Value const& value) const -> std::optional<RegisteredThreadInfo>
+    {
+        detail::validate_reflected_field<Field, RegisteredThreadInfo>();
+        std::shared_lock<std::shared_mutex> guard(mutex_);
+        // Return a copy of the first entry (in container iteration order) whose Field equals value.
+        for ([[maybe_unused]] auto const& [key, record] : threads_)
+        {
+            if (reflect::get<Field>(record) == value)
+                return record;
+        }
+        return std::nullopt;
+    }
+
+    template <reflect::info Field, typename Value>
+    [[nodiscard]] auto contains(Value const& value) const -> bool
+    {
+        detail::validate_reflected_field<Field, RegisteredThreadInfo>();
+        std::shared_lock<std::shared_mutex> guard(mutex_);
+        // Scan in place under the shared lock and stop at the first match; no entry is copied.
+        for ([[maybe_unused]] auto const& [key, record] : threads_)
+        {
+            if (reflect::get<Field>(record) == value)
+                return true;
+        }
+        return false;
+    }
+
+    template <reflect::info... Fields>
+    [[nodiscard]] auto project() const -> std::vector<reflect::projection_t<Fields...>>
+    {
+        static_assert(sizeof...(Fields) > 0, "project requires at least one field");
+        (detail::validate_reflected_field<Fields, RegisteredThreadInfo>(), ...);
+
+        std::vector<reflect::projection_t<Fields...>> rows;
+        std::shared_lock<std::shared_mutex> guard(mutex_); // shared lock held while rows are built
+        rows.reserve(threads_.size());
+        // One row per registered entry: a bare value for a single field, a tuple for several.
+        for ([[maybe_unused]] auto const& [key, record] : threads_)
+        {
+            rows.push_back(reflect::project_value<Fields...>(record));
+        }
+        return rows;
+    }
+#endif
+
     [[nodiscard]] auto set_affinity(Tid tid, ThreadAffinity const& affinity) const -> expected<void, std::error_code>
     {
         auto blk = lock_block(tid);
diff --git a/include/threadschedule/threadschedule.hpp b/include/threadschedule/threadschedule.hpp
index 521dd4f..7be59e4 100644
--- a/include/threadschedule/threadschedule.hpp
+++ b/include/threadschedule/threadschedule.hpp
@@ -18,6 +18,10 @@
 #include "thread_wrapper.hpp"
 #include "topology.hpp"
 
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+#include "reflection.hpp"
+#endif
+
 /**
  * @file threadschedule.hpp
  * @brief Modern C++17/20/23/26 Thread Scheduling Library
diff --git a/run_benchmark_graphs.sh b/run_benchmark_graphs.sh
new file mode 100755
index 0000000..f7ff6a0
--- /dev/null
+++ b/run_benchmark_graphs.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+set -euo pipefail
+
+QUICK_MODE=false
+SHOW_HELP=false
+OUTPUT_DIR=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --quick|-q)
+            QUICK_MODE=true
+            shift
+            ;;
+        --output-dir)
+            OUTPUT_DIR="$2"
+            shift 2
+            ;;
+        --help|-h)
+            SHOW_HELP=true
+            shift
+            ;;
+        *)
+            echo "Unknown argument: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+if [[ "$SHOW_HELP" == "true" ]]; then
+    cat <<'EOF'
+ThreadSchedule Benchmark Graph Runner
+
+Usage:
+  ./run_benchmark_graphs.sh [--quick] [--output-dir DIR]
+
+What it does:
+  - Runs selected comparison-focused Google Benchmark targets
+  - Writes JSON outputs into a report directory
+  - Generates an HTML report with inline SVG graphs and speedups
+  - Captures local machine specs automatically inside the report
+
+Examples:
+  ./run_benchmark_graphs.sh
+  ./run_benchmark_graphs.sh --quick
+  ./run_benchmark_graphs.sh --output-dir build/benchmark-reports/latest
+EOF
+    exit 0
+fi
+
+PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
+BUILD_DIR="${PROJECT_ROOT}/build"
+BENCHMARK_DIR="${BUILD_DIR}/benchmarks"
+
+if [[ -z "${OUTPUT_DIR}" ]]; then
+    TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
+    OUTPUT_DIR="${BUILD_DIR}/benchmark-reports/${TIMESTAMP}"
+fi
+
+if [[ ! -d "${BENCHMARK_DIR}" ]]; then # Precondition: the benchmark binaries must have been built already.
+    echo "Benchmark directory not found: ${BENCHMARK_DIR}" >&2
+    echo "Build with -DTHREADSCHEDULE_BUILD_BENCHMARKS=ON first." >&2
+    exit 1
+fi
+
+mkdir -p "${OUTPUT_DIR}" # Created only after the check above, so a failed precondition leaves no empty report directory behind.
+
+if [[ "$QUICK_MODE" == "true" ]]; then
+    BENCH_MIN_TIME="0.4s"
+    BENCH_REPETITIONS="1"
+else
+    BENCH_MIN_TIME="1.5s"
+    BENCH_REPETITIONS="3"
+fi
+
+run_json_benchmark() { # Run one benchmark executable from BENCHMARK_DIR and have it write its results as JSON.
+    local executable="$1" # file name of the benchmark binary inside BENCHMARK_DIR
+    local filter="$2" # regular expression handed to --benchmark_filter
+    local output_json="$3" # path handed to --benchmark_out
+
+    if [[ ! -x "${BENCHMARK_DIR}/${executable}" ]]; then # a missing binary is skipped (status 0), not treated as an error
+        echo "Skipping ${executable}: not built" >&2
+        return 0
+    fi
+
+    "${BENCHMARK_DIR}/${executable}" \
+        --benchmark_filter="${filter}" \
+        --benchmark_min_time="${BENCH_MIN_TIME}" \
+        --benchmark_repetitions="${BENCH_REPETITIONS}" \
+        --benchmark_format=json \
+        --benchmark_out="${output_json}" \
+        --benchmark_out_format=json
+}
+
+JSON_FILES=()
+
+THREADPOOL_JSON="${OUTPUT_DIR}/threadpool_comparisons.json"
+run_json_benchmark "threadpool_basic_benchmarks" "BM_ComparePoolTypes_LightWorkload|BM_PostVsSubmit" "${THREADPOOL_JSON}"
+if [[ -f "${THREADPOOL_JSON}" ]]; then
+    JSON_FILES+=("${THREADPOOL_JSON}")
+fi
+
+REFLECTION_JSON="${OUTPUT_DIR}/reflection_registry.json"
+run_json_benchmark "reflection_registry_benchmarks" "BM_QueryView_.*" "${REFLECTION_JSON}"
+if [[ -f "${REFLECTION_JSON}" ]]; then
+    JSON_FILES+=("${REFLECTION_JSON}")
+fi
+
+if [[ ${#JSON_FILES[@]} -eq 0 ]]; then
+    echo "No benchmark JSON files were produced." >&2
+    exit 1
+fi
+
+python3 "${PROJECT_ROOT}/benchmarks/generate_benchmark_report.py" \
+    --output "${OUTPUT_DIR}/index.html" \
+    --title "ThreadSchedule benchmark comparison report" \
+    "${JSON_FILES[@]}"
+
+echo
+echo "Benchmark graphs written to:"
+echo "  ${OUTPUT_DIR}/index.html"
diff --git a/src/threadschedule.cppm b/src/threadschedule.cppm
index affb995..03833f2 100644
--- a/src/threadschedule.cppm
+++ b/src/threadschedule.cppm
@@ -166,6 +166,35 @@ using ::threadschedule::cgroup_attach_tid;
 
 } // export namespace threadschedule
 
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+export namespace threadschedule::reflect {
+using ::threadschedule::reflect::enabled;
+using ::threadschedule::reflect::info;
+using ::threadschedule::reflect::fields;
+using ::threadschedule::reflect::field_count;
+using ::threadschedule::reflect::field_info;
+using ::threadschedule::reflect::field_name;
+using ::threadschedule::reflect::field_names;
+using ::threadschedule::reflect::type_name;
+using ::threadschedule::reflect::get;
+using ::threadschedule::reflect::visit_fields;
+using ::threadschedule::reflect::project_value;
+using ::threadschedule::reflect::require_field_owner;
+using ::threadschedule::reflect::is_field_of_v;
+using ::threadschedule::reflect::field_type_t;
+using ::threadschedule::reflect::projection_t;
+}
+
+export namespace threadschedule::registered_thread_fields {
+using ::threadschedule::registered_thread_fields::tid;
+using ::threadschedule::registered_thread_fields::stdId;
+using ::threadschedule::registered_thread_fields::name;
+using ::threadschedule::registered_thread_fields::componentTag;
+using ::threadschedule::registered_thread_fields::alive;
+using ::threadschedule::registered_thread_fields::control;
+}
+#endif
+
 // Re-export profiles sub-namespace
 export namespace threadschedule::profiles {
     using ::threadschedule::profiles::realtime;
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e649533..c644d55 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -126,6 +126,19 @@ if(TARGET gtest)
         PROPERTIES TIMEOUT 120
     )
 
+    if(THREADSCHEDULE_HAS_REFLECTION)
+        add_executable(reflection_test reflection_test.cpp)
+        target_link_libraries(reflection_test
+            ThreadSchedule::ThreadSchedule
+            gtest
+            gtest_main
+        )
+        gtest_discover_tests(reflection_test
+            DISCOVERY_TIMEOUT 60
+            PROPERTIES TIMEOUT 120
+        )
+    endif()
+
     if(THREADSCHEDULE_RUNTIME)
         add_executable(runtime_registry_test runtime_registry_test.cpp)
         target_link_libraries(runtime_registry_test 
diff --git a/tests/reflection_test.cpp b/tests/reflection_test.cpp
new file mode 100644
index 0000000..c9baf1a
--- /dev/null
+++ b/tests/reflection_test.cpp
@@ -0,0 +1,61 @@
+#include <gtest/gtest.h>
+#include <threadschedule/threadschedule.hpp>
+#include <array>
+#include <set>
+#include <string_view>
+
+#if !defined(THREADSCHEDULE_HAS_REFLECTION) || !THREADSCHEDULE_HAS_REFLECTION
+#error "reflection_test.cpp requires THREADSCHEDULE_HAS_REFLECTION"
+#endif
+
+using namespace threadschedule;
+
+TEST(ReflectionApiTest, ExposesMetadataForLibraryTypes)
+{
+    static_assert(reflect::enabled);
+    static_assert(reflect::field_count<RegisteredThreadInfo>() == 6);
+    static_assert(reflect::field_count<ThreadProfile>() == 4);
+    static_assert(reflect::field_count<ChaosConfig>() == 3);
+    static_assert(reflect::field_count<HighPerformancePool::Statistics>() == 7);
+    static_assert(reflect::field_count<ThreadPoolBase::Statistics>() == 6);
+    static_assert(reflect::field_name<RegisteredThreadInfo, 0>() == "tid");
+    static_assert(reflect::field_name<RegisteredThreadInfo, 3>() == "componentTag");
+    static_assert(reflect::field_name<ThreadProfile, 1>() == "policy");
+    static_assert(reflect::field_name<ChaosConfig, 1>() == "priority_jitter");
+    constexpr auto registry_field_names = reflect::field_names<RegisteredThreadInfo>();
+    static_assert(registry_field_names.size() == 6);
+    static_assert(std::string_view(registry_field_names[2]) == "name");
+    static_assert(reflect::type_name<ThreadProfile>().contains("ThreadProfile"));
+}
+
+TEST(ReflectionApiTest, VisitFieldsAndGetWorkForPublicStructs)
+{
+    ThreadProfile profile{"latency", SchedulingPolicy::RR, ThreadPriority{3}, std::nullopt};
+    std::array<std::string_view, 4> expected = {"name", "policy", "priority", "affinity"};
+    std::size_t index = 0;
+
+    reflect::visit_fields(profile, [&](std::string_view name, auto&) {
+        ASSERT_LT(index, expected.size());
+        EXPECT_EQ(name, expected[index]);
+        ++index;
+    });
+
+    EXPECT_EQ(index, expected.size());
+    EXPECT_EQ(reflect::get<reflect::field_info<ThreadProfile, 0>()>(profile), "latency");
+}
+
+TEST(ReflectionApiTest, ProjectValueBuildsCompactResults)
+{
+    RegisteredThreadInfo info{};
+    info.tid = Tid{11};
+    info.name = "alpha";
+    info.componentTag = "io";
+    info.alive = true;
+
+    auto tuple =
+        reflect::project_value<registered_thread_fields::name(), registered_thread_fields::componentTag()>(info);
+
+    EXPECT_EQ(std::get<0>(tuple), "alpha");
+    EXPECT_EQ(std::get<1>(tuple), "io");
+    EXPECT_TRUE(reflect::is_field_of_v<registered_thread_fields::alive(), RegisteredThreadInfo>);
+}
diff --git a/tests/registry_query_test.cpp b/tests/registry_query_test.cpp
index 702bcf7..3641cdd 100644
--- a/tests/registry_query_test.cpp
+++ b/tests/registry_query_test.cpp
@@ -171,3 +171,36 @@ TEST_F(RegistryQueryTest, ChainedFilterMapForEach)
     EXPECT_TRUE(names.count("alpha"));
     EXPECT_TRUE(names.count("gamma"));
 }
+
+#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION
+TEST_F(RegistryQueryTest, ReflectionContainsAndFindBy)
+{
+    EXPECT_TRUE(registry().contains<registered_thread_fields::name()>("beta"));
+    auto found = registry().find_by<registered_thread_fields::name()>("beta");
+    ASSERT_TRUE(found.has_value());
+    EXPECT_EQ(found->componentTag, "compute");
+}
+
+TEST_F(RegistryQueryTest, ReflectionWhereAndProject)
+{
+    auto io_names =
+        registry().where<registered_thread_fields::componentTag()>("io").project<registered_thread_fields::name()>();
+    EXPECT_EQ(io_names.size(), 2u);
+    std::set<std::string> names(io_names.begin(), io_names.end());
+    EXPECT_TRUE(names.count("alpha"));
+    EXPECT_TRUE(names.count("gamma"));
+}
+
+TEST_F(RegistryQueryTest, ReflectionQueryViewWhereIf)
+{
+    auto names = registry()
+                     .query()
+                     .where_if<registered_thread_fields::alive()>([](bool alive) { return alive; })
+                     .where_if<registered_thread_fields::name()>([](std::string const& name) {
+                         return name.starts_with("g");
+                     })
+                     .project<registered_thread_fields::name()>();
+    ASSERT_EQ(names.size(), 1u);
+    EXPECT_EQ(names.front(), "gamma");
+}
+#endif