diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7e79a6d..a6df1e3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -211,6 +211,79 @@ jobs: cd build ctest --output-on-failure --parallel + # ── GCC 16 Reflection-specific validation ──────────────────────── + reflection-linux: + name: Reflection (Linux, GCC 16, C++26) + runs-on: ubuntu-24.04 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Add toolchain PPA + run: | + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + sudo apt-get update + + - name: Install dependencies + run: | + sudo apt-get install -y cmake ninja-build gcc-16 g++-16 + + - name: Configure CMake + env: + CC: gcc-16 + CXX: g++-16 + run: | + cmake -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=26 \ + -DTHREADSCHEDULE_BUILD_EXAMPLES=OFF \ + -DTHREADSCHEDULE_BUILD_TESTS=ON \ + -DTHREADSCHEDULE_ENABLE_REFLECTION=ON + + - name: Build + run: cmake --build build --parallel + + - name: Run reflection-focused tests + run: | + ctest --test-dir build \ + --output-on-failure --no-tests=error \ + --tests-regex 'ReflectionApiTest|RegistryQueryTest\.Reflection' + + reflection-modules-linux: + name: Reflection Modules (Linux, GCC 16, C++26) + runs-on: ubuntu-24.04 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Add toolchain PPA + run: | + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + sudo apt-get update + + - name: Install dependencies + run: | + sudo apt-get install -y cmake ninja-build gcc-16 g++-16 + + - name: Configure CMake + env: + CC: gcc-16 + CXX: g++-16 + run: | + cmake -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=26 \ + -DTHREADSCHEDULE_MODULE=ON \ + -DTHREADSCHEDULE_ENABLE_REFLECTION=ON + + - name: Build module + run: cmake --build build --parallel + + - name: Verify reflection-enabled module artifacts + run: | + echo "Reflection module artifacts:" + find build -name '*.a' -o -name '*.gcm' -o
-name '*.pcm' | head -20 + # ── C++20 Module build verification ──────────────────────────────── modules-linux: name: Modules (Linux, C++${{ matrix.cpp_standard }}, ${{ matrix.compiler }}) diff --git a/.gitignore b/.gitignore index 2ebeb08..cb75fc3 100644 --- a/.gitignore +++ b/.gitignore @@ -46,5 +46,6 @@ compile_commands.json .cache/ build/ build_*/ +build-*/ install/ build_runtime/ diff --git a/CHANGELOG.md b/CHANGELOG.md index ffdc9ed..38c64cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,72 @@ # Changelog +## v2.3.0 + +> **No intended API/ABI breaking changes for existing non-reflection users.** +> This release adds an optional GCC-16/C++26 reflection surface and uses it to +> expose faster registry projection/filter paths without changing the existing +> query API. + +### New Features + +- **Optional GCC 16.1+ reflection API** -- when building with C++26, + `THREADSCHEDULE_ENABLE_REFLECTION=ON`, and working `-freflection` support, + the library now exports `threadschedule::reflect::*` for field metadata, + field visitation, compile-time projection, and type/field naming. + (`reflection.hpp`, `threadschedule.cppm`, `CMakeLists.txt`) + +- **Reflection-backed registry selectors** -- `ThreadRegistry` and + `QueryView` now expose field-oriented helpers such as + `where(...)`, + `where_if(...)`, + `find_by(...)`, + `contains<...>(...)`, and `project<...>()` when reflection is enabled. + (`thread_registry.hpp`) + +### Performance + +- **Lower-overhead registry projections on reflection builds** -- direct + field-projection and field-filter paths now run under the registry's shared + lock and can skip the older `filter(...).map(...)` layering when callers opt + into the new reflection APIs. This reduces intermediate traversal and avoids + some full-entry transformation work for hot query paths. 
(`thread_registry.hpp`) + +- **More metadata is now promoted at compile time** -- reflection field names + and type display names are now stabilized via `std::define_static_string(...)` + and reused through `consteval` helpers such as `field_names()`, reducing + repeated compile-time reconstruction of the same metadata. (`reflection.hpp`) + +### Documentation + +- **README examples for reflection queries** -- the top-level README now shows + how to combine `threadschedule::reflect` with field-based registry queries + and projections. (`README.md`) + +- **New CMake reference entry for reflection** -- the reference now documents + `THREADSCHEDULE_ENABLE_REFLECTION` and the GCC 16.1+/C++26 activation path. + (`docs/CMAKE_REFERENCE.md`) + +### Tests & Benchmarks + +- **New reflection unit coverage** -- dedicated tests now validate reflection + metadata for core public structs and reflection-backed registry queries. + (`tests/reflection_test.cpp`, `tests/registry_query_test.cpp`, + `tests/CMakeLists.txt`) + +- **New reflection registry benchmark** -- `reflection_registry_benchmarks` + compares classic `filter/map/find_if` usage against the new field-oriented + query helpers on synthetic registry snapshots. (`benchmarks/CMakeLists.txt`, + `benchmarks/reflection_registry_benchmarks.cpp`) + +### CI / Infrastructure + +- **Dedicated GCC 16 reflection CI jobs** -- the main test workflow now + includes explicit `ubuntu-24.04` jobs for reflection-enabled GCC 16/C++26 + validation: one job builds and runs the reflection-focused test cases, and a + second job verifies the reflection-enabled module build path. This makes the + new `THREADSCHEDULE_ENABLE_REFLECTION` surface visible in CI instead of + relying only on the generic C++26 matrix entry. 
(`.github/workflows/tests.yml`) + ## v2.2.0 > **No intended API/ABI breaking changes.** This release extends thread-control diff --git a/CMakeLists.txt b/CMakeLists.txt index 114ceb5..88e708f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ option(THREADSCHEDULE_INSTALL "Generate install target" ${THREADSCHEDULE_IS_TOPL option(THREADSCHEDULE_RUNTIME "Build shared runtime for global registry (non header-only)" OFF) option(THREADSCHEDULE_MODULE "Build C++20 module target (requires CMake >= 3.28 and C++20+)" OFF) option(THREADSCHEDULE_BUILD_DOCS "Build API documentation with Doxygen" ${THREADSCHEDULE_IS_TOPLEVEL_PROJECT}) +option(THREADSCHEDULE_ENABLE_REFLECTION "Enable GCC 16 C++26 reflection APIs when supported" ON) # CPM support (optional, download if building tests or benchmarks) if(THREADSCHEDULE_BUILD_TESTS OR THREADSCHEDULE_BUILD_BENCHMARKS) @@ -119,6 +120,34 @@ endif() # Platform-specific requirements find_package(Threads REQUIRED) +set(THREADSCHEDULE_HAS_REFLECTION OFF) +if(THREADSCHEDULE_ENABLE_REFLECTION + AND CMAKE_CXX_STANDARD GREATER_EQUAL 26 + AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU" + AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "16.1") + include(CheckCXXSourceCompiles) + set(_threadschedule_saved_required_flags "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++26 -freflection") + check_cxx_source_compiles( + " + #include + using namespace std::meta; + struct probe_type { int value; bool ready; }; + consteval bool probe() { + auto fields = std::define_static_array(nonstatic_data_members_of(^^probe_type, access_context::current())); + return fields.size() == 2 && identifier_of(fields[0]) == \"value\"; + } + static_assert(probe()); + int main() { return 0; } + " + THREADSCHEDULE_REFLECTION_PROBE_OK + ) + set(CMAKE_REQUIRED_FLAGS "${_threadschedule_saved_required_flags}") + if(THREADSCHEDULE_REFLECTION_PROBE_OK) + set(THREADSCHEDULE_HAS_REFLECTION ON) + endif() +endif() + # Create the interface 
library target (header-only API) add_library(ThreadSchedule INTERFACE) add_library(ThreadSchedule::ThreadSchedule ALIAS ThreadSchedule) @@ -151,6 +180,11 @@ target_include_directories(ThreadSchedule INTERFACE # Link libraries target_link_libraries(ThreadSchedule INTERFACE Threads::Threads) +if(THREADSCHEDULE_HAS_REFLECTION) + target_compile_definitions(ThreadSchedule INTERFACE THREADSCHEDULE_HAS_REFLECTION=1) + target_compile_options(ThreadSchedule INTERFACE $<$:-freflection>) +endif() + # Windows: ensure modern API availability macros if(WIN32) target_compile_definitions(ThreadSchedule INTERFACE @@ -217,6 +251,10 @@ if(THREADSCHEDULE_RUNTIME) src/runtime_registry.cpp ) target_compile_definitions(ThreadScheduleRuntime PRIVATE THREADSCHEDULE_EXPORTS THREADSCHEDULE_RUNTIME) + if(THREADSCHEDULE_HAS_REFLECTION) + target_compile_definitions(ThreadScheduleRuntime PUBLIC THREADSCHEDULE_HAS_REFLECTION=1) + target_compile_options(ThreadScheduleRuntime PUBLIC $<$:-freflection>) + endif() # Propagate the THREADSCHEDULE_RUNTIME define to consumers so headers call into the DLL target_compile_definitions(ThreadScheduleRuntime INTERFACE THREADSCHEDULE_RUNTIME) target_include_directories(ThreadScheduleRuntime @@ -271,6 +309,11 @@ if(THREADSCHEDULE_MODULE) else() target_compile_features(ThreadScheduleModule PUBLIC cxx_std_20) endif() + + if(THREADSCHEDULE_HAS_REFLECTION) + target_compile_definitions(ThreadScheduleModule PUBLIC THREADSCHEDULE_HAS_REFLECTION=1) + target_compile_options(ThreadScheduleModule PUBLIC $<$:-freflection>) + endif() endif() # Documentation (Doxygen + Awesome theme) diff --git a/README.md b/README.md index 8f1d881..6f706f4 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,9 @@ or with optional **shared runtime** for multi-DSO applications. 
- **Modern Callable Paths**: Newer standard libraries can use `std::move_only_function` / `std::copyable_function` internally for lower adaptation overhead while keeping the public API source-compatible +- **GCC 16 Reflection APIs**: Optional C++26 reflection utilities and + reflection-backed registry queries when building with GCC 16.1+ and + `-freflection` - **Scheduled Tasks**: Run tasks at specific times, after delays, or periodically - **Error Handling**: Comprehensive exception handling with error callbacks and @@ -158,6 +161,11 @@ are not regularly tested in CI. > **C++26**: Requires GCC 14+ or Clang 19+. MSVC does not yet expose > `cxx_std_26` to CMake; C++26 on Windows is not tested. > +> **Reflection APIs**: The optional `threadschedule::reflect` API and +> reflection-backed registry queries require GCC 16.1+ with +> `THREADSCHEDULE_ENABLE_REFLECTION=ON`. These APIs are not built on other +> toolchains or standards. +> > **GCC 15**: Installed via `ppa:ubuntu-toolchain-r/test` on Ubuntu 24.04. > > **GCC 16**: Installed via `ppa:ubuntu-toolchain-r/test` on Ubuntu 24.04. @@ -419,6 +427,47 @@ Notes: - Use `*Reg` wrappers (e.g., `ThreadWrapperReg`) or `AutoRegisterCurrentThread` for automatic control block creation and registration. +### Reflection-powered registry queries (GCC 16.1+ / C++26) + +When `THREADSCHEDULE_ENABLE_REFLECTION=ON` is active on GCC 16.1+ with +`-std=c++26`, ThreadSchedule exposes field metadata and ergonomic field-oriented +registry queries.
+ +```cpp +#include +using namespace threadschedule; + +auto io_names = + registry() + .where("io") + .project(); + +auto live_compute = + registry() + .where("compute") + .where_if([](bool alive) { + return alive; + }) + .project(); + +bool has_scheduler = registry().contains("sched_main"); +``` + +You can also inspect reflected library types directly: + +```cpp +#include +using namespace threadschedule; + +static_assert(reflect::field_count() == 6); +static_assert(reflect::field_name() == "name"); + +ThreadProfile profile = profiles::throughput(); +reflect::visit_fields(profile, [](std::string_view field, auto const& value) { + // inspect compile-time-described fields at runtime +}); +``` + Find by name (Linux): ```cpp @@ -584,8 +633,162 @@ worker.set_affinity(affinity); ### Benchmark Results Performance varies by system configuration, workload characteristics, and task -complexity. See [benchmarks/](benchmarks/) for detailed performance analysis, -real-world scenario testing, and optimization recommendations. +complexity. The charts below were captured in a single environment; reproduce +them on your own machine with `./run_benchmark_graphs.sh` (HTML report) or +regenerate the SVGs with `benchmarks/generate_readme_graphs.py`. + +
+Benchmark environment & build flags + +| Setting | Value | +| ---------------- | --------------------------------------------------------------------- | +| CPU | AMD Ryzen 5 5600X (6 cores / 12 threads, 32 MiB L3, up to ~4.65 GHz) | +| OS / kernel | Fedora 44, Linux 7.0.4-200.fc44.x86_64 | +| Compiler | GCC 16.1.1 (`-std=c++23` for the pool charts; `-std=c++26` for the reflection charts; C++17/20/23/26 for the callable charts) | +| Build type | `Release` (`-O3 -DNDEBUG`) | +| Extra flags | `-march=native -ffast-math -fno-omit-frame-pointer` | +| Google Benchmark | v1.9.4 | +| Threads | 4 worker threads unless noted | + +The exact compile flags used for every benchmark target (see +[`benchmarks/CMakeLists.txt`](benchmarks/CMakeLists.txt)): + +```bash +# GCC / Clang +-O3 -DNDEBUG -fno-omit-frame-pointer -march=native -ffast-math +# plus the C++ standard: -std=c++23 (pool charts), -std=c++26 (reflection charts), +# -std=c++17 / 20 / 23 / 26 (callable charts) +``` + +> Absolute numbers are only meaningful relative to each other on the **same** +> machine and build. `-march=native` and `-ffast-math` in particular mean results +> are not comparable across CPUs. Re-run the benchmarks locally before drawing +> conclusions for your hardware. + +
+ +**Throughput scales with batch size.** For tiny tasks the +fire-and-forget `LightweightPool` consistently leads, while the work-stealing +`HighPerformancePool` pays for its extra machinery and only shines on larger, +unbalanced workloads: + +![Thread pool throughput by batch size](docs/benchmarks/pool_throughput.svg) + +**Pick the right pool for the workload.** Running 100,000 trivial tasks, the +`LightweightPool` finishes ~1.9x faster than the baseline `ThreadPool`, whereas +the work-stealing pool is slower because the tasks are too small to benefit from +stealing: + +![Thread pool comparison for a light workload](docs/benchmarks/pool_comparison.svg) + +**The gap depends heavily on how much work each task does.** With the pool built +once and the per-task work swept from `tiny` to `heavy`, the picture changes: for +tiny/medium tasks submission overhead dominates and `LightweightPool` wins by +~2-3x, but as the per-task work grows the field converges to within ~20% and the +pool choice stops mattering much. The work-stealing `HighPerformancePool` climbs +from last place (tiny) to nearly the front (heavy): + +![Pool comparison across workload weights](docs/benchmarks/pool_workload.svg) + +**Skip the future when you do not need it.** `post()` reuses the same queue path +as `submit()` but avoids the `packaged_task` / `std::future` overhead, which is +dramatic for very short tasks: + +![post() versus submit() submission overhead](docs/benchmarks/post_vs_submit.svg) + +> These numbers measure submission/scheduling overhead with light tasks, so they +> represent a worst case for pool overhead. As the "workload weights" chart +> shows, real workloads with heavier per-task work narrow these gaps +> considerably. + +#### Reflection-backed registry queries (GCC 16.1+ / C++26) + +With `THREADSCHEDULE_ENABLE_REFLECTION=ON` the registry exposes ergonomic, +field-oriented queries (`where` / `project` / `find_by`). 
These trade a little +performance for readability and compile-time field checking: against +hand-written STL-style lambdas over 16,384 registered threads they currently run +slightly slower, so reach for them when expressiveness matters more than the last +few percent of throughput. + +![Reflection query: project a field versus hand-written filter + map](docs/benchmarks/reflection_query.svg) + +![Reflection query: find by field versus hand-written find_if](docs/benchmarks/reflection_lookup.svg) + +#### Task storage: `std::move_only_function` and SBO callables + +The pools store type-erased tasks in one of two ways: `ThreadPool` / +`FastThreadPool` / `HighPerformancePool` use `detail::move_callable` +(`std::function` on C++17/20, `std::move_only_function` on C++23+), while +`LightweightPool` uses a custom small-buffer callable (`SboCallable<64>`). The +`callable_std_benchmarks` target isolates the build + invoke cost of these +wrappers (away from thread-scheduling noise) and is compiled under every standard. + +**Does replacing `std::function` help?** For small captures, switching to +`std::move_only_function` on C++23+ cuts the per-task wrapper cost by ~30% +(~4.6 ns to ~3.1 ns). For larger captures the heap allocation dominates and the +wrapper choice barely matters: + +![move_callable cost across C++ standards](docs/benchmarks/callable_standards.svg) + +**Do the SBO callables help?** Yes — and this is the bigger effect. A 48-byte +capture fits `LightweightPool`'s 56-byte inline buffer but overflows the +standard-library callables' small buffer, so the latter heap-allocate. The SBO +path is then ~6x faster (~3.4 ns vs ~21 ns per task). Once a capture is too big +for any inline buffer (128 B), both allocate and the advantage disappears: + +![SBO callable versus standard-library callable](docs/benchmarks/callable_sbo.svg) + +
+How big is a task, really? (capture sizes & inline buffers) + +A task is usually a lambda, and **a lambda's size is the sum of what it captures** +(plus alignment padding). A capture-less lambda is effectively free; each captured +pointer or reference adds 8 bytes, and capturing objects *by value* adds their +full size. Concrete sizes on this platform (GCC 16 / libstdc++, x86_64): + +| What the task captures | Example | Size | +| --------------------------------------------------- | ---------------------------------------- | ------ | +| nothing (stateless) | `pool.post([]{ tick(); });` | ~1 B | +| one pointer / reference / `this` | `pool.post([&q]{ q.drain(); });` | 8 B | +| two pointers / references | `pool.post([&a, &b]{ join(a, b); });` | 16 B | +| a `std::shared_ptr` by value | `pool.post([h]{ h->run(); });` | 16 B | +| a `std::vector` by value | `pool.post([data]{ process(data); });` | 24 B | +| a `std::string` by value | `pool.post([name]{ log(name); });` | 32 B | +| ~6 small values / handles (the chart's "medium") | `pool.post([id,a,b,c,d,e]{ ... 
});` | 48 B | +| a big array / struct by value (the chart's "large") | `pool.post([frame]{ encode(frame); });` | 128 B | + +Each storage type keeps small callables **inline** (no allocation) up to a fixed +buffer size, and falls back to a heap allocation above it: + +| Storage | Inline buffer | Used by | +| ----------------------------- | ------------- | ----------------------------------------- | +| `std::function` | ≤ 16 B | `ThreadPool` family on C++17/20 | +| `std::move_only_function` | ≤ 24 B | `ThreadPool` family on C++23+ | +| `SboCallable<64>` | ≤ 56 B | `LightweightPool` (`= LightweightPoolT<64>`) | + +`SboCallable` lays each task out as one cache line: + +``` + |<------------- TaskSize = 64 B ------------->| + [ vtable* (8 B) | inline capture buffer (56 B) ] +``` + +**Typical real tasks capture a few pointers/handles plus maybe a small value, so +they land in the ~8-48 B range.** That fits `LightweightPool`'s 56 B buffer with +no allocation, but overflows `std::function`'s 16 B buffer (one allocation per +task). If you capture large objects by value you blow past every inline buffer - +capture a pointer/handle to the data instead, or bump the buffer with +`LightweightPoolT<128>`. + +
+ +> Takeaway: keep task captures small. They stay inline (no allocation) in +> `LightweightPool`, and on C++23+ the other pools also benefit from the +> move-only wrapper. This is exactly why `post()` and `LightweightPool` are the +> recommended low-overhead paths. + +See [benchmarks/](benchmarks/) for detailed performance analysis, real-world +scenario testing, and optimization recommendations. ## Platform-Specific Features diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 4feb43b..102accc 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -7,6 +7,10 @@ add_executable(threadpool_throughput_benchmarks throughput_benchmarks.cpp) add_executable(threadpool_memory_benchmarks memory_benchmarks.cpp) add_executable(threadpool_resampling_benchmarks resampling_benchmarks.cpp) add_executable(callable_benchmarks callable_benchmarks.cpp) +add_executable(callable_std_benchmarks callable_std_benchmarks.cpp) +if(THREADSCHEDULE_HAS_REFLECTION) + add_executable(reflection_registry_benchmarks reflection_registry_benchmarks.cpp) +endif() # Real-world scenario benchmarks add_executable(web_server_benchmarks web_server_benchmarks.cpp) @@ -20,10 +24,14 @@ set(ALL_BENCHMARK_TARGETS threadpool_memory_benchmarks threadpool_resampling_benchmarks callable_benchmarks + callable_std_benchmarks web_server_benchmarks database_benchmarks audio_video_benchmarks ) +if(THREADSCHEDULE_HAS_REFLECTION) + list(APPEND ALL_BENCHMARK_TARGETS reflection_registry_benchmarks) +endif() # Link libraries for all benchmarks foreach(target ${ALL_BENCHMARK_TARGETS}) @@ -59,6 +67,9 @@ add_test(NAME ThreadPoolThroughputBenchmarks COMMAND threadpool_throughput_bench add_test(NAME ThreadPoolMemoryBenchmarks COMMAND threadpool_memory_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3) add_test(NAME ThreadPoolResamplingBenchmarks COMMAND threadpool_resampling_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3) add_test(NAME CallableBenchmarks COMMAND 
callable_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3) +if(THREADSCHEDULE_HAS_REFLECTION) + add_test(NAME ReflectionRegistryBenchmarks COMMAND reflection_registry_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3) +endif() add_test(NAME WebServerBenchmarks COMMAND web_server_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3) add_test(NAME DatabaseBenchmarks COMMAND database_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3) add_test(NAME AudioVideoBenchmarks COMMAND audio_video_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3) diff --git a/benchmarks/README.md b/benchmarks/README.md index 36be4b5..12fdbfe 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -59,6 +59,10 @@ cmake --build build --target run_quick_benchmarks # Run all core benchmarks (2s per test, 3 repetitions) - use the run_benchmarks.sh script ./run_benchmarks.sh +# Generate an HTML report with graphs + speedups for comparison benchmarks +./run_benchmark_graphs.sh +./run_benchmark_graphs.sh --quick + # Or run specific benchmark suites with custom settings ./build/benchmarks/threadpool_basic_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3 ./build/benchmarks/web_server_benchmarks --benchmark_min_time=2s --benchmark_repetitions=3 @@ -228,6 +232,111 @@ This shows: # Generate JSON for analysis ./database_benchmarks --benchmark_format=json --benchmark_out=results.json + +# Turn one or more Google Benchmark JSON files into a local HTML report +python3 benchmarks/generate_benchmark_report.py \ + --output build/benchmark-report.html \ + --title "Local benchmark comparison" \ + build/benchmarks/threadpool_comparisons.json \ + build/benchmarks/reflection_registry.json +``` + +## Graphs and Speedups + +The repository now includes a local report generator that turns Google +Benchmark JSON output into a standalone HTML report with: + +- Absolute timing bar charts +- Relative speedup annotations (for known comparison families) +- 
Automatically collected machine information +- Side-by-side tables for comparison-oriented benchmark groups + +The current heuristics explicitly understand: + +- `BM_ComparePoolTypes_LightWorkload` +- `BM_PostVsSubmit` +- `BM_QueryView_FilterMapName` +- `BM_QueryView_ReflectionWhereProjectName` +- `BM_QueryView_FindIf` +- `BM_QueryView_ReflectionFindBy` + +This is enough to visualize both classic pool comparisons and the new +reflection registry speedups without extra dependencies such as matplotlib. + +### Standalone SVG charts for the README + +`generate_readme_graphs.py` turns the same Google Benchmark JSON into a few +self-contained SVG files (light background, dark text) that embed cleanly into +Markdown and render in both light and dark GitHub themes: + +```bash +# Produce JSON from the comparison benchmarks +./build/benchmarks/threadpool_basic_benchmarks \ + --benchmark_filter="BM_ComparePoolTypes_LightWorkload|BM_ComparePoolWorkload|BM_PostVsSubmit" \ + --benchmark_format=json \ + --benchmark_out=build/threadpool_comparisons.json + +# Optional: reflection query benchmarks (needs a C++26 + reflection build) +./build-reflection/benchmarks/reflection_registry_benchmarks \ + --benchmark_filter="BM_QueryView_.*" \ + --benchmark_format=json \ + --benchmark_out=build/reflection_registry.json + +# Render the README charts (no matplotlib required) +python3 benchmarks/generate_readme_graphs.py \ + --output-dir docs/benchmarks \ + build/threadpool_comparisons.json \ + build/reflection_registry.json +``` + +The generator accepts any number of JSON files and emits the charts it can build +from the data it finds: + +| SVG file | Source benchmark | +| --------------------------------- | -------------------------------------------------- | +| `pool_throughput.svg` | `BM_ComparePoolTypes_LightWorkload` | +| `pool_comparison.svg` | `BM_ComparePoolTypes_LightWorkload` | +| `pool_workload.svg` | `BM_ComparePoolWorkload` | +| `post_vs_submit.svg` | `BM_PostVsSubmit` | +| 
`reflection_query.svg` | `BM_QueryView_FilterMapName` vs `...WhereProject` | +| `reflection_lookup.svg` | `BM_QueryView_FindIf` vs `BM_QueryView_ReflectionFindBy` | +| `callable_standards.svg` | `callable_std_benchmarks` (`BM_MoveCallable_*`, one JSON per standard) | +| `callable_sbo.svg` | `callable_std_benchmarks` (`BM_MoveCallable_*` vs `BM_Sbo_*`) | + +All files are written into `docs/benchmarks/` and referenced from the top-level +`README.md`. The reflection charts require building `reflection_registry_benchmarks`, +which is only available on GCC 16.1+ with `-DCMAKE_CXX_STANDARD=26 -DTHREADSCHEDULE_ENABLE_REFLECTION=ON`: + +```bash +cmake -S . -B build-reflection -G Ninja -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=26 -DTHREADSCHEDULE_BUILD_BENCHMARKS=ON \ + -DTHREADSCHEDULE_ENABLE_REFLECTION=ON +cmake --build build-reflection --target reflection_registry_benchmarks +``` + +#### Cross-standard callable charts + +`callable_std_benchmarks` isolates the cost of ThreadSchedule's task storage +(`detail::move_callable`, which is `std::function` on C++17/20 and +`std::move_only_function` on C++23+, versus the `SboCallable` small-buffer +callable used by `LightweightPool`). 
To compare standards, build the same source +under each one and feed the per-standard JSON (named `callable_cxx.json`, the +generator reads the standard from the file name) to the generator: + +```bash +for std in 17 20 23 26; do + g++ -std=c++$std -O3 -DNDEBUG -march=native -ffast-math -fno-omit-frame-pointer \ + -Iinclude -Ibuild/_deps/benchmark-src/include \ + benchmarks/callable_std_benchmarks.cpp \ + build/_deps/benchmark-build/src/libbenchmark.a -lpthread -o /tmp/callable_c$std + /tmp/callable_c$std --benchmark_min_time=0.5s --benchmark_repetitions=3 \ + --benchmark_report_aggregates_only=true --benchmark_format=json \ + --benchmark_out=build/callable_cxx$std.json +done + +python3 benchmarks/generate_readme_graphs.py --output-dir docs/benchmarks \ + build/callable_cxx17.json build/callable_cxx20.json \ + build/callable_cxx23.json build/callable_cxx26.json ``` ### Performance Regression Testing diff --git a/benchmarks/callable_std_benchmarks.cpp b/benchmarks/callable_std_benchmarks.cpp new file mode 100644 index 0000000..50f40c4 --- /dev/null +++ b/benchmarks/callable_std_benchmarks.cpp @@ -0,0 +1,101 @@ +// Cross-standard callable storage micro-benchmark. +// +// ThreadSchedule stores type-erased tasks in one of two ways: +// +// - detail::move_callable -- the hot-path storage used by +// ThreadPool / FastThreadPool / HighPerformancePool. It is an alias for +// std::function on C++17/20 and for std::move_only_function on C++23+. +// - detail::SboCallable -- the small-buffer callable used by +// LightweightPool. It stores callables up to TaskSize-8 bytes inline and is +// identical across every C++ standard. +// +// This benchmark isolates the construction (including any heap allocation) and +// invocation cost of those two storage types, away from thread scheduling noise, +// so the same binary can be compiled under C++17/20/23/26 and compared. It +// answers two questions directly: +// +// 1. Does replacing std::function with std::move_only_function help? 
+// -> compare BM_MoveCallable_* across standards. +// 2. Do the SBO callables help? +// -> compare BM_Sbo_* against BM_MoveCallable_* for the same capture. +// +// Written to compile as C++17 (no concepts / requires). + +#include +#include +#include +#include +#include +#include +#include + +using namespace threadschedule; + +namespace +{ + +// Build kBatch callables (each capturing an NWords * 8 byte payload by value, plus `sink` by reference) into a reused vector, +// then invoke them all. This amortizes timer overhead and measures exactly the +// storage construction + indirect call that the callable type controls. NOTE(review): the by-reference `sink` capture presumably adds one pointer to each closure, so closure sizes are likely 16 / 56 / 136 B rather than 8 / 48 / 128 B -- confirm. +template +void bench_storage(benchmark::State& state) +{ + constexpr std::size_t kBatch = 256; + std::vector store; + store.reserve(kBatch); + volatile std::uint64_t sink = 0; + + for (auto _ : state) + { + store.clear(); + for (std::size_t i = 0; i < kBatch; ++i) + { + std::array payload{}; + payload[0] = i; + store.emplace_back([payload, &sink]() mutable { sink += payload[0] + 1; }); + } + for (auto& callable : store) + callable(); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(static_cast(state.iterations() * kBatch)); +} + +} // namespace + +// move_callable == std::function (C++17/20) or std::move_only_function (C++23+) +static void BM_MoveCallable_Small(benchmark::State& state) +{ + bench_storage, 1>(state); // 8 B payload (fits all) +} +static void BM_MoveCallable_Medium(benchmark::State& state) +{ + bench_storage, 6>(state); // 48 B payload (heap in std lib callables) +} +static void BM_MoveCallable_Large(benchmark::State& state) +{ + bench_storage, 16>(state); // 128 B payload (heap everywhere) +} + +// SboCallable<64> == LightweightPool storage (56 B inline buffer) +static void BM_Sbo_Small(benchmark::State& state) +{ + bench_storage, 1>(state); +} +static void BM_Sbo_Medium(benchmark::State& state) +{ + bench_storage, 6>(state); +} +static void BM_Sbo_Large(benchmark::State& state) +{ + bench_storage, 16>(state); +} +
+BENCHMARK(BM_MoveCallable_Small)->Unit(benchmark::kNanosecond); +BENCHMARK(BM_MoveCallable_Medium)->Unit(benchmark::kNanosecond); +BENCHMARK(BM_MoveCallable_Large)->Unit(benchmark::kNanosecond); +BENCHMARK(BM_Sbo_Small)->Unit(benchmark::kNanosecond); +BENCHMARK(BM_Sbo_Medium)->Unit(benchmark::kNanosecond); +BENCHMARK(BM_Sbo_Large)->Unit(benchmark::kNanosecond); + +BENCHMARK_MAIN(); diff --git a/benchmarks/generate_benchmark_report.py b/benchmarks/generate_benchmark_report.py new file mode 100755 index 0000000..bf6274e --- /dev/null +++ b/benchmarks/generate_benchmark_report.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import datetime as dt +import html +import json +import os +import platform +import re +import subprocess +from dataclasses import dataclass, field +from pathlib import Path +from typing import Iterable + + +TIME_TO_NS = { + "ns": 1.0, + "us": 1_000.0, + "ms": 1_000_000.0, + "s": 1_000_000_000.0, +} + +POOL_NAMES = ("ThreadPool", "FastThreadPool", "HighPerformancePool", "LightweightPool") + +EXPLICIT_GROUPS: dict[str, tuple[str, str, str]] = { + "BM_QueryView_FilterMapName": ("Reflection query: name projection", "filter + map", "filter + map"), + "BM_QueryView_ReflectionWhereProjectName": ( + "Reflection query: name projection", + "reflection where + project", + "filter + map", + ), + "BM_QueryView_FindIf": ("Reflection query: lookup", "find_if", "find_if"), + "BM_QueryView_ReflectionFindBy": ("Reflection query: lookup", "reflection find_by", "find_if"), +} + + +@dataclass +class Run: + source: str + family: str + full_name: str + args: tuple[str, ...] 
+ label: str + time_ns: float + original_unit: str + context: dict[str, object] = field(default_factory=dict) + + +@dataclass +class ComparisonGroup: + title: str + baseline: str + runs: list[Run] + + +def run_command(command: list[str]) -> str: + try: + completed = subprocess.run(command, capture_output=True, text=True, check=True) + return completed.stdout.strip() + except Exception: + return "" + + +def collect_system_info() -> dict[str, str]: + info: dict[str, str] = { + "Timestamp": dt.datetime.now().isoformat(timespec="seconds"), + "Hostname": platform.node(), + "Platform": platform.platform(), + "Kernel": run_command(["uname", "-a"]), + } + + lscpu = run_command(["lscpu"]) + if lscpu: + def extract(pattern: str) -> str: + match = re.search(pattern, lscpu, re.MULTILINE) + return match.group(1).strip() if match else "" + + info["CPU"] = extract(r"^Model name:\s+(.+)$") + info["CPU cores"] = extract(r"^Core\(s\) per socket:\s+(.+)$") + info["CPU threads"] = extract(r"^CPU\(s\):\s+(.+)$") + info["Max MHz"] = extract(r"^CPU max MHz:\s+(.+)$") + info["L3 cache"] = extract(r"^L3 cache:\s+(.+)$") + + mem = run_command(["free", "-h"]) + if mem: + lines = mem.splitlines() + if len(lines) >= 2: + parts = lines[1].split() + if len(parts) >= 7: + info["Memory total"] = parts[1] + info["Memory available"] = parts[6] + if len(lines) >= 3: + parts = lines[2].split() + if len(parts) >= 3: + info["Swap total"] = parts[1] + + gpu = run_command(["sh", "-lc", "lspci | rg 'VGA|3D|Display'"]) + if gpu: + info["GPU"] = gpu.splitlines()[0].strip() + + disks = run_command(["lsblk", "-d", "-o", "NAME,SIZE,MODEL"]) + if disks: + disk_lines = [line.strip() for line in disks.splitlines()[1:] if line.strip()] + if disk_lines: + info["Storage"] = "; ".join(disk_lines[:4]) + + git_commit = run_command(["git", "rev-parse", "--short", "HEAD"]) + if git_commit: + info["Git commit"] = git_commit + + git_branch = run_command(["git", "branch", "--show-current"]) + if git_branch: + info["Git 
branch"] = git_branch + + return {key: value for key, value in info.items() if value} + + +def load_runs(path: Path) -> list[Run]: + payload = json.loads(path.read_text()) + context = payload.get("context", {}) + runs: list[Run] = [] + for bench in payload.get("benchmarks", []): + if bench.get("aggregate_name") or bench.get("run_type") == "aggregate": + continue + if "real_time" not in bench and "cpu_time" not in bench: + continue + full_name = str(bench["name"]) + parts = full_name.split("/") + family = parts[0] + args = tuple(parts[1:]) + unit = str(bench.get("time_unit", "ns")) + raw_value = float(bench.get("real_time", bench.get("cpu_time"))) + time_ns = raw_value * TIME_TO_NS.get(unit, 1.0) + runs.append( + Run( + source=path.name, + family=family, + full_name=full_name, + args=args, + label=str(bench.get("label", "")), + time_ns=time_ns, + original_unit=unit, + context=context, + ) + ) + return runs + + +def detect_group(run: Run) -> tuple[str, str, str] | None: + if run.family in EXPLICIT_GROUPS: + title, variant, baseline = EXPLICIT_GROUPS[run.family] + suffix = ", ".join(run.args) if run.args else "default" + return (f"{title} ({suffix})", variant, baseline) + + if run.family == "BM_ComparePoolTypes_LightWorkload" and run.label: + task_match = re.search(r"tasks=(\d+)", run.label) + pool_name = next((name for name in POOL_NAMES if name in run.label), run.label) + tasks = task_match.group(1) if task_match else (run.args[0] if run.args else "unknown") + return (f"Pool comparison: light workload ({tasks} tasks)", pool_name, "ThreadPool") + + if run.family == "BM_PostVsSubmit": + tasks = run.args[0] if run.args else "unknown" + variant = run.label or ("submit(future)" if run.args[-1:] == ("0",) else "post(fire-forget)") + return (f"Post vs submit ({tasks} tasks)", variant, "submit(future)") + + return None + + +def build_comparisons(runs: Iterable[Run]) -> list[ComparisonGroup]: + grouped: dict[str, tuple[str, str, list[Run]]] = {} + variant_names: dict[str, 
list[str]] = {} + + for run in runs: + detected = detect_group(run) + if not detected: + continue + title, variant, baseline = detected + key = title + if key not in grouped: + grouped[key] = (title, baseline, []) + variant_names[key] = [] + grouped[key][2].append(run) + variant_names[key].append(variant) + run.context = dict(run.context) + run.context["variant_name"] = variant + + groups: list[ComparisonGroup] = [] + for key, (title, baseline, values) in grouped.items(): + if len(values) < 2: + continue + groups.append(ComparisonGroup(title=title, baseline=baseline, runs=values)) + groups.sort(key=lambda group: group.title) + return groups + + +def format_time_ns(time_ns: float) -> str: + if time_ns >= 1_000_000_000.0: + return f"{time_ns / 1_000_000_000.0:.3f} s" + if time_ns >= 1_000_000.0: + return f"{time_ns / 1_000_000.0:.3f} ms" + if time_ns >= 1_000.0: + return f"{time_ns / 1_000.0:.3f} us" + return f"{time_ns:.0f} ns" + + +def speedup_label(baseline_ns: float, run_ns: float) -> str: + if run_ns <= 0: + return "n/a" + ratio = baseline_ns / run_ns + if abs(ratio - 1.0) < 0.02: + return "same speed" + if ratio > 1.0: + return f"{ratio:.2f}x faster" + return f"{1.0 / ratio:.2f}x slower" + + +def render_bar_chart(items: list[tuple[str, float, str]], width: int = 920, bar_height: int = 30) -> str: + if not items: + return "" + max_value = max(value for _, value, _ in items) or 1.0 + label_width = 280 + chart_width = width - label_width - 120 + height = len(items) * (bar_height + 18) + 24 + bars = [] + for index, (label, value, annotation) in enumerate(items): + y = 20 + index * (bar_height + 18) + bar_width = max(2.0, chart_width * (value / max_value)) + bars.append( + f'{html.escape(label)}' + f'' + f'{html.escape(annotation)}' + ) + return ( + f'' + + "".join(bars) + + "" + ) + + +def comparison_table(group: ComparisonGroup) -> str: + variants: list[tuple[str, Run]] = [] + for run in group.runs: + variant = str(run.context.get("variant_name", run.label or 
run.family)) + variants.append((variant, run)) + + baseline_run = next((run for variant, run in variants if variant == group.baseline), variants[0][1]) + rows = [] + chart_items = [] + + for variant, run in sorted(variants, key=lambda item: item[1].time_ns): + speedup = speedup_label(baseline_run.time_ns, run.time_ns) + rows.append( + "" + f"{html.escape(variant)}" + f"{html.escape(format_time_ns(run.time_ns))}" + f"{html.escape(speedup)}" + f"{html.escape(run.source)}" + "" + ) + chart_items.append((variant, run.time_ns, f"{format_time_ns(run.time_ns)} | {speedup}")) + + return ( + f"

{html.escape(group.title)}

" + + render_bar_chart(chart_items) + + "" + + "".join(rows) + + "
VariantTimeRelative to baselineSource
" + ) + + +def overall_section(runs: list[Run]) -> str: + top = sorted(runs, key=lambda run: run.time_ns)[:14] + chart_items = [] + for run in top: + label = run.label or run.full_name + chart_items.append((label[:42], run.time_ns, format_time_ns(run.time_ns))) + return ( + "

Fastest benchmark runs

" + "

Absolute timings across the provided JSON files. Lower is better.

" + + render_bar_chart(chart_items) + + "
" + ) + + +def system_info_section(system_info: dict[str, str]) -> str: + rows = "".join( + f"{html.escape(key)}{html.escape(value)}" for key, value in system_info.items() + ) + return ( + "

System information

" + "" + + rows + + "
" + ) + + +def build_html(title: str, runs: list[Run], groups: list[ComparisonGroup], system_info: dict[str, str]) -> str: + comparison_sections = "".join(comparison_table(group) for group in groups) + if not comparison_sections: + comparison_sections = "

No comparison groups detected

The input JSON did not match any known comparison patterns yet.

" + + return f""" + + + + + {html.escape(title)} + + + +
+
+

{html.escape(title)}

+

Google Benchmark comparison report with automatically collected machine data and relative speedups.

+

Loaded benchmark runs: {len(runs)} | Comparison groups: {len(groups)}

+
+
+ {system_info_section(system_info)} + {overall_section(runs)} +
+

Relative speedups

+ {comparison_sections} +
+ + +""" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate a local HTML benchmark report with graphs and speedups.") + parser.add_argument("json_files", nargs="+", help="Google Benchmark JSON files") + parser.add_argument("--output", required=True, help="Output HTML file") + parser.add_argument("--title", default="ThreadSchedule benchmark report", help="Report title") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + json_paths = [Path(value) for value in args.json_files] + runs: list[Run] = [] + for path in json_paths: + runs.extend(load_runs(path)) + + if not runs: + raise SystemExit("No benchmark runs found in the provided JSON files.") + + system_info = collect_system_info() + groups = build_comparisons(runs) + html_payload = build_html(args.title, runs, groups, system_info) + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(html_payload, encoding="utf-8") + print(f"Wrote benchmark report to {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/generate_readme_graphs.py b/benchmarks/generate_readme_graphs.py new file mode 100644 index 0000000..bbe5f2a --- /dev/null +++ b/benchmarks/generate_readme_graphs.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python3 +"""Generate standalone SVG charts from Google Benchmark JSON for README embedding. + +This intentionally has no third-party dependencies (no matplotlib): it emits +self-contained SVG files with an explicit light background so they render well +in both light and dark GitHub themes. 
+""" + +from __future__ import annotations + +import argparse +import html +import json +from dataclasses import dataclass +from pathlib import Path + +TIME_TO_NS = {"ns": 1.0, "us": 1_000.0, "ms": 1_000_000.0, "s": 1_000_000_000.0} + +POOL_ORDER = ("ThreadPool", "FastThreadPool", "HighPerformancePool", "LightweightPool") +POOL_COLORS = { + "ThreadPool": "#2a7fff", + "FastThreadPool": "#16a34a", + "HighPerformancePool": "#f59e0b", + "LightweightPool": "#db2777", +} +VARIANT_COLORS = { + "submit(future)": "#2a7fff", + "post(fire-forget)": "#16a34a", +} +WORKLOAD_ORDER = ("tiny", "medium", "heavy", "imbalanced") + +CXX_COLORS = {"C++17": "#94a3b8", "C++20": "#2a7fff", "C++23": "#16a34a", "C++26": "#db2777"} +CALLABLE_BATCH = 256 # kBatch in callable_std_benchmarks.cpp +CALLABLE_CAPTURES = (("Small", "small (8 B)"), ("Medium", "medium (48 B)"), ("Large", "large (128 B)")) + +INK = "#122033" +MUTED = "#5b6b82" +LINE = "#dbe3ef" +BG = "#ffffff" + + +@dataclass +class Entry: + family: str + args: tuple[str, ...] 
+ label: str + time_ns: float + items_per_second: float + + +def load_entries(path: Path) -> list[Entry]: + payload = json.loads(path.read_text()) + entries: list[Entry] = [] + for bench in payload.get("benchmarks", []): + if bench.get("run_type") == "aggregate" or bench.get("aggregate_name"): + continue + if "real_time" not in bench and "cpu_time" not in bench: + continue + parts = str(bench["name"]).split("/") + unit = str(bench.get("time_unit", "ns")) + raw = float(bench.get("real_time", bench.get("cpu_time"))) + entries.append( + Entry( + family=parts[0], + args=tuple(parts[1:]), + label=str(bench.get("label", "")), + time_ns=raw * TIME_TO_NS.get(unit, 1.0), + items_per_second=float(bench.get("items_per_second", 0.0)), + ) + ) + return entries + + +def fmt_time(time_ns: float) -> str: + if time_ns >= 1_000_000_000.0: + return f"{time_ns / 1_000_000_000.0:.2f} s" + if time_ns >= 1_000_000.0: + return f"{time_ns / 1_000_000.0:.2f} ms" + if time_ns >= 1_000.0: + return f"{time_ns / 1_000.0:.2f} us" + return f"{time_ns:.0f} ns" + + +def svg_header(width: int, height: int, title: str, subtitle: str) -> list[str]: + return [ + f'', + f'', + f'{html.escape(title)}', + f'{html.escape(subtitle)}', + ] + + +def horizontal_bar_chart( + title: str, + subtitle: str, + items: list[tuple[str, float, str, str]], + value_suffix: str = "", +) -> str: + """items: (label, value, annotation, color).""" + width = 760 + top = 80 + bar_h = 34 + gap = 22 + label_w = 200 + right_pad = 150 + chart_w = width - label_w - right_pad - 24 + height = top + len(items) * (bar_h + gap) + 16 + max_value = max(v for _, v, _, _ in items) or 1.0 + + parts = svg_header(width, height, title, subtitle) + for i, (label, value, annotation, color) in enumerate(items): + y = top + i * (bar_h + gap) + bar_w = max(3.0, chart_w * (value / max_value)) + parts.append( + f'{html.escape(label)}' + ) + parts.append( + f'' + ) + parts.append( + f'{html.escape(annotation)}' + ) + parts.append("") + return 
"\n".join(parts) + + +def grouped_bar_chart( + title: str, + subtitle: str, + group_labels: list[str], + series: list[tuple[str, list[float], str]], + y_axis_label: str, +) -> str: + """series: (name, values_per_group, color).""" + width = 820 + top = 96 + plot_h = 300 + left_pad = 64 + right_pad = 24 + plot_w = width - left_pad - right_pad + height = top + plot_h + 86 + + max_value = max((max(vals) for _, vals, _ in series), default=1.0) or 1.0 + # round max up to a nice number + import math + + magnitude = 10 ** math.floor(math.log10(max_value)) if max_value > 0 else 1 + nice_max = math.ceil(max_value / magnitude) * magnitude + if nice_max == 0: + nice_max = 1 + + parts = svg_header(width, height, title, subtitle) + + baseline_y = top + plot_h + # gridlines + y ticks + ticks = 5 + for t in range(ticks + 1): + gy = baseline_y - plot_h * t / ticks + val = nice_max * t / ticks + parts.append( + f'' + ) + parts.append( + f'{val:.1f}' + ) + parts.append( + f'{html.escape(y_axis_label)}' + ) + + n_groups = len(group_labels) + n_series = len(series) + group_w = plot_w / n_groups + inner_pad = group_w * 0.16 + bar_w = (group_w - 2 * inner_pad) / n_series + + for g in range(n_groups): + gx = left_pad + g * group_w + for s, (_, vals, color) in enumerate(series): + value = vals[g] + bh = plot_h * (value / nice_max) + bx = gx + inner_pad + s * bar_w + parts.append( + f'' + ) + parts.append( + f'{html.escape(group_labels[g])}' + ) + + # legend + legend_y = baseline_y + 44 + lx = left_pad + for name, _, color in series: + parts.append(f'') + parts.append( + f'{html.escape(name)}' + ) + lx += 30 + len(name) * 7.2 + parts.append("") + return "\n".join(parts) + + +def pool_runs_by_tasks(entries: list[Entry]) -> dict[str, dict[str, Entry]]: + """tasks -> pool_name -> Entry.""" + out: dict[str, dict[str, Entry]] = {} + for e in entries: + if e.family != "BM_ComparePoolTypes_LightWorkload": + continue + # Label format is " tasks=N"; match the leading token so that + # "ThreadPool" 
does not shadow "FastThreadPool" via substring matching. + first_token = e.label.split()[0] if e.label else "" + pool = first_token if first_token in POOL_ORDER else None + if not pool: + continue + tasks = e.args[0] if e.args else "?" + out.setdefault(tasks, {})[pool] = e + return out + + +def build_pool_comparison(entries: list[Entry], out_dir: Path) -> Path | None: + by_tasks = pool_runs_by_tasks(entries) + if not by_tasks: + return None + tasks = max(by_tasks, key=lambda t: int(t) if t.isdigit() else 0) + pools = by_tasks[tasks] + baseline = pools.get("ThreadPool") + items: list[tuple[str, float, str, str]] = [] + ordered = sorted(pools.items(), key=lambda kv: kv[1].time_ns) + for pool, e in ordered: + if baseline and baseline.time_ns > 0 and e.time_ns > 0: + ratio = baseline.time_ns / e.time_ns + if abs(ratio - 1.0) < 0.02: + rel = "baseline" + elif ratio >= 1.0: + rel = f"{ratio:.2f}x faster" + else: + rel = f"{1.0 / ratio:.2f}x slower" + else: + rel = "" + annotation = f"{fmt_time(e.time_ns)} ({rel})" if rel else fmt_time(e.time_ns) + items.append((pool, e.time_ns, annotation, POOL_COLORS.get(pool, "#2a7fff"))) + svg = horizontal_bar_chart( + "Thread pool comparison \u2014 light workload", + f"Wall-clock time to run {int(tasks):,} tiny tasks (lower is better, relative to ThreadPool)", + items, + ) + path = out_dir / "pool_comparison.svg" + path.write_text(svg, encoding="utf-8") + return path + + +def build_pool_throughput(entries: list[Entry], out_dir: Path) -> Path | None: + by_tasks = pool_runs_by_tasks(entries) + if not by_tasks: + return None + task_keys = sorted((t for t in by_tasks if t.isdigit()), key=lambda t: int(t)) + group_labels = [f"{int(t):,}" for t in task_keys] + series: list[tuple[str, list[float], str]] = [] + for pool in POOL_ORDER: + vals = [] + for t in task_keys: + e = by_tasks[t].get(pool) + vals.append((e.items_per_second / 1_000_000.0) if e else 0.0) + if any(v > 0 for v in vals): + series.append((pool, vals, POOL_COLORS.get(pool, 
"#2a7fff"))) + if not series: + return None + svg = grouped_bar_chart( + "Thread pool throughput by batch size", + "Tasks processed per second for the light workload (higher is better)", + group_labels, + series, + "M tasks / second", + ) + path = out_dir / "pool_throughput.svg" + path.write_text(svg, encoding="utf-8") + return path + + +def build_post_vs_submit(entries: list[Entry], out_dir: Path) -> Path | None: + by_tasks: dict[str, dict[str, Entry]] = {} + for e in entries: + if e.family != "BM_PostVsSubmit": + continue + tasks = e.args[0] if e.args else "?" + by_tasks.setdefault(tasks, {})[e.label] = e + if not by_tasks: + return None + tasks = max(by_tasks, key=lambda t: int(t) if t.isdigit() else 0) + variants = by_tasks[tasks] + submit = variants.get("submit(future)") + items: list[tuple[str, float, str, str]] = [] + for name, e in sorted(variants.items(), key=lambda kv: kv[1].time_ns): + if submit and submit.time_ns > 0 and e.time_ns > 0: + ratio = submit.time_ns / e.time_ns + rel = "baseline" if abs(ratio - 1.0) < 0.02 else ( + f"{ratio:.2f}x faster" if ratio > 1.0 else f"{1.0 / ratio:.2f}x slower" + ) + annotation = f"{fmt_time(e.time_ns)} ({rel})" + else: + annotation = fmt_time(e.time_ns) + items.append((name, e.time_ns, annotation, VARIANT_COLORS.get(name, "#2a7fff"))) + svg = horizontal_bar_chart( + "post() vs submit()", + f"Submission overhead for {int(tasks):,} tasks: post() skips the future/packaged_task path (lower is better)", + items, + ) + path = out_dir / "post_vs_submit.svg" + path.write_text(svg, encoding="utf-8") + return path + + +def build_pool_workload(entries: list[Entry], out_dir: Path) -> Path | None: + by_wl: dict[str, dict[str, Entry]] = {} + for e in entries: + if e.family != "BM_ComparePoolWorkload" or not e.label: + continue + tokens = e.label.split() + if len(tokens) < 2: + continue + pool, wl = tokens[0], tokens[1] + if pool not in POOL_ORDER: + continue + by_wl.setdefault(wl, {})[pool] = e + if not by_wl: + return None + + 
group_labels = [wl for wl in WORKLOAD_ORDER if wl in by_wl] + series: list[tuple[str, list[float], str]] = [] + for pool in POOL_ORDER: + vals: list[float] = [] + for wl in group_labels: + row = by_wl[wl] + best = min((r.time_ns for r in row.values()), default=0.0) or 1.0 + e = row.get(pool) + vals.append((e.time_ns / best) if e else 0.0) + if any(v > 0 for v in vals): + series.append((pool, vals, POOL_COLORS.get(pool, "#2a7fff"))) + if not series: + return None + + svg = grouped_bar_chart( + "Which pool wins depends on the workload", + "Time relative to the fastest pool per workload (1.0 = winner, shorter is better; pool built once, 4 threads)", + group_labels, + series, + "relative time (1.0 = fastest)", + ) + path = out_dir / "pool_workload.svg" + path.write_text(svg, encoding="utf-8") + return path + + +def _reflection_pair( + entries: list[Entry], + manual_family: str, + reflect_family: str, + title: str, + subtitle: str, + manual_label: str, + reflect_label: str, + out_name: str, + out_dir: Path, +) -> Path | None: + manual = {e.args: e for e in entries if e.family == manual_family} + reflect = {e.args: e for e in entries if e.family == reflect_family} + common = sorted(set(manual) & set(reflect), key=lambda a: int(a[0]) if a and a[0].isdigit() else 0) + if not common: + return None + args = common[-1] # largest registry size + m = manual[args] + r = reflect[args] + size = args[0] if args else "?" 
+ items: list[tuple[str, float, str, str]] = [] + for label, e, color in ( + (manual_label, m, "#2a7fff"), + (reflect_label, r, "#f59e0b"), + ): + if m.time_ns > 0 and e.time_ns > 0: + ratio = e.time_ns / m.time_ns + rel = "baseline" if abs(ratio - 1.0) < 0.02 else ( + f"{ratio:.2f}x slower" if ratio > 1.0 else f"{1.0 / ratio:.2f}x faster" + ) + annotation = f"{fmt_time(e.time_ns)} ({rel})" + else: + annotation = fmt_time(e.time_ns) + items.append((label, e.time_ns, annotation, color)) + svg = horizontal_bar_chart(title, subtitle.format(size=f"{int(size):,}" if size.isdigit() else size), items) + path = out_dir / out_name + path.write_text(svg, encoding="utf-8") + return path + + +def build_reflection_query(entries: list[Entry], out_dir: Path) -> Path | None: + return _reflection_pair( + entries, + "BM_QueryView_FilterMapName", + "BM_QueryView_ReflectionWhereProjectName", + "Reflection registry query: project a field", + "Selecting + projecting one field over {size} registered threads (lower is better)", + "filter + map (hand-written)", + "where + project (reflection)", + "reflection_query.svg", + out_dir, + ) + + +def build_reflection_lookup(entries: list[Entry], out_dir: Path) -> Path | None: + return _reflection_pair( + entries, + "BM_QueryView_FindIf", + "BM_QueryView_ReflectionFindBy", + "Reflection registry query: find by field", + "Locating a single entry by name over {size} registered threads (lower is better)", + "find_if (hand-written)", + "find_by (reflection)", + "reflection_lookup.svg", + out_dir, + ) + + +def load_callable_medians(path: Path) -> dict[str, float]: + """Return family -> per-task time (ns) from an aggregate-only callable JSON.""" + payload = json.loads(path.read_text()) + out: dict[str, float] = {} + for bench in payload.get("benchmarks", []): + if bench.get("aggregate_name") != "median": + continue + family = str(bench.get("run_name", bench.get("name", ""))).split("/")[0] + unit = str(bench.get("time_unit", "ns")) + ns = 
float(bench.get("real_time", 0.0)) * TIME_TO_NS.get(unit, 1.0) + out[family] = ns / CALLABLE_BATCH + return out + + +def standard_from_filename(path: Path) -> str | None: + name = path.name + for token in ("cxx17", "cxx20", "cxx23", "cxx26"): + if token in name: + return "C++" + token[3:] + return None + + +def build_callable_charts(std_medians: dict[str, dict[str, float]], out_dir: Path) -> list[Path]: + std_order = [s for s in ("C++17", "C++20", "C++23", "C++26") if s in std_medians] + if not std_order: + return [] + group_labels = [label for _, label in CALLABLE_CAPTURES] + paths: list[Path] = [] + + # Chart A: move_callable cost across standards (std::function vs move_only_function). + series_a: list[tuple[str, list[float], str]] = [] + for std in std_order: + vals = [std_medians[std].get(f"BM_MoveCallable_{key}", 0.0) for key, _ in CALLABLE_CAPTURES] + if any(v > 0 for v in vals): + series_a.append((std, vals, CXX_COLORS[std])) + if series_a: + svg = grouped_bar_chart( + "Does replacing std::function help? (ThreadPool task storage)", + "Build + invoke cost per task for detail::move_callable " + "(std::function on C++17/20, std::move_only_function on C++23+); lower is better", + group_labels, + series_a, + "ns per task", + ) + path = out_dir / "callable_standards.svg" + path.write_text(svg, encoding="utf-8") + paths.append(path) + + # Chart B: SBO callable vs std-library callable at the newest available standard. + newest = std_order[-1] + medians = std_medians[newest] + move_vals = [medians.get(f"BM_MoveCallable_{key}", 0.0) for key, _ in CALLABLE_CAPTURES] + sbo_vals = [medians.get(f"BM_Sbo_{key}", 0.0) for key, _ in CALLABLE_CAPTURES] + if any(v > 0 for v in move_vals) and any(v > 0 for v in sbo_vals): + series_b = [ + ("move_callable (ThreadPool / std lib)", move_vals, "#2a7fff"), + ("SboCallable (LightweightPool)", sbo_vals, "#db2777"), + ] + svg = grouped_bar_chart( + f"Do the SBO callables help? 
({newest})", + "Per-task cost; the 48 B capture fits the SBO buffer but spills the std-library " + "callable to the heap (lower is better)", + group_labels, + series_b, + "ns per task", + ) + path = out_dir / "callable_sbo.svg" + path.write_text(svg, encoding="utf-8") + paths.append(path) + + return paths + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("json_files", nargs="+", help="Google Benchmark JSON files") + parser.add_argument("--output-dir", required=True, help="Directory for generated SVG files") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + entries: list[Entry] = [] + std_medians: dict[str, dict[str, float]] = {} + for value in args.json_files: + path = Path(value) + standard = standard_from_filename(path) + if standard: + std_medians[standard] = load_callable_medians(path) + else: + entries.extend(load_entries(path)) + + if not entries and not std_medians: + raise SystemExit("No benchmark entries found in the provided JSON files.") + + out_dir = Path(args.output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + generated: list[Path | None] = [ + build_pool_throughput(entries, out_dir), + build_pool_comparison(entries, out_dir), + build_pool_workload(entries, out_dir), + build_post_vs_submit(entries, out_dir), + build_reflection_query(entries, out_dir), + build_reflection_lookup(entries, out_dir), + ] + generated.extend(build_callable_charts(std_medians, out_dir)) + for path in generated: + if path: + print(f"Wrote {path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/reflection_registry_benchmarks.cpp b/benchmarks/reflection_registry_benchmarks.cpp new file mode 100644 index 0000000..b465bae --- /dev/null +++ b/benchmarks/reflection_registry_benchmarks.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +#if !defined(THREADSCHEDULE_HAS_REFLECTION) || !THREADSCHEDULE_HAS_REFLECTION 
+#error "reflection_registry_benchmarks.cpp requires THREADSCHEDULE_HAS_REFLECTION" +#endif + +using namespace threadschedule; + +namespace +{ + +auto make_entries(std::size_t count) -> std::vector +{ + std::vector entries; + entries.reserve(count); + for (std::size_t index = 0; index < count; ++index) + { + RegisteredThreadInfo info{}; + info.tid = static_cast(index + 1); + info.name = "worker-" + std::to_string(index); + info.componentTag = (index % 3 == 0) ? "io" : ((index % 3 == 1) ? "compute" : "scheduler"); + info.alive = (index % 5) != 0; + entries.push_back(std::move(info)); + } + return entries; +} + +} // namespace + +static void BM_QueryView_FilterMapName(benchmark::State& state) +{ + ThreadRegistry::QueryView view(make_entries(static_cast(state.range(0)))); + for (auto _ : state) + { + auto names = view.filter([](RegisteredThreadInfo const& entry) { return entry.componentTag == "io"; }) + .map([](RegisteredThreadInfo const& entry) { return entry.name; }); + benchmark::DoNotOptimize(names); + } +} + +static void BM_QueryView_ReflectionWhereProjectName(benchmark::State& state) +{ + ThreadRegistry::QueryView view(make_entries(static_cast(state.range(0)))); + for (auto _ : state) + { + auto names = + view.where("io").project(); + benchmark::DoNotOptimize(names); + } +} + +static void BM_QueryView_FindIf(benchmark::State& state) +{ + ThreadRegistry::QueryView view(make_entries(static_cast(state.range(0)))); + for (auto _ : state) + { + auto found = view.find_if([](RegisteredThreadInfo const& entry) { return entry.name == "worker-42"; }); + benchmark::DoNotOptimize(found); + } +} + +static void BM_QueryView_ReflectionFindBy(benchmark::State& state) +{ + ThreadRegistry::QueryView view(make_entries(static_cast(state.range(0)))); + for (auto _ : state) + { + auto found = view.find_by(std::string("worker-42")); + benchmark::DoNotOptimize(found); + } +} + +BENCHMARK(BM_QueryView_FilterMapName)->Arg(256)->Arg(4096)->Arg(16384); 
+BENCHMARK(BM_QueryView_ReflectionWhereProjectName)->Arg(256)->Arg(4096)->Arg(16384); +BENCHMARK(BM_QueryView_FindIf)->Arg(256)->Arg(4096)->Arg(16384); +BENCHMARK(BM_QueryView_ReflectionFindBy)->Arg(256)->Arg(4096)->Arg(16384); + +BENCHMARK_MAIN(); diff --git a/benchmarks/threadpool_benchmarks.cpp b/benchmarks/threadpool_benchmarks.cpp index 71ceb53..e9b7383 100644 --- a/benchmarks/threadpool_benchmarks.cpp +++ b/benchmarks/threadpool_benchmarks.cpp @@ -456,6 +456,103 @@ static void BM_ComparePoolTypes_LightWorkload(benchmark::State& state) state.SetLabel(pool_names[pool_type] + " tasks=" + std::to_string(num_tasks)); } +// ============================================================================= +// Pool comparison across workload weights (pool constructed once, not per-iter) +// ============================================================================= +// Unlike BM_ComparePoolTypes_LightWorkload (which rebuilds the pool every +// iteration and only runs a light task), this benchmark builds the pool once and +// sweeps the per-task work. It shows how the best pool changes with workload: +// - tiny : submission overhead dominates -> LightweightPool wins +// - heavy : execution dominates -> the field converges +// - imbalanced: a few tasks are far heavier than the rest -> the work-stealing +// HighPerformancePool balances the load and pulls ahead +static void bench_busy_work(int iters) +{ + volatile long sum = 0; + for (int i = 0; i < iters; ++i) + sum += static_cast(i) * i; +} + +// Per-task work (in busy-loop iterations) for a given workload and task index. +static int bench_work_iters(int workload, size_t task_index) +{ + switch (workload) + { + case 0: // tiny: pure scheduling overhead + return 50; + case 1: // medium: a few microseconds of work each + return 2000; + case 2: // heavy: uniform, execution-bound + return 30000; + default: // imbalanced: every 16th task is very heavy, the rest are tiny + return (task_index % 16 == 0) ? 
120000 : 50; + } +} + +static void BM_ComparePoolWorkload(benchmark::State& state) +{ + size_t const num_threads = 4; + size_t const num_tasks = 4000; + int const pool_type = static_cast(state.range(0)); + int const workload = static_cast(state.range(1)); + + char const* const workload_names[] = {"tiny", "medium", "heavy", "imbalanced"}; + char const* const pool_names[] = {"ThreadPool", "FastThreadPool", "HighPerformancePool", "LightweightPool"}; + + auto submit_loop = [&](auto& pool) { + for (auto _ : state) + { + std::vector> futures; + futures.reserve(num_tasks); + for (size_t i = 0; i < num_tasks; ++i) + futures.push_back(pool.submit([workload, i]() { bench_busy_work(bench_work_iters(workload, i)); })); + for (auto& f : futures) + f.wait(); + } + }; + + if (pool_type == 0) + { + ThreadPool pool(num_threads); + pool.configure_threads("bench"); + submit_loop(pool); + } + else if (pool_type == 1) + { + FastThreadPool pool(num_threads); + pool.configure_threads("bench"); + submit_loop(pool); + } + else if (pool_type == 2) + { + HighPerformancePool pool(num_threads); + pool.configure_threads("bench"); + pool.distribute_across_cpus(); + submit_loop(pool); + } + else + { + LightweightPool pool(num_threads); + pool.configure_threads("bench"); + for (auto _ : state) + { + std::atomic counter{0}; + for (size_t i = 0; i < num_tasks; ++i) + { + pool.post([&counter, workload, i]() { + bench_busy_work(bench_work_iters(workload, i)); + counter.fetch_add(1, std::memory_order_relaxed); + }); + } + while (counter.load(std::memory_order_acquire) < num_tasks) + std::this_thread::yield(); + } + } + + state.SetItemsProcessed(state.iterations() * num_tasks); + state.SetLabel(std::string(pool_names[pool_type]) + " " + workload_names[workload]); +} + // ============================================================================= // Post vs Submit comparison (fire-and-forget overhead on pools that support both) // 
============================================================================= @@ -663,6 +760,27 @@ BENCHMARK(BM_ComparePoolTypes_LightWorkload) ->Args({100000, 3}) ->Unit(benchmark::kMillisecond); +// Pool comparison across workload weights (pool built once) +// Args: {pool_type 0..3, workload 0=minimal 1=light 2=medium 3=heavy} +BENCHMARK(BM_ComparePoolWorkload) + ->Args({0, 0}) + ->Args({1, 0}) + ->Args({2, 0}) + ->Args({3, 0}) + ->Args({0, 1}) + ->Args({1, 1}) + ->Args({2, 1}) + ->Args({3, 1}) + ->Args({0, 2}) + ->Args({1, 2}) + ->Args({2, 2}) + ->Args({3, 2}) + ->Args({0, 3}) + ->Args({1, 3}) + ->Args({2, 3}) + ->Args({3, 3}) + ->Unit(benchmark::kMillisecond); + // Post vs Submit overhead comparison BENCHMARK(BM_PostVsSubmit) ->Args({1000, 0}) diff --git a/docs/CMAKE_REFERENCE.md b/docs/CMAKE_REFERENCE.md index d90ee83..396d619 100644 --- a/docs/CMAKE_REFERENCE.md +++ b/docs/CMAKE_REFERENCE.md @@ -8,6 +8,7 @@ | `THREADSCHEDULE_BUILD_TESTS` | BOOL | OFF | Build unit tests | | `THREADSCHEDULE_BUILD_BENCHMARKS` | BOOL | OFF | Build benchmarks (downloads Google Benchmark) | | `THREADSCHEDULE_RUNTIME` | BOOL | OFF | Build shared runtime library for process-wide registry | +| `THREADSCHEDULE_ENABLE_REFLECTION` | BOOL | ON | Enable GCC 16.1+ C++26 reflection APIs and reflection-backed registry queries when supported | | `THREADSCHEDULE_INSTALL` | BOOL | ON (main project)
OFF (subdirectory) | Generate install targets | ## CMake Variables @@ -43,6 +44,16 @@ add_subdirectory(ThreadSchedule) ``` Features: All features + latest language enhancements +### C++26 + GCC Reflection +```cmake +set(CMAKE_CXX_STANDARD 26) +set(THREADSCHEDULE_ENABLE_REFLECTION ON) +add_subdirectory(ThreadSchedule) +``` +Features: All regular C++26 features plus `threadschedule::reflect` and +reflection-backed registry APIs when using GCC 16.1+ with working +`-freflection` support. + ## Usage Examples ### Minimal Integration (Default) diff --git a/docs/benchmarks/callable_sbo.svg b/docs/benchmarks/callable_sbo.svg new file mode 100644 index 0000000..923540c --- /dev/null +++ b/docs/benchmarks/callable_sbo.svg @@ -0,0 +1,31 @@ + + +Do the SBO callables help? (C++26) +Per-task cost; the 48 B capture fits the SBO buffer but spills the std-library callable to the heap (lower is better) + +0.0 + +6.0 + +12.0 + +18.0 + +24.0 + +30.0 +ns per task + + +small (8 B) + + +medium (48 B) + + +large (128 B) + +move_callable (ThreadPool / std lib) + +SboCallable (LightweightPool) + \ No newline at end of file diff --git a/docs/benchmarks/callable_standards.svg b/docs/benchmarks/callable_standards.svg new file mode 100644 index 0000000..e8c8c2f --- /dev/null +++ b/docs/benchmarks/callable_standards.svg @@ -0,0 +1,41 @@ + + +Does replacing std::function help? 
(ThreadPool task storage) +Build + invoke cost per task for detail::move_callable (std::function on C++17/20, std::move_only_function on C++23+); lower is better + +0.0 + +6.0 + +12.0 + +18.0 + +24.0 + +30.0 +ns per task + + + + +small (8 B) + + + + +medium (48 B) + + + + +large (128 B) + +C++17 + +C++20 + +C++23 + +C++26 + \ No newline at end of file diff --git a/docs/benchmarks/pool_comparison.svg b/docs/benchmarks/pool_comparison.svg new file mode 100644 index 0000000..c8480ba --- /dev/null +++ b/docs/benchmarks/pool_comparison.svg @@ -0,0 +1,17 @@ + + +Thread pool comparison — light workload +Wall-clock time to run 100,000 tiny tasks (lower is better, relative to ThreadPool) +LightweightPool + +38.18 ms (1.86x faster) +ThreadPool + +71.13 ms (baseline) +FastThreadPool + +78.26 ms (1.10x slower) +HighPerformancePool + +119.45 ms (1.68x slower) + \ No newline at end of file diff --git a/docs/benchmarks/pool_throughput.svg b/docs/benchmarks/pool_throughput.svg new file mode 100644 index 0000000..2e57694 --- /dev/null +++ b/docs/benchmarks/pool_throughput.svg @@ -0,0 +1,51 @@ + + +Thread pool throughput by batch size +Tasks processed per second for the light workload (higher is better) + +0.0 + +0.8 + +1.6 + +2.4 + +3.2 + +4.0 +M tasks / second + + + + +10 + + + + +100 + + + + +1,000 + + + + +10,000 + + + + +100,000 + +ThreadPool + +FastThreadPool + +HighPerformancePool + +LightweightPool + \ No newline at end of file diff --git a/docs/benchmarks/pool_workload.svg b/docs/benchmarks/pool_workload.svg new file mode 100644 index 0000000..86c7eac --- /dev/null +++ b/docs/benchmarks/pool_workload.svg @@ -0,0 +1,46 @@ + + +Which pool wins depends on the workload +Time relative to the fastest pool per workload (1.0 = winner, shorter is better; pool built once, 4 threads) + +0.0 + +0.6 + +1.2 + +1.8 + +2.4 + +3.0 +relative time (1.0 = fastest) + + + + +tiny + + + + +medium + + + + +heavy + + + + +imbalanced + +ThreadPool + +FastThreadPool + +HighPerformancePool + 
+LightweightPool + \ No newline at end of file diff --git a/docs/benchmarks/post_vs_submit.svg b/docs/benchmarks/post_vs_submit.svg new file mode 100644 index 0000000..8e503fb --- /dev/null +++ b/docs/benchmarks/post_vs_submit.svg @@ -0,0 +1,11 @@ + + +post() vs submit() +Submission overhead for 100,000 tasks: post() skips the future/packaged_task path (lower is better) +post(fire-forget) + +13.33 ms (8.63x faster) +submit(future) + +115.05 ms (baseline) + \ No newline at end of file diff --git a/docs/benchmarks/reflection_lookup.svg b/docs/benchmarks/reflection_lookup.svg new file mode 100644 index 0000000..ff0ca43 --- /dev/null +++ b/docs/benchmarks/reflection_lookup.svg @@ -0,0 +1,11 @@ + + +Reflection registry query: find by field +Locating a single entry by name over 16,384 registered threads (lower is better) +find_if (hand-written) + +22 ns (baseline) +find_by (reflection) + +88 ns (4.04x slower) + \ No newline at end of file diff --git a/docs/benchmarks/reflection_query.svg b/docs/benchmarks/reflection_query.svg new file mode 100644 index 0000000..e5a7324 --- /dev/null +++ b/docs/benchmarks/reflection_query.svg @@ -0,0 +1,11 @@ + + +Reflection registry query: project a field +Selecting + projecting one field over 16,384 registered threads (lower is better) +filter + map (hand-written) + +93.55 us (baseline) +where + project (reflection) + +115.34 us (1.23x slower) + \ No newline at end of file diff --git a/include/threadschedule/reflection.hpp b/include/threadschedule/reflection.hpp new file mode 100644 index 0000000..da98549 --- /dev/null +++ b/include/threadschedule/reflection.hpp @@ -0,0 +1,161 @@ +#pragma once + +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace threadschedule::reflect +{ + +using info = std::meta::info; +inline constexpr bool enabled = true; + +template +consteval auto fields() -> std::span +{ + 
static_assert(std::meta::is_class_type(^^T) || std::meta::is_union_type(^^T), + "threadschedule::reflect::fields() requires a class or union type"); + return std::define_static_array(std::meta::nonstatic_data_members_of(^^T, std::meta::access_context::current())); +} + +template +consteval auto field_count() -> std::size_t +{ + return fields().size(); +} + +template +consteval auto field_info() -> info +{ + static_assert(I < field_count(), "reflection field index out of range"); + return fields()[I]; +} + +template +consteval auto field_name() -> std::string_view +{ + return std::string_view(std::define_static_string(std::meta::identifier_of(Field))); +} + +template +consteval auto field_name() -> std::string_view +{ + return field_name()>(); +} + +template +consteval auto type_name() -> std::string_view +{ + return std::string_view(std::define_static_string(std::meta::display_string_of(^^T))); +} + +template +using field_type_t = [: std::meta::type_of(Field) :]; + +template +constexpr decltype(auto) get(T&& obj) +{ + return std::forward(obj).[:Field:]; +} + +template +inline constexpr bool is_field_of_v = std::meta::is_same_type(std::meta::parent_of(Field), ^^Owner); + +template +consteval auto field_names() -> std::span; + +namespace detail +{ + +template +struct projection_type; + +template +struct projection_type +{ + using type = field_type_t; +}; + +template +struct projection_type +{ + using type = std::tuple, field_type_t, field_type_t...>; +}; + +template +constexpr void visit_fields_impl(T&& obj, F&& fn, std::index_sequence) +{ + using object_type = std::remove_cv_t>; + constexpr auto names = field_names(); + (fn(names[I], get()>(std::forward(obj))), ...); +} + +template +consteval auto field_names_impl(std::index_sequence) -> std::span +{ + return std::define_static_array( + std::array{std::define_static_string(std::meta::identifier_of(field_info()))...}); +} + +template +consteval info first_field() noexcept +{ + return First; +} + +template +consteval 
void require_field_owner() +{ + static_assert(std::meta::is_same_type(std::meta::parent_of(Field), ^^Owner), + "Reflection field does not belong to the requested owner type"); +} + +} // namespace detail + +template +consteval auto field_names() -> std::span +{ + return detail::field_names_impl(std::make_index_sequence()>{}); +} + +template +constexpr void visit_fields(T&& obj, F&& fn) +{ + using object_type = std::remove_cv_t>; + detail::visit_fields_impl(std::forward(obj), std::forward(fn), + std::make_index_sequence()>{}); +} + +template +using projection_t = typename detail::projection_type::type; + +template +constexpr auto project_value(T&& obj) -> projection_t +{ + static_assert(sizeof...(Fields) > 0, "project_value requires at least one field"); + if constexpr (sizeof...(Fields) == 1) + { + return get()>(std::forward(obj)); + } + else + { + return projection_t{get(std::forward(obj))...}; + } +} + +template +consteval void require_field_owner() +{ + detail::require_field_owner(); +} + +} // namespace threadschedule::reflect + +#endif diff --git a/include/threadschedule/thread_registry.hpp b/include/threadschedule/thread_registry.hpp index 5dd1b36..b06415b 100644 --- a/include/threadschedule/thread_registry.hpp +++ b/include/threadschedule/thread_registry.hpp @@ -7,6 +7,9 @@ #include "callable.hpp" #include "expected.hpp" +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION +#include "reflection.hpp" +#endif #include "scheduler_policy.hpp" #include "thread_wrapper.hpp" // for ThreadInfo, ThreadAffinity #include @@ -88,6 +91,18 @@ struct RegisteredThreadInfo std::shared_ptr control; }; +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION +namespace registered_thread_fields +{ +consteval auto tid() -> reflect::info { return reflect::field_info(); } +consteval auto stdId() -> reflect::info { return reflect::field_info(); } +consteval auto name() -> reflect::info { return reflect::field_info(); } +consteval auto 
componentTag() -> reflect::info { return reflect::field_info(); } +consteval auto alive() -> reflect::info { return reflect::field_info(); } +consteval auto control() -> reflect::info { return reflect::field_info(); } +} // namespace registered_thread_fields +#endif + using RegistryCallback = detail::copyable_callable; /** @@ -217,6 +232,14 @@ class ThreadControlBlock namespace detail { +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION +template +consteval void validate_reflected_field() +{ + reflect::require_field_owner(); +} +#endif + /** * @brief CRTP mixin that provides functional-style query facade methods. * @@ -290,6 +313,38 @@ class QueryFacadeMixin [[nodiscard]] auto take(size_t n) const { return self().query().take(n); } [[nodiscard]] auto skip(size_t n) const { return self().query().skip(n); } + +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION + template + [[nodiscard]] auto where(Value const& value) const + { + return self().query().template where(value); + } + + template + [[nodiscard]] auto where_if(Predicate&& pred) const + { + return self().query().template where_if(std::forward(pred)); + } + + template + [[nodiscard]] auto find_by(Value const& value) const + { + return self().query().template find_by(value); + } + + template + [[nodiscard]] auto contains(Value const& value) const -> bool + { + return self().query().template contains(value); + } + + template + [[nodiscard]] auto project() const + { + return self().query().template project(); + } +#endif }; } // namespace detail @@ -551,6 +606,71 @@ class ThreadRegistry : public detail::QueryFacadeMixin return QueryView(std::move(result)); } +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION + template + [[nodiscard]] auto where(Value const& value) const -> QueryView + { + detail::validate_reflected_field(); + std::vector filtered; + filtered.reserve(entries_.size()); + for (auto const& entry : entries_) + { + if 
(reflect::get(entry) == value) + filtered.push_back(entry); + } + return QueryView(std::move(filtered)); + } + + template + [[nodiscard]] auto where_if(Predicate&& pred) const -> QueryView + { + detail::validate_reflected_field(); + static_assert(std::is_invocable_r_v const&>, + "Reflection predicate must accept the selected field type"); + std::vector filtered; + filtered.reserve(entries_.size()); + for (auto const& entry : entries_) + { + if (pred(reflect::get(entry))) + filtered.push_back(entry); + } + return QueryView(std::move(filtered)); + } + + template + [[nodiscard]] auto find_by(Value const& value) const -> std::optional + { + detail::validate_reflected_field(); + for (auto const& entry : entries_) + { + if (reflect::get(entry) == value) + return entry; + } + return std::nullopt; + } + + template + [[nodiscard]] auto contains(Value const& value) const -> bool + { + return find_by(value).has_value(); + } + + template + [[nodiscard]] auto project() const -> std::vector> + { + static_assert(sizeof...(Fields) > 0, "project requires at least one field"); + (detail::validate_reflected_field(), ...); + + std::vector> result; + result.reserve(entries_.size()); + for (auto const& entry : entries_) + { + result.push_back(reflect::project_value(entry)); + } + return result; + } +#endif + private: std::vector entries_; }; @@ -568,6 +688,87 @@ class ThreadRegistry : public detail::QueryFacadeMixin return QueryView(std::move(snapshot)); } +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION + template + [[nodiscard]] auto where(Value const& value) const -> QueryView + { + detail::validate_reflected_field(); + std::vector filtered; + std::shared_lock lock(mutex_); + filtered.reserve(threads_.size()); + for (auto const& [tid, entry] : threads_) + { + (void)tid; + if (reflect::get(entry) == value) + filtered.push_back(entry); + } + return QueryView(std::move(filtered)); + } + + template + [[nodiscard]] auto where_if(Predicate&& pred) const -> 
QueryView + { + detail::validate_reflected_field(); + static_assert(std::is_invocable_r_v const&>, + "Reflection predicate must accept the selected field type"); + std::vector filtered; + std::shared_lock lock(mutex_); + filtered.reserve(threads_.size()); + for (auto const& [tid, entry] : threads_) + { + (void)tid; + if (pred(reflect::get(entry))) + filtered.push_back(entry); + } + return QueryView(std::move(filtered)); + } + + template + [[nodiscard]] auto find_by(Value const& value) const -> std::optional + { + detail::validate_reflected_field(); + std::shared_lock lock(mutex_); + for (auto const& [tid, entry] : threads_) + { + (void)tid; + if (reflect::get(entry) == value) + return entry; + } + return std::nullopt; + } + + template + [[nodiscard]] auto contains(Value const& value) const -> bool + { + detail::validate_reflected_field(); + std::shared_lock lock(mutex_); + for (auto const& [tid, entry] : threads_) + { + (void)tid; + if (reflect::get(entry) == value) + return true; + } + return false; + } + + template + [[nodiscard]] auto project() const -> std::vector> + { + static_assert(sizeof...(Fields) > 0, "project requires at least one field"); + (detail::validate_reflected_field(), ...); + + std::vector> result; + std::shared_lock lock(mutex_); + result.reserve(threads_.size()); + for (auto const& [tid, entry] : threads_) + { + (void)tid; + result.push_back(reflect::project_value(entry)); + } + return result; + } +#endif + [[nodiscard]] auto set_affinity(Tid tid, ThreadAffinity const& affinity) const -> expected { auto blk = lock_block(tid); diff --git a/include/threadschedule/threadschedule.hpp b/include/threadschedule/threadschedule.hpp index 521dd4f..7be59e4 100644 --- a/include/threadschedule/threadschedule.hpp +++ b/include/threadschedule/threadschedule.hpp @@ -18,6 +18,10 @@ #include "thread_wrapper.hpp" #include "topology.hpp" +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION +#include "reflection.hpp" +#endif + /** * @file 
threadschedule.hpp * @brief Modern C++17/20/23/26 Thread Scheduling Library diff --git a/run_benchmark_graphs.sh b/run_benchmark_graphs.sh new file mode 100755 index 0000000..f7ff6a0 --- /dev/null +++ b/run_benchmark_graphs.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +set -euo pipefail + +QUICK_MODE=false +SHOW_HELP=false +OUTPUT_DIR="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --quick|-q) + QUICK_MODE=true + shift + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --help|-h) + SHOW_HELP=true + shift + ;; + *) + echo "Unknown argument: $1" >&2 + exit 1 + ;; + esac +done + +if [[ "$SHOW_HELP" == "true" ]]; then + cat <<'EOF' +ThreadSchedule Benchmark Graph Runner + +Usage: + ./run_benchmark_graphs.sh [--quick] [--output-dir DIR] + +What it does: + - Runs selected comparison-focused Google Benchmark targets + - Writes JSON outputs into a report directory + - Generates an HTML report with inline SVG graphs and speedups + - Captures local machine specs automatically inside the report + +Examples: + ./run_benchmark_graphs.sh + ./run_benchmark_graphs.sh --quick + ./run_benchmark_graphs.sh --output-dir build/benchmark-reports/latest +EOF + exit 0 +fi + +PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)" +BUILD_DIR="${PROJECT_ROOT}/build" +BENCHMARK_DIR="${BUILD_DIR}/benchmarks" + +if [[ -z "${OUTPUT_DIR}" ]]; then + TIMESTAMP="$(date +%Y%m%d-%H%M%S)" + OUTPUT_DIR="${BUILD_DIR}/benchmark-reports/${TIMESTAMP}" +fi + +mkdir -p "${OUTPUT_DIR}" + +if [[ ! -d "${BENCHMARK_DIR}" ]]; then + echo "Benchmark directory not found: ${BENCHMARK_DIR}" >&2 + echo "Build with -DTHREADSCHEDULE_BUILD_BENCHMARKS=ON first." >&2 + exit 1 +fi + +if [[ "$QUICK_MODE" == "true" ]]; then + BENCH_MIN_TIME="0.4s" + BENCH_REPETITIONS="1" +else + BENCH_MIN_TIME="1.5s" + BENCH_REPETITIONS="3" +fi + +run_json_benchmark() { + local executable="$1" + local filter="$2" + local output_json="$3" + + if [[ ! 
-x "${BENCHMARK_DIR}/${executable}" ]]; then + echo "Skipping ${executable}: not built" >&2 + return 0 + fi + + "${BENCHMARK_DIR}/${executable}" \ + --benchmark_filter="${filter}" \ + --benchmark_min_time="${BENCH_MIN_TIME}" \ + --benchmark_repetitions="${BENCH_REPETITIONS}" \ + --benchmark_format=json \ + --benchmark_out="${output_json}" \ + --benchmark_out_format=json +} + +JSON_FILES=() + +THREADPOOL_JSON="${OUTPUT_DIR}/threadpool_comparisons.json" +run_json_benchmark "threadpool_basic_benchmarks" "BM_ComparePoolTypes_LightWorkload|BM_PostVsSubmit" "${THREADPOOL_JSON}" +if [[ -f "${THREADPOOL_JSON}" ]]; then + JSON_FILES+=("${THREADPOOL_JSON}") +fi + +REFLECTION_JSON="${OUTPUT_DIR}/reflection_registry.json" +run_json_benchmark "reflection_registry_benchmarks" "BM_QueryView_.*" "${REFLECTION_JSON}" +if [[ -f "${REFLECTION_JSON}" ]]; then + JSON_FILES+=("${REFLECTION_JSON}") +fi + +if [[ ${#JSON_FILES[@]} -eq 0 ]]; then + echo "No benchmark JSON files were produced." >&2 + exit 1 +fi + +python3 "${PROJECT_ROOT}/benchmarks/generate_benchmark_report.py" \ + --output "${OUTPUT_DIR}/index.html" \ + --title "ThreadSchedule benchmark comparison report" \ + "${JSON_FILES[@]}" + +echo +echo "Benchmark graphs written to:" +echo " ${OUTPUT_DIR}/index.html" diff --git a/src/threadschedule.cppm b/src/threadschedule.cppm index affb995..03833f2 100644 --- a/src/threadschedule.cppm +++ b/src/threadschedule.cppm @@ -166,6 +166,35 @@ using ::threadschedule::cgroup_attach_tid; } // export namespace threadschedule +#if defined(THREADSCHEDULE_HAS_REFLECTION) && THREADSCHEDULE_HAS_REFLECTION +export namespace threadschedule::reflect { +using ::threadschedule::reflect::enabled; +using ::threadschedule::reflect::info; +using ::threadschedule::reflect::fields; +using ::threadschedule::reflect::field_count; +using ::threadschedule::reflect::field_info; +using ::threadschedule::reflect::field_name; +using ::threadschedule::reflect::field_names; +using ::threadschedule::reflect::type_name; 
+using ::threadschedule::reflect::get; +using ::threadschedule::reflect::visit_fields; +using ::threadschedule::reflect::project_value; +using ::threadschedule::reflect::require_field_owner; +using ::threadschedule::reflect::is_field_of_v; +using ::threadschedule::reflect::field_type_t; +using ::threadschedule::reflect::projection_t; +} + +export namespace threadschedule::registered_thread_fields { +using ::threadschedule::registered_thread_fields::tid; +using ::threadschedule::registered_thread_fields::stdId; +using ::threadschedule::registered_thread_fields::name; +using ::threadschedule::registered_thread_fields::componentTag; +using ::threadschedule::registered_thread_fields::alive; +using ::threadschedule::registered_thread_fields::control; +} +#endif + // Re-export profiles sub-namespace export namespace threadschedule::profiles { using ::threadschedule::profiles::realtime; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e649533..c644d55 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,6 +126,19 @@ if(TARGET gtest) PROPERTIES TIMEOUT 120 ) + if(THREADSCHEDULE_HAS_REFLECTION) + add_executable(reflection_test reflection_test.cpp) + target_link_libraries(reflection_test + ThreadSchedule::ThreadSchedule + gtest + gtest_main + ) + gtest_discover_tests(reflection_test + DISCOVERY_TIMEOUT 60 + PROPERTIES TIMEOUT 120 + ) + endif() + if(THREADSCHEDULE_RUNTIME) add_executable(runtime_registry_test runtime_registry_test.cpp) target_link_libraries(runtime_registry_test diff --git a/tests/reflection_test.cpp b/tests/reflection_test.cpp new file mode 100644 index 0000000..c9baf1a --- /dev/null +++ b/tests/reflection_test.cpp @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + +#if !defined(THREADSCHEDULE_HAS_REFLECTION) || !THREADSCHEDULE_HAS_REFLECTION +#error "reflection_test.cpp requires THREADSCHEDULE_HAS_REFLECTION" +#endif + +using namespace threadschedule; + +TEST(ReflectionApiTest, ExposesMetadataForLibraryTypes) 
+{ + static_assert(reflect::enabled); + static_assert(reflect::field_count() == 6); + static_assert(reflect::field_count() == 4); + static_assert(reflect::field_count() == 3); + static_assert(reflect::field_count() == 7); + static_assert(reflect::field_count() == 6); + static_assert(reflect::field_name() == "tid"); + static_assert(reflect::field_name() == "componentTag"); + static_assert(reflect::field_name() == "policy"); + static_assert(reflect::field_name() == "priority_jitter"); + constexpr auto registry_field_names = reflect::field_names(); + static_assert(registry_field_names.size() == 6); + static_assert(std::string_view(registry_field_names[2]) == "name"); + static_assert(reflect::type_name().contains("ThreadProfile")); +} + +TEST(ReflectionApiTest, VisitFieldsAndGetWorkForPublicStructs) +{ + ThreadProfile profile{"latency", SchedulingPolicy::RR, ThreadPriority{3}, std::nullopt}; + std::array expected = {"name", "policy", "priority", "affinity"}; + std::size_t index = 0; + + reflect::visit_fields(profile, [&](std::string_view name, auto&) { + ASSERT_LT(index, expected.size()); + EXPECT_EQ(name, expected[index]); + ++index; + }); + + EXPECT_EQ(index, expected.size()); + EXPECT_EQ(reflect::get()>(profile), "latency"); +} + +TEST(ReflectionApiTest, ProjectValueBuildsCompactResults) +{ + RegisteredThreadInfo info{}; + info.tid = Tid{11}; + info.name = "alpha"; + info.componentTag = "io"; + info.alive = true; + + auto tuple = + reflect::project_value(info); + + EXPECT_EQ(std::get<0>(tuple), "alpha"); + EXPECT_EQ(std::get<1>(tuple), "io"); + EXPECT_TRUE(reflect::is_field_of_v); +} diff --git a/tests/registry_query_test.cpp b/tests/registry_query_test.cpp index 702bcf7..3641cdd 100644 --- a/tests/registry_query_test.cpp +++ b/tests/registry_query_test.cpp @@ -171,3 +171,36 @@ TEST_F(RegistryQueryTest, ChainedFilterMapForEach) EXPECT_TRUE(names.count("alpha")); EXPECT_TRUE(names.count("gamma")); } + +#if defined(THREADSCHEDULE_HAS_REFLECTION) && 
THREADSCHEDULE_HAS_REFLECTION +TEST_F(RegistryQueryTest, ReflectionContainsAndFindBy) +{ + EXPECT_TRUE(registry().contains("beta")); + auto found = registry().find_by("beta"); + ASSERT_TRUE(found.has_value()); + EXPECT_EQ(found->componentTag, "compute"); +} + +TEST_F(RegistryQueryTest, ReflectionWhereAndProject) +{ + auto io_names = + registry().where("io").project(); + EXPECT_EQ(io_names.size(), 2u); + std::set names(io_names.begin(), io_names.end()); + EXPECT_TRUE(names.count("alpha")); + EXPECT_TRUE(names.count("gamma")); +} + +TEST_F(RegistryQueryTest, ReflectionQueryViewWhereIf) +{ + auto names = registry() + .query() + .where_if([](bool alive) { return alive; }) + .where_if([](std::string const& name) { + return name.starts_with("g"); + }) + .project(); + ASSERT_EQ(names.size(), 1u); + EXPECT_EQ(names.front(), "gamma"); +} +#endif