-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark.hxx
More file actions
374 lines (315 loc) · 17.4 KB
/
benchmark.hxx
File metadata and controls
374 lines (315 loc) · 17.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#pragma once
/**
* @file benchmark.hxx
* @brief Micro-benchmark framework with per-iteration timing and statistical output.
* @version 1.0.0
*
* @details
* Benchmarks are registered with `BENCH_SUITE` / `BENCH_CASE` macros at
* static-init time and run via `bench_registry::instance().run_all()`.
*
* Each benchmark receives a `bench_state&` whose range-for loop records
* one nanosecond sample per iteration (same idiom as Google Benchmark):
* @code
* BENCH_CASE("my bench") {
* for (auto _ : state) { DoNotOptimize(my_function()); }
* }
* @endcode
*
* After all iterations complete, `detail::compute_result()` derives mean,
* median, stddev, min, and max from the raw sample vector. Results are
* printed in a colour-coded table with auto-scaling units (ns / µs / ms / s).
*
* `DoNotOptimize()` uses inline asm constraints on GCC/Clang to prevent the
* compiler from eliminating the benchmarked expression.
*
* @author Matteo Zanella <matteozanella2@gmail.com>
* Copyright 2026 Matteo Zanella
*
* Repository: https://github.com/Zanzibarr/cpp_utils
*
* SPDX-License-Identifier: MIT
*/
#include <algorithm>
#include <chrono>
#include <cmath>
#include <functional>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "../utilities/ansi_colors.hxx" // TODO: Update to the actual path
// ─────────────────────────────────────────────────────────────────────────────
// ANSI colors — thin namespace aliases over the shared ansi:: helpers
// ─────────────────────────────────────────────────────────────────────────────
namespace benchmark::color {
// Thin forwarding layer over the shared ansi:: helpers, so benchmark code can
// use color::green(...) etc. without depending on the ansi namespace directly.
inline bool enabled() { return ansi::enabled(); }
inline std::string green(std::string_view str) { return ansi::green(str); }
inline std::string red(std::string_view str) { return ansi::red(str); }
inline std::string yellow(std::string_view str) { return ansi::yellow(str); }
inline std::string cyan(std::string_view str) { return ansi::cyan(str); }
inline std::string bold(std::string_view str) { return ansi::bold(str); }
inline std::string dim(std::string_view str) { return ansi::dim(str); }
} // namespace benchmark::color
// ─────────────────────────────────────────────────────────────────────────────
// benchmark_result — plain data, computed after a run
// ─────────────────────────────────────────────────────────────────────────────
namespace benchmark {
// Summary statistics for one completed benchmark run. All timing fields are in
// nanoseconds; they are filled in by detail::compute_result().
struct benchmark_result {
    std::string suite;        // suite label the benchmark was registered under
    std::string name;         // benchmark display name
    std::size_t iterations{}; // number of recorded samples
    double mean_ns{};         // arithmetic mean of all samples
    double median_ns{};       // median sample (average of middle two for even counts)
    double stddev_ns{};       // population standard deviation (divide by N)
    double min_ns{};          // fastest sample
    double max_ns{};          // slowest sample
};
// ─────────────────────────────────────────────────────────────────────────────
// DoNotOptimize — prevents the compiler from optimizing away the benchmarked
// expression. Uses the same pattern as Google Benchmark / nanobench.
// ─────────────────────────────────────────────────────────────────────────────
#if defined(__GNUC__) || defined(__clang__)
// Read-only overload: the "r,m" input constraint forces the value to be
// materialised in a register or memory, so the compiler cannot delete the
// computation that produced it. The "memory" clobber keeps surrounding
// loads/stores from being reordered across the barrier.
template <typename T>
inline void DoNotOptimize(T const& val) {
    asm volatile("" : : "r,m"(val) : "memory");
}
// Mutable overload: "+r,m" additionally marks the value as potentially
// written, so stores into it cannot be eliminated either.
template <typename T>
inline void DoNotOptimize(T& val) {
    asm volatile("" : "+r,m"(val) : : "memory");
}
#else
// MSVC / unknown: volatile store is the best we can do portably
template <typename T>
inline void DoNotOptimize(T const& val) {
    const volatile T* ptr = &val;
    (void)ptr;
}
#endif
// ─────────────────────────────────────────────────────────────────────────────
// bench_state — passed into every benchmark function, mirrors the Google
// Benchmark `State` loop idiom but without range complexity.
//
// BENCH_CASE("my bench") {
// for (auto _ : state) {
// DoNotOptimize(my_function());
// }
// }
// ─────────────────────────────────────────────────────────────────────────────
/// Per-benchmark timing state. Iterating the state with a range-for loop
/// records one nanosecond sample per iteration.
class bench_state {
public:
    /// @brief Prepares a state that will run @p iters measured iterations.
    /// @param iters Number of iterations the range-for loop will execute.
    ///
    /// Sample storage is reserved up front: a vector reallocation inside
    /// lap() would otherwise add its own cost to the very sample being
    /// recorded, skewing the measurement.
    explicit bench_state(std::size_t iters) : iters_(iters) { samples_ns_.reserve(iters); }
    // ── range-for support ──────────────────────────────────────────────────
    struct iterator {
        bench_state* state;
        std::size_t index;
        // Only the index participates in the comparison; both iterators of a
        // loop always point at the same state.
        auto operator!=(const iterator& other) const -> bool { return index != other.index; }
        auto operator++() -> iterator& {
            state->lap(); // record the elapsed time of the iteration that just finished
            ++index;
            return *this;
        }
        // dereference returns an unused int — the `auto _` binding just needs
        // something to bind to; we match Google Benchmark's convention.
        auto operator*() const -> int { return 0; }
    };
    /// @brief Starts the stopwatch and returns the first iterator.
    auto begin() -> iterator {
        start_ = clock::now(); // timing starts when the range-for loop begins
        return iterator{this, 0};
    }
    auto end() -> iterator { return iterator{this, iters_}; }
    // ── results ────────────────────────────────────────────────────────────
    /// @return One nanosecond sample per completed iteration.
    [[nodiscard]] auto samples() const -> const std::vector<double>& { return samples_ns_; }
    /// @return The iteration count this state was configured with.
    [[nodiscard]] auto iterations() const -> std::size_t { return iters_; }
private:
    using clock = std::chrono::steady_clock;
    std::size_t iters_;
    clock::time_point start_;
    std::vector<double> samples_ns_;
    /// Records the time elapsed since start_ as one sample, then restarts
    /// the stopwatch for the next iteration.
    void lap() {
        auto now = clock::now();
        double nanos = std::chrono::duration<double, std::nano>(now - start_).count();
        samples_ns_.push_back(nanos);
        start_ = now; // reset for next iteration
    }
};
// ─────────────────────────────────────────────────────────────────────────────
// Statistics helpers
// ─────────────────────────────────────────────────────────────────────────────
namespace detail {
/**
 * @brief Derives summary statistics from one benchmark's raw samples.
 * @param suite   Suite label, moved into the result.
 * @param name    Benchmark name, moved into the result.
 * @param samples Per-iteration timings in nanoseconds (taken by value — the
 *                function sorts them in place).
 * @return A benchmark_result with mean / median / population stddev / min / max.
 */
inline auto compute_result(std::string suite, std::string name, std::vector<double> samples) -> benchmark_result {
    if (samples.empty()) {
        // No samples: the numeric fields keep their zero defaults from the
        // struct's member initializers; move the strings instead of copying.
        return {.suite = std::move(suite), .name = std::move(name)};
    }
    std::ranges::sort(samples); // sorted order yields median / min / max directly
    const std::size_t smpl_size = samples.size();
    const double mean = [&] {
        double sum = 0;
        for (double samp : samples) {
            sum += samp;
        }
        return sum / static_cast<double>(smpl_size);
    }();
    // Even count: average of the two middle samples; odd count: middle sample.
    const double median = (smpl_size % 2 == 0) ? (samples[(smpl_size / 2) - 1] + samples[smpl_size / 2]) / 2.0 : samples[smpl_size / 2];
    // Population variance (divide by N, not N-1): the run is treated as the
    // entire population of interest.
    const double variance = [&] {
        double acc = 0;
        for (double samp : samples) {
            acc += (samp - mean) * (samp - mean);
        }
        return acc / static_cast<double>(smpl_size);
    }();
    return benchmark_result{
        .suite = std::move(suite),
        .name = std::move(name),
        .iterations = smpl_size,
        .mean_ns = mean,
        .median_ns = median,
        .stddev_ns = std::sqrt(variance),
        .min_ns = samples.front(),
        .max_ns = samples.back(),
    };
}
// ── Time formatting ──────────────────────────────────────────────────────────
//
// Chooses the most human-readable unit: ns / µs / ms / s.
// ── Time formatting ──────────────────────────────────────────────────────────
//
// Renders a nanosecond value with two decimals in the most human-readable
// unit: ns / µs / ms / s.
inline auto fmt_time(double nanoseconds) -> std::string {
    // Upper bound (exclusive) paired with the divisor and suffix to use
    // below it, smallest unit first.
    struct scale_entry {
        double upper_bound;
        double divisor;
        const char* suffix;
    };
    static constexpr scale_entry SCALES[] = {
        {1e3, 1.0, " ns"},
        {1e6, 1e3, " µs"},
        {1e9, 1e6, " ms"},
    };
    std::ostringstream out;
    out << std::fixed << std::setprecision(2);
    for (const auto& entry : SCALES) {
        if (nanoseconds < entry.upper_bound) {
            out << nanoseconds / entry.divisor << entry.suffix;
            return out.str();
        }
    }
    // One second or longer.
    out << nanoseconds / 1e9 << " s";
    return out.str();
}
} // namespace detail
// ─────────────────────────────────────────────────────────────────────────────
// bench_case — one registered benchmark
// ─────────────────────────────────────────────────────────────────────────────
// One registered benchmark: identity, body, and run configuration.
struct bench_case {
    std::string suite;                    // suite label shown in the report
    std::string name;                     // benchmark display name
    std::function<void(bench_state&)> fn; // benchmark body; iterates the passed state
    std::size_t iterations;               // measured iteration count
    std::size_t warmup;                   // number of discarded warmup runs
};
// ─────────────────────────────────────────────────────────────────────────────
// bench_registry — collects and runs all benchmarks
// ─────────────────────────────────────────────────────────────────────────────
class bench_registry {
public:
static auto instance() -> bench_registry& {
static bench_registry reg;
return reg;
}
auto register_bench(bench_case bcase) -> void { benches_.push_back(std::move(bcase)); }
/**
* @brief Run all registered benchmarks and print results.
* @return 0 (reserved for future failure modes, e.g. regression checks).
*/
auto run_all() -> int {
print_header();
std::vector<benchmark_result> results;
std::string current_suite;
for (auto& bcase : benches_) {
if (bcase.suite != current_suite) {
current_suite = bcase.suite;
std::cout << "\n " << color::bold(color::yellow("SUITE: " + current_suite)) << "\n";
}
// ── warmup ──────────────────────────────────────────────────────
for (std::size_t warmup_idx = 0; warmup_idx < bcase.warmup; ++warmup_idx) {
bench_state warmup_state(1);
bcase.fn(warmup_state);
}
// ── measured run ─────────────────────────────────────────────────
bench_state state(bcase.iterations);
bcase.fn(state);
auto res = detail::compute_result(bcase.suite, bcase.name, state.samples());
print_result(res);
results.push_back(res);
}
print_footer(results);
return 0;
}
private:
std::vector<bench_case> benches_;
static void print_header() {
std::cout << color::bold("\n+-------------------------------------+\n");
std::cout << color::bold("| cpp_utils benchmark runner |\n");
std::cout << color::bold("+-------------------------------------+\n");
}
static void print_result(const benchmark_result& result) {
constexpr int NAME_COLUMN_WIDTH = 80;
// Layout: v name mean median stddev [min … max] N iters
std::cout << " " << color::green("v") << " " << std::left << std::setw(NAME_COLUMN_WIDTH) << result.name
<< color::cyan(detail::fmt_time(result.mean_ns)) << color::dim(" med " + detail::fmt_time(result.median_ns))
<< color::dim(" σ " + detail::fmt_time(result.stddev_ns))
<< color::dim(" [" + detail::fmt_time(result.min_ns) + " … " + detail::fmt_time(result.max_ns) + "]")
<< color::dim(" ×" + std::to_string(result.iterations)) << "\n";
}
static void print_footer(const std::vector<benchmark_result>& results) {
constexpr int SEPARATOR_WIDTH = 42;
std::cout << "\n" << std::string(SEPARATOR_WIDTH, '-') << "\n";
std::cout << " " << color::green(std::to_string(results.size()) + " benchmarks completed") << "\n";
std::cout << std::string(SEPARATOR_WIDTH, '-') << "\n\n";
}
};
// ─────────────────────────────────────────────────────────────────────────────
// auto_bench_registrar — RAII helper, registers at static-init time
// ─────────────────────────────────────────────────────────────────────────────
struct auto_bench_registrar {
    /// @brief Registers one benchmark with the global registry.
    ///
    /// Constructed at static-init time by the BENCH_CASE* macros, so every
    /// benchmark is known to the registry before main() runs.
    /// @param suite  Suite label (the per-TU current-suite variable at expansion time).
    /// @param name   Benchmark display name.
    /// @param func   Benchmark body; receives the bench_state to iterate.
    /// @param iters  Measured iteration count.
    /// @param warmup Number of discarded warmup runs.
    auto_bench_registrar(const char* suite, const char* name, void (*func)(bench_state&), std::size_t iters, std::size_t warmup) {
        bench_registry::instance().register_bench({
            .suite = suite,
            .name = name,
            .fn = func,
            .iterations = iters,
            .warmup = warmup,
        });
    }
};
} // namespace benchmark
// ─────────────────────────────────────────────────────────────────────────────
// Macro helpers
// ─────────────────────────────────────────────────────────────────────────────
// Token-pasting helpers — two levels so __LINE__ expands before pasting.
// NOTE(review): the original helper names (_BM_CAT2, _BM_CAT, _BM_DEFINE,
// _bm_current_suite_, _bm_fn_, ...) began with an underscore at global scope
// (or underscore + uppercase), which makes them reserved identifiers in C++.
// They are renamed with a BM_IMPL_ / bm_impl_ prefix; the public BENCH_SUITE
// and BENCH_CASE* macros are unchanged.
#define BM_IMPL_CAT2(a, b) a##b
#define BM_IMPL_CAT(a, b) BM_IMPL_CAT2(a, b)
// ─────────────────────────────────────────────────────────────────────────────
// BENCH_SUITE — sets the current suite name for following BENCH_CASEs
// ─────────────────────────────────────────────────────────────────────────────
// Anonymous namespace on purpose: each translation unit gets its own
// current-suite variable, so a BENCH_SUITE in one .cpp cannot leak into
// another.
namespace {
inline const char* bm_impl_current_suite_ = "<unset>";
}
#define BENCH_SUITE(name)                                                        \
    static const char* BM_IMPL_CAT(bm_impl_suite_str_, __LINE__) = name;         \
    static int BM_IMPL_CAT(bm_impl_suite_set_, __LINE__) =                       \
        (bm_impl_current_suite_ = BM_IMPL_CAT(bm_impl_suite_str_, __LINE__), 0);
// ─────────────────────────────────────────────────────────────────────────────
// BENCH_CASE — default iterations / warmup
//
//   BENCH_CASE("my bench") { for (auto _ : state) { ... } }
//
// BENCH_CASE_N — explicit iteration count
//
//   BENCH_CASE_N("my bench", 10000) { for (auto _ : state) { ... } }
//
// BENCH_CASE_NW — explicit iteration count and warmup count
//
//   BENCH_CASE_NW("my bench", 10000, 100) { for (auto _ : state) { ... } }
// ─────────────────────────────────────────────────────────────────────────────
// Expands to: a forward declaration of the benchmark function, a static
// registrar that records it at static-init time, and the function header the
// user's `{ ... }` body attaches to.
#define BM_IMPL_DEFINE(test_name, iters, warmup)                                      \
    static void BM_IMPL_CAT(bm_impl_fn_, __LINE__)(::benchmark::bench_state & state); \
    static ::benchmark::auto_bench_registrar BM_IMPL_CAT(bm_impl_reg_, __LINE__)(     \
        bm_impl_current_suite_, test_name, BM_IMPL_CAT(bm_impl_fn_, __LINE__),        \
        (iters), (warmup));                                                           \
    static void BM_IMPL_CAT(bm_impl_fn_, __LINE__)(::benchmark::bench_state & state)
#define BENCH_CASE(test_name) BM_IMPL_DEFINE(test_name, 1000, 10)
#define BENCH_CASE_N(test_name, iters) BM_IMPL_DEFINE(test_name, iters, 10)
#define BENCH_CASE_NW(test_name, iters, w) BM_IMPL_DEFINE(test_name, iters, w)