Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
---

<!-- COVERAGE_START -->
![English Coverage](https://img.shields.io/badge/en_coverage-99%25-green.svg) 494/500 docs translated
![English Coverage](https://img.shields.io/badge/en_coverage-89%25-green.svg) 494/552 docs translated
<!-- COVERAGE_END -->

## 这是什么项目
Expand Down
12 changes: 12 additions & 0 deletions code/volumn_codes/vol6-performance/ch00/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
cmake_minimum_required(VERSION 3.20)
project(vol6_ch00_performance_mindset LANGUAGES CXX)

# Emit compile_commands.json for editor / tooling integration.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Strict ISO C++17: the standard is mandatory and compiler extensions are off.
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)

# ch00-01 claim check: sorted vector + binary search vs. std::set::find.
# Both are O(log n); the sample compares how they behave with respect to caches.
add_executable(vector_vs_set vector_vs_set.cpp)
target_compile_options(vector_vs_set PRIVATE -Wall -Wextra -Wpedantic -O2)
24 changes: 24 additions & 0 deletions code/volumn_codes/vol6-performance/ch00/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# vol6 ch00 · 性能思维 — 代码示例

对应文章:`documents/vol6-performance/ch00-performance-mindset/01-efficiency-vs-performance.md`

## vector_vs_set

验证本卷开篇命题 **efficiency ≠ performance**:`std::vector` + `std::lower_bound`(二分查找)与 `std::set::find` 都是 $O(\log n)$,但在真实硬件上,当 N 超出缓存后,连续内存的 `vector` 把节点分散的 `set` 甩开好几倍。

### 构建

```bash
# 直接编译(最快)
g++ -O2 -std=c++17 vector_vs_set.cpp -o vector_vs_set
./vector_vs_set

# 或用 CMake
cmake -B build && cmake --build build && ./build/vector_vs_set
```

### 怎么读结果

关心**趋势**(N 增大后 `set/vector` 比值上升)和**命题**(同复杂度差几倍),不要把某个具体倍数当普适结论。绝对数字随 CPU / 编译器 / 标准库实现(libstdc++、libc++ 等)而变。

代码里几处防失真细节(`volatile global_sink` 防死代码消除、全部命中消除偏差、多轮取中位数压离群值)是 vol6 ch01 *Benchmark 方法论* 的伏笔,文章里逐条有讲。
69 changes: 69 additions & 0 deletions code/volumn_codes/vol6-performance/ch00/vector_vs_set.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// vector_vs_set.cpp —— vol6 ch00-01 命题验证
// 同为 O(log n) 的查找,缓存效应能差多少?
//
// 编译:g++ -O2 -std=c++17 vector_vs_set.cpp -o vector_vs_set
// 或: cmake -B build && cmake --build build && ./build/vector_vs_set
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <random>
#include <set>
#include <vector>

using Clock = std::chrono::steady_clock;

/// Returns the median of `v`; used to damp outliers across repeated timing rounds.
///
/// Side effect: `v` is sorted in place (ascending).
/// For an even number of samples the upper of the two middle elements is
/// returned (no averaging). An empty `v` yields 0.0 instead of indexing out of
/// bounds.
static double median(std::vector<double>& v) {
    if (v.empty()) {
        return 0.0;  // nothing measured: avoid undefined behavior from v[0]
    }
    std::sort(v.begin(), v.end());
    return v[v.size() / 2];
}

int main() {
    constexpr int kQueryCount = 2'000'000;  // 2M lookups per N, amortizing per-call noise
    constexpr int kRounds = 5;              // timing rounds per N; the median is reported
    // Volatile sink: each round's hit count is folded into it so the optimizer
    // cannot discard the timed loops as dead code.
    volatile std::int64_t global_sink = 0;

    std::printf("%-10s %18s %18s %10s\n", "N", "vector(ns/q)", "set(ns/q)", "set/vector");
    std::printf("------------------------------------------------------------\n");

    constexpr int kSizes[] = {1024, 4096, 16384, 65536, 262144, 1048576};
    for (const int n : kSizes) {
        std::mt19937_64 rng(12345);  // fixed seed, re-created for every N

        // Keys are the even numbers 0, 2, 4, ..., 2 * (n - 1).
        std::vector<int> keys(n);
        for (int idx = 0; idx < n; ++idx) {
            keys[idx] = idx * 2;
        }
        std::vector<int> sorted = keys;
        std::sort(sorted.begin(), sorted.end());       // input for the binary search
        std::set<int> sset(keys.begin(), keys.end());  // std::set holding the same keys

        // Every query targets a key that exists, so both containers always take
        // the "found" path and miss handling cannot skew the comparison.
        std::vector<int> toFind(kQueryCount);
        for (int& target : toFind) {
            target = keys[rng() % n];
        }

        std::vector<double> vecTimes;
        std::vector<double> setTimes;
        for (int round = 0; round < kRounds; ++round) {
            // Sorted vector + std::lower_bound.
            std::int64_t hits = 0;
            const auto vecStart = Clock::now();
            for (const int key : toFind) {
                const auto pos = std::lower_bound(sorted.begin(), sorted.end(), key);
                hits += (pos != sorted.end() && *pos == key);
            }
            const auto vecStop = Clock::now();
            vecTimes.push_back(
                std::chrono::duration<double, std::nano>(vecStop - vecStart).count() / kQueryCount);
            global_sink += hits;

            // std::set::find over the same query stream.
            hits = 0;
            const auto setStart = Clock::now();
            for (const int key : toFind) {
                const auto pos = sset.find(key);
                hits += (pos != sset.end());
            }
            const auto setStop = Clock::now();
            setTimes.push_back(
                std::chrono::duration<double, std::nano>(setStop - setStart).count() / kQueryCount);
            global_sink += hits;
        }

        const double vecMedian = median(vecTimes);
        const double setMedian = median(setTimes);
        std::printf("%-10d %18.1f %18.1f %10.1fx\n", n, vecMedian, setMedian, setMedian / vecMedian);
    }

    std::printf("\nglobal_sink=%lld (防死代码消除)\n", static_cast<long long>(global_sink));
    return 0;
}
23 changes: 23 additions & 0 deletions code/volumn_codes/vol6-performance/ch01/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.20)
project(vol6_ch01_benchmark_methodology CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Default single-config generators to Release. The -O2 further down applies only
# to push_bench; without a build type the fetched benchmark library itself is
# compiled without optimization (Google Benchmark documents that it builds as a
# debug library by default and prints a warning about affected timings).
# Multi-config generators (CMAKE_CONFIGURATION_TYPES set) are left untouched.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (defaults to Release for benchmarking)" FORCE)
endif()

# Fetch Google Benchmark via FetchContent so readers need no pre-installed copy;
# cloning this repository is enough to build the sample.
include(FetchContent)
FetchContent_Declare(benchmark
  GIT_REPOSITORY https://github.com/google/benchmark.git
  GIT_TAG v1.9.5)
# Turn off benchmark's own test targets. The option is BENCHMARK_ENABLE_TESTING,
# not BENCHMARK_ENABLE_TESTS: with the wrong name its internal tests get built
# and fail when gtest is missing.
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(benchmark)

# ch01-02 minimal complete example: push_back + DoNotOptimize/ClobberMemory,
# a parameter sweep, and aggregated repetitions.
add_executable(push_bench push_bench.cpp)
# push_bench.cpp defines its own entry point through BENCHMARK_MAIN(), so link
# the core library rather than benchmark::benchmark_main (which supplies main()).
target_link_libraries(push_bench PRIVATE benchmark::benchmark)
target_compile_options(push_bench PRIVATE -O2 -Wall -Wextra -Wpedantic)
28 changes: 28 additions & 0 deletions code/volumn_codes/vol6-performance/ch01/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# vol6 ch01 · Benchmark 方法论 — 代码示例

对应文章:`documents/vol6-performance/ch01-benchmark-methodology/02-credible-microbenchmark.md`

## push_bench

ch01-02 的最小完整 GBench 例子:测 `std::vector::push_back`,演示四件套——`DoNotOptimize`/`ClobberMemory` 防 DCE、`Range` 参数扫描、`Repetitions`+`ReportAggregatesOnly` 重复聚合、`UseRealTime` 墙钟计时。

### 构建(任选其一)

```bash
# 1) 系统装了 GBench(Arch: pacman -S benchmark;macOS: brew install google-benchmark)
g++ -O2 -std=c++17 push_bench.cpp -o push_bench -lbenchmark -lpthread
./push_bench

# 2) CMake + FetchContent(免预装,首次会 clone + build benchmark,几分钟)
cmake -B build && cmake --build build && ./build/push_bench
```

### 怎么读输出

- `Time` 是墙钟(`UseRealTime`)、`CPU` 是 CPU 时间;聚合行的 `Iterations` 列显示的是重复次数(3),不是每轮真实迭代数(被聚合隐藏了)。
- 盯 `cv`(coefficient of variation = `stddev/mean`):<1% 很稳;>5% 这轮测得不可信,查噪声源(ch01-03)。
- 时间随 N 涨才是 `push_back` 真实的样子;如果你测出来不随 N 变,多半是被 DCE 删成空壳了(缺 `DoNotOptimize`)。

### 一个会踩的坑

`BENCHMARK_ENABLE_TESTING OFF`(关 benchmark 自己的测试目标)flag 名是 `BENCHMARK_ENABLE_TESTING`,不是 `BENCHMARK_ENABLE_TESTS`。写错的话 `cmake --build` 会因为 benchmark 内部测试目标失败而整体非零退出,即便你的 `push_bench` 已经编过了——看输出里有没有 `Built target push_bench`,有就直接 `./build/push_bench` 跑。
54 changes: 54 additions & 0 deletions code/volumn_codes/vol6-performance/ch01/perf-env-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# perf-env-check.sh -- vol6 ch01-03 environment check for credible microbenchmarks (read-only)
#
# Usage: bash perf-env-check.sh
# The script changes nothing (switching the governor or disabling Turbo needs
# sudo and is left to the user); it only prints the issues it finds.
# Companion article: documents/vol6-performance/ch01-benchmark-methodology/03-pitfalls-and-env.md
# Treat expansion of an unset variable as an error.
set -u

# Print a "check passed" line for the message given as $1.
ok() {
  local msg=$1
  printf " ✓ %s\n" "$msg"
}
# Print a warning line: $1 is the finding, $2 the suggested remedy.
warn() {
  local finding=$1
  local remedy=$2
  printf " ⚠ %s — %s\n" "$finding" "$remedy"
}

echo "=== CPU governor(应=performance;否则 DVFS 让数字浮动)==="
# Only cpu0's governor is sampled here.
if [ -f /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor ]; then
  g=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
  # Explicit if/else instead of `test && ok || warn`: with the chained form,
  # warn would also run whenever ok itself returned non-zero.
  if [ "$g" = performance ]; then
    ok "governor=performance"
  else
    warn "governor=$g" "sudo cpupower frequency-set -g performance"
  fi
else
  echo " · 无 cpufreq 接口(可能已锁频或虚拟化屏蔽),跳过"
fi

echo "=== Turbo Boost(Intel pstate)==="
# Only the intel_pstate driver's no_turbo switch is inspected (1 = Turbo disabled).
if [ -f /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
  nt=$(cat /sys/devices/system/cpu/intel_pstate/no_turbo)
  # Explicit if/else instead of `test && ok || warn`: with the chained form,
  # warn would also run whenever ok itself returned non-zero.
  if [ "$nt" = 1 ]; then
    ok "Turbo 已关"
  else
    warn "Turbo 开着(no_turbo=$nt)" "冷热启动数字会差,BIOS 或这里关"
  fi
else
  echo " · 非 intel_pstate,跳过(可在 BIOS 设)"
fi

echo "=== perf_event_paranoid(<=1 才好采样)==="
# Report the real value only. When the file cannot be read, say so instead of
# substituting a made-up level that would then be printed as if it were measured.
if p=$(cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null); then
  # stderr is silenced so a non-numeric value falls through to the warning
  # branch without an "integer expression expected" message.
  if [ "$p" -le 1 ] 2>/dev/null; then
    ok "perf_event_paranoid=$p"
  else
    warn "perf_event_paranoid=$p" "sudo sysctl -w kernel.perf_event_paranoid=1"
  fi
else
  echo " · 读不到 /proc/sys/kernel/perf_event_paranoid,跳过"
fi

echo "=== NUMA 拓扑(多 socket 才在意)==="
# Without numactl there is nothing to query; otherwise show at most six lines
# of the "available" / per-node cpus and size summary.
if ! command -v numactl >/dev/null 2>&1; then
  warn "无 numactl" "apt install numactl / pacman -S numactl;多 socket 机器必装"
else
  numactl --hardware 2>/dev/null \
    | grep -E "^available|node [0-9]+ (cpus|size)" \
    | head -6
fi

echo "=== CPU 亲和性(应明确绑一个核,别让 OS 晃)==="
# Second field of the Cpus_allowed_list line in /proc/self/status.
cpu=$(awk '/Cpus_allowed_list/ { print $2 }' /proc/self/status 2>/dev/null)
n=$(nproc 2>/dev/null)
printf ' Cpus_allowed_list=%s (nproc=%s)\n' "$cpu" "$n"
echo " → 想绑核:taskset -c <某个核> ./bench (别挑 0 号核,常被系统中断占用)"

echo "=== ASLR(微架构精细测时应关)==="
# Fall back to "?" when the sysctl file cannot be read.
aslr=$(cat /proc/sys/kernel/randomize_va_space 2>/dev/null) || aslr="?"
echo " randomize_va_space=$aslr (2=全开;精细 icache/分支测时: sudo sysctl -w kernel.randomize_va_space=0)"

# Closing summary.
echo
echo "体检完毕。microbenchmark A/B 场景:把上面 ⚠ 尽量清掉;"
echo "评估生产性能时:这些噪声源反而要保留(复刻真实),见 ch01-05。"
32 changes: 32 additions & 0 deletions code/volumn_codes/vol6-performance/ch01/push_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// push_bench.cpp —— vol6 ch01-02 最小完整 GBench 例子
// 测 std::vector::push_back,演示 DoNotOptimize/ClobberMemory + 参数扫描 + 重复聚合
//
// 构建(任选其一):
// 1) 系统装了 GBench(Arch: pacman -S benchmark):
// g++ -O2 -std=c++17 push_bench.cpp -o push_bench -lbenchmark -lpthread
// 2) CMake + FetchContent(reader 免预装,见同目录 CMakeLists.txt):
// cmake -B build && cmake --build build && ./build/push_bench
#include <benchmark/benchmark.h>
#include <vector>

// Benchmarks appending state.range(0) ints to a freshly constructed std::vector.
// DoNotOptimize + ClobberMemory are used so the compiler cannot eliminate the
// push_back work as dead code.
static void BM_PushBack(benchmark::State& state) {
for (auto _ : state) { // timed loop: the framework decides how many iterations to run
std::vector<int> v;
for (int i = 0; i < state.range(0); ++i) {
v.push_back(i);
benchmark::DoNotOptimize(v.data()); // keep the buffer pointer observable so the stores are not discarded
}
benchmark::ClobberMemory(); // make the compiler treat pending writes as really reaching memory
}
state.SetComplexityN(state.range(0)); // records N for big-O fitting; NOTE(review): per the Google Benchmark docs the fit is only computed when the registration also chains ->Complexity()
}

// Register BM_PushBack with a parameter sweep and aggregated repetitions.
BENCHMARK(BM_PushBack)
->RangeMultiplier(2)
->Range(8, 8 << 6) // parameter sweep: 8, 16, 32, ..., 512
->UseRealTime() // measure wall-clock (real) time rather than CPU time
->Repetitions(3) // run each configuration 3 times
->ReportAggregatesOnly(true); // report only the aggregates (mean/median/stddev/cv)

// Provides main() for this translation unit.
BENCHMARK_MAIN();
Loading
Loading