From 96a0d0b73d82e0aa5b2240cd3fb7434d45b73a4a Mon Sep 17 00:00:00 2001
From: Thomas
Date: Thu, 4 Jun 2026 14:44:02 -0400
Subject: [PATCH 1/3] Add local regex_parsing benchmark

---
Review note: file contents in this patch were restored from a
whitespace-collapsed copy and then edited, so the blob ids on the "index"
lines are stale. The capture-group names in vector.yaml are placeholders;
the original names were lost.

 benchmark/regex_parsing/analysis.py | 105 +++++++++++++++++++++++
 benchmark/regex_parsing/lading.yaml |  16 ++++
 benchmark/regex_parsing/run.sh      | 125 ++++++++++++++++++++++++++++
 benchmark/regex_parsing/vector.yaml |  22 +++++
 4 files changed, 268 insertions(+)
 create mode 100644 benchmark/regex_parsing/analysis.py
 create mode 100644 benchmark/regex_parsing/lading.yaml
 create mode 100755 benchmark/regex_parsing/run.sh
 create mode 100644 benchmark/regex_parsing/vector.yaml

diff --git a/benchmark/regex_parsing/analysis.py b/benchmark/regex_parsing/analysis.py
new file mode 100644
index 0000000000000..0143e1e9c66f7
--- /dev/null
+++ b/benchmark/regex_parsing/analysis.py
@@ -0,0 +1,105 @@
+"""Summarize one regex_parsing benchmark run.
+
+Usage: analysis.py RUN_DIR
+
+Reads RUN_DIR/lading.captures (one JSON object per line) for throughput and
+RUN_DIR/sample.folded (collapsed stacks, "frame;frame;... count") for a
+remap-only CPU breakdown, and prints both to stdout.
+"""
+import json
+import sys
+from collections import defaultdict
+
+if len(sys.argv) != 2:
+    sys.exit(f"usage: {sys.argv[0]} RUN_DIR")
+run_dir = sys.argv[1]
+
+# --- Throughput from lading captures ---
+# Counters are folded with max(), i.e. treated as cumulative values.
+total_written = total_received = total_requests = 0
+start_t = end_t = None
+with open(f"{run_dir}/lading.captures") as f:
+    for line in f:
+        if not line.strip():
+            continue  # tolerate blank lines such as a trailing newline
+        obj = json.loads(line)
+        metric = obj.get('metric_name', '')
+        comp = obj.get('component', '')
+        cname = obj.get('component_name', '')
+        value = obj.get('value', 0)
+        t = obj.get('time', 0)
+        if start_t is None or t < start_t: start_t = t
+        if end_t is None or t > end_t: end_t = t
+        if cname == 'http':
+            if metric == 'bytes_written' and comp == 'generator':
+                total_written = max(total_written, value)
+            elif metric == 'bytes_received' and comp == 'blackhole':
+                total_received = max(total_received, value)
+            elif metric == 'requests_sent' and comp == 'generator':
+                total_requests = max(total_requests, value)
+
+# An empty captures file leaves both timestamps as None; report a zero-length
+# run rather than crash on None arithmetic. Assumes 'time' is in milliseconds;
+# the 1e-9 floor keeps the per-second divisions below well-defined.
+elapsed = 0 if start_t is None else end_t - start_t
+duration = max(elapsed / 1000.0, 1e-9)
+print(f" Duration: {duration:.1f}s")
+print(f" Sent in: {total_written/1e6:7.1f} MB ({total_written/1e6/duration:6.1f} MB/s)")
+print(f" Sent out: {total_received/1e6:7.1f} MB ({total_received/1e6/duration:6.1f} MB/s)")
+print(f" Requests/s: {total_requests/duration:.0f}")
+
+# --- Remap CPU breakdown ---
+categories = defaultdict(int)
+total = 0
+with open(f"{run_dir}/sample.folded") as f:
+    for line in f:
+        line = line.strip()
+        if not line: continue
+        parts = line.rsplit(' ', 1)
+        if len(parts) != 2: continue
+        try:
+            stack, count = parts[0], int(parts[1])
+        except ValueError:
+            continue  # last field is not a sample count; skip the line
+        # Only stacks doing remap work, not parked
+        if 'SyncTransform::transform_all' not in stack and 'Remap' not in stack:
+            continue
+        if 'park_internal' in stack or '__psynch_cvwait' in stack or 'kevent' in stack:
+            continue
+        total += count
+        leaf = stack.split(';')[-1]
+        # First matching branch wins, so the order of the tests matters.
+        if 'regex_automata' in stack:
+            if any(x in stack for x in ('get_slow', 'create_cache', 'init_cache')):
+                categories['regex: cache miss/init'] += count
+            else:
+                categories['regex: DFA matching'] += count
+        elif 'capture_regex_to_map' in stack:
+            categories['capture_regex_to_map'] += count
+        elif 'BTreeMap' in stack and ('clone' in stack or 'dying' in stack):
+            categories['BTreeMap clone / drop'] += count
+        elif 'drop_in_place' in leaf or 'drop_slow' in leaf:
+            categories['Value drop/dealloc'] += count
+        # Frees are tested before allocs: 'nanov2_free' also contains 'nanov2'
+        # and would otherwise be counted as an allocation.
+        elif '_free' in leaf or 'bzero' in leaf or 'memset' in leaf:
+            categories['heap free'] += count
+        elif 'finish_grow' in leaf or 'nanov2' in leaf or 'malloc' in leaf.lower() or 'realloc' in leaf:
+            categories['heap alloc'] += count
+        elif 'tracing_subscriber' in stack and ('event' in stack.lower() or 'record' in stack):
+            categories['tracing: error events'] += count
+        elif 'vrl' in stack and 'resolve' in stack:
+            categories['VRL interpreter'] += count
+        elif 'memmove' in leaf or 'memcpy' in leaf:
+            categories['memcpy/memmove'] += count
+        elif 'Arc' in stack or 'drop_in_place' in stack:
+            categories['Arc/refcount'] += count
+        else:
+            categories['other'] += count
+
+print()
+print(f" Remap samples: {total}")
+for k, v in sorted(categories.items(), key=lambda x: -x[1]):
+    pct = 100.0 * v / total if total else 0
+    bar = '█' * int(pct / 2)
+    print(f" {pct:5.1f}% {bar:<25} {k} ({v})")
diff --git a/benchmark/regex_parsing/lading.yaml b/benchmark/regex_parsing/lading.yaml
new file mode 100644
index 0000000000000..5f29fedda9926
--- /dev/null
+++ b/benchmark/regex_parsing/lading.yaml
@@ -0,0 +1,16 @@
+generator:
+  - http:
+      seed: [2, 3, 5, 7, 11, 13, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137]
+      headers: {}
+      target_uri: "http://localhost:8282/"
+      bytes_per_second: "500 Mb"
+      parallel_connections: 10
+      method:
+        post:
+          maximum_prebuild_cache_size_bytes: "256 Mb"
+          variant: "apache_common"
+
+blackhole:
+  - http:
+      binding_addr: "0.0.0.0:8080"
+      body_variant: "nothing"
diff --git a/benchmark/regex_parsing/run.sh b/benchmark/regex_parsing/run.sh
new file mode 100755
index 0000000000000..c246680c72f26
--- /dev/null
+++ b/benchmark/regex_parsing/run.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+#
+# Profiles Vector's regex parsing under load and produces a flamegraph.
+#
+# Drives Vector's http_server source with lading's apache_common HTTP payload,
+# samples the running Vector process, and emits a flamegraph plus a remap-only
+# CPU breakdown. Compare two runs by passing different LABELs.
+#
+# Usage:
+#   run.sh            # default label = timestamp
+#   run.sh baseline   # named run
+#   VECTOR_BIN=/path/to/vector run.sh baseline
+#
+# Prerequisites (macOS):
+#   - lading (cargo install lading)
+#   - inferno (cargo install inferno)
+#   - sample (ships with macOS)
+#
+# Vector must be built with debug symbols. Build with:
+#   cargo build --profile bench --no-default-features \
+#     --features "sources-http_server,transforms-remap,sinks-http,vrl/stdlib"
+#
+# Note: macOS-only. On Linux, swap `sample` for `perf record` and
+# `inferno-collapse-sample` for `inferno-collapse-perf`.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# Configurable via env vars
+VECTOR_BIN="${VECTOR_BIN:-$REPO_ROOT/target/release/vector}"
+VECTOR_CONFIG="${VECTOR_CONFIG:-$SCRIPT_DIR/vector.yaml}"
+LADING_CONFIG="${LADING_CONFIG:-$SCRIPT_DIR/lading.yaml}"
+OUT_DIR="${OUT_DIR:-/tmp/vector-regex-bench}"
+SAMPLE_SECONDS="${SAMPLE_SECONDS:-30}"
+WARMUP_SECONDS="${WARMUP_SECONDS:-12}"
+EXPERIMENT_SECONDS="${EXPERIMENT_SECONDS:-60}"
+
+LABEL="${1:-$(date +%Y%m%d-%H%M%S)}"
+RUN_DIR="$OUT_DIR/$LABEL"
+
+# Sanity checks
+[[ -x "$VECTOR_BIN" ]] || { echo "Vector binary not found: $VECTOR_BIN" >&2; exit 1; }
+[[ -f "$VECTOR_CONFIG" ]] || { echo "Vector config not found: $VECTOR_CONFIG" >&2; exit 1; }
+[[ -f "$LADING_CONFIG" ]] || { echo "Lading config not found: $LADING_CONFIG" >&2; exit 1; }
+for tool in lading sample inferno-collapse-sample inferno-flamegraph; do
+  command -v "$tool" >/dev/null || { echo "Required tool not on PATH: $tool" >&2; exit 1; }
+done
+
+mkdir -p "$RUN_DIR"
+
+echo "==> $LABEL"
+echo " Vector: $VECTOR_BIN"
+echo " Config: $VECTOR_CONFIG"
+echo " Output: $RUN_DIR"
+echo
+
+VECTOR_PID=""
+LADING_PID=""
+cleanup() {
+  [[ -n "$VECTOR_PID" ]] && kill "$VECTOR_PID" 2>/dev/null || true
+  [[ -n "$LADING_PID" ]] && kill "$LADING_PID" 2>/dev/null || true
+  wait 2>/dev/null || true
+}
+trap cleanup EXIT INT TERM
+
+# Kill anything leftover from prior runs (lading or Vector on our ports)
+pkill -f "$(basename "$VECTOR_BIN") --config $VECTOR_CONFIG" 2>/dev/null || true
+pkill -f "lading --config-path $LADING_CONFIG" 2>/dev/null || true
+sleep 1
+
+echo "==> Starting Vector"
+"$VECTOR_BIN" --config "$VECTOR_CONFIG" > "$RUN_DIR/vector.stdout" 2>&1 &
+VECTOR_PID=$!
+sleep 3
+if ! kill -0 "$VECTOR_PID" 2>/dev/null; then
+  echo "Vector crashed at startup:"
+  tail -20 "$RUN_DIR/vector.stdout"
+  exit 1
+fi
+echo " PID $VECTOR_PID"
+
+echo "==> Starting lading (${EXPERIMENT_SECONDS}s experiment)"
+lading \
+  --config-path "$LADING_CONFIG" \
+  --no-target \
+  --capture-path "$RUN_DIR/lading.captures" \
+  --experiment-duration-seconds "$EXPERIMENT_SECONDS" \
+  --warmup-duration-seconds 5 \
+  > "$RUN_DIR/lading.stdout" 2>&1 &
+LADING_PID=$!
+echo " PID $LADING_PID"
+
+echo "==> Warming up ${WARMUP_SECONDS}s"
+sleep "$WARMUP_SECONDS"
+
+echo " CPU at sample-start:"
+ps -p "$VECTOR_PID" -o pcpu= -o pmem= | awk '{printf " %.0f%% CPU, %.1f%% RSS\n", $1, $2}'
+
+echo "==> Sampling for ${SAMPLE_SECONDS}s"
+sample "$VECTOR_PID" "$SAMPLE_SECONDS" -file "$RUN_DIR/sample.txt" > /dev/null
+
+echo "==> Generating flamegraph"
+inferno-collapse-sample "$RUN_DIR/sample.txt" > "$RUN_DIR/sample.folded"
+inferno-flamegraph --title "Vector regex parsing ($LABEL)" \
+  "$RUN_DIR/sample.folded" > "$RUN_DIR/flamegraph.svg"
+
+# Stop Vector and lading so captures are flushed
+kill "$VECTOR_PID" "$LADING_PID" 2>/dev/null || true
+VECTOR_PID=""
+LADING_PID=""
+sleep 1
+
+echo
+echo "==> Analysis"
+python3 "$SCRIPT_DIR"/analysis.py "$RUN_DIR"
+
+echo
+echo "==> Outputs in $RUN_DIR"
+echo " flamegraph.svg open with: open $RUN_DIR/flamegraph.svg"
+echo " sample.txt raw macOS sample output"
+echo " sample.folded collapsed stacks (inferno format)"
+echo " lading.captures lading metrics (JSONL)"
+echo " vector.stdout Vector logs"
diff --git a/benchmark/regex_parsing/vector.yaml b/benchmark/regex_parsing/vector.yaml
new file mode 100644
index 0000000000000..8fdea862182b1
--- /dev/null
+++ b/benchmark/regex_parsing/vector.yaml
@@ -0,0 +1,22 @@
+data_dir: /tmp/vector-regex-bench-data
+
+sources:
+  logs:
+    type: http_server
+    address: "0.0.0.0:8282"
+    encoding: text
+
+transforms:
+  apache_common_parser:
+    type: remap
+    inputs: [logs]
+    source: |
+      . = parse_regex_all!(.message, r'^(?P<host>\d{1,3}\.\d{1,3}.\d{1,3}\.\d{1,3}) - (?P<user>-|\S+) \[(?P<timestamp>.*)\] "(?P<method>GET|PUT|POST|HEAD|DELETE) (?P<path>[/\S]+) (?P<httpversion>HTTP/[12].[012])" (?P<status>\d+) (?P<bytes_out>\d+)$')
+
+sinks:
+  out:
+    type: http
+    inputs: [apache_common_parser]
+    uri: "http://localhost:8080"
+    encoding:
+      codec: json
From 469da41196be2ee8845aafbdff0ba7bf0c335adb Mon Sep 17 00:00:00 2001
From: Thomas
Date: Thu, 4 Jun 2026 15:35:08 -0400
Subject: [PATCH 2/3] Add qrelease profile

---
 Cargo.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index f5ad7c2d25d13..5a7becc2aabaa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -44,6 +44,10 @@ debug = false # Do not include debug symbols in the executable.
 codegen-units = 1
 lto = "fat"
 
+[profile.qrelease]
+inherits = "release"
+lto = "thin"
+
 [profile.bench]
 debug = true
 
From 37aeda7b680dc54f123f38d819256ba586dc7fc4 Mon Sep 17 00:00:00 2001
From: Thomas
Date: Thu, 4 Jun 2026 15:57:31 -0400
Subject: [PATCH 3/3] fix(benchmark): wait for background processes to prevent
 silent hang at exit

---
Review note: the kill calls in cleanup() gained "|| true" after export, so
the post-image blob id on the "index" line is stale. Without it, a kill that
fails under "set -e" (process already gone) aborts the trap before the
remaining process is stopped.

 benchmark/regex_parsing/run.sh | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/benchmark/regex_parsing/run.sh b/benchmark/regex_parsing/run.sh
index c246680c72f26..20e7765d1797d 100755
--- a/benchmark/regex_parsing/run.sh
+++ b/benchmark/regex_parsing/run.sh
@@ -59,9 +59,10 @@ echo
 VECTOR_PID=""
 LADING_PID=""
 cleanup() {
-  [[ -n "$VECTOR_PID" ]] && kill "$VECTOR_PID" 2>/dev/null || true
-  [[ -n "$LADING_PID" ]] && kill "$LADING_PID" 2>/dev/null || true
-  wait 2>/dev/null || true
+  local pids=()
+  [[ -n "$VECTOR_PID" ]] && { kill "$VECTOR_PID" 2>/dev/null || true; pids+=("$VECTOR_PID"); }
+  [[ -n "$LADING_PID" ]] && { kill "$LADING_PID" 2>/dev/null || true; pids+=("$LADING_PID"); }
+  [[ ${#pids[@]} -gt 0 ]] && wait "${pids[@]}" 2>/dev/null || true
 }
 trap cleanup EXIT INT TERM
 
@@ -106,11 +107,13 @@ inferno-collapse-sample "$RUN_DIR/sample.txt" > "$RUN_DIR/sample.folded"
 inferno-flamegraph --title "Vector regex parsing ($LABEL)" \
   "$RUN_DIR/sample.folded" > "$RUN_DIR/flamegraph.svg"
 
-# Stop Vector and lading so captures are flushed
-kill "$VECTOR_PID" "$LADING_PID" 2>/dev/null || true
+# Stop both processes and wait for them to exit before continuing.
+# Must wait explicitly here — bash stalls at script exit until all tracked
+# background jobs change state, causing a silent hang if we skip the wait.
+kill "$LADING_PID" "$VECTOR_PID" 2>/dev/null || true
+wait "$LADING_PID" "$VECTOR_PID" 2>/dev/null || true
 VECTOR_PID=""
 LADING_PID=""
-sleep 1
 
 echo
 echo "==> Analysis"