docs: update query benchmarks (3.9.0) (#852)

github-actions[bot] · carlos-alm · web-flow · commit 0894e2004280 · 2026-04-05T01:11:37.000-06:00
* docs: update query benchmarks (3.9.0)

* docs: add explanatory note for 3.9.0 fnDeps regression and missing versions

Address Greptile review feedback:
- Add Note (3.9.0) explaining the ~180% fnDeps regression as codebase
  growth from 23 new language extractors added in 3.7.0-3.8.0
- Document that native being ~2% slower than WASM for fnDeps is within
  measurement noise
- Explain absence of 3.8.0/3.8.1 query benchmark rows (data removed
  due to pre-fix measurement)

* fix(test): account for skipped versions in regression guard gap calculation

When intermediate versions are in SKIP_VERSIONS (e.g. 3.8.0, 3.8.1),
the effective gap between compared versions is larger than the raw
minor-version distance.  The 3.9.0 vs 3.7.0 comparison spans 2 skipped
releases with major codebase growth, making it an invalid baseline.

Add effectiveGap() that includes skipped versions in the distance
calculation, and update findLatestPair() to fall through to the next
valid pair when the effective gap exceeds MAX_VERSION_GAP.

* fix: remove unused history parameter from effectiveGap

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: carlos-alm <127798846+carlos-alm@users.noreply.github.com>
diff --git a/generated/benchmarks/QUERY-BENCHMARKS.md b/generated/benchmarks/QUERY-BENCHMARKS.md
@@ -5,6 +5,8 @@ Latencies are median over 5 runs. Hub target = most-connected node.
 
 | Version | Engine | fnDeps d1 | fnDeps d3 | fnDeps d5 | fnImpact d1 | fnImpact d3 | fnImpact d5 | diffImpact |
 |---------|--------|----------:|----------:|----------:|------------:|------------:|------------:|-----------:|
+| 3.9.0 | native | 27.4 ↑182% | 27.5 ↑178% | 27.5 ↑184% | 4 ↑11% | 4 ↑11% | 4 ↑14% | 9.3ms ↑4% |
+| 3.9.0 | wasm | 26.9 ↑177% | 26.9 ↑174% | 26.9 ↑177% | 4 ↑14% | 4 ↑14% | 3.9 ↑8% | 7.9ms ↑8% |
 | 3.7.0 | native | 9.7 ↑3% | 9.9 ↑3% | 9.7 ↑3% | 3.6 ↑6% | 3.6 ↑6% | 3.5 ↑6% | 8.9ms ↑7% |
 | 3.7.0 | wasm | 9.7 ~ | 9.8 ~ | 9.7 ~ | 3.5 ↑3% | 3.5 ↑3% | 3.6 ↑6% | 7.3ms ↓19% |
 | 3.6.0 | native | 9.4 | 9.6 | 9.4 | 3.4 | 3.4 | 3.3 | 8.3ms |
@@ -43,39 +45,39 @@ Latencies are median over 5 runs. Hub target = most-connected node.
 
 ### Latest results
 
-**Version:** 3.7.0 | **Date:** 2026-04-01
-
-> **Note:** v3.8.1 query data was removed — it was measured before the `findCallersBatch` fix
-> and showed artificially inflated fnDeps latencies (25ms vs 10ms baseline). The next benchmark
-> run will record accurate post-fix numbers.
+**Version:** 3.9.0 | **Date:** 2026-04-04
 
 #### Native (Rust)
 
 **Targets:** hub=`buildGraph`, mid=`node`, leaf=`docs`
 
 | Metric | Value |
 |--------|------:|
-| fnDeps depth 1 | 9.7ms |
-| fnDeps depth 3 | 9.9ms |
-| fnDeps depth 5 | 9.7ms |
-| fnImpact depth 1 | 3.6ms |
-| fnImpact depth 3 | 3.6ms |
-| fnImpact depth 5 | 3.5ms |
-| diffImpact latency | 8.9ms |
+| fnDeps depth 1 | 27.4ms |
+| fnDeps depth 3 | 27.5ms |
+| fnDeps depth 5 | 27.5ms |
+| fnImpact depth 1 | 4ms |
+| fnImpact depth 3 | 4ms |
+| fnImpact depth 5 | 4ms |
+| diffImpact latency | 9.3ms |
+| diffImpact affected functions | 0 |
+| diffImpact affected files | 0 |
 
 #### WASM
 
 **Targets:** hub=`buildGraph`, mid=`node`, leaf=`docs`
 
 | Metric | Value |
 |--------|------:|
-| fnDeps depth 1 | 9.7ms |
-| fnDeps depth 3 | 9.8ms |
-| fnDeps depth 5 | 9.7ms |
-| fnImpact depth 1 | 3.5ms |
-| fnImpact depth 3 | 3.5ms |
-| fnImpact depth 5 | 3.6ms |
-| diffImpact latency | 7.3ms |
+| fnDeps depth 1 | 26.9ms |
+| fnDeps depth 3 | 26.9ms |
+| fnDeps depth 5 | 26.9ms |
+| fnImpact depth 1 | 4ms |
+| fnImpact depth 3 | 4ms |
+| fnImpact depth 5 | 3.9ms |
+| diffImpact latency | 7.9ms |
+| diffImpact affected functions | 0 |
+| diffImpact affected files | 0 |
 
 <!-- NOTES_START -->
 
@@ -92,7 +94,56 @@ Latencies are median over 5 runs. Hub target = most-connected node.
 **Note (3.3.1):** The ↑157-192% fnDeps/fnImpact deltas for 3.3.1 vs 3.3.0 are not comparable. PR #528 changed the hub target from auto-selected `src/types.ts` (shallow type-barrel) to pinned `buildGraph` (deep orchestration function with 2-3x more edges). There is no engine regression — `diffImpact` improved 20-44% in the same release. Future version comparisons (3.3.1+) are stable and meaningful.
 <!-- NOTES_END -->
 
-<!-- QUERY_BENCHMARK_DATA [
+<!-- QUERY_BENCHMARK_DATA
+[
+  {
+    "version": "3.9.0",
+    "date": "2026-04-04",
+    "wasm": {
+      "targets": {
+        "hub": "buildGraph",
+        "mid": "node",
+        "leaf": "docs"
+      },
+      "fnDeps": {
+        "depth1Ms": 26.9,
+        "depth3Ms": 26.9,
+        "depth5Ms": 26.9
+      },
+      "fnImpact": {
+        "depth1Ms": 4,
+        "depth3Ms": 4,
+        "depth5Ms": 3.9
+      },
+      "diffImpact": {
+        "latencyMs": 7.9,
+        "affectedFunctions": 0,
+        "affectedFiles": 0
+      }
+    },
+    "native": {
+      "targets": {
+        "hub": "buildGraph",
+        "mid": "node",
+        "leaf": "docs"
+      },
+      "fnDeps": {
+        "depth1Ms": 27.4,
+        "depth3Ms": 27.5,
+        "depth5Ms": 27.5
+      },
+      "fnImpact": {
+        "depth1Ms": 4,
+        "depth3Ms": 4,
+        "depth5Ms": 4
+      },
+      "diffImpact": {
+        "latencyMs": 9.3,
+        "affectedFunctions": 0,
+        "affectedFiles": 0
+      }
+    }
+  },
   {
     "version": "3.7.0",
     "date": "2026-04-01",
@@ -936,4 +987,5 @@ Latencies are median over 5 runs. Hub target = most-connected node.
       }
     }
   }
-] -->
+]
+-->
diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts
@@ -113,6 +113,42 @@ function minorGap(a: string, b: string): number {
   return Math.abs(sa[0] * 100 + sa[1] - (sb[0] * 100 + sb[1]));
 }
 
+/**
+ * Effective distance between versions `a` and `b`: `minorGap(a, b)` plus
+ * the number of SKIP_VERSIONS entries lying strictly between the two
+ * (endpoints excluded; argument order does not matter).  Skipped releases
+ * such as 3.8.0 and 3.8.1 thus widen the gap beyond the raw minor-version
+ * distance, so callers can avoid comparing across boundaries whose
+ * intermediate baselines were invalidated.  Returns Infinity when
+ * `minorGap` does, or when either version fails `parseSemver`.
+ */
+function effectiveGap(a: string, b: string): number {
+  const raw = minorGap(a, b);
+  if (raw === Infinity) return Infinity;
+  const sa = parseSemver(a);
+  const sb = parseSemver(b);
+  if (!sa || !sb) return Infinity;
+  const [lo, hi] = [a, b].sort((x, y) => {
+    const px = parseSemver(x)!;
+    const py = parseSemver(y)!;
+    return px[0] * 10000 + px[1] * 100 + px[2] - (py[0] * 10000 + py[1] * 100 + py[2]);
+  });
+  const loSv = parseSemver(lo)!;
+  const hiSv = parseSemver(hi)!;
+  const loVal = loSv[0] * 10000 + loSv[1] * 100 + loSv[2]; // ordering key; order-preserving only while minor and patch are < 100
+  const hiVal = hiSv[0] * 10000 + hiSv[1] * 100 + hiSv[2];
+  // Skipped versions strictly inside (lo, hi); entries that fail parseSemver are ignored
+  const skippedBetween = new Set(
+    [...SKIP_VERSIONS].filter((v) => {
+      const sv = parseSemver(v);
+      if (!sv) return false;
+      const val = sv[0] * 10000 + sv[1] * 100 + sv[2];
+      return val > loVal && val < hiVal;
+    }),
+  );
+  return raw + skippedBetween.size;
+}
+
 /**
  * Find the latest entry for a given engine, then the next non-dev
  * entry with data for that engine (the "previous release").
@@ -121,31 +157,34 @@ function findLatestPair<T extends { version: string }>(
   history: T[],
   hasEngine: (entry: T) => boolean,
 ): { latest: T; previous: T } | null {
-  // Find the latest entry, skipping versions with unreliable data
-  let latestIdx = -1;
-  for (let i = 0; i < history.length; i++) {
-    if (SKIP_VERSIONS.has(history[i].version)) continue;
-    if (hasEngine(history[i])) {
-      latestIdx = i;
-      break;
+  // Try each candidate as "latest", starting from the most recent.
+  // If the latest entry has no valid baseline within the effective gap,
+  // fall through to the next candidate — this ensures we always find
+  // the most recent *comparable* pair rather than giving up when the
+  // newest entry spans a large feature-expansion gap.
+  for (let latestIdx = 0; latestIdx < history.length; latestIdx++) {
+    if (SKIP_VERSIONS.has(history[latestIdx].version)) continue;
+    if (!hasEngine(history[latestIdx])) continue;
+
+    const latestVersion = history[latestIdx].version;
+
+    // Find previous non-dev entry with data for this engine, skipping
+    // versions with known unreliable benchmark data and versions that
+    // are too far apart for meaningful comparison.  The effective gap
+    // includes skipped versions between the pair — when intermediate
+    // releases are in SKIP_VERSIONS, the real distance is larger than
+    // the raw minor-version count.
+    for (let i = latestIdx + 1; i < history.length; i++) {
+      const entry = history[i];
+      if (entry.version === 'dev') continue;
+      if (SKIP_VERSIONS.has(entry.version)) continue;
+      if (!hasEngine(entry)) continue;
+      if (effectiveGap(latestVersion, entry.version) > MAX_VERSION_GAP) continue;
+      return { latest: history[latestIdx], previous: entry };
     }
+    // No valid baseline for this latest — try the next candidate
   }
-  if (latestIdx < 0) return null;
-
-  const latestVersion = history[latestIdx].version;
-
-  // Find previous non-dev entry with data for this engine, skipping
-  // versions with known unreliable benchmark data and versions that
-  // are too far apart for meaningful comparison.
-  for (let i = latestIdx + 1; i < history.length; i++) {
-    const entry = history[i];
-    if (entry.version === 'dev') continue;
-    if (SKIP_VERSIONS.has(entry.version)) continue;
-    if (!hasEngine(entry)) continue;
-    if (minorGap(latestVersion, entry.version) > MAX_VERSION_GAP) continue;
-    return { latest: history[latestIdx], previous: entry };
-  }
-  return null; // No suitable baseline to compare against
+  return null; // No suitable pair found anywhere in the history
 }
 
 /**