ffedoroff · ffedoroff · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -0,0 +1,112 @@
+# CodeQL — advanced setup with per-language path gating.
+#
+# Replaces GitHub's *default* CodeQL setup so we can scan each language ONLY when
+# that language's source actually changed. Most non-Rust languages exist here only
+# as plugin test fixtures (`crates/.../languages/<lang>/tests/sample/**`) that
+# rarely change, yet default setup re-analyzed all of them on every PR.
+#
+# How the gating works: the `changes` job classifies the diff by *file extension*
+# (CodeQL analyzes source, so the extension is the right signal), then emits a
+# matrix of only the changed languages. Consequences of keying on extension:
+#   - golden snapshots (`*.json` / `*.sarif`) never match any language → no run;
+#   - a plugin's Rust file (e.g. `languages/csharp/dialect.rs`) is `rust`, not
+#     `csharp` → it triggers `rust`, not a pointless C# re-scan;
+#   - real first-party code outside the fixtures is covered too — `rust` spans all
+#     `**/*.rs`, `javascript-typescript` includes the viewer assets, `python`
+#     includes `.github/scripts/*.py`.
+# A docs-only PR matches nothing → the whole analyze job is skipped.
+#
+# The weekly `schedule` (and manual dispatch) force a FULL scan of every language
+# so coverage never drifts behind the gated PR runs.
+#
+# ⚠ REQUIRED ONE-TIME STEP: default setup must be turned OFF or it conflicts with
+# this workflow (uploads get rejected). Disable it in Settings → Code security →
+# "CodeQL analysis" → switch to Advanced, or run:
+#   gh api -X PATCH repos/ffedoroff/code-ranker/code-scanning/default-setup -f state=not-configured
+
+name: CodeQL
+
+# Triggers: gated runs on PRs and pushes to main; the weekly cron and manual
+# dispatch force a full scan of every language (see the `changes` job).
+on:
+  push:
+    branches: [main]
+  pull_request:
+  schedule:
+    - cron: '25 6 * * 1'   # weekly Monday full scan
+  workflow_dispatch:
+
+# Least privilege by default; the `analyze` job declares its own wider set.
+permissions:
+  contents: read
+
+# The group is keyed on event type as well as PR number / ref. Push, schedule and
+# manual dispatch on main would otherwise all share `codeql-refs/heads/main`, so
+# a push to main would cancel a running weekly full scan. Only superseded PR runs
+# are cancelled; an in-progress run on main is always allowed to finish.
+concurrency:
+  group: codeql-${{ github.event_name }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  # Classify the diff → which CodeQL languages need analysis this run.
+  # Outputs:
+  #   matrix — JSON `{include: [{language, build-mode}, …]}` consumed by `analyze`
+  #   any    — "true" when at least one language was selected, else "false"
+  changes:
+    name: Detect changed languages
+    runs-on: ubuntu-22.04
+    # paths-filter lists a PR's changed files through the REST API, which needs
+    # `pull-requests: read`. A job-level block replaces the workflow default, so
+    # `contents: read` (needed by checkout) is restated here.
+    permissions:
+      contents: read
+      pull-requests: read
+    outputs:
+      matrix: ${{ steps.matrix.outputs.matrix }}
+      any: ${{ steps.matrix.outputs.any }}
+    steps:
+      - uses: actions/checkout@v6
+      # Skipped on schedule/dispatch: those events scan everything, so there is
+      # no diff to classify and `outputs.changes` stays empty.
+      - uses: dorny/paths-filter@v3
+        id: filter
+        if: github.event_name == 'pull_request' || github.event_name == 'push'
+        with:
+          filters: |
+            rust: ['**/*.rs', '**/Cargo.toml', 'Cargo.lock']
+            python: ['**/*.py']
+            csharp: ['**/*.cs']
+            go: ['**/*.go', '**/go.mod', '**/go.sum']
+            c-cpp: ['**/*.c', '**/*.cc', '**/*.cpp', '**/*.cxx', '**/*.h', '**/*.hh', '**/*.hpp']
+            javascript-typescript: ['**/*.js', '**/*.jsx', '**/*.mjs', '**/*.cjs', '**/*.ts', '**/*.tsx']
+            actions: ['.github/workflows/**']
+
+      # Map the matched filters to a CodeQL matrix. `go` is the only language here
+      # that has no buildless mode, so it gets `autobuild`; everything else uses
+      # `build-mode: none` (the fixtures are not buildable). On schedule/dispatch
+      # we override to the full language set for a complete scan.
+      - id: matrix
+        env:
+          CHANGED: ${{ steps.filter.outputs.changes }}
+          EVENT: ${{ github.event_name }}
+        run: |
+          set -euo pipefail
+          # Empty when the filter step was skipped → treat as "nothing changed".
+          CHANGED="${CHANGED:-[]}"
+          if [ "$EVENT" = "schedule" ] || [ "$EVENT" = "workflow_dispatch" ]; then
+            CHANGED='["actions","c-cpp","csharp","go","javascript-typescript","python","rust"]'
+          fi
+          matrix=$(jq -cn --argjson c "$CHANGED" \
+            '{include: [ $c[] | {language: ., "build-mode": (if . == "go" then "autobuild" else "none" end)} ]}')
+          any=$(jq -rn --argjson c "$CHANGED" 'if ($c|length) > 0 then "true" else "false" end')
+          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
+          echo "any=$any" >> "$GITHUB_OUTPUT"
+          echo "Languages selected: $matrix"
+
+  # Run CodeQL once per language selected by the `changes` job. The whole job is
+  # skipped unless `changes` reported `any == 'true'`.
+  analyze:
+    name: Analyze (${{ matrix.language }})
+    runs-on: ubuntu-22.04
+    needs: changes
+    if: needs.changes.outputs.any == 'true'
+    strategy:
+      # One language failing must not cancel the other languages' scans.
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.changes.outputs.matrix) }}
+    # Job-level permissions replace the workflow default for this job.
+    permissions:
+      actions: read
+      contents: read
+      packages: read
+      security-events: write
+    steps:
+      - uses: actions/checkout@v6
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          build-mode: ${{ matrix.build-mode }}
+          languages: ${{ matrix.language }}
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3
+        with:
+          # A distinct category per language keeps each matrix leg's results separate.
+          category: '/language:${{ matrix.language }}'
diff --git a/.github/workflows/crates-io.yml b/.github/workflows/crates-io.yml
@@ -37,6 +37,12 @@ jobs:
           # right before publishing; cargo auto-detects a package-root README.md
           # when no `readme` field is set. `--allow-dirty` tolerates the copy.
           for d in crates/*/; do cp README.md "$d/README.md"; done
+          # Same trick for the `--doc` corpus: the languages/ tree lives at the
+          # repo root (outside the crate, so not in the published tarball by
+          # default). Copy it into the CLI crate so `cargo publish` packs it and
+          # `cargo install code-ranker` embeds the full corpus; build.rs prefers
+          # this package-local copy over the repo-root tree. `--allow-dirty` covers it.
+          cp -R languages crates/code-ranker-cli/languages
           for c in code-ranker-plugin-api code-ranker-graph code-ranker-plugins code-ranker-viewer code-ranker; do
             echo "==> publishing $c"
             attempt=0

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -3,7 +3,7 @@ members = ["crates/*"]
 resolver = "3"
 
 [workspace.package]
-version = "3.0.1"
+version = "3.0.2"
 edition = "2024"
 rust-version = "1.88"
 license = "Apache-2.0"
@@ -12,10 +12,10 @@ keywords = ["dependency-graph", "coupling", "refactoring", "code-quality", "stat
 categories = ["development-tools", "command-line-utilities"]
 
 [workspace.dependencies]
-code-ranker-graph = { path = "crates/code-ranker-graph", version = "3.0.1" }
-code-ranker-plugin-api = { path = "crates/code-ranker-plugin-api", version = "3.0.1" }
-code-ranker-plugins = { path = "crates/code-ranker-plugins", version = "3.0.1" }
-code-ranker-viewer = { path = "crates/code-ranker-viewer", version = "3.0.1" }
+code-ranker-graph = { path = "crates/code-ranker-graph", version = "3.0.2" }
+code-ranker-plugin-api = { path = "crates/code-ranker-plugin-api", version = "3.0.2" }
+code-ranker-plugins = { path = "crates/code-ranker-plugins", version = "3.0.2" }
+code-ranker-viewer = { path = "crates/code-ranker-viewer", version = "3.0.2" }
 
 anyhow = "1.0"
 cel = "0.13"

diff --git a/crates/code-ranker-cli/build.rs b/crates/code-ranker-cli/build.rs
@@ -5,13 +5,14 @@
 //! can serve a principle's Markdown (e.g. `--doc HK`) from the binary itself with
 //! no filesystem at runtime. Dependency-free (no `include_dir` crate).
 //!
-//! The corpus lives at the repo root (`../../languages`), OUTSIDE this crate, so it
-//! is NOT in the published crate tarball. A workspace build (the prebuilt binaries
-//! shipped via the installer / npm / PyPI / Docker / GitHub Release) finds it and
-//! embeds the full corpus; an ISOLATED build (`cargo publish` verify, or
-//! `cargo install code-ranker` from crates.io source) won't — so the corpus is
-//! resolved best-effort and absence yields an EMPTY corpus (never a build failure).
-//! `--doc` then reports "not embedded" on such builds; everything else works.
+//! The single source of truth lives at the repo root (`../../languages`), OUTSIDE
+//! this crate. So that `cargo install code-ranker` from crates.io still embeds the
+//! corpus, the publish workflow copies that tree into a package-local `languages/`
+//! right before `cargo publish` (mirroring the per-crate README copy) — and this
+//! build script prefers that package-local copy, falling back to the repo-root tree
+//! for workspace/dev builds. If NEITHER exists (an unexpected isolated build) the
+//! corpus resolves best-effort to EMPTY (never a build failure); `--doc` then reports
+//! "not embedded" while everything else works.
 
 use std::path::{Path, PathBuf};
 use std::{env, fs};
@@ -20,9 +21,13 @@ fn main() {
     let manifest = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR");
 
     let mut entries: Vec<(String, PathBuf)> = Vec::new();
-    // Best-effort: a missing corpus (isolated/published build) is NOT an error —
-    // it must never break `cargo publish`/`cargo install`. See module docs.
-    match Path::new(&manifest).join("../../languages").canonicalize() {
+    // Prefer the package-local copy (present in the published tarball), else the
+    // repo-root tree (workspace/dev builds). Best-effort: a missing corpus is NOT
+    // an error — it must never break `cargo publish`/`cargo install`. See module docs.
+    let local = Path::new(&manifest).join("languages");
+    let root = Path::new(&manifest).join("../../languages");
+    let resolved = local.canonicalize().or_else(|_| root.canonicalize());
+    match resolved {
         Ok(corpus) => {
             // Re-run when the tree changes (added/removed files) and on any file edit.
             println!("cargo:rerun-if-changed={}", corpus.display());
@@ -31,9 +36,10 @@ fn main() {
         }
         Err(_) => {
             println!(
-                "cargo:warning=languages/ corpus not found (isolated build, e.g. \
-                 `cargo install code-ranker` from crates.io) — embedding an empty corpus; \
-                 `--doc` will report \"not embedded\". Prebuilt binaries embed the full corpus."
+                "cargo:warning=languages/ corpus not found at ./languages or ../../languages \
+                 — embedding an empty corpus; `--doc` will report \"not embedded\". Published \
+                 builds carry a package-local copy (see crates-io.yml); workspace builds use the \
+                 repo-root tree."
             );
         }
     }

diff --git a/docs/DESIGN.md b/docs/DESIGN.md
@@ -986,7 +986,7 @@ dictionaries with the structural graph and the computed cycles/stats:
   "workspace":      "/Users/alice/projects/code-ranker",
   "target":         "/Users/alice/projects/axum-api",
   "plugin":         "rust",
-  "versions": { "code-ranker": "3.0.1", "rustc": "1.78.0" },
+  "versions": { "code-ranker": "3.0.2", "rustc": "1.78.0" },
   "roots": {
     "registry": "/Users/alice/.cargo/registry/src/index.crates.io-abc123",
     "target":   "/Users/alice/projects/axum-api"

diff --git a/docs/PRD.md b/docs/PRD.md
@@ -269,7 +269,7 @@ bundles its semantics dictionaries with the structural graph and computed data
   "target":    "/Users/alice/projects/axum-api",
   "plugin": "rust",
   "config_file": "/Users/alice/projects/axum-api/code-ranker.toml",
-  "versions": { "code-ranker": "3.0.1", "rustc": "1.78.0" },
+  "versions": { "code-ranker": "3.0.2", "rustc": "1.78.0" },
   "roots": {
     "registry": "/Users/alice/.cargo/registry/src/index.crates.io-abc123",
     "target":   "/Users/alice/projects/axum-api"
@@ -729,7 +729,7 @@ can render any language/metric set without hardcoding names.
   "workspace":      "<absolute-path>",
   "target":         "<absolute-path>",
   "plugin":         "<plugin-id>",
-  "versions":       { "code-ranker": "3.0.1", "rustc": "1.78.0" },
+  "versions":       { "code-ranker": "3.0.2", "rustc": "1.78.0" },
   "roots":          { "target": "<abs>", "registry": "<abs>" },
   "git":            { "branch": "main", "commit": "a3f9c21b4d5e", "dirty_files": 0, "origin": "git@…:team/proj.git" },
   "timings":        [ { "stage": "rust", "ms": 0, "detail": "…" }, … ],

diff --git a/docs/customization/cel-reference.md b/docs/customization/cel-reference.md
@@ -291,10 +291,18 @@ message = "test/source ratio too high ({tloc}/{sloc})"
 when    = 'depends_on("ext:sqlx")'
 message = "imports the sqlx crate directly"
 
-# Check: list macros over the dependency set
-[rules.checks.wide_hub]
-when    = "deps.size() > 20"
-message = "depends on {fan_out} modules — a coupling hub"
+# Check: a list-comprehension macro over the dependency set. `filter` is the
+# macro; `size()` is the collection function that counts the result. (A bare
+# `deps.size() > 20` needs no macro and just equals `fan_out > 20`.)
+[rules.checks.wide_ext_hub]
+when    = 'deps.filter(d, d.startsWith("ext:")).size() > 20'
+message = "{name}: depends on many external crates — a coupling hub"
+
+# Metric: the same macro in a formula. Graph lists (`deps`/`files`/…) are
+# checks-only (§4.2), so a metric macro runs over a *literal* list — here the
+# file's own complexity signals — counting how many exceed a floor.
+[metrics.complexity_signals]
+formula_cel = "[cyclomatic, cognitive, branches].filter(x, x > 10.0).size().double()"
 
 # Check: relative threshold (this node vs the project distribution)
 [rules.checks.complexity_outlier]
@@ -304,6 +312,36 @@ message = "{name}: cyclomatic {cyclomatic} is in the project's worst 10%"
 # Metric: branch on path (blank the metric for generated code)
 [metrics.real_hk]
 formula_cel = 'path.contains("/generated/") ? 0.0 : hk'
+
+# Metrics: size-normalized complexity — branching *per 100 source lines*. A raw
+# `cognitive`/`cyclomatic` count just tracks size; dividing by `sloc` measures
+# DENSITY, in intuitive units (e.g. 42 = 42 points of cognitive load per 100 lines).
+# Guard the divide (`sloc == 0 -> 0`).
+[metrics.cognitive_per_100sloc]
+formula_cel = "sloc > 0.0 ? cognitive / sloc * 100.0 : 0.0"
+
+[metrics.cyclomatic_per_100sloc]
+formula_cel = "sloc > 0.0 ? cyclomatic / sloc * 100.0 : 0.0"
+
+# Check: a SHORT-but-DENSE file — the most complexity packed into the fewest lines,
+# judged RELATIVE to this repo (no fixed number ports across codebases). Custom
+# `[metrics]` are aggregatable, so we threshold each density against its own p90:
+#   1. top-decile cognitive density    cognitive_per_100sloc  > p90
+#   2. top-decile branching density     cyclomatic_per_100sloc > p90
+#   3. genuinely short                   sloc < project median  → true density, not bulk
+# (3) is what excludes large-and-dense files: a 200-line file can top the density
+# deciles yet isn't "short". A multi-line `when` (TOML `'''…'''`) stays readable —
+# CEL ignores the newlines; a node missing an attr just doesn't fire (never errors).
+[rules.checks.dense_complexity]
+when = '''
+  cognitive_per_100sloc  > agg('cognitive_per_100sloc',  'p90', 'not_empty') &&
+  cyclomatic_per_100sloc > agg('cyclomatic_per_100sloc', 'p90', 'not_empty') &&
+  sloc.double() < agg('sloc', 'p50', 'not_empty')
+'''
+message = "{name}: dense complexity — {cognitive} cognitive / {cyclomatic} cyclomatic packed into {sloc} sloc (top-decile density for this repo)"
+why     = "High branching crammed into few lines reads as clever but is hard to follow and test."
+fix     = "Extract the nested branches into named helpers — trade a few more lines for lower per-line complexity."
+group   = "SRP"
 ```
 
 ---