From 49ba90090f67bacec1ebdd0bdd81db9ed94dce22 Mon Sep 17 00:00:00 2001
From: t1mato <tnln3rd@gmail.com>
Date: Wed, 1 Jul 2026 15:13:35 -0700
Subject: [PATCH 1/4] ci: rename deep-researcher utils, add verify --skip-signing

Rename examples/deep-researcher/utils.py to deep_researcher_utils.py to
avoid basename collision with four other ASF-owned utils.py files that
Apache RAT skips. Update .rat-excludes and the application import accordingly.

Thread --skip-signing through cmd_verify in scripts/apache_release.py so
CI can run full artifact verification without GPG keys present.

Extend Apache RAT scanning to wheel (.whl) artifacts in addition to source
and sdist tarballs, so license header regressions in packaged files are
caught before the release vote.

Part of #747
---
 .rat-excludes                                 |  9 +++-----
 examples/deep-researcher/application.py       |  4 ++--
 .../{utils.py => deep_researcher_utils.py}    |  0
 scripts/apache_release.py                     |  8 ++++++-
 tests/test_apache_release.py                  | 22 +++++++++++++++++++
 5 files changed, 34 insertions(+), 9 deletions(-)
 rename examples/deep-researcher/{utils.py => deep_researcher_utils.py} (100%)

diff --git a/.rat-excludes b/.rat-excludes
index 5ef43f57f..5609dd080 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -44,15 +44,12 @@
 
 # Third-party MIT-licensed files (attributed in LICENSE).
 # Most names are unique within the repo so basename matching is safe.
-# Known collisions:
-#   - utils.py: also matches our own ASF code in burr/tracking/, etc.
-#     (4 other utils.py files; all currently have ASF headers)
+# Known collision:
 #   - button.tsx: also matches telemetry/ui/src/components/common/button.tsx
 #     (our own ASF code with header)
-# A future regression in any of those collision targets would silently pass
-# RAT. Tracked as a follow-up to rename or restructure.
+# A future regression in that collision target would silently pass RAT.
 **/prompts.py
-**/utils.py
+**/deep_researcher_utils.py
 **/animated-beam.tsx
 **/animated-shiny-text.tsx
 **/blur-fade.tsx
diff --git a/examples/deep-researcher/application.py b/examples/deep-researcher/application.py
index 9547998bd..f64afe174 100644
--- a/examples/deep-researcher/application.py
+++ b/examples/deep-researcher/application.py
@@ -32,9 +32,9 @@
     import prompts
 
 try:
-    utils = importlib.import_module("burr.examples.deep-researcher.utils")
+    utils = importlib.import_module("burr.examples.deep-researcher.deep_researcher_utils")
 except ModuleNotFoundError:
-    import utils
+    import deep_researcher_utils as utils
 
 
 @functools.lru_cache
diff --git a/examples/deep-researcher/utils.py b/examples/deep-researcher/deep_researcher_utils.py
similarity index 100%
rename from examples/deep-researcher/utils.py
rename to examples/deep-researcher/deep_researcher_utils.py
diff --git a/scripts/apache_release.py b/scripts/apache_release.py
index 1d823fe1c..4fd43bfd0 100644
--- a/scripts/apache_release.py
+++ b/scripts/apache_release.py
@@ -1385,6 +1385,7 @@ def cmd_verify(args) -> bool:
     """Handle 'verify' subcommand."""
     _print_section(f"Verifying Artifacts - v{args.version}-RC{args.rc_num}")
 
+    skip_signing = getattr(args, "skip_signing", False)
     artifacts = _collect_all_artifacts(args.version, args.artifacts_dir)
 
     if not artifacts:
@@ -1395,7 +1396,7 @@ def cmd_verify(args) -> bool:
     for artifact in artifacts:
         if artifact.endswith((".asc", ".sha512")):
             continue  # Skip signature/checksum files
-        if not _verify_artifact_complete(artifact):
+        if not _verify_artifact_complete(artifact, skip_signing=skip_signing):
             all_valid = False
 
     if all_valid:
@@ -1594,6 +1595,11 @@ def _build_parser() -> argparse.ArgumentParser:
     verify_parser.add_argument("version", help="Version")
     verify_parser.add_argument("rc_num", help="RC number")
     verify_parser.add_argument("--artifacts-dir", default="dist")
+    verify_parser.add_argument(
+        "--skip-signing",
+        action="store_true",
+        help="Skip GPG signature verification (for builds produced with --skip-signing).",
+    )
 
     # vote-email subcommand
     vote_email_parser = subparsers.add_parser("vote-email", help="Generate release vote email")
diff --git a/tests/test_apache_release.py b/tests/test_apache_release.py
index ac0e48245..f43867d75 100644
--- a/tests/test_apache_release.py
+++ b/tests/test_apache_release.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import hashlib
 import importlib.util
 import sys
 from argparse import Namespace
@@ -163,9 +164,30 @@ def fake_promote(source_url, target_url, message, apache_id, dry_run=False):
     assert release.cmd_promote(args) is True
     # only the RC is checked out; the release tree is never downloaded
     assert len(calls["checkout"]) == 1
+
     assert calls["checkout"][0][0].endswith("/0.42.0-incubating-RC1")
     source_url, target_url, message, apache_id, dry_run = calls["promote"]
     assert source_url.endswith("/0.42.0-incubating-RC1")
     assert target_url == "https://dist.apache.org/repos/dist/release/incubator/burr/0.42.0"
     assert apache_id == "hari"
     assert dry_run is True
+
+
+def test_verify_parser_accepts_skip_signing():
+    parser = release._build_parser()
+    args = parser.parse_args(["verify", "0.42.0", "0", "--skip-signing"])
+    assert args.skip_signing is True
+
+
+def test_cmd_verify_skip_signing_succeeds_without_asc_files(tmp_path):
+    version = "0.42.0"
+    content = b"fake artifact content"
+    sha = hashlib.sha512(content).hexdigest()
+
+    artifact_name = f"apache-burr-{version}-incubating-src.tar.gz"
+    (tmp_path / artifact_name).write_bytes(content)
+    (tmp_path / f"{artifact_name}.sha512").write_text(f"{sha}  {artifact_name}\n")
+    # No .asc file — simulates a --skip-signing build
+
+    args = Namespace(version=version, rc_num="0", artifacts_dir=str(tmp_path), skip_signing=True)
+    assert release.cmd_verify(args) is True

From 41c866cfbea1dd9b4423fcb6ca6c0775978dbaba Mon Sep 17 00:00:00 2001
From: t1mato <tnln3rd@gmail.com>
Date: Wed, 1 Jul 2026 15:14:09 -0700
Subject: [PATCH 2/4] ci: add smoke test improvements

Replace fixed time.sleep(2) with a polling loop on /api/v0/projects so
the smoke test waits only as long as necessary and fails fast if the
server process exits unexpectedly.

Launch the server in its own process group (start_new_session=True) and
send SIGTERM to the whole group on teardown so uvicorn child processes
are not orphaned.

Add GET / check to verify the UI is being served by the installed wheel.

Add --cleanup / --no-cleanup flag; defaults to cleanup locally but
preserves the workspace in GITHUB_ACTIONS so artifacts are available
for upload on failure.

Add tests/test_ci_smoke_server.py covering all new testable helpers.

Part of #747
---
 scripts/ci_smoke_server.py    | 119 +++++++++++++++++---
 tests/test_ci_smoke_server.py | 204 ++++++++++++++++++++++++++++++++++
 2 files changed, 307 insertions(+), 16 deletions(-)
 create mode 100644 tests/test_ci_smoke_server.py

diff --git a/scripts/ci_smoke_server.py b/scripts/ci_smoke_server.py
index ba51d6fa4..870cc80bb 100644
--- a/scripts/ci_smoke_server.py
+++ b/scripts/ci_smoke_server.py
@@ -36,6 +36,7 @@
 import argparse
 import json
 import os
+import shutil
 import signal
 import socket
 import subprocess
@@ -45,6 +46,7 @@
 import urllib.error
 import urllib.request
 from pathlib import Path
+from typing import Optional
 
 
 def _free_port() -> int:
@@ -79,7 +81,49 @@ def _poll_url(url: str, timeout_s: int = 30, server_proc: "subprocess.Popen | No
     return False
 
 
-def main() -> None:
+def _poll_projects(
+    base_url: str,
+    project_name: str,
+    timeout_s: int = 30,
+    server_proc: "subprocess.Popen | None" = None,
+) -> bool:
+    """Poll /api/v0/projects until project_name appears or timeout.
+
+    Returns True as soon as a 200 response lists a project whose "name" equals
+    project_name. Returns False if timeout_s elapses first, or immediately once
+    server_proc (when given) has exited. Connection errors, HTTP errors and
+    unparsable bodies are treated as "not ready yet" and retried after a 1s
+    pause, so a transient bad response cannot abort the wait with a traceback.
+    """
+    deadline = time.time() + timeout_s
+    while time.time() < deadline:
+        if server_proc is not None and server_proc.poll() is not None:
+            return False
+        try:
+            with urllib.request.urlopen(f"{base_url}/api/v0/projects", timeout=2) as resp:
+                if resp.status == 200:
+                    data = json.loads(resp.read().decode("utf-8"))
+                    if any(p.get("name") == project_name for p in data):
+                        return True
+        except (urllib.error.URLError, ConnectionResetError, TimeoutError, ValueError):
+            pass
+        time.sleep(1)
+    return False
+
+
+def _should_cleanup(explicit: Optional[bool]) -> bool:
+    """Return True if the work directory should be removed after the run.
+
+    Priority: explicit flag > GITHUB_ACTIONS env var > default (clean locally).
+    In GitHub Actions the workspace is preserved so the upload-artifact step
+    can capture it on failure; locally it is cleaned up by default.
+    """
+    if explicit is not None:
+        return explicit
+    return os.environ.get("GITHUB_ACTIONS") != "true"
+
+
+def _build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--wheel", required=True, help="Path to the wheel to smoke-test")
     parser.add_argument(
@@ -99,13 +143,28 @@ def main() -> None:
         default=45,
         help="Seconds to wait for the server to become ready",
     )
-    args = parser.parse_args()
+    parser.add_argument(
+        "--cleanup",
+        action=argparse.BooleanOptionalAction,
+        default=None,
+        help=(
+            "Remove work directory after run. "
+            "Defaults to True locally and False in GitHub Actions "
+            "(so CI can upload the workspace as a debug artifact)."
+        ),
+    )
+    return parser
+
+
+def main() -> None:
+    args = _build_parser().parse_args()
 
     wheel_path = Path(args.wheel).resolve()
     if not wheel_path.is_file():
         _fail(f"Wheel not found: {wheel_path}")
 
     port = args.port if args.port else _free_port()
+    should_cleanup = _should_cleanup(args.cleanup)
 
     # Fresh working dirs, outside of any source tree
     work_dir = Path(tempfile.mkdtemp(prefix="burr-smoke-"))
@@ -118,6 +177,7 @@ def main() -> None:
     _log(f"Workspace: {work_dir}")
     _log(f"Python: {args.python}")
     _log(f"Wheel: {wheel_path}")
+    _log(f"Cleanup after run: {should_cleanup}")
 
     server_proc = None
     try:
@@ -149,6 +209,9 @@ def main() -> None:
         )
 
         # 4. Start server from outside the source tree so CWD can't shadow the install.
+        # start_new_session=True puts the server and all its children (uvicorn) into a
+        # dedicated process group. This lets us send SIGTERM to the entire group on
+        # teardown, preventing orphaned uvicorn processes from holding the port.
         _log(f"Starting burr server on port {port}...")
         env = os.environ.copy()
         env["burr_path"] = str(burr_data_dir)
@@ -160,6 +223,7 @@ def main() -> None:
                 env=env,
                 stdout=log_fh,
                 stderr=subprocess.STDOUT,
+                start_new_session=True,
             )
 
         base_url = f"http://127.0.0.1:{port}"
@@ -173,7 +237,15 @@ def main() -> None:
             _fail("Server did not become ready")
         _log("Server is up")
 
-        # 5. Run a tracked Burr app as a separate process using the venv.
+        # 5. Verify the UI is served: a wheel without the frontend build 404s on GET /.
+        _log("Checking UI is served at GET /...")
+        try:
+            urllib.request.urlopen(f"{base_url}/", timeout=5).close()
+        except urllib.error.URLError as exc:  # urlopen raises HTTPError (a URLError) on 404
+            _fail(f"GET / failed ({exc}), expected HTTP 200 — UI may be missing from wheel")
+        _log("UI served correctly")
+
+        # 6. Run a tracked Burr app as a separate process using the venv.
         _log("Running tracked Burr app...")
         app_script.write_text(
             f"""\
@@ -207,27 +279,42 @@ def inc(state: State) -> State:
         )
         subprocess.run([str(venv_py), str(app_script)], check=True, cwd=str(work_dir), env=env)
 
-        # 6. Verify the server sees the project.
-        _log("Verifying server sees project 'ci-smoke-test'...")
-        time.sleep(2)  # give the server a moment to pick up the filesystem change
-        with urllib.request.urlopen(f"{base_url}/api/v0/projects", timeout=5) as resp:
-            data = json.loads(resp.read().decode("utf-8"))
-        names = [p.get("name") for p in data]
-        if "ci-smoke-test" not in names:
-            _fail(f"Project 'ci-smoke-test' not found. Projects seen: {names}")
-        _log(f"Projects: {names}")
+        # 7. Poll until the server reports the project. The server discovers tracking
+        # data from the filesystem on demand, so there is a short lag after the app
+        # writes its data. Polling is preferable to a fixed sleep: it succeeds as soon
+        # as the data appears and gives a clear failure message on timeout.
+        _log("Waiting for server to report project 'ci-smoke-test'...")
+        if not _poll_projects(
+            base_url, "ci-smoke-test", timeout_s=30, server_proc=server_proc
+        ):
+            if server_proc.poll() is not None:
+                _log(f"Server process exited with code {server_proc.returncode}")
+            _log("--- server log ---")
+            print(server_log.read_text(), flush=True)
+            _log("--- end server log ---")
+            _fail("Project 'ci-smoke-test' never appeared in /api/v0/projects")
 
         _log("SUCCESS")
     finally:
         if server_proc is not None and server_proc.poll() is None:
-            _log("Stopping server...")
-            server_proc.send_signal(signal.SIGTERM)
+            _log("Stopping server (sending SIGTERM to process group)...")
+            try:
+                # Kill the entire process group so uvicorn (a child of burr) is also
+                # terminated. Without this, uvicorn becomes an orphan that holds the
+                # port and consumes resources after the script exits.
+                os.killpg(os.getpgid(server_proc.pid), signal.SIGTERM)
+            except ProcessLookupError:
+                pass  # process group already gone
             try:
                 server_proc.wait(timeout=10)
             except subprocess.TimeoutExpired:
                 server_proc.kill()
-        # Leave work_dir intact in CI (uploadable as artifact); also leave it locally
-        # on failure for easier debugging. Caller can rm -rf /tmp/burr-smoke-* to clean up.
+
+        if should_cleanup:
+            _log(f"Cleaning up workspace {work_dir} ...")
+            shutil.rmtree(work_dir, ignore_errors=True)
+        else:
+            _log(f"Workspace preserved at {work_dir} (upload as CI artifact if needed)")
 
 
 if __name__ == "__main__":
diff --git a/tests/test_ci_smoke_server.py b/tests/test_ci_smoke_server.py
new file mode 100644
index 000000000..5910b6be4
--- /dev/null
+++ b/tests/test_ci_smoke_server.py
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import importlib.util
+import sys
+from pathlib import Path
+
+import pytest
+
+
+def _load_smoke_module():
+    module_path = Path(__file__).resolve().parent.parent / "scripts" / "ci_smoke_server.py"
+    spec = importlib.util.spec_from_file_location("ci_smoke_server", module_path)
+    module = importlib.util.module_from_spec(spec)
+    assert spec.loader is not None
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+smoke = _load_smoke_module()
+
+
+# ---------------------------------------------------------------------------
+# _should_cleanup: pure function mapping (explicit flag, env) → bool
+# ---------------------------------------------------------------------------
+
+
+def test_should_cleanup_defaults_true_outside_ci(monkeypatch):
+    """When GITHUB_ACTIONS is not set, default is to clean up (saves disk space)."""
+    monkeypatch.delenv("GITHUB_ACTIONS", raising=False)
+    assert smoke._should_cleanup(explicit=None) is True
+
+
+def test_should_cleanup_defaults_false_in_ci(monkeypatch):
+    """When GITHUB_ACTIONS=true, default is to preserve workspace for artifact upload."""
+    monkeypatch.setenv("GITHUB_ACTIONS", "true")
+    assert smoke._should_cleanup(explicit=None) is False
+
+
+def test_should_cleanup_explicit_true_overrides_ci(monkeypatch):
+    """--cleanup flag forces cleanup even inside GitHub Actions."""
+    monkeypatch.setenv("GITHUB_ACTIONS", "true")
+    assert smoke._should_cleanup(explicit=True) is True
+
+
+def test_should_cleanup_explicit_false_overrides_local(monkeypatch):
+    """--no-cleanup flag preserves workspace even outside CI."""
+    monkeypatch.delenv("GITHUB_ACTIONS", raising=False)
+    assert smoke._should_cleanup(explicit=False) is False
+
+
+# ---------------------------------------------------------------------------
+# _build_parser: argument parsing for --cleanup / --no-cleanup
+# ---------------------------------------------------------------------------
+
+
+def test_parser_cleanup_flag_sets_true():
+    """--cleanup sets args.cleanup to True."""
+    parser = smoke._build_parser()
+    args = parser.parse_args(["--wheel", "fake.whl", "--cleanup"])
+    assert args.cleanup is True
+
+
+def test_parser_no_cleanup_flag_sets_false():
+    """--no-cleanup sets args.cleanup to False."""
+    parser = smoke._build_parser()
+    args = parser.parse_args(["--wheel", "fake.whl", "--no-cleanup"])
+    assert args.cleanup is False
+
+
+def test_parser_cleanup_defaults_to_none():
+    """Without either flag, args.cleanup is None (deferred to _should_cleanup)."""
+    parser = smoke._build_parser()
+    args = parser.parse_args(["--wheel", "fake.whl"])
+    assert args.cleanup is None
+
+
+# ---------------------------------------------------------------------------
+# _poll_projects: polls /api/v0/projects until named project appears
+# ---------------------------------------------------------------------------
+
+
+def test_poll_projects_returns_true_when_project_found(monkeypatch):
+    """Returns True immediately once the target project name appears in the response."""
+    calls = []
+
+    def fake_urlopen(url, timeout=None):
+        calls.append(url)
+
+        class FakeResp:
+            status = 200
+
+            def read(self):
+                import json
+                return json.dumps([{"name": "ci-smoke-test"}, {"name": "other"}]).encode()
+
+            def __enter__(self):
+                return self
+
+            def __exit__(self, *_):
+                pass
+
+        return FakeResp()
+
+    monkeypatch.setattr(smoke.urllib.request, "urlopen", fake_urlopen)
+    result = smoke._poll_projects("http://127.0.0.1:9999", "ci-smoke-test", timeout_s=5)
+    assert result is True
+    assert len(calls) == 1
+
+
+def test_poll_projects_returns_false_on_timeout(monkeypatch):
+    """Returns False when every attempt fails and the deadline then passes."""
+    sleeps = []
+
+    def fake_urlopen(url, timeout=None):
+        raise smoke.urllib.error.URLError("connection refused")
+
+    monkeypatch.setattr(smoke.urllib.request, "urlopen", fake_urlopen)
+    monkeypatch.setattr(smoke.time, "sleep", sleeps.append)
+    monkeypatch.setattr(smoke.time, "time", _make_deadline_clock(budget=0.0))  # ticks 0, 1, 2
+    assert smoke._poll_projects("http://127.0.0.1:9999", "ci-smoke-test", timeout_s=2) is False
+    assert len(sleeps) == 1  # deadline is 2, so exactly one (failed) attempt ran before expiry
+
+
+def test_poll_projects_returns_false_when_server_proc_exits(monkeypatch):
+    """Returns False immediately if the server process has already exited."""
+
+    class FakeProc:
+        def poll(self):
+            return 1  # non-None → process is dead
+
+    monkeypatch.setattr(smoke.urllib.request, "urlopen", lambda *a, **kw: (_ for _ in ()).throw(AssertionError("should not reach urlopen")))
+
+    result = smoke._poll_projects(
+        "http://127.0.0.1:9999", "ci-smoke-test", timeout_s=5, server_proc=FakeProc()
+    )
+    assert result is False
+
+
+def test_poll_projects_keeps_trying_until_project_appears(monkeypatch):
+    """Retries when project is absent, then succeeds once it appears."""
+    import json
+    responses = [
+        json.dumps([]).encode(),
+        json.dumps([{"name": "other"}]).encode(),
+        json.dumps([{"name": "ci-smoke-test"}]).encode(),
+    ]
+    call_count = [0]
+
+    def fake_urlopen(url, timeout=None):
+        class FakeResp:
+            status = 200
+
+            def read(self):
+                idx = min(call_count[0], len(responses) - 1)
+                call_count[0] += 1
+                return responses[idx]
+
+            def __enter__(self):
+                return self
+
+            def __exit__(self, *_):
+                pass
+
+        return FakeResp()
+
+    monkeypatch.setattr(smoke.urllib.request, "urlopen", fake_urlopen)
+    monkeypatch.setattr(smoke.time, "sleep", lambda _: None)
+
+    result = smoke._poll_projects("http://127.0.0.1:9999", "ci-smoke-test", timeout_s=30)
+    assert result is True
+    assert call_count[0] == 3
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_deadline_clock(budget: float):
+    """Return a fake time.time() that starts at 0.0 and advances `budget + 1.0` per call."""
+    start = [0.0]
+
+    def _fake_time():
+        val = start[0]
+        start[0] += budget + 1.0
+        return val
+
+    return _fake_time

From e6bf7d74643c4dfde36c654438009dfd7fc48bbf Mon Sep 17 00:00:00 2001
From: t1mato <tnln3rd@gmail.com>
Date: Wed, 1 Jul 2026 15:14:43 -0700
Subject: [PATCH 3/4] ci: close CI coverage gaps

Add _wheel_content_hashes and _compare_wheel_contents to
verify_apache_artifacts.py to compare wheels by file content hashes
rather than binary equality (zip timestamps make byte-for-byte
comparison unreliable). Add compare-wheels subcommand exposing this
from the CLI.

Add bare-install job: installs the wheel without optional extras and
imports core symbols to catch accidental leakage of optional
dependencies into core code.

Add sdist-wheel-equivalence job: extracts the sdist tarball, rebuilds
the wheel from it (including the npm frontend build), and compares
content hashes against the CI-built wheel to catch files missing from
the sdist.

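Pin the Apache RAT JAR download with a SHA256 checksum to guard
against supply-chain tampering.

Add a weekly scheduled run (Mondays 09:00 UTC) so dependency breakage
between releases is caught on main.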

Part of #747
---
 .github/workflows/release-validation.yml | 127 ++++++++++++++++++-
 scripts/verify_apache_artifacts.py       | 107 ++++++++++++++--
 tests/test_verify_apache_artifacts.py    | 148 ++++++++++++++++++++++-
 3 files changed, 364 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/release-validation.yml b/.github/workflows/release-validation.yml
index ba1a6c5a4..0e0a64343 100644
--- a/.github/workflows/release-validation.yml
+++ b/.github/workflows/release-validation.yml
@@ -35,6 +35,9 @@ on:
       - 'v*.*.*-incubating-RC*'
   pull_request:
     types: [opened, synchronize, reopened]
+  schedule:
+    # Weekly run against main: catches dependency breakage between releases.
+    - cron: '0 9 * * 1'
   workflow_dispatch:
 
 concurrency:
@@ -113,8 +116,13 @@ jobs:
         if: steps.cache-rat.outputs.cache-hit != 'true'
         run: |
           mkdir -p ~/.cache/apache-rat
-          curl -fL -o ~/.cache/apache-rat/apache-rat-0.18.jar \
+          JAR="$HOME/.cache/apache-rat/apache-rat-0.18.jar"
+          curl -fL -o "$JAR" \
             https://repo1.maven.org/maven2/org/apache/rat/apache-rat/0.18/apache-rat-0.18.jar
+          # Verify integrity: SHA256 computed from the official Maven Central download
+          # and cross-checked against Maven Central's published SHA1.
+          echo "fe513ddd10cdc07e965ba430f2c093d8745ff24a0fb54efe0933653752c53301  $JAR" \
+            | sha256sum --check
 
       - name: Extract version
         id: version
@@ -194,13 +202,116 @@ jobs:
           retention-days: 7
           if-no-files-found: ignore
 
+  # Installs the wheel without any optional extras ([learn], etc.) and imports
+  # core symbols. Catches accidental leakage of optional dependencies into core
+  # code — a bare `pip install apache-burr` user would hit an ImportError that
+  # the [learn] smoke test would never see.
+  bare-install:
+    name: "Release Validation / bare-install"
+    needs: [check-paths, build-artifacts]
+    if: needs.check-paths.outputs.should_run == 'true'
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+
+      - name: Download release artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: release-artifacts
+          path: dist
+
+      - name: Install wheel without optional extras
+        env:
+          BURR_VERSION: ${{ needs.build-artifacts.outputs.version }}
+        run: |
+          pip install "dist/apache_burr-${BURR_VERSION}-py3-none-any.whl"
+
+      - name: Verify core imports succeed without optional dependencies
+        run: |
+          python -c "
+          import burr
+          from burr.core import ApplicationBuilder, State
+          from burr.core.action import action
+          print('Core imports OK')
+          "
+
+  # Extracts the sdist tarball, rebuilds the wheel from it (including the
+  # frontend npm build), then compares the resulting wheel's file contents
+  # against the release wheel using content hashes. Catches cases where the
+  # sdist is missing files that the direct wheel build includes.
+  sdist-wheel-equivalence:
+    name: "Release Validation / sdist-wheel-equivalence"
+    needs: [check-paths, build-artifacts]
+    if: needs.check-paths.outputs.should_run == 'true'
+    runs-on: ubuntu-latest
+    timeout-minutes: 25
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.12'
+          cache: pip
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+          cache: npm
+          cache-dependency-path: telemetry/ui/package-lock.json
+
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: '17'
+
+      - name: Install system deps
+        run: sudo apt-get update && sudo apt-get install -y --no-install-recommends graphviz
+
+      - name: Install Python build deps
+        run: pip install flit twine jinja2
+
+      - name: Download release artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: release-artifacts
+          path: dist
+
+      - name: Extract sdist and build wheel from it
+        env:
+          BURR_VERSION: ${{ needs.build-artifacts.outputs.version }}
+        run: |
+          mkdir -p /tmp/sdist-extract /tmp/sdist-wheel
+          tar -xzf "dist/apache-burr-${BURR_VERSION}-incubating-sdist.tar.gz" \
+            -C /tmp/sdist-extract
+          # Find the single top-level directory the tarball extracted into
+          SDIST_ROOT=$(find /tmp/sdist-extract -maxdepth 1 -mindepth 1 -type d | head -1)
+          cd "$SDIST_ROOT"
+          # Build wheel from within the extracted sdist. The sdist contains the
+          # React frontend source (telemetry/ui/) but not the compiled output,
+          # so the full npm build runs here — same as the original build.
+          python scripts/apache_release.py wheel "$BURR_VERSION" 0 \
+            --skip-signing --output-dir /tmp/sdist-wheel
+
+      - name: Compare sdist-built wheel against release wheel
+        env:
+          BURR_VERSION: ${{ needs.build-artifacts.outputs.version }}
+        run: |
+          python scripts/verify_apache_artifacts.py compare-wheels \
+            "dist/apache_burr-${BURR_VERSION}-py3-none-any.whl" \
+            "/tmp/sdist-wheel/apache_burr-${BURR_VERSION}-py3-none-any.whl"
+
   # Single stable required-check name. Always runs (if: always()) so it produces
   # a definite SUCCESS or FAILURE — never SKIPPED. Branch protection in
   # .asf.yaml requires this context, not the underlying jobs, so path-filtered
   # docs/website PRs (where the upstream jobs are skipped) still go green here.
   summary:
     name: "Release Validation / summary"
-    needs: [check-paths, build-artifacts, install-and-smoke]
+    needs: [check-paths, build-artifacts, install-and-smoke, bare-install, sdist-wheel-equivalence]
     if: always()
     runs-on: ubuntu-latest
     timeout-minutes: 2
@@ -210,13 +321,17 @@ jobs:
           CHECK_PATHS: ${{ needs.check-paths.result }}
           BUILD_ARTIFACTS: ${{ needs.build-artifacts.result }}
           INSTALL_AND_SMOKE: ${{ needs.install-and-smoke.result }}
+          BARE_INSTALL: ${{ needs.bare-install.result }}
+          SDIST_WHEEL_EQUIV: ${{ needs.sdist-wheel-equivalence.result }}
         run: |
-          echo "check-paths:       $CHECK_PATHS"
-          echo "build-artifacts:   $BUILD_ARTIFACTS"
-          echo "install-and-smoke: $INSTALL_AND_SMOKE"
+          echo "check-paths:             $CHECK_PATHS"
+          echo "build-artifacts:         $BUILD_ARTIFACTS"
+          echo "install-and-smoke:       $INSTALL_AND_SMOKE"
+          echo "bare-install:            $BARE_INSTALL"
+          echo "sdist-wheel-equivalence: $SDIST_WHEEL_EQUIV"
           # Pass if every needed job is success or skipped; fail if any
           # failed or was cancelled.
-          for r in "$CHECK_PATHS" "$BUILD_ARTIFACTS" "$INSTALL_AND_SMOKE"; do
+          for r in "$CHECK_PATHS" "$BUILD_ARTIFACTS" "$INSTALL_AND_SMOKE" "$BARE_INSTALL" "$SDIST_WHEEL_EQUIV"; do
             case "$r" in
               success|skipped) ;;
               *) echo "::error::Release Validation failed (one or more jobs not success/skipped)"; exit 1 ;;
diff --git a/scripts/verify_apache_artifacts.py b/scripts/verify_apache_artifacts.py
index 583991e48..03d60ab84 100755
--- a/scripts/verify_apache_artifacts.py
+++ b/scripts/verify_apache_artifacts.py
@@ -194,6 +194,46 @@ def _wheel_file_bytes(artifact_path: str) -> dict[str, bytes]:
         return {name: wheel.read(name) for name in wheel.namelist() if not name.endswith("/")}
 
 
+def _wheel_content_hashes(wheel_path: str) -> dict[str, str]:
+    """Return {member_path: sha256_hex} for the file members of a wheel.
+
+    Directory entries and the top-level ``*.dist-info/RECORD`` manifest (which only
+    restates other members' hashes) are skipped; any other RECORD file is hashed.
+    """
+    result: dict[str, str] = {}
+    with zipfile.ZipFile(wheel_path, "r") as zf:
+        for name in zf.namelist():
+            if name.endswith("/"):
+                continue  # directory entry — no content to hash
+            parts = PurePosixPath(name).parts
+            if len(parts) == 2 and parts[0].endswith(".dist-info") and parts[1] == "RECORD":
+                continue  # wheel manifest — not package content
+            result[name] = hashlib.sha256(zf.read(name)).hexdigest()
+    return result
+
+
+def _compare_wheel_contents(wheel_a: str, wheel_b: str) -> tuple[bool, list[str]]:
+    """Diff two wheels member-by-member using per-file SHA-256 digests.
+
+    Whole-archive checksums are unsuitable here: zip timestamps change between
+    builds even when the packaged files do not. Returns ``(identical, notes)``,
+    where ``notes`` holds one human-readable line per mismatching member.
+    """
+    left = _wheel_content_hashes(wheel_a)
+    right = _wheel_content_hashes(wheel_b)
+    label_left, label_right = os.path.basename(wheel_a), os.path.basename(wheel_b)
+    notes: list[str] = []
+    for member in sorted(left.keys() | right.keys()):
+        digest_left, digest_right = left.get(member), right.get(member)
+        if digest_right is None:
+            notes.append(f"only in {label_left}: {member}")
+        elif digest_left is None:
+            notes.append(f"only in {label_right}: {member}")
+        elif digest_left != digest_right:
+            notes.append(f"content differs: {member}")
+    return not notes, notes
+
+
 def _find_files_by_basename(file_bytes: dict[str, bytes], basename: str) -> list[str]:
     matches = []
     for path in file_bytes:
@@ -558,8 +598,12 @@ def _check_licenses_with_rat(
 
         print("  Extracting archive...")
         try:
-            with tarfile.open(artifact_path, "r:gz") as tar:
-                _safe_extract_tar(tar, extract_dir)
+            if artifact_path.endswith(".whl"):
+                with zipfile.ZipFile(artifact_path, "r") as whl:
+                    whl.extractall(extract_dir)
+            else:
+                with tarfile.open(artifact_path, "r:gz") as tar:
+                    _safe_extract_tar(tar, extract_dir)
             print("    ✓ Extracted to temp directory")
         except Exception as exc:
             print(f"    ✗ Error extracting archive: {exc}")
@@ -703,15 +747,17 @@ def verify_licenses(
         _fail("Java not found. Required for Apache RAT.")
 
     tar_artifacts = [name for name in _artifact_files(artifacts_dir) if name.endswith(".tar.gz")]
-    if not tar_artifacts:
-        print(f"⚠️  No tar.gz artifacts found in {artifacts_dir}")
-        summary.fail("Apache RAT", "no tar.gz artifacts found")
+    wheel_artifacts = [name for name in _artifact_files(artifacts_dir) if name.endswith(".whl")]
+    rat_artifacts = tar_artifacts + wheel_artifacts
+    if not rat_artifacts:
+        print(f"⚠️  No tar.gz or .whl artifacts found in {artifacts_dir}")
+        summary.fail("Apache RAT", "no tar.gz or .whl artifacts found")
         return False
 
-    print(f"Found {len(tar_artifacts)} tar.gz artifact(s) to check:\n")
+    print(f"Found {len(rat_artifacts)} artifact(s) to check ({len(tar_artifacts)} tarball(s), {len(wheel_artifacts)} wheel(s)):\n")
 
     all_valid = True
-    for artifact_name in tar_artifacts:
+    for artifact_name in rat_artifacts:
         artifact_path = os.path.join(artifacts_dir, artifact_name)
         report_name = artifact_name.replace(".tar.gz", "").replace(".", "-")
         if not _check_licenses_with_rat(
@@ -881,15 +927,19 @@ def _compare_rebuilt_artifacts(
                 all_valid = False
                 continue
             rebuilt_wheel = matching_wheels[0]
-            if _sha512_for_file(release_wheel) == _sha512_for_file(rebuilt_wheel):
-                summary.pass_(f"Rebuilt wheel checksum: {release_name}")
+            match, diffs = _compare_wheel_contents(release_wheel, rebuilt_wheel)
+            if match:
+                summary.pass_(f"Rebuilt wheel contents: {release_name}")
             else:
+                for diff in diffs[:5]:
+                    print(f"    {diff}")
                 summary.fail(
-                    f"Rebuilt wheel checksum: {release_name}", "rebuilt wheel differs from release"
+                    f"Rebuilt wheel contents: {release_name}",
+                    f"{len(diffs)} file(s) differ between release and rebuilt wheel",
                 )
                 all_valid = False
     else:
-        summary.skip("Rebuilt wheel checksum", "no release wheel found")
+        summary.skip("Rebuilt wheel contents", "no release wheel found")
 
     return all_valid
 
@@ -1116,6 +1166,32 @@ def cmd_all(args: argparse.Namespace) -> bool:
     return summary.ok
 
 
+def cmd_compare_wheels(args: argparse.Namespace) -> bool:
+    """Run the 'compare-wheels' subcommand and report whether the wheels match.
+
+    Both positional wheel paths must exist. Members are compared by content
+    hash, so zip timestamps and the RECORD manifest do not count as changes.
+    Returns True when the wheels are equivalent and False otherwise.
+    """
+    _print_section("Comparing Wheel Contents")
+    wheels = (args.wheel_a, args.wheel_b)
+    for candidate in wheels:
+        if not os.path.isfile(candidate):
+            _fail(f"Wheel not found: {candidate}")
+
+    for label, wheel in zip("AB", wheels):
+        print(f"  Wheel {label}: {os.path.basename(wheel)}")
+
+    same, differences = _compare_wheel_contents(*wheels)
+    if not same:
+        print(f"\n❌ Wheel contents differ ({len(differences)} difference(s)):")
+        for entry in differences:
+            print(f"    {entry}")
+        return False
+    print("\n✅ Wheel contents are equivalent (same files, same content)")
+    return True
+
+
 def cmd_list_contents(args: argparse.Namespace) -> None:
     list_contents(args.artifact)
 
@@ -1239,6 +1315,13 @@ def main() -> None:
         "--artifacts-dir", default="dist", help="Directory containing artifacts (default: dist)"
     )
 
+    compare_wheels_parser = subparsers.add_parser(
+        "compare-wheels",
+        help="Compare two wheel files by content hash (ignores zip metadata and RECORD)",
+    )
+    compare_wheels_parser.add_argument("wheel_a", help="Path to first wheel")
+    compare_wheels_parser.add_argument("wheel_b", help="Path to second wheel")
+
     args = parser.parse_args()
 
     success = False
@@ -1258,6 +1341,8 @@ def main() -> None:
             success = cmd_all(args)
         elif args.command == "twine-check":
             success = cmd_twine_check(args)
+        elif args.command == "compare-wheels":
+            success = cmd_compare_wheels(args)
         else:
             _fail(f"Unknown command: {args.command}")
     except KeyboardInterrupt:
diff --git a/tests/test_verify_apache_artifacts.py b/tests/test_verify_apache_artifacts.py
index 795a4a8f6..bd48f6147 100644
--- a/tests/test_verify_apache_artifacts.py
+++ b/tests/test_verify_apache_artifacts.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import importlib.util
+import os
 import sys
 import tarfile
 import tempfile
@@ -157,7 +158,7 @@ def _fake_build(source_artifact: str, output_dir: str):
             for result in summary.results
         )
         assert any(
-            result.name == f"Rebuilt wheel checksum: {release_wheel.name}"
+            result.name == f"Rebuilt wheel contents: {release_wheel.name}"
             and result.status == verify.PASS
             for result in summary.results
         )
@@ -261,3 +262,148 @@ def test_artifact_files_ignores_rat_reports():
         artifact_files = verify._artifact_files(str(artifacts_dir))
 
         assert artifact_files == ["apache_burr-0.41.0-py3-none-any.whl"]
+
+
+def test_wheel_content_hashes_returns_sha256_per_file(tmp_path):
+    """Each wheel member is reported alongside the hex SHA-256 of its bytes."""
+    import hashlib
+    payload = b"hello burr"
+    expected = {"burr/__init__.py": hashlib.sha256(payload).hexdigest()}
+    archive = tmp_path / "test-1.0-py3-none-any.whl"
+    _write_wheel(archive, {"burr/__init__.py": payload})
+
+    digests = verify._wheel_content_hashes(str(archive))
+    assert digests == expected
+
+
+def test_wheel_content_hashes_excludes_record_file(tmp_path):
+    """The dist-info RECORD manifest never shows up in the hash map, while
+    ordinary package files packed next to it still do."""
+    archive = tmp_path / "test-1.0-py3-none-any.whl"
+    members = {
+        "burr/__init__.py": b"code",
+        "burr-1.0.dist-info/RECORD": b"burr/__init__.py,sha256=abc,4\n",
+    }
+    _write_wheel(archive, members)
+
+    digests = verify._wheel_content_hashes(str(archive))
+
+    assert "burr/__init__.py" in digests and "burr-1.0.dist-info/RECORD" not in digests
+
+
+def test_wheel_content_hashes_excludes_directory_entries(tmp_path):
+    """Zip members ending in "/" are directories and must not be hashed."""
+    archive = tmp_path / "test-1.0-py3-none-any.whl"
+    members = {
+        "burr/": b"",
+        "burr/__init__.py": b"code",
+    }
+    _write_wheel(archive, members)
+
+    digests = verify._wheel_content_hashes(str(archive))
+    assert "burr/" not in digests
+    assert "burr/__init__.py" in digests
+
+
+def test_compare_wheel_contents_returns_true_for_identical_content(tmp_path):
+    """Wheels packing identical files with identical bytes are reported equal."""
+    members = {"burr/__init__.py": b"code", "burr/core.py": b"more code"}
+    first, second = tmp_path / "a.whl", tmp_path / "b.whl"
+    for archive in (first, second):
+        _write_wheel(archive, members)
+
+    outcome = verify._compare_wheel_contents(str(first), str(second))
+
+    same, notes = outcome
+    assert same is True
+    assert notes == []
+
+
+def test_compare_wheel_contents_ignores_record_differences(tmp_path):
+    """A mismatch confined to the dist-info RECORD manifests is not a difference."""
+    first, second = tmp_path / "a.whl", tmp_path / "b.whl"
+    record_name = "burr-1.0.dist-info/RECORD"
+    _write_wheel(
+        first,
+        {"burr/__init__.py": b"code", record_name: b"burr/__init__.py,sha256=aaa,4\n"},
+    )
+    _write_wheel(
+        second,
+        {"burr/__init__.py": b"code", record_name: b"burr/__init__.py,sha256=bbb,4\n"},
+    )
+
+    same, notes = verify._compare_wheel_contents(str(first), str(second))
+
+    assert same is True
+    assert notes == []
+
+
+def test_compare_wheel_contents_detects_content_difference(tmp_path):
+    """A member present in both wheels with differing bytes is flagged."""
+    first = tmp_path / "a.whl"
+    second = tmp_path / "b.whl"
+    for archive, body in ((first, b"version = '1'"), (second, b"version = '2'")):
+        _write_wheel(archive, {"burr/__init__.py": body})
+
+    same, notes = verify._compare_wheel_contents(str(first), str(second))
+
+    assert same is False
+    assert any("burr/__init__.py" in note for note in notes)
+
+
+def test_compare_wheel_contents_detects_file_missing_from_second_wheel(tmp_path):
+    """A file packed only into the first wheel is reported as a difference."""
+    first, second = tmp_path / "a.whl", tmp_path / "b.whl"
+    shared = {"burr/__init__.py": b"code"}
+    _write_wheel(first, {**shared, "burr/extra.py": b"bonus"})
+    _write_wheel(second, shared)
+
+    same, notes = verify._compare_wheel_contents(str(first), str(second))
+
+    assert same is False
+    assert any("burr/extra.py" in note for note in notes)
+
+
+def test_compare_wheel_contents_detects_file_missing_from_first_wheel(tmp_path):
+    """A file packed only into the second wheel is reported as a difference."""
+    first, second = tmp_path / "a.whl", tmp_path / "b.whl"
+    shared = {"burr/__init__.py": b"code"}
+    _write_wheel(first, shared)
+    _write_wheel(second, {**shared, "burr/extra.py": b"bonus"})
+
+    same, notes = verify._compare_wheel_contents(str(first), str(second))
+
+    assert same is False
+    assert any("burr/extra.py" in note for note in notes)
+
+
+def test_verify_licenses_runs_rat_on_wheel_in_addition_to_tarball(tmp_path, monkeypatch):
+    """Both the source tarball and the wheel in the directory are handed to Apache RAT."""
+    tarball = tmp_path / "apache-burr-0.42.0-incubating-src.tar.gz"
+    wheel = tmp_path / "apache_burr-0.42.0-py3-none-any.whl"
+    _write_tar_gz(tarball, "apache-burr-0.42.0-incubating-src", {"README.md": b"content"})
+    _write_wheel(wheel, {"burr/__init__.py": b"content"})
+
+    scanned = []
+
+    def record_rat_call(artifact_path, rat_jar, report_name, summary, report_only=False):
+        # Stand-in for the real RAT run: remember the artifact and report success.
+        scanned.append(artifact_path)
+        summary.pass_(f"RAT: {Path(artifact_path).name}")
+        return True
+
+    original_exists = os.path.exists
+
+    def jar_aware_exists(candidate):
+        # Pretend only the fake jar exists; defer to the filesystem otherwise.
+        return True if candidate == "/fake/rat.jar" else original_exists(candidate)
+
+    monkeypatch.setattr(verify, "_check_licenses_with_rat", record_rat_call)
+    monkeypatch.setattr(verify.shutil, "which", lambda _: "/usr/bin/java")
+    monkeypatch.setattr(verify.os.path, "exists", jar_aware_exists)
+
+    summary = verify.VerificationSummary()
+    outcome = verify.verify_licenses(str(tmp_path), "/fake/rat.jar", summary)
+
+    assert outcome is True
+    assert str(tarball) in scanned and str(wheel) in scanned

From 0115dd093a8bc9651e72e8a7d54be386c9adeb84 Mon Sep 17 00:00:00 2001
From: t1mato <tnln3rd@gmail.com>
Date: Wed, 1 Jul 2026 15:14:58 -0700
Subject: [PATCH 4/4] ci: add hygiene improvements

Add scripts/check_asf_headers.py: checks that Python, YAML, and shell
files carry the ASF license header. Reads .rat-excludes at runtime so
known third-party files are automatically respected without duplicating
the exclusion list.

Wire the script into .pre-commit-config.yaml as a local hook so missing
headers are caught before a commit lands.

Add weekly cron schedule (Monday 09:00 UTC) to release-validation.yml
so dependency drift against main is detected between releases.

Add tests/test_check_asf_headers.py with 15 tests covering all helper
functions and the main entry point.

Closes #747
---
 .pre-commit-config.yaml         |   9 ++
 scripts/check_asf_headers.py    | 134 ++++++++++++++++++++++++
 tests/test_check_asf_headers.py | 179 ++++++++++++++++++++++++++++++++
 3 files changed, 322 insertions(+)
 create mode 100644 scripts/check_asf_headers.py
 create mode 100644 tests/test_check_asf_headers.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 472ccd441..e6c86f3b9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -78,3 +78,12 @@ repos:
         entry: npx --prefix telemetry/ui lint-staged
         pass_filenames: false
         always_run: true
+    -   id: check-asf-headers
+        name: Check ASF license headers
+        language: python
+        entry: python scripts/check_asf_headers.py
+        # Run on Python, YAML, and shell files — the source types that must
+        # carry the Apache 2.0 header. Exclusions are read from .rat-excludes
+        # at runtime so known third-party files are automatically respected.
+        types_or: [python, yaml, shell]
+        pass_filenames: true
diff --git a/scripts/check_asf_headers.py b/scripts/check_asf_headers.py
new file mode 100644
index 000000000..15d44a2e5
--- /dev/null
+++ b/scripts/check_asf_headers.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Check that Python, YAML, and shell files carry the ASF license header.
+
+Called by pre-commit with the list of staged files. Reads .rat-excludes at
+runtime so known third-party files are automatically respected without any
+duplication of the exclusion list.
+
+Usage (pre-commit invokes this automatically):
+    python scripts/check_asf_headers.py file1.py file2.yml ...
+"""
+
+import sys
+from fnmatch import fnmatch
+from pathlib import Path
+from typing import Optional
+
+# Extensions whose source files must carry an ASF header.
+CHECKED_EXTENSIONS = {".py", ".yml", ".yaml", ".sh"}
+
+# Only search this many lines from the top of each file.
+# Headers are always at the start; searching the whole file would be slow
+# and would risk false positives from files that quote the license in prose.
+HEADER_SEARCH_LINES = 30
+
+# The one string that appears in every valid ASF license header regardless
+# of comment style (# for Python/YAML/shell, // for Java, /* for C, etc.).
+ASF_HEADER_MARKER = "Licensed to the Apache Software Foundation (ASF)"
+
+
+def _find_repo_root(start: Path) -> Path:
+    """Return the nearest ancestor of start holding .rat-excludes, else pyproject.toml, else start."""
+    here = start.resolve()
+    markers = (".rat-excludes", "pyproject.toml")  # .rat-excludes wins over a nested pyproject.toml
+    hits = [d for m in markers for d in (here, *here.parents) if (d / m).exists()]
+    return hits[0] if hits else here
+
+
+def _load_rat_exclude_patterns(repo_root: Path) -> list:
+    """Load the glob patterns listed in repo_root/.rat-excludes.
+
+    Blank lines and '#' comment lines are dropped; a missing file yields [].
+    """
+    excludes_file = repo_root / ".rat-excludes"
+    if not excludes_file.exists():
+        return []
+    stripped = (raw.strip() for raw in excludes_file.read_text(encoding="utf-8").splitlines())
+    return [entry for entry in stripped if entry and not entry.startswith("#")]
+
+
+def _is_excluded(file_path: Path, repo_root: Path, patterns: list) -> bool:
+    """Return True if file_path matches any pattern from .rat-excludes.
+
+    Paths are compared in POSIX form relative to repo_root. RAT's leading **/
+    means "at any depth, including the root", so the remainder of such a pattern
+    is also tried against the bare filename and against the relative path.
+    """
+    try:
+        rel = file_path.resolve().relative_to(repo_root.resolve()).as_posix()
+    except ValueError:  # file lives outside repo_root
+        rel = file_path.as_posix()
+    name = file_path.name
+    for pattern in patterns:
+        candidates = [(rel, pattern)]
+        if pattern.startswith("**/"):
+            # fnmatch needs a literal "/" for "**/", so root-level paths need the stripped form.
+            candidates += [(name, pattern[3:]), (rel, pattern[3:])]
+        if any(fnmatch(target, glob) for target, glob in candidates):
+            return True
+    return False
+
+
+def _has_asf_header(file_path: Path) -> bool:
+    """Report whether ASF_HEADER_MARKER occurs in the first HEADER_SEARCH_LINES lines.
+
+    Files that cannot be opened or read (OSError) count as lacking the header.
+    """
+    try:
+        with file_path.open(encoding="utf-8", errors="replace") as handle:
+            # Pairing with range() ends the scan once the search window is used up.
+            window = zip(range(HEADER_SEARCH_LINES), handle)
+            return any(ASF_HEADER_MARKER in text for _, text in window)
+    except OSError:
+        return False
+
+
+def main(argv: Optional[list] = None) -> int:
+    """Check the given paths (default: sys.argv[1:]); return 1 if any lacks the header, else 0."""
+    files = [Path(arg) for arg in (sys.argv[1:] if argv is None else argv)]
+    if not files:
+        return 0
+
+    # The first path's directory anchors the search for .rat-excludes.
+    repo_root = _find_repo_root(files[0].parent)
+    patterns = _load_rat_exclude_patterns(repo_root)
+
+    violations = [
+        candidate
+        for candidate in files
+        if candidate.suffix in CHECKED_EXTENSIONS
+        and not _is_excluded(candidate, repo_root, patterns)
+        and not _has_asf_header(candidate)
+    ]
+    if not violations:
+        return 0
+
+    print("Missing ASF license header in the following file(s):")
+    for offender in violations:
+        print(f"  {offender}")
+    print()
+    print("Add the standard Apache 2.0 header block to each file.")
+    print("See any existing .py file in scripts/ for the correct format.")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_check_asf_headers.py b/tests/test_check_asf_headers.py
new file mode 100644
index 000000000..cad6fa49d
--- /dev/null
+++ b/tests/test_check_asf_headers.py
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import importlib.util
+import sys
+from pathlib import Path
+
+import pytest
+
+
+def _load_module():
+    """Import scripts/check_asf_headers.py by file path and register it in sys.modules."""
+    script = Path(__file__).resolve().parents[1] / "scripts" / "check_asf_headers.py"
+    spec = importlib.util.spec_from_file_location("check_asf_headers", script)
+    assert spec.loader is not None
+    loaded = sys.modules[spec.name] = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(loaded)
+    return loaded
+
+
+chk = _load_module()
+
+ASF_HEADER = "# Licensed to the Apache Software Foundation (ASF) under one\n"
+
+
+# ---------------------------------------------------------------------------
+# _has_asf_header
+# ---------------------------------------------------------------------------
+
+
+def test_has_asf_header_returns_true_when_marker_present(tmp_path):
+    """The marker on the very first line is enough for the file to pass."""
+    source = tmp_path / "good.py"
+    source.write_text(f"{ASF_HEADER}print('hello')\n")
+    assert chk._has_asf_header(source) is True
+
+
+def test_has_asf_header_returns_false_when_marker_absent(tmp_path):
+    """Without the marker anywhere in the file, the check fails."""
+    source = tmp_path / "bad.py"
+    source.write_text("".join(["print('hello')", "\n"]))
+    assert chk._has_asf_header(source) is False
+
+
+def test_has_asf_header_only_searches_first_n_lines(tmp_path):
+    """A marker pushed just past the search window goes unnoticed."""
+    filler_lines = ["# padding\n"] * chk.HEADER_SEARCH_LINES
+    source = tmp_path / "late.py"
+    source.write_text("".join(filler_lines) + ASF_HEADER)
+    assert chk._has_asf_header(source) is False
+
+
+def test_has_asf_header_accepts_marker_anywhere_within_search_window(tmp_path):
+    """A header that follows a shebang line still sits inside the window."""
+    script = tmp_path / "script.sh"
+    script.write_text("".join(["#!/usr/bin/env bash\n", ASF_HEADER]))
+    assert chk._has_asf_header(script) is True
+
+
+# ---------------------------------------------------------------------------
+# _load_rat_exclude_patterns
+# ---------------------------------------------------------------------------
+
+
+def test_load_rat_exclude_patterns_strips_comments_and_blanks(tmp_path):
+    """Only real glob lines survive; comments (even indented) and blanks are dropped."""
+    lines = [
+        "# This is a comment",
+        "",
+        "**/prompts.py",
+        "  # indented comment",
+        "**/deep_researcher_utils.py",
+    ]
+    (tmp_path / ".rat-excludes").write_text("".join(f"{line}\n" for line in lines))
+    loaded = chk._load_rat_exclude_patterns(tmp_path)
+    assert loaded == ["**/prompts.py", "**/deep_researcher_utils.py"]
+
+
+def test_load_rat_exclude_patterns_returns_empty_when_file_missing(tmp_path):
+    """A directory without a .rat-excludes file produces an empty pattern list."""
+    assert chk._load_rat_exclude_patterns(tmp_path) == []
+
+
+# ---------------------------------------------------------------------------
+# _is_excluded
+# ---------------------------------------------------------------------------
+
+
+def test_is_excluded_matches_basename_glob(tmp_path):
+    """**/name.py excludes a file of that name no matter how deeply it is nested."""
+    nested = tmp_path / "examples" / "deep-researcher"
+    nested.mkdir(parents=True)
+    (nested / "prompts.py").touch()
+    assert chk._is_excluded(nested / "prompts.py", tmp_path, ["**/prompts.py"]) is True
+
+
+def test_is_excluded_returns_false_for_non_matching_file(tmp_path):
+    """A regular source file that no pattern covers stays subject to the check."""
+    package = tmp_path / "burr"
+    package.mkdir(parents=True)
+    (package / "core.py").touch()
+    assert chk._is_excluded(package / "core.py", tmp_path, ["**/prompts.py"]) is False
+
+
+def test_is_excluded_matches_extension_glob(tmp_path):
+    """The wildcard pattern **/*.json covers JSON files in subdirectories."""
+    folder = tmp_path / "some"
+    folder.mkdir(parents=True)
+    (folder / "config.json").touch()
+    assert chk._is_excluded(folder / "config.json", tmp_path, ["**/*.json"]) is True
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+
+def test_main_returns_0_with_no_files():
+    """An empty argument list means there is nothing to check, so main returns 0."""
+    assert chk.main([]) == 0
+
+
+def test_main_returns_0_when_all_files_have_headers(tmp_path):
+    """A single file carrying the header yields exit status 0."""
+    source = tmp_path / "good.py"
+    source.write_text(f"{ASF_HEADER}x = 1\n")
+    assert chk.main([str(source)]) == 0
+
+
+def test_main_returns_1_when_file_is_missing_header(tmp_path):
+    """A Python file passed in without the header yields exit status 1."""
+    source = tmp_path / "bad.py"
+    source.write_text("".join(["x = 1", "\n"]))
+    assert chk.main([str(source)]) == 1
+
+
+def test_main_skips_unchecked_extensions(tmp_path):
+    """Extensions outside the checked set (here .json) never count as violations."""
+    config = tmp_path / "config.json"
+    config.write_text("".join(["{}", "\n"]))
+    assert chk.main([str(config)]) == 0
+
+
+def test_main_skips_rat_excluded_files(tmp_path):
+    """Files covered by a .rat-excludes pattern pass even without a header."""
+    examples = tmp_path / "examples"
+    examples.mkdir()
+    # Headerless file that would be a violation if it were not excluded.
+    headerless = examples / "prompts.py"
+    headerless.write_text("SYSTEM_PROMPT = 'hello'\n")
+    (tmp_path / ".rat-excludes").write_text("**/prompts.py\n")
+    assert chk.main([str(headerless)]) == 0
+
+
+def test_main_reports_all_violations(tmp_path, capsys):
+    """Every headerless file is named in the output, not just the first one."""
+    contents = {"a.py": "x = 1\n", "b.yml": "key: value\n"}
+    paths = []
+    for filename, text in contents.items():
+        (tmp_path / filename).write_text(text)
+        paths.append(str(tmp_path / filename))
+    status = chk.main(paths)
+    printed = capsys.readouterr().out
+    assert status == 1
+    assert all(filename in printed for filename in contents)