From 9805c81f45d00b8f054b532aa88776e233363581 Mon Sep 17 00:00:00 2001 From: Bryan Beverly Date: Sun, 31 May 2026 00:27:49 -0700 Subject: [PATCH 1/3] Add domain/* labels and CODEOWNERS-based domain labeling Define 7 domain/* labels (scanning, findings, integrations, platform, frontend, infra, database) in labels.yml with a teal color palette. Extend pr_labeler.py to fetch CODEOWNERS from the caller repo via the Contents API, parse it with last-match-wins semantics, match changed files to owning teams, and apply/remove domain/* labels accordingly. This avoids depending on mutable PR reviewer state and works across all repos that have a CODEOWNERS file. Repos without CODEOWNERS gracefully skip domain labeling. Unknown team slugs (e.g. eng-leads) are filtered out so only recognized domain labels are applied. 81 tests passing (26 new for CODEOWNERS parsing, matching, domain reconciliation). Jira: PLAT-222 Co-authored-by: Cursor --- .github/scripts/pr_labeler.py | 166 ++++++++++++++++++++++++++++- .github/scripts/test_pr_labeler.py | 166 +++++++++++++++++++++++++++++ labels.yml | 21 ++++ 3 files changed, 350 insertions(+), 3 deletions(-) diff --git a/.github/scripts/pr_labeler.py b/.github/scripts/pr_labeler.py index 75875cc..3fc72c2 100644 --- a/.github/scripts/pr_labeler.py +++ b/.github/scripts/pr_labeler.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""PR labeler: compute size, risk, and template-field labels for one or more PRs. +"""PR labeler: compute size, risk, template-field, and domain labels for PRs. Inputs come from environment variables set by the calling workflow: GITHUB_REPOSITORY e.g. "owner/repo" (always set on Actions) @@ -18,7 +18,8 @@ The current format is preferred; the legacy format is matched as a fallback so PRs opened before the template change keep working until the queue rolls over. - 5. Reconciling with current labels and applying adds/removes via `gh pr edit`. + 5. Matching changed files against CODEOWNERS to apply domain/* labels. + 6. Reconciling with current labels and applying adds/removes via `gh pr edit`. For backfill mode (PR_NUMBER == "all"), per-PR failures are logged but do not abort the run, unless more than 10% of PRs fail. @@ -26,12 +27,14 @@ from __future__ import annotations +import base64 import json import os import re import subprocess import sys from dataclasses import dataclass, field +from fnmatch import fnmatch SIZE_LABELS = ["size/XS", "size/S", "size/M", "size/L", "size/XL"] RISK_LABELS = ["risk/low", "risk/medium", "risk/high"] @@ -51,6 +54,134 @@ # Conservative fallback for unmapped Bugbot levels (e.g., "Critical", "Minimal"). RISK_FALLBACK = "risk/high" +DOMAIN_LABEL_PREFIX = "domain/" +KNOWN_DOMAIN_SLUGS = frozenset( + ["scanning", "findings", "integrations", "platform", "frontend", "infra", "database"] +) + + +# --------------------------------------------------------------------------- +# CODEOWNERS parsing (last-match-wins per file, union across all files) +# --------------------------------------------------------------------------- + +CodeownersRule = tuple[str, list[str]] # (pattern, [team_slugs]) + + +def parse_codeowners(text: str) -> list[CodeownersRule]: + """Parse CODEOWNERS text into an ordered list of (pattern, teams) rules.""" + rules: list[CodeownersRule] = [] + for line in text.splitlines(): + line = line.split("#", 1)[0].strip() + if not line: + continue + tokens = line.split() + pattern = tokens[0] + slugs: list[str] = [] + for owner in tokens[1:]: + # @org/team -> team (lowercased) + if "/" in owner: + slugs.append(owner.rsplit("/", 1)[1].lower()) + else: + slugs.append(owner.lstrip("@").lower()) + rules.append((pattern, slugs)) + return rules + + +def _codeowners_match(pattern: str, filepath: str) -> bool: + """Test whether a CODEOWNERS pattern matches a file path. + + Implements GitHub's CODEOWNERS matching rules: + - ``*`` alone matches everything. + - A pattern starting with ``/`` is anchored to the repo root; the + leading ``/`` is stripped before matching. + - A pattern ending with ``/`` matches everything under that directory. + - A pattern containing an internal ``/`` (after stripping leading ``/``) + is implicitly anchored to the repo root. + - A pattern with no ``/`` at all matches by basename at any depth. + - A single ``*`` does not cross directory boundaries (unlike fnmatch); + use ``**`` to match across directories. + """ + if pattern == "*": + return True + + anchored = pattern.startswith("/") + p = pattern.lstrip("/") + + if p.endswith("/"): + p += "**" + + has_slash = "/" in p.rstrip("/") + + if anchored or has_slash: + return _gitignore_match(p, filepath) + + # No slash: match against basename at any depth. + basename = filepath.rsplit("/", 1)[-1] + return fnmatch(basename, p) + + +def _gitignore_match(pattern: str, filepath: str) -> bool: + """Match a pattern against a filepath where ``*`` does not cross ``/``. + + Splits both pattern and path on ``/`` and matches segment-by-segment. + ``**`` matches zero or more directory segments. + """ + return _segments_match(pattern.split("/"), filepath.split("/")) + + +def _segments_match(pat_parts: list[str], path_parts: list[str]) -> bool: + if not pat_parts: + return not path_parts + if pat_parts[0] == "**": + rest = pat_parts[1:] + # ** matches zero or more segments + for i in range(len(path_parts) + 1): + if _segments_match(rest, path_parts[i:]): + return True + return False + if not path_parts: + return False + if fnmatch(path_parts[0], pat_parts[0]): + return _segments_match(pat_parts[1:], path_parts[1:]) + return False + + +def domains_for_pr( + rules: list[CodeownersRule], changed_files: list[str] +) -> set[str]: + """Return the set of domain slugs that own any changed file.""" + teams: set[str] = set() + for filepath in changed_files: + matched_slugs: list[str] = [] + for pattern, slugs in rules: + if _codeowners_match(pattern, filepath): + matched_slugs = slugs + teams.update(matched_slugs) + return teams + + +def fetch_codeowners(repo: str) -> str | None: + """Fetch CODEOWNERS from the repo's default branch via the Contents API.""" + result = gh( + ["api", f"repos/{repo}/contents/CODEOWNERS", "--jq", ".content"], + check=False, + ) + if result.returncode != 0: + return None + try: + return base64.b64decode(result.stdout.strip()).decode() + except Exception: + return None + + +def fetch_pr_files(repo: str, pr_number: int) -> list[str]: + """Return the list of changed file paths for a PR.""" + result = gh( + ["pr", "view", str(pr_number), "--repo", repo, "--json", "files"], + ) + data = json.loads(result.stdout) + return [f["path"] for f in data.get("files", [])] + def yesno_regex(keyword: str) -> re.Pattern[str]: """Match the current template format and capture ``yes`` or ``no``. @@ -222,6 +353,7 @@ def reconcile( pr: dict, *, plan: LabelPlan, + domain_slugs: set[str] | None = None, ) -> None: current_labels = {label["name"] for label in pr.get("labels", [])} body = pr.get("body") or "" @@ -259,6 +391,19 @@ def reconcile( elif state == "off" and label in current_labels: plan.remove.append(label) + # Domain labels: add for matched teams, remove stale ones. + if domain_slugs is not None: + desired_domain = { + f"{DOMAIN_LABEL_PREFIX}{s}" for s in domain_slugs if s in KNOWN_DOMAIN_SLUGS + } + for slug in KNOWN_DOMAIN_SLUGS: + label = f"{DOMAIN_LABEL_PREFIX}{slug}" + if label in desired_domain: + if label not in current_labels: + plan.add.append(label) + elif label in current_labels: + plan.remove.append(label) + def apply(repo: str, plan: LabelPlan, dry_run: bool) -> None: if dry_run or (not plan.add and not plan.remove): @@ -294,6 +439,15 @@ def main() -> int: print(f"Processing {len(targets)} PR(s) in {repo} (dry_run={dry_run})") + # Fetch CODEOWNERS once per run (same for all PRs in this repo). + codeowners_text = fetch_codeowners(repo) + codeowners_rules: list[CodeownersRule] | None = None + if codeowners_text is not None: + codeowners_rules = parse_codeowners(codeowners_text) + print(f"Loaded {len(codeowners_rules)} CODEOWNERS rule(s) for domain labeling") + else: + print("No CODEOWNERS found; skipping domain labeling") + failures = 0 for pr_number in targets: plan = LabelPlan(pr_number=pr_number) @@ -302,7 +456,13 @@ def main() -> int: if pr.get("state") != "OPEN": print(f"PR #{pr_number} (skip: not open)") continue - reconcile(pr, plan=plan) + + domain_slugs: set[str] | None = None + if codeowners_rules is not None: + files = fetch_pr_files(repo, pr_number) + domain_slugs = domains_for_pr(codeowners_rules, files) + + reconcile(pr, plan=plan, domain_slugs=domain_slugs) apply(repo, plan, dry_run) print(plan.summary()) except subprocess.CalledProcessError as exc: diff --git a/.github/scripts/test_pr_labeler.py b/.github/scripts/test_pr_labeler.py index df1b404..1f03d74 100644 --- a/.github/scripts/test_pr_labeler.py +++ b/.github/scripts/test_pr_labeler.py @@ -390,3 +390,169 @@ def test_event_overridden_by_explicit(self, monkeypatch): def test_no_input_returns_empty(self): assert pr_labeler.determine_targets("repo", "", "") == [] + + +# ---- parse_codeowners ------------------------------------------------------- + + +class TestParseCodeowners: + def test_simple_catch_all(self): + rules = pr_labeler.parse_codeowners("* @org/scanning") + assert rules == [("*", ["scanning"])] + + def test_multiple_owners(self): + rules = pr_labeler.parse_codeowners( + "proto/ @org/integrations @org/scanning" + ) + assert rules == [("proto/", ["integrations", "scanning"])] + + def test_skips_comments_and_blanks(self): + text = "# comment\n\n* @org/eng-leads\n # indented comment\n/web/ @org/findings" + rules = pr_labeler.parse_codeowners(text) + assert len(rules) == 2 + assert rules[0] == ("*", ["eng-leads"]) + assert rules[1] == ("/web/", ["findings"]) + + def test_inline_comment_stripped(self): + rules = pr_labeler.parse_codeowners("/vendor/ @org/platform # vendored deps") + assert rules == [("/vendor/", ["platform"])] + + def test_owner_case_normalized(self): + rules = pr_labeler.parse_codeowners("* @org/Integrations") + assert rules[0][1] == ["integrations"] + + +# ---- _codeowners_match ------------------------------------------------------ + + +class TestCodeownersMatch: + def test_star_matches_everything(self): + assert pr_labeler._codeowners_match("*", "any/file.py") + assert pr_labeler._codeowners_match("*", "root.go") + + def test_anchored_dir(self): + assert pr_labeler._codeowners_match("/web/", "web/app.py") + assert pr_labeler._codeowners_match("/web/", "web/sub/deep.py") + assert not pr_labeler._codeowners_match("/web/", "other/web/app.py") + + def test_unanchored_dir_with_internal_slash(self): + # Pattern has internal slash -> implicitly anchored + assert pr_labeler._codeowners_match("pkg/engine/", "pkg/engine/scan.go") + assert not pr_labeler._codeowners_match("pkg/engine/", "other/pkg/engine/x.go") + + def test_anchored_glob(self): + assert pr_labeler._codeowners_match("/web/webapi/views/*.py", "web/webapi/views/foo.py") + assert not pr_labeler._codeowners_match( + "/web/webapi/views/*.py", "web/webapi/views/sub/foo.py" + ) + + def test_unanchored_basename(self): + assert pr_labeler._codeowners_match("go.sum", "go.sum") + assert pr_labeler._codeowners_match("go.sum", "vendor/somelib/go.sum") + + def test_basename_glob(self): + assert pr_labeler._codeowners_match("*.js", "frontend/app.js") + assert pr_labeler._codeowners_match("*.js", "app.js") + assert not pr_labeler._codeowners_match("*.js", "app.jsx") + + def test_deep_anchored_path(self): + assert pr_labeler._codeowners_match( + "/vendor/github.com/trufflesecurity/smallfetch/", + "vendor/github.com/trufflesecurity/smallfetch/client.go", + ) + assert not pr_labeler._codeowners_match( + "/vendor/github.com/trufflesecurity/smallfetch/", + "other/vendor/github.com/trufflesecurity/smallfetch/client.go", + ) + + +# ---- domains_for_pr --------------------------------------------------------- + + +SAMPLE_CODEOWNERS = """\ +* @org/eng-leads +/web/ @org/findings +/web/webapi/views/*.py @org/integrations +/pkg/engine/ @org/scanning +go.sum +go.mod +""" + + +class TestDomainsForPr: + @pytest.fixture() + def rules(self): + return pr_labeler.parse_codeowners(SAMPLE_CODEOWNERS) + + def test_single_domain(self, rules): + result = pr_labeler.domains_for_pr(rules, ["pkg/engine/scan.go"]) + assert result == {"scanning"} + + def test_last_match_wins(self, rules): + # web/webapi/views/foo.py matches both /web/ and /web/webapi/views/*.py; + # last-match-wins means integrations, not findings. + result = pr_labeler.domains_for_pr(rules, ["web/webapi/views/foo.py"]) + assert result == {"integrations"} + + def test_multi_domain_pr(self, rules): + result = pr_labeler.domains_for_pr( + rules, ["web/app.py", "pkg/engine/scan.go"] + ) + assert result == {"findings", "scanning"} + + def test_catch_all_fallback(self, rules): + result = pr_labeler.domains_for_pr(rules, ["README.md"]) + assert result == {"eng-leads"} + + def test_unowned_file(self, rules): + # go.sum has no owners in CODEOWNERS -> empty slug list from last match + result = pr_labeler.domains_for_pr(rules, ["go.sum"]) + assert result == set() + + def test_empty_files(self, rules): + assert pr_labeler.domains_for_pr(rules, []) == set() + + +# ---- reconcile with domain labels ------------------------------------------- + + +class TestReconcileDomain: + def test_adds_domain_labels(self): + plan = _plan() + pr_labeler.reconcile( + _pr(additions=5), + plan=plan, + domain_slugs={"scanning", "findings"}, + ) + assert "domain/scanning" in plan.add + assert "domain/findings" in plan.add + + def test_removes_stale_domain_labels(self): + plan = _plan() + pr_labeler.reconcile( + _pr(additions=5, labels=("domain/scanning", "domain/platform")), + plan=plan, + domain_slugs={"scanning"}, + ) + assert "domain/scanning" not in plan.add # already present + assert "domain/scanning" not in plan.remove + assert "domain/platform" in plan.remove + + def test_ignores_unknown_slugs(self): + plan = _plan() + pr_labeler.reconcile( + _pr(additions=5), + plan=plan, + domain_slugs={"eng-leads", "scanning"}, + ) + assert "domain/eng-leads" not in plan.add + assert "domain/scanning" in plan.add + + def test_no_domain_changes_when_none(self): + plan = _plan() + pr_labeler.reconcile( + _pr(additions=5, labels=("domain/scanning",)), + plan=plan, + domain_slugs=None, + ) + assert "domain/scanning" not in plan.remove diff --git a/labels.yml b/labels.yml index 18d6ea9..fec9f45 100644 --- a/labels.yml +++ b/labels.yml @@ -33,3 +33,24 @@ - name: complexity/high color: 8957e5 description: Non-obvious logic, careful review (author-declared) +- name: domain/scanning + color: 0d9488 + description: PR touches code owned by Scanning +- name: domain/findings + color: 14b8a6 + description: PR touches code owned by Findings +- name: domain/integrations + color: 2dd4bf + description: PR touches code owned by Integrations +- name: domain/platform + color: 5eead4 + description: PR touches code owned by Platform +- name: domain/frontend + color: 99f6e4 + description: PR touches code owned by Frontend +- name: domain/infra + color: 0f766e + description: PR touches code owned by Infra +- name: domain/database + color: 115e59 + description: PR touches code owned by Database From 5670c719dd0092fc0f05b958155215c0a0261c77 Mon Sep 17 00:00:00 2001 From: Bryan Beverly Date: Sun, 31 May 2026 00:30:22 -0700 Subject: [PATCH 2/3] Fix ruff formatting Co-authored-by: Cursor --- .github/scripts/pr_labeler.py | 14 ++++++++++---- .github/scripts/test_pr_labeler.py | 16 ++++++++-------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/.github/scripts/pr_labeler.py b/.github/scripts/pr_labeler.py index 3fc72c2..db71f1d 100644 --- a/.github/scripts/pr_labeler.py +++ b/.github/scripts/pr_labeler.py @@ -56,7 +56,15 @@ DOMAIN_LABEL_PREFIX = "domain/" KNOWN_DOMAIN_SLUGS = frozenset( - ["scanning", "findings", "integrations", "platform", "frontend", "infra", "database"] + [ + "scanning", + "findings", + "integrations", + "platform", + "frontend", + "infra", + "database", + ] ) @@ -146,9 +154,7 @@ def _segments_match(pat_parts: list[str], path_parts: list[str]) -> bool: return False -def domains_for_pr( - rules: list[CodeownersRule], changed_files: list[str] -) -> set[str]: +def domains_for_pr(rules: list[CodeownersRule], changed_files: list[str]) -> set[str]: """Return the set of domain slugs that own any changed file.""" teams: set[str] = set() for filepath in changed_files: diff --git a/.github/scripts/test_pr_labeler.py b/.github/scripts/test_pr_labeler.py index 1f03d74..ab4fe08 100644 --- a/.github/scripts/test_pr_labeler.py +++ b/.github/scripts/test_pr_labeler.py @@ -401,13 +401,13 @@ def test_simple_catch_all(self): assert rules == [("*", ["scanning"])] def test_multiple_owners(self): - rules = pr_labeler.parse_codeowners( - "proto/ @org/integrations @org/scanning" - ) + rules = pr_labeler.parse_codeowners("proto/ @org/integrations @org/scanning") assert rules == [("proto/", ["integrations", "scanning"])] def test_skips_comments_and_blanks(self): - text = "# comment\n\n* @org/eng-leads\n # indented comment\n/web/ @org/findings" + text = ( + "# comment\n\n* @org/eng-leads\n # indented comment\n/web/ @org/findings" + ) rules = pr_labeler.parse_codeowners(text) assert len(rules) == 2 assert rules[0] == ("*", ["eng-leads"]) @@ -441,7 +441,9 @@ def test_unanchored_dir_with_internal_slash(self): assert not pr_labeler._codeowners_match("pkg/engine/", "other/pkg/engine/x.go") def test_anchored_glob(self): - assert pr_labeler._codeowners_match("/web/webapi/views/*.py", "web/webapi/views/foo.py") + assert pr_labeler._codeowners_match( + "/web/webapi/views/*.py", "web/webapi/views/foo.py" + ) assert not pr_labeler._codeowners_match( "/web/webapi/views/*.py", "web/webapi/views/sub/foo.py" ) @@ -495,9 +497,7 @@ def test_last_match_wins(self, rules): assert result == {"integrations"} def test_multi_domain_pr(self, rules): - result = pr_labeler.domains_for_pr( - rules, ["web/app.py", "pkg/engine/scan.go"] - ) + result = pr_labeler.domains_for_pr(rules, ["web/app.py", "pkg/engine/scan.go"]) assert result == {"findings", "scanning"} def test_catch_all_fallback(self, rules): From a70cc1a451c4fb5d41d4d8c09068907afe45f290 Mon Sep 17 00:00:00 2001 From: Bryan Beverly Date: Sun, 31 May 2026 00:44:49 -0700 Subject: [PATCH 3/3] Fix trailing-slash anchoring and multi-path CODEOWNERS lookup - Check for internal slashes before appending ** to trailing-slash patterns, so unanchored dirs like "vendor/" match at any depth - Search all three GitHub CODEOWNERS locations (.github/, root, docs/) in priority order instead of only the root Co-authored-by: Cursor --- .github/scripts/pr_labeler.py | 50 ++++++++++++++++++++---------- .github/scripts/test_pr_labeler.py | 10 ++++++ 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/.github/scripts/pr_labeler.py b/.github/scripts/pr_labeler.py index db71f1d..06f856a 100644 --- a/.github/scripts/pr_labeler.py +++ b/.github/scripts/pr_labeler.py @@ -115,17 +115,24 @@ def _codeowners_match(pattern: str, filepath: str) -> bool: anchored = pattern.startswith("/") p = pattern.lstrip("/") + # Check for internal slash *before* appending ** for trailing-slash dirs. + # A trailing-only slash does not anchor; only leading or internal slashes do. + has_internal_slash = "/" in p.rstrip("/") + if p.endswith("/"): p += "**" - has_slash = "/" in p.rstrip("/") - - if anchored or has_slash: + if anchored or has_internal_slash: return _gitignore_match(p, filepath) - # No slash: match against basename at any depth. - basename = filepath.rsplit("/", 1)[-1] - return fnmatch(basename, p) + if "/" not in p: + # No slash at all: match against basename at any depth. + basename = filepath.rsplit("/", 1)[-1] + return fnmatch(basename, p) + + # Trailing-slash-only dir (e.g. "vendor/") with no anchoring: + # match at any depth by prepending **/. + return _gitignore_match("**/" + p, filepath) def _gitignore_match(pattern: str, filepath: str) -> bool: @@ -166,18 +173,27 @@ def domains_for_pr(rules: list[CodeownersRule], changed_files: list[str]) -> set return teams +CODEOWNERS_PATHS = [".github/CODEOWNERS", "CODEOWNERS", "docs/CODEOWNERS"] + + def fetch_codeowners(repo: str) -> str | None: - """Fetch CODEOWNERS from the repo's default branch via the Contents API.""" - result = gh( - ["api", f"repos/{repo}/contents/CODEOWNERS", "--jq", ".content"], - check=False, - ) - if result.returncode != 0: - return None - try: - return base64.b64decode(result.stdout.strip()).decode() - except Exception: - return None + """Fetch CODEOWNERS from the repo's default branch via the Contents API. + + Checks the three locations GitHub supports, in priority order: + ``.github/CODEOWNERS``, ``CODEOWNERS``, ``docs/CODEOWNERS``. + """ + for path in CODEOWNERS_PATHS: + result = gh( + ["api", f"repos/{repo}/contents/{path}", "--jq", ".content"], + check=False, + ) + if result.returncode != 0: + continue + try: + return base64.b64decode(result.stdout.strip()).decode() + except Exception: + continue + return None def fetch_pr_files(repo: str, pr_number: int) -> list[str]: diff --git a/.github/scripts/test_pr_labeler.py b/.github/scripts/test_pr_labeler.py index ab4fe08..186976b 100644 --- a/.github/scripts/test_pr_labeler.py +++ b/.github/scripts/test_pr_labeler.py @@ -467,6 +467,16 @@ def test_deep_anchored_path(self): "other/vendor/github.com/trufflesecurity/smallfetch/client.go", ) + def test_trailing_slash_no_leading_slash_matches_any_depth(self): + # "vendor/" with no leading "/" should match at any depth + assert pr_labeler._codeowners_match("vendor/", "vendor/file.go") + assert pr_labeler._codeowners_match("vendor/", "nested/vendor/file.go") + assert pr_labeler._codeowners_match("vendor/", "a/b/vendor/deep.go") + + def test_trailing_slash_with_leading_slash_anchored(self): + assert pr_labeler._codeowners_match("/vendor/", "vendor/file.go") + assert not pr_labeler._codeowners_match("/vendor/", "nested/vendor/file.go") + # ---- domains_for_pr ---------------------------------------------------------