From 27266ea8739babedb0f23ad7e2e4a73832706aa0 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Thu, 21 May 2026 19:54:28 +0000 Subject: [PATCH 01/15] Add organization repository report workflow and report generation script --- .github/workflows/org-repo-report.yml | 45 ++++ reporting/generate_org_repo_report.py | 343 ++++++++++++++++++++++++++ 2 files changed, 388 insertions(+) create mode 100644 .github/workflows/org-repo-report.yml create mode 100644 reporting/generate_org_repo_report.py diff --git a/.github/workflows/org-repo-report.yml b/.github/workflows/org-repo-report.yml new file mode 100644 index 000000000..820fa90e8 --- /dev/null +++ b/.github/workflows/org-repo-report.yml @@ -0,0 +1,45 @@ +name: Organization Repository Report + +on: + workflow_dispatch: + schedule: + - cron: '0 6 * * 1' + +permissions: + contents: write + +jobs: + generate-org-repo-report: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Generate report + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + ORG_NAME: morganstanley + OUTPUT_CSV: reporting/org-repo-report.csv + OUTPUT_MD: reporting/org-repo-report.md + run: python reporting/generate_org_repo_report.py + + - name: Commit and push report + env: + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + run: | + set -euo pipefail + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git checkout -B github-org-stats + git add reporting/org-repo-report.csv reporting/org-repo-report.md + if git diff --cached --quiet; then + echo "No report changes to commit." + exit 0 + fi + git commit -m "chore(reporting): update organization repository report" + git push origin HEAD:github-org-stats diff --git a/reporting/generate_org_repo_report.py b/reporting/generate_org_repo_report.py new file mode 100644 index 000000000..7e2d90ff0 --- /dev/null +++ b/reporting/generate_org_repo_report.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +"""Generate organization repository metadata report. + +Report fields per repository: +- Repository name +- Repository created date/time +- Repository creator (best effort via org audit log) +- Most recent update date/time +- Most recent updater (push actor when available) +""" + +from __future__ import annotations + +import csv +import json +import os +import sys +import time +import urllib.parse +import urllib.request +from datetime import datetime, timezone +from typing import Dict, Iterable, List, Optional, Tuple + +API_BASE = "https://api.github.com" + + +class GitHubClient: + def __init__(self, token: str): + self.token = token + self.audit_supported: Optional[bool] = None + + def _request( + self, + path: str, + params: Optional[Dict[str, str]] = None, + accept: str = "application/vnd.github+json", + retries: int = 3, + ) -> Tuple[object, Dict[str, str]]: + query = "" + if params: + query = "?" + urllib.parse.urlencode(params) + url = f"{API_BASE}{path}{query}" + + headers = { + "Accept": accept, + "Authorization": f"Bearer {self.token}", + "X-GitHub-Api-Version": "2022-11-28", + "User-Agent": "org-repo-report-generator", + } + + last_error: Optional[Exception] = None + for attempt in range(1, retries + 1): + req = urllib.request.Request(url, headers=headers, method="GET") + try: + with urllib.request.urlopen(req, timeout=60) as response: + body = response.read().decode("utf-8") + data = json.loads(body) if body else None + response_headers = {k.lower(): v for k, v in response.headers.items()} + return data, response_headers + except urllib.error.HTTPError as err: + if err.code == 403 and err.headers.get("X-RateLimit-Remaining") == "0": + reset_epoch = int(err.headers.get("X-RateLimit-Reset", "0")) + sleep_for = max(reset_epoch - int(time.time()), 1) + print( + f"Rate limited. Sleeping for {sleep_for} seconds before retrying {path}.", + file=sys.stderr, + ) + time.sleep(sleep_for) + continue + body = err.read().decode("utf-8", errors="replace") + raise RuntimeError(f"GitHub API error ({err.code}) for {path}: {body}") from err + except Exception as err: # pragma: no cover - defensive + last_error = err + if attempt < retries: + time.sleep(attempt) + else: + raise RuntimeError(f"Failed to call GitHub API for {path}: {err}") from err + + raise RuntimeError(f"Unexpected API failure for {path}: {last_error}") + + def paginate(self, path: str, params: Optional[Dict[str, str]] = None) -> Iterable[object]: + next_path = path + next_params = dict(params or {}) + + while next_path: + data, headers = self._request(next_path, next_params) + if not isinstance(data, list): + raise RuntimeError(f"Expected list response for {next_path}, got {type(data)}") + + for item in data: + yield item + + link_header = headers.get("link", "") + next_link = self._extract_next_link(link_header) + if next_link: + parsed = urllib.parse.urlparse(next_link) + next_path = parsed.path + next_params = dict(urllib.parse.parse_qsl(parsed.query)) + else: + next_path = "" + next_params = {} + + @staticmethod + def _extract_next_link(link_header: str) -> Optional[str]: + if not link_header: + return None + parts = [p.strip() for p in link_header.split(",")] + for part in parts: + sections = [s.strip() for s in part.split(";")] + if len(sections) < 2: + continue + if sections[1] == 'rel="next"' and sections[0].startswith("<") and sections[0].endswith(">"): + return sections[0][1:-1] + return None + + def list_org_repos(self, org: str) -> List[Dict[str, object]]: + repos: List[Dict[str, object]] = [] + for repo in self.paginate( + f"/orgs/{org}/repos", + params={ + "per_page": "100", + "type": "all", + "sort": "full_name", + "direction": "asc", + }, + ): + if isinstance(repo, dict): + repos.append(repo) + return repos + + def get_latest_push_event_info(self, org: str, repo: str) -> Tuple[str, str]: + data, _ = self._request( + f"/repos/{org}/{repo}/events", + params={"per_page": "100"}, + ) + if not isinstance(data, list) or not data: + return "", "" + + for event in data: + if not isinstance(event, dict): + continue + if event.get("type") != "PushEvent": + continue + actor = event.get("actor", {}) if isinstance(event.get("actor"), dict) else {} + pushed_by = str(actor.get("login") or "") + pushed_at = str(event.get("created_at") or "") + return pushed_at, pushed_by + + return "", "" + + def get_latest_commit_info(self, org: str, repo: str, default_branch: Optional[str]) -> Tuple[str, str]: + if not default_branch: + return "", "" + + data, _ = self._request( + f"/repos/{org}/{repo}/commits", + params={"per_page": "1", "sha": default_branch}, + ) + if not isinstance(data, list) or not data: + return "", "" + + latest = data[0] + commit = latest.get("commit", {}) if isinstance(latest, dict) else {} + committer_data = commit.get("committer", {}) if isinstance(commit, dict) else {} + author_data = commit.get("author", {}) if isinstance(commit, dict) else {} + + update_at = committer_data.get("date") or author_data.get("date") or "" + + updater = "" + if isinstance(latest, dict): + if isinstance(latest.get("committer"), dict): + updater = latest["committer"].get("login") or "" + if not updater and isinstance(latest.get("author"), dict): + updater = latest["author"].get("login") or "" + + if not updater: + updater = committer_data.get("name") or author_data.get("name") or "" + + return str(update_at or ""), str(updater or "") + + def get_latest_update_info(self, org: str, repo: str, default_branch: Optional[str]) -> Tuple[str, str]: + pushed_at, pushed_by = self.get_latest_push_event_info(org, repo) + if pushed_at and pushed_by: + return pushed_at, pushed_by + return self.get_latest_commit_info(org, repo, default_branch) + + def get_repo_creator(self, org: str, repo: str) -> Tuple[str, str]: + if self.audit_supported is False: + return "", "" + + try: + data, _ = self._request( + f"/orgs/{org}/audit-log", + params={ + "per_page": "1", + "phrase": f"action:repo.create repo:{repo}", + }, + ) + except RuntimeError as err: + text = str(err) + if "(403)" in text or "(404)" in text: + # Token lacks org audit-log access. + self.audit_supported = False + return "", "" + raise + + self.audit_supported = True + + if not isinstance(data, list) or not data: + return "", "" + + event = data[0] + if not isinstance(event, dict): + return "", "" + + created_by = str(event.get("actor") or "") + created_at = str(event.get("created_at") or "") + return created_by, created_at + + +def normalize_timestamp(timestamp: str) -> str: + if not timestamp: + return "" + try: + dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00")) + return dt.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + except ValueError: + return timestamp + + +def write_csv(rows: List[Dict[str, str]], output_csv: str) -> None: + headers = [ + "repo_name", + "repo_created_at", + "repo_created_by", + "most_recent_update_at", + "most_recent_updated_by", + ] + with open(output_csv, "w", newline="", encoding="utf-8") as fh: + writer = csv.DictWriter(fh, fieldnames=headers) + writer.writeheader() + for row in rows: + writer.writerow(row) + + +def write_markdown(rows: List[Dict[str, str]], output_md: str, org: str, audit_supported: bool) -> None: + now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + lines = [ + f"# {org} Repository Report", + "", + f"Generated: {now}", + "", + "| Repo Name | Repo Created At | Repo Created By | Most Recent Update At | Most Recent Updated By |", + "| --- | --- | --- | --- | --- |", + ] + + for row in rows: + lines.append( + "| " + + " | ".join( + [ + row["repo_name"] or "n/a", + row["repo_created_at"] or "n/a", + row["repo_created_by"] or "unknown", + row["most_recent_update_at"] or "n/a", + row["most_recent_updated_by"] or "unknown", + ] + ) + + " |" + ) + + lines.extend( + [ + "", + "## Notes", + "", + "- `repo_created_by` is sourced from the organization audit log when accessible.", + "- `most_recent_updated_by` uses the latest push event actor when available, otherwise latest default-branch commit metadata.", + ] + ) + + if not audit_supported: + lines.append( + "- Creator information is unavailable because the token could not access organization audit log data (typically requires org-level `admin:org`)." + ) + + with open(output_md, "w", encoding="utf-8") as fh: + fh.write("\n".join(lines) + "\n") + + +def main() -> int: + org = os.getenv("ORG_NAME", "morganstanley") + token = os.getenv("GH_TOKEN") + output_csv = os.getenv("OUTPUT_CSV", "reporting/org-repo-report.csv") + output_md = os.getenv("OUTPUT_MD", "reporting/org-repo-report.md") + + if not token: + print("GH_TOKEN environment variable is required.", file=sys.stderr) + return 1 + + os.makedirs(os.path.dirname(output_csv), exist_ok=True) + os.makedirs(os.path.dirname(output_md), exist_ok=True) + + client = GitHubClient(token) + repos = client.list_org_repos(org) + rows: List[Dict[str, str]] = [] + + for repo in repos: + repo_name = str(repo.get("name") or "") + created_at = str(repo.get("created_at") or "") + default_branch = repo.get("default_branch") + + latest_update_at, latest_updated_by = client.get_latest_update_info( + org=org, + repo=repo_name, + default_branch=str(default_branch) if default_branch else None, + ) + + created_by, created_from_audit_at = client.get_repo_creator(org, repo_name) + + # Prefer audit-log create timestamp if available; otherwise use repo metadata created_at. + created_time = created_from_audit_at or created_at + + rows.append( + { + "repo_name": repo_name, + "repo_created_at": normalize_timestamp(created_time), + "repo_created_by": created_by, + "most_recent_update_at": normalize_timestamp(latest_update_at), + "most_recent_updated_by": latest_updated_by, + } + ) + + rows.sort(key=lambda x: x["repo_name"].lower()) + write_csv(rows, output_csv) + write_markdown(rows, output_md, org=org, audit_supported=bool(client.audit_supported)) + + print(f"Wrote {len(rows)} rows to {output_csv} and {output_md}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 3420a7d306ccd9ef8c5639d966b7fdec8eb349d0 Mon Sep 17 00:00:00 2001 From: Mimi Flynn <414934+mimiflynn@users.noreply.github.com> Date: Thu, 21 May 2026 16:52:44 -0400 Subject: [PATCH 02/15] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- reporting/generate_org_repo_report.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reporting/generate_org_repo_report.py b/reporting/generate_org_repo_report.py index 7e2d90ff0..a14c2872b 100644 --- a/reporting/generate_org_repo_report.py +++ b/reporting/generate_org_repo_report.py @@ -298,8 +298,10 @@ def main() -> int: print("GH_TOKEN environment variable is required.", file=sys.stderr) return 1 - os.makedirs(os.path.dirname(output_csv), exist_ok=True) - os.makedirs(os.path.dirname(output_md), exist_ok=True) + output_csv_dir = os.path.dirname(output_csv) or "." + output_md_dir = os.path.dirname(output_md) or "." + os.makedirs(output_csv_dir, exist_ok=True) + os.makedirs(output_md_dir, exist_ok=True) client = GitHubClient(token) repos = client.list_org_repos(org) From 83b6c9afb3a4d707e7cd2a72f402c1c08531d44d Mon Sep 17 00:00:00 2001 From: Mimi Flynn <414934+mimiflynn@users.noreply.github.com> Date: Thu, 21 May 2026 16:53:22 -0400 Subject: [PATCH 03/15] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- reporting/generate_org_repo_report.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/reporting/generate_org_repo_report.py b/reporting/generate_org_repo_report.py index a14c2872b..ff6eaec29 100644 --- a/reporting/generate_org_repo_report.py +++ b/reporting/generate_org_repo_report.py @@ -335,7 +335,12 @@ def main() -> int: rows.sort(key=lambda x: x["repo_name"].lower()) write_csv(rows, output_csv) - write_markdown(rows, output_md, org=org, audit_supported=bool(client.audit_supported)) + audit_supported = ( + True if client.audit_supported is True + else False if client.audit_supported is False + else None + ) + write_markdown(rows, output_md, org=org, audit_supported=audit_supported) print(f"Wrote {len(rows)} rows to {output_csv} and {output_md}") return 0 From 0f336ab57d1b76b161e94a968af2093b733fa3de Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Thu, 21 May 2026 21:25:58 +0000 Subject: [PATCH 04/15] Refactor GitHub API request handling and improve report generation logic --- .github/workflows/org-repo-report.yml | 8 +- reporting/generate_org_repo_report.py | 302 +++++++++++++++++--------- 2 files changed, 205 insertions(+), 105 deletions(-) diff --git a/.github/workflows/org-repo-report.yml b/.github/workflows/org-repo-report.yml index 820fa90e8..374d255ac 100644 --- a/.github/workflows/org-repo-report.yml +++ b/.github/workflows/org-repo-report.yml @@ -8,12 +8,18 @@ on: permissions: contents: write +concurrency: + group: org-repo-report-github-org-stats + cancel-in-progress: true + jobs: generate-org-repo-report: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 + with: + token: ${{ secrets.GH_TOKEN }} - name: Set up Python uses: actions/setup-python@v5 @@ -29,8 +35,6 @@ jobs: run: python reporting/generate_org_repo_report.py - name: Commit and push report - env: - GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} run: | set -euo pipefail git config user.name "github-actions[bot]" diff --git a/reporting/generate_org_repo_report.py b/reporting/generate_org_repo_report.py index 7e2d90ff0..8ccbe1e84 100644 --- a/reporting/generate_org_repo_report.py +++ b/reporting/generate_org_repo_report.py @@ -19,9 +19,11 @@ import urllib.parse import urllib.request from datetime import datetime, timezone +from email.utils import parsedate_to_datetime from typing import Dict, Iterable, List, Optional, Tuple API_BASE = "https://api.github.com" +GRAPHQL_URL = "https://api.github.com/graphql" class GitHubClient: @@ -29,28 +31,14 @@ def __init__(self, token: str): self.token = token self.audit_supported: Optional[bool] = None - def _request( + def _send_request( self, - path: str, - params: Optional[Dict[str, str]] = None, - accept: str = "application/vnd.github+json", + req: urllib.request.Request, + request_label: str, retries: int = 3, ) -> Tuple[object, Dict[str, str]]: - query = "" - if params: - query = "?" + urllib.parse.urlencode(params) - url = f"{API_BASE}{path}{query}" - - headers = { - "Accept": accept, - "Authorization": f"Bearer {self.token}", - "X-GitHub-Api-Version": "2022-11-28", - "User-Agent": "org-repo-report-generator", - } - last_error: Optional[Exception] = None for attempt in range(1, retries + 1): - req = urllib.request.Request(url, headers=headers, method="GET") try: with urllib.request.urlopen(req, timeout=60) as response: body = response.read().decode("utf-8") @@ -58,25 +46,101 @@ def _request( response_headers = {k.lower(): v for k, v in response.headers.items()} return data, response_headers except urllib.error.HTTPError as err: - if err.code == 403 and err.headers.get("X-RateLimit-Remaining") == "0": - reset_epoch = int(err.headers.get("X-RateLimit-Reset", "0")) - sleep_for = max(reset_epoch - int(time.time()), 1) + body = err.read().decode("utf-8", errors="replace") + retry_after_seconds = self._parse_retry_after_seconds(err.headers.get("Retry-After")) + body_lower = body.lower() + is_primary_limit = err.code == 403 and err.headers.get("X-RateLimit-Remaining") == "0" + is_secondary_limit = err.code == 403 and ( + "secondary rate limit" in body_lower or "abuse detection" in body_lower + ) + is_retryable_rate_limit = err.code == 429 or is_primary_limit or is_secondary_limit + + if is_retryable_rate_limit and attempt < retries: + if is_primary_limit: + reset_epoch = int(err.headers.get("X-RateLimit-Reset", "0")) + sleep_for = max(reset_epoch - int(time.time()), 1) + elif retry_after_seconds is not None: + sleep_for = max(retry_after_seconds, 1) + else: + # Backoff for secondary/abuse limits when Retry-After is absent. + sleep_for = min(30 * attempt, 300) + print( - f"Rate limited. Sleeping for {sleep_for} seconds before retrying {path}.", + f"Rate limited ({err.code}). Sleeping for {sleep_for} seconds before retrying {request_label}.", file=sys.stderr, ) time.sleep(sleep_for) continue - body = err.read().decode("utf-8", errors="replace") - raise RuntimeError(f"GitHub API error ({err.code}) for {path}: {body}") from err + + raise RuntimeError(f"GitHub API error ({err.code}) for {request_label}: {body}") from err except Exception as err: # pragma: no cover - defensive last_error = err if attempt < retries: time.sleep(attempt) else: - raise RuntimeError(f"Failed to call GitHub API for {path}: {err}") from err + raise RuntimeError(f"Failed to call GitHub API for {request_label}: {err}") from err + + raise RuntimeError(f"Unexpected API failure for {request_label}: {last_error}") + + def _request( + self, + path: str, + params: Optional[Dict[str, str]] = None, + accept: str = "application/vnd.github+json", + retries: int = 3, + ) -> Tuple[object, Dict[str, str]]: + query = "" + if params: + query = "?" + urllib.parse.urlencode(params) + url = f"{API_BASE}{path}{query}" + + headers = { + "Accept": accept, + "Authorization": f"Bearer {self.token}", + "X-GitHub-Api-Version": "2022-11-28", + "User-Agent": "org-repo-report-generator", + } + req = urllib.request.Request(url, headers=headers, method="GET") + return self._send_request(req, path, retries=retries) + + def _graphql_request( + self, + query: str, + variables: Dict[str, object], + retries: int = 3, + ) -> Dict[str, object]: + payload = json.dumps({"query": query, "variables": variables}).encode("utf-8") + headers = { + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json", + "X-GitHub-Api-Version": "2022-11-28", + "User-Agent": "org-repo-report-generator", + } + req = urllib.request.Request(GRAPHQL_URL, headers=headers, data=payload, method="POST") + data, _ = self._send_request(req, "graphql", retries=retries) + if not isinstance(data, dict): + raise RuntimeError("Unexpected GraphQL response format") + if isinstance(data.get("errors"), list) and data["errors"]: + raise RuntimeError(f"GitHub GraphQL error: {data['errors']}") + return data - raise RuntimeError(f"Unexpected API failure for {path}: {last_error}") + @staticmethod + def _parse_retry_after_seconds(retry_after: Optional[str]) -> Optional[int]: + if not retry_after: + return None + + retry_after = retry_after.strip() + if retry_after.isdigit(): + return int(retry_after) + + try: + retry_at = parsedate_to_datetime(retry_after) + if retry_at.tzinfo is None: + retry_at = retry_at.replace(tzinfo=timezone.utc) + return max(int((retry_at - datetime.now(timezone.utc)).total_seconds()), 0) + except (TypeError, ValueError, OverflowError): + return None def paginate(self, path: str, params: Optional[Dict[str, str]] = None) -> Iterable[object]: next_path = path @@ -128,72 +192,16 @@ def list_org_repos(self, org: str) -> List[Dict[str, object]]: repos.append(repo) return repos - def get_latest_push_event_info(self, org: str, repo: str) -> Tuple[str, str]: - data, _ = self._request( - f"/repos/{org}/{repo}/events", - params={"per_page": "100"}, - ) - if not isinstance(data, list) or not data: - return "", "" - - for event in data: - if not isinstance(event, dict): - continue - if event.get("type") != "PushEvent": - continue - actor = event.get("actor", {}) if isinstance(event.get("actor"), dict) else {} - pushed_by = str(actor.get("login") or "") - pushed_at = str(event.get("created_at") or "") - return pushed_at, pushed_by - - return "", "" - - def get_latest_commit_info(self, org: str, repo: str, default_branch: Optional[str]) -> Tuple[str, str]: - if not default_branch: - return "", "" - - data, _ = self._request( - f"/repos/{org}/{repo}/commits", - params={"per_page": "1", "sha": default_branch}, - ) - if not isinstance(data, list) or not data: - return "", "" - - latest = data[0] - commit = latest.get("commit", {}) if isinstance(latest, dict) else {} - committer_data = commit.get("committer", {}) if isinstance(commit, dict) else {} - author_data = commit.get("author", {}) if isinstance(commit, dict) else {} - - update_at = committer_data.get("date") or author_data.get("date") or "" - - updater = "" - if isinstance(latest, dict): - if isinstance(latest.get("committer"), dict): - updater = latest["committer"].get("login") or "" - if not updater and isinstance(latest.get("author"), dict): - updater = latest["author"].get("login") or "" - - if not updater: - updater = committer_data.get("name") or author_data.get("name") or "" - - return str(update_at or ""), str(updater or "") - - def get_latest_update_info(self, org: str, repo: str, default_branch: Optional[str]) -> Tuple[str, str]: - pushed_at, pushed_by = self.get_latest_push_event_info(org, repo) - if pushed_at and pushed_by: - return pushed_at, pushed_by - return self.get_latest_commit_info(org, repo, default_branch) - - def get_repo_creator(self, org: str, repo: str) -> Tuple[str, str]: + def get_repo_creators(self, org: str) -> Dict[str, Tuple[str, str]]: if self.audit_supported is False: - return "", "" + return {} try: - data, _ = self._request( + audit_events = self.paginate( f"/orgs/{org}/audit-log", params={ - "per_page": "1", - "phrase": f"action:repo.create repo:{repo}", + "per_page": "100", + "phrase": "action:repo.create", }, ) except RuntimeError as err: @@ -201,21 +209,112 @@ def get_repo_creator(self, org: str, repo: str) -> Tuple[str, str]: if "(403)" in text or "(404)" in text: # Token lacks org audit-log access. self.audit_supported = False - return "", "" + return {} raise self.audit_supported = True + creators: Dict[str, Tuple[str, str]] = {} + for event in audit_events: + if not isinstance(event, dict): + continue + repo_name = self._extract_audit_repo_name(event, org) + if not repo_name or repo_name in creators: + continue + created_by = str(event.get("actor") or "") + created_at = str(event.get("created_at") or "") + creators[repo_name] = (created_by, created_at) + + return creators + + def get_latest_updaters(self, org: str, repo_names: List[str]) -> Dict[str, str]: + if not repo_names: + return {} + + query = """ + query RepoBatch($org: String!, $names: [String!]!) { + organization(login: $org) { + repositories(first: 100, names: $names) { + nodes { + name + defaultBranchRef { + target { + __typename + ... on Commit { + history(first: 1) { + nodes { + author { + name + user { + login + } + } + committer { + name + user { + login + } + } + } + } + } + } + } + } + } + } + } + """ + + latest_updaters: Dict[str, str] = {} + for start in range(0, len(repo_names), 100): + batch = repo_names[start : start + 100] + data = self._graphql_request(query, {"org": org, "names": batch}) + organization = data.get("data", {}).get("organization", {}) if isinstance(data.get("data"), dict) else {} + repositories = organization.get("repositories", {}) if isinstance(organization, dict) else {} + nodes = repositories.get("nodes", []) if isinstance(repositories, dict) else [] + + for node in nodes: + if not isinstance(node, dict): + continue + repo_name = str(node.get("name") or "") + branch_ref = node.get("defaultBranchRef", {}) if isinstance(node.get("defaultBranchRef"), dict) else {} + target = branch_ref.get("target", {}) if isinstance(branch_ref, dict) else {} + history = target.get("history", {}) if isinstance(target, dict) else {} + history_nodes = history.get("nodes", []) if isinstance(history, dict) else [] + if not history_nodes: + continue + latest_commit = history_nodes[0] if isinstance(history_nodes[0], dict) else {} + author = latest_commit.get("author", {}) if isinstance(latest_commit.get("author"), dict) else {} + committer = latest_commit.get("committer", {}) if isinstance(latest_commit.get("committer"), dict) else {} - if not isinstance(data, list) or not data: - return "", "" + updater = "" + committer_user = committer.get("user", {}) if isinstance(committer.get("user"), dict) else {} + author_user = author.get("user", {}) if isinstance(author.get("user"), dict) else {} + updater = str(committer_user.get("login") or author_user.get("login") or "") + if not updater: + updater = str(committer.get("name") or author.get("name") or "") + latest_updaters[repo_name] = updater - event = data[0] - if not isinstance(event, dict): - return "", "" + return latest_updaters - created_by = str(event.get("actor") or "") - created_at = str(event.get("created_at") or "") - return created_by, created_at + @staticmethod + def _extract_audit_repo_name(event: Dict[str, object], org: str) -> str: + repo_value = event.get("repo") + if isinstance(repo_value, str) and repo_value: + prefix = f"{org}/" + return repo_value[len(prefix) :] if repo_value.startswith(prefix) else repo_value + + repository = event.get("repository") + if isinstance(repository, dict): + name = repository.get("name") + if isinstance(name, str): + return name + + repo_name = event.get("repo_name") + if isinstance(repo_name, str): + return repo_name + + return "" def normalize_timestamp(timestamp: str) -> str: @@ -303,20 +402,17 @@ def main() -> int: client = GitHubClient(token) repos = client.list_org_repos(org) + repo_creators = client.get_repo_creators(org) + latest_updaters = client.get_latest_updaters(org, [str(repo.get("name") or "") for repo in repos]) rows: List[Dict[str, str]] = [] for repo in repos: repo_name = str(repo.get("name") or "") created_at = str(repo.get("created_at") or "") - default_branch = repo.get("default_branch") - - latest_update_at, latest_updated_by = client.get_latest_update_info( - org=org, - repo=repo_name, - default_branch=str(default_branch) if default_branch else None, - ) + latest_update_at = str(repo.get("pushed_at") or repo.get("updated_at") or "") + latest_updated_by = latest_updaters.get(repo_name, "") - created_by, created_from_audit_at = client.get_repo_creator(org, repo_name) + created_by, created_from_audit_at = repo_creators.get(repo_name, ("", "")) # Prefer audit-log create timestamp if available; otherwise use repo metadata created_at. created_time = created_from_audit_at or created_at From aeda25a700d325b23750ebb92aa76607b3a7e8f5 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Thu, 21 May 2026 21:35:02 +0000 Subject: [PATCH 05/15] Refactor CSV and Markdown writing functions to use a consistent UNKNOWN_VALUE for missing repository creator information --- reporting/generate_org_repo_report.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/reporting/generate_org_repo_report.py b/reporting/generate_org_repo_report.py index ef4b23c5e..0ed9a832d 100644 --- a/reporting/generate_org_repo_report.py +++ b/reporting/generate_org_repo_report.py @@ -24,6 +24,7 @@ API_BASE = "https://api.github.com" GRAPHQL_URL = "https://api.github.com/graphql" +UNKNOWN_VALUE = "unknown" class GitHubClient: @@ -339,7 +340,15 @@ def write_csv(rows: List[Dict[str, str]], output_csv: str) -> None: writer = csv.DictWriter(fh, fieldnames=headers) writer.writeheader() for row in rows: - writer.writerow(row) + writer.writerow( + { + "repo_name": row["repo_name"], + "repo_created_at": row["repo_created_at"], + "repo_created_by": row["repo_created_by"] or UNKNOWN_VALUE, + "most_recent_update_at": row["most_recent_update_at"], + "most_recent_updated_by": row["most_recent_updated_by"] or UNKNOWN_VALUE, + } + ) def write_markdown(rows: List[Dict[str, str]], output_md: str, org: str, audit_supported: bool) -> None: @@ -360,9 +369,9 @@ def write_markdown(rows: List[Dict[str, str]], output_md: str, org: str, audit_s [ row["repo_name"] or "n/a", row["repo_created_at"] or "n/a", - row["repo_created_by"] or "unknown", + row["repo_created_by"] or UNKNOWN_VALUE, row["most_recent_update_at"] or "n/a", - row["most_recent_updated_by"] or "unknown", + row["most_recent_updated_by"] or UNKNOWN_VALUE, ] ) + " |" From 3ae6e9a7efcf1992ac3ce6b18a839aacdfdb5ac2 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Thu, 21 May 2026 21:39:43 +0000 Subject: [PATCH 06/15] Enhance workflow to support pull request validation and artifact uploads for organization repository report --- .github/workflows/org-repo-report.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/org-repo-report.yml b/.github/workflows/org-repo-report.yml index 374d255ac..0a82a0f76 100644 --- a/.github/workflows/org-repo-report.yml +++ b/.github/workflows/org-repo-report.yml @@ -2,6 +2,10 @@ name: Organization Repository Report on: workflow_dispatch: + pull_request: + paths: + - '.github/workflows/org-repo-report.yml' + - 'reporting/generate_org_repo_report.py' schedule: - cron: '0 6 * * 1' @@ -34,7 +38,17 @@ jobs: OUTPUT_MD: reporting/org-repo-report.md run: python reporting/generate_org_repo_report.py + - name: Upload report artifacts for PR validation + if: ${{ github.event_name == 'pull_request' }} + uses: actions/upload-artifact@v4 + with: + name: org-repo-report + path: | + reporting/org-repo-report.csv + reporting/org-repo-report.md + - name: Commit and push report + if: ${{ github.event_name != 'pull_request' }} run: | set -euo pipefail git config user.name "github-actions[bot]" From ff11c88ad49ab718722665e424a4c9a7691f4c62 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Thu, 21 May 2026 21:51:09 +0000 Subject: [PATCH 07/15] Refactor GitHub API request handling for improved readability and error handling --- reporting/generate_org_repo_report.py | 203 +++++++++++++++++--------- 1 file changed, 130 insertions(+), 73 deletions(-) diff --git a/reporting/generate_org_repo_report.py b/reporting/generate_org_repo_report.py index 0ed9a832d..0c34f14fb 100644 --- a/reporting/generate_org_repo_report.py +++ b/reporting/generate_org_repo_report.py @@ -44,17 +44,26 @@ def _send_request( with urllib.request.urlopen(req, timeout=60) as response: body = response.read().decode("utf-8") data = json.loads(body) if body else None - response_headers = {k.lower(): v for k, v in response.headers.items()} + response_headers = { + k.lower(): v for k, v in response.headers.items() + } return data, response_headers except urllib.error.HTTPError as err: body = err.read().decode("utf-8", errors="replace") - retry_after_seconds = self._parse_retry_after_seconds(err.headers.get("Retry-After")) + retry_after_seconds = self._parse_retry_after_seconds( + err.headers.get("Retry-After") + ) body_lower = body.lower() - is_primary_limit = err.code == 403 and err.headers.get("X-RateLimit-Remaining") == "0" + is_primary_limit = ( + err.code == 403 and err.headers.get("X-RateLimit-Remaining") == "0" + ) is_secondary_limit = err.code == 403 and ( - "secondary rate limit" in body_lower or "abuse detection" in body_lower + "secondary rate limit" in body_lower + or "abuse detection" in body_lower + ) + is_retryable_rate_limit = ( + err.code == 429 or is_primary_limit or is_secondary_limit ) - is_retryable_rate_limit = err.code == 429 or is_primary_limit or is_secondary_limit if is_retryable_rate_limit and attempt < retries: if is_primary_limit: @@ -73,13 +82,17 @@ def _send_request( time.sleep(sleep_for) continue - raise RuntimeError(f"GitHub API error ({err.code}) for {request_label}: {body}") from err + raise RuntimeError( + f"GitHub API error ({err.code}) for {request_label}: {body}" + ) from err except Exception as err: # pragma: no cover - defensive last_error = err if attempt < retries: time.sleep(attempt) else: - raise RuntimeError(f"Failed to call GitHub API for {request_label}: {err}") from err + raise RuntimeError( + f"Failed to call GitHub API for {request_label}: {err}" + ) from err raise RuntimeError(f"Unexpected API failure for {request_label}: {last_error}") @@ -118,7 +131,9 @@ def _graphql_request( "X-GitHub-Api-Version": "2022-11-28", "User-Agent": "org-repo-report-generator", } - req = urllib.request.Request(GRAPHQL_URL, headers=headers, data=payload, method="POST") + req = urllib.request.Request( + GRAPHQL_URL, headers=headers, data=payload, method="POST" + ) data, _ = self._send_request(req, "graphql", retries=retries) if not isinstance(data, dict): raise RuntimeError("Unexpected GraphQL response format") @@ -143,14 +158,18 @@ def _parse_retry_after_seconds(retry_after: Optional[str]) -> Optional[int]: except (TypeError, ValueError, OverflowError): return None - def paginate(self, path: str, params: Optional[Dict[str, str]] = None) -> Iterable[object]: + def paginate( + self, path: str, params: Optional[Dict[str, str]] = None + ) -> Iterable[object]: next_path = path next_params = dict(params or {}) while next_path: data, headers = self._request(next_path, next_params) if not isinstance(data, list): - raise RuntimeError(f"Expected list response for {next_path}, got {type(data)}") + raise RuntimeError( + f"Expected list response for {next_path}, got {type(data)}" + ) for item in data: yield item @@ -174,7 +193,11 @@ def _extract_next_link(link_header: str) -> Optional[str]: sections = [s.strip() for s in part.split(";")] if len(sections) < 2: continue - if sections[1] == 'rel="next"' and sections[0].startswith("<") and sections[0].endswith(">"): + if ( + sections[1] == 'rel="next"' + and sections[0].startswith("<") + and sections[0].endswith(">") + ): return sections[0][1:-1] return None @@ -232,69 +255,94 @@ def get_latest_updaters(self, org: str, repo_names: List[str]) -> Dict[str, str] return {} query = """ - query RepoBatch($org: String!, $names: [String!]!) { - organization(login: $org) { - repositories(first: 100, names: $names) { - nodes { - name - defaultBranchRef { - target { - __typename - ... on Commit { - history(first: 1) { - nodes { - author { - name - user { - login - } - } - committer { - name - user { - login - } - } + query RepoLatestUpdater($org: String!, $name: String!) { + repository(owner: $org, name: $name) { + name + defaultBranchRef { + target { + __typename + ... on Commit { + history(first: 1) { + nodes { + author { + name + user { + login + } + } + committer { + name + user { + login } - } } - } + } } - } + } + } } - } + } } """ latest_updaters: Dict[str, str] = {} - for start in range(0, len(repo_names), 100): - batch = repo_names[start : start + 100] - data = self._graphql_request(query, {"org": org, "names": batch}) - organization = data.get("data", {}).get("organization", {}) if isinstance(data.get("data"), dict) else {} - repositories = organization.get("repositories", {}) if isinstance(organization, dict) else {} - nodes = repositories.get("nodes", []) if isinstance(repositories, dict) else [] - - for node in nodes: - if not isinstance(node, dict): - continue - repo_name = str(node.get("name") or "") - branch_ref = node.get("defaultBranchRef", {}) if isinstance(node.get("defaultBranchRef"), dict) else {} - target = branch_ref.get("target", {}) if isinstance(branch_ref, dict) else {} - history = target.get("history", {}) if isinstance(target, dict) else {} - history_nodes = history.get("nodes", []) if isinstance(history, dict) else [] - if not history_nodes: - continue - latest_commit = history_nodes[0] if isinstance(history_nodes[0], dict) else {} - author = latest_commit.get("author", {}) if isinstance(latest_commit.get("author"), dict) else {} - committer = latest_commit.get("committer", {}) if isinstance(latest_commit.get("committer"), dict) else {} - - updater = "" - committer_user = committer.get("user", {}) if isinstance(committer.get("user"), dict) else {} - author_user = author.get("user", {}) if isinstance(author.get("user"), dict) else {} - updater = str(committer_user.get("login") or author_user.get("login") or "") - if not updater: - updater = str(committer.get("name") or author.get("name") or "") - latest_updaters[repo_name] = updater + + for repo_name in repo_names: + if not repo_name: + continue + + data = self._graphql_request(query, {"org": org, "name": repo_name}) + repository = ( + data.get("data", {}).get("repository", {}) + if isinstance(data.get("data"), dict) + else {} + ) + if not isinstance(repository, dict) or not repository: + continue + + branch_ref = ( + repository.get("defaultBranchRef", {}) + if isinstance(repository.get("defaultBranchRef"), dict) + else {} + ) + target = ( + branch_ref.get("target", {}) if isinstance(branch_ref, dict) else {} + ) + history = target.get("history", {}) if isinstance(target, dict) else {} + history_nodes = ( + history.get("nodes", []) if isinstance(history, dict) else [] + ) + if not history_nodes: + continue + + latest_commit = ( + history_nodes[0] if isinstance(history_nodes[0], dict) else {} + ) + author = ( + latest_commit.get("author", {}) + if isinstance(latest_commit.get("author"), dict) + else {} + ) + committer = ( + latest_commit.get("committer", {}) + if isinstance(latest_commit.get("committer"), dict) + else {} + ) + + committer_user = ( + committer.get("user", {}) + if isinstance(committer.get("user"), dict) + else {} + ) + author_user = ( + author.get("user", {}) if isinstance(author.get("user"), dict) else {} + ) + + updater = str(committer_user.get("login") or author_user.get("login") or "") + if not updater: + updater = str(committer.get("name") or author.get("name") or "") + + latest_updaters[repo_name] = updater return latest_updaters @@ -303,7 +351,11 @@ def _extract_audit_repo_name(event: Dict[str, object], org: str) -> str: repo_value = event.get("repo") if isinstance(repo_value, str) and repo_value: prefix = f"{org}/" - return repo_value[len(prefix) :] if repo_value.startswith(prefix) else repo_value + return ( + repo_value[len(prefix) :] + if repo_value.startswith(prefix) + else repo_value + ) repository = event.get("repository") if isinstance(repository, dict): @@ -346,12 +398,15 @@ def write_csv(rows: List[Dict[str, str]], output_csv: str) -> None: "repo_created_at": row["repo_created_at"], "repo_created_by": row["repo_created_by"] or UNKNOWN_VALUE, "most_recent_update_at": row["most_recent_update_at"], - "most_recent_updated_by": row["most_recent_updated_by"] or UNKNOWN_VALUE, + "most_recent_updated_by": row["most_recent_updated_by"] + or UNKNOWN_VALUE, } ) -def write_markdown(rows: List[Dict[str, str]], output_md: str, org: str, audit_supported: bool) -> None: +def write_markdown( + rows: List[Dict[str, str]], output_md: str, org: str, audit_supported: bool +) -> None: now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") lines = [ f"# {org} Repository Report", @@ -414,7 +469,9 @@ def main() -> int: client = GitHubClient(token) repos = client.list_org_repos(org) repo_creators = client.get_repo_creators(org) - latest_updaters = client.get_latest_updaters(org, [str(repo.get("name") or "") for repo in repos]) + latest_updaters = client.get_latest_updaters( + org, [str(repo.get("name") or "") for repo in repos] + ) rows: List[Dict[str, str]] = [] for repo in repos: @@ -441,9 +498,9 @@ def main() -> int: rows.sort(key=lambda x: x["repo_name"].lower()) write_csv(rows, output_csv) audit_supported = ( - True if client.audit_supported is True - else False if client.audit_supported is False - else None + True + if client.audit_supported is True + else False if client.audit_supported is False else None ) write_markdown(rows, output_md, org=org, audit_supported=audit_supported) From edbfcba3bb38c987c5444c95f07621c09e9a76b8 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 13:53:58 +0000 Subject: [PATCH 08/15] dependabot config for actions --- .github/dependabot.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..2cc5c403c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,16 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 5 + commit-message: + prefix: chore + - package-ecosystem: pip + directory: /reporting + schedule: + interval: weekly + open-pull-requests-limit: 5 + commit-message: + prefix: chore \ No newline at end of file From d5916cf0f5173b6496de991fff78f371e7e7b7f6 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 19:00:50 +0000 Subject: [PATCH 09/15] Enhance GitHub workflow to validate report script changes and run tests on pull requests; add requirements files and .gitignore for Python project --- .github/workflows/org-repo-report.yml | 19 ++ .gitignore | 2 + reporting/requirements-dev.txt | 4 + reporting/requirements.txt | 2 + .../tests/test_generate_org_repo_report.py | 222 ++++++++++++++++++ 5 files changed, 249 insertions(+) create mode 100644 .gitignore create mode 100644 reporting/requirements-dev.txt create mode 100644 reporting/requirements.txt create mode 100644 reporting/tests/test_generate_org_repo_report.py diff --git a/.github/workflows/org-repo-report.yml b/.github/workflows/org-repo-report.yml index 0a82a0f76..bf851d1ef 100644 --- a/.github/workflows/org-repo-report.yml +++ b/.github/workflows/org-repo-report.yml @@ -11,6 +11,7 @@ on: permissions: contents: write + pull-requests: read concurrency: group: org-repo-report-github-org-stats @@ -30,6 +31,24 @@ jobs: with: python-version: '3.12' + - name: Detect report script changes + if: ${{ github.event_name == 'pull_request' }} + id: report-script-changes + uses: dorny/paths-filter@v3 + with: + filters: | + report_script: + - 'reporting/generate_org_repo_report.py' + + - name: Run report script tests + if: ${{ github.event_name == 'pull_request' && steps.report-script-changes.outputs.report_script == 'true' }} + run: | + python -m pip install --upgrade pip + python -m pip install -r reporting/requirements-dev.txt + ruff check reporting + mypy reporting/generate_org_repo_report.py + pytest -q reporting/tests + - name: Generate report env: GH_TOKEN: ${{ secrets.GH_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..029519e08 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.py[cod] \ No newline at end of file diff --git a/reporting/requirements-dev.txt b/reporting/requirements-dev.txt new file mode 100644 index 000000000..60fd7a6db --- /dev/null +++ b/reporting/requirements-dev.txt @@ -0,0 +1,4 @@ +pytest==8.3.5 +pytest-cov==5.0.0 +ruff==0.11.13 +mypy==1.15.0 \ No newline at end of file diff --git a/reporting/requirements.txt b/reporting/requirements.txt new file mode 100644 index 000000000..7b4c93515 --- /dev/null +++ b/reporting/requirements.txt @@ -0,0 +1,2 @@ +# No third-party Python dependencies are required. +# Keep this file so Dependabot can track future packages added under reporting/. \ No newline at end of file diff --git a/reporting/tests/test_generate_org_repo_report.py b/reporting/tests/test_generate_org_repo_report.py new file mode 100644 index 000000000..9d891d65b --- /dev/null +++ b/reporting/tests/test_generate_org_repo_report.py @@ -0,0 +1,222 @@ +import csv +import os +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +SCRIPT_DIR = Path(__file__).resolve().parents[1] +if str(SCRIPT_DIR) not in os.sys.path: + os.sys.path.insert(0, str(SCRIPT_DIR)) + +import generate_org_repo_report as report + + +class TestHelpers(unittest.TestCase): + def test_normalize_timestamp_formats_utc(self) -> None: + value = report.normalize_timestamp("2026-05-22T12:34:56Z") + self.assertEqual(value, "2026-05-22 12:34:56 UTC") + + def test_normalize_timestamp_returns_original_for_invalid(self) -> None: + value = report.normalize_timestamp("not-a-date") + self.assertEqual(value, "not-a-date") + + def test_extract_next_link(self) -> None: + link_header = ( + '; rel="next", ' + '; rel="last"' + ) + next_link = report.GitHubClient._extract_next_link(link_header) + self.assertEqual(next_link, "https://api.github.com/orgs/test/repos?page=2") + + def test_extract_audit_repo_name_prefers_repo_field(self) -> None: + event = {"repo": "morganstanley/my-repo", "repo_name": "fallback"} + repo_name = report.GitHubClient._extract_audit_repo_name(event, "morganstanley") + self.assertEqual(repo_name, "my-repo") + + +class TestFileOutputs(unittest.TestCase): + def test_write_csv_replaces_empty_values_with_unknown(self) -> None: + rows = [ + { + "repo_name": "demo", + "repo_created_at": "2026-05-22 00:00:00 UTC", + "repo_created_by": "", + "most_recent_update_at": "2026-05-22 01:00:00 UTC", + "most_recent_updated_by": "", + } + ] + + with tempfile.TemporaryDirectory() as td: + csv_path = Path(td) / "report.csv" + report.write_csv(rows, str(csv_path)) + + with csv_path.open("r", encoding="utf-8", newline="") as fh: + parsed = list(csv.DictReader(fh)) + + self.assertEqual(len(parsed), 1) + self.assertEqual(parsed[0]["repo_created_by"], report.UNKNOWN_VALUE) + self.assertEqual(parsed[0]["most_recent_updated_by"], report.UNKNOWN_VALUE) + + def test_write_markdown_includes_audit_access_note(self) -> None: + rows = [ + { + "repo_name": "demo", + "repo_created_at": "2026-05-22 00:00:00 UTC", + "repo_created_by": "alice", + "most_recent_update_at": "2026-05-22 01:00:00 UTC", + "most_recent_updated_by": "bob", + } + ] + + with tempfile.TemporaryDirectory() as td: + md_path = Path(td) / "report.md" + report.write_markdown( + rows, str(md_path), org="morganstanley", audit_supported=False + ) + text = md_path.read_text(encoding="utf-8") + + self.assertIn("# morganstanley Repository Report", text) + self.assertIn("| demo |", text) + self.assertIn("Creator information is unavailable", text) + + +class TestClientBehavior(unittest.TestCase): + def test_get_repo_creators_marks_audit_unsupported_on_403(self) -> None: + client = report.GitHubClient(token="fake") + + with mock.patch.object( + client, + "paginate", + side_effect=RuntimeError( + "GitHub API error (403) for /orgs/test/audit-log: denied" + ), + ): + creators = client.get_repo_creators("test") + + self.assertEqual(creators, {}) + self.assertIs(client.audit_supported, False) + + def test_get_latest_updaters_prefers_committer_login(self) -> None: + client = report.GitHubClient(token="fake") + + def fake_graphql(_query, variables): + if variables["name"] == "repo-a": + return { + "data": { + "repository": { + "defaultBranchRef": { + "target": { + "history": { + "nodes": [ + { + "author": { + "name": "A", + "user": {"login": "author-login"}, + }, + "committer": { + "name": "C", + "user": { + "login": "committer-login" + }, + }, + } + ] + } + } + } + } + } + } + return { + "data": { + "repository": { + "defaultBranchRef": { + "target": { + "history": { + "nodes": [ + { + "author": { + "name": "Only Name", + "user": None, + }, + "committer": {"name": "", "user": None}, + } + ] + } + } + } + } + } + } + + with mock.patch.object(client, "_graphql_request", side_effect=fake_graphql): + updaters = client.get_latest_updaters( + "morganstanley", ["repo-a", "repo-b", ""] + ) + + self.assertEqual(updaters["repo-a"], "committer-login") + self.assertEqual(updaters["repo-b"], "Only Name") + self.assertNotIn("", updaters) + + +class TestMain(unittest.TestCase): + def test_main_requires_gh_token(self) -> None: + with mock.patch.dict(os.environ, {}, clear=True): + exit_code = report.main() + self.assertEqual(exit_code, 1) + + def test_main_writes_reports_with_mocked_client(self) -> None: + class FakeClient: + def __init__(self, token: str): + self.token = token + self.audit_supported = True + + def list_org_repos(self, _org: str): + return [ + { + "name": "zeta", + "created_at": "2026-05-21T00:00:00Z", + "updated_at": "2026-05-22T00:00:00Z", + "pushed_at": "2026-05-22T01:00:00Z", + }, + { + "name": "alpha", + "created_at": "2026-05-20T00:00:00Z", + "updated_at": "2026-05-20T01:00:00Z", + "pushed_at": None, + }, + ] + + def get_repo_creators(self, _org: str): + return {"alpha": ("alice", "2026-05-20T00:30:00Z")} + + def get_latest_updaters(self, _org: str, _repo_names): + return {"zeta": "zane", "alpha": ""} + + with tempfile.TemporaryDirectory() as td: + output_csv = Path(td) / "out" / "report.csv" + output_md = Path(td) / "out" / "report.md" + + env = { + "GH_TOKEN": "fake-token", + "ORG_NAME": "morganstanley", + "OUTPUT_CSV": str(output_csv), + "OUTPUT_MD": str(output_md), + } + + with mock.patch.object(report, "GitHubClient", FakeClient): + with mock.patch.dict(os.environ, env, clear=True): + exit_code = report.main() + + self.assertEqual(exit_code, 0) + self.assertTrue(output_csv.exists()) + self.assertTrue(output_md.exists()) + + csv_text = output_csv.read_text(encoding="utf-8") + self.assertLess(csv_text.find("alpha"), csv_text.find("zeta")) + self.assertIn(report.UNKNOWN_VALUE, csv_text) + + +if __name__ == "__main__": + unittest.main() From a0bf51633273f5a6fdfadd246f658e3ffc70bcfc Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 19:13:14 +0000 Subject: [PATCH 10/15] Refactor report script change detection to use GitHub API for improved accuracy in pull request validation --- .github/workflows/org-repo-report.yml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/.github/workflows/org-repo-report.yml b/.github/workflows/org-repo-report.yml index bf851d1ef..ca1fe1e5a 100644 --- a/.github/workflows/org-repo-report.yml +++ b/.github/workflows/org-repo-report.yml @@ -34,11 +34,22 @@ jobs: - name: Detect report script changes if: ${{ github.event_name == 'pull_request' }} id: report-script-changes - uses: dorny/paths-filter@v3 + uses: actions/github-script@v7 with: - filters: | - report_script: - - 'reporting/generate_org_repo_report.py' + script: | + const files = await github.paginate(github.rest.pulls.listFiles, { + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.payload.pull_request.number, + per_page: 100, + }); + + const reportScriptChanged = files.some( + (file) => file.filename === 'reporting/generate_org_repo_report.py' + ); + + core.info(`report_script changed: ${reportScriptChanged}`); + core.setOutput('report_script', reportScriptChanged ? 'true' : 'false'); - name: Run report script tests if: ${{ github.event_name == 'pull_request' && steps.report-script-changes.outputs.report_script == 'true' }} From 526d1cb6d75aed411764842e51030e24e6b02663 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 19:19:28 +0000 Subject: [PATCH 11/15] Refactor import statement for report module to improve code organization --- reporting/tests/test_generate_org_repo_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reporting/tests/test_generate_org_repo_report.py b/reporting/tests/test_generate_org_repo_report.py index 9d891d65b..0585a021e 100644 --- a/reporting/tests/test_generate_org_repo_report.py +++ b/reporting/tests/test_generate_org_repo_report.py @@ -5,12 +5,12 @@ from pathlib import Path from unittest import mock +import generate_org_repo_report as report + SCRIPT_DIR = Path(__file__).resolve().parents[1] if str(SCRIPT_DIR) not in os.sys.path: os.sys.path.insert(0, str(SCRIPT_DIR)) -import generate_org_repo_report as report - class TestHelpers(unittest.TestCase): def test_normalize_timestamp_formats_utc(self) -> None: From 38f550cc5677c4eb97ea869529d92191b6103c90 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 19:24:05 +0000 Subject: [PATCH 12/15] Refactor GitHubClient to improve data handling and simplify audit support logic; update tests to maintain import consistency --- reporting/generate_org_repo_report.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/reporting/generate_org_repo_report.py b/reporting/generate_org_repo_report.py index 0c34f14fb..dc9afcced 100644 --- a/reporting/generate_org_repo_report.py +++ b/reporting/generate_org_repo_report.py @@ -292,9 +292,13 @@ def get_latest_updaters(self, org: str, repo_names: List[str]) -> Dict[str, str] continue data = self._graphql_request(query, {"org": org, "name": repo_name}) + data_payload = data.get("data") + if not isinstance(data_payload, dict): + continue + repository = ( - data.get("data", {}).get("repository", {}) - if isinstance(data.get("data"), dict) + data_payload.get("repository", {}) + if isinstance(data_payload.get("repository"), dict) else {} ) if not isinstance(repository, dict) or not repository: @@ -497,11 +501,8 @@ def main() -> int: rows.sort(key=lambda x: x["repo_name"].lower()) write_csv(rows, output_csv) - audit_supported = ( - True - if client.audit_supported is True - else False if client.audit_supported is False else None - ) + # Treat unknown audit support as available to avoid false warning text. + audit_supported = client.audit_supported is not False write_markdown(rows, output_md, org=org, audit_supported=audit_supported) print(f"Wrote {len(rows)} rows to {output_csv} and {output_md}") From d92cf9a8fd4a6701bbdc91085f6fd45b5325faa4 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 19:30:36 +0000 Subject: [PATCH 13/15] Fix import statement for report module to enhance code organization --- reporting/tests/test_generate_org_repo_report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reporting/tests/test_generate_org_repo_report.py b/reporting/tests/test_generate_org_repo_report.py index 0585a021e..9d891d65b 100644 --- a/reporting/tests/test_generate_org_repo_report.py +++ b/reporting/tests/test_generate_org_repo_report.py @@ -5,12 +5,12 @@ from pathlib import Path from unittest import mock -import generate_org_repo_report as report - SCRIPT_DIR = Path(__file__).resolve().parents[1] if str(SCRIPT_DIR) not in os.sys.path: os.sys.path.insert(0, str(SCRIPT_DIR)) +import generate_org_repo_report as report + class TestHelpers(unittest.TestCase): def test_normalize_timestamp_formats_utc(self) -> None: From 7165a7b283828dc2ad50add16cee9e294b7922c1 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 19:36:19 +0000 Subject: [PATCH 14/15] Fix import statement for report module to comply with linting rules --- reporting/tests/test_generate_org_repo_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reporting/tests/test_generate_org_repo_report.py b/reporting/tests/test_generate_org_repo_report.py index 9d891d65b..67c4264c2 100644 --- a/reporting/tests/test_generate_org_repo_report.py +++ b/reporting/tests/test_generate_org_repo_report.py @@ -9,7 +9,7 @@ if str(SCRIPT_DIR) not in os.sys.path: os.sys.path.insert(0, str(SCRIPT_DIR)) -import generate_org_repo_report as report +import generate_org_repo_report as report # noqa: E402 class TestHelpers(unittest.TestCase): From 889ba605e31a6ff3a8e1d688b11aa50bcda1fce1 Mon Sep 17 00:00:00 2001 From: Mimi Flynn Date: Fri, 22 May 2026 19:52:59 +0000 Subject: [PATCH 15/15] Remove workflow_dispatch and pull_request triggers from org-repo-report.yml to streamline execution --- .github/workflows/org-repo-report.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/org-repo-report.yml b/.github/workflows/org-repo-report.yml index ca1fe1e5a..c8b525c30 100644 --- a/.github/workflows/org-repo-report.yml +++ b/.github/workflows/org-repo-report.yml @@ -1,11 +1,6 @@ name: Organization Repository Report on: - workflow_dispatch: - pull_request: - paths: - - '.github/workflows/org-repo-report.yml' - - 'reporting/generate_org_repo_report.py' schedule: - cron: '0 6 * * 1'