From 7cae5016b42f571c2cad9b4350d9b9154c923a94 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Fri, 30 May 2025 16:50:48 -0700 Subject: [PATCH 1/7] Add ci to each pull request that runs tests and lints --- .github/workflows/python-ci.yml | 36 +++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .github/workflows/python-ci.yml diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml new file mode 100644 index 0000000..a3aed2a --- /dev/null +++ b/.github/workflows/python-ci.yml @@ -0,0 +1,36 @@ +--- +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.11, 3.12] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install -r requirements.txt -r requirements-test.txt + - name: Lint with flake8 and pylint + run: | + make lint + - name: Test with pytest + run: | + make test From 1d92be3ff1cf7e09c879c61bc9a7de25049dde93 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Fri, 30 May 2025 16:53:24 -0700 Subject: [PATCH 2/7] docs: update docs link --- .github/workflows/python-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index a3aed2a..ebc75bc 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -1,6 +1,6 @@ --- # This workflow will install Python 
dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions +# For more information see: https://docs.github.com/en/actions/use-cases-and-examples/building-and-testing/building-and-testing-python name: Python package From 0ea54ce50bfc2abedbc93ecfb5a094de0e1b19a2 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Fri, 30 May 2025 16:56:11 -0700 Subject: [PATCH 3/7] perf: moar speed if we cache! Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .github/workflows/python-ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index ebc75bc..b5a46c3 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -25,6 +25,13 @@ jobs: uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} + - name: Cache pip dependencies + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt', '**/requirements-test.txt') }} + restore-keys: | + ${{ runner.os }}-pip- - name: Install dependencies run: | pip install -r requirements.txt -r requirements-test.txt From 36a223549e77e69509dfe9fcd2c377b34d23156e Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Fri, 30 May 2025 20:21:36 -0700 Subject: [PATCH 4/7] fix: lint fixes and docstrings Signed-off-by: Zack Koppert --- test_user_engagement_metrics.py | 62 ++++++++++- user_engagement_metrics.py | 179 +++++++++++++++++++++++++------- 2 files changed, 201 insertions(+), 40 deletions(-) diff --git a/test_user_engagement_metrics.py b/test_user_engagement_metrics.py index ff2f65b..d5411b5 100644 --- a/test_user_engagement_metrics.py +++ b/test_user_engagement_metrics.py @@ -1,3 +1,10 @@ +""" +Test suite for the GitHub User Engagement Metrics module. 
+ +This module contains unit tests for all the functions in the user_engagement_metrics.py +module, including API interactions, file operations, and data processing logic. +""" + import json import time from unittest.mock import MagicMock @@ -8,6 +15,16 @@ @pytest.fixture(autouse=True) def patch_globals(tmp_path, monkeypatch): + """ + Fixture to patch global file paths to use temporary test directories. + + This automatically runs for all tests, ensuring that tests don't use + or modify the real data files. + + Args: + tmp_path: pytest fixture providing a temporary directory + monkeypatch: pytest fixture for modifying objects + """ # Patch file paths to use test dir monkeypatch.setattr( user_engagement_metrics, "USERNAMES_FILE", str(tmp_path / "usernames.txt") @@ -24,6 +41,12 @@ def patch_globals(tmp_path, monkeypatch): def test_safe_get_rate_limit(monkeypatch): + """ + Test that safe_get handles GitHub API rate limits correctly. + + This test verifies that when a rate limit response is received, + the function waits and retries the request. + """ m_resp = MagicMock() m_resp.status_code = 403 m_resp.headers = { @@ -34,7 +57,8 @@ def test_safe_get_rate_limit(monkeypatch): call_count = {"count": 0} - def fake_requests_get(*a, **kw): + def fake_requests_get(*_a, **_kw): + """Fake requests.get to simulate rate limiting.""" call_count["count"] += 1 # simulate second call as success if call_count["count"] > 1: @@ -52,6 +76,9 @@ def fake_requests_get(*a, **kw): def test_get_user_profile(monkeypatch): + """ + Test that get_user_profile correctly calls the GitHub API and processes the result. + """ monkeypatch.setattr( user_engagement_metrics, "safe_get", @@ -61,12 +88,13 @@ def test_get_user_profile(monkeypatch): def test_get_user_repos(monkeypatch): + """ + Test that get_user_repos correctly handles API pagination. 
+ """ # Simulate 2 pages, then empty - responses = [ - [{"id": 1}, {"id": 2}] - ] + responses = [[{"id": 1}, {"id": 2}]] - def safe_get(url, params=None): + def safe_get(_url, _params=None): return MagicMock(json=lambda: responses.pop(0)) monkeypatch.setattr(user_engagement_metrics, "safe_get", safe_get) @@ -75,6 +103,9 @@ def safe_get(url, params=None): def test_get_starred_repos_count_no_link(monkeypatch): + """ + Test that get_starred_repos_count works correctly when no pagination Link header is present. + """ m_resp = MagicMock() m_resp.headers = {} m_resp.json.return_value = [1, 2, 3] @@ -83,6 +114,9 @@ def test_get_starred_repos_count_no_link(monkeypatch): def test_get_starred_repos_count_with_link(monkeypatch): + """ + Test that get_starred_repos_count correctly parses the Link header for total count. + """ m_resp = MagicMock() m_resp.headers = { "Link": '; rel="last"' @@ -93,6 +127,9 @@ def test_get_starred_repos_count_with_link(monkeypatch): def test_get_orgs(monkeypatch): + """ + Test that get_orgs correctly processes the API response. + """ monkeypatch.setattr( user_engagement_metrics, "safe_get", @@ -102,6 +139,9 @@ def test_get_orgs(monkeypatch): def test_search_user_contributions_commit(monkeypatch): + """ + Test that search_user_contributions correctly handles commit searches. + """ m_resp = MagicMock() m_resp.json.return_value = {"total_count": 123} monkeypatch.setattr( @@ -113,6 +153,9 @@ def test_search_user_contributions_commit(monkeypatch): def test_search_user_contributions_issue(monkeypatch): + """ + Test that search_user_contributions correctly handles issue searches. + """ m_resp = MagicMock() m_resp.json.return_value = {"total_count": 99} monkeypatch.setattr( @@ -124,6 +167,9 @@ def test_search_user_contributions_issue(monkeypatch): def test_load_completed_usernames(tmp_path, monkeypatch): + """ + Test that load_completed_usernames correctly reads and processes the checkpoint file. 
+ """ file_path = tmp_path / "completed_usernames.txt" file_path.write_text("a\nb\n\nc\n") monkeypatch.setattr(user_engagement_metrics, "CHECKPOINT_FILE", str(file_path)) @@ -131,6 +177,9 @@ def test_load_completed_usernames(tmp_path, monkeypatch): def test_append_completed_username(tmp_path, monkeypatch): + """ + Test that append_completed_username correctly writes to the checkpoint file. + """ file_path = tmp_path / "completed_usernames.txt" monkeypatch.setattr(user_engagement_metrics, "CHECKPOINT_FILE", str(file_path)) user_engagement_metrics.append_completed_username("dude") @@ -138,6 +187,9 @@ def test_append_completed_username(tmp_path, monkeypatch): def test_append_result(tmp_path, monkeypatch): + """ + Test that append_result correctly writes results to the output file. + """ file_path = tmp_path / "user_results.jsonl" monkeypatch.setattr(user_engagement_metrics, "OUTPUT_FILE", str(file_path)) user_engagement_metrics.append_result({"foo": "bar"}) diff --git a/user_engagement_metrics.py b/user_engagement_metrics.py index 4168a1b..a6d45e1 100644 --- a/user_engagement_metrics.py +++ b/user_engagement_metrics.py @@ -1,7 +1,23 @@ +""" +GitHub User Engagement Metrics Collector + +This module fetches and aggregates GitHub user engagement metrics for a list of usernames. +It collects data on repositories, contributions, organizations, and user profiles from the +GitHub API. The script handles rate limiting, retries, and checkpointing to resume +interrupted operations. + +Usage: + 1. Add GitHub usernames to 'usernames.txt' (one per line) + 2. Replace 'your_token' with a valid GitHub API token + 3. Run the script to collect metrics + 4. Results are stored in 'user_results.jsonl' +""" + import json import os import random import time +from re import search import requests @@ -16,12 +32,31 @@ def safe_get(url, params=None, extra_headers=None, max_retries=5): + """ + Make a GET request to the GitHub API with automatic rate limit handling and retries. 
+ + This function handles rate limits by sleeping until the reset time when limits are hit. + It also implements exponential backoff for server errors. + + Args: + url (str): The API endpoint URL to request + params (dict, optional): Query parameters for the request. Defaults to None. + extra_headers (dict, optional): Additional headers to include in the request. + Defaults to None. + max_retries (int, optional): Maximum number of retry attempts for failed requests. + Defaults to 5. + + Returns: + requests.Response: The response object from the successful request + """ retries = 0 while True: combined_headers = headers.copy() if extra_headers: combined_headers.update(extra_headers) - response = requests.get(url, headers=combined_headers, params=params) + response = requests.get( + url, headers=combined_headers, params=params, timeout=10 + ) if response.status_code == 403: remaining = int(response.headers.get("X-RateLimit-Remaining", "1")) if remaining == 0: @@ -44,57 +79,107 @@ def safe_get(url, params=None, extra_headers=None, max_retries=5): time.sleep(wait) retries += 1 continue - else: - print(f"Max retries reached for {url}. Skipping.") - return response + print(f"Max retries reached for {url}. Skipping.") + return response return response -def get_user_profile(username): - url = f"{GITHUB_API}/users/{username}" +def get_user_profile(user): + """ + Fetch a GitHub user's profile information. + + Args: + user (str): The GitHub username to fetch profile for + + Returns: + dict: User profile data from the GitHub API + """ + url = f"{GITHUB_API}/users/{user}" return safe_get(url).json() -def get_user_repos(username): - repos = [] +def get_user_repos(user_name): + """ + Fetch all public repositories for a GitHub user. + + This function handles pagination to retrieve all repositories even if + the user has more than 100 repos (the API's maximum page size). 
+ + Args: + user_name (str): The GitHub username to fetch repositories for + + Returns: + list: A list of repository objects from the GitHub API + """ + repositories = [] page = 1 while True: - url = f"{GITHUB_API}/users/{username}/repos" + url = f"{GITHUB_API}/users/{user_name}/repos" params = {"per_page": 100, "page": page} res = safe_get(url, params=params).json() if not res or "message" in res: break - repos.extend(res) + repositories.extend(res) if len(res) < 100: break page += 1 - return repos + return repositories + +def get_starred_repos_count(user): + """ + Get the total count of repositories starred by a GitHub user. -def get_starred_repos_count(username): - url = f"{GITHUB_API}/users/{username}/starred" + This function efficiently determines the count by examining the pagination + links rather than fetching all starred repos. + + Args: + user (str): The GitHub username to check + + Returns: + int: The number of repositories starred by the user + """ + url = f"{GITHUB_API}/users/{user}/starred" params = {"per_page": 1} res = safe_get(url, params=params) link = res.headers.get("Link", "") if 'rel="last"' in link: - import re - - match = re.search(r'page=(\d+)>; rel="last"', link) + match = search(r'page=(\d+)>; rel="last"', link) if match: return int(match.group(1)) return len(res.json()) -def get_orgs(username): - url = f"{GITHUB_API}/users/{username}/orgs" +def get_orgs(user): + """ + Fetch all organizations a GitHub user is a member of. + + Args: + user (str): The GitHub username to check + + Returns: + list: A list of organization objects from the GitHub API + """ + url = f"{GITHUB_API}/users/{user}/orgs" return safe_get(url).json() -def search_user_contributions(username, type_): +def search_user_contributions(user, type_): + """ + Search for a user's public contributions of a specific type. 
+ + Args: + user (str): The GitHub username to check + type_ (str): The type of contribution to search for: + 'pr' (pull requests), 'issue', or 'commit' + + Returns: + int: The total count of contributions of the specified type + """ q_map = { - "pr": f"type:pr author:{username}", - "issue": f"type:issue author:{username}", - "commit": f"author:{username}", + "pr": f"type:pr author:{user}", + "issue": f"type:issue author:{user}", + "commit": f"author:{user}", } if type_ == "commit": url = f"{GITHUB_API}/search/commits" @@ -107,26 +192,47 @@ def search_user_contributions(username, type_): def load_completed_usernames(): + """ + Load the set of usernames that have already been processed. + + This function reads the checkpoint file to determine which users + have been successfully processed in previous runs. + + Returns: + set: A set of usernames that have already been processed + """ if os.path.exists(CHECKPOINT_FILE): - with open(CHECKPOINT_FILE, "r") as f: - return set(line.strip() for line in f if line.strip()) + with open(CHECKPOINT_FILE, "r", encoding="utf-8") as checkpoint_file_load: + return set(line.strip() for line in checkpoint_file_load if line.strip()) return set() -def append_completed_username(username): - with open(CHECKPOINT_FILE, "a") as f: - f.write(username + "\n") +def append_completed_username(completed_username): + """ + Mark a username as completed by adding it to the checkpoint file. + Args: + completed_username (str): The GitHub username to mark as completed + """ + with open(CHECKPOINT_FILE, "a", encoding="utf-8") as checkpoint_file_append: + checkpoint_file_append.write(completed_username + "\n") -def append_result(result): - with open(OUTPUT_FILE, "a") as f: - f.write(json.dumps(result) + "\n") +def append_result(user_result): + """ + Append a user's engagement metrics to the output file. 
-if __name__ == "__main__": + Args: + user_result (dict): The user's engagement metrics to save + """ + with open(OUTPUT_FILE, "a", encoding="utf-8") as output_file_append: + output_file_append.write(json.dumps(user_result) + "\n") + + +if __name__ == "__main__": # pragma: no cover completed = load_completed_usernames() - with open(USERNAMES_FILE, "r") as f: - usernames = [line.strip() for line in f if line.strip()] + with open(USERNAMES_FILE, "r", encoding="utf-8") as file: + usernames = [line.strip() for line in file if line.strip()] print(f"Loaded {len(usernames)} usernames, {len(completed)} already completed.") if TOKEN == "your_token": @@ -162,7 +268,10 @@ def append_result(result): "total_public_commits": commit_count, } append_result(result) - except Exception as e: - print(f"Error processing {username}: {e}") + except requests.RequestException as e: + print(f"Network error processing {username}: {e}") + continue + except (KeyError, ValueError) as e: + print(f"Data error processing {username}: {e}") continue append_completed_username(username) From 5a4e28becd5e672960d8eff7750d9b94aafd1ba2 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Fri, 30 May 2025 20:24:53 -0700 Subject: [PATCH 5/7] fix: update required packages for testing Signed-off-by: Zack Koppert --- requirements-test.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index fbf31c1..93cc3c4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,4 +4,5 @@ flake8 isort pylint mypy -black \ No newline at end of file +black +types-requests From 3df64ff64b329345cbcb4a1093401badfe27a0d0 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Fri, 30 May 2025 20:33:39 -0700 Subject: [PATCH 6/7] fix: do throw away in test using python kwargs Signed-off-by: Zack Koppert --- test_user_engagement_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_user_engagement_metrics.py 
b/test_user_engagement_metrics.py index d5411b5..d776851 100644 --- a/test_user_engagement_metrics.py +++ b/test_user_engagement_metrics.py @@ -94,7 +94,7 @@ def test_get_user_repos(monkeypatch): # Simulate 2 pages, then empty responses = [[{"id": 1}, {"id": 2}]] - def safe_get(_url, _params=None): + def safe_get(_url, **kwargs): return MagicMock(json=lambda: responses.pop(0)) monkeypatch.setattr(user_engagement_metrics, "safe_get", safe_get) From 0ef8bb4e23c3ff95fd49f32b737cf055fa210651 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Fri, 30 May 2025 20:37:02 -0700 Subject: [PATCH 7/7] fix: use pinned hash in action reference --- .github/workflows/python-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index b5a46c3..33fa9ad 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -26,7 +26,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Cache pip dependencies - uses: actions/cache@v3 + uses: actions/cache@2f8e54208210a422b2efd51efaa6bd6d7ca8920f # v3.4.3 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt', '**/requirements-test.txt') }}