diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..d77e706 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,44 @@ +name: Tests and Linting + +on: + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.14.2" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -e ".[dev]" + - name: Run unit tests with coverage + run: | + pytest -m "not integration and not slow" --cov=main --cov-report=term-missing --cov-fail-under=80 + - name: Run all tests + run: | + pytest --cov=main --cov-report=xml --cov-report=html + - name: Upload coverage reports + uses: actions/upload-artifact@v4 + with: + name: coverage-reports + path: | + htmlcov/ + coverage.xml + + integration-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install docker-compose + run: sudo apt update && sudo apt install -y docker-compose + - name: Run integration test with docker-compose + run: docker-compose up --build --abort-on-container-exit --exit-code-from github-etl + - name: Cleanup + if: always() + run: docker-compose down -v diff --git a/Dockerfile b/Dockerfile index 5608295..bec1ed8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Use the latest stable Python image -FROM python:3.11-slim +FROM python:3.14.2-slim # Set environment variables ENV PYTHONDONTWRITEBYTECODE=1 \ @@ -34,4 +34,4 @@ RUN chown -R app:app /app USER app # Set the default command -CMD ["python", "main.py"] \ No newline at end of file +CMD ["python", "main.py"] diff --git a/Dockerfile.mock b/Dockerfile.mock index 1098382..cf46078 100644 --- a/Dockerfile.mock +++ b/Dockerfile.mock @@ -1,5 +1,5 @@ # Dockerfile for mock GitHub API service -FROM python:3.11-slim +FROM python:3.14.2-slim WORKDIR /app diff --git a/README.md b/README.md index 80a3afe..570bacb 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ docker run --rm \ ### Container Specifications -- **Base Image**: `python:3.11-slim` (latest stable Python) +- **Base Image**: `python:3.14.2-slim` (latest stable Python) - **User**: `app` (uid: 1000, gid: 1000) - **Working Directory**: `/app` - **Ownership**: All files in `/app` are owned by the `app` user diff --git a/TESTING.md b/TESTING.md new file mode 100644 index 0000000..6901d2f --- /dev/null +++ b/TESTING.md @@ -0,0 +1,621 @@ +# Testing Guide for GitHub ETL + +This document describes comprehensive testing for the GitHub ETL pipeline, including +unit tests, integration tests, Docker testing, linting, and CI/CD workflows. + +## Table of Contents + +1. [Unit Testing](#unit-testing) +2. [Test Organization](#test-organization) +3. [Running Tests](#running-tests) +4. [Code Coverage](#code-coverage) +5. [Linting and Code Quality](#linting-and-code-quality) +6. [CI/CD Integration](#cicd-integration) +7. [Docker Testing](#docker-testing) +8. [Adding New Tests](#adding-new-tests) + +--- + +## Unit Testing + +The test suite in `test_main.py` provides comprehensive coverage for all functions in `main.py`. +We have unit tests covering 9 functions with 80%+ code coverage requirement. + +### Test Structure + +Tests are organized into 10 test classes: + +1. **TestSetupLogging** - Logging configuration +2. **TestSleepForRateLimit** - Rate limit handling +3. **TestExtractPullRequests** - PR extraction with pagination and enrichment +4. 
**TestExtractCommits** - Commit and file extraction +5. **TestExtractReviewers** - Reviewer extraction +6. **TestExtractComments** - Comment extraction (uses /issues endpoint) +7. **TestTransformData** - Data transformation for all 4 BigQuery tables +8. **TestLoadData** - BigQuery data loading +9. **TestMain** - Main ETL orchestration +10. **TestIntegration** - End-to-end integration tests (marked with `@pytest.mark.integration`) + +### Fixtures + +Reusable fixtures are defined at the top of `test_main.py`: + +- `mock_session` - Mocked `requests.Session` +- `mock_bigquery_client` - Mocked BigQuery client +- `mock_pr_response` - Realistic pull request response +- `mock_commit_response` - Realistic commit with files +- `mock_reviewer_response` - Realistic reviewer response +- `mock_comment_response` - Realistic comment response + +## Test Organization + +### Function Coverage + +| Function | Coverage Target | Key Test Areas | +|----------|------------------|----------------| +| `setup_logging()` | 100% | Logger configuration | +| `sleep_for_rate_limit()` | 100% | Rate limit sleep logic, edge cases | +| `extract_pull_requests()` | 90%+ | Pagination, rate limits, enrichment, error handling | +| `extract_commits()` | 85%+ | Commit/file fetching, rate limits, errors | +| `extract_reviewers()` | 85%+ | Reviewer states, rate limits, errors | +| `extract_comments()` | 85%+ | Comment fetching (via /issues), rate limits | +| `transform_data()` | 95%+ | Bug ID extraction, 4 tables, field mapping | +| `load_data()` | 90%+ | BigQuery insertion, snapshot dates, errors | +| `main()` | 85%+ | Env vars, orchestration, chunking | + +**Overall Target: 85-90% coverage** (80% minimum enforced in CI) + +### Critical Test Cases + +#### Bug ID Extraction +Tests verify the regex pattern matches: +- `Bug 1234567 - Fix` → 1234567 +- `bug 1234567` → 1234567 +- `b=1234567` → 1234567 +- `Bug #1234567` → 1234567 +- Filters out IDs >= 100000000 + +#### Data Transformation +Tests ensure correct transformation for all 4 BigQuery tables: +- **pull_requests**: PR metadata, bug IDs, labels, date_approved +- **commits**: Flattened files (one row per file), commit metadata +- **reviewers**: Review states, date_approved calculation +- **comments**: Character count, status mapping from reviews + +#### Rate Limiting +Tests verify rate limit handling at all API levels: +- Pull requests pagination +- Commit fetching +- Reviewer fetching +- Comment fetching + +## Running Tests + +### All Tests with Coverage + +```bash +pytest +``` + +This runs all tests with coverage reporting (configured in `pytest.ini`). + +### Fast Unit Tests Only (Skip Integration) + +```bash +pytest -m "not integration and not slow" +``` + +Use this for fast feedback during development. 
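+
+The `-m` filter only excludes tests that actually carry the `integration` or `slow`
+markers registered in the pytest configuration. A minimal sketch of how a test opts
+into those markers (the test names below are hypothetical; only the decorators matter):
+
+```python
+import pytest
+
+
+@pytest.mark.integration
+def test_full_pipeline_against_mock_services():
+    """Selected by `pytest -m integration`, skipped by the fast suite."""
+    ...
+
+
+@pytest.mark.slow
+def test_pagination_over_many_pages():
+    """Excluded by -m "not integration and not slow"."""
+    ...
+```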
+ +### Specific Test Class + +```bash +pytest test_main.py::TestTransformData +``` + +### Specific Test Function + +```bash +pytest test_main.py::TestTransformData::test_bug_id_extraction_basic -v +``` + +### With Verbose Output + +```bash +pytest -v +``` + +### With Coverage Report + +```bash +# Terminal report +pytest --cov=main --cov-report=term-missing + +# HTML report +pytest --cov=main --cov-report=html +open htmlcov/index.html +``` + +### Integration Tests Only + +```bash +pytest -m integration +``` + +## Code Coverage + +### Coverage Requirements + +- **Minimum**: 80% (enforced in CI via `--cov-fail-under=80`) +- **Target**: 85-90% +- **Current**: Run `pytest --cov=main` to see current coverage + +### Coverage Configuration + +Coverage settings are in `pytest.ini`: + +```ini +[pytest] +addopts = + --cov=main + --cov-report=term-missing + --cov-report=html + --cov-branch + --cov-fail-under=80 +``` + +### Viewing Coverage + +```bash +# Generate HTML coverage report +pytest --cov=main --cov-report=html + +# Open in browser +xdg-open htmlcov/index.html # Linux +open htmlcov/index.html # macOS +``` + +The HTML report shows: +- Line-by-line coverage +- Branch coverage +- Missing lines highlighted +- Per-file coverage percentages + +## Linting and Code Quality + +### Available Linters + +The project uses these linting tools (defined in `requirements.txt`): + +- **black** - Code formatting +- **isort** - Import sorting +- **flake8** - Style and syntax checking +- **mypy** - Static type checking + +### Running Linters + +```bash +# Run black (auto-format) +black main.py test_main.py + +# Check formatting without changes +black --check main.py test_main.py + +# Sort imports +isort main.py test_main.py + +# Check import sorting +isort --check-only main.py test_main.py + +# Run flake8 +flake8 main.py test_main.py --max-line-length=100 --extend-ignore=E203,W503 + +# Run mypy +mypy main.py --no-strict-optional --ignore-missing-imports +``` + +### All Linting Checks + +```bash +# Run all linters in sequence +black --check main.py test_main.py && \ +isort --check-only main.py test_main.py && \ +flake8 main.py test_main.py --max-line-length=100 --extend-ignore=E203,W503 && \ +mypy main.py --no-strict-optional --ignore-missing-imports +``` + +## CI/CD Integration + +### GitHub Actions Workflow + +The `.github/workflows/tests.yml` workflow runs on every pull request: + +**Lint Job:** +1. Runs black (format check) +2. Runs isort (import check) +3. Runs flake8 (style check) +4. Runs mypy (type check) + +**Test Job:** +1. Runs fast unit tests with 80% coverage threshold +2. Runs all tests (including integration) +3. Uploads coverage reports as artifacts + +### Workflow Triggers + +- Pull requests to `main` branch + +### Viewing Results + +- Check the Actions tab in GitHub +- Coverage artifacts are uploaded for each run +- Failed linting or tests will block merges + +## Docker Testing + +## Overview + +The `docker-compose.yml` configuration provides a complete local testing environment with: + +1. **Mock GitHub API** - A Flask-based mock service that simulates the GitHub Pull Requests API +2. **BigQuery Emulator** - A local BigQuery instance for testing data loads +3. 
**ETL Service** - The main GitHub ETL application configured to use the mock services + +## Quick Start + +### Start all services + +```bash +docker-compose up --build +``` + +This will: + +- Build and start the mock GitHub API (port 5000) +- Start the BigQuery emulator (ports 9050, 9060) +- Build and run the ETL service + +The ETL service will automatically: + +- Fetch 250 mock pull requests from the mock GitHub API +- Transform the data +- Load it into the BigQuery emulator + +### View logs + +```bash +# All services +docker-compose logs -f + +# Specific service +docker-compose logs -f github-etl +docker-compose logs -f bigquery-emulator +docker-compose logs -f mock-github-api +``` + +### Stop services + +```bash +docker-compose down +``` + +## Architecture + +### Mock GitHub API Service + +- **Port**: 5000 +- **Endpoint**: `http://localhost:5000/repos/{owner}/{repo}/pulls` +- **Mock data**: Generates 250 sample pull requests with realistic data +- **Features**: + - Pagination support (per_page, page parameters) + - Realistic PR data (numbers, titles, states, timestamps, users, etc.) + - Mock rate limit headers + - No authentication required + +### BigQuery Emulator Service + +- **Ports**: + - 9050 (BigQuery API) + - 9060 (Discovery/Admin API) +- **Configuration**: Uses `data.yml` to define the schema +- **Project**: test +- **Dataset**: github_etl +- **Table**: pull_requests + +### ETL Service + +The ETL service is configured via environment variables in `docker-compose.yml`: + +```yaml +environment: + GITHUB_REPOS: "mozilla-firefox/firefox" + GITHUB_TOKEN: "" # Not needed for mock API + GITHUB_API_URL: "http://mock-github-api:5000" + BIGQUERY_PROJECT: "test" + BIGQUERY_DATASET: "github_etl" + BIGQUERY_EMULATOR_HOST: "http://bigquery-emulator:9050" +``` + +## Customization + +### Using Real GitHub API + +To test with the real GitHub API instead of the mock: + +1. Set `GITHUB_TOKEN` environment variable +2. Remove or comment out `GITHUB_API_URL` in docker-compose.yml +3. Update `depends_on` to not require mock-github-api + +```bash +export GITHUB_TOKEN="your_github_token" +docker-compose up github-etl bigquery-emulator +``` + +### Adjusting Mock Data + +Edit `mock_github_api.py` to customize: + +- Total number of PRs (default: 250) +- PR field values +- Pagination behavior + +### Modifying BigQuery Schema + +Edit `data.yml` to change the table schema. The schema matches the fields +extracted in `main.py`'s `transform_data()` function. + +## Querying the BigQuery Emulator + +You can query the BigQuery emulator using the BigQuery Python client: + +```python +from google.cloud import bigquery +from google.api_core.client_options import ClientOptions + +client = bigquery.Client( + project="test-project", + client_options=ClientOptions(api_endpoint="http://localhost:9050") +) + +query = """ +SELECT pr_number, title, state, user_login +FROM `test-project.test_dataset.pull_requests` +LIMIT 10 +""" + +for row in client.query(query): + print(f"PR #{row.pr_number}: {row.title} - {row.state}") +``` + +Or use the `bq` command-line tool with the emulator endpoint. 
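+
+For example, the `bq` CLI can be pointed at the emulator through its global `--api`
+flag (a sketch, assuming that flag is available in your `bq` version, and using the
+project and dataset configured in `docker-compose.yml`):
+
+```bash
+bq --api http://localhost:9050 query --project_id=test --use_legacy_sql=false \
+  'SELECT pr_number, title, state FROM github_etl.pull_requests LIMIT 10'
+```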
+ +## Troubleshooting + +### Services not starting + +Check if ports are already in use: + +```bash +lsof -i :5000 # Mock GitHub API +lsof -i :9050 # BigQuery emulator +``` + +### ETL fails to connect + +Ensure services are healthy: + +```bash +docker-compose ps +``` + +Check service logs: + +```bash +docker-compose logs bigquery-emulator +docker-compose logs mock-github-api +``` + +### Schema mismatch errors + +Verify `data.yml` schema matches fields in `main.py:transform_data()`. + +## Development Workflow + +1. Make changes to `main.py` +2. Restart the ETL service: `docker-compose restart github-etl` +3. View logs: `docker-compose logs -f github-etl` + +The `main.py` file is mounted as a volume, so changes are reflected without rebuilding. + +## Cleanup + +Remove all containers and volumes: + +```bash +docker-compose down -v +``` + +Remove built images: + +```bash +docker-compose down --rmi all +``` + +--- + +## Adding New Tests + +### Testing Patterns + +#### 1. Mock External Dependencies + +Always mock external API calls and BigQuery operations: + +```python +@patch("requests.Session") +def test_api_call(mock_session_class): + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [{"id": 1}] + + mock_session.get.return_value = mock_response + # Test code here +``` + +#### 2. Use Fixtures + +Leverage existing fixtures for common test data: + +```python +def test_with_fixtures(mock_session, mock_pr_response): + # Use mock_session and mock_pr_response + pass +``` + +#### 3. Test Edge Cases + +Always test: +- Empty inputs +- None values +- Missing fields +- Rate limits +- API errors (404, 500, etc.) +- Boundary conditions + +#### 4. Verify Call Arguments + +Check that functions are called with correct parameters: + +```python +mock_extract.assert_called_once_with( + session=mock_session, + repo="mozilla/firefox", + github_api_url="https://api.github.com" +) +``` + +### Example: Adding a New Test + +```python +class TestNewFunction: + """Tests for new_function.""" + + def test_basic_functionality(self, mock_session): + """Test basic happy path.""" + # Arrange + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {"result": "success"} + mock_session.get.return_value = mock_response + + # Act + result = main.new_function(mock_session, "arg1") + + # Assert + assert result == {"result": "success"} + mock_session.get.assert_called_once() + + def test_error_handling(self, mock_session): + """Test error handling.""" + mock_response = Mock() + mock_response.status_code = 500 + mock_response.text = "Internal Error" + mock_session.get.return_value = mock_response + + with pytest.raises(SystemExit) as exc_info: + main.new_function(mock_session, "arg1") + + assert "500" in str(exc_info.value) +``` + +### Test Organization Guidelines + +1. **Group related tests** in test classes +2. **Use descriptive names** like `test_handles_rate_limit_on_commits` +3. **One assertion concept per test** - Test one thing at a time +4. **Arrange-Act-Assert pattern** - Structure tests clearly +5. 
**Add docstrings** to explain what each test verifies + +### Mocking Patterns + +#### Mocking Time + +```python +@patch("time.time") +@patch("time.sleep") +def test_with_time(mock_sleep, mock_time): + mock_time.return_value = 1000 + # Test code +``` + +#### Mocking Environment Variables + +```python +with patch.dict(os.environ, {"VAR_NAME": "value"}, clear=True): + # Test code +``` + +#### Mocking Generators + +```python +mock_extract.return_value = iter([[{"id": 1}], [{"id": 2}]]) +``` + +### Running Tests During Development + +```bash +# Auto-run tests on file changes (requires pytest-watch) +pip install pytest-watch +ptw -- --cov=main -m "not integration" +``` + +### Debugging Tests + +```bash +# Drop into debugger on failures +pytest --pdb + +# Show print statements +pytest -s + +# Verbose with full diff +pytest -vv +``` + +### Coverage Tips + +If coverage is below 80%: + +1. Run `pytest --cov=main --cov-report=term-missing` to see missing lines +2. Look for untested branches (if/else paths) +3. Check error handling paths +4. Verify edge cases are covered + +## Resources + +- [pytest documentation](https://docs.pytest.org/) +- [pytest-cov documentation](https://pytest-cov.readthedocs.io/) +- [unittest.mock documentation](https://docs.python.org/3/library/unittest.mock.html) + +## Troubleshooting + +### Tests Pass Locally But Fail in CI + +- Check Python version (must be 3.14) +- Verify all dependencies are in `requirements.txt` +- Look for environment-specific issues + +### Coverage Dropped Below 80% + +- Run locally: `pytest --cov=main --cov-report=html` +- Open `htmlcov/index.html` to see uncovered lines +- Add tests for missing coverage + +### Import Errors + +- Ensure `PYTHONPATH` includes project root +- Check that `__init__.py` files exist if needed +- Verify module names match file names diff --git a/main.py b/main.py index db80d03..e6b92b0 100755 --- a/main.py +++ b/main.py @@ -9,16 +9,16 @@ import logging import os import re -import requests import sys import time from datetime import datetime, timezone from typing import Iterator, Optional from urllib.parse import parse_qs, urlparse -from google.cloud import bigquery + +import requests from google.api_core.client_options import ClientOptions from google.auth.credentials import AnonymousCredentials - +from google.cloud import bigquery BUG_RE = re.compile(r"\b(?:bug|b=)\s*#?(\d+)\b", re.I) @@ -29,6 +29,7 @@ def setup_logging() -> None: level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler(sys.stdout)], + force=True, ) @@ -58,7 +59,7 @@ def extract_pull_requests( # Support custom API URL for mocking/testing api_base = github_api_url or "https://api.github.com" base_url = f"{api_base}/repos/{repo}/pulls" - params = { + params: dict = { "state": "all", "per_page": chunk_size, "sort": "created", @@ -90,7 +91,7 @@ def extract_pull_requests( f"Extracted page {pages} with {len(batch)} PRs (total: {total})" ) - for idx, pr in enumerate(batch): + for _idx, pr in enumerate(batch): pr_number = pr.get("number") if not pr_number: continue @@ -272,7 +273,7 @@ def extract_comments( return comments -def sleep_for_rate_limit(resp): +def sleep_for_rate_limit(resp: requests.Response) -> None: """Sleep until rate limit resets.""" remaining = int(resp.headers.get("X-RateLimit-Remaining", 1)) reset = int(resp.headers.get("X-RateLimit-Reset", 0)) @@ -297,7 +298,7 @@ def transform_data(raw_data: list[dict], repo: str) -> dict: logger = logging.getLogger(__name__) logger.info(f"Starting data 
transformation for {len(raw_data)} PRs") - transformed_data = { + transformed_data: dict = { "pull_requests": [], "commits": [], "reviewers": [], @@ -324,9 +325,11 @@ def transform_data(raw_data: list[dict], repo: str) -> dict: "bug_id": bug_id, "date_landed": pr.get("merged_at"), "date_approved": None, # This will be filled later - "labels": [label.get("name") for label in pr.get("labels", [])] - if pr.get("labels") - else [], + "labels": ( + [label.get("name") for label in pr.get("labels", [])] + if pr.get("labels") + else [] + ), } # Extract and flatten commit data @@ -368,7 +371,8 @@ def transform_data(raw_data: list[dict], repo: str) -> dict: } transformed_data["reviewers"].append(transformed_reviewer) - # If the request is approved then store the date in the date_approved for the pull request + # If the request is approved then store the date in the + # date_approved for the pull request if review.get("state") == "APPROVED": approved_date = review.get("submitted_at") if transformed_pr.get( @@ -386,9 +390,9 @@ def transform_data(raw_data: list[dict], repo: str) -> dict: "date_created": comment.get("created_at"), "author_email": None, # TODO Placeholder for reviewer email extraction logic "author_username": comment.get("user", {}).get("login"), - "character_count": len(comment.get("body", "")) - if comment.get("body") - else 0, + "character_count": ( + len(comment.get("body", "")) if comment.get("body") else 0 + ), "status": None, # TODO } @@ -419,7 +423,8 @@ def load_data( Args: client: BigQuery client instance dataset_id: BigQuery dataset ID - transformed_data: Dictionary containing tables ('pull_requests', 'commits', 'reviewers', 'comments') mapped to lists of row dictionaries + transformed_data: Dictionary containing tables ('pull_requests', + 'commits', 'reviewers', 'comments') mapped to lists of row dictionaries """ logger = logging.getLogger(__name__) @@ -454,7 +459,8 @@ def load_data( raise Exception(error_msg) logger.info( - f"Data loading completed successfully for table {table} with {len(load_table_data)} rows" + f"Data loading completed successfully for table {table} " + + f"with {len(load_table_data)} rows" ) @@ -476,7 +482,8 @@ def main() -> int: github_token = os.environ.get("GITHUB_TOKEN") if not github_token: logger.warning( - "Warning: No token provided. You will hit very low rate limits and private repos won't work." + "Warning: No token provided. You will hit very low rate " + + "limits and private repos won't work." 
) # Read BigQuery configuration @@ -519,9 +526,10 @@ def main() -> int: bigquery_client = bigquery.Client(project=bigquery_project) # Read GitHub repository configuration - github_repos = os.getenv("GITHUB_REPOS") - if github_repos: - github_repos = github_repos.split(",") + github_repos = [] + github_repos_str = os.getenv("GITHUB_REPOS") + if github_repos_str: + github_repos = github_repos_str.split(",") else: raise SystemExit( "Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..198886d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,129 @@ +[project] +name = "github-etl" +version = "0.1.0" +description = "ETL script to extract data from Mozilla Organization Firefox repositories on GitHub and load them into BigQuery" +readme = "README.md" +requires-python = ">=3.14" +license = {text = "MPL-2.0"} +authors = [ + {name = "Mozilla", email = "dev-platform@lists.mozilla.org"} +] +keywords = ["etl", "github", "bigquery", "mozilla"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL-2.0)", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.14", +] + +dependencies = [ + "requests>=2.25.0", + "google-cloud-bigquery==3.25.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-mock>=3.10.0", + "pytest-cov>=4.0.0", + "ruff>=0.14.14", + "black>=24.0.0", +] + +[project.scripts] +github-etl = "main:main" + +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +py-modules = ["main"] + +# Ruff configuration +[tool.ruff] +line-length = 88 +exclude = [ + ".cache", + ".git", + ".hg", + "__pycache__", +] + +[tool.ruff.lint] +select = ["C", "E", "F", "W", "B", "B9", "I", "ANN"] +ignore = [ + "B006", + "B904", + "C901", + "E203", + "E501", + "ANN002", # Missing type annotation for *args + "ANN003", # Missing type annotation for **kwargs + "ANN202", # Missing return type annotation for protected function +] + +[tool.ruff.lint.isort] +split-on-trailing-comma = true + +[tool.ruff.lint.flake8-annotations] +suppress-none-returning = true + +[tool.ruff.lint.per-file-ignores] +"**/*/tests/*" = ["ANN"] +"**/*/conftest.py" = ["ANN"] + +# Black configuration +[tool.black] +line-length = 88 +target-version = ['py314'] + +# Pytest configuration +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = [ + "-v", + "--strict-markers", + "--tb=short", + "--cov=main", + "--cov-report=term-missing", + "--cov-report=html", + "--cov-branch", + "--cov-fail-under=80", +] +markers = [ + "unit: Unit tests for individual functions", + "integration: Integration tests that test multiple components", + "slow: Tests that take longer to run", +] +log_cli = false +log_cli_level = "INFO" +log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s" +log_cli_date_format = "%Y-%m-%d %H:%M:%S" + +# Coverage configuration +[tool.coverage.run] +source = ["main"] +omit = [ + "test_*.py", + ".venv/*", + "venv/*", + "*/site-packages/*", +] + +[tool.coverage.report] +precision = 2 +show_missing = true +skip_covered = false +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] diff --git a/requirements.txt b/requirements.txt 
index 008aa8a..d487f50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,325 @@ -# Essential dependencies for GitHub ETL -requests>=2.25.0 -google-cloud-bigquery==3.25.0 - -# Testing dependencies -pytest>=7.0.0 -pytest-mock>=3.10.0 -pytest-cov>=4.0.0 +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --generate-hashes pyproject.toml +# +certifi==2026.1.4 \ + --hash=sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c \ + --hash=sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120 + # via requests +charset-normalizer==3.4.4 \ + --hash=sha256:027f6de494925c0ab2a55eab46ae5129951638a49a34d87f4c3eda90f696b4ad \ + --hash=sha256:077fbb858e903c73f6c9db43374fd213b0b6a778106bc7032446a8e8b5b38b93 \ + --hash=sha256:0a98e6759f854bd25a58a73fa88833fba3b7c491169f86ce1180c948ab3fd394 \ + --hash=sha256:0d3d8f15c07f86e9ff82319b3d9ef6f4bf907608f53fe9d92b28ea9ae3d1fd89 \ + --hash=sha256:0f04b14ffe5fdc8c4933862d8306109a2c51e0704acfa35d51598eb45a1e89fc \ + --hash=sha256:11d694519d7f29d6cd09f6ac70028dba10f92f6cdd059096db198c283794ac86 \ + --hash=sha256:194f08cbb32dc406d6e1aea671a68be0823673db2832b38405deba2fb0d88f63 \ + --hash=sha256:1bee1e43c28aa63cb16e5c14e582580546b08e535299b8b6158a7c9c768a1f3d \ + --hash=sha256:21d142cc6c0ec30d2efee5068ca36c128a30b0f2c53c1c07bd78cb6bc1d3be5f \ + --hash=sha256:2437418e20515acec67d86e12bf70056a33abdacb5cb1655042f6538d6b085a8 \ + --hash=sha256:244bfb999c71b35de57821b8ea746b24e863398194a4014e4c76adc2bbdfeff0 \ + --hash=sha256:2677acec1a2f8ef614c6888b5b4ae4060cc184174a938ed4e8ef690e15d3e505 \ + --hash=sha256:277e970e750505ed74c832b4bf75dac7476262ee2a013f5574dd49075879e161 \ + --hash=sha256:2aaba3b0819274cc41757a1da876f810a3e4d7b6eb25699253a4effef9e8e4af \ + --hash=sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152 \ + --hash=sha256:2c9d3c380143a1fedbff95a312aa798578371eb29da42106a29019368a475318 \ + --hash=sha256:3162d5d8ce1bb98dd51af660f2121c55d0fa541b46dff7bb9b9f86ea1d87de72 \ + --hash=sha256:31fd66405eaf47bb62e8cd575dc621c56c668f27d46a61d975a249930dd5e2a4 \ + --hash=sha256:362d61fd13843997c1c446760ef36f240cf81d3ebf74ac62652aebaf7838561e \ + --hash=sha256:376bec83a63b8021bb5c8ea75e21c4ccb86e7e45ca4eb81146091b56599b80c3 \ + --hash=sha256:44c2a8734b333e0578090c4cd6b16f275e07aa6614ca8715e6c038e865e70576 \ + --hash=sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c \ + --hash=sha256:4902828217069c3c5c71094537a8e623f5d097858ac6ca8252f7b4d10b7560f1 \ + --hash=sha256:4bd5d4137d500351a30687c2d3971758aac9a19208fc110ccb9d7188fbe709e8 \ + --hash=sha256:4fe7859a4e3e8457458e2ff592f15ccb02f3da787fcd31e0183879c3ad4692a1 \ + --hash=sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2 \ + --hash=sha256:554af85e960429cf30784dd47447d5125aaa3b99a6f0683589dbd27e2f45da44 \ + --hash=sha256:5833d2c39d8896e4e19b689ffc198f08ea58116bee26dea51e362ecc7cd3ed26 \ + --hash=sha256:5947809c8a2417be3267efc979c47d76a079758166f7d43ef5ae8e9f92751f88 \ + --hash=sha256:5ae497466c7901d54b639cf42d5b8c1b6a4fead55215500d2f486d34db48d016 \ + --hash=sha256:5bd2293095d766545ec1a8f612559f6b40abc0eb18bb2f5d1171872d34036ede \ + --hash=sha256:5bfbb1b9acf3334612667b61bd3002196fe2a1eb4dd74d247e0f2a4d50ec9bbf \ + --hash=sha256:5cb4d72eea50c8868f5288b7f7f33ed276118325c1dfd3957089f6b519e1382a \ + --hash=sha256:5dbe56a36425d26d6cfb40ce79c314a2e4dd6211d51d6d2191c00bed34f354cc \ + --hash=sha256:5f819d5fe9234f9f82d75bdfa9aef3a3d72c4d24a6e57aeaebba32a704553aa0 \ + 
--hash=sha256:64b55f9dce520635f018f907ff1b0df1fdc31f2795a922fb49dd14fbcdf48c84 \ + --hash=sha256:6515f3182dbe4ea06ced2d9e8666d97b46ef4c75e326b79bb624110f122551db \ + --hash=sha256:65e2befcd84bc6f37095f5961e68a6f077bf44946771354a28ad434c2cce0ae1 \ + --hash=sha256:6aee717dcfead04c6eb1ce3bd29ac1e22663cdea57f943c87d1eab9a025438d7 \ + --hash=sha256:6b39f987ae8ccdf0d2642338faf2abb1862340facc796048b604ef14919e55ed \ + --hash=sha256:6e1fcf0720908f200cd21aa4e6750a48ff6ce4afe7ff5a79a90d5ed8a08296f8 \ + --hash=sha256:74018750915ee7ad843a774364e13a3db91682f26142baddf775342c3f5b1133 \ + --hash=sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e \ + --hash=sha256:74bb723680f9f7a6234dcf67aea57e708ec1fbdf5699fb91dfd6f511b0a320ef \ + --hash=sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14 \ + --hash=sha256:778d2e08eda00f4256d7f672ca9fef386071c9202f5e4607920b86d7803387f2 \ + --hash=sha256:780236ac706e66881f3b7f2f32dfe90507a09e67d1d454c762cf642e6e1586e0 \ + --hash=sha256:798d75d81754988d2565bff1b97ba5a44411867c0cf32b77a7e8f8d84796b10d \ + --hash=sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828 \ + --hash=sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f \ + --hash=sha256:7c308f7e26e4363d79df40ca5b2be1c6ba9f02bdbccfed5abddb7859a6ce72cf \ + --hash=sha256:7fa17817dc5625de8a027cb8b26d9fefa3ea28c8253929b8d6649e705d2835b6 \ + --hash=sha256:81d5eb2a312700f4ecaa977a8235b634ce853200e828fbadf3a9c50bab278328 \ + --hash=sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090 \ + --hash=sha256:837c2ce8c5a65a2035be9b3569c684358dfbf109fd3b6969630a87535495ceaa \ + --hash=sha256:840c25fb618a231545cbab0564a799f101b63b9901f2569faecd6b222ac72381 \ + --hash=sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c \ + --hash=sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb \ + --hash=sha256:8ef3c867360f88ac904fd3f5e1f902f13307af9052646963ee08ff4f131adafc \ + --hash=sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a \ + --hash=sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec \ + --hash=sha256:9a26f18905b8dd5d685d6d07b0cdf98a79f3c7a918906af7cc143ea2e164c8bc \ + --hash=sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac \ + --hash=sha256:9cd98cdc06614a2f768d2b7286d66805f94c48cde050acdbbb7db2600ab3197e \ + --hash=sha256:9d1bb833febdff5c8927f922386db610b49db6e0d4f4ee29601d71e7c2694313 \ + --hash=sha256:9f7fcd74d410a36883701fafa2482a6af2ff5ba96b9a620e9e0721e28ead5569 \ + --hash=sha256:a59cb51917aa591b1c4e6a43c132f0cdc3c76dbad6155df4e28ee626cc77a0a3 \ + --hash=sha256:a61900df84c667873b292c3de315a786dd8dac506704dea57bc957bd31e22c7d \ + --hash=sha256:a79cfe37875f822425b89a82333404539ae63dbdddf97f84dcbc3d339aae9525 \ + --hash=sha256:a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894 \ + --hash=sha256:a8bf8d0f749c5757af2142fe7903a9df1d2e8aa3841559b2bad34b08d0e2bcf3 \ + --hash=sha256:a9768c477b9d7bd54bc0c86dbaebdec6f03306675526c9927c0e8a04e8f94af9 \ + --hash=sha256:ac1c4a689edcc530fc9d9aa11f5774b9e2f33f9a0c6a57864e90908f5208d30a \ + --hash=sha256:af2d8c67d8e573d6de5bc30cdb27e9b95e49115cd9baad5ddbd1a6207aaa82a9 \ + --hash=sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14 \ + --hash=sha256:b5b290ccc2a263e8d185130284f8501e3e36c5e02750fc6b6bdeb2e9e96f1e25 \ + --hash=sha256:b5d84d37db046c5ca74ee7bb47dd6cbc13f80665fdde3e8040bdd3fb015ecb50 \ + 
--hash=sha256:b7cf1017d601aa35e6bb650b6ad28652c9cd78ee6caff19f3c28d03e1c80acbf \ + --hash=sha256:bc7637e2f80d8530ee4a78e878bce464f70087ce73cf7c1caf142416923b98f1 \ + --hash=sha256:c0463276121fdee9c49b98908b3a89c39be45d86d1dbaa22957e38f6321d4ce3 \ + --hash=sha256:c4ef880e27901b6cc782f1b95f82da9313c0eb95c3af699103088fa0ac3ce9ac \ + --hash=sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e \ + --hash=sha256:ca5862d5b3928c4940729dacc329aa9102900382fea192fc5e52eb69d6093815 \ + --hash=sha256:cb01158d8b88ee68f15949894ccc6712278243d95f344770fa7593fa2d94410c \ + --hash=sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6 \ + --hash=sha256:cc00f04ed596e9dc0da42ed17ac5e596c6ccba999ba6bd92b0e0aef2f170f2d6 \ + --hash=sha256:cd09d08005f958f370f539f186d10aec3377d55b9eeb0d796025d4886119d76e \ + --hash=sha256:cd4b7ca9984e5e7985c12bc60a6f173f3c958eae74f3ef6624bb6b26e2abbae4 \ + --hash=sha256:ce8a0633f41a967713a59c4139d29110c07e826d131a316b50ce11b1d79b4f84 \ + --hash=sha256:cead0978fc57397645f12578bfd2d5ea9138ea0fac82b2f63f7f7c6877986a69 \ + --hash=sha256:d055ec1e26e441f6187acf818b73564e6e6282709e9bcb5b63f5b23068356a15 \ + --hash=sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191 \ + --hash=sha256:d9c7f57c3d666a53421049053eaacdd14bbd0a528e2186fcb2e672effd053bb0 \ + --hash=sha256:d9e45d7faa48ee908174d8fe84854479ef838fc6a705c9315372eacbc2f02897 \ + --hash=sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd \ + --hash=sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2 \ + --hash=sha256:e1f185f86a6f3403aa2420e815904c67b2f9ebc443f045edd0de921108345794 \ + --hash=sha256:e824f1492727fa856dd6eda4f7cee25f8518a12f3c4a56a74e8095695089cf6d \ + --hash=sha256:e912091979546adf63357d7e2ccff9b44f026c075aeaf25a52d0e95ad2281074 \ + --hash=sha256:eaabd426fe94daf8fd157c32e571c85cb12e66692f15516a83a03264b08d06c3 \ + --hash=sha256:ebf3e58c7ec8a8bed6d66a75d7fb37b55e5015b03ceae72a8e7c74495551e224 \ + --hash=sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838 \ + --hash=sha256:eecbc200c7fd5ddb9a7f16c7decb07b566c29fa2161a16cf67b8d068bd21690a \ + --hash=sha256:f155a433c2ec037d4e8df17d18922c3a0d9b3232a396690f17175d2946f0218d \ + --hash=sha256:f1e34719c6ed0b92f418c7c780480b26b5d9c50349e9a9af7d76bf757530350d \ + --hash=sha256:f34be2938726fc13801220747472850852fe6b1ea75869a048d6f896838c896f \ + --hash=sha256:f820802628d2694cb7e56db99213f930856014862f3fd943d290ea8438d07ca8 \ + --hash=sha256:f8bf04158c6b607d747e93949aa60618b61312fe647a6369f88ce2ff16043490 \ + --hash=sha256:f8e160feb2aed042cd657a72acc0b481212ed28b1b9a95c0cee1621b524e1966 \ + --hash=sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9 \ + --hash=sha256:fa09f53c465e532f4d3db095e0c55b615f010ad81803d383195b6b5ca6cbf5f3 \ + --hash=sha256:faa3a41b2b66b6e50f84ae4a68c64fcd0c44355741c6374813a800cd6695db9e \ + --hash=sha256:fd44c878ea55ba351104cb93cc85e74916eb8fa440ca7903e57575e97394f608 + # via requests +google-api-core[grpc]==2.29.0 \ + --hash=sha256:84181be0f8e6b04006df75ddfe728f24489f0af57c96a529ff7cf45bc28797f7 \ + --hash=sha256:d30bc60980daa36e314b5d5a3e5958b0200cb44ca8fa1be2b614e932b75a3ea9 + # via + # google-cloud-bigquery + # google-cloud-core +google-auth==2.47.0 \ + --hash=sha256:833229070a9dfee1a353ae9877dcd2dec069a8281a4e72e72f77d4a70ff945da \ + --hash=sha256:c516d68336bfde7cf0da26aab674a36fedcf04b37ac4edd59c597178760c3498 + # via + # google-api-core + # google-cloud-bigquery + # google-cloud-core +google-cloud-bigquery==3.25.0 \ + 
--hash=sha256:5b2aff3205a854481117436836ae1403f11f2594e6810a98886afd57eda28509 \ + --hash=sha256:7f0c371bc74d2a7fb74dacbc00ac0f90c8c2bec2289b51dd6685a275873b1ce9 + # via github-etl (pyproject.toml) +google-cloud-core==2.5.0 \ + --hash=sha256:67d977b41ae6c7211ee830c7912e41003ea8194bff15ae7d72fd6f51e57acabc \ + --hash=sha256:7c1b7ef5c92311717bd05301aa1a91ffbc565673d3b0b4163a52d8413a186963 + # via google-cloud-bigquery +google-crc32c==1.8.0 \ + --hash=sha256:014a7e68d623e9a4222d663931febc3033c5c7c9730785727de2a81f87d5bab8 \ + --hash=sha256:01f126a5cfddc378290de52095e2c7052be2ba7656a9f0caf4bcd1bfb1833f8a \ + --hash=sha256:0470b8c3d73b5f4e3300165498e4cf25221c7eb37f1159e221d1825b6df8a7ff \ + --hash=sha256:119fcd90c57c89f30040b47c211acee231b25a45d225e3225294386f5d258288 \ + --hash=sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411 \ + --hash=sha256:17446feb05abddc187e5441a45971b8394ea4c1b6efd88ab0af393fd9e0a156a \ + --hash=sha256:19b40d637a54cb71e0829179f6cb41835f0fbd9e8eb60552152a8b52c36cbe15 \ + --hash=sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb \ + --hash=sha256:3b9776774b24ba76831609ffbabce8cdf6fa2bd5e9df37b594221c7e333a81fa \ + --hash=sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962 \ + --hash=sha256:3d488e98b18809f5e322978d4506373599c0c13e6c5ad13e53bb44758e18d215 \ + --hash=sha256:3ebb04528e83b2634857f43f9bb8ef5b2bbe7f10f140daeb01b58f972d04736b \ + --hash=sha256:450dc98429d3e33ed2926fc99ee81001928d63460f8538f21a5d6060912a8e27 \ + --hash=sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113 \ + --hash=sha256:57a50a9035b75643996fbf224d6661e386c7162d1dfdab9bc4ca790947d1007f \ + --hash=sha256:61f58b28e0b21fcb249a8247ad0db2e64114e201e2e9b4200af020f3b6242c9f \ + --hash=sha256:6f35aaffc8ccd81ba3162443fabb920e65b1f20ab1952a31b13173a67811467d \ + --hash=sha256:71734788a88f551fbd6a97be9668a0020698e07b2bf5b3aa26a36c10cdfb27b2 \ + --hash=sha256:864abafe7d6e2c4c66395c1eb0fe12dc891879769b52a3d56499612ca93b6092 \ + --hash=sha256:86cfc00fe45a0ac7359e5214a1704e51a99e757d0272554874f419f79838c5f7 \ + --hash=sha256:87b0072c4ecc9505cfa16ee734b00cd7721d20a0f595be4d40d3d21b41f65ae2 \ + --hash=sha256:87fa445064e7db928226b2e6f0d5304ab4cd0339e664a4e9a25029f384d9bb93 \ + --hash=sha256:89c17d53d75562edfff86679244830599ee0a48efc216200691de8b02ab6b2b8 \ + --hash=sha256:8b3f68782f3cbd1bce027e48768293072813469af6a61a86f6bb4977a4380f21 \ + --hash=sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79 \ + --hash=sha256:b0d1a7afc6e8e4635564ba8aa5c0548e3173e41b6384d7711a9123165f582de2 \ + --hash=sha256:ba6aba18daf4d36ad4412feede6221414692f44d17e5428bdd81ad3fc1eee5dc \ + --hash=sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454 \ + --hash=sha256:d511b3153e7011a27ab6ee6bb3a5404a55b994dc1a7322c0b87b29606d9790e2 \ + --hash=sha256:db3fe8eaf0612fc8b20fa21a5f25bd785bc3cd5be69f8f3412b0ac2ffd49e733 \ + --hash=sha256:e6584b12cb06796d285d09e33f63309a09368b9d806a551d8036a4207ea43697 \ + --hash=sha256:f4b51844ef67d6cf2e9425983274da75f18b1597bb2c998e1c0a0e8d46f8f651 \ + --hash=sha256:f639065ea2042d5c034bf258a9f085eaa7af0cd250667c0635a3118e8f92c69c + # via google-resumable-media +google-resumable-media==2.8.0 \ + --hash=sha256:dd14a116af303845a8d932ddae161a26e86cc229645bc98b39f026f9b1717582 \ + --hash=sha256:f1157ed8b46994d60a1bc432544db62352043113684d4e030ee02e77ebe9a1ae + # via google-cloud-bigquery +googleapis-common-protos==1.72.0 \ + 
--hash=sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038 \ + --hash=sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5 + # via + # google-api-core + # grpcio-status +grpcio==1.76.0 \ + --hash=sha256:035d90bc79eaa4bed83f524331d55e35820725c9fbb00ffa1904d5550ed7ede3 \ + --hash=sha256:04bbe1bfe3a68bbfd4e52402ab7d4eb59d72d02647ae2042204326cf4bbad280 \ + --hash=sha256:063065249d9e7e0782d03d2bca50787f53bd0fb89a67de9a7b521c4a01f1989b \ + --hash=sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd \ + --hash=sha256:08caea849a9d3c71a542827d6df9d5a69067b0a1efbea8a855633ff5d9571465 \ + --hash=sha256:0aaa82d0813fd4c8e589fac9b65d7dd88702555f702fb10417f96e2a2a6d4c0f \ + --hash=sha256:0b7604868b38c1bfd5cf72d768aedd7db41d78cb6a4a18585e33fb0f9f2363fd \ + --hash=sha256:0c37db8606c258e2ee0c56b78c62fc9dee0e901b5dbdcf816c2dd4ad652b8b0c \ + --hash=sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc \ + --hash=sha256:2107b0c024d1b35f4083f11245c0e23846ae64d02f40b2b226684840260ed054 \ + --hash=sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba \ + --hash=sha256:25a18e9810fbc7e7f03ec2516addc116a957f8cbb8cbc95ccc80faa072743d03 \ + --hash=sha256:26ef06c73eb53267c2b319f43e6634c7556ea37672029241a056629af27c10e2 \ + --hash=sha256:2e1743fbd7f5fa713a1b0a8ac8ebabf0ec980b5d8809ec358d488e273b9cf02a \ + --hash=sha256:32483fe2aab2c3794101c2a159070584e5db11d0aa091b2c0ea9c4fc43d0d749 \ + --hash=sha256:3bf0f392c0b806905ed174dcd8bdd5e418a40d5567a05615a030a5aeddea692d \ + --hash=sha256:3e2a27c89eb9ac3d81ec8835e12414d73536c6e620355d65102503064a4ed6eb \ + --hash=sha256:40ad3afe81676fd9ec6d9d406eda00933f218038433980aa19d401490e46ecde \ + --hash=sha256:4215d3a102bd95e2e11b5395c78562967959824156af11fa93d18fdd18050990 \ + --hash=sha256:45d59a649a82df5718fd9527ce775fd66d1af35e6d31abdcdc906a49c6822958 \ + --hash=sha256:45e0111e73f43f735d70786557dc38141185072d7ff8dc1829d6a77ac1471468 \ + --hash=sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc \ + --hash=sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09 \ + --hash=sha256:49ce47231818806067aea3324d4bf13825b658ad662d3b25fada0bdad9b8a6af \ + --hash=sha256:4baf3cbe2f0be3289eb68ac8ae771156971848bb8aaff60bad42005539431980 \ + --hash=sha256:522175aba7af9113c48ec10cc471b9b9bd4f6ceb36aeb4544a8e2c80ed9d252d \ + --hash=sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f \ + --hash=sha256:615ba64c208aaceb5ec83bfdce7728b80bfeb8be97562944836a7a0a9647d882 \ + --hash=sha256:61f69297cba3950a524f61c7c8ee12e55c486cb5f7db47ff9dcee33da6f0d3ae \ + --hash=sha256:65a20de41e85648e00305c1bb09a3598f840422e522277641145a32d42dcefcc \ + --hash=sha256:6a15c17af8839b6801d554263c546c69c4d7718ad4321e3166175b37eaacca77 \ + --hash=sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e \ + --hash=sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73 \ + --hash=sha256:81fd9652b37b36f16138611c7e884eb82e0cec137c40d3ef7c3f9b3ed00f6ed8 \ + --hash=sha256:83d57312a58dcfe2a3a0f9d1389b299438909a02db60e2f2ea2ae2d8034909d3 \ + --hash=sha256:8843114c0cfce61b40ad48df65abcfc00d4dba82eae8718fab5352390848c5da \ + --hash=sha256:8cc3309d8e08fd79089e13ed4819d0af72aa935dd8f435a195fd152796752ff2 \ + --hash=sha256:8ebe63ee5f8fa4296b1b8cfc743f870d10e902ca18afc65c68cf46fd39bb0783 \ + --hash=sha256:8eddfb4d203a237da6f3cc8a540dad0517d274b5a1e9e636fd8d2c79b5c1d397 \ + --hash=sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e \ + 
--hash=sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42 \ + --hash=sha256:971fd5a1d6e62e00d945423a567e42eb1fa678ba89072832185ca836a94daaa6 \ + --hash=sha256:980a846182ce88c4f2f7e2c22c56aefd515daeb36149d1c897f83cf57999e0b6 \ + --hash=sha256:9d9adda641db7207e800a7f089068f6f645959f2df27e870ee81d44701dd9db3 \ + --hash=sha256:9f8f757bebaaea112c00dba718fc0d3260052ce714e25804a03f93f5d1c6cc11 \ + --hash=sha256:a6ae758eb08088d36812dd5d9af7a9859c05b1e0f714470ea243694b49278e7b \ + --hash=sha256:a8c2cf1209497cf659a667d7dea88985e834c24b7c3b605e6254cbb5076d985c \ + --hash=sha256:acab0277c40eff7143c2323190ea57b9ee5fd353d8190ee9652369fae735668a \ + --hash=sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a \ + --hash=sha256:c088e7a90b6017307f423efbb9d1ba97a22aa2170876223f9709e9d1de0b5347 \ + --hash=sha256:d099566accf23d21037f18a2a63d323075bebace807742e4b0ac210971d4dd70 \ + --hash=sha256:d388087771c837cdb6515539f43b9d4bf0b0f23593a24054ac16f7a960be16f4 \ + --hash=sha256:dcfe41187da8992c5f40aa8c5ec086fa3672834d2be57a32384c08d5a05b4c00 \ + --hash=sha256:e6d1db20594d9daba22f90da738b1a0441a7427552cc6e2e3d1297aeddc00378 \ + --hash=sha256:ebea5cc3aa8ea72e04df9913492f9a96d9348db876f9dda3ad729cfedf7ac416 \ + --hash=sha256:ebebf83299b0cb1721a8859ea98f3a77811e35dce7609c5c963b9ad90728f886 \ + --hash=sha256:f0e34c2079d47ae9f6188211db9e777c619a21d4faba6977774e8fa43b085e48 \ + --hash=sha256:f92f88e6c033db65a5ae3d97905c8fea9c725b63e28d5a75cb73b49bda5024d8 \ + --hash=sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8 \ + --hash=sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc \ + --hash=sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62 + # via + # google-api-core + # grpcio-status +grpcio-status==1.76.0 \ + --hash=sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd \ + --hash=sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18 + # via google-api-core +idna==3.11 \ + --hash=sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea \ + --hash=sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902 + # via requests +packaging==26.0 \ + --hash=sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4 \ + --hash=sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529 + # via google-cloud-bigquery +proto-plus==1.27.0 \ + --hash=sha256:1baa7f81cf0f8acb8bc1f6d085008ba4171eaf669629d1b6d1673b21ed1c0a82 \ + --hash=sha256:873af56dd0d7e91836aee871e5799e1c6f1bda86ac9a983e0bb9f0c266a568c4 + # via google-api-core +protobuf==6.33.4 \ + --hash=sha256:0f12ddbf96912690c3582f9dffb55530ef32015ad8e678cd494312bd78314c4f \ + --hash=sha256:1fe3730068fcf2e595816a6c34fe66eeedd37d51d0400b72fabc848811fdc1bc \ + --hash=sha256:2fe67f6c014c84f655ee06f6f66213f9254b3a8b6bda6cda0ccd4232c73c06f0 \ + --hash=sha256:3df850c2f8db9934de4cf8f9152f8dc2558f49f298f37f90c517e8e5c84c30e9 \ + --hash=sha256:757c978f82e74d75cba88eddec479df9b99a42b31193313b75e492c06a51764e \ + --hash=sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc \ + --hash=sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d \ + --hash=sha256:955478a89559fa4568f5a81dce77260eabc5c686f9e8366219ebd30debf06aa6 \ + --hash=sha256:c7c64f259c618f0bef7bee042075e390debbf9682334be2b67408ec7c1c09ee6 \ + --hash=sha256:dc2e61bca3b10470c1912d166fe0af67bfc20eb55971dcef8dfa48ce14f0ed91 + # via + # google-api-core + # googleapis-common-protos + # grpcio-status + # proto-plus 
+pyasn1==0.6.2 \ + --hash=sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf \ + --hash=sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 \ + --hash=sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a \ + --hash=sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6 + # via google-auth +python-dateutil==2.9.0.post0 \ + --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ + --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 + # via google-cloud-bigquery +requests==2.32.5 \ + --hash=sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6 \ + --hash=sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf + # via + # github-etl (pyproject.toml) + # google-api-core + # google-cloud-bigquery +rsa==4.9.1 \ + --hash=sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 \ + --hash=sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75 + # via google-auth +six==1.17.0 \ + --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \ + --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81 + # via python-dateutil +typing-extensions==4.15.0 \ + --hash=sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466 \ + --hash=sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548 + # via grpcio +urllib3==2.6.3 \ + --hash=sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed \ + --hash=sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4 + # via requests diff --git a/test_formatting.py b/test_formatting.py new file mode 100644 index 0000000..c92e534 --- /dev/null +++ b/test_formatting.py @@ -0,0 +1,16 @@ +""" +Code Style Tests. +""" + +import subprocess + + +def test_black(): + cmd = ("black", "--diff", "main.py") + output = subprocess.check_output(cmd) + assert not output, "The python code does not adhere to the project style." + + +def test_ruff(): + passed = subprocess.call(("ruff", "check", "main.py", "--target-version", "py314")) + assert not passed, "ruff did not run cleanly." diff --git a/test_main.py b/test_main.py new file mode 100644 index 0000000..0d38ac3 --- /dev/null +++ b/test_main.py @@ -0,0 +1,2138 @@ +#!/usr/bin/env python3 +""" +Comprehensive test suite for GitHub ETL main.py + +This test suite provides complete coverage for all functions in main.py, +including extraction, transformation, loading, and orchestration logic. 
+""" + +import logging +import os +from unittest.mock import MagicMock, Mock, patch + +import pytest +import requests +from google.cloud import bigquery + +import main + +# ============================================================================= +# FIXTURES +# ============================================================================= + + +@pytest.fixture +def mock_session(): + """Provide a mocked requests.Session for testing.""" + session = Mock(spec=requests.Session) + session.headers = {} + return session + + +@pytest.fixture +def mock_bigquery_client(): + """Provide a mocked BigQuery client for testing.""" + client = Mock(spec=bigquery.Client) + client.project = "test-project" + client.insert_rows_json = Mock(return_value=[]) + return client + + +@pytest.fixture +def mock_pr_response(): + """Provide a realistic pull request response for testing.""" + return { + "number": 123, + "title": "Bug 1234567 - Fix login issue", + "state": "closed", + "created_at": "2024-01-01T10:00:00Z", + "updated_at": "2024-01-02T10:00:00Z", + "merged_at": "2024-01-02T10:00:00Z", + "user": {"login": "testuser"}, + "head": {"ref": "fix-branch"}, + "base": {"ref": "main"}, + "labels": [{"name": "bug"}, {"name": "priority-high"}], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + + +@pytest.fixture +def mock_commit_response(): + """Provide a realistic commit response with files.""" + return { + "sha": "abc123def456", + "commit": { + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": "2024-01-01T12:00:00Z", + } + }, + "files": [ + { + "filename": "src/login.py", + "additions": 10, + "deletions": 5, + "changes": 15, + }, + { + "filename": "tests/test_login.py", + "additions": 20, + "deletions": 2, + "changes": 22, + }, + ], + } + + +@pytest.fixture +def mock_reviewer_response(): + """Provide a realistic reviewer response.""" + return { + "id": 789, + "user": {"login": "reviewer1"}, + "state": "APPROVED", + "submitted_at": "2024-01-01T15:00:00Z", + "body": "LGTM", + } + + +@pytest.fixture +def mock_comment_response(): + """Provide a realistic comment response.""" + return { + "id": 456, + "user": {"login": "commenter1"}, + "created_at": "2024-01-01T14:00:00Z", + "body": "This looks good to me", + "pull_request_review_id": None, + } + + +# ============================================================================= +# TEST CLASSES +# ============================================================================= + + + +# ============================================================================= +# TESTS FOR SETUP_LOGGING +# ============================================================================= + + +def test_setup_logging(): + """Test that setup_logging configures logging correctly.""" + main.setup_logging() + + root_logger = logging.getLogger() + assert root_logger.level == logging.INFO + assert len(root_logger.handlers) > 0 + + # Check that at least one handler is a StreamHandler + has_stream_handler = any( + isinstance(handler, logging.StreamHandler) + for handler in root_logger.handlers + ) + assert has_stream_handler + + + +# ============================================================================= +# TESTS FOR SLEEP_FOR_RATE_LIMIT +# ============================================================================= + + +@patch("time.time") +@patch("time.sleep") +def test_sleep_for_rate_limit_calculates_wait_time(mock_sleep, mock_time): + """Test that sleep_for_rate_limit calculates correct wait time.""" + mock_time.return_value = 1000 + + 
mock_response = Mock() + mock_response.headers = { + "X-RateLimit-Remaining": "0", + "X-RateLimit-Reset": "1120", # 120 seconds from now + } + + main.sleep_for_rate_limit(mock_response) + + mock_sleep.assert_called_once_with(120) + + +@patch("time.time") +@patch("time.sleep") +def test_sleep_for_rate_limit_when_reset_already_passed(mock_sleep, mock_time): + """Test that sleep_for_rate_limit doesn't sleep negative time.""" + mock_time.return_value = 2000 + + mock_response = Mock() + mock_response.headers = { + "X-RateLimit-Remaining": "0", + "X-RateLimit-Reset": "1500", # Already passed + } + + main.sleep_for_rate_limit(mock_response) + + # Should sleep for 0 seconds (max of 0 and negative value) + mock_sleep.assert_called_once_with(0) + + +@patch("time.sleep") +def test_sleep_for_rate_limit_when_remaining_not_zero(mock_sleep): + """Test that sleep_for_rate_limit doesn't sleep when remaining > 0.""" + mock_response = Mock() + mock_response.headers = { + "X-RateLimit-Remaining": "5", + "X-RateLimit-Reset": "1500", + } + + main.sleep_for_rate_limit(mock_response) + + # Should not sleep when remaining > 0 + mock_sleep.assert_not_called() + + +@patch("time.sleep") +def test_sleep_for_rate_limit_with_missing_headers(mock_sleep): + """Test sleep_for_rate_limit with missing rate limit headers.""" + mock_response = Mock() + mock_response.headers = {} + + main.sleep_for_rate_limit(mock_response) + + # Should not sleep when headers are missing (defaults to remaining=1) + mock_sleep.assert_not_called() + + + +# ============================================================================= +# TESTS FOR EXTRACT_PULL_REQUESTS +# ============================================================================= + + +def test_extract_pull_requests_basic(mock_session): + """Test basic extraction of pull requests.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + {"number": 1, "title": "PR 1"}, + {"number": 2, "title": "PR 2"}, + ] + mock_response.links = {} + + mock_session.get.return_value = mock_response + + # Mock the extract functions + with ( + patch("main.extract_commits", return_value=[]), + patch("main.extract_reviewers", return_value=[]), + patch("main.extract_comments", return_value=[]), + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + assert len(result) == 1 + assert len(result[0]) == 2 + assert result[0][0]["number"] == 1 + assert result[0][1]["number"] == 2 + +def test_extract_multiple_pages(mock_session): + """Test extracting data across multiple pages with pagination.""" + # First page response + mock_response_1 = Mock() + mock_response_1.status_code = 200 + mock_response_1.json.return_value = [ + {"number": 1, "title": "PR 1"}, + {"number": 2, "title": "PR 2"}, + ] + mock_response_1.links = { + "next": {"url": "https://api.github.com/repos/mozilla/firefox/pulls?page=2"} + } + + # Second page response + mock_response_2 = Mock() + mock_response_2.status_code = 200 + mock_response_2.json.return_value = [{"number": 3, "title": "PR 3"}] + mock_response_2.links = {} + + mock_session.get.side_effect = [mock_response_1, mock_response_2] + + with ( + patch("main.extract_commits", return_value=[]), + patch("main.extract_reviewers", return_value=[]), + patch("main.extract_comments", return_value=[]), + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + assert len(result) == 2 + assert len(result[0]) == 2 + assert len(result[1]) == 1 + assert result[0][0]["number"] == 1 + assert 
result[1][0]["number"] == 3 + +def test_enriches_prs_with_commit_data(mock_session): + """Test that PRs are enriched with commit data.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [{"number": 1, "title": "PR 1"}] + mock_response.links = {} + + mock_session.get.return_value = mock_response + + mock_commits = [{"sha": "abc123"}] + + with ( + patch( + "main.extract_commits", return_value=mock_commits + ) as mock_extract_commits, + patch("main.extract_reviewers", return_value=[]), + patch("main.extract_comments", return_value=[]), + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + assert result[0][0]["commit_data"] == mock_commits + mock_extract_commits.assert_called_once() + +def test_enriches_prs_with_reviewer_data(mock_session): + """Test that PRs are enriched with reviewer data.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [{"number": 1, "title": "PR 1"}] + mock_response.links = {} + + mock_session.get.return_value = mock_response + + mock_reviewers = [{"id": 789, "state": "APPROVED"}] + + with ( + patch("main.extract_commits", return_value=[]), + patch( + "main.extract_reviewers", return_value=mock_reviewers + ) as mock_extract_reviewers, + patch("main.extract_comments", return_value=[]), + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + assert result[0][0]["reviewer_data"] == mock_reviewers + mock_extract_reviewers.assert_called_once() + +def test_enriches_prs_with_comment_data(mock_session): + """Test that PRs are enriched with comment data.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [{"number": 1, "title": "PR 1"}] + mock_response.links = {} + + mock_session.get.return_value = mock_response + + mock_comments = [{"id": 456, "body": "Great work!"}] + + with ( + patch("main.extract_commits", return_value=[]), + patch("main.extract_reviewers", return_value=[]), + patch( + "main.extract_comments", return_value=mock_comments + ) as mock_extract_comments, + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + assert result[0][0]["comment_data"] == mock_comments + mock_extract_comments.assert_called_once() + +@patch("main.sleep_for_rate_limit") +def test_handles_rate_limit(mock_sleep, mock_session): + """Test that extract_pull_requests handles rate limiting correctly.""" + # Rate limit response + mock_response_rate_limit = Mock() + mock_response_rate_limit.status_code = 403 + mock_response_rate_limit.headers = {"X-RateLimit-Remaining": "0"} + + # Successful response after rate limit + mock_response_success = Mock() + mock_response_success.status_code = 200 + mock_response_success.json.return_value = [{"number": 1, "title": "PR 1"}] + mock_response_success.links = {} + + mock_session.get.side_effect = [ + mock_response_rate_limit, + mock_response_success, + ] + + with ( + patch("main.extract_commits", return_value=[]), + patch("main.extract_reviewers", return_value=[]), + patch("main.extract_comments", return_value=[]), + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + mock_sleep.assert_called_once_with(mock_response_rate_limit) + assert len(result) == 1 + +def test_handles_api_error_404(mock_session): + """Test that extract_pull_requests raises SystemExit on 404.""" + mock_response = Mock() + mock_response.status_code = 404 + mock_response.text = "Not Found" + + mock_session.get.return_value = 
mock_response + + with pytest.raises(SystemExit) as exc_info: + list(main.extract_pull_requests(mock_session, "mozilla/nonexistent")) + + assert "GitHub API error 404" in str(exc_info.value) + +def test_handles_api_error_500(mock_session): + """Test that extract_pull_requests raises SystemExit on 500.""" + mock_response = Mock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + + mock_session.get.return_value = mock_response + + with pytest.raises(SystemExit) as exc_info: + list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + assert "GitHub API error 500" in str(exc_info.value) + +def test_stops_on_empty_batch(mock_session): + """Test that extraction stops when an empty batch is returned.""" + # First page with data + mock_response_1 = Mock() + mock_response_1.status_code = 200 + mock_response_1.json.return_value = [{"number": 1}] + mock_response_1.links = { + "next": {"url": "https://api.github.com/repos/mozilla/firefox/pulls?page=2"} + } + + # Second page empty + mock_response_2 = Mock() + mock_response_2.status_code = 200 + mock_response_2.json.return_value = [] + mock_response_2.links = {} + + mock_session.get.side_effect = [mock_response_1, mock_response_2] + + with ( + patch("main.extract_commits", return_value=[]), + patch("main.extract_reviewers", return_value=[]), + patch("main.extract_comments", return_value=[]), + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + # Should only have 1 chunk from first page + assert len(result) == 1 + assert len(result[0]) == 1 + +def test_invalid_page_number_handling(mock_session): + """Test handling of invalid page number in pagination.""" + mock_response_1 = Mock() + mock_response_1.status_code = 200 + mock_response_1.json.return_value = [{"number": 1}] + mock_response_1.links = { + "next": { + "url": "https://api.github.com/repos/mozilla/firefox/pulls?page=invalid" + } + } + + mock_session.get.return_value = mock_response_1 + + with ( + patch("main.extract_commits", return_value=[]), + patch("main.extract_reviewers", return_value=[]), + patch("main.extract_comments", return_value=[]), + ): + result = list(main.extract_pull_requests(mock_session, "mozilla/firefox")) + + # Should stop pagination on invalid page number + assert len(result) == 1 + +def test_custom_github_api_url(mock_session): + """Test using custom GitHub API URL.""" + custom_url = "https://mock-github.example.com" + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [{"number": 1}] + mock_response.links = {} + + mock_session.get.return_value = mock_response + + with ( + patch("main.extract_commits", return_value=[]), + patch("main.extract_reviewers", return_value=[]), + patch("main.extract_comments", return_value=[]), + ): + list( + main.extract_pull_requests( + mock_session, "mozilla/firefox", github_api_url=custom_url + ) + ) + + # Verify custom URL was used + call_args = mock_session.get.call_args + assert custom_url in call_args[0][0] + +def test_skips_prs_without_number_field(mock_session): + """Test that PRs without 'number' field are skipped.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + {"number": 1, "title": "PR 1"}, + {"title": "PR without number"}, # Missing number field + {"number": 2, "title": "PR 2"}, + ] + mock_response.links = {} + + mock_session.get.return_value = mock_response + + with ( + patch("main.extract_commits", return_value=[]) as mock_commits, + 
patch("main.extract_reviewers", return_value=[]),
+        patch("main.extract_comments", return_value=[]),
+    ):
+        list(main.extract_pull_requests(mock_session, "mozilla/firefox"))
+
+    # extract_commits should only be called for PRs with number field
+    assert mock_commits.call_count == 2
+
+
+
+# =============================================================================
+# TESTS FOR EXTRACT_COMMITS
+# =============================================================================
+
+def test_extract_commits_basic(mock_session):
+    """Test basic extraction of commits with file details."""
+    # Mock commits list response
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = [
+        {"sha": "abc123"},
+        {"sha": "def456"},
+    ]
+
+    # Mock individual commit responses
+    commit_detail_1 = Mock()
+    commit_detail_1.status_code = 200
+    commit_detail_1.json.return_value = {
+        "sha": "abc123",
+        "files": [{"filename": "file1.py", "additions": 10}],
+    }
+
+    commit_detail_2 = Mock()
+    commit_detail_2.status_code = 200
+    commit_detail_2.json.return_value = {
+        "sha": "def456",
+        "files": [{"filename": "file2.py", "deletions": 5}],
+    }
+
+    mock_session.get.side_effect = [
+        commits_response,
+        commit_detail_1,
+        commit_detail_2,
+    ]
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert len(result) == 2
+    assert result[0]["sha"] == "abc123"
+    assert result[0]["files"][0]["filename"] == "file1.py"
+    assert result[1]["sha"] == "def456"
+    assert result[1]["files"][0]["filename"] == "file2.py"
+
+def test_multiple_files_per_commit(mock_session):
+    """Test handling multiple files in a single commit."""
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = [{"sha": "abc123"}]
+
+    commit_detail = Mock()
+    commit_detail.status_code = 200
+    commit_detail.json.return_value = {
+        "sha": "abc123",
+        "files": [
+            {"filename": "file1.py", "additions": 10},
+            {"filename": "file2.py", "additions": 20},
+            {"filename": "file3.py", "deletions": 5},
+        ],
+    }
+
+    mock_session.get.side_effect = [commits_response, commit_detail]
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert len(result) == 1
+    assert len(result[0]["files"]) == 3
+
+@patch("main.sleep_for_rate_limit")
+def test_rate_limit_on_commits_list(mock_sleep, mock_session):
+    """Test rate limit handling when fetching commits list."""
+    # Rate limit response
+    rate_limit_response = Mock()
+    rate_limit_response.status_code = 403
+    rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}
+
+    # Success response
+    success_response = Mock()
+    success_response.status_code = 200
+    success_response.json.return_value = []
+
+    mock_session.get.side_effect = [rate_limit_response, success_response]
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    mock_sleep.assert_called_once()
+    assert result == []
+
+def test_api_error_on_commits_list(mock_session):
+    """Test API error handling when fetching commits list."""
+    error_response = Mock()
+    error_response.status_code = 500
+    error_response.text = "Internal Server Error"
+
+    mock_session.get.return_value = error_response
+
+    with pytest.raises(SystemExit) as exc_info:
+        main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert "GitHub API error 500" in str(exc_info.value)
+
+def test_api_error_on_individual_commit(mock_session):
+    """Test API error when fetching individual commit details."""
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = [{"sha": "abc123"}]
+
+    commit_error = Mock()
+    
commit_error.status_code = 404
+    commit_error.text = "Commit not found"
+
+    mock_session.get.side_effect = [commits_response, commit_error]
+
+    with pytest.raises(SystemExit) as exc_info:
+        main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert "GitHub API error 404" in str(exc_info.value)
+
+def test_commit_without_sha_field(mock_session):
+    """Test handling commits without sha field."""
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = [
+        {"sha": "abc123"},
+        {},  # Missing sha field
+    ]
+
+    commit_detail_1 = Mock()
+    commit_detail_1.status_code = 200
+    commit_detail_1.json.return_value = {"sha": "abc123", "files": []}
+
+    commit_detail_2 = Mock()
+    commit_detail_2.status_code = 200
+    commit_detail_2.json.return_value = {"files": []}
+
+    mock_session.get.side_effect = [
+        commits_response,
+        commit_detail_1,
+        commit_detail_2,
+    ]
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    # Should handle the commit without sha gracefully
+    assert len(result) == 2
+
+def test_commits_custom_github_api_url(mock_session):
+    """Test using custom GitHub API URL for commits."""
+    custom_url = "https://mock-github.example.com"
+
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = []
+
+    mock_session.get.return_value = commits_response
+
+    main.extract_commits(
+        mock_session, "mozilla/firefox", 123, github_api_url=custom_url
+    )
+
+    call_args = mock_session.get.call_args
+    assert custom_url in call_args[0][0]
+
+def test_empty_commits_list(mock_session):
+    """Test handling PR with no commits."""
+    commits_response = Mock()
+    commits_response.status_code = 200
+    commits_response.json.return_value = []
+
+    mock_session.get.return_value = commits_response
+
+    result = main.extract_commits(mock_session, "mozilla/firefox", 123)
+
+    assert result == []
+
+
+
+# =============================================================================
+# TESTS FOR EXTRACT_REVIEWERS
+# =============================================================================
+
+def test_extract_reviewers_basic(mock_session):
+    """Test basic extraction of reviewers and their review states."""
+    reviewers_response = Mock()
+    reviewers_response.status_code = 200
+    reviewers_response.json.return_value = [
+        {
+            "id": 789,
+            "user": {"login": "reviewer1"},
+            "state": "APPROVED",
+            "submitted_at": "2024-01-01T15:00:00Z",
+        },
+        {
+            "id": 790,
+            "user": {"login": "reviewer2"},
+            "state": "CHANGES_REQUESTED",
+            "submitted_at": "2024-01-01T16:00:00Z",
+        },
+    ]
+
+    mock_session.get.return_value = reviewers_response
+
+    result = main.extract_reviewers(mock_session, "mozilla/firefox", 123)
+
+    assert len(result) == 2
+    assert result[0]["state"] == "APPROVED"
+    assert result[1]["state"] == "CHANGES_REQUESTED"
+
+def test_reviewers_multiple_review_states(mock_session):
+    """Test handling multiple different review states."""
+    reviewers_response = Mock()
+    reviewers_response.status_code = 200
+    reviewers_response.json.return_value = [
+        {"id": 1, "state": "APPROVED", "user": {"login": "user1"}},
+        {"id": 2, "state": "CHANGES_REQUESTED", "user": {"login": "user2"}},
+        {"id": 3, "state": "COMMENTED", "user": {"login": "user3"}},
+        {"id": 4, "state": "DISMISSED", "user": {"login": "user4"}},
+    ]
+
+    mock_session.get.return_value = reviewers_response
+
+    result = main.extract_reviewers(mock_session, "mozilla/firefox", 123)
+
+    assert len(result) == 4
+    states = [r["state"] for r in result]
+    assert "APPROVED" in states
+    assert "CHANGES_REQUESTED" in states
+    assert "COMMENTED" in states
+
+def test_empty_reviewers_list(mock_session):
+    
"""Test handling PR with no reviewers.""" + reviewers_response = Mock() + reviewers_response.status_code = 200 + reviewers_response.json.return_value = [] + + mock_session.get.return_value = reviewers_response + + result = main.extract_reviewers(mock_session, "mozilla/firefox", 123) + + assert result == [] + +@patch("main.sleep_for_rate_limit") +def test_rate_limit_handling(mock_sleep, mock_session): + """Test rate limit handling when fetching reviewers.""" + rate_limit_response = Mock() + rate_limit_response.status_code = 403 + rate_limit_response.headers = {"X-RateLimit-Remaining": "0"} + + success_response = Mock() + success_response.status_code = 200 + success_response.json.return_value = [] + + mock_session.get.side_effect = [rate_limit_response, success_response] + + result = main.extract_reviewers(mock_session, "mozilla/firefox", 123) + + mock_sleep.assert_called_once() + assert result == [] + +def test_api_error(mock_session): + """Test API error handling when fetching reviewers.""" + error_response = Mock() + error_response.status_code = 500 + error_response.text = "Internal Server Error" + + mock_session.get.return_value = error_response + + with pytest.raises(SystemExit) as exc_info: + main.extract_reviewers(mock_session, "mozilla/firefox", 123) + + assert "GitHub API error 500" in str(exc_info.value) + +def test_custom_github_api_url(mock_session): + """Test using custom GitHub API URL for reviewers.""" + custom_url = "https://mock-github.example.com" + + reviewers_response = Mock() + reviewers_response.status_code = 200 + reviewers_response.json.return_value = [] + + mock_session.get.return_value = reviewers_response + + main.extract_reviewers( + mock_session, "mozilla/firefox", 123, github_api_url=custom_url + ) + + call_args = mock_session.get.call_args + assert custom_url in call_args[0][0] + + + +# ============================================================================= +# TESTS FOR EXTRACT_COMMENTS +# ============================================================================= + + comments_response = Mock() + comments_response.status_code = 200 + comments_response.json.return_value = [ + { + "id": 456, + "user": {"login": "commenter1"}, + "body": "This looks good", + "created_at": "2024-01-01T14:00:00Z", + }, + { + "id": 457, + "user": {"login": "commenter2"}, + "body": "I have concerns", + "created_at": "2024-01-01T15:00:00Z", + }, + ] + + mock_session.get.return_value = comments_response + + result = main.extract_comments(mock_session, "mozilla/firefox", 123) + + assert len(result) == 2 + assert result[0]["id"] == 456 + assert result[1]["id"] == 457 + +def test_uses_issues_endpoint(mock_session): + """Test that comments use /issues endpoint not /pulls.""" + comments_response = Mock() + comments_response.status_code = 200 + comments_response.json.return_value = [] + + mock_session.get.return_value = comments_response + + main.extract_comments(mock_session, "mozilla/firefox", 123) + + call_args = mock_session.get.call_args + url = call_args[0][0] + assert "/issues/123/comments" in url + assert "/pulls/123/comments" not in url + +def test_multiple_comments(mock_session): + """Test handling multiple comments.""" + comments_response = Mock() + comments_response.status_code = 200 + comments_response.json.return_value = [ + {"id": i, "user": {"login": f"user{i}"}, "body": f"Comment {i}"} + for i in range(1, 11) + ] + + mock_session.get.return_value = comments_response + + result = main.extract_comments(mock_session, "mozilla/firefox", 123) + + assert len(result) == 10 + 
+def test_empty_comments_list(mock_session):
+    """Test handling PR with no comments."""
+    comments_response = Mock()
+    comments_response.status_code = 200
+    comments_response.json.return_value = []
+
+    mock_session.get.return_value = comments_response
+
+    result = main.extract_comments(mock_session, "mozilla/firefox", 123)
+
+    assert result == []
+
+@patch("main.sleep_for_rate_limit")
+def test_comments_rate_limit_handling(mock_sleep, mock_session):
+    """Test rate limit handling when fetching comments."""
+    rate_limit_response = Mock()
+    rate_limit_response.status_code = 403
+    rate_limit_response.headers = {"X-RateLimit-Remaining": "0"}
+
+    success_response = Mock()
+    success_response.status_code = 200
+    success_response.json.return_value = []
+
+    mock_session.get.side_effect = [rate_limit_response, success_response]
+
+    result = main.extract_comments(mock_session, "mozilla/firefox", 123)
+
+    mock_sleep.assert_called_once()
+    assert result == []
+
+def test_comments_api_error(mock_session):
+    """Test API error handling when fetching comments."""
+    error_response = Mock()
+    error_response.status_code = 404
+    error_response.text = "Not Found"
+
+    mock_session.get.return_value = error_response
+
+    with pytest.raises(SystemExit) as exc_info:
+        main.extract_comments(mock_session, "mozilla/firefox", 123)
+
+    assert "GitHub API error 404" in str(exc_info.value)
+
+def test_comments_custom_github_api_url(mock_session):
+    """Test using custom GitHub API URL for comments."""
+    custom_url = "https://mock-github.example.com"
+
+    comments_response = Mock()
+    comments_response.status_code = 200
+    comments_response.json.return_value = []
+
+    mock_session.get.return_value = comments_response
+
+    main.extract_comments(
+        mock_session, "mozilla/firefox", 123, github_api_url=custom_url
+    )
+
+    call_args = mock_session.get.call_args
+    assert custom_url in call_args[0][0]
+
+
+
+# =============================================================================
+# TESTS FOR TRANSFORM_DATA
+# =============================================================================
+
+def test_transform_pull_request_basic():
+    """Test basic pull request field mapping."""
+    raw_data = [
+        {
+            "number": 123,
+            "title": "Fix login bug",
+            "state": "closed",
+            "created_at": "2024-01-01T10:00:00Z",
+            "updated_at": "2024-01-02T10:00:00Z",
+            "merged_at": "2024-01-02T12:00:00Z",
+            "labels": [],
+            "commit_data": [],
+            "reviewer_data": [],
+            "comment_data": [],
+        }
+    ]
+
+    result = main.transform_data(raw_data, "mozilla/firefox")
+
+    assert len(result["pull_requests"]) == 1
+    pr = result["pull_requests"][0]
+    assert pr["pull_request_id"] == 123
+    assert pr["current_status"] == "closed"
+    assert pr["date_created"] == "2024-01-01T10:00:00Z"
+    assert pr["date_modified"] == "2024-01-02T10:00:00Z"
+    assert pr["date_landed"] == "2024-01-02T12:00:00Z"
+    assert pr["target_repository"] == "mozilla/firefox"
+
+def test_bug_id_extraction_basic():
+    """Test bug ID extraction from PR title."""
+    test_cases = [
+        ("Bug 1234567 - Fix issue", 1234567),
+        ("bug 1234567: Update code", 1234567),
+        ("Fix for bug 7654321", 7654321),
+        ("b=9876543 - Change behavior", 9876543),
+    ]
+
+    for title, expected_bug_id in test_cases:
+        raw_data = [
+            {
+                "number": 1,
+                "title": title,
+                "state": "open",
+                "labels": [],
+                "commit_data": [],
+                "reviewer_data": [],
+                "comment_data": [],
+            }
+        ]
+
+        result = main.transform_data(raw_data, "mozilla/firefox")
+        assert result["pull_requests"][0]["bug_id"] == expected_bug_id
+
+def test_bug_id_extraction_with_hash():
+    """Test bug ID extraction with # symbol."""
+    raw_data = [
+        {
+            "number": 1,
+            "title": "Bug #1234567 - Fix issue",
+            
"state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + assert result["pull_requests"][0]["bug_id"] == 1234567 + +def test_bug_id_filter_large_numbers(): + """Test that bug IDs >= 100000000 are filtered out.""" + raw_data = [ + { + "number": 1, + "title": "Bug 999999999 - Invalid bug ID", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + assert result["pull_requests"][0]["bug_id"] is None + +def test_bug_id_no_match(): + """Test PR title with no bug ID.""" + raw_data = [ + { + "number": 1, + "title": "Update documentation", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + assert result["pull_requests"][0]["bug_id"] is None + +def test_labels_extraction(): + """Test labels array extraction.""" + raw_data = [ + { + "number": 1, + "title": "PR with labels", + "state": "open", + "labels": [ + {"name": "bug"}, + {"name": "priority-high"}, + {"name": "needs-review"}, + ], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + labels = result["pull_requests"][0]["labels"] + assert len(labels) == 3 + assert "bug" in labels + assert "priority-high" in labels + assert "needs-review" in labels + +def test_labels_empty_list(): + """Test handling empty labels list.""" + raw_data = [ + { + "number": 1, + "title": "PR without labels", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + assert result["pull_requests"][0]["labels"] == [] + +def test_commit_transformation(): + """Test commit fields mapping.""" + raw_data = [ + { + "number": 123, + "title": "PR with commits", + "state": "open", + "labels": [], + "commit_data": [ + { + "sha": "abc123", + "commit": { + "author": { + "name": "Test Author", + "date": "2024-01-01T12:00:00Z", + } + }, + "files": [ + { + "filename": "src/main.py", + "additions": 10, + "deletions": 5, + } + ], + } + ], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert len(result["commits"]) == 1 + commit = result["commits"][0] + assert commit["pull_request_id"] == 123 + assert commit["target_repository"] == "mozilla/firefox" + assert commit["commit_sha"] == "abc123" + assert commit["date_created"] == "2024-01-01T12:00:00Z" + assert commit["author_username"] == "Test Author" + assert commit["filename"] == "src/main.py" + assert commit["lines_added"] == 10 + assert commit["lines_removed"] == 5 + +def test_commit_file_flattening(): + """Test that each file becomes a separate row.""" + raw_data = [ + { + "number": 123, + "title": "PR with multiple files", + "state": "open", + "labels": [], + "commit_data": [ + { + "sha": "abc123", + "commit": {"author": {"name": "Author", "date": "2024-01-01"}}, + "files": [ + {"filename": "file1.py", "additions": 10, "deletions": 5}, + {"filename": "file2.py", "additions": 20, "deletions": 2}, + {"filename": "file3.py", "additions": 5, "deletions": 15}, + ], + } + ], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + # Should have 3 rows in commits table 
(one per file) + assert len(result["commits"]) == 3 + filenames = [c["filename"] for c in result["commits"]] + assert "file1.py" in filenames + assert "file2.py" in filenames + assert "file3.py" in filenames + +def test_multiple_commits_with_files(): + """Test multiple commits with multiple files per PR.""" + raw_data = [ + { + "number": 123, + "title": "PR with multiple commits", + "state": "open", + "labels": [], + "commit_data": [ + { + "sha": "commit1", + "commit": {"author": {"name": "Author1", "date": "2024-01-01"}}, + "files": [ + {"filename": "file1.py", "additions": 10, "deletions": 0} + ], + }, + { + "sha": "commit2", + "commit": {"author": {"name": "Author2", "date": "2024-01-02"}}, + "files": [ + {"filename": "file2.py", "additions": 5, "deletions": 2}, + {"filename": "file3.py", "additions": 8, "deletions": 3}, + ], + }, + ], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + # Should have 3 rows total (1 file from commit1, 2 files from commit2) + assert len(result["commits"]) == 3 + assert result["commits"][0]["commit_sha"] == "commit1" + assert result["commits"][1]["commit_sha"] == "commit2" + assert result["commits"][2]["commit_sha"] == "commit2" + +def test_reviewer_transformation(): + """Test reviewer fields mapping.""" + raw_data = [ + { + "number": 123, + "title": "PR with reviewers", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [ + { + "id": 789, + "user": {"login": "reviewer1"}, + "state": "APPROVED", + "submitted_at": "2024-01-01T15:00:00Z", + } + ], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert len(result["reviewers"]) == 1 + reviewer = result["reviewers"][0] + assert reviewer["pull_request_id"] == 123 + assert reviewer["target_repository"] == "mozilla/firefox" + assert reviewer["reviewer_username"] == "reviewer1" + assert reviewer["status"] == "APPROVED" + assert reviewer["date_reviewed"] == "2024-01-01T15:00:00Z" + +def test_multiple_review_states(): + """Test handling multiple review states.""" + raw_data = [ + { + "number": 123, + "title": "PR with multiple reviews", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [ + { + "id": 1, + "user": {"login": "user1"}, + "state": "APPROVED", + "submitted_at": "2024-01-01T15:00:00Z", + }, + { + "id": 2, + "user": {"login": "user2"}, + "state": "CHANGES_REQUESTED", + "submitted_at": "2024-01-01T16:00:00Z", + }, + { + "id": 3, + "user": {"login": "user3"}, + "state": "COMMENTED", + "submitted_at": "2024-01-01T17:00:00Z", + }, + ], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert len(result["reviewers"]) == 3 + states = [r["status"] for r in result["reviewers"]] + assert "APPROVED" in states + assert "CHANGES_REQUESTED" in states + assert "COMMENTED" in states + +def test_date_approved_from_earliest_approval(): + """Test that date_approved is set to earliest APPROVED review.""" + raw_data = [ + { + "number": 123, + "title": "PR with multiple approvals", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [ + { + "id": 1, + "user": {"login": "user1"}, + "state": "APPROVED", + "submitted_at": "2024-01-02T15:00:00Z", + }, + { + "id": 2, + "user": {"login": "user2"}, + "state": "APPROVED", + "submitted_at": "2024-01-01T14:00:00Z", # Earliest + }, + { + "id": 3, + "user": {"login": "user3"}, + "state": "APPROVED", + "submitted_at": "2024-01-03T16:00:00Z", + }, + ], + 
"comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + pr = result["pull_requests"][0] + assert pr["date_approved"] == "2024-01-01T14:00:00Z" + +def test_comment_transformation(): + """Test comment fields mapping.""" + raw_data = [ + { + "number": 123, + "title": "PR with comments", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [ + { + "id": 456, + "user": {"login": "commenter1"}, + "body": "This looks great!", + "created_at": "2024-01-01T14:00:00Z", + "pull_request_review_id": None, + } + ], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert len(result["comments"]) == 1 + comment = result["comments"][0] + assert comment["pull_request_id"] == 123 + assert comment["target_repository"] == "mozilla/firefox" + assert comment["comment_id"] == 456 + assert comment["author_username"] == "commenter1" + assert comment["date_created"] == "2024-01-01T14:00:00Z" + assert comment["character_count"] == 17 + +def test_comment_character_count(): + """Test character count calculation for comments.""" + raw_data = [ + { + "number": 123, + "title": "PR", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [ + { + "id": 1, + "user": {"login": "user1"}, + "body": "Short", + "created_at": "2024-01-01", + }, + { + "id": 2, + "user": {"login": "user2"}, + "body": "This is a much longer comment with more text", + "created_at": "2024-01-01", + }, + ], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert result["comments"][0]["character_count"] == 5 + assert result["comments"][1]["character_count"] == 44 + +def test_comment_status_from_review(): + """Test that comment status is mapped from review_id_statuses.""" + raw_data = [ + { + "number": 123, + "title": "PR", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [ + { + "id": 789, + "user": {"login": "reviewer"}, + "state": "APPROVED", + "submitted_at": "2024-01-01", + } + ], + "comment_data": [ + { + "id": 456, + "user": {"login": "commenter"}, + "body": "LGTM", + "created_at": "2024-01-01", + "pull_request_review_id": 789, + } + ], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + # Comment should have status from the review + assert result["comments"][0]["status"] == "APPROVED" + +def test_comment_empty_body(): + """Test handling comments with empty or None body.""" + raw_data = [ + { + "number": 123, + "title": "PR", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [ + { + "id": 1, + "user": {"login": "user1"}, + "body": None, + "created_at": "2024-01-01", + }, + { + "id": 2, + "user": {"login": "user2"}, + "body": "", + "created_at": "2024-01-01", + }, + ], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert result["comments"][0]["character_count"] == 0 + assert result["comments"][1]["character_count"] == 0 + +def test_empty_raw_data(): + """Test handling empty input list.""" + result = main.transform_data([], "mozilla/firefox") + + assert result["pull_requests"] == [] + assert result["commits"] == [] + assert result["reviewers"] == [] + assert result["comments"] == [] + +def test_pr_without_commits_reviewers_comments(): + """Test PR with no commits, reviewers, or comments.""" + raw_data = [ + { + "number": 123, + "title": "Minimal PR", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + ] + + 
result = main.transform_data(raw_data, "mozilla/firefox") + + assert len(result["pull_requests"]) == 1 + assert len(result["commits"]) == 0 + assert len(result["reviewers"]) == 0 + assert len(result["comments"]) == 0 + +def test_return_structure(): + """Test that transform_data returns dict with 4 keys.""" + raw_data = [ + { + "number": 1, + "title": "Test", + "state": "open", + "labels": [], + "commit_data": [], + "reviewer_data": [], + "comment_data": [], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert isinstance(result, dict) + assert "pull_requests" in result + assert "commits" in result + assert "reviewers" in result + assert "comments" in result + +def test_all_tables_have_target_repository(): + """Test that all tables include target_repository field.""" + raw_data = [ + { + "number": 123, + "title": "Test PR", + "state": "open", + "labels": [], + "commit_data": [ + { + "sha": "abc", + "commit": {"author": {"name": "Author", "date": "2024-01-01"}}, + "files": [ + {"filename": "test.py", "additions": 1, "deletions": 0} + ], + } + ], + "reviewer_data": [ + { + "id": 1, + "user": {"login": "reviewer"}, + "state": "APPROVED", + "submitted_at": "2024-01-01", + } + ], + "comment_data": [ + { + "id": 2, + "user": {"login": "commenter"}, + "body": "Test", + "created_at": "2024-01-01", + } + ], + } + ] + + result = main.transform_data(raw_data, "mozilla/firefox") + + assert result["pull_requests"][0]["target_repository"] == "mozilla/firefox" + assert result["commits"][0]["target_repository"] == "mozilla/firefox" + assert result["reviewers"][0]["target_repository"] == "mozilla/firefox" + assert result["comments"][0]["target_repository"] == "mozilla/firefox" + + + +# ============================================================================= +# TESTS FOR LOAD_DATA +# ============================================================================= + + +@patch("main.datetime") +def test_load_data_inserts_all_tables(mock_datetime, mock_bigquery_client): + """Test that load_data inserts all tables correctly.""" + mock_datetime.now.return_value.strftime.return_value = "2024-01-15" + + transformed_data = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [{"commit_sha": "abc"}], + "reviewers": [{"reviewer_username": "user1"}], + "comments": [{"comment_id": 123}], + } + + main.load_data(mock_bigquery_client, "test_dataset", transformed_data) + + # Should call insert_rows_json 4 times (once per table) + assert mock_bigquery_client.insert_rows_json.call_count == 4 + +@patch("main.datetime") +def test_adds_snapshot_date(mock_datetime, mock_bigquery_client): + """Test that snapshot_date is added to all rows.""" + mock_datetime.now.return_value.strftime.return_value = "2024-01-15" + + transformed_data = { + "pull_requests": [{"pull_request_id": 1}, {"pull_request_id": 2}], + "commits": [], + "reviewers": [], + "comments": [], + } + + main.load_data(mock_bigquery_client, "test_dataset", transformed_data) + + call_args = mock_bigquery_client.insert_rows_json.call_args + rows = call_args[0][1] + assert all(row["snapshot_date"] == "2024-01-15" for row in rows) + +def test_constructs_correct_table_ref(mock_bigquery_client): + """Test that table_ref is constructed correctly.""" + transformed_data = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], + "reviewers": [], + "comments": [], + } + + main.load_data(mock_bigquery_client, "my_dataset", transformed_data) + + call_args = mock_bigquery_client.insert_rows_json.call_args + table_ref = call_args[0][0] + 
assert table_ref == "test-project.my_dataset.pull_requests" + +def test_empty_transformed_data_skipped(mock_bigquery_client): + """Test that empty transformed_data dict is skipped.""" + transformed_data = {} + + main.load_data(mock_bigquery_client, "test_dataset", transformed_data) + + mock_bigquery_client.insert_rows_json.assert_not_called() + +def test_skips_empty_tables_individually(mock_bigquery_client): + """Test that empty tables are skipped individually.""" + transformed_data = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], # Empty, should be skipped + "reviewers": [], # Empty, should be skipped + "comments": [{"comment_id": 456}], + } + + main.load_data(mock_bigquery_client, "test_dataset", transformed_data) + + # Should only call insert_rows_json twice (for PRs and comments) + assert mock_bigquery_client.insert_rows_json.call_count == 2 + +def test_only_pull_requests_table(mock_bigquery_client): + """Test loading only pull_requests table.""" + transformed_data = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], + "reviewers": [], + "comments": [], + } + + main.load_data(mock_bigquery_client, "test_dataset", transformed_data) + + assert mock_bigquery_client.insert_rows_json.call_count == 1 + +def test_raises_exception_on_insert_errors(mock_bigquery_client): + """Test that Exception is raised on BigQuery insert errors.""" + mock_bigquery_client.insert_rows_json.return_value = [ + {"index": 0, "errors": ["Insert failed"]} + ] + + transformed_data = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], + "reviewers": [], + "comments": [], + } + + with pytest.raises(Exception) as exc_info: + main.load_data(mock_bigquery_client, "test_dataset", transformed_data) + + assert "BigQuery insert errors" in str(exc_info.value) + +def test_verifies_client_insert_called_correctly(mock_bigquery_client): + """Test that client.insert_rows_json is called with correct arguments.""" + transformed_data = { + "pull_requests": [{"pull_request_id": 1}, {"pull_request_id": 2}], + "commits": [], + "reviewers": [], + "comments": [], + } + + main.load_data(mock_bigquery_client, "test_dataset", transformed_data) + + call_args = mock_bigquery_client.insert_rows_json.call_args + table_ref, rows = call_args[0] + + assert "pull_requests" in table_ref + assert len(rows) == 2 + + + +# ============================================================================= +# TESTS FOR MAIN +# ============================================================================= + + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_requires_github_repos(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that GITHUB_REPOS is required.""" + with patch.dict( + os.environ, + {"BIGQUERY_PROJECT": "test", "BIGQUERY_DATASET": "test"}, + clear=True, + ): + with pytest.raises(SystemExit) as exc_info: + main.main() + + assert "GITHUB_REPOS" in str(exc_info.value) + + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_requires_bigquery_project(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that BIGQUERY_PROJECT is required.""" + with patch.dict( + os.environ, + {"GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_DATASET": "test"}, + clear=True, + ): + with pytest.raises(SystemExit) as exc_info: + main.main() + + assert "BIGQUERY_PROJECT" in str(exc_info.value) + + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def 
test_requires_bigquery_dataset(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that BIGQUERY_DATASET is required.""" + with patch.dict( + os.environ, + {"GITHUB_REPOS": "mozilla/firefox", "BIGQUERY_PROJECT": "test"}, + clear=True, + ): + with pytest.raises(SystemExit) as exc_info: + main.main() + + assert "BIGQUERY_DATASET" in str(exc_info.value) + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_github_token_optional_with_warning(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that GITHUB_TOKEN is optional but warns if missing.""" + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])), + ): + # Should not raise, but should log warning + result = main.main() + assert result == 0 + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_splits_github_repos_by_comma(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that GITHUB_REPOS is split by comma.""" + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox,mozilla/gecko-dev", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])) as mock_extract, + ): + main.main() + + # Should be called twice (once per repo) + assert mock_extract.call_count == 2 + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_honors_github_api_url(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that GITHUB_API_URL is honored.""" + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + "GITHUB_API_URL": "https://custom-api.example.com", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])) as mock_extract, + ): + main.main() + + call_kwargs = mock_extract.call_args[1] + assert call_kwargs["github_api_url"] == "https://custom-api.example.com" + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_honors_bigquery_emulator_host(mock_session_class, mock_bq_client_class, mock_setup_logging): + """Test that BIGQUERY_EMULATOR_HOST is honored.""" + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + "BIGQUERY_EMULATOR_HOST": "http://localhost:9050", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])), + ): + main.main() + + # Verify BigQuery client was created with emulator settings + mock_bq_client_class.assert_called_once() + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_creates_session_with_headers(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that session is created with Accept and User-Agent headers.""" + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])), + ): + main.main() + + # 
Verify session headers were set + assert mock_session.headers.update.called + call_args = mock_session.headers.update.call_args[0][0] + assert "Accept" in call_args + assert "User-Agent" in call_args + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_sets_authorization_header_with_token(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that Authorization header is set when token provided.""" + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "test-token-123", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])), + ): + main.main() + + # Verify Authorization header was set + assert mock_session.headers.__setitem__.called + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +@patch("main.extract_pull_requests") +@patch("main.transform_data") +@patch("main.load_data") +def test_single_repo_successful_etl( + mock_load, + mock_transform, + mock_extract, + mock_session_class, + mock_bq_client, + mock_setup_logging, +): + """Test successful ETL for single repository.""" + mock_extract.return_value = iter([[{"number": 1}]]) + mock_transform.return_value = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], + "reviewers": [], + "comments": [], + } + + with patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ): + result = main.main() + + assert result == 0 + mock_extract.assert_called_once() + mock_transform.assert_called_once() + mock_load.assert_called_once() + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +@patch("main.extract_pull_requests") +@patch("main.transform_data") +@patch("main.load_data") +def test_multiple_repos_processing( + mock_load, + mock_transform, + mock_extract, + mock_session_class, + mock_bq_client, + mock_setup_logging, +): + """Test processing multiple repositories.""" + mock_extract.return_value = iter([[{"number": 1}]]) + mock_transform.return_value = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], + "reviewers": [], + "comments": [], + } + + with patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox,mozilla/gecko-dev,mozilla/addons", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ): + result = main.main() + + assert result == 0 + # Should process 3 repositories + assert mock_extract.call_count == 3 + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +@patch("main.extract_pull_requests") +@patch("main.transform_data") +@patch("main.load_data") +def test_processes_chunks_iteratively( + mock_load, + mock_transform, + mock_extract, + mock_session_class, + mock_bq_client, + mock_setup_logging, +): + """Test that chunks are processed iteratively from generator.""" + # Return 3 chunks + mock_extract.return_value = iter( + [ + [{"number": 1}], + [{"number": 2}], + [{"number": 3}], + ] + ) + mock_transform.return_value = { + "pull_requests": [{"pull_request_id": 1}], + "commits": [], + "reviewers": [], + "comments": [], + } + + with patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + 
"GITHUB_TOKEN": "token", + }, + clear=True, + ): + result = main.main() + + assert result == 0 + # Transform and load should be called 3 times (once per chunk) + assert mock_transform.call_count == 3 + assert mock_load.call_count == 3 + +@patch("main.setup_logging") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_returns_zero_on_success(mock_session_class, mock_bq_client, mock_setup_logging): + """Test that main returns 0 on success.""" + with ( + patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ), + patch("main.extract_pull_requests", return_value=iter([])), + ): + result = main.main() + + assert result == 0 + + +@pytest.mark.integration +@patch("main.setup_logging") +@patch("main.load_data") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_full_etl_flow_transforms_data_correctly(mock_session_class, mock_bq_client, mock_load, mock_setup_logging): + """Test full ETL flow with mocked GitHub responses.""" + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + # Mock PR response + pr_response = Mock() + pr_response.status_code = 200 + pr_response.json.return_value = [ + {"number": 1, "title": "Bug 1234567 - Test PR", "state": "open"} + ] + pr_response.links = {} + + # Mock commits, reviewers, comments responses + empty_response = Mock() + empty_response.status_code = 200 + empty_response.json.return_value = [] + + mock_session.get.side_effect = [ + pr_response, + empty_response, + empty_response, + empty_response, + ] + + with patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ): + result = main.main() + + assert result == 0 + mock_load.assert_called_once() + + # Verify transformed data structure + call_args = mock_load.call_args[0] + transformed_data = call_args[2] + assert "pull_requests" in transformed_data + assert len(transformed_data["pull_requests"]) == 1 + +@patch("main.setup_logging") +@patch("main.load_data") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_bug_id_extraction_through_pipeline(mock_session_class, mock_bq_client, mock_load, mock_setup_logging): + """Test bug ID extraction through full pipeline.""" + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + pr_response = Mock() + pr_response.status_code = 200 + pr_response.json.return_value = [ + { + "number": 1, + "title": "Bug 9876543 - Fix critical issue", + "state": "closed", + } + ] + pr_response.links = {} + + empty_response = Mock() + empty_response.status_code = 200 + empty_response.json.return_value = [] + + mock_session.get.side_effect = [ + pr_response, + empty_response, + empty_response, + empty_response, + ] + + with patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ): + main.main() + + call_args = mock_load.call_args[0] + transformed_data = call_args[2] + pr = transformed_data["pull_requests"][0] + assert pr["bug_id"] == 9876543 + +@patch("main.setup_logging") +@patch("main.load_data") +@patch("main.bigquery.Client") +@patch("requests.Session") +def test_pagination_through_full_flow(mock_session_class, mock_bq_client, mock_load, mock_setup_logging): + """Test pagination through full ETL flow.""" + mock_session = MagicMock() + 
mock_session_class.return_value = mock_session + + # First page + pr_response_1 = Mock() + pr_response_1.status_code = 200 + pr_response_1.json.return_value = [ + {"number": 1, "title": "PR 1", "state": "open"} + ] + pr_response_1.links = { + "next": {"url": "https://api.github.com/repos/mozilla/firefox/pulls?page=2"} + } + + # Second page + pr_response_2 = Mock() + pr_response_2.status_code = 200 + pr_response_2.json.return_value = [ + {"number": 2, "title": "PR 2", "state": "open"} + ] + pr_response_2.links = {} + + empty_response = Mock() + empty_response.status_code = 200 + empty_response.json.return_value = [] + + mock_session.get.side_effect = [ + pr_response_1, + empty_response, + empty_response, + empty_response, + pr_response_2, + empty_response, + empty_response, + empty_response, + ] + + with patch.dict( + os.environ, + { + "GITHUB_REPOS": "mozilla/firefox", + "BIGQUERY_PROJECT": "test", + "BIGQUERY_DATASET": "test", + "GITHUB_TOKEN": "token", + }, + clear=True, + ): + main.main() + + # Should be called twice (once per chunk/page) + assert mock_load.call_count == 2
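+
+
+# ---------------------------------------------------------------------------
+# Editorial note: a minimal sketch of the orchestration that the main() tests
+# above assume. It is illustrative only and is never called by the suite; the
+# helper name and its parameters are hypothetical, and the real main() in
+# main.py builds its own session and BigQuery client from environment
+# variables rather than taking them as arguments.
+# ---------------------------------------------------------------------------
+def _assumed_main_flow_sketch(session, bq_client, dataset):
+    """Sketch of the per-repo, per-chunk flow exercised by the tests above."""
+    # GITHUB_REPOS is a comma-separated list; each repository is processed in
+    # turn (see test_splits_github_repos_by_comma and
+    # test_multiple_repos_processing).
+    repos = os.environ["GITHUB_REPOS"].split(",")
+    for repo in repos:
+        # extract_pull_requests() yields one chunk per fetched page, and each
+        # chunk is transformed and loaded before the next page is requested
+        # (see test_processes_chunks_iteratively and
+        # test_pagination_through_full_flow).
+        for chunk in main.extract_pull_requests(session, repo):
+            transformed = main.transform_data(chunk, repo)
+            main.load_data(bq_client, dataset, transformed)
+    # main() returns 0 on success (see test_returns_zero_on_success).
+    return 0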