diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e33449f34..cfd3b1247 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -69,7 +69,8 @@ repos: (?x) ^\.github/| ^docs/| - ^examples/ + ^examples/| + ^hamilton/experimental/databackend.py args: - --comment-style - "|#|" diff --git a/.rat-excludes b/.rat-excludes new file mode 100644 index 000000000..05b78f9dd --- /dev/null +++ b/.rat-excludes @@ -0,0 +1,19 @@ +# Requirements/data files (not source code) +.*\.txt +.*\.jsonl + +# SPDX short license template (is itself a license reference) +SHORT_LICENSE\.md + +# Data files (not source code) +.*\.json +.*\.csv +.*\.fwf +.*\.xml + +# Git and version control config +\.gitignore +\.rat-excludes + +# PKG-INFO (generated metadata) +PKG-INFO diff --git a/pyproject.toml b/pyproject.toml index f29d9543d..90b34a5b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,10 @@ dev = [ "prek", "ruff==0.15.0", # this should match `.pre-commit-config.yaml` ] +release = [ + "flit", + "twine", +] test = [ "connectorx; python_version < '3.14'", "dask[complete]", @@ -289,6 +293,13 @@ include = [ "NOTICE", "DISCLAIMER", "scripts/**", + "tests/**", + "plugin_tests/**", + "examples/hello_world/**", + "examples/data_quality/simple/**", + "examples/reusing_functions/**", + "examples/schema/**", + "examples/pandas/materialization/**", ] exclude = [ ".git/**", diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 000000000..402cc3527 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,289 @@ + + +# Policy on source versus distribution + +Apache Hamilton is an apache-incubating project. As such, we intend to follow all Apache guidelines to +both the spirit and (when applicable) the letter. + +That said, there is occasional ambiguity. Thus we aim to clarify with a reasonable and consistently maintained +approach. The question that we found most ambiguous when determining our release process is: +1. 
What counts as source code, and should thus be included in the "sdist" (the source-only distribution)? +2. What should be included in the build? + +Specifically, we set the following guidelines: + +| | source (to vote on) -- tar.gz | sdist -- source used to build | whl file | Reasoning | +|---|---|---|---|---| +| Build Scripts | Y | Y | N | Included in tar.gz and sdist as they are needed to reproduce the build, but not in the whl. These are only meant to be consumed by developers/pod members. | +| Library Source code | Y | Y | Y | Core library source code is included in all three distributions: tar.gz, sdist, and whl. | +| Tests (unit + plugin) | Y | Y | N | We expect users/PMC to download the source distribution, build from source, run the tests, and validate. Thus we include them in the tar.gz and sdist, but not in the whl. | +| READMEs | Y | Y | Y | Standard project metadata files (README.md, LICENSE, NOTICE, DISCLAIMER) are included in all three distributions. | +| Documentation | Y | N | N | Documentation source is included in the tar.gz for voters to review, but not in the sdist or whl as it is not needed for building or using the package. | +| Representative Examples | Y | Y | N | A curated set of examples is included in tar.gz and sdist so voters can verify Hamilton works end-to-end. Not in the whl as they serve as documentation/verification only. | +| Other Examples | Y | N | N | These are included in the tar.gz for voters to review but not included in the sdist or whl. 
| + +# Packages + +Apache Hamilton consists of 5 independently versioned packages: + +| Package | Key | Working Directory | Description | +|---|---|---|---| +| `apache-hamilton` | `hamilton` | `.` | Core library (must be released first) | +| `apache-hamilton-sdk` | `sdk` | `ui/sdk` | Tracking SDK | +| `apache-hamilton-contrib` | `contrib` | `contrib` | Community dataflows | +| `apache-hamilton-ui` | `ui` | `ui/backend` | Web UI server | +| `apache-hamilton-lsp` | `lsp` | `dev_tools/language_server` | Language server | + +The core `apache-hamilton` package must be released first. The other four packages depend on it but not on each other. + +# Release Process + +## Environment Setup + +We recommend using [uv](https://docs.astral.sh/uv/) for Python environment management. It handles Python versions, virtual environments, and dependency installation in a single tool. + +### Prerequisites + +- Python 3.10+ +- `uv` ([install guide](https://docs.astral.sh/uv/getting-started/installation/)) +- `flit` for building +- `twine` for package validation +- GPG key configured for signing +- Node.js + npm for UI builds (only needed for the `ui` package) +- Apache RAT jar for license checking (optional, for verification) + +```bash +# Install uv (unless already installed) +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Create a virtual environment with build dependencies +uv venv --python 3.11 +uv sync --group release + +# Verify GPG setup +gpg --list-secret-keys + +# IMPORTANT: set GPG_TTY so GPG can prompt for passphrase +export GPG_TTY=$(tty) +``` + +Note: all commands below use `uv run` which automatically activates the `.venv` environment. +If you prefer, you can instead `source .venv/bin/activate` and omit the `uv run` prefix. + +## Building a Release + +The main release script is `scripts/apache_release_helper.py`. It builds the sdist and wheel, signs +all artifacts with GPG, generates SHA512 checksums, uploads to Apache SVN, and generates a vote email template. 
+ +```bash +# Release the core package (example: version 1.90.0, RC0) +uv run python scripts/apache_release_helper.py --package hamilton 1.90.0 0 your_apache_id + +# Release a downstream package (example: sdk version 0.9.0, RC0) +uv run python scripts/apache_release_helper.py --package sdk 0.9.0 0 your_apache_id +``` + +The script will: +1. Check prerequisites (`flit`, `twine`, `gpg`) +2. Validate the version in the source matches the version you specified +3. Create a git tag (`apache-hamilton-v1.90.0-incubating-RC0`) +4. Build the sdist (`.tar.gz`) and wheel (`.whl`) using `flit build --no-use-vcs` +5. Validate the wheel with `twine check` +6. Sign all artifacts with GPG and generate SHA512 checksums +7. Upload to Apache SVN dist/dev +8. Print a vote email template + +Output lands in the `dist/` directory under the package's working directory. + +### Dry Run (no SVN upload) + +To test the build and signing without uploading, pass `--dry-run` to the release script: +it builds and signs the artifacts but skips git tagging and the SVN upload. The artifacts will +be in the `dist/` directory for inspection. + +### After the Vote Passes + +```bash +# Push the git tag +git push origin apache-hamilton-v1.90.0-incubating-RC0 + +# Upload to PyPI (from the package's working directory) +uv run twine upload dist/apache_hamilton-1.90.0.tar.gz dist/apache_hamilton-1.90.0-py3-none-any.whl +``` + +# For Voters: Verifying a Release + +If you're voting on a release, follow these steps to verify the release candidate. + +## Complete Verification Workflow + +```bash +# Set version and RC number +export VERSION=1.90.0 +export RC=0 +export PACKAGE=apache-hamilton # or apache-hamilton-sdk, etc. + +# 1. Download all artifacts from SVN +svn export https://dist.apache.org/repos/dist/dev/incubator/hamilton/${PACKAGE}-${VERSION}-incubating-RC${RC}/ hamilton-rc${RC} +cd hamilton-rc${RC} + +# 2. 
Import KEYS file and verify GPG signatures +wget https://downloads.apache.org/incubator/hamilton/KEYS +gpg --import KEYS + +# Verify sdist signature +gpg --verify ${PACKAGE}-${VERSION}-incubating.tar.gz.asc ${PACKAGE}-${VERSION}-incubating.tar.gz + +# Verify wheel signature (note: underscores in wheel filenames) +WHEEL_NAME=$(echo ${PACKAGE} | tr '-' '_')-${VERSION}-py3-none-any.whl +gpg --verify ${WHEEL_NAME}.asc ${WHEEL_NAME} + +# 3. Verify SHA512 checksums +shasum -a 512 -c ${PACKAGE}-${VERSION}-incubating.tar.gz.sha512 +shasum -a 512 -c ${WHEEL_NAME}.sha512 + +# 4. Extract the source archive and build from source +tar -xzf ${PACKAGE}-${VERSION}-incubating.tar.gz +cd ${PACKAGE}-${VERSION}-incubating/ +``` + +## Build from Source with uv + +All remaining steps assume you are inside the extracted source directory +(`${PACKAGE}-${VERSION}-incubating/`) from the step above. + +```bash +# Create a fresh environment and install build tools +uv venv --python 3.11 --clean +uv sync --group release + +# Build the wheel from source +uv run flit build --no-use-vcs + +# Install the wheel you just built +uv pip install dist/apache_hamilton-${VERSION}-py3-none-any.whl +``` + +## Run Tests + +```bash +# Install test dependencies (uses the test dependency group from pyproject.toml) +uv sync --group test + +# Run core unit tests +uv run pytest tests/ -x -q + +# Run plugin tests +uv run pytest plugin_tests/ -x -q +``` + +## Run Examples + +The source archive includes representative examples to verify Hamilton works end-to-end. Each example may require additional dependencies. + +### Hello World (no extra deps) +```bash +cd examples/hello_world +uv run python my_script.py +cd ../.. +``` + +### Data Quality with Pandera +```bash +uv pip install pandera +cd examples/data_quality/simple +uv run python run.py +cd ../../.. +``` + +### Function Reuse +```bash +cd examples/reusing_functions +uv run python run.py +cd ../.. 
+``` + +### Schema Validation +```bash +cd examples/schema +uv run python run.py +cd ../.. +``` + +### Materialization (Pandas) +```bash +uv pip install openpyxl xlsxwriter +cd examples/pandas/materialization +uv run python run.py +cd ../../.. +``` + +## Verification Script + +For automated verification of signatures, checksums, and license compliance, use the verification script. +Run these from inside the extracted source directory (`${PACKAGE}-${VERSION}-incubating/`). + +### Prerequisites + +Download Apache RAT for license verification (into the extracted source directory): + +```bash +curl -O https://repo1.maven.org/maven2/org/apache/rat/apache-rat/0.15/apache-rat-0.15.jar +``` + +### Running Verification + +```bash +# Run from the extracted source directory (${PACKAGE}-${VERSION}-incubating/) +# Verify GPG signatures and SHA512 checksums +uv run python scripts/verify_apache_artifacts.py signatures + +# Verify license headers (requires Apache RAT) +uv run python scripts/verify_apache_artifacts.py licenses --rat-jar apache-rat-0.15.jar + +# Verify everything +uv run python scripts/verify_apache_artifacts.py all --rat-jar apache-rat-0.15.jar + +# Inspect artifact contents +uv run python scripts/verify_apache_artifacts.py list-contents dist/apache-hamilton-1.90.0-incubating.tar.gz +uv run python scripts/verify_apache_artifacts.py list-contents dist/apache_hamilton-1.90.0-py3-none-any.whl + +# Validate wheel metadata +uv run python scripts/verify_apache_artifacts.py twine-check +``` + +# Local Development + +For local wheel building/testing without signing or the full release process: + +```bash +uv venv --python 3.11 +uv sync --group release + +# Build both sdist and wheel +uv run flit build --no-use-vcs + +# Or just the wheel +uv run flit build --no-use-vcs --format wheel + +# Install and test locally +uv pip install dist/apache_hamilton-*.whl +uv run python -c "import hamilton; print(hamilton.version.VERSION)" +``` diff --git a/scripts/apache_release_helper.py 
b/scripts/apache_release_helper.py index 8f3705c21..08ea342dc 100644 --- a/scripts/apache_release_helper.py +++ b/scripts/apache_release_helper.py @@ -96,7 +96,7 @@ def get_version_from_file(package_config: dict) -> str: def check_prerequisites(): """Checks for necessary command-line tools and Python modules.""" print("Checking for required tools...") - required_tools = ["git", "gpg", "svn"] + required_tools = ["git", "gpg", "svn", "twine"] for tool in required_tools: if shutil.which(tool) is None: print(f"Error: '{tool}' not found. Please install it and ensure it's in your PATH.") @@ -161,6 +161,26 @@ def update_version(package_config: dict, version, rc_num): return False +def verify_wheel_with_twine(wheel_path: str) -> bool: + """Validates wheel metadata using twine check.""" + print(f"Validating wheel with twine: {wheel_path}") + try: + result = subprocess.run( + ["twine", "check", wheel_path], + capture_output=True, + text=True, + ) + print(result.stdout) + if result.returncode != 0: + print(f"twine check failed:\n{result.stderr}") + return False + print("Wheel passed twine validation.") + return True + except Exception as e: + print(f"Error running twine check: {e}") + return False + + def sign_artifacts(archive_name: str) -> list[str] | None: """Creates signed files for the designated artifact.""" files = [] @@ -302,15 +322,13 @@ def create_release_artifacts(package_config: dict, version) -> list[str]: # Use flit build to create the source distribution. 
try: - env = os.environ.copy() - env["FLIT_USE_VCS"] = "0" subprocess.run( [ "flit", "build", + "--no-use-vcs", ], check=True, - env=env, ) print("Source distribution created successfully.") except subprocess.CalledProcessError as e: @@ -323,45 +341,46 @@ def create_release_artifacts(package_config: dict, version) -> list[str]: expected_tar_ball = f"dist/{package_file_name}-{version.lower()}.tar.gz" tarball_path = glob.glob(expected_tar_ball) - if not tarball_path: + if len(tarball_path) != 1: print( - f"Error: Could not find {expected_tar_ball} the generated source tarball in the 'dist' directory." + f"Error: Expected exactly 1 tarball matching {expected_tar_ball}, " + f"found {len(tarball_path)}." ) if os.path.exists("dist"): print("Contents of 'dist' directory:") for item in os.listdir("dist"): print(f"- {item}") - else: - print("'dist' directory not found.") - raise ValueError("Could not find the generated source tarball in the 'dist' directory.") + raise ValueError(f"Could not find the generated source tarball: {expected_tar_ball}") + tarball_file = tarball_path[0] # Copy the tarball to be {package-name}-{version}-incubating-src.tar.gz # Use -src suffix to distinguish source distribution from wheel (convenience package) new_tar_ball = f"dist/{package_name}-{version.lower()}-incubating-src.tar.gz" - _modify_tarball_for_apache_release(tarball_path[0], new_tar_ball, package_name) + _modify_tarball_for_apache_release(tarball_file, new_tar_ball, package_name) + # Remove original flit tarball (only keep the incubating copy) + os.remove(tarball_file) archive_name = new_tar_ball print(f"Found source tarball: {archive_name}") - new_tar_ball_singed = sign_artifacts(archive_name) - if new_tar_ball_singed is None: + new_tar_ball_signed = sign_artifacts(archive_name) + if new_tar_ball_signed is None: raise ValueError("Could not sign the main release artifacts.") - # Create wheel release artifacts + # Wheel keeps its original PEP 427 filename (no -incubating suffix) 
expected_wheel = f"dist/{package_file_name}-{version.lower()}-py3-none-any.whl" wheel_path = glob.glob(expected_wheel) + if len(wheel_path) != 1: + raise ValueError( + f"Expected exactly 1 wheel matching {expected_wheel}, found {len(wheel_path)}." + ) + wheel_file = wheel_path[0] - # Create incubator wheel release artifacts with -incubating suffix - expected_incubator_wheel = ( - f"dist/{package_name}-{version.lower()}-incubating-py3-none-any.whl" - ) - shutil.copy(wheel_path[0], expected_incubator_wheel) - incubator_wheel_signed_files = sign_artifacts(expected_incubator_wheel) - - files_to_upload = ( - [new_tar_ball] - + new_tar_ball_singed - + [expected_incubator_wheel] - + incubator_wheel_signed_files - ) + # Verify wheel with twine before signing + if not verify_wheel_with_twine(wheel_file): + raise ValueError("Wheel failed twine validation.") + + wheel_signed_files = sign_artifacts(wheel_file) + + files_to_upload = [new_tar_ball, *new_tar_ball_signed, wheel_file, *wheel_signed_files] return files_to_upload finally: @@ -521,6 +540,11 @@ def main(): parser.add_argument("version", help="The new release version (e.g., '1.0.0').") parser.add_argument("rc_num", help="The release candidate number (e.g., '0' for RC0).") parser.add_argument("apache_id", help="Your apache user ID.") + parser.add_argument( + "--dry-run", + action="store_true", + help="Build and sign artifacts but skip git tagging and SVN upload.", + ) args = parser.parse_args() package_key = args.package @@ -547,24 +571,29 @@ def main(): # Create git tag (from repo root) tag_name = f"{package_name}-v{version}-incubating-RC{rc_num}" - print(f"\nChecking for git tag '{tag_name}'...") - try: - # Check if the tag already exists - existing_tag = subprocess.check_output(["git", "tag", "-l", tag_name]).decode().strip() - if existing_tag == tag_name: - print(f"Git tag '{tag_name}' already exists.") - response = input("Do you want to continue without creating a new tag? 
(y/n): ").lower() - if response != "y": - print("Aborting.") - sys.exit(1) - else: - # Tag does not exist, create it - print(f"Creating git tag '{tag_name}'...") - subprocess.run(["git", "tag", tag_name], check=True) - print(f"Git tag {tag_name} created.") - except subprocess.CalledProcessError as e: - print(f"Error checking or creating Git tag: {e}") - sys.exit(1) + if args.dry_run: + print(f"\n[dry-run] Skipping git tag creation: {tag_name}") + else: + print(f"\nChecking for git tag '{tag_name}'...") + try: + # Check if the tag already exists + existing_tag = subprocess.check_output(["git", "tag", "-l", tag_name]).decode().strip() + if existing_tag == tag_name: + print(f"Git tag '{tag_name}' already exists.") + response = input( + "Do you want to continue without creating a new tag? (y/n): " + ).lower() + if response != "y": + print("Aborting.") + sys.exit(1) + else: + # Tag does not exist, create it + print(f"Creating git tag '{tag_name}'...") + subprocess.run(["git", "tag", tag_name], check=True) + print(f"Git tag {tag_name} created.") + except subprocess.CalledProcessError as e: + print(f"Error checking or creating Git tag: {e}") + sys.exit(1) # Create artifacts print(f"\n{'=' * 80}") @@ -574,29 +603,39 @@ def main(): if not files_to_upload: sys.exit(1) - # Upload artifacts - print(f"\n{'=' * 80}") - print(" Uploading to Apache SVN") - print(f"{'=' * 80}\n") - # NOTE: You MUST have your SVN client configured to use your Apache ID and have permissions. 
- svn_url = svn_upload(package_name, version, rc_num, files_to_upload, apache_id) - if not svn_url: - sys.exit(1) - - # Generate email - print(f"\n{'=' * 80}") - print(" Vote Email Template") - print(f"{'=' * 80}\n") - generate_email_template(package_name, version, rc_num, svn_url) + if args.dry_run: + # Dry run: skip SVN upload, show summary + print(f"\n{'=' * 80}") + print(" [dry-run] Skipping SVN upload") + print(f"{'=' * 80}\n") + print("Artifacts built successfully:") + for f in files_to_upload: + print(f" {f}") + print("\nTo do a real release, re-run without --dry-run.") + else: + # Upload artifacts + print(f"\n{'=' * 80}") + print(" Uploading to Apache SVN") + print(f"{'=' * 80}\n") + # NOTE: You MUST have your SVN client configured to use your Apache ID and have permissions. + svn_url = svn_upload(package_name, version, rc_num, files_to_upload, apache_id) + if not svn_url: + sys.exit(1) - print("\n" + "=" * 80) - print(" Process Complete!") - print("=" * 80) - print("\nNext steps:") - print(f"1. Push the git tag: git push origin {tag_name}") - print("2. Copy the email template above and send to dev@hamilton.apache.org") - print("3. Wait for votes (minimum 72 hours)") - print("\n") + # Generate email + print(f"\n{'=' * 80}") + print(" Vote Email Template") + print(f"{'=' * 80}\n") + generate_email_template(package_name, version, rc_num, svn_url) + + print("\n" + "=" * 80) + print(" Process Complete!") + print("=" * 80) + print("\nNext steps:") + print(f"1. Push the git tag: git push origin {tag_name}") + print("2. Copy the email template above and send to dev@hamilton.apache.org") + print("3. 
Wait for votes (minimum 72 hours)") + print("\n") if __name__ == "__main__": diff --git a/scripts/verify_apache_artifacts.py b/scripts/verify_apache_artifacts.py new file mode 100755 index 000000000..63da3a6f0 --- /dev/null +++ b/scripts/verify_apache_artifacts.py @@ -0,0 +1,769 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Apache Artifacts Verification Script + +Comprehensive verification tool for Apache release artifacts. +Checks signatures, checksums, licenses, and archive integrity. 
+ +Usage: + # List contents of an artifact + uv run python scripts/verify_apache_artifacts.py list-contents dist/apache-hamilton-0.41.0-incubating-src.tar.gz + + # Verify signatures and checksums + uv run python scripts/verify_apache_artifacts.py signatures + + # Verify licenses with Apache RAT + uv run python scripts/verify_apache_artifacts.py licenses --rat-jar path/to/apache-rat.jar + + # Verify everything + uv run python scripts/verify_apache_artifacts.py all --rat-jar path/to/apache-rat.jar + + # Specify custom artifacts directory + uv run python scripts/verify_apache_artifacts.py signatures --artifacts-dir /path/to/dist +""" + +import argparse +import glob +import hashlib +import os +import shutil +import subprocess +import sys +import tarfile +import tempfile +import xml.etree.ElementTree as ET +import zipfile + +# Configuration +PROJECT_SHORT_NAME = "hamilton" + + +def _fail(message: str) -> None: + """Print error message and exit.""" + print(f"\n❌ {message}") + sys.exit(1) + + +def _print_section(title: str) -> None: + """Print formatted section header.""" + print("\n" + "=" * 80) + print(f" {title}") + print("=" * 80 + "\n") + + +# ============================================================================ +# Signature and Checksum Verification +# ============================================================================ + + +def _verify_artifact_signature(artifact_path: str, signature_path: str) -> bool: + """Verify GPG signature of artifact.""" + print(f" Verifying GPG signature: {os.path.basename(signature_path)}") + + if not os.path.exists(signature_path): + print(" ✗ Signature file not found") + return False + + try: + result = subprocess.run( + ["gpg", "--verify", signature_path, artifact_path], + capture_output=True, + check=False, + ) + if result.returncode == 0: + print(" ✓ GPG signature is valid") + return True + else: + print(" ✗ GPG signature verification failed") + if result.stderr: + print(f" Error: {result.stderr.decode()}") + return 
False + except subprocess.CalledProcessError: + print(" ✗ Error running GPG") + return False + + +def _verify_artifact_checksum(artifact_path: str, checksum_path: str) -> bool: + """Verify SHA512 checksum of artifact.""" + print(f" Verifying SHA512 checksum: {os.path.basename(checksum_path)}") + + if not os.path.exists(checksum_path): + print(" ✗ Checksum file not found") + return False + + # Read expected checksum + with open(checksum_path, "r", encoding="utf-8") as f: + expected_checksum = f.read().strip().split()[0] + + # Calculate actual checksum + sha512_hash = hashlib.sha512() + with open(artifact_path, "rb") as f: + while chunk := f.read(65536): + sha512_hash.update(chunk) + + actual_checksum = sha512_hash.hexdigest() + + if actual_checksum == expected_checksum: + print(" ✓ SHA512 checksum is valid") + return True + else: + print(" ✗ SHA512 checksum mismatch!") + print(f" Expected: {expected_checksum}") + print(f" Actual: {actual_checksum}") + return False + + +def _verify_tar_gz_readable(artifact_path: str) -> bool: + """Verify tar.gz archive can be read and contains files.""" + print(f" Checking archive readability: {os.path.basename(artifact_path)}") + + try: + with tarfile.open(artifact_path, "r:gz") as tar: + members = tar.getmembers() + + if len(members) == 0: + print(" ✗ Archive is empty (no files)") + return False + + print(f" ✓ Archive is readable and contains {len(members)} files") + return True + except tarfile.TarError as e: + print(f" ✗ Archive is corrupted or unreadable: {e}") + return False + except Exception as e: + print(f" ✗ Error reading archive: {e}") + return False + + +def _verify_wheel_readable(wheel_path: str) -> bool: + """Verify wheel can be read and contains expected structure.""" + print(f" Checking wheel readability: {os.path.basename(wheel_path)}") + + try: + with zipfile.ZipFile(wheel_path, "r") as whl: + file_list = whl.namelist() + + if len(file_list) == 0: + print(" ✗ Wheel is empty (no files)") + return False + + # Check 
for metadata + metadata_files = [f for f in file_list if "METADATA" in f or "WHEEL" in f] + if not metadata_files: + print(" ✗ Wheel missing required metadata files") + return False + + print(f" ✓ Wheel is readable and contains {len(file_list)} files") + return True + except zipfile.BadZipFile: + print(" ✗ Wheel is corrupted or not a valid ZIP file") + return False + except Exception as e: + print(f" ✗ Error reading wheel: {e}") + return False + + +def _verify_artifact_exists(artifact_path: str, min_size: int = 1000) -> bool: + """Verify artifact exists and has reasonable size.""" + if not os.path.exists(artifact_path): + print(f" ✗ Artifact not found: {os.path.basename(artifact_path)}") + return False + + file_size = os.path.getsize(artifact_path) + if file_size < min_size: + print( + f" ✗ Artifact is suspiciously small ({file_size} bytes): {os.path.basename(artifact_path)}" + ) + return False + + print(f" ✓ Artifact exists: {os.path.basename(artifact_path)} ({file_size:,} bytes)") + return True + + +def verify_signatures(artifacts_dir: str) -> bool: + """Verify all signatures and checksums in artifacts directory.""" + _print_section("Verifying Signatures and Checksums") + + if not os.path.exists(artifacts_dir): + _fail(f"Artifacts directory not found: {artifacts_dir}") + + # Find all artifacts (exclude .asc and .sha512 files) + all_files = [ + f for f in os.listdir(artifacts_dir) if os.path.isfile(os.path.join(artifacts_dir, f)) + ] + artifacts = [f for f in all_files if not f.endswith((".asc", ".sha512"))] + + if not artifacts: + print(f"⚠️ No artifacts found in {artifacts_dir}") + return False + + print(f"Found {len(artifacts)} artifact(s) to verify:\n") + + all_valid = True + for artifact_name in artifacts: + artifact_path = os.path.join(artifacts_dir, artifact_name) + + print(f"Verifying: {artifact_name}") + print("-" * 80) + + # Check existence and size + if not _verify_artifact_exists(artifact_path): + all_valid = False + continue + + # Verify signature + 
signature_path = f"{artifact_path}.asc" + if not _verify_artifact_signature(artifact_path, signature_path): + all_valid = False + + # Verify checksum + checksum_path = f"{artifact_path}.sha512" + if not _verify_artifact_checksum(artifact_path, checksum_path): + all_valid = False + + # Verify archive/wheel structure + if artifact_name.endswith(".tar.gz"): + if not _verify_tar_gz_readable(artifact_path): + all_valid = False + elif artifact_name.endswith(".whl"): + if not _verify_wheel_readable(artifact_path): + all_valid = False + + print() + + return all_valid + + +# ============================================================================ +# License Verification (Apache RAT) +# ============================================================================ + + +def _check_licenses_with_rat( + artifact_path: str, + rat_jar_path: str, + report_name: str, + report_only: bool = False, +) -> bool: + """Run Apache RAT license checker on artifact.""" + print(f"\nRunning Apache RAT on: {os.path.basename(artifact_path)}") + print("-" * 80) + + # Create reports directory + report_dir = "dist" + os.makedirs(report_dir, exist_ok=True) + + rat_report_xml = os.path.join(report_dir, f"rat-report-{report_name}.xml") + rat_report_txt = os.path.join(report_dir, f"rat-report-{report_name}.txt") + + # Extract archive to temp directory + with tempfile.TemporaryDirectory() as temp_dir: + extract_dir = os.path.join(temp_dir, "extracted") + os.makedirs(extract_dir) + + print(" Extracting archive...") + try: + with tarfile.open(artifact_path, "r:gz") as tar: + # Use data filter for Python 3.12+ to avoid deprecation warning + tar.extractall(extract_dir, filter="data") + print(" ✓ Extracted to temp directory") + except Exception as e: + print(f" ✗ Error extracting archive: {e}") + return False + + # Locate .rat-excludes file + rat_excludes = ".rat-excludes" + if not os.path.exists(rat_excludes): + print(f" ⚠️ Warning: {rat_excludes} not found, running without excludes") + rat_excludes = None 
+ + # Run RAT with XML output + print(" Running Apache RAT (XML format for parsing)...") + rat_cmd_xml = [ + "java", + "-jar", + rat_jar_path, + "-x", # XML output + "-d", + extract_dir, + ] + if rat_excludes: + rat_cmd_xml.extend(["-E", rat_excludes]) + + try: + with open(rat_report_xml, "w", encoding="utf-8") as report_file: + result = subprocess.run( + rat_cmd_xml, + stdout=report_file, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + + if result.returncode != 0: + print(f" ⚠️ RAT exited with code {result.returncode}") + + print(f" ✓ RAT XML report: {rat_report_xml}") + except Exception as e: + print(f" ✗ Error running RAT (XML): {e}") + return False + + # Run RAT with plain text output + print(" Running Apache RAT (text format for review)...") + rat_cmd_txt = [ + "java", + "-jar", + rat_jar_path, + "-d", + extract_dir, + ] + if rat_excludes: + rat_cmd_txt.extend(["-E", rat_excludes]) + + try: + with open(rat_report_txt, "w", encoding="utf-8") as report_file: + subprocess.run( + rat_cmd_txt, + stdout=report_file, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + print(f" ✓ RAT text report: {rat_report_txt}") + except Exception as e: + print(f" ⚠️ Warning: Could not generate text report: {e}") + + # Parse XML report + print(" Parsing RAT report...") + try: + tree = ET.parse(rat_report_xml) + root = tree.getroot() + + # Find license issues + unapproved_licenses = [] + unknown_licenses = [] + + for resource in root.findall(".//resource"): + name = resource.get("name", "unknown") + + # Get license approval and family from child elements + license_approval_elem = resource.find("license-approval") + license_family_elem = resource.find("license-family") + + license_approval = ( + license_approval_elem.get("name", "true") + if license_approval_elem is not None + else "true" + ) + license_family = ( + license_family_elem.get("name", "") if license_family_elem is not None else "" + ) + + if license_approval == "false" or license_family == "Unknown 
license": + if license_family == "Unknown license" or not license_family: + unknown_licenses.append(name) + else: + unapproved_licenses.append(name) + + # Report findings + total_files = len(root.findall(".//resource")) + issues_count = len(unapproved_licenses) + len(unknown_licenses) + + print(f" ✓ Scanned {total_files} files") + print(f" ✓ Found {issues_count} files with license issues") + + if issues_count > 0: + print("\n ⚠️ License Issues Found:") + + if unknown_licenses: + print(f"\n Unknown/Missing Licenses ({len(unknown_licenses)} files):") + for file in unknown_licenses[:10]: + print(f" - {file}") + if len(unknown_licenses) > 10: + print(f" ... and {len(unknown_licenses) - 10} more") + + if unapproved_licenses: + print(f"\n Unapproved Licenses ({len(unapproved_licenses)} files):") + for file in unapproved_licenses[:10]: + print(f" - {file}") + if len(unapproved_licenses) > 10: + print(f" ... and {len(unapproved_licenses) - 10} more") + + print("\n 📄 Reports saved:") + print(f" - {rat_report_xml} (structured)") + print(f" - {rat_report_txt} (human-readable)") + + if report_only: + print("\n ℹ️ Report-only mode: continuing despite license issues") + return True + else: + print("\n ❌ License check failed!") + return False + else: + print(" ✅ All files have approved licenses") + print("\n 📄 Reports saved:") + print(f" - {rat_report_xml} (structured)") + print(f" - {rat_report_txt} (human-readable)") + return True + + except Exception as e: + print(f" ✗ Error parsing RAT report: {e}") + if report_only: + print(" ℹ️ Report-only mode: continuing despite parse error") + return True + return False + + +def verify_licenses(artifacts_dir: str, rat_jar_path: str, report_only: bool = False) -> bool: + """Verify licenses in all tar.gz artifacts using Apache RAT.""" + _print_section("Verifying Licenses with Apache RAT") + + if not os.path.exists(artifacts_dir): + _fail(f"Artifacts directory not found: {artifacts_dir}") + + if not rat_jar_path or not 
os.path.exists(rat_jar_path): + _fail( + f"Apache RAT JAR not found: {rat_jar_path}\nDownload from: https://creadur.apache.org/rat/download_rat.cgi" + ) + + # Check for java + if shutil.which("java") is None: + _fail("Java not found. Required for Apache RAT.") + + # Find all tar.gz artifacts (not wheels) + all_files = [ + f for f in os.listdir(artifacts_dir) if os.path.isfile(os.path.join(artifacts_dir, f)) + ] + tar_artifacts = [f for f in all_files if f.endswith(".tar.gz")] + + if not tar_artifacts: + print(f"⚠️ No tar.gz artifacts found in {artifacts_dir}") + return False + + print(f"Found {len(tar_artifacts)} tar.gz artifact(s) to check:\n") + + all_valid = True + for artifact_name in tar_artifacts: + artifact_path = os.path.join(artifacts_dir, artifact_name) + + # Generate report name from artifact name + report_name = artifact_name.replace(".tar.gz", "").replace(".", "-") + + if not _check_licenses_with_rat(artifact_path, rat_jar_path, report_name, report_only): + all_valid = False + + return all_valid + + +# ============================================================================ +# List Contents +# ============================================================================ + + +def _list_tar_gz_contents(artifact_path: str) -> None: + """List contents of a tar.gz archive.""" + print(f"\nContents of: {os.path.basename(artifact_path)}") + print("=" * 80) + + try: + with tarfile.open(artifact_path, "r:gz") as tar: + members = tar.getmembers() + + print(f"Total files: {len(members)}\n") + + # Group by type + files = [m for m in members if m.isfile()] + dirs = [m for m in members if m.isdir()] + symlinks = [m for m in members if m.issym() or m.islnk()] + + print(f"Files: {len(files)}, Directories: {len(dirs)}, Symlinks: {len(symlinks)}\n") + + # Show all files + print("Files:\n") + + for member in members: + size = f"{member.size:>12,}" if member.isfile() else "