From c16eb36bc6d9fcf2ff49d77b91a787e43b9bd1bb Mon Sep 17 00:00:00 2001
From: Avinash Singh
Date: Mon, 22 Dec 2025 15:15:23 +0530
Subject: [PATCH 1/2] Build top HuggingFace models

Signed-off-by: Avinash Singh
---
 .github/workflows/build-top-models.yml | 267 ++++++++++++++++++++
 contrib/scripts/requirements.txt       |   1 +
 contrib/scripts/select-top-models.py   | 329 +++++++++++++++++++++++++
 contrib/scripts/top-model-selection.md | 131 ++++++++++
 4 files changed, 728 insertions(+)
 create mode 100644 .github/workflows/build-top-models.yml
 create mode 100644 contrib/scripts/requirements.txt
 create mode 100644 contrib/scripts/select-top-models.py
 create mode 100644 contrib/scripts/top-model-selection.md

diff --git a/.github/workflows/build-top-models.yml b/.github/workflows/build-top-models.yml
new file mode 100644
index 00000000..da50e6b2
--- /dev/null
+++ b/.github/workflows/build-top-models.yml
@@ -0,0 +1,267 @@
+name: Build Top HuggingFace Models
+
+on:
+  schedule:
+    # Run weekly on Sunday at 00:00 UTC
+    - cron: '0 0 * * 0'
+  workflow_dispatch:
+    inputs:
+      limit:
+        description: 'Number of models to build'
+        required: false
+        default: '10'
+        type: string
+      max_size:
+        description: 'Maximum model size in GB'
+        required: false
+        default: '10'
+        type: string
+      sort_by:
+        description: 'Sort criteria'
+        required: false
+        default: 'downloads'
+        type: choice
+        options:
+          - downloads
+          - likes
+          - trending
+
+permissions:
+  contents: read
+  packages: write
+
+env:
+  REGISTRY: ghcr.io
+  ORGANIZATION: ${{ github.repository_owner }}
+
+jobs:
+  select-models:
+    name: Select Top Models
+    runs-on: ubuntu-latest
+    outputs:
+      models: ${{ steps.select-models.outputs.models }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.14"
+
+      - name: Install dependencies
+        run: |
+          pip install -r contrib/scripts/requirements.txt
+
+      - name: Select compatible models
+        id: select-models
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          python contrib/scripts/select-top-models.py \
+            --limit ${{ github.event.inputs.limit || '10' }} \
+            --max-size ${{ github.event.inputs.max_size || '10' }} \
+            --sort-by ${{ github.event.inputs.sort_by || 'downloads' }} \
+            --output models.json
+
+          # Convert to single line JSON for GitHub output
+          echo "models=$(cat models.json | jq -c)" >> $GITHUB_OUTPUT
+
+          # Also display selected models for debugging
+          echo "Selected models:"
+          cat models.json | jq -r '.[] | "\(.id) (\(.format), \(.size_gb)GB)"'
+
+  build-modctl:
+    name: Build modctl
+    runs-on: ubuntu-latest
+    env:
+      PACKAGE_DIR: modctl-build-package
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+          cache-dependency-path: go.sum
+
+      - name: Create Cache Dir
+        run: |
+          mkdir -p ${{ env.PACKAGE_DIR }}
+
+      - name: Cache Package
+        id: cache-package
+        uses: actions/cache@v4
+        with:
+          path: ${{ env.PACKAGE_DIR }}
+          key: modctl-build-packages
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y pkg-config
+          sudo DEBIAN_FRONTEND=noninteractive apt install -y build-essential \
+            cmake pkg-config libssl-dev libssh2-1-dev zlib1g-dev \
+            libhttp-parser-dev python3 wget tar git
+          mkdir -p ${{ env.PACKAGE_DIR }}
+          if [ ! 
-f "${{ env.PACKAGE_DIR }}/libgit2-v1.5.1.tar.gz" ]; then + wget https://github.com/libgit2/libgit2/archive/refs/tags/v1.5.1.tar.gz -O ${{ env.PACKAGE_DIR }}/libgit2-v1.5.1.tar.gz + fi + tar -xzf ${{ env.PACKAGE_DIR }}/libgit2-v1.5.1.tar.gz + cd libgit2-1.5.1 && mkdir build && cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF + make -j$(nproc) + sudo make install + sudo ldconfig + env: + LIBGIT2_SYS_USE_PKG_CONFIG: "1" + + - name: Build modctl + run: | + go build -tags "static system_libgit2 enable_libgit2" + + - name: Upload modctl + uses: actions/upload-artifact@v4 + with: + name: modctl-artifact + path: modctl + + build-and-push-models: + name: Build ${{ matrix.model.id }} + needs: [select-models, build-modctl] + runs-on: ubuntu-latest + timeout-minutes: 120 + strategy: + fail-fast: false + max-parallel: 3 # Don't overwhelm GHCR + matrix: + model: ${{ fromJson(needs.select-models.outputs.models) }} + env: + MODEL_ID: ${{ matrix.model.id }} + MODEL_FAMILY: ${{ matrix.model.family }} + MODEL_FORMAT: ${{ matrix.model.format }} + MODEL_PARAM_SIZE: ${{ matrix.model.param_size }} + MODEL_DIR: model-files + steps: + - name: Download modctl artifact + uses: actions/download-artifact@v4 + with: + name: modctl-artifact + path: modctl + + - name: Setup modctl + run: | + sudo cp modctl/modctl /bin/modctl + sudo chmod +x /bin/modctl + modctl version + modctl login -u ${{ github.actor }} \ + -p ${{ secrets.GITHUB_TOKEN }} \ + ${{ env.REGISTRY }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Cache model + uses: actions/cache@v4 + id: cache-model + with: + path: ${{ env.MODEL_DIR }} + key: model-${{ env.MODEL_ID }}-${{ hashFiles('**/config.json') }} + + - name: Download HuggingFace Model + if: steps.cache-model.outputs.cache-hit != 'true' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + pip install 'huggingface_hub' + python << 'EOF' + from huggingface_hub import snapshot_download + import os + + model_id = os.environ['MODEL_ID'] + model_dir = os.environ['MODEL_DIR'] + + print(f"Downloading {model_id}...") + snapshot_download( + repo_id=model_id, + local_dir=model_dir, + token=os.environ.get('HF_TOKEN') + ) + print(f"Download complete: {model_dir}") + EOF + + - name: Generate Modelfile + run: | + cd ${{ env.MODEL_DIR }} + echo "Generating Modelfile for ${{ env.MODEL_ID }}" + modctl modelfile generate \ + --arch transformer \ + --family ${{ env.MODEL_FAMILY }} \ + --format ${{ env.MODEL_FORMAT }} \ + --param-size ${{ env.MODEL_PARAM_SIZE }} \ + . + + echo "Generated Modelfile:" + cat Modelfile + + - name: Build and Push Model + run: | + cd ${{ env.MODEL_DIR }} + + # Convert model ID to valid image name (lowercase, replace / with -) + IMAGE_NAME=$(echo "${{ env.MODEL_ID }}" | tr '[:upper:]' '[:lower:]' | tr '/' '-') + IMAGE_URL="${{ env.REGISTRY }}/${{ env.ORGANIZATION }}/${IMAGE_NAME}:latest" + + echo "Building and pushing to ${IMAGE_URL}" + + modctl build -f Modelfile \ + -t ${IMAGE_URL} \ + --raw --output-remote --log-level debug \ + . + + echo "Successfully pushed ${IMAGE_URL}" + + - name: Cleanup HuggingFace Model Files + run: | + echo "Cleaning up HuggingFace model files to free disk space..." 
+ du -sh ${{ env.MODEL_DIR }} || true + rm -rf ${{ env.MODEL_DIR }} + echo "Cleanup complete" + df -h + + - name: Verify Pull + run: | + # Convert model ID to valid image name + IMAGE_NAME=$(echo "${{ env.MODEL_ID }}" | tr '[:upper:]' '[:lower:]' | tr '/' '-') + IMAGE_URL="${{ env.REGISTRY }}/${{ env.ORGANIZATION }}/${IMAGE_NAME}:latest" + + mkdir -p verify-download + echo "Pulling ${IMAGE_URL} to verify" + + modctl pull ${IMAGE_URL} \ + --extract-dir verify-download \ + --log-level debug + + echo "Successfully verified pull from ${IMAGE_URL}" + + summary: + name: Build Summary + needs: [select-models, build-and-push-models] + runs-on: ubuntu-latest + if: always() + steps: + - name: Generate Summary + run: | + echo "# Build Top Models Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "## Selected Models" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo '${{ needs.select-models.outputs.models }}' | jq -r '.[] | "- **\(.id)** (\(.format), \(.param_size), \(.size_gb)GB) - \(.downloads) downloads"' >> $GITHUB_STEP_SUMMARY || true + echo "" >> $GITHUB_STEP_SUMMARY + echo "## Build Status" >> $GITHUB_STEP_SUMMARY + echo "Check individual job results above." >> $GITHUB_STEP_SUMMARY diff --git a/contrib/scripts/requirements.txt b/contrib/scripts/requirements.txt new file mode 100644 index 00000000..8ff2acb0 --- /dev/null +++ b/contrib/scripts/requirements.txt @@ -0,0 +1 @@ +huggingface_hub>=0.20.0 diff --git a/contrib/scripts/select-top-models.py b/contrib/scripts/select-top-models.py new file mode 100644 index 00000000..f6b2b409 --- /dev/null +++ b/contrib/scripts/select-top-models.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +""" +Select top HuggingFace models compatible with modctl. + +This script fetches popular models from HuggingFace Hub and filters them +based on modctl compatibility criteria: +1. Has config.json for auto-detection +2. Has model files in supported formats (safetensors, gguf, bin, pt) +3. Size is under a reasonable limit +4. 
Has necessary metadata for modelfile generation
+"""
+
+import json
+import re
+import sys
+import argparse
+from typing import List, Dict, Optional
+from huggingface_hub import HfApi
+
+# Try to import ModelFilter; newer huggingface_hub releases removed it
+try:
+    from huggingface_hub import ModelFilter
+except ImportError:
+    ModelFilter = None
+
+
+# Supported model file formats (based on pkg/modelfile/constants.go)
+SUPPORTED_FORMATS = [
+    "safetensors",
+    "gguf",
+    "bin",
+    "pt",
+    "pth",
+    "onnx",
+]
+
+# Model families known to work well with modctl
+KNOWN_FAMILIES = [
+    "llama",
+    "qwen",
+    "qwen2",
+    "qwen3",
+    "mistral",
+    "phi",
+    "gpt2",
+    "gpt_neo",
+    "gpt_neox",
+    "bloom",
+    "opt",
+    "falcon",
+    "mpt",
+    "stablelm",
+]
+
+
+def get_model_size_gb(model_info) -> Optional[float]:
+    """Estimate model size in GB from model info."""
+    try:
+        total_size = 0
+        if hasattr(model_info, 'siblings') and model_info.siblings:
+            for file in model_info.siblings:
+                if hasattr(file, 'size') and file.size:
+                    total_size += file.size
+        return total_size / (1024 ** 3)  # Convert to GB
+    except Exception:
+        return None
+
+
+def has_config_json(model_info) -> bool:
+    """Check if model has config.json for auto-detection."""
+    try:
+        if hasattr(model_info, 'siblings') and model_info.siblings:
+            filenames = [f.rfilename for f in model_info.siblings]
+            return "config.json" in filenames
+        return False
+    except Exception:
+        return False
+
+
+def get_model_format(model_info) -> Optional[str]:
+    """Detect model format from repository files."""
+    try:
+        if not hasattr(model_info, 'siblings') or not model_info.siblings:
+            return None
+
+        # Check for each supported format
+        for file in model_info.siblings:
+            filename = file.rfilename.lower()
+            if filename.endswith('.safetensors'):
+                return "safetensors"
+            elif filename.endswith('.gguf'):
+                return "gguf"
+            elif filename.endswith('.onnx'):
+                return "onnx"
+            elif filename.endswith('.bin') and 'pytorch_model' in filename:
+                return "bin"
+            elif filename.endswith('.pt') or filename.endswith('.pth'):
+                return "pt"
+
+        return None
+    except Exception:
+        return None
+
+
+def extract_param_size(model_id: str) -> Optional[str]:
+    """Extract parameter size from model name or metadata."""
+    # Common patterns: 7B, 8B, 13B, 0.5B, 1.1B, etc. (matched against the lowercased name)
+    patterns = [
+        r'(\d+\.?\d*[bm])',  # 7b, 0.5b, 125m (name is lowercased below, so match lowercase)
+        r'(\d+\.?\d*)b',  # digits captured without the unit suffix
+    ]
+
+    model_name = model_id.lower()
+    for pattern in patterns:
+        match = re.search(pattern, model_name)
+        if match:
+            size = match.group(1).upper()
+            if not size.endswith('B') and not size.endswith('M'):
+                size += 'B'
+            return size
+
+    return None
+
+
+def detect_family(model_info, model_id: str) -> Optional[str]:
+    """Detect model family from model info."""
+    try:
+        # Try to get from config
+        if hasattr(model_info, 'config') and model_info.config:
+            model_type = model_info.config.get('model_type')
+            if model_type and model_type in KNOWN_FAMILIES:
+                return model_type
+
+        # Fallback to tags
+        if hasattr(model_info, 'tags') and model_info.tags:
+            for tag in model_info.tags:
+                if tag in KNOWN_FAMILIES:
+                    return tag
+
+        # Last resort: parse from model name
+        model_name_lower = model_id.lower()
+        for family in KNOWN_FAMILIES:
+            if family in model_name_lower:
+                return family
+
+        return None
+    except Exception:
+        return None
+
+
+def is_compatible_model(api: HfApi, model_id: str, max_size_gb: float = 20.0) -> tuple[bool, Optional[Dict]]:
+    """
+    Check if model is compatible with modctl.
+ + Returns: + (is_compatible, model_metadata) tuple + """ + # Get all model information + try: + model_info = api.model_info(model_id, files_metadata=True) + except Exception as e: + print(f"Skipping {model_id}: Could not fetch model info: {e}", file=sys.stderr) + return False, None + + # Check for config.json + if not has_config_json(model_info): + print(f"Skipping {model_id}: No config.json", file=sys.stderr) + return False, None + + # Check format + format_type = get_model_format(model_info) + if not format_type: + print(f"Skipping {model_id}: No supported model format found", file=sys.stderr) + return False, None + + # Check size + size_gb = get_model_size_gb(model_info) + if size_gb and size_gb > max_size_gb: + print(f"Skipping {model_id}: Too large ({size_gb:.2f}GB > {max_size_gb}GB)", file=sys.stderr) + return False, None + + # Detect family + family = detect_family(model_info, model_id) + + # Extract param size + param_size = extract_param_size(model_id) + + metadata = { + "id": model_id, + "family": family or "unknown", + "arch": "transformer", # modctl auto-detects this from config.json + "format": format_type, + "param_size": param_size or "unknown", + "size_gb": round(size_gb, 2) if size_gb else None, + "downloads": model_info.downloads if hasattr(model_info, 'downloads') else 0, + "likes": model_info.likes if hasattr(model_info, 'likes') else 0, + } + + return True, metadata + + +def select_top_models( + limit: int = 10, + max_size_gb: float = 20.0, + sort_by: str = "downloads", + task: Optional[str] = "text-generation", +) -> List[Dict]: + """ + Select top models from HuggingFace Hub. + + Args: + limit: Number of models to return + max_size_gb: Maximum model size in GB + sort_by: Sort criteria (downloads, likes, trending) + task: Task filter (text-generation, image-classification, etc.) 
+
+    Returns:
+        List of model metadata dictionaries
+    """
+    api = HfApi()
+
+    print(f"Fetching top {limit} models (sort by: {sort_by}, max size: {max_size_gb}GB)...", file=sys.stderr)
+
+    # Fetch more models than needed to account for filtering
+    fetch_limit = limit * 10
+
+    # Use ModelFilter if available, otherwise pass task as filter string
+    if ModelFilter is not None:
+        model_filter = ModelFilter(
+            task=task,
+            library="transformers",
+        )
+        models = api.list_models(
+            filter=model_filter,
+            sort=sort_by,
+            direction=-1,
+            limit=fetch_limit,
+        )
+    else:
+        # Newer huggingface_hub releases removed ModelFilter; pass the task as a plain filter string
+        models = api.list_models(
+            filter=task,
+            sort=sort_by,
+            direction=-1,
+            limit=fetch_limit,
+        )
+
+    selected = []
+    checked = 0
+
+    for model in models:
+        checked += 1
+        print(f"Checking {checked}/{fetch_limit}: {model.id}...", file=sys.stderr)
+
+        is_compatible, metadata = is_compatible_model(api, model.id, max_size_gb)
+
+        if is_compatible and metadata:
+            selected.append(metadata)
+            print(f"✓ Added {model.id} ({len(selected)}/{limit})", file=sys.stderr)
+
+        if len(selected) >= limit:
+            break
+
+    print(f"\nSelected {len(selected)} compatible models", file=sys.stderr)
+    return selected
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Select top HuggingFace models compatible with modctl"
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=10,
+        help="Number of models to select (default: 10)",
+    )
+    parser.add_argument(
+        "--max-size",
+        type=float,
+        default=20.0,
+        help="Maximum model size in GB (default: 20.0)",
+    )
+    parser.add_argument(
+        "--sort-by",
+        choices=["downloads", "likes", "trending"],
+        default="downloads",
+        help="Sort criteria (default: downloads)",
+    )
+    parser.add_argument(
+        "--task",
+        default="text-generation",
+        help="Task filter (default: text-generation)",
+    )
+    parser.add_argument(
+        "--output",
+        help="Output file path (default: stdout)",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        models = select_top_models(
+            limit=args.limit,
+            max_size_gb=args.max_size,
+            sort_by=args.sort_by,
+            task=args.task,
+        )
+
+        output = json.dumps(models, indent=2)
+
+        if args.output:
+            with open(args.output, 'w') as f:
+                f.write(output)
+            print(f"\nWrote {len(models)} models to {args.output}", file=sys.stderr)
+        else:
+            print(output)
+
+        return 0
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc(file=sys.stderr)
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/contrib/scripts/top-model-selection.md b/contrib/scripts/top-model-selection.md
new file mode 100644
index 00000000..fbf03fb7
--- /dev/null
+++ b/contrib/scripts/top-model-selection.md
@@ -0,0 +1,131 @@
+# Model Selection Scripts
+
+This directory contains scripts for selecting and filtering HuggingFace models compatible with modctl.
+
+## select-top-models.py
+
+A Python script that fetches top models from HuggingFace Hub and filters them based on modctl compatibility criteria.
+
+### Compatibility Criteria
+
+The script filters models based on:
+
+1. **Has config.json** - Required for auto-detection of model metadata
+2. **Supported formats** - Must have files in formats like:
+   - `safetensors` (preferred)
+   - `gguf`
+   - `bin` (PyTorch)
+   - `pt`, `pth` (PyTorch)
+   - `onnx`
+3. **Size limit** - Configurable maximum size (default: 20GB)
+4. **Metadata** - Attempts to extract:
+   - Model family (llama, qwen, gpt2, etc.)
+   - Parameter size (0.5B, 7B, etc.)
+   - Format type
+
+### Installation
+
+```bash
+pip install -r requirements.txt
+```
+
+### Usage
+
+Basic usage (fetch top 10 models by downloads):
+
+```bash
+python contrib/scripts/select-top-models.py
+```
+
+#### Options
+
+```bash
+python contrib/scripts/select-top-models.py \
+  --limit 10 \
+  --max-size 20.0 \
+  --sort-by downloads \
+  --task text-generation \
+  --output models.json
+```
+
+- `--limit` - Number of models to select (default: 10)
+- `--max-size` - Maximum model size in GB (default: 20.0)
+- `--sort-by` - Sort by: downloads, likes, trending (default: downloads)
+- `--task` - Task filter (default: text-generation)
+- `--output` - Output file (default: stdout)
+
+#### Examples
+
+Get top 5 small models (< 5GB):
+
+```bash
+python contrib/scriptsselect-top-models.py --limit 5 --max-size 5
+```
+
+Get most liked models:
+
+```bash
+python contrib/scripts/select-top-models.py --limit 10 --sort-by likes
+```
+
+Save to file:
+
+```bash
+python contrib/scripts/select-top-models.py --limit 20 --output top_models.json
+```
+
+### Output Format
+
+The script outputs JSON with model metadata:
+
+```json
+[
+  {
+    "id": "Qwen/Qwen3-0.6B",
+    "family": "qwen3",
+    "arch": "transformer",
+    "format": "safetensors",
+    "param_size": "0.6B",
+    "size_gb": 1.41,
+    "downloads": 7509488,
+    "likes": 867
+  }
+]
+```
+
+### Authentication
+
+Some models require HuggingFace authentication. Set the `HF_TOKEN` environment variable:
+
+```bash
+export HF_TOKEN="your_huggingface_token"
+python contrib/scripts/select-top-models.py
+```
+
+Or use `huggingface-cli`:
+
+```bash
+huggingface-cli login
+python contrib/scripts/select-top-models.py
+```
+
+## GitHub Workflow Integration
+
+The `build-top-models.yml` workflow uses this script to automatically:
+
+1. Select top models from HuggingFace
+2. Build them using modctl
+3. Push to GitHub Container Registry
+
+### Manual Trigger
+
+You can manually trigger the workflow from the GitHub Actions tab with custom parameters:
+
+- **limit**: Number of models to build (default: 10)
+- **max_size**: Maximum model size in GB (default: 10)
+- **sort_by**: Sort criteria - downloads, likes, or trending
+
+### Scheduled Runs
+
+The workflow runs automatically every Sunday at 00:00 UTC.
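+
+The cadence comes from the `on.schedule` trigger in `.github/workflows/build-top-models.yml`:
+
+```yaml
+on:
+  schedule:
+    # Run weekly on Sunday at 00:00 UTC
+    - cron: '0 0 * * 0'
+```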
+
+### Required Secrets
+
+The workflow requires these GitHub secrets:
+
+- `HF_TOKEN` - HuggingFace API token (for downloading models)
+- `GITHUB_TOKEN` - Automatically provided by GitHub Actions

From 9f4e06b4b1216cbd6309b46c3a836d558fbb29a4 Mon Sep 17 00:00:00 2001
From: Avinash Singh
Date: Mon, 22 Dec 2025 15:28:46 +0530
Subject: [PATCH 2/2] Optimise as per review

Signed-off-by: Avinash Singh
---
 contrib/scripts/select-top-models.py   | 23 +++++++++++++----------
 contrib/scripts/top-model-selection.md |  2 +-
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/contrib/scripts/select-top-models.py b/contrib/scripts/select-top-models.py
index f6b2b409..26fe7cf3 100644
--- a/contrib/scripts/select-top-models.py
+++ b/contrib/scripts/select-top-models.py
@@ -35,7 +35,7 @@
 ]
 
 # Model families known to work well with modctl
-KNOWN_FAMILIES = [
+KNOWN_FAMILIES = {
     "llama",
     "qwen",
     "qwen2",
@@ -50,7 +50,7 @@
     "falcon",
     "mpt",
     "stablelm",
-]
+}
 
 
 def get_model_size_gb(model_info) -> Optional[float]:
@@ -62,7 +62,8 @@ def get_model_size_gb(model_info) -> Optional[float]:
             if hasattr(file, 'size') and file.size:
                 total_size += file.size
         return total_size / (1024 ** 3)  # Convert to GB
-    except Exception:
+    except Exception as e:
+        print(f"Error: An error occurred in get_model_size_gb: {e}", file=sys.stderr)
         return None
 
 
@@ -70,11 +71,11 @@ def has_config_json(model_info) -> bool:
     """Check if model has config.json for auto-detection."""
     try:
         if hasattr(model_info, 'siblings') and model_info.siblings:
-            filenames = [f.rfilename for f in model_info.siblings]
-            return "config.json" in filenames
-        return False
-    except Exception:
+            return any(f.rfilename == "config.json" for f in model_info.siblings)
         return False
+    except Exception as e:
+        print(f"Error: An error occurred in has_config_json: {e}", file=sys.stderr)
+        return False
 
 
 def get_model_format(model_info) -> Optional[str]:
@@ -98,7 +99,8 @@ def get_model_format(model_info) -> Optional[str]:
             return "pt"
 
         return None
-    except Exception:
+    except Exception as e:
+        print(f"Error: An error occurred in get_model_format: {e}", file=sys.stderr)
         return None
 
 
@@ -144,7 +146,8 @@ def detect_family(model_info, model_id: str) -> Optional[str]:
             return family
 
         return None
-    except Exception:
+    except Exception as e:
+        print(f"Error: An error occurred in detect_family: {e}", file=sys.stderr)
         return None
 
 
@@ -319,7 +322,7 @@ def main():
         return 0
 
     except Exception as e:
-        print(f"Error: {e}", file=sys.stderr)
+        print(f"Error: An error occurred in main: {e}", file=sys.stderr)
         import traceback
         traceback.print_exc(file=sys.stderr)
         return 1
diff --git a/contrib/scripts/top-model-selection.md b/contrib/scripts/top-model-selection.md
index fbf03fb7..6c552066 100644
--- a/contrib/scripts/top-model-selection.md
+++ b/contrib/scripts/top-model-selection.md
@@ -53,7 +53,7 @@ python contrib/scripts/select-top-models.py \
 Get top 5 small models (< 5GB):
 
 ```bash
-python contrib/scriptsselect-top-models.py --limit 5 --max-size 5
+python contrib/scripts/select-top-models.py --limit 5 --max-size 5
 ```
 
 Get most liked models: