diff --git a/tools-py/.gitignore b/tools-py/.gitignore new file mode 100644 index 0000000..5b6f473 --- /dev/null +++ b/tools-py/.gitignore @@ -0,0 +1,52 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Output files +drafts/ +*.yml.bak +*.yaml.bak + +# Generated reports +missing_models_report.txt +*_report.txt + +# Logs +*.log + +# OS +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/tools-py/README.md b/tools-py/README.md new file mode 100644 index 0000000..b08cb92 --- /dev/null +++ b/tools-py/README.md @@ -0,0 +1,444 @@ +# Model Openness Tool - Python Tools + +Automated tools for working with the Model Openness Framework (MOF), including scraping model information from HuggingFace and identifying missing models. + +## Tools Included + +### 1. Model Scraper (`model_scraper.py`) +Generates draft YAML files for individual models by scraping HuggingFace. + +### 2. Missing Models Finder (`find_missing_models.py`) +Identifies popular HuggingFace models that are not yet in the MOT database. + +## Overview + +These tools automate data collection and gap analysis for model evaluation by: +- Fetching model metadata from HuggingFace API +- Analyzing repository contents to detect available MOF components +- Identifying license information +- Generating MOF-compliant YAML files with confidence scores +- Finding missing models that should be added to MOT +- Flagging areas requiring manual review + +**⚠️ Important**: Generated YAML files are **DRAFTS** that require manual review and validation before submission to the MOT database. + +## Installation + +### Prerequisites +- Python 3.8 or higher +- pip package manager + +### Setup + +1. Navigate to the tools-py directory: +```bash +cd tools-py +``` + +2. 
Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+### Tool 1: Model Scraper
+
+#### Basic Usage
+
+Scrape a model from HuggingFace:
+```bash
+python model_scraper.py <model_id>
+```
+
+Example:
+```bash
+python model_scraper.py meta-llama/Llama-3-8B
+```
+
+#### Advanced Options
+
+Specify output directory:
+```bash
+python model_scraper.py meta-llama/Llama-3-8B --output-dir ../models
+```
+
+Use HuggingFace token for gated models:
+```bash
+python model_scraper.py meta-llama/Llama-3-8B --hf-token YOUR_TOKEN
+```
+
+### Tool 2: Missing Models Finder
+
+#### Basic Usage
+
+Find missing models with default settings (min 1000 downloads):
+```bash
+python find_missing_models.py
+```
+
+#### Advanced Options
+
+Set minimum download threshold:
+```bash
+python find_missing_models.py --min-downloads 10000
+```
+
+Limit number of models to check:
+```bash
+python find_missing_models.py --limit 500
+```
+
+Save report to file:
+```bash
+python find_missing_models.py --output missing_models_report.txt
+```
+
+Filter by model type:
+```bash
+python find_missing_models.py --model-type text-generation
+```
+
+#### Command-Line Arguments
+
+- `--min-downloads`: Minimum number of downloads to consider (default: 1000)
+- `--limit`: Maximum number of models to fetch from HuggingFace (default: 1000)
+- `--models-dir`: Path to MOT models directory (default: ../models)
+- `--output`: Output file for report (default: print to console)
+- `--model-type`: Filter by model type (e.g., text-generation, image-to-text)
+
+#### Example Workflow
+
+**Option A: Manual Workflow (Selective)**
+
+1. Find missing high-priority models:
+```bash
+python find_missing_models.py --min-downloads 50000 --output report.txt
+```
+
+2. Review the report and identify models to add
+
+3. Scrape the identified models:
+```bash
+python model_scraper.py meta-llama/Llama-3-8B
+python model_scraper.py mistralai/Mistral-7B-v0.1
+```
+
+4. Review and validate the generated YAML files
+
+5. Submit PRs to add the models to MOT
+
+**Option B: Batch Processing (Automated)**
+
+Use the batch script to automatically find and scrape missing models:
+
+```bash
+./batch_scrape_missing.sh
+```
+
+This will:
+1. Find missing models (default: 50k+ downloads)
+2. Extract model IDs from the report
+3. Prompt for confirmation
+4. Scrape the first 20 models automatically
+5. Generate a summary with success/failure counts
+
+**Customize batch processing:**
+
+```bash
+# Set minimum downloads threshold
+MIN_DOWNLOADS=100000 ./batch_scrape_missing.sh
+
+# Process more models (default is 20)
+MAX_MODELS=50 ./batch_scrape_missing.sh
+
+# Process ALL missing models (use with caution!)
+MAX_MODELS=999 ./batch_scrape_missing.sh
+
+# Use HuggingFace token for gated models
+HF_TOKEN=your_token ./batch_scrape_missing.sh
+
+# Change output directory
+OUTPUT_DIR=./draft_models ./batch_scrape_missing.sh
+
+# Combine multiple options
+MIN_DOWNLOADS=100000 MAX_MODELS=50 HF_TOKEN=your_token ./batch_scrape_missing.sh
+```
+
+**Environment Variables:**
+- `MIN_DOWNLOADS`: Minimum download threshold (default: 50000)
+- `LIMIT`: Max models to check on HuggingFace (default: 1000)
+- `MAX_MODELS`: Max models to scrape in one batch (default: 20)
+- `OUTPUT_DIR`: Output directory for YAML files (default: ../models)
+- `HF_TOKEN`: HuggingFace API token for gated models (optional)
+
+**Note:** The default limit of 20 models is a safety measure. Each model takes roughly 2-5 seconds to scrape, so a batch of 20 finishes in about 1-2 minutes. Increase MAX_MODELS carefully based on your needs.
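+
+**Programmatic use (optional):** Both tools expose plain Python classes, so the find-then-scrape workflow can also be driven from a short script instead of `batch_scrape_missing.sh`. The sketch below is illustrative only: it relies on the class and method names defined in `find_missing_models.py` and `model_scraper.py` in this directory, and the download threshold, batch size, and output path are placeholder values to adjust.
+
+```python
+from pathlib import Path
+
+from find_missing_models import MissingModelsFinder
+from model_scraper import ModelScraper
+
+# Compare popular HuggingFace models against the local MOT database.
+finder = MissingModelsFinder(models_dir="../models")
+mot_models = finder.get_mot_models()
+hf_models = finder.get_huggingface_models(min_downloads=50000, limit=200)
+missing = [m for m in hf_models if not finder.is_model_in_mot(m, mot_models)[0]]
+
+# Scrape a small batch of the missing models into draft YAML files.
+scraper = ModelScraper(hf_token=None)  # pass a token string for gated models
+for model in missing[:5]:              # keep batches small, as with MAX_MODELS
+    model_id = model["id"]
+    data = scraper.scrape_huggingface_model(model_id)
+    if data:
+        out = Path("../models") / f"{model_id.split('/')[-1]}.yml"
+        scraper.generate_yaml(data, str(out))
+```
+
+As with the batch script, every file this produces is a draft that still needs manual review.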
+ +### Advanced Options + +Specify output directory: +```bash +python model_scraper.py meta-llama/Llama-3-8B --output-dir ../models +``` + +Use HuggingFace token for gated models: +```bash +python model_scraper.py meta-llama/Llama-3-8B --hf-token YOUR_TOKEN +``` + +### Command-Line Arguments + +- `model_id` (required): HuggingFace model ID (e.g., `meta-llama/Llama-3-8B`) +- `--output-dir`: Output directory for YAML files (default: `../models`) +- `--hf-token`: HuggingFace API token for accessing gated models + +## What the Scraper Does + +### 1. Data Collection +- Fetches model metadata from HuggingFace API +- Downloads and parses model card (README.md) +- Lists repository files to detect available components +- Extracts license information + +### 2. Component Detection + +The scraper automatically detects the following MOF components: + +**Code Components:** +- Model parameters (Final) - Detects `.bin`, `.safetensors`, `.pt`, `.pth`, `.ckpt` files +- Model metadata - Detects `config.json`, `model_config.json` +- Model architecture - Detects Python files with modeling code +- Inference code - Detects files with inference/generation keywords + +**Data Components:** +- Training dataset - Detects references in model card + +**Documentation Components:** +- Model card - Detects README.md +- Technical report - Detects references in model card +- Research paper - Detects paper/arxiv references +- Evaluation results - Detects benchmark/performance mentions + +### 3. License Detection +- Extracts license from HuggingFace model metadata +- Checks for LICENSE files in repository +- Defaults to "unlicensed" when uncertain (requires manual review) + +### 4. Repository Detection + +The scraper automatically detects GitHub repositories using multiple strategies with confidence scoring: + +**Detection Methods (in priority order):** + +1. **Model Card Parsing** (70-90% confidence) + - Searches the model card (README.md) for GitHub URLs + - Filters for the most relevant repository (matching model/organization name) + - Higher confidence when repository name closely matches model name + - Example: Found `https://github.com/bigscience-workshop/bigscience` for BLOOM model + +2. **Pattern-Based Inference** (60-65% confidence) + - Attempts direct mapping: `organization/model` → `github.com/organization/model` + - Validates repository existence via HTTP HEAD request + - Example: `mistralai/Mistral-7B-v0.1` → `github.com/mistralai/Mistral-7B-v0.1` + +3. **Name Variations** (60% confidence) + - Tries base model names without version suffixes + - Tests multiple naming patterns (e.g., `Mistral-7B-v0.1` → `Mistral`) + - Validates each attempt before accepting + +**Output Format:** +```yaml +# Repository detected: 70% confidence +release: + repository: https://github.com/bigscience-workshop/bigscience +``` + +**When No Repository Found:** +- Repository field left empty (requires manual addition) +- Common for models without public code repositories +- May indicate closed-source or proprietary models + +**Manual Review Required:** +- Verify detected repository is correct and official +- Check if repository contains actual model code/weights +- Some models may have multiple repositories (training vs. inference) + +### 5. 
Confidence Scoring + +Each detected component includes a confidence score: +- **95%**: High confidence (e.g., model parameters detected via file extensions) +- **80-90%**: Good confidence (e.g., config files, model card) +- **60-75%**: Medium confidence (e.g., references in documentation) +- **50%**: Low confidence (requires verification) + +## Output Format + +Generated YAML files include: +- MOF framework metadata +- Model release information (name, version, date, producer) +- Detected components with descriptions and licenses +- Confidence scores in comments +- Links to HuggingFace repository + +Example output structure: +```yaml +# AUTO-GENERATED DRAFT - REQUIRES MANUAL REVIEW +# Generated by Model Openness Tool scraper +# Source: HuggingFace model meta-llama/Llama-3-8B +# +# Component confidence scores: +# - Model parameters (Final): 95% confidence +# - Model card: 95% confidence +# +framework: + name: Model Openness Framework + version: '1.0' + date: '2024-12-15' +release: + name: Llama-3-8B + version: 8B + date: '2024-10-03' + license: {} + type: language + architecture: transformer decoder + origin: llama-3-8b + producer: Meta + contact: '' + huggingface: https://huggingface.co/meta-llama/Llama-3-8B + components: + - name: Model parameters (Final) + description: Trained model parameters, weights and biases + license: llama-3 +``` + +## Manual Review Checklist + +After generating a YAML file, you **must** review and verify: + +### 1. Model Metadata +- [ ] Verify model name and version +- [ ] Confirm producer/organization +- [ ] Check release date accuracy +- [ ] Validate model type (language, vision, multimodal, etc.) +- [ ] Verify architecture classification + +### 2. Components +- [ ] Confirm all detected components are actually available +- [ ] Add any missing components not detected by scraper +- [ ] Verify component descriptions are accurate +- [ ] Update component locations/URLs if needed + +### 3. Licenses +- [ ] **Critical**: Verify all license information +- [ ] Check if licenses are correctly identified as open/closed +- [ ] Add license file paths where available +- [ ] Replace "unlicensed" with actual license names +- [ ] Ensure license compatibility across components + +### 4. Additional Information +- [ ] Add contact information if available +- [ ] Add GitHub repository URL if different from HuggingFace +- [ ] Include paper URLs (arXiv, conference proceedings) +- [ ] Add any special notes or caveats + +## Validation + +After manual review, validate the YAML file: + +```bash +cd .. +php scripts/validate-model.php models/Your-Model.yml +``` + +The validation script checks: +- Schema compliance with `schema/mof_schema.json` +- Required fields are present +- Data types are correct +- Enum values are valid + +## Submission Workflow + +1. **Generate draft**: Run the scraper +2. **Manual review**: Edit the generated YAML file +3. **Validate**: Run validation script +4. **Test locally**: Import into local MOT instance +5. **Submit PR**: Create pull request to add model to MOT + +See [CONTRIBUTING.md](../CONTRIBUTING.md) for detailed submission instructions. 
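+
+**Optional: Python schema pre-check.** If a PHP environment is not available, a rough schema check can be run from Python before step 3 of the workflow above. This is a minimal sketch, not part of the toolset: it assumes the `jsonschema` package (not listed in `requirements.txt`), the schema path `../schema/mof_schema.json` relative to `tools-py`, and an arbitrary file name such as `check_draft.py`. The PHP validator remains the authoritative check.
+
+```python
+import json
+import sys
+
+import yaml
+from jsonschema import ValidationError, validate  # pip install jsonschema
+
+# Usage: python check_draft.py ../models/Your-Model.yml
+with open("../schema/mof_schema.json", encoding="utf-8") as f:
+    schema = json.load(f)
+with open(sys.argv[1], encoding="utf-8") as f:
+    draft = yaml.safe_load(f)
+
+try:
+    validate(instance=draft, schema=schema)
+    print("Draft passes basic schema validation")
+except ValidationError as err:
+    print(f"Schema violation: {err.message}")
+```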
+ +## Limitations + +### Current Limitations +- Only supports HuggingFace as a data source +- Cannot access gated models without API token +- License detection is basic (requires manual verification) +- Cannot determine if training code/data is actually available +- May miss components not clearly documented +- Cannot validate license openness automatically + +### Known Issues +- Some model cards use non-standard formats +- License information may be incomplete or ambiguous +- Component availability may be overstated +- Confidence scores are heuristic-based + +## Future Enhancements + +Potential improvements: +- [ ] Support for additional sources (GitHub, Papers with Code, etc.) +- [ ] LLM-powered content analysis for better component detection +- [ ] Automated license file parsing and classification +- [ ] Integration with license databases (SPDX, OSI) +- [ ] Batch processing for multiple models +- [ ] Interactive mode for guided review +- [ ] Comparison with existing MOT entries +- [ ] GitHub API integration for repository analysis + +## Troubleshooting + +### Common Issues + +**"Failed to scrape model data"** +- Check model ID is correct (format: `organization/model-name`) +- Verify internet connection +- For gated models, provide HuggingFace token with `--hf-token` + +**"Model not found"** +- Ensure model exists on HuggingFace +- Check for typos in model ID +- Some models may be private or deleted + +**"Permission denied"** +- Gated models require authentication +- Get HuggingFace token from https://huggingface.co/settings/tokens +- Use `--hf-token` argument + +**Low confidence scores** +- Normal for models with minimal documentation +- Requires more thorough manual review +- Consider contacting model producer for clarification + +## Contributing + +To improve the scraper: +1. Fork the repository +2. Make your changes +3. Test with various models +4. Submit a pull request + +See [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelines. + +## License + +This scraper is part of the Model Openness Tool, licensed under MIT. + +## Support + +For issues or questions: +- File an issue on GitHub +- Check existing issues for solutions +- Consult the main MOT documentation + +## Acknowledgments + +This scraper was developed to accelerate the MOT model evaluation process while maintaining the quality and accuracy standards of the Model Openness Framework. 
\ No newline at end of file diff --git a/tools-py/batch_scrape_missing.sh b/tools-py/batch_scrape_missing.sh new file mode 100755 index 0000000..f0445eb --- /dev/null +++ b/tools-py/batch_scrape_missing.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Batch Scrape Missing Models +# This script finds missing models and generates draft YAML files for each + +set -e # Exit on error + +# Configuration +MIN_DOWNLOADS=${MIN_DOWNLOADS:-50000} # Default: 50k downloads +LIMIT=${LIMIT:-1000} # Default: check 1000 models +MAX_MODELS=${MAX_MODELS:-20} # Default: process 20 models (safety limit) +OUTPUT_DIR=${OUTPUT_DIR:-../models} # Default: ../models directory +REPORT_FILE="missing_models_report.txt" +HF_TOKEN=${HF_TOKEN:-""} # Optional HuggingFace token + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}========================================${NC}" +echo -e "${BLUE}Batch Scrape Missing Models${NC}" +echo -e "${BLUE}========================================${NC}" +echo "" + +# Step 1: Find missing models +echo -e "${YELLOW}Step 1: Finding missing models...${NC}" +echo " Min downloads: $MIN_DOWNLOADS" +echo " Limit: $LIMIT models" +echo "" + +python find_missing_models.py \ + --min-downloads "$MIN_DOWNLOADS" \ + --limit "$LIMIT" \ + --output "$REPORT_FILE" + +if [ ! -f "$REPORT_FILE" ]; then + echo -e "${RED}Error: Report file not generated${NC}" + exit 1 +fi + +# Step 2: Extract model IDs from report +echo "" +echo -e "${YELLOW}Step 2: Extracting model IDs from report...${NC}" + +# Extract lines with model IDs (format: " org/model-name downloads") +# Look for lines starting with spaces followed by org/model pattern +MODEL_IDS=$(grep -E "^ [a-zA-Z0-9_-]+/[a-zA-Z0-9_.-]+" "$REPORT_FILE" | \ + awk '{print $1}' | \ + head -n "$MAX_MODELS") + +MODEL_COUNT=$(echo "$MODEL_IDS" | wc -l | tr -d ' ') + +if [ -z "$MODEL_IDS" ]; then + echo -e "${RED}No missing models found in report${NC}" + exit 0 +fi + +echo " Found $MODEL_COUNT models to scrape" +echo "" + +# Step 3: Confirm with user +echo -e "${YELLOW}Models to scrape:${NC}" +echo "$MODEL_IDS" | nl +echo "" +echo -e "${YELLOW}This will generate $MODEL_COUNT YAML files in: $OUTPUT_DIR${NC}" +read -p "Continue? (y/n) " -n 1 -r +echo "" + +if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then + echo "Aborted by user" + exit 0 +fi + +# Step 4: Scrape each model +echo "" +echo -e "${YELLOW}Step 3: Scraping models...${NC}" +echo "" + +SUCCESS_COUNT=0 +FAIL_COUNT=0 +FAILED_MODELS=() + +# Create output directory if it doesn't exist +mkdir -p "$OUTPUT_DIR" + +# Loop through each model +CURRENT=0 +while IFS= read -r MODEL_ID; do + CURRENT=$((CURRENT + 1)) + echo -e "${BLUE}[$CURRENT/$MODEL_COUNT] Scraping: $MODEL_ID${NC}" + + # Build command with optional token + CMD="python model_scraper.py \"$MODEL_ID\" --output-dir \"$OUTPUT_DIR\"" + if [ -n "$HF_TOKEN" ]; then + CMD="$CMD --hf-token \"$HF_TOKEN\"" + fi + + # Run scraper + if eval "$CMD"; then + echo -e "${GREEN} ✓ Success${NC}" + SUCCESS_COUNT=$((SUCCESS_COUNT + 1)) + else + echo -e "${RED} ✗ Failed${NC}" + FAIL_COUNT=$((FAIL_COUNT + 1)) + FAILED_MODELS+=("$MODEL_ID") + fi + + echo "" + + # Small delay to avoid rate limiting + sleep 1 +done <<< "$MODEL_IDS" + +# Step 5: Summary +echo "" +echo -e "${BLUE}========================================${NC}" +echo -e "${BLUE}Summary${NC}" +echo -e "${BLUE}========================================${NC}" +echo -e "${GREEN}Successful: $SUCCESS_COUNT${NC}" +echo -e "${RED}Failed: $FAIL_COUNT${NC}" +echo "" + +if [ $FAIL_COUNT -gt 0 ]; then + echo -e "${RED}Failed models:${NC}" + for MODEL in "${FAILED_MODELS[@]}"; do + echo " - $MODEL" + done + echo "" +fi + +echo -e "${YELLOW}Next steps:${NC}" +echo "1. Review generated YAML files in: $OUTPUT_DIR" +echo "2. Manually verify and edit each file" +echo "3. Validate with: php ../scripts/validate-model.php models/Your-Model.yml" +echo "4. Submit PRs for reviewed models" +echo "" +echo "Report saved to: $REPORT_FILE" diff --git a/tools-py/example_usage.sh b/tools-py/example_usage.sh new file mode 100755 index 0000000..b3907be --- /dev/null +++ b/tools-py/example_usage.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# Example usage of the Model Openness Tool Python scripts + +echo "Model Openness Tool - Python Scripts Examples" +echo "==============================================" +echo "" + +# Example 1: Find missing models +echo "Example 1: Find missing models from HuggingFace" +echo "------------------------------------------------" +echo "Command: python find_missing_models.py --min-downloads 10000" +echo "" +echo "This will:" +echo " - Compare HuggingFace models with MOT database" +echo " - Identify popular models not yet in MOT" +echo " - Generate a prioritized report" +echo "" +# Uncomment to run: +# python find_missing_models.py --min-downloads 10000 + +# Example 2: Save missing models report to file +echo "Example 2: Save missing models report" +echo "--------------------------------------" +echo "Command: python find_missing_models.py --output missing_report.txt" +echo "" +# Uncomment to run: +# python find_missing_models.py --output missing_report.txt + +# Example 3: Basic model scraping +echo "Example 3: Scraping a single model" +echo "-----------------------------------" +echo "Command: python model_scraper.py google/gemma-2b" +echo "" +# Uncomment to run: +# python model_scraper.py google/gemma-2b + +# Example 4: Scraping with custom output directory +echo "Example 4: Custom output directory" +echo "-----------------------------------" +echo "Command: python model_scraper.py microsoft/phi-2 --output-dir ./drafts" +echo "" +# Uncomment to run: +# python model_scraper.py microsoft/phi-2 --output-dir ./drafts + +# Example 5: Scraping a gated model (requires token) +echo "Example 5: Gated model (requires HuggingFace token)" 
+echo "----------------------------------------------------" +echo "Command: python model_scraper.py meta-llama/Llama-3-8B --hf-token YOUR_TOKEN" +echo "" +echo "Get your token from: https://huggingface.co/settings/tokens" +echo "" +# Uncomment and add your token to run: +# python model_scraper.py meta-llama/Llama-3-8B --hf-token YOUR_TOKEN + +# Example 6: Complete workflow +echo "Example 6: Complete workflow (find missing + scrape)" +echo "-----------------------------------------------------" +echo "Step 1: Find missing models" +echo " python find_missing_models.py --min-downloads 50000 --output report.txt" +echo "" +echo "Step 2: Review report.txt and identify models to add" +echo "" +echo "Step 3: Scrape identified models" +echo " python model_scraper.py meta-llama/Llama-3-8B" +echo " python model_scraper.py mistralai/Mistral-7B-v0.1" +echo "" +echo "Step 4: Review and validate generated YAML files" +echo " php ../scripts/validate-model.php ../models/Llama-3-8B.yml" +echo "" +echo "Step 5: Submit PR to add models to MOT" +echo "" + +# Example 7: Batch processing multiple models +echo "Example 7: Batch processing" +echo "----------------------------" +echo "Processing multiple models in sequence:" +echo "" + +models=( + "google/gemma-2b" + "microsoft/phi-2" + "mistralai/Mistral-7B-v0.1" +) + +for model in "${models[@]}"; do + echo " - $model" +done +echo "" +echo "To run batch processing, uncomment the loop below in this script" +echo "" + +# Uncomment to run batch processing: +# for model in "${models[@]}"; do +# echo "Processing: $model" +# python model_scraper.py "$model" +# echo "" +# done + +echo "==============================================" +echo "Summary of available tools:" +echo " 1. find_missing_models.py - Identify models to add" +echo " 2. model_scraper.py - Generate draft YAML files" +echo "" +echo "Next steps after scraping:" +echo " 1. Review generated YAML files in ../models/" +echo " 2. Manually verify and edit the files" +echo " 3. Validate: php ../scripts/validate-model.php ../models/YourModel.yml" +echo " 4. Submit PR to add to MOT database" +echo "" diff --git a/tools-py/find_missing_models.py b/tools-py/find_missing_models.py new file mode 100755 index 0000000..1a76871 --- /dev/null +++ b/tools-py/find_missing_models.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +Model Openness Tool - Missing Models Finder + +This script identifies models on HuggingFace that are not yet in the MOT database. +It compares popular/trending models from HuggingFace against existing MOT YAML files. + +Usage: + python find_missing_models.py [--min-downloads MIN] [--limit LIMIT] [--output OUTPUT] + +Example: + python find_missing_models.py --min-downloads 10000 --limit 500 +""" + +import argparse +import json +import os +import re +import sys +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple +from urllib.parse import quote + +import requests +import yaml + + +class MissingModelsFinder: + """Finds models on HuggingFace that are missing from MOT.""" + + def __init__(self, models_dir: str = "../models"): + """Initialize the finder. + + Args: + models_dir: Path to MOT models directory + """ + self.models_dir = Path(models_dir) + self.session = requests.Session() + self.session.headers.update({ + 'User-Agent': 'MOT-Missing-Models-Finder/1.0' + }) + + def get_mot_models(self) -> Dict[str, Dict]: + """Get all models currently in MOT database. 
+ + Returns: + Dictionary mapping model names/IDs to their metadata + """ + print("Loading existing MOT models...") + mot_models = {} + + if not self.models_dir.exists(): + print(f"Warning: Models directory not found: {self.models_dir}") + return mot_models + + yaml_files = list(self.models_dir.glob("*.yml")) + print(f"Found {len(yaml_files)} YAML files in MOT database") + + for yaml_file in yaml_files: + try: + with open(yaml_file, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + if not data or 'release' not in data: + continue + + release = data['release'] + model_name = release.get('name', '') + origin = release.get('origin', '') + huggingface = release.get('huggingface', '') + + # Store multiple identifiers for matching + identifiers = set() + if model_name: + identifiers.add(model_name.lower()) + identifiers.add(model_name.lower().replace('-', '_')) + identifiers.add(model_name.lower().replace('_', '-')) + if origin: + identifiers.add(origin.lower()) + if huggingface: + # Extract model ID from HuggingFace URL + hf_id = huggingface.replace('https://huggingface.co/', '') + identifiers.add(hf_id.lower()) + + mot_models[yaml_file.stem] = { + 'name': model_name, + 'origin': origin, + 'huggingface': huggingface, + 'identifiers': identifiers, + 'file': yaml_file.name + } + + except Exception as e: + print(f"Warning: Error reading {yaml_file.name}: {e}") + continue + + print(f"Loaded {len(mot_models)} models from MOT database\n") + return mot_models + + def get_huggingface_models( + self, + min_downloads: int = 1000, + limit: int = 1000, + model_type: Optional[str] = None + ) -> List[Dict]: + """Get popular models from HuggingFace. + + Args: + min_downloads: Minimum number of downloads to consider + limit: Maximum number of models to fetch + model_type: Filter by model type (e.g., 'text-generation') + + Returns: + List of model dictionaries + """ + print(f"Fetching models from HuggingFace (min downloads: {min_downloads:,})...") + + models = [] + page = 0 + + while len(models) < limit: + # HuggingFace API endpoint for models + url = "https://huggingface.co/api/models" + params = { + 'sort': 'downloads', + 'direction': -1, + 'limit': 100, + 'skip': page * 100, + 'full': True + } + + if model_type: + params['filter'] = model_type + + try: + response = self.session.get(url, params=params, timeout=30) + response.raise_for_status() + batch = response.json() + + if not batch: + break + + for model in batch: + downloads = model.get('downloads', 0) + if downloads >= min_downloads: + models.append(model) + + if len(models) >= limit: + break + + page += 1 + print(f" Fetched {len(models)} models so far...", end='\r') + + except requests.exceptions.RequestException as e: + print(f"\nError fetching models: {e}") + break + + print(f"\nFetched {len(models)} models from HuggingFace\n") + return models + + def normalize_model_id(self, model_id: str) -> Set[str]: + """Generate normalized variations of a model ID for matching. 
+ + Args: + model_id: Model ID (e.g., 'meta-llama/Llama-3-8B') + + Returns: + Set of normalized variations + """ + variations = set() + + # Original + variations.add(model_id.lower()) + + # Without organization prefix + if '/' in model_id: + model_name = model_id.split('/')[-1] + variations.add(model_name.lower()) + variations.add(model_name.lower().replace('-', '_')) + variations.add(model_name.lower().replace('_', '-')) + + # With underscores/hyphens swapped + variations.add(model_id.lower().replace('-', '_')) + variations.add(model_id.lower().replace('_', '-')) + + return variations + + def is_model_in_mot(self, hf_model: Dict, mot_models: Dict) -> Tuple[bool, str]: + """Check if a HuggingFace model is already in MOT. + + Args: + hf_model: HuggingFace model dictionary + mot_models: Dictionary of MOT models + + Returns: + Tuple of (is_present, matched_file) + """ + model_id = hf_model.get('id', '') + variations = self.normalize_model_id(model_id) + + for mot_file, mot_data in mot_models.items(): + mot_identifiers = mot_data.get('identifiers', set()) + + # Check if any variation matches + if variations & mot_identifiers: + return True, mot_data['file'] + + return False, '' + + def categorize_missing_models( + self, + missing_models: List[Dict] + ) -> Dict[str, List[Dict]]: + """Categorize missing models by type and popularity. + + Args: + missing_models: List of missing model dictionaries + + Returns: + Dictionary of categorized models + """ + categories = { + 'high_priority': [], # >100k downloads + 'medium_priority': [], # 10k-100k downloads + 'low_priority': [], # <10k downloads + 'by_type': {} + } + + for model in missing_models: + downloads = model.get('downloads', 0) + tags = model.get('tags', []) + + # Priority by downloads + if downloads >= 100000: + categories['high_priority'].append(model) + elif downloads >= 10000: + categories['medium_priority'].append(model) + else: + categories['low_priority'].append(model) + + # By type + model_type = 'other' + type_tags = [ + 'text-generation', 'text2text-generation', + 'image-to-text', 'text-to-image', + 'automatic-speech-recognition', 'audio-classification', + 'image-classification', 'object-detection' + ] + + for tag in tags: + if tag in type_tags: + model_type = tag + break + + if model_type not in categories['by_type']: + categories['by_type'][model_type] = [] + categories['by_type'][model_type].append(model) + + return categories + + def generate_report( + self, + missing_models: List[Dict], + mot_models: Dict, + output_file: Optional[str] = None + ) -> str: + """Generate a report of missing models. 
+ + Args: + missing_models: List of missing model dictionaries + mot_models: Dictionary of MOT models + output_file: Optional file to save report + + Returns: + Report text + """ + categories = self.categorize_missing_models(missing_models) + + report_lines = [] + report_lines.append("=" * 80) + report_lines.append("MODEL OPENNESS TOOL - MISSING MODELS REPORT") + report_lines.append("=" * 80) + report_lines.append("") + + # Summary + report_lines.append("SUMMARY") + report_lines.append("-" * 80) + report_lines.append(f"Models in MOT database: {len(mot_models):,}") + report_lines.append(f"Missing models found: {len(missing_models):,}") + report_lines.append(f" - High priority (>100k): {len(categories['high_priority']):,}") + report_lines.append(f" - Medium priority (10k+): {len(categories['medium_priority']):,}") + report_lines.append(f" - Low priority (<10k): {len(categories['low_priority']):,}") + report_lines.append("") + + # By type + report_lines.append("MISSING MODELS BY TYPE") + report_lines.append("-" * 80) + for model_type, models in sorted(categories['by_type'].items(), + key=lambda x: len(x[1]), + reverse=True): + report_lines.append(f" {model_type:30s} {len(models):5,} models") + report_lines.append("") + + # High priority models + if categories['high_priority']: + report_lines.append("HIGH PRIORITY MODELS (>100,000 downloads)") + report_lines.append("-" * 80) + for model in sorted(categories['high_priority'], + key=lambda x: x.get('downloads', 0), + reverse=True)[:50]: # Top 50 + model_id = model.get('id', 'unknown') + downloads = model.get('downloads', 0) + tags = ', '.join(model.get('tags', [])[:3]) + report_lines.append(f" {model_id:50s} {downloads:>10,} downloads") + if tags: + report_lines.append(f" Tags: {tags}") + report_lines.append(f" URL: https://huggingface.co/{model_id}") + report_lines.append("") + + # Medium priority models + if categories['medium_priority']: + report_lines.append("MEDIUM PRIORITY MODELS (10,000-100,000 downloads)") + report_lines.append("-" * 80) + report_lines.append(f"Total: {len(categories['medium_priority'])} models") + report_lines.append("Top 20:") + for model in sorted(categories['medium_priority'], + key=lambda x: x.get('downloads', 0), + reverse=True)[:20]: + model_id = model.get('id', 'unknown') + downloads = model.get('downloads', 0) + report_lines.append(f" {model_id:50s} {downloads:>10,} downloads") + report_lines.append("") + + # Commands to scrape + report_lines.append("SUGGESTED SCRAPING COMMANDS") + report_lines.append("-" * 80) + report_lines.append("High priority models (copy and run):") + report_lines.append("") + for model in sorted(categories['high_priority'], + key=lambda x: x.get('downloads', 0), + reverse=True)[:10]: + model_id = model.get('id', 'unknown') + report_lines.append(f"python model_scraper.py {model_id}") + report_lines.append("") + + report_lines.append("=" * 80) + + report_text = '\n'.join(report_lines) + + # Save to file if specified + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + f.write(report_text) + print(f"\nReport saved to: {output_file}") + + return report_text + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description='Find models on HuggingFace that are missing from MOT' + ) + parser.add_argument( + '--min-downloads', + type=int, + default=1000, + help='Minimum number of downloads to consider (default: 1000)' + ) + parser.add_argument( + '--limit', + type=int, + default=1000, + help='Maximum number of models to fetch from HuggingFace 
(default: 1000)' + ) + parser.add_argument( + '--models-dir', + default='../models', + help='Path to MOT models directory (default: ../models)' + ) + parser.add_argument( + '--output', + help='Output file for report (default: print to console)' + ) + parser.add_argument( + '--model-type', + help='Filter by model type (e.g., text-generation, image-to-text)' + ) + + args = parser.parse_args() + + # Initialize finder + finder = MissingModelsFinder(models_dir=args.models_dir) + + print("=" * 80) + print("MODEL OPENNESS TOOL - MISSING MODELS FINDER") + print("=" * 80) + print() + + # Get MOT models + mot_models = finder.get_mot_models() + + # Get HuggingFace models + hf_models = finder.get_huggingface_models( + min_downloads=args.min_downloads, + limit=args.limit, + model_type=args.model_type + ) + + # Find missing models + print("Comparing models...") + missing_models = [] + + for hf_model in hf_models: + is_present, matched_file = finder.is_model_in_mot(hf_model, mot_models) + if not is_present: + missing_models.append(hf_model) + + print(f"Found {len(missing_models)} missing models\n") + + # Generate report + report = finder.generate_report( + missing_models, + mot_models, + output_file=args.output + ) + + # Print report + print(report) + + # Summary + print("\n" + "=" * 80) + print("NEXT STEPS") + print("=" * 80) + print("1. Review the high priority models above") + print("2. Use model_scraper.py to generate draft YAML files") + print("3. Manually review and validate the generated files") + print("4. Submit PRs to add models to MOT database") + print() + print("Example workflow:") + print(" python model_scraper.py meta-llama/Llama-3-8B") + print(" # Review and edit ../models/Llama-3-8B.yml") + print(" php ../scripts/validate-model.php ../models/Llama-3-8B.yml") + print(" # Submit PR") + print() + + +if __name__ == '__main__': + main() + diff --git a/tools-py/model_scraper.py b/tools-py/model_scraper.py new file mode 100755 index 0000000..6358ce5 --- /dev/null +++ b/tools-py/model_scraper.py @@ -0,0 +1,557 @@ +#!/usr/bin/env python3 +""" +Model Openness Tool - Automated Model Data Scraper + +This script scrapes model information from HuggingFace and other sources +to generate draft YAML files for the Model Openness Framework (MOF). 
+ +Usage: + python model_scraper.py [--output-dir OUTPUT_DIR] + +Example: + python model_scraper.py meta-llama/Llama-3-8B --output-dir ../models +""" + +import argparse +import json +import os +import re +import sys +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Tuple +from urllib.parse import urlparse + +import requests +import yaml + + +class ModelScraper: + """Scrapes model information from various sources.""" + + # MOF Component names as defined in the framework + MOF_COMPONENTS = { + 'code': [ + 'Training code', + 'Inference code', + 'Model architecture', + 'Supporting libraries and tools', + ], + 'data': [ + 'Training dataset', + 'Training data preprocessing', + 'Evaluation dataset', + 'Evaluation data preprocessing', + 'Sample model outputs', + 'Data card', + ], + 'document': [ + 'Technical report', + 'Research paper', + 'Model card', + 'Evaluation results', + 'Evaluation methodology', + ], + 'parameters': [ + 'Model parameters (Final)', + 'Model metadata', + ] + } + + # Known open licenses + OPEN_LICENSES = { + 'apache-2.0', 'mit', 'bsd', 'gpl', 'lgpl', 'mpl-2.0', + 'cc-by-4.0', 'cc-by-sa-4.0', 'openrail', 'bigscience-openrail-m', + 'bigscience-bloom-rail-1.0', 'creativeml-openrail-m' + } + + def __init__(self, hf_token: Optional[str] = None): + """Initialize the scraper. + + Args: + hf_token: Optional HuggingFace API token for accessing gated models + """ + self.hf_token = hf_token + self.session = requests.Session() + if hf_token: + self.session.headers.update({'Authorization': f'Bearer {hf_token}'}) + + def scrape_huggingface_model(self, model_id: str) -> Dict: + """Scrape model information from HuggingFace. + + Args: + model_id: HuggingFace model ID (e.g., 'meta-llama/Llama-3-8B') + + Returns: + Dictionary containing scraped model information + """ + print(f"Scraping HuggingFace model: {model_id}") + + # Get model info from HuggingFace API + api_url = f"https://huggingface.co/api/models/{model_id}" + + try: + response = self.session.get(api_url, timeout=30) + response.raise_for_status() + model_info = response.json() + except requests.exceptions.RequestException as e: + print(f"Error fetching model info: {e}") + return {} + + # Get model card content + card_url = f"https://huggingface.co/{model_id}/raw/main/README.md" + model_card_content = "" + try: + card_response = self.session.get(card_url, timeout=30) + if card_response.status_code == 200: + model_card_content = card_response.text + except requests.exceptions.RequestException: + pass + + # Get repository files list + files_url = f"https://huggingface.co/api/models/{model_id}/tree/main" + repo_files = [] + try: + files_response = self.session.get(files_url, timeout=30) + if files_response.status_code == 200: + repo_files = [f['path'] for f in files_response.json()] + except requests.exceptions.RequestException: + pass + + # Extract information + scraped_data = { + 'model_id': model_id, + 'model_info': model_info, + 'model_card': model_card_content, + 'repo_files': repo_files, + 'confidence': {} + } + + return scraped_data + + def detect_components(self, scraped_data: Dict) -> List[Dict]: + """Detect which MOF components are available. 
+ + Args: + scraped_data: Dictionary containing scraped model information + + Returns: + List of component dictionaries with name, description, license, and confidence + """ + components = [] + repo_files = scraped_data.get('repo_files', []) + model_card = scraped_data.get('model_card', '').lower() + model_info = scraped_data.get('model_info', {}) + + # Detect Model parameters (Final) + if any(f.endswith(('.bin', '.safetensors', '.pt', '.pth', '.ckpt')) for f in repo_files): + components.append({ + 'name': 'Model parameters (Final)', + 'description': 'Trained model parameters, weights and biases', + 'license': self._detect_license(scraped_data), + 'confidence': 0.95, + 'location': 'HuggingFace repository' + }) + + # Detect Model metadata + if any(f in repo_files for f in ['config.json', 'model_config.json', 'configuration.json']): + components.append({ + 'name': 'Model metadata', + 'description': 'Any model metadata including training configuration and optimizer states', + 'license': self._detect_license(scraped_data), + 'confidence': 0.90, + 'location': 'HuggingFace repository' + }) + + # Detect Model architecture + if any(f.endswith('.py') for f in repo_files) or 'modeling' in ' '.join(repo_files): + components.append({ + 'name': 'Model architecture', + 'description': "Well commented code for the model's architecture", + 'license': self._detect_license(scraped_data), + 'confidence': 0.85, + 'location': 'HuggingFace repository' + }) + + # Detect Inference code + if any('inference' in f.lower() or 'generate' in f.lower() for f in repo_files): + components.append({ + 'name': 'Inference code', + 'description': 'Code used for running the model to make predictions', + 'license': self._detect_license(scraped_data), + 'confidence': 0.80, + 'location': 'HuggingFace repository' + }) + + # Detect Model card + if 'README.md' in repo_files or model_card: + components.append({ + 'name': 'Model card', + 'description': 'Model details including performance metrics, intended use, and limitations', + 'license': self._detect_license(scraped_data), + 'confidence': 0.95, + 'location': 'HuggingFace repository' + }) + + # Detect Technical report (check model card for links) + if any(keyword in model_card for keyword in ['technical report', 'tech report', 'documentation']): + components.append({ + 'name': 'Technical report', + 'description': 'Technical report detailing capabilities and usage instructions for the model', + 'license': 'unlicensed', + 'confidence': 0.60, + 'location': 'Referenced in model card' + }) + + # Detect Research paper + if any(keyword in model_card for keyword in ['paper', 'arxiv', 'publication']): + components.append({ + 'name': 'Research paper', + 'description': 'Research paper detailing the development and capabilities of the model', + 'license': 'unlicensed', + 'confidence': 0.70, + 'location': 'Referenced in model card' + }) + + # Detect Evaluation results + if any(keyword in model_card for keyword in ['evaluation', 'benchmark', 'performance', 'results']): + components.append({ + 'name': 'Evaluation results', + 'description': 'The results from evaluating the model', + 'license': 'unlicensed', + 'confidence': 0.75, + 'location': 'Model card' + }) + + # Detect Training dataset (check for dataset references) + if any(keyword in model_card for keyword in ['training data', 'trained on', 'dataset']): + components.append({ + 'name': 'Training dataset', + 'description': 'The dataset used to train the model', + 'license': 'unlicensed', + 'confidence': 0.50, + 'location': 'Referenced in model 
card' + }) + + return components + + def _detect_license(self, scraped_data: Dict) -> str: + """Detect the license for the model. + + Args: + scraped_data: Dictionary containing scraped model information + + Returns: + License name or 'unlicensed' + """ + model_info = scraped_data.get('model_info', {}) + + # Check for license in model info + if 'cardData' in model_info and 'license' in model_info['cardData']: + license_name = model_info['cardData']['license'] + if license_name and license_name != 'other': + return license_name + + # Check for LICENSE file in repo + repo_files = scraped_data.get('repo_files', []) + if any(f.upper() == 'LICENSE' or f.upper() == 'LICENSE.md' for f in repo_files): + # Would need to fetch and parse the LICENSE file + return 'unlicensed' # Placeholder - needs manual review + + return 'unlicensed' + + def _detect_repository(self, scraped_data: Dict) -> Tuple[str, float]: + """Detect GitHub repository with confidence score. + + Args: + scraped_data: Dictionary containing scraped model information + + Returns: + Tuple of (repository_url, confidence_score) + """ + model_id = scraped_data.get('model_id', '') + model_card = scraped_data.get('model_card', '') + + # Method 1: Parse model card for GitHub links + github_pattern = r'https://github\.com/[^/\s"<>]+/[^/\s"<>]+' + github_urls = re.findall(github_pattern, model_card) + + if github_urls: + # Filter for most relevant (matching model name) + model_name = model_id.split('/')[-1].lower() + for url in github_urls: + # Clean up URL (remove trailing punctuation/markdown) + url = url.rstrip(')') + if model_name in url.lower(): + return url, 0.90 + # Return first GitHub URL found + return github_urls[0].rstrip(')'), 0.70 + + # Method 2: Try pattern-based inference + inferred_repo = f"https://github.com/{model_id}" + if self._check_repo_exists(inferred_repo): + return inferred_repo, 0.65 + + # Method 3: Try organization/model-base-name + if '/' in model_id: + org, name = model_id.split('/', 1) + # Try various name variations + base_names = [ + name.split('-')[0], # e.g., Mistral-7B-v0.1 → Mistral + name.lower(), + name, + ] + for base_name in base_names: + inferred_repo = f"https://github.com/{org}/{base_name}" + if self._check_repo_exists(inferred_repo): + return inferred_repo, 0.60 + + return '', 0.0 + + def _check_repo_exists(self, repo_url: str) -> bool: + """Check if GitHub repo exists. + + Args: + repo_url: GitHub repository URL + + Returns: + True if repo exists, False otherwise + """ + try: + response = self.session.head(repo_url, timeout=5, allow_redirects=True) + return response.status_code == 200 + except: + return False + + def _extract_model_metadata(self, scraped_data: Dict) -> Dict: + """Extract model metadata from scraped data. 
+ + Args: + scraped_data: Dictionary containing scraped model information + + Returns: + Dictionary with model metadata + """ + model_info = scraped_data.get('model_info', {}) + model_id = scraped_data.get('model_id', '') + + # Extract producer from model_id + producer = model_id.split('/')[0] if '/' in model_id else 'Unknown' + producer = producer.replace('-', ' ').replace('_', ' ').title() + + # Extract model name + model_name = model_id.split('/')[-1] if '/' in model_id else model_id + + # Detect model type from tags + tags = model_info.get('tags', []) + model_type = '' + type_mapping = { + 'text-generation': 'language', + 'text2text-generation': 'language', + 'image-to-text': 'multimodal', + 'text-to-image': 'image', + 'image-classification': 'vision', + 'object-detection': 'vision', + 'automatic-speech-recognition': 'audio', + } + + for tag in tags: + if tag in type_mapping: + model_type = type_mapping[tag] + break + + # Detect architecture + architecture = '' + model_card = scraped_data.get('model_card', '').lower() + if 'transformer' in model_card or 'transformer' in str(tags).lower(): + if 'decoder' in model_card: + architecture = 'transformer decoder' + elif 'encoder' in model_card: + architecture = 'transformer encoder-decoder' + else: + architecture = 'transformer' + elif 'diffusion' in model_card or 'diffusion' in str(tags).lower(): + architecture = 'diffusion' + + # Extract version (often in model name) + version_match = re.search(r'(\d+\.?\d*[BMK]?)', model_name) + version = version_match.group(1) if version_match else '1.0' + + # Get last modified date + last_modified = model_info.get('lastModified', '') + if last_modified: + date = last_modified.split('T')[0] + else: + date = datetime.now().strftime('%Y-%m-%d') + + # Detect repository + repository, repo_confidence = self._detect_repository(scraped_data) + + metadata = { + 'name': model_name, + 'version': version, + 'producer': producer, + 'type': model_type, + 'architecture': architecture, + 'date': date, + 'origin': model_name.lower(), + 'huggingface': f"https://huggingface.co/{model_id}", + } + + # Add repository if found + if repository: + metadata['repository'] = repository + metadata['repository_confidence'] = repo_confidence + + return metadata + + def _format_yaml_mot_style(self, metadata: Dict, components: List[Dict]) -> str: + """Format YAML in MOT style with proper indentation and quotes. 
+ + Args: + metadata: Model metadata dictionary + components: List of component dictionaries + + Returns: + Formatted YAML string matching MOT style + """ + lines = [] + + # Framework section + lines.append("framework:") + lines.append(" name: 'Model Openness Framework'") + lines.append(" version: '1.0'") + lines.append(" date: '2024-12-15'") + + # Release section + lines.append("release:") + lines.append(f" name: {metadata['name']}") + lines.append(f" version: '{metadata['version']}'") + lines.append(f" date: '{metadata['date']}'") + lines.append(" license: { }") + lines.append(f" type: '{metadata['type']}'") + lines.append(f" architecture: '{metadata['architecture']}'") + lines.append(f" origin: {metadata['origin']}") + lines.append(f" producer: '{metadata['producer']}'") + lines.append(" contact: ''") + + # Add repository if present + if metadata.get('repository'): + lines.append(f" repository: '{metadata['repository']}'") + + # Add huggingface if present + if metadata.get('huggingface'): + lines.append(f" huggingface: '{metadata['huggingface']}'") + + # Components section + lines.append(" components:") + for comp in components: + lines.append(" -") + lines.append(f" name: '{comp['name']}'") + lines.append(f" description: \"{comp['description']}\"") + + # Format license - handle different types + license_val = comp['license'] + if isinstance(license_val, list): + # Multiple licenses - just use first one for now + license_val = license_val[0] if license_val else 'unlicensed' + + if license_val and license_val != 'unlicensed': + lines.append(f" license: '{license_val}'") + else: + lines.append(f" license: unlicensed") + + return '\n'.join(lines) + + def generate_yaml(self, scraped_data: Dict, output_path: Optional[str] = None) -> str: + """Generate MOF-compliant YAML from scraped data. 
+ + Args: + scraped_data: Dictionary containing scraped model information + output_path: Optional path to save the YAML file + + Returns: + YAML string + """ + # Extract metadata + metadata = self._extract_model_metadata(scraped_data) + + # Detect components + components = self.detect_components(scraped_data) + + # Format in MOT style + yaml_output = self._format_yaml_mot_style(metadata, components) + + # Save to file if path provided + if output_path: + with open(output_path, 'w', encoding='utf-8') as f: + f.write(yaml_output) + print(f"YAML saved to: {output_path}") + + return yaml_output + + +def main(): + """Main entry point for the scraper.""" + parser = argparse.ArgumentParser( + description='Scrape model information and generate MOF YAML files' + ) + parser.add_argument( + 'model_id', + help='HuggingFace model ID (e.g., meta-llama/Llama-3-8B)' + ) + parser.add_argument( + '--output-dir', + default='../models', + help='Output directory for YAML files (default: ../models)' + ) + parser.add_argument( + '--hf-token', + help='HuggingFace API token for accessing gated models' + ) + + args = parser.parse_args() + + # Initialize scraper + scraper = ModelScraper(hf_token=args.hf_token) + + # Scrape model data + print(f"\n{'='*60}") + print(f"Scraping model: {args.model_id}") + print(f"{'='*60}\n") + + scraped_data = scraper.scrape_huggingface_model(args.model_id) + + if not scraped_data: + print("Failed to scrape model data") + sys.exit(1) + + # Generate output filename + model_name = args.model_id.split('/')[-1] + output_path = Path(args.output_dir) / f"{model_name}.yml" + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Generate YAML + print(f"\n{'='*60}") + print("Generating YAML...") + print(f"{'='*60}\n") + + yaml_output = scraper.generate_yaml(scraped_data, str(output_path)) + + print(f"\n{'='*60}") + print("DRAFT YAML GENERATED") + print(f"{'='*60}\n") + print("⚠️ IMPORTANT: This is a DRAFT that requires manual review!") + print(" - Verify all component availability") + print(" - Confirm license information") + print(" - Add missing components") + print(" - Update confidence scores") + print(f"\nOutput saved to: {output_path}") + print(f"\nNext steps:") + print(f" 1. Review and edit: {output_path}") + print(f" 2. Validate: php scripts/validate-model.php {output_path}") + print(f" 3. Submit PR to add to MOT database") + + +if __name__ == '__main__': + main() diff --git a/tools-py/requirements.txt b/tools-py/requirements.txt new file mode 100644 index 0000000..f987aff --- /dev/null +++ b/tools-py/requirements.txt @@ -0,0 +1,5 @@ +# Python dependencies for Model Openness Tool scraper +# Install with: pip install -r requirements.txt + +requests>=2.31.0 +PyYAML>=6.0.1 \ No newline at end of file