Guide to Building Documentation Scrapers

This guide outlines a comprehensive approach to creating documentation scrapers for various web-based documentation sites and organizing the scraped content for use with AI models or knowledge bases.

Overview

Building a documentation scraper involves three main components:

  1. Web Scraping: Extracting content from documentation websites
  2. Content Processing: Cleaning and converting HTML to Markdown
  3. Content Organization: Categorizing content into logical structures

1. Setting Up a Web Scraper

Dependencies

import os
import re
import requests
import time
import logging
import argparse
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor
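
Only requests and beautifulsoup4 are third-party packages (installable with pip install requests beautifulsoup4); the rest is standard library. argparse and ThreadPoolExecutor are not used by the base class itself, but they appear in the optional command-line and parallel-fetching sketches later in this guide.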

Basic Structure

class DocumentationScraper:
    def __init__(self, base_url, output_dir, delay=0.5, max_pages=None):
        self.base_url = base_url
        self.output_dir = output_dir
        self.delay = delay
        self.max_pages = max_pages
        self.processed_urls = set()
        self.urls_to_process = [base_url]
        
        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler("scraper.log"),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger()
        
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
    
    def run(self):
        """Main scraper loop"""
        count = 0
        while self.urls_to_process and (self.max_pages is None or count < self.max_pages):
            url = self.urls_to_process.pop(0)
            if url in self.processed_urls:
                continue
                
            self.process_page(url)
            self.processed_urls.add(url)
            count += 1
            time.sleep(self.delay)
            
        self.logger.info(f"Scraping completed. Processed {len(self.processed_urls)} pages.")
    
    def process_page(self, url):
        """Process a single page: fetch, extract content, and save"""
        try:
            self.logger.info(f"Processing {url}")
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                self.logger.warning(f"Failed to fetch {url}: Status code {response.status_code}")
                return
                
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract content
            content = self.extract_content(soup, url)
            
            # Save content
            self.save_content(url, content)
            
            # Find new links
            new_urls = self.extract_links(soup, url)
            for new_url in new_urls:
                if new_url not in self.processed_urls and new_url not in self.urls_to_process:
                    self.urls_to_process.append(new_url)
                    
        except Exception as e:
            self.logger.error(f"Error processing {url}: {str(e)}")
    
    def extract_content(self, soup, url):
        """Extract and format content from the page"""
        # This is site-specific and must be customized
        # Example:
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
        if not main_content:
            return ""
            
        # Convert to Markdown (simplified example)
        title = soup.find('h1')
        title_text = title.get_text() if title else os.path.basename(url)
        
        content = f"# {title_text}\n\n"
        content += f"*Source: [{url}]({url})*\n\n"
        
        # Process content further...
        
        return content
    
    def extract_links(self, soup, url):
        """Extract relevant links to follow"""
        # This needs to be customized per site
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            full_url = urljoin(url, href)
            
            # Only follow links within the same domain
            if self.should_follow_link(full_url):
                links.append(full_url)
                
        return links
    
    def should_follow_link(self, url):
        """Determine if a link should be followed"""
        parsed_base = urlparse(self.base_url)
        parsed_url = urlparse(url)
        
        # Same domain check
        if parsed_url.netloc != parsed_base.netloc:
            return False
            
        # Check if it's within the documentation path
        if not parsed_url.path.startswith(parsed_base.path):
            return False
            
        # Avoid query parameters, fragments, etc.
        if parsed_url.query or parsed_url.fragment:
            return False
            
        return True
    
    def save_content(self, url, content):
        """Save content to a file"""
        if not content:
            return
            
        # Create file path based on URL
        parsed_url = urlparse(url)
        path_parts = parsed_url.path.strip('/').split('/')
        
        # Skip the common base path parts
        base_parts = urlparse(self.base_url).path.strip('/').split('/')
        for part in base_parts:
            if path_parts and path_parts[0] == part:
                path_parts.pop(0)
        
        # Create directory structure
        dir_path = self.output_dir
        if path_parts:
            dir_path = os.path.join(self.output_dir, *path_parts[:-1])
            os.makedirs(dir_path, exist_ok=True)
        
        # Create filename
        filename = path_parts[-1] if path_parts else 'index'
        if not filename.endswith('.md'):
            filename += '.md'
            
        file_path = os.path.join(dir_path, filename)
        
        # Write content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
            
        self.logger.info(f"Saved content to {file_path}")

2. Customizing for Specific Documentation Sites

Identifying Content Patterns

  1. Inspect the DOM Structure: Use browser developer tools to identify:

    • Main content containers
    • Navigation elements
    • Code blocks
    • Documentation-specific components
  2. Analyze URL Patterns: Look for:

    • Section indicators in the URL
    • Version indicators
    • Category/subcategory structures
  3. Extract Content with Custom Selectors:

def extract_content(self, soup, url):
    # Example for a documentation site with specific selectors
    title = soup.find('h1', class_='documentation-title')
    content_div = soup.find('div', class_='documentation-content')
    sidebar = soup.find('div', class_='sidebar')  # navigation sidebar, available if breadcrumb or ordering context is needed
    
    # Combine the content
    markdown = f"# {title.get_text() if title else 'Untitled'}\n\n"
    markdown += f"*Source: [{url}]({url})*\n\n"
    
    # Extract sections
    sections = content_div.find_all(['h2', 'h3', 'p', 'pre', 'ul', 'ol']) if content_div else []
    
    for section in sections:
        if section.name in ['h2', 'h3']:
            level = int(section.name[1])
            markdown += f"{'#' * level} {section.get_text().strip()}\n\n"
        elif section.name == 'p':
            markdown += f"{section.get_text().strip()}\n\n"
        elif section.name == 'pre':
            code = section.get_text().strip()
            language = 'javascript'  # Default; see the language-detection sketch after this function
            markdown += f"```{language}\n{code}\n```\n\n"
        elif section.name in ['ul', 'ol']:
            for li in section.find_all('li'):
                prefix = '- ' if section.name == 'ul' else '1. '
                markdown += f"{prefix}{li.get_text().strip()}\n"
            markdown += "\n"
    
    return markdown
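
Many documentation sites tag code blocks with a class such as language-python or lang-js on the <pre> element or a nested <code> element. The helper below is a sketch of recovering that hint instead of hard-coding a default; the class-name prefixes are assumptions and should be checked against the target site.

def detect_language(pre_tag, default='javascript'):
    """Guess the code language from class names on a <pre> tag or its <code> child."""
    candidates = list(pre_tag.get('class', []))
    code_tag = pre_tag.find('code')
    if code_tag:
        candidates.extend(code_tag.get('class', []))
    for cls in candidates:
        for prefix in ('language-', 'lang-'):
            if cls.startswith(prefix):
                return cls[len(prefix):]
    return default

With this in place, the hard-coded language = 'javascript' line above could become language = detect_language(section).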

Handling Different Documentation Types

API Reference Documentation

def extract_api_reference(self, soup, url):
    # For API documentation with function signatures, parameters, return types
    title = soup.find('h1').get_text()
    signature = soup.find('div', class_='signature').get_text()
    description = soup.find('div', class_='description').get_text()
    
    markdown = f"# {title}\n\n"
    markdown += f"*Source: [{url}]({url})*\n\n"
    markdown += f"## Signature\n\n```typescript\n{signature}\n```\n\n"
    markdown += f"## Description\n\n{description}\n\n"
    
    # Parameters section
    params = soup.find('div', class_='parameters')
    if params:
        markdown += "## Parameters\n\n"
        for param in params.find_all('div', class_='parameter'):
            param_name = param.find('span', class_='name').get_text()
            param_type = param.find('span', class_='type').get_text()
            param_desc = param.find('div', class_='description').get_text()
            
            markdown += f"### {param_name}\n\n"
            markdown += f"**Type**: `{param_type}`\n\n"
            markdown += f"{param_desc}\n\n"
    
    # Return value
    returns = soup.find('div', class_='returns')
    if returns:
        return_type = returns.find('span', class_='type').get_text()
        return_desc = returns.find('div', class_='description').get_text()
        
        markdown += f"## Returns\n\n"
        markdown += f"**Type**: `{return_type}`\n\n"
        markdown += f"{return_desc}\n\n"
    
    return markdown
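
The selectors above assume every element exists on the page; on real sites some will be missing and the chained .get_text() calls will raise AttributeError. A small hypothetical helper keeps the extraction from crashing:

def safe_text(node, default=''):
    """Return stripped text for a BeautifulSoup node, or a default if the node is missing."""
    return node.get_text().strip() if node else default

# Example: title = safe_text(soup.find('h1'), default='Untitled')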

Conceptual Documentation

def extract_concept_doc(self, soup, url):
    # For tutorials, guides, and conceptual docs
    title = soup.find('h1').get_text()
    intro = soup.find('div', class_='introduction').get_text()
    
    markdown = f"# {title}\n\n"
    markdown += f"*Source: [{url}]({url})*\n\n"
    markdown += f"{intro}\n\n"
    
    # Extract sections with headings
    content = soup.find('div', class_='content')
    if not content:
        return markdown
    
    for element in content.children:
        # Skip bare text nodes (NavigableStrings) between tags
        if not getattr(element, 'name', None):
            continue
        if element.name in ['h2', 'h3', 'h4']:
            level = int(element.name[1])
            heading = element.get_text().strip()
            markdown += f"{'#' * level} {heading}\n\n"
        elif element.name == 'p':
            text = element.get_text().strip()
            markdown += f"{text}\n\n"
        elif element.name == 'div' and 'code-block' in element.get('class', []):
            code = element.get_text().strip()
            language = element.get('data-language', 'bash')
            markdown += f"```{language}\n{code}\n```\n\n"
        # Handle other element types like lists, tables, etc. (see the table helper sketched below)
    
    return markdown
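
Tables are one of the element types the loop above leaves unhandled. Below is a sketch of converting a simple HTML table to a Markdown table, assuming a plain header row followed by body rows and no merged cells:

def table_to_markdown(table):
    """Convert a simple <table> (no rowspan/colspan) into a Markdown table."""
    rows = table.find_all('tr')
    if not rows:
        return ''
    lines = []
    header_cells = [cell.get_text().strip() for cell in rows[0].find_all(['th', 'td'])]
    lines.append('| ' + ' | '.join(header_cells) + ' |')
    lines.append('| ' + ' | '.join(['---'] * len(header_cells)) + ' |')
    for row in rows[1:]:
        cells = [cell.get_text().strip() for cell in row.find_all(['th', 'td'])]
        lines.append('| ' + ' | '.join(cells) + ' |')
    return '\n'.join(lines) + '\n\n'

It can be wired into the element loop with an elif element.name == 'table' branch that appends table_to_markdown(element) to the output.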

3. Content Cleaning and Formatting

Markdown Formatter

def clean_markdown(content):
    """Clean and format markdown content"""
    # Fix code fences with a stray space before the language tag (e.g. "``` python")
    content = re.sub(r'```[ \t]+([a-zA-Z0-9]+)', r'```\1', content)
    
    # Fix headings without spaces
    content = re.sub(r'(#+)([a-zA-Z0-9])', r'\1 \2', content)
    
    # Fix extra newlines
    content = re.sub(r'\n{3,}', '\n\n', content)
    
    # Trim stray whitespace inside inline code spans (e.g. "` code `" -> "`code`")
    content = re.sub(r'`[ \t]*([^`\n]+?)[ \t]*`', r'`\1`', content)
    
    # Remove common boilerplate text
    content = re.sub(r'Was this page helpful\?.*', '', content, flags=re.DOTALL)
    
    return content
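
A natural place to apply this cleanup is immediately before writing each file, for example in save_content:

        # in save_content(), before writing:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(clean_markdown(content))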

4. Content Organization for AI Training

Categorization Script

def determine_category(file_path):
    """Determine the appropriate category for a file based on its content"""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        
    # Patterns for different categories
    ui_component_patterns = [
        r'\bComponent\b', r'\bButton\b', r'\bProvider\b', 
        r'\bRenderer\b', r'\bView\b', r'<.*?>', r'JSX'
    ]
    
    react_hook_patterns = [
        r'\buse[A-Z]', r'\buseEffect\b', r'\buseState\b',
        r'\bhook\b', r'\bHook\b', r'\breact\b'
    ]
    
    core_function_patterns = [
        r'\bfunction\b', r'\butil\b', r'\bcreate\b',
        r'\bget[A-Z]', r'\bfetch\b', r'\bconnect\b'
    ]
    
    advanced_topic_patterns = [
        r'advanced', r'complex', r'deep dive',
        r'architecture', r'internals', r'optimization'
    ]
    
    # Check patterns
    for pattern in ui_component_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "UI Components"
            
    for pattern in react_hook_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "React Hooks"
            
    for pattern in core_function_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "Core Functions"
            
    for pattern in advanced_topic_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "Advanced Topics"
    
    # Default category
    return "General"

Reorganization Script

def reorganize_files(base_dir, categories):
    """Reorganize files into appropriate category directories"""
    # Create category directories
    for category in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)
    
    # Process all markdown files
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                
                # Skip files that are already in category folders
                is_in_category = any(os.path.join(base_dir, category) in file_path for category in categories)
                if is_in_category:
                    continue
                
                # Determine category
                category = determine_category(file_path)
                
                # Move file
                destination = os.path.join(base_dir, category, file)
                os.makedirs(os.path.dirname(destination), exist_ok=True)
                os.rename(file_path, destination)
                logging.info(f"Moved {file_path} to {destination}")

Generate Index Files

def create_index_files(base_dir, categories):
    """Create index files for each category"""
    for category in categories:
        category_dir = os.path.join(base_dir, category)
        index_content = f"# {category} Index\n\n"
        index_content += "This index contains links to all documentation in the " + category + " category.\n\n"
        
        files = []
        for file in os.listdir(category_dir):
            if file.endswith('.md') and file != "00_index.md":
                files.append(file)
        
        files.sort()
        
        for file in files:
            name = file[:-3]  # Remove .md extension
            index_content += f"- [{name}]({file})\n"
        
        # Write index file
        with open(os.path.join(category_dir, "00_index.md"), 'w', encoding='utf-8') as f:
            f.write(index_content)
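
The three organization steps can be tied together with a small driver. The category names below match those returned by determine_category, and the base directory is an assumption; point it at the scraper's output directory.

if __name__ == "__main__":
    categories = ["UI Components", "React Hooks", "Core Functions", "Advanced Topics", "General"]
    base_dir = "scraped_docs"  # assumed scraper output directory
    reorganize_files(base_dir, categories)
    create_index_files(base_dir, categories)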

5. Tips for Adapting to Different Documentation Sites

General Tips

  1. Start Small: Begin with a specific section or category of documentation
  2. Incremental Approach: Build the scraper in stages, testing each component
  3. Rate Limiting: Always include delays between requests to avoid overloading servers
  4. Follow Robots.txt: Check whether the site publishes crawling rules and respect them (a minimal check using Python's urllib.robotparser is sketched below)
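
A minimal robots.txt check using Python's standard urllib.robotparser, assuming the scraper identifies itself with a generic user agent:

from urllib import robotparser
from urllib.parse import urljoin

def is_allowed(base_url, target_url, user_agent='*'):
    """Return True if the site's robots.txt permits fetching target_url."""
    parser = robotparser.RobotFileParser()
    parser.set_url(urljoin(base_url, '/robots.txt'))
    parser.read()
    return parser.can_fetch(user_agent, target_url)

In practice you would create the parser once per domain and call can_fetch from should_follow_link, rather than re-fetching robots.txt for every URL.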

Site-Specific Adaptations

For React Documentation:

  • Look for component examples in <div class="examples"> or similar containers
  • Extract prop tables often found in <table class="props"> elements
  • Handle interactive code examples by extracting the raw code

For REST API Documentation:

  • Focus on extracting endpoint details, parameters, request/response samples
  • Parse code examples in multiple languages
  • Identify authentication sections

For Python Libraries:

  • Extract function signatures, parameters, return types
  • Handle class hierarchies and inheritance
  • Process docstring examples

6. Best Practices

  1. Error Handling: Implement robust error handling to prevent crashes
  2. Logging: Maintain detailed logs for debugging
  3. Resumable Scraping: Enable stopping and resuming scraping sessions
  4. Respect Copyright: Ensure your use of the documentation complies with the site's terms
  5. Attribution: Always include source links in the generated documentation
  6. Parallel Processing: For large documentation sites, consider fetching batches of pages in parallel (see the sketch after this list)
  7. Maintenance: Create a process for periodic updates to keep documentation current
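
For item 6, here is a sketch of fetching a batch of already-discovered URLs in parallel with ThreadPoolExecutor. Parsing and saving stay sequential so the URL queue and processed-URL set are not shared across threads, and the worker count is kept small so the rate-limiting advice above still applies.

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_batch(urls, max_workers=4, timeout=10):
    """Fetch several pages concurrently; return {url: html} for successful requests."""
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(requests.get, url, timeout=timeout): url for url in urls}
        for future in as_completed(futures):
            url = futures[future]
            try:
                response = future.result()
                if response.status_code == 200:
                    results[url] = response.text
            except Exception as exc:
                logging.warning(f"Failed to fetch {url}: {exc}")
    return results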

7. Conclusion

Building documentation scrapers requires combining web scraping techniques with domain-specific knowledge of the documentation structure. By following this guide, you can create robust scrapers for various documentation sites and organize the content effectively for AI training or knowledge base creation.

Remember that each documentation site is unique, so adapting your scraper to the specific structure and patterns of the target site is crucial for success.