This guide outlines a comprehensive approach to creating documentation scrapers for various web-based documentation sites and organizing the scraped content for use with AI models or knowledge bases.
Building a documentation scraper involves three main components:
- Web Scraping: Extracting content from documentation websites
- Content Processing: Cleaning and converting HTML to Markdown
- Content Organization: Categorizing content into logical structures
import os
import re
import requests
import time
import logging
import argparse
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor


class DocumentationScraper:
    """Breadth-first crawler that scrapes a documentation site and saves pages.

    Starting from ``base_url``, it fetches pages, extracts their content as
    Markdown, saves each page under ``output_dir`` (mirroring the URL path),
    and follows in-domain links that stay under the base URL's path.

    NOTE(review): ``ThreadPoolExecutor`` is imported for the optional parallel
    processing discussed later in this guide but is not used by this
    sequential implementation.
    """

    def __init__(self, base_url, output_dir, delay=0.5, max_pages=None):
        """Initialize the scraper.

        Args:
            base_url: Root URL of the documentation; crawling stays within
                its domain and under its path.
            output_dir: Directory where Markdown files are written
                (created if missing).
            delay: Seconds to sleep between requests (rate limiting).
            max_pages: Optional cap on the number of pages processed;
                ``None`` means unlimited.
        """
        self.base_url = base_url
        self.output_dir = output_dir
        self.delay = delay
        self.max_pages = max_pages
        self.processed_urls = set()        # visited set: URLs already handled
        self.urls_to_process = [base_url]  # FIFO frontier -> breadth-first order

        # Configure logging to both a file and the console.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler("scraper.log"),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger()

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

    def run(self):
        """Main scraper loop.

        Pops URLs off the frontier until it is empty or ``max_pages`` is
        reached, sleeping ``delay`` seconds between pages.
        """
        count = 0
        while self.urls_to_process and (self.max_pages is None or count < self.max_pages):
            url = self.urls_to_process.pop(0)
            if url in self.processed_urls:
                continue
            self.process_page(url)
            # Counted even if the fetch failed, so a dead site cannot loop forever.
            self.processed_urls.add(url)
            count += 1
            time.sleep(self.delay)  # politeness: rate-limit requests
        self.logger.info(f"Scraping completed. Processed {len(self.processed_urls)} pages.")

    def process_page(self, url):
        """Process a single page: fetch, extract content, and save.

        Newly discovered links are appended to the frontier; all errors are
        logged rather than raised so one bad page cannot stop the crawl.
        """
        try:
            self.logger.info(f"Processing {url}")
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                self.logger.warning(f"Failed to fetch {url}: Status code {response.status_code}")
                return
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract content
            content = self.extract_content(soup, url)
            # Save content
            self.save_content(url, content)
            # Find new links and enqueue any not yet seen or queued.
            new_urls = self.extract_links(soup, url)
            for new_url in new_urls:
                if new_url not in self.processed_urls and new_url not in self.urls_to_process:
                    self.urls_to_process.append(new_url)
        except Exception as e:
            # Broad catch is deliberate: keep the crawl alive on any failure.
            self.logger.error(f"Error processing {url}: {str(e)}")

    def extract_content(self, soup, url):
        """Extract and format content from the page as Markdown.

        This is site-specific and must be customized; the default looks for
        a <main>, <article>, or <div class="content"> container and emits a
        title plus source attribution.
        """
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
        if not main_content:
            return ""
        # Convert to Markdown (simplified example)
        title = soup.find('h1')
        title_text = title.get_text() if title else os.path.basename(url)
        content = f"# {title_text}\n\n"
        content += f"*Source: [{url}]({url})*\n\n"
        # Process content further...
        return content

    def extract_links(self, soup, url):
        """Extract relevant links to follow (customize per site).

        Relative hrefs are resolved against the current page URL before
        filtering through :meth:`should_follow_link`.
        """
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            full_url = urljoin(url, href)
            # Only follow links within the same domain
            if self.should_follow_link(full_url):
                links.append(full_url)
        return links

    def should_follow_link(self, url):
        """Return True if ``url`` should be crawled.

        A link qualifies only if it is on the same domain as ``base_url``,
        its path is under the base path, and it carries no query string or
        fragment (which usually denote duplicate views of the same page).
        """
        parsed_base = urlparse(self.base_url)
        parsed_url = urlparse(url)
        # Same domain check
        if parsed_url.netloc != parsed_base.netloc:
            return False
        # Check if it's within the documentation path
        if not parsed_url.path.startswith(parsed_base.path):
            return False
        # Avoid query parameters, fragments, etc.
        if parsed_url.query or parsed_url.fragment:
            return False
        return True

    def save_content(self, url, content):
        """Save ``content`` to a ``.md`` file mirroring the URL path.

        The base URL's common path prefix is stripped so output paths are
        relative to the documentation root; empty paths become ``index.md``.
        Does nothing when ``content`` is empty.
        """
        if not content:
            return
        # Create file path based on URL
        parsed_url = urlparse(url)
        path_parts = parsed_url.path.strip('/').split('/')
        # Skip the common base path parts
        base_parts = urlparse(self.base_url).path.strip('/').split('/')
        for part in base_parts:
            if path_parts and path_parts[0] == part:
                path_parts.pop(0)
        # Create directory structure
        dir_path = self.output_dir
        if path_parts:
            dir_path = os.path.join(self.output_dir, *path_parts[:-1])
            os.makedirs(dir_path, exist_ok=True)
        # Create filename
        filename = path_parts[-1] if path_parts else 'index'
        if not filename.endswith('.md'):
            filename += '.md'
        file_path = os.path.join(dir_path, filename)
        # Write content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        self.logger.info(f"Saved content to {file_path}")
- Inspect the DOM Structure: Use browser developer tools to identify:
- Main content containers
- Navigation elements
- Code blocks
- Documentation-specific components
- Analyze URL Patterns: Look for:
- Section indicators in the URL
- Version indicators
- Category/subcategory structures
- Extract Content with Custom Selectors:
def extract_content(self, soup, url):
    """Extract a documentation page as Markdown using site-specific selectors.

    Walks the main content container and converts headings, paragraphs,
    code blocks, and lists into Markdown, prefixed with the page title and
    a source-attribution link.
    """
    # Example for a documentation site with specific selectors
    title = soup.find('h1', class_='documentation-title')
    content_div = soup.find('div', class_='documentation-content')
    # Combine the content
    markdown = f"# {title.get_text() if title else 'Untitled'}\n\n"
    markdown += f"*Source: [{url}]({url})*\n\n"
    # Extract sections
    sections = content_div.find_all(['h2', 'h3', 'p', 'pre', 'ul', 'ol']) if content_div else []
    for section in sections:
        if section.name in ['h2', 'h3']:
            # 'h2' -> '##', 'h3' -> '###'
            level = int(section.name[1])
            markdown += f"{'#' * level} {section.get_text().strip()}\n\n"
        elif section.name == 'p':
            markdown += f"{section.get_text().strip()}\n\n"
        elif section.name == 'pre':
            code = section.get_text().strip()
            language = 'javascript'  # Default, can be detected based on site
            markdown += f"```{language}\n{code}\n```\n\n"
        elif section.name in ['ul', 'ol']:
            for li in section.find_all('li'):
                prefix = '- ' if section.name == 'ul' else '1. '
                markdown += f"{prefix}{li.get_text().strip()}\n"
            markdown += "\n"
    return markdown


def extract_api_reference(self, soup, url):
    """Extract an API reference page: signature, description, params, returns.

    NOTE(review): assumes the h1, signature, and description elements exist;
    a missing element raises AttributeError — confirm against the target site.
    """
    # For API documentation with function signatures, parameters, return types
    title = soup.find('h1').get_text()
    signature = soup.find('div', class_='signature').get_text()
    description = soup.find('div', class_='description').get_text()
    markdown = f"# {title}\n\n"
    markdown += f"*Source: [{url}]({url})*\n\n"
    markdown += f"## Signature\n\n```typescript\n{signature}\n```\n\n"
    markdown += f"## Description\n\n{description}\n\n"
    # Parameters section (optional on the page)
    params = soup.find('div', class_='parameters')
    if params:
        markdown += "## Parameters\n\n"
        for param in params.find_all('div', class_='parameter'):
            param_name = param.find('span', class_='name').get_text()
            param_type = param.find('span', class_='type').get_text()
            param_desc = param.find('div', class_='description').get_text()
            markdown += f"### {param_name}\n\n"
            markdown += f"**Type**: `{param_type}`\n\n"
            markdown += f"{param_desc}\n\n"
    # Return value (optional on the page)
    returns = soup.find('div', class_='returns')
    if returns:
        return_type = returns.find('span', class_='type').get_text()
        return_desc = returns.find('div', class_='description').get_text()
        markdown += f"## Returns\n\n"
        markdown += f"**Type**: `{return_type}`\n\n"
        markdown += f"{return_desc}\n\n"
    return markdown


def extract_concept_doc(self, soup, url):
    """Extract a tutorial/guide/conceptual page as Markdown.

    NOTE(review): assumes the h1, introduction, and content elements exist;
    a missing element raises AttributeError — confirm against the target site.
    """
    # For tutorials, guides, and conceptual docs
    title = soup.find('h1').get_text()
    intro = soup.find('div', class_='introduction').get_text()
    markdown = f"# {title}\n\n"
    markdown += f"*Source: [{url}]({url})*\n\n"
    markdown += f"{intro}\n\n"
    # Extract sections with headings by walking the container's direct children
    content = soup.find('div', class_='content')
    for element in content.children:
        if element.name in ['h2', 'h3', 'h4']:
            level = int(element.name[1])
            heading = element.get_text().strip()
            markdown += f"{'#' * level} {heading}\n\n"
        elif element.name == 'p':
            text = element.get_text().strip()
            markdown += f"{text}\n\n"
        elif element.name == 'div' and 'code-block' in element.get('class', []):
            code = element.get_text().strip()
            language = element.get('data-language', 'bash')
            markdown += f"```{language}\n{code}\n```\n\n"
        # Handle other element types like lists, tables, etc.
    return markdown


def clean_markdown(content):
    """Clean and format markdown content.

    Applies a series of regex fix-ups: code-fence language tags, heading
    spacing, excess blank lines, and boilerplate removal.
    """
    # Fix code blocks: drop whitespace between ``` and the language tag
    content = re.sub(r'```\s+([a-zA-Z0-9]+)', r'```\1', content)
    # Fix headings without spaces, e.g. '##Title' -> '## Title'
    content = re.sub(r'(#+)([a-zA-Z0-9])', r'\1 \2', content)
    # Fix extra newlines: collapse runs of 3+ newlines into a blank line
    content = re.sub(r'\n{3,}', '\n\n', content)
    # Fix inline code
    # NOTE(review): this substitution rewrites `x` as `x` — a no-op as
    # written; confirm what normalization was originally intended.
    content = re.sub(r'(?<!\`)\`([^\`]+)\`(?!\`)', r'`\1`', content)
    # Remove common boilerplate text (everything from the prompt onward)
    content = re.sub(r'Was this page helpful\?.*', '', content, flags=re.DOTALL)
    return content


def determine_category(file_path):
    """Determine the appropriate category for a file based on its content.

    Categories are checked in a fixed priority order (UI Components, then
    React Hooks, Core Functions, Advanced Topics); the first pattern match
    wins. Some patterns are deliberately broad — e.g. r'<.*?>' matches any
    HTML/JSX tag — so ordering matters.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Patterns for different categories
    ui_component_patterns = [
        r'\bComponent\b', r'\bButton\b', r'\bProvider\b',
        r'\bRenderer\b', r'\bView\b', r'<.*?>', r'JSX'
    ]
    react_hook_patterns = [
        r'\buse[A-Z]', r'\buseEffect\b', r'\buseState\b',
        r'\bhook\b', r'\bHook\b', r'\breact\b'
    ]
    core_function_patterns = [
        r'\bfunction\b', r'\butil\b', r'\bcreate\b',
        r'\bget[A-Z]', r'\bfetch\b', r'\bconnect\b'
    ]
    advanced_topic_patterns = [
        r'advanced', r'complex', r'deep dive',
        r'architecture', r'internals', r'optimization'
    ]
    # Check patterns in priority order; all matches are case-insensitive
    for pattern in ui_component_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "UI Components"
    for pattern in react_hook_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "React Hooks"
    for pattern in core_function_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "Core Functions"
    for pattern in advanced_topic_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return "Advanced Topics"
    # Default category
    return "General"


def reorganize_files(base_dir, categories):
    """Reorganize markdown files into appropriate category directories.

    Files already inside a category directory are left alone; every other
    ``.md`` file is moved into the directory chosen by
    :func:`determine_category`.
    """
    # Create category directories
    for category in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)
    # Materialize the walk before moving anything so renames performed
    # during iteration cannot confuse the lazy traversal.
    for root, _, files in list(os.walk(base_dir)):
        for file in files:
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                # Skip files that are already in category folders
                is_in_category = any(os.path.join(base_dir, category) in file_path for category in categories)
                if is_in_category:
                    continue
                # Determine category
                category = determine_category(file_path)
                # Move file
                destination = os.path.join(base_dir, category, file)
                os.makedirs(os.path.dirname(destination), exist_ok=True)
                os.rename(file_path, destination)
                logging.info(f"Moved {file_path} to {destination}")


def create_index_files(base_dir, categories):
    """Create a ``00_index.md`` file for each category directory.

    The index lists every other ``.md`` file in the category, sorted by
    filename, as Markdown links.
    """
    for category in categories:
        category_dir = os.path.join(base_dir, category)
        index_content = f"# {category} Index\n\n"
        index_content += "This index contains links to all documentation in the " + category + " category.\n\n"
        files = []
        for file in os.listdir(category_dir):
            if file.endswith('.md') and file != "00_index.md":
                files.append(file)
        files.sort()
        for file in files:
            name = file[:-3]  # Remove .md extension
            index_content += f"- [{name}]({file})\n"
        # Write index file
        with open(os.path.join(category_dir, "00_index.md"), 'w', encoding='utf-8') as f:
            f.write(index_content)

# - Start Small: Begin with a specific section or category of documentation
- Incremental Approach: Build the scraper in stages, testing each component
- Rate Limiting: Always include delays between requests to avoid overloading servers
- Follow Robots.txt: Check if the site has specific crawling rules
- Look for component examples in `<div class="examples">` or similar containers
- Extract prop tables often found in `<table class="props">` elements
- Handle interactive code examples by extracting the raw code
- Focus on extracting endpoint details, parameters, request/response samples
- Parse code examples in multiple languages
- Identify authentication sections
- Extract function signatures, parameters, return types
- Handle class hierarchies and inheritance
- Process docstring examples
- Error Handling: Implement robust error handling to prevent crashes
- Logging: Maintain detailed logs for debugging
- Resumable Scraping: Enable stopping and resuming scraping sessions
- Respect Copyright: Ensure your use of the documentation complies with the site's terms
- Attribution: Always include source links in the generated documentation
- Parallel Processing: For large documentation sites, consider using parallel processing
- Maintenance: Create a process for periodic updates to keep documentation current
Building documentation scrapers requires combining web scraping techniques with domain-specific knowledge of the documentation structure. By following this guide, you can create robust scrapers for various documentation sites and organize the content effectively for AI training or knowledge base creation.
Remember that each documentation site is unique, so adapting your scraper to the specific structure and patterns of the target site is crucial for success.