Skip to content

Commit 9f22c89

Browse files
authored
Merge pull request #558 from ccprocessor/dev
Release v4.1.0
2 parents 85a91ed + d3a4b2e commit 9f22c89

92 files changed

Lines changed: 20857 additions & 906 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,16 @@ llm-web-kit is a python library that ..
7575

7676
## Quick Start
7777

78+
![extract_method picture](/docs/images/extract_method.png)
79+
80+
This diagram shows three main HTML content extraction methods:
81+
82+
1. **extract by magic_html+recognize**: Two-stage complete extraction that first uses magic-html to extract main content, then converts it to structured markdown.
83+
84+
2. **only extract by recognize**: Direct content recognition that converts main_html to structured format without main content identification.
85+
86+
3. **only extract main_html by magic-html**: First-stage only extraction that identifies and extracts main content area while preserving HTML structure.
87+
7888
### extract by magic_html+recognize
7989

8090
```python

bench/data/groundtruth/math_katex_latex_1.jsonl

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

bench/data/groundtruth/math_katex_latex_3.jsonl

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/images/extract_method.png

253 KB
Loading

llm_web_kit/api/README.md

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# LLM Web Kit API
2+
3+
基于 FastAPI 的 LLM Web Kit API 服务,提供 HTML 解析功能。
4+
5+
## 功能特性
6+
7+
- 🚀 基于 FastAPI 的高性能 Web API
8+
- 📄 HTML 内容解析与结构化输出
9+
- 🔗 支持 URL 和 HTML 字符串输入
10+
- 📁 支持 HTML 文件上传
11+
- 📚 自动生成的 API 文档
12+
- 🔧 可配置的解析选项
13+
14+
## 快速开始
15+
16+
配置环境变量
17+
18+
```bash
19+
export MODEL_PATH=""
20+
```
21+
22+
或者在配置文件 .llm-web-kit.jsonc 中添加 "model_path" 配置项
23+
24+
安装依赖
25+
26+
```bash
27+
pip install -r llm_web_kit/api/requirements.txt
28+
python llm_web_kit/api/run_server.py
29+
```
30+
31+
- Swagger UI: http://127.0.0.1:8000/docs
32+
- ReDoc: http://127.0.0.1:8000/redoc
33+
34+
## API 端点
35+
36+
### HTML 解析
37+
38+
POST /api/v1/html/parse
39+
40+
请求示例:
41+
42+
```bash
43+
curl -s -X POST "http://127.0.0.1:8000/api/v1/html/parse" \
44+
-H "Content-Type: application/json" \
45+
-d '{
46+
"html_content": "<html><body><h1>Hello World</h1></body></html>",
47+
"url": "https://helloworld.com/hello",
48+
"options": {
49+
"clean_html": true
50+
}
51+
}'
52+
```
53+
54+
或直接发送以下 JSON 作为请求体:
55+
56+
```json
57+
{
58+
"html_content": "<html><body><h1>Hello World</h1></body></html>",
59+
"options": {
60+
"clean_html": true
61+
}
62+
}
63+
```
64+
65+
### 文件上传解析
66+
67+
POST /api/v1/html/upload
68+
69+
```bash
70+
curl -s -X POST "http://127.0.0.1:8000/api/v1/html/upload" \
71+
-F "file=@/path/to/file.html"
72+
```
73+
74+
### 服务状态
75+
76+
GET /api/v1/html/status
77+
78+
## 返回结构示例(/api/v1/html/parse 与 /api/v1/html/upload 成功返回)
79+
80+
以下示例为 HTML 解析成功时的统一响应结构:
81+
82+
```json
83+
{
84+
"success": true,
85+
"message": "HTML 解析成功",
86+
"timestamp": "2025-08-26T16:45:43.140638",
87+
"data": {
88+
"layout_file_list": [],
89+
"typical_raw_html": "<html><body><h1>Hello World</h1></body></html>",
90+
"typical_raw_tag_html": "<html><body><h1 _item_id=\"1\">Hello World</h1><h2 _item_id=\"2\">not main content</h2></body></html>\n",
91+
"llm_response": {
92+
"item_id 1": 0,
93+
"item_id 2": 1
94+
},
95+
"typical_main_html": "<html><body><h1 _item_id=\"1\">Hello World</h1></body></html>",
96+
"html_target_list": ["Hello World"]
97+
},
98+
"metadata": null
99+
}
100+
```
101+
102+
## 常见问题
103+
104+
- 422 错误:确认请求头 `Content-Type: application/json`,并确保请求体 JSON 合法。
105+
- 依赖缺失:`pip install -r llm_web_kit/api/requirements.txt`

llm_web_kit/api/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""LLM Web Kit API 模块.
2+
3+
提供基于 FastAPI 的 Web API 接口,用于处理 HTML 解析和内容提取功能。
4+
"""
5+
6+
__version__ = "1.0.0"
7+
__author__ = "LLM Web Kit Team"

llm_web_kit/api/dependencies.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
"""API 依赖项管理.
2+
3+
包含 FastAPI 应用的依赖项、配置管理和共享服务。
4+
"""
5+
6+
import logging
7+
from functools import lru_cache
8+
from typing import Optional
9+
10+
from pydantic_settings import BaseSettings, SettingsConfigDict
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
class Settings(BaseSettings):
    """Application configuration, loaded from the environment / .env file."""

    # API metadata
    api_title: str = "LLM Web Kit API"
    api_version: str = "1.0.0"
    api_description: str = "基于 LLM 的 Web 内容解析和提取 API 服务"

    # Server
    host: str = "0.0.0.0"
    port: int = 8000
    debug: bool = False

    # Logging
    log_level: str = "INFO"

    # Model
    model_path: Optional[str] = None
    max_content_length: int = 10 * 1024 * 1024  # 10 MB

    # Cache
    cache_ttl: int = 3600  # one hour

    # pydantic v2 configuration style
    model_config = SettingsConfigDict(
        env_file=".env",
        case_sensitive=False
    )
43+
44+
45+
@lru_cache()
def get_settings() -> Settings:
    """Return the process-wide Settings singleton (memoized after first call)."""
    return Settings()
49+
50+
51+
def get_logger(name: str = __name__) -> logging.Logger:
    """Return a logger with a stream handler and the configured log level.

    A handler is attached only the first time a given logger name is
    requested, so repeated calls do not produce duplicate output.
    """
    log = logging.getLogger(name)
    if log.handlers:
        return log
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))
    log.addHandler(stream_handler)
    log.setLevel(get_settings().log_level)
    return log
63+
64+
65+
# Global dependency instances
settings = get_settings()

# Lazily-created InferenceService singleton (module-level cache)
_inference_service_singleton = None


def get_inference_service():
    """Return the shared InferenceService, creating it on first use."""
    global _inference_service_singleton
    if _inference_service_singleton is None:
        # Imported lazily so that merely importing this module does not
        # pull in the (heavy) inference stack.
        from .services.inference_service import InferenceService
        _inference_service_singleton = InferenceService()
    return _inference_service_singleton

llm_web_kit/api/main.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""FastAPI 应用主入口.
2+
3+
提供 LLM Web Kit 的 Web API 服务,包括 HTML 解析、内容提取等功能。
4+
"""
5+
6+
import uvicorn
7+
from fastapi import FastAPI
8+
from fastapi.middleware.cors import CORSMiddleware
9+
from fastapi.responses import JSONResponse
10+
11+
from .dependencies import get_inference_service, get_logger, get_settings
12+
from .routers import htmls
13+
14+
settings = get_settings()
logger = get_logger(__name__)


# FastAPI application instance; metadata comes from Settings.
app = FastAPI(
    title=settings.api_title,
    description=settings.api_description,
    version=settings.api_version,
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS middleware.
# NOTE(review): allow_origins=["*"] should be restricted to specific
# domains in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount the HTML-processing routes under the versioned API prefix.
app.include_router(htmls.router, prefix="/api/v1", tags=["HTML 处理"])
38+
39+
40+
@app.get("/")
41+
async def root():
42+
"""根路径,返回服务状态信息."""
43+
return {
44+
"message": "LLM Web Kit API 服务运行中",
45+
"version": settings.api_version,
46+
"status": "healthy"
47+
}
48+
49+
50+
@app.get("/health")
51+
async def health_check():
52+
"""健康检查端点."""
53+
return {"status": "healthy", "service": "llm-web-kit-api"}
54+
55+
56+
@app.on_event("startup")
57+
async def app_startup():
58+
"""应用启动时预热模型,避免首个请求冷启动延迟."""
59+
try:
60+
service = get_inference_service()
61+
await service.warmup()
62+
logger.info("InferenceService 模型预热完成")
63+
except Exception as e:
64+
logger.warning(f"InferenceService 预热失败(服务仍可运行,将在首次请求时再初始化): {e}")
65+
66+
67+
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Global fallback handler: log the error with its traceback and
    return a generic 500 response."""
    # exc_info attaches the full traceback to the log record — a bare
    # logger.error(f"...") would lose it, making 500s hard to debug.
    logger.error("未处理的异常: %s", exc, exc_info=exc)
    # NOTE(review): echoing str(exc) to clients can leak internals;
    # consider dropping the "error" field in production.
    return JSONResponse(
        status_code=500,
        content={"detail": "服务器内部错误", "error": str(exc)}
    )
75+
76+
77+
if __name__ == "__main__":
78+
# 开发环境运行
79+
uvicorn.run(
80+
"llm_web_kit.api.main:app",
81+
host=settings.host,
82+
port=settings.port,
83+
reload=True,
84+
log_level=(settings.log_level or "INFO").lower()
85+
)

llm_web_kit/api/models/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""Pydantic 模型模块.
2+
3+
包含所有 API 请求和响应的数据模型定义。
4+
"""
5+
6+
from .request import HTMLParseRequest
7+
from .response import ErrorResponse, HTMLParseResponse
8+
9+
__all__ = [
10+
"HTMLParseRequest",
11+
"HTMLParseResponse",
12+
"ErrorResponse"
13+
]

llm_web_kit/api/models/request.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""请求数据模型.
2+
3+
定义 API 请求的数据结构和验证规则。
4+
"""
5+
6+
from typing import Any, Dict, Optional
7+
8+
from pydantic import BaseModel, ConfigDict, Field
9+
10+
11+
class HTMLParseRequest(BaseModel):
    """Request model for the HTML parse endpoint.

    All three fields are optional at the model level; ``options`` defaults
    to an empty dict.
    """

    html_content: Optional[str] = Field(
        None,
        description="HTML 内容字符串",
        max_length=10485760  # 10 MB cap on the raw HTML payload
    )

    url: Optional[str] = Field(
        None,
        description="url 地址",
        # NOTE(review): 10 MB looks copy-pasted from html_content; URLs are
        # normally a few KB at most — confirm and tighten this limit.
        max_length=10485760
    )

    options: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="解析选项配置"
    )

    # Example payload shown in the generated OpenAPI docs.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "html_content": "<html><body><h1>Hello World</h1></body></html>",
                "url": "https://helloworld.com/hello",
                "options": {
                    "clean_html": True
                }
            }
        }
    )

0 commit comments

Comments
 (0)