Skip to content

Commit 9f22c89

Browse files
authored
Merge pull request #558 from ccprocessor/dev
Release v4.1.0
2 parents 85a91ed + d3a4b2e commit 9f22c89

92 files changed

Lines changed: 20857 additions & 906 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,16 @@ llm-web-kit is a python library that ..
7575

7676
## Quick Start
7777

78+
![extract_method picture](/docs/images/extract_method.png)
79+
80+
This diagram shows three main HTML content extraction methods:
81+
82+
1. **extract by magic_html+recognize**: Two-stage complete extraction that first uses magic-html to extract main content, then converts it to structured markdown.
83+
84+
2. **only extract by recognize**: Direct content recognition that converts main_html to structured format without main content identification.
85+
86+
3. **only extract main_html by magic-html**: First-stage only extraction that identifies and extracts main content area while preserving HTML structure.
87+
7888
### extract by magic_html+recognize
7989

8090
```python

bench/data/groundtruth/math_katex_latex_1.jsonl

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

bench/data/groundtruth/math_katex_latex_3.jsonl

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/images/extract_method.png

253 KB
Loading

llm_web_kit/api/README.md

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# LLM Web Kit API
2+
3+
基于 FastAPI 的 LLM Web Kit API 服务,提供 HTML 解析功能。
4+
5+
## 功能特性
6+
7+
- 🚀 基于 FastAPI 的高性能 Web API
8+
- 📄 HTML 内容解析与结构化输出
9+
- 🔗 支持 URL 和 HTML 字符串输入
10+
- 📁 支持 HTML 文件上传
11+
- 📚 自动生成的 API 文档
12+
- 🔧 可配置的解析选项
13+
14+
## 快速开始
15+
16+
配置环境变量
17+
18+
```bash
19+
export MODEL_PATH=""
20+
```
21+
22+
或者在配置文件 .llm-web-kit.jsonc 中添加 "model_path" 配置项
23+
24+
安装依赖
25+
26+
```bash
27+
pip install -r llm_web_kit/api/requirements.txt
28+
python llm_web_kit/api/run_server.py
29+
```
30+
31+
- Swagger UI: http://127.0.0.1:8000/docs
32+
- ReDoc: http://127.0.0.1:8000/redoc
33+
34+
## API 端点
35+
36+
### HTML 解析
37+
38+
POST /api/v1/html/parse
39+
40+
请求示例:
41+
42+
```bash
43+
curl -s -X POST "http://127.0.0.1:8000/api/v1/html/parse" \
44+
-H "Content-Type: application/json" \
45+
-d '{
46+
"html_content": "<html><body><h1>Hello World</h1></body></html>",
47+
"url": "https://helloworld.com/hello",
48+
"options": {
49+
"clean_html": true
50+
}
51+
}'
52+
```
53+
54+
或直接发送以下 JSON 作为请求体:
55+
56+
```json
57+
{
58+
"html_content": "<html><body><h1>Hello World</h1></body></html>",
59+
"options": {
60+
"clean_html": true
61+
}
62+
}
63+
```
64+
65+
### 文件上传解析
66+
67+
POST /api/v1/html/upload
68+
69+
```bash
70+
curl -s -X POST "http://127.0.0.1:8000/api/v1/html/upload" \
71+
-F "file=@/path/to/file.html"
72+
```
73+
74+
### 服务状态
75+
76+
GET /api/v1/html/status
77+
78+
## 返回结构示例(/api/v1/html/parse 与 /api/v1/html/upload 成功返回)
79+
80+
以下示例为 HTML 解析成功时的统一响应结构:
81+
82+
```json
83+
{
84+
"success": true,
85+
"message": "HTML 解析成功",
86+
"timestamp": "2025-08-26T16:45:43.140638",
87+
"data": {
88+
"layout_file_list": [],
89+
"typical_raw_html": "<html><body><h1>Hello World</h1></body></html>",
90+
"typical_raw_tag_html": "<html><body><h1 _item_id=\"1\">Hello World</h1><h2 _item_id=\"2\">not main content</h2></body></html>\n",
91+
"llm_response": {
92+
"item_id 1": 0,
93+
"item_id 2": 1
94+
},
95+
"typical_main_html": "<html><body><h1 _item_id=\"1\">Hello World</h1></body></html>",
96+
"html_target_list": ["Hello World"]
97+
},
98+
"metadata": null
99+
}
100+
```
101+
102+
## 常见问题
103+
104+
- 422 错误:确认请求头 `Content-Type: application/json`,并确保请求体 JSON 合法。
105+
- 依赖缺失:`pip install -r llm_web_kit/api/requirements.txt`

llm_web_kit/api/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""LLM Web Kit API 模块.
2+
3+
提供基于 FastAPI 的 Web API 接口,用于处理 HTML 解析和内容提取功能。
4+
"""
5+
6+
__version__ = "1.0.0"
7+
__author__ = "LLM Web Kit Team"

llm_web_kit/api/dependencies.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
"""API 依赖项管理.
2+
3+
包含 FastAPI 应用的依赖项、配置管理和共享服务。
4+
"""
5+
6+
import logging
7+
from functools import lru_cache
8+
from typing import Optional
9+
10+
from pydantic_settings import BaseSettings, SettingsConfigDict
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
class Settings(BaseSettings):
    """Application configuration, loaded from the environment / .env file."""

    # API metadata
    api_title: str = "LLM Web Kit API"
    api_version: str = "1.0.0"
    api_description: str = "基于 LLM 的 Web 内容解析和提取 API 服务"

    # Server
    host: str = "0.0.0.0"
    port: int = 8000
    debug: bool = False

    # Logging
    log_level: str = "INFO"

    # Model
    model_path: Optional[str] = None
    max_content_length: int = 10 * 1024 * 1024  # 10 MB

    # Cache
    cache_ttl: int = 3600  # one hour

    # pydantic v2 configuration style
    model_config = SettingsConfigDict(
        env_file=".env",
        case_sensitive=False
    )
43+
44+
45+
@lru_cache()
def get_settings() -> Settings:
    """Return the process-wide Settings singleton (memoized after first call)."""
    return Settings()
49+
50+
51+
def get_logger(name: str = __name__) -> logging.Logger:
    """Return a logger with a stream handler and the configured log level.

    A handler is attached only the first time a given logger name is
    requested, so repeated calls do not produce duplicate output.
    """
    log = logging.getLogger(name)
    if log.handlers:
        return log
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))
    log.addHandler(stream_handler)
    log.setLevel(get_settings().log_level)
    return log
63+
64+
65+
# Global dependency instances
settings = get_settings()

# Lazily-created InferenceService singleton (module-level cache)
_inference_service_singleton = None


def get_inference_service():
    """Return the shared InferenceService, creating it on first use."""
    global _inference_service_singleton
    if _inference_service_singleton is None:
        # Imported lazily so that merely importing this module does not
        # pull in the (heavy) inference stack.
        from .services.inference_service import InferenceService
        _inference_service_singleton = InferenceService()
    return _inference_service_singleton

llm_web_kit/api/main.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""FastAPI 应用主入口.
2+
3+
提供 LLM Web Kit 的 Web API 服务,包括 HTML 解析、内容提取等功能。
4+
"""
5+
6+
import uvicorn
7+
from fastapi import FastAPI
8+
from fastapi.middleware.cors import CORSMiddleware
9+
from fastapi.responses import JSONResponse
10+
11+
from .dependencies import get_inference_service, get_logger, get_settings
12+
from .routers import htmls
13+
14+
settings = get_settings()
logger = get_logger(__name__)


# FastAPI application instance; metadata comes from Settings.
app = FastAPI(
    title=settings.api_title,
    description=settings.api_description,
    version=settings.api_version,
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS middleware.
# NOTE(review): allow_origins=["*"] should be restricted to specific
# domains in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount the HTML-processing routes under the versioned API prefix.
app.include_router(htmls.router, prefix="/api/v1", tags=["HTML 处理"])
38+
39+
40+
@app.get("/")
41+
async def root():
42+
"""根路径,返回服务状态信息."""
43+
return {
44+
"message": "LLM Web Kit API 服务运行中",
45+
"version": settings.api_version,
46+
"status": "healthy"
47+
}
48+
49+
50+
@app.get("/health")
51+
async def health_check():
52+
"""健康检查端点."""
53+
return {"status": "healthy", "service": "llm-web-kit-api"}
54+
55+
56+
@app.on_event("startup")
57+
async def app_startup():
58+
"""应用启动时预热模型,避免首个请求冷启动延迟."""
59+
try:
60+
service = get_inference_service()
61+
await service.warmup()
62+
logger.info("InferenceService 模型预热完成")
63+
except Exception as e:
64+
logger.warning(f"InferenceService 预热失败(服务仍可运行,将在首次请求时再初始化): {e}")
65+
66+
67+
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Global fallback handler: log the error with its traceback and
    return a generic 500 response."""
    # exc_info attaches the full traceback to the log record — a bare
    # logger.error(f"...") would lose it, making 500s hard to debug.
    logger.error("未处理的异常: %s", exc, exc_info=exc)
    # NOTE(review): echoing str(exc) to clients can leak internals;
    # consider dropping the "error" field in production.
    return JSONResponse(
        status_code=500,
        content={"detail": "服务器内部错误", "error": str(exc)}
    )
75+
76+
77+
if __name__ == "__main__":
78+
# 开发环境运行
79+
uvicorn.run(
80+
"llm_web_kit.api.main:app",
81+
host=settings.host,
82+
port=settings.port,
83+
reload=True,
84+
log_level=(settings.log_level or "INFO").lower()
85+
)

llm_web_kit/api/models/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""Pydantic 模型模块.
2+
3+
包含所有 API 请求和响应的数据模型定义。
4+
"""
5+
6+
from .request import HTMLParseRequest
7+
from .response import ErrorResponse, HTMLParseResponse
8+
9+
__all__ = [
10+
"HTMLParseRequest",
11+
"HTMLParseResponse",
12+
"ErrorResponse"
13+
]

llm_web_kit/api/models/request.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""请求数据模型.
2+
3+
定义 API 请求的数据结构和验证规则。
4+
"""
5+
6+
from typing import Any, Dict, Optional
7+
8+
from pydantic import BaseModel, ConfigDict, Field
9+
10+
11+
class HTMLParseRequest(BaseModel):
    """Request model for the HTML parse endpoint.

    All three fields are optional at the model level; ``options`` defaults
    to an empty dict.
    """

    html_content: Optional[str] = Field(
        None,
        description="HTML 内容字符串",
        max_length=10485760  # 10 MB cap on the raw HTML payload
    )

    url: Optional[str] = Field(
        None,
        description="url 地址",
        # NOTE(review): 10 MB looks copy-pasted from html_content; URLs are
        # normally a few KB at most — confirm and tighten this limit.
        max_length=10485760
    )

    options: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="解析选项配置"
    )

    # Example payload shown in the generated OpenAPI docs.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "html_content": "<html><body><h1>Hello World</h1></body></html>",
                "url": "https://helloworld.com/hello",
                "options": {
                    "clean_html": True
                }
            }
        }
    )

0 commit comments

Comments
 (0)