Skip to content

Commit b1bc533

Browse files
authored
Merge pull request #507 from ccprocessor/dev
Release v3.2.0
2 parents 1bcbcf1 + bd1e4c8 commit b1bc533

327 files changed

Lines changed: 1311738 additions & 3464 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/lint.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ jobs:
2121
- name: Install pre-commit hook
2222
run: |
2323
pip install pre-commit==3.8.0
24+
pip install jupyter
2425
pre-commit install
2526
- name: Linting
2627
run: pre-commit run --all-files

.github/workflows/pr_stage_test.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ jobs:
4545
pip install -r requirements/runtime.txt
4646
pip install -r requirements/dev.txt
4747
- name: Run tests and collect coverage
48-
run: pytest --cov --cov-report=xml ./tests/llm_web_kit
48+
run: |
49+
pytest --cov --cov-report=xml ./tests/llm_web_kit
50+
pytest -n auto ./tests/jupyter
4951
- name: Upload coverage reports to Codecov
5052
uses: codecov/codecov-action@v5
5153
with:

.github/workflows/pr_ut_test_extra.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ jobs:
4040
pip install -r requirements/runtime.txt
4141
pip install -r requirements/dev.txt
4242
- name: Run tests and collect coverage
43-
run: pytest --cov --cov-report=xml -n auto ./tests/llm_web_kit
43+
run: |
44+
pytest --cov --cov-report=xml -n auto ./tests/llm_web_kit
45+
pytest -n auto ./tests/jupyter
4446
- name: Upload coverage reports to Codecov
4547
uses: codecov/codecov-action@v5
4648
with:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,4 @@ coverage.xml
4848
llm_web_kit.egg-info/*
4949
.llm-web-kit.jsonc
5050
.llm-web-kit-pageclassify.jsonc
51+
tests/llm_web_kit/extractor/ygq_testmd

.pre-commit-config.yaml

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,19 @@ repos:
2828
rev: v4.3.0
2929
hooks:
3030
- id: trailing-whitespace
31-
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
31+
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|\.ipynb$'
3232
- id: check-yaml
3333
- id: end-of-file-fixer
34-
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
34+
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|\.ipynb$'
3535
- id: requirements-txt-fixer
3636
- id: double-quote-string-fixer
37-
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
37+
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|jupyter/domain_clustering/.*'
3838
- id: check-merge-conflict
3939
- id: fix-encoding-pragma
4040
args: [ "--remove" ]
4141
- id: mixed-line-ending
4242
args: [ "--fix=lf" ]
43-
exclude: '^tests/.*/assets/'
43+
exclude: '^tests/.*/assets/|\.ipynb$'
4444
- repo: https://github.com/executablebooks/mdformat
4545
rev: 0.7.9
4646
hooks:
@@ -56,3 +56,11 @@ repos:
5656
hooks:
5757
- id: docformatter
5858
args: [ "--in-place", "--wrap-descriptions", "119" ]
59+
- repo: local
60+
hooks:
61+
- id: clear-jupyter-notebook-output
62+
name: Clear Jupyter Notebook Output
63+
entry: jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace
64+
language: system
65+
files: \.ipynb$
66+
types: [file]

README.md

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ llm-web-kit is a python library that ..
7575

7676
## Quick Start
7777

78+
### extract by magic-html + recognize
79+
7880
```python
79-
from llm_web_kit.simple import extract_html_to_md
81+
from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
8082
import traceback
8183
from loguru import logger
8284

@@ -95,6 +97,91 @@ if __name__=="__main__":
9597
markdown = extract(url, html)
9698
```
9799

100+
### only extract by recognize
101+
102+
```python
103+
from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
104+
import traceback
105+
from loguru import logger
106+
107+
def extract(url:str, raw_html:str) -> str:
108+
try:
109+
nlp_md = extract_html_to_md(url, raw_html, clip_html=False)
110+
# or mm_nlp_md = extract_html_to_mm_md(url, raw_html, clip_html=False)
111+
return nlp_md
112+
except Exception as e:
113+
logger.exception(e)
114+
return None
115+
116+
if __name__=="__main__":
117+
url = ""
118+
html = ""
119+
markdown = extract(url, html)
120+
```
121+
122+
### only extract main_html by magic-html
123+
124+
```python
125+
from llm_web_kit.simple import extract_main_html_by_maigic_html
126+
import traceback
127+
from loguru import logger
128+
129+
def extract(url:str, html:str) -> str:
130+
try:
131+
main_html = extract_main_html_by_maigic_html(url, html)
132+
# or mm_main_html = extract_pure_html_to_mm_md(url, html)
133+
return main_html
134+
except Exception as e:
135+
logger.exception(e)
136+
return None
137+
138+
if __name__=="__main__":
139+
url = ""
140+
html = ""
141+
main_html = extract(url, html)
142+
```
143+
144+
### extract main_html by model response
145+
146+
```python
147+
import traceback
148+
from loguru import logger
149+
from llm_web_kit.main_html_parser.simplify_html.simplify_html import simplify_html
150+
from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
151+
from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser
152+
153+
def extract(response_json: dict, html:str) -> str:
154+
try:
155+
_, typical_raw_tag_html, _ = simplify_html(html)
156+
pre_data = PreDataJson({})
157+
pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = typical_raw_tag_html
158+
pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] = html
159+
pre_data['success_label_enable'] = True
160+
pre_data[PreDataJsonKey.LLM_RESPONSE] = response_json
161+
parser = MapItemToHtmlTagsParser({})
162+
pre_data = parser.parse_single(pre_data)
163+
main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]
164+
is_success = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS]
165+
return main_html, is_success
166+
except Exception as e:
167+
logger.exception(e)
168+
return None
169+
170+
if __name__=="__main__":
171+
response_json = {'item_id 1': 0, 'item_id 2': 1, 'item_id 3': 1}
172+
html = ""
173+
main_html, is_success = extract(response_json, html)
174+
```
175+
176+
## Pipeline
177+
178+
1. [HTML pre-dedup](jupyter/html-pre-dedup/main.ipynb)
179+
2. [domain clustering](jupyter/domain_clustering/README.md)
180+
3. [layout clustering](jupyter/layout-clustering/main.ipynb)
181+
4. [typical layout node selection](jupyter/typical-html-select/main.ipynb)
182+
5. [HTML node selection by LLM](jupyter/html-node-select-llm/main.ipynb)
183+
6. [HTML parsing, layout by layout](jupyter/html-parse-by-layout/main.ipynb)
184+
98185
## Usage
99186

100187
# TODO

bench/config/data_config.jsonl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@
4343
{"track_id": "math_stackexchange_1", "dataset_name": "eval_pipeline_suit", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active", "data_source_category": "HTML", "path":"data/origin/math_stackexchange_1.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
4444
{"track_id": "math_physicsforums_1", "dataset_name": "eval_pipeline_suit", "url": "https://www.physicsforums.com/threads/probability-theoretic-inequality.246150/", "data_source_category": "HTML", "path":"data/origin/math_physicsforums_1.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
4545
{"track_id": "math_physicsforums_2", "dataset_name": "eval_pipeline_suit", "url": "https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/", "data_source_category": "HTML", "path":"data/origin/math_physicsforums_2.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
46+
{"track_id": "math_img_1", "dataset_name": "eval_pipeline_suit", "url": "https://terrytao.wordpress.com/2008/05/16/285g-lecture-12-high-curvature-regions-of-ricci-flow-and-%ce%ba-solutions/", "data_source_category": "HTML", "path":"data/origin/math_img_1.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
47+
{"track_id": "math_img_2", "dataset_name": "eval_pipeline_suit", "url": "https://up-skill.me/math/find-interquartile-range.html", "data_source_category": "HTML", "path":"data/origin/math_img_2.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
48+
{"track_id": "math_img_3", "dataset_name": "eval_pipeline_suit", "url": "https://math.eretrandre.org/tetrationforum/showthread.php?tid=965", "data_source_category": "HTML", "path":"data/origin/math_img_3.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
4649
{"track_id": "table_1", "dataset_name": "eval_pipeline_suit", "url": "https://grondwerken-overzicht.nl/noordwolde/gwy", "data_source_category": "HTML", "path":"data/origin/table_1.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
4750
{"track_id": "table_2", "dataset_name": "eval_pipeline_suit", "url": "https://10bestreviewz.com/automotive/best-underbody-underglow-kits", "data_source_category": "HTML", "path":"data/origin/table_2.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
4851
{"track_id": "table_3", "dataset_name": "eval_pipeline_suit", "url": "https://cavaliers.co.uk/results/judges/Mrs%20S%20Hindle.htm", "data_source_category": "HTML", "path":"data/origin/table_3.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}

bench/config/ours_config.jsonc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
"extractor": [
3838
{
3939
"enable": true,
40-
"python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
40+
"python_class": "llm_web_kit.extractor.html.extractor.MagicHTMLFIleFormatorExtractor",
4141
"class_init_kwargs": {}
4242
}
4343
],

bench/data/groundtruth/math_img_1.jsonl

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

bench/data/groundtruth/math_img_2.jsonl

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)