ccprocessor
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml
Lines changed: 3 additions & 1 deletion
diff --git a/.github/workflows/pr_ut_test_extra.yml b/.github/workflows/pr_ut_test_extra.yml
Lines changed: 3 additions & 1 deletion
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 12 additions & 4 deletions
diff --git a/README.md b/README.md
Lines changed: 88 additions & 1 deletion
diff --git a/bench/config/data_config.jsonl b/bench/config/data_config.jsonl
Lines changed: 3 additions & 0 deletions
diff --git a/bench/config/ours_config.jsonc b/bench/config/ours_config.jsonc
Lines changed: 1 addition & 1 deletion
diff --git a/bench/data/groundtruth/math_img_1.jsonl b/bench/data/groundtruth/math_img_1.jsonl
Lines changed: 1 addition & 0 deletions
diff --git a/bench/data/groundtruth/math_img_2.jsonl b/bench/data/groundtruth/math_img_2.jsonl
Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ jobs:
       - name: Install pre-commit hook
         run: |
           pip install pre-commit==3.8.0
+          pip install jupyter
           pre-commit install
       - name: Linting
         run: pre-commit run --all-files
@@ -45,7 +45,9 @@ jobs:
           pip install -r requirements/runtime.txt
           pip install -r requirements/dev.txt
       - name: Run tests and collect coverage
-        run: pytest --cov --cov-report=xml ./tests/llm_web_kit
+        run: |
+          pytest --cov --cov-report=xml ./tests/llm_web_kit
+          pytest -n auto ./tests/jupyter
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v5
         with:
 
@@ -40,7 +40,9 @@ jobs:
           pip install -r requirements/runtime.txt
           pip install -r requirements/dev.txt
       - name: Run tests and collect coverage
-        run: pytest --cov --cov-report=xml -n auto ./tests/llm_web_kit
+        run: |
+          pytest --cov --cov-report=xml -n auto ./tests/llm_web_kit
+          pytest -n auto ./tests/jupyter
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v5
         with:
 
@@ -48,3 +48,4 @@ coverage.xml
 llm_web_kit.egg-info/*
 .llm-web-kit.jsonc
 .llm-web-kit-pageclassify.jsonc
+tests/llm_web_kit/extractor/ygq_testmd
@@ -28,19 +28,19 @@ repos:
     rev: v4.3.0
     hooks:
       - id: trailing-whitespace
-        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
+        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|\.ipynb$'
       - id: check-yaml
       - id: end-of-file-fixer
-        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
+        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|\.ipynb$'
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
-        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
+        exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*|jupyter/domain_clustering/.*'
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: [ "--remove" ]
       - id: mixed-line-ending
         args: [ "--fix=lf" ]
-        exclude: '^tests/.*/assets/'
+        exclude: '^tests/.*/assets/|\.ipynb$'
   - repo: https://github.com/executablebooks/mdformat
     rev: 0.7.9
     hooks:
@@ -56,3 +56,11 @@ repos:
     hooks:
       - id: docformatter
         args: [ "--in-place", "--wrap-descriptions", "119" ]
+  - repo: local
+    hooks:
+      - id: clear-jupyter-notebook-output
+        name: Clear Jupyter Notebook Output
+        entry: jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace
+        language: system
+        files: \.ipynb$
+        types: [file]
@@ -75,8 +75,10 @@ llm-web-kit is a python library that ..
 
 ## Quick Start
 
+### extract by magic_html+recognize
+
 ```python
-from llm_web_kit.simple import extract_html_to_md
+from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
 import traceback
 from loguru import logger
 
@@ -95,6 +97,91 @@ if __name__=="__main__":
     markdown = extract(url, html)
 ```
 
+### only extract by recognize
+
+```python
+from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
+import traceback
+from loguru import logger
+
+def extract(url:str, raw_html:str) -> str:
+    try:
+        nlp_md = extract_html_to_md(url, raw_html, clip_html=False)
+        # or mm_nlp_md = extract_html_to_mm_md(url, raw_html, clip_html=False)
+        return nlp_md
+    except Exception as e:
+        logger.exception(e)
+    return None
+
+if __name__=="__main__":
+    url = ""
+    html = ""
+    markdown = extract(url, html)
+```
+
+### only extract main_html by magic-html
+
+```python
+from llm_web_kit.simple import extract_main_html_by_maigic_html
+import traceback
+from loguru import logger
+
+def extract(url:str, html:str) -> str:
+    try:
+        main_html = extract_main_html_by_maigic_html(url, html)
+        # or mm_main_html = extract_pure_html_to_mm_md(url, html)
+        return main_html
+    except Exception as e:
+        logger.exception(e)
+    return None
+
+if __name__=="__main__":
+    url = ""
+    html = ""
+    main_html = extract(url, html)
+```
+
+### extract main_html by model response
+
+```python
+import traceback
+from loguru import logger
+from llm_web_kit.main_html_parser.simplify_html.simplify_html import simplify_html
+from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
+from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser
+
+def extract(response_json: dict, html:str) -> str:
+    try:
+        _, typical_raw_tag_html, _ = simplify_html(html)
+        pre_data = PreDataJson({})
+        pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = typical_raw_tag_html
+        pre_data[PreDataJsonKey.TYPICAL_RAW_HTML] = html
+        pre_data['success_label_enable'] = True
+        pre_data[PreDataJsonKey.LLM_RESPONSE] = response_json
+        parser = MapItemToHtmlTagsParser({})
+        pre_data = parser.parse_single(pre_data)
+        main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML]
+        is_success = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS]
+        return main_html, is_success
+    except Exception as e:
+        logger.exception(e)
+    return None
+
+if __name__=="__main__":
+    response_json =  {'item_id 1': 0, 'item_id 2': 1, 'item_id 3': 1}
+    html = ""
+    main_html, is_success = extract(response_json, html)
+```
+
+## Pipeline
+
+1. [HTML pre-dedup](jupyter/html-pre-dedup/main.ipynb)
+2. [domain clustering](jupyter/domain_clustering/README.md)
+3. [layout clustering](jupyter/layout-clustering/main.ipynb)
+4. [typical layout node selection](jupyter/typical-html-select/main.ipynb)
+5. [HTML node select by LLM](jupyter/html-node-select-llm/main.ipynb)
+6. [html parse layout by layout](jupyter/html-parse-by-layout/main.ipynb)
+
 ## Usage
 
 # TODO
 
@@ -43,6 +43,9 @@
 {"track_id": "math_stackexchange_1", "dataset_name": "eval_pipeline_suit", "url": "https://math.stackexchange.com/questions/458323/is-8327-1-a-prime-number?answertab=active", "data_source_category": "HTML", "path":"data/origin/math_stackexchange_1.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
 {"track_id": "math_physicsforums_1", "dataset_name": "eval_pipeline_suit", "url": "https://www.physicsforums.com/threads/probability-theoretic-inequality.246150/", "data_source_category": "HTML", "path":"data/origin/math_physicsforums_1.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
 {"track_id": "math_physicsforums_2", "dataset_name": "eval_pipeline_suit", "url": "https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/", "data_source_category": "HTML", "path":"data/origin/math_physicsforums_2.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
+{"track_id": "math_img_1", "dataset_name": "eval_pipeline_suit", "url": "https://terrytao.wordpress.com/2008/05/16/285g-lecture-12-high-curvature-regions-of-ricci-flow-and-%ce%ba-solutions/", "data_source_category": "HTML", "path":"data/origin/math_img_1.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
+{"track_id": "math_img_2", "dataset_name": "eval_pipeline_suit", "url": "https://up-skill.me/math/find-interquartile-range.html", "data_source_category": "HTML", "path":"data/origin/math_img_2.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
+{"track_id": "math_img_3", "dataset_name": "eval_pipeline_suit", "url": "https://math.eretrandre.org/tetrationforum/showthread.php?tid=965", "data_source_category": "HTML", "path":"data/origin/math_img_3.html", "file_bytes": 1000, "page_layout_type": "forum", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
 {"track_id": "table_1", "dataset_name": "eval_pipeline_suit", "url": "https://grondwerken-overzicht.nl/noordwolde/gwy", "data_source_category": "HTML", "path":"data/origin/table_1.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
 {"track_id": "table_2", "dataset_name": "eval_pipeline_suit", "url": "https://10bestreviewz.com/automotive/best-underbody-underglow-kits", "data_source_category": "HTML", "path":"data/origin/table_2.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
 {"track_id": "table_3", "dataset_name": "eval_pipeline_suit", "url": "https://cavaliers.co.uk/results/judges/Mrs%20S%20Hindle.htm", "data_source_category": "HTML", "path":"data/origin/table_3.html", "file_bytes": 1000, "page_layout_type": "article", "meta_info": {"input_datetime": "2025-03-17 15:00:00"}}
 
@@ -37,7 +37,7 @@
         "extractor": [
             {
                 "enable": true,
-                "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
+                "python_class": "llm_web_kit.extractor.html.extractor.MagicHTMLFIleFormatorExtractor",
                 "class_init_kwargs": {}
             }
         ],
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@`
`37`	`37`	`"extractor": [`
`38`	`38`	`{`
`39`	`39`	`"enable": true,`
`40`		`- "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",`
	`40`	`+ "python_class": "llm_web_kit.extractor.html.extractor.MagicHTMLFIleFormatorExtractor",`
`41`	`41`	`"class_init_kwargs": {}`
`42`	`42`	`}`
`43`	`43`	`],`