@@ -75,8 +75,10 @@ llm-web-kit is a python library that ..
7575
7676## Quick Start
7777
78+ ### extract by magic_html+recognize
79+
7880``` python
79- from llm_web_kit.simple import extract_html_to_md
81+ from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
8082import traceback
8183from loguru import logger
8284
@@ -95,6 +97,91 @@ if __name__=="__main__":
9597 markdown = extract(url, html)
9698```
9799
100+ ### only extract by recognize
101+
102+ ``` python
103+ from llm_web_kit.simple import extract_html_to_md, extract_html_to_mm_md
104+ import traceback
105+ from loguru import logger
106+
107+ def extract (url :str , raw_html :str ) -> str :
108+ try :
109+ nlp_md = extract_html_to_md(url, raw_html, clip_html = False )
110+ # or mm_nlp_md = extract_html_to_mm_md(url, raw_html, clip_html=False)
111+ return nlp_md
112+ except Exception as e:
113+ logger.exception(e)
114+ return None
115+
116+ if __name__ == "__main__":
117+ url = " "
118+ html = " "
119+ markdown = extract(url, html)
120+ ```
121+
122+ ### only extract main_html by magic-html
123+
124+ ``` python
125+ from llm_web_kit.simple import extract_main_html_by_maigic_html
126+ import traceback
127+ from loguru import logger
128+
129+ def extract (url :str , html :str ) -> str :
130+ try :
131+ main_html = extract_main_html_by_maigic_html(url, html)
132+ # or mm_main_html = extract_pure_html_to_mm_md(url, html)
133+ return main_html
134+ except Exception as e:
135+ logger.exception(e)
136+ return None
137+
138+ if __name__ == "__main__":
139+ url = " "
140+ html = " "
141+ main_html = extract(url, html)
142+ ```
143+
144+ ### extract main_html by model response
145+
146+ ``` python
147+ import traceback
148+ from loguru import logger
149+ from llm_web_kit.main_html_parser.simplify_html.simplify_html import simplify_html
150+ from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
151+ from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser
152+
153+ def extract (response_json : dict , html :str ) -> str :
154+ try :
155+ _, typical_raw_tag_html, _ = simplify_html(html)
156+ pre_data = PreDataJson({})
157+ pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML ] = typical_raw_tag_html
158+ pre_data[PreDataJsonKey.TYPICAL_RAW_HTML ] = html
159+ pre_data['success_label_enable'] = True
160+ pre_data[PreDataJsonKey.LLM_RESPONSE ] = response_json
161+ parser = MapItemToHtmlTagsParser({})
162+ pre_data = parser.parse_single(pre_data)
163+ main_html = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML ]
164+ is_success = pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS ]
165+ return main_html, is_success
166+ except Exception as e:
167+ logger.exception(e)
168+ return None, False
169+
170+ if __name__ == "__main__":
171+ response_json = {'item_id 1': 0, 'item_id 2': 1, 'item_id 3': 1}
172+ html = " "
173+ main_html, is_success = extract(response_json, html)
174+ ```
175+
176+ ## Pipeline
177+
178+ 1. [HTML pre-dedup](jupyter/html-pre-dedup/main.ipynb)
179+ 2. [domain clustering](jupyter/domain_clustering/README.md)
180+ 3. [layout clustering](jupyter/layout-clustering/main.ipynb)
181+ 4. [typical layout node selection](jupyter/typical-html-select/main.ipynb)
182+ 5. [HTML node select by LLM](jupyter/html-node-select-llm/main.ipynb)
183+ 6. [HTML parse layout by layout](jupyter/html-parse-by-layout/main.ipynb)
184+
98185## Usage
99186
100187# TODO
0 commit comments