|
| 1 | + |
| 2 | +from func_timeout import func_timeout |
| 3 | +from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory |
| 4 | +from llm_web_kit.input.datajson import DataJson |
| 5 | + |
| 6 | +from realcrawl.cfg import load_pipe_tpl |
| 7 | + |
| 8 | + |
class HtmlExtract:
    """Run the ``extractor_pipe`` extraction chain over one local HTML file.

    The HTML file is read once at construction time and wrapped in an input
    record whose remaining fields (track id, URL, domain, dataset name, ...)
    are fixed placeholder values; only ``html`` comes from the caller.
    """

    # Time limit, in seconds, handed to ``func_timeout`` for one extraction run.
    _EXTRACT_TIMEOUT_SECONDS = 10

    def __init__(self, html_file_path: str, output_format: str = 'md'):
        """Load the pipeline config, build the chain and read the HTML file.

        Args:
            html_file_path: Path of the HTML file to extract from.
            output_format: ``'md'`` or ``'json'``; selects what
                :meth:`get_html_content` returns. It is only validated when
                :meth:`get_html_content` is called.
        """
        self.config = load_pipe_tpl('extractor_pipe')
        self.extractor_chain = ExtractSimpleFactory.create(self.config)
        self.d = {
            'track_id': '1',
            'html': self._read_html(html_file_path),
            'url': 'https://www.google.com',
            'domain': 'google.com',
            'dataset_name': 'cc',
            'data_source_category': 'HTML',
            'file_bytes': 4096,
            'page_layout_type': 'article',
            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
        }
        self.output_format = output_format

    @staticmethod
    def _read_html(html_file_path: str) -> str:
        """Return the text of ``html_file_path``, closing the file afterwards.

        The file is decoded as UTF-8 explicitly so the result does not depend
        on the platform's default locale encoding.
        """
        with open(html_file_path, 'r', encoding='utf-8') as html_file:
            return html_file.read()

    def _extract(self) -> 'DataJson':
        """Run the extractor chain on ``self.d`` under the time limit.

        A fresh ``DataJson`` is built from ``self.d`` on every call, so each
        public method triggers its own full extraction run.
        """
        input_data = DataJson(self.d)
        return func_timeout(
            self._EXTRACT_TIMEOUT_SECONDS,
            self.extractor_chain.extract,
            args=(input_data,),
        )

    def get_html_content(self):
        """Extract the page and render its content list in ``output_format``.

        Returns:
            The result of ``to_mm_md()`` when ``output_format`` is ``'md'``,
            or of ``to_mm_json()`` when it is ``'json'``.

        Raises:
            ValueError: If ``output_format`` is neither ``'md'`` nor
                ``'json'``. The check happens after extraction has run.
        """
        # Debug output kept from the original implementation; note that it
        # dumps the whole input record, including the full HTML, to stdout.
        print('self.d: ', self.d)
        data_e = self._extract()
        print('data_e: ', data_e.get_content_list().to_json())

        if self.output_format == 'md':
            content = data_e.get_content_list().to_mm_md()
        elif self.output_format == 'json':
            content = data_e.get_content_list().to_mm_json()
        else:
            raise ValueError(f'Invalid output format: {self.output_format}')
        return content

    def get_main_html(self):
        """Extract the page and return what the result's ``get_main_html()`` gives."""
        return self._extract().get_main_html()
0 commit comments