Merge pull request #4 from dt-yy/dev

Haojiong · web-flow · commit 7671b29762cd · 2025-06-16T10:45:14.000+08:00
add html extract
diff --git a/realcrawl/cfg.py b/realcrawl/cfg.py
@@ -10,6 +10,7 @@
 from loguru import logger
 
 from realcrawl.exception.base import ConfigFileNotFoundException
+from realcrawl.libs.path_lib import get_py_pkg_root_dir
 
 
 def load_config(suppress_error: bool = False) -> dict:
@@ -60,3 +61,17 @@ def load_config(suppress_error: bool = False) -> dict:
         config = json.load(f)
 
     return config
+
+
+def load_pipe_tpl(pipe_name: str) -> dict:
+    """Load the pipe template for the web kit.
+
+    Args:
+        pipe_name(str): The name of the pipe to load
+
+    Returns: pipe_tpl(dict): The pipe template dictionary
+    """
+    pipe_tpl_path = os.path.join(get_py_pkg_root_dir(), 'config', 'extract_tpl', f'{pipe_name}.jsonc')
+    with open(pipe_tpl_path, 'r', encoding='utf-8') as f:
+        pipe_tpl = json.load(f)
+    return pipe_tpl
diff --git a/realcrawl/config/extract_tpl/extractor_pipe.jsonc b/realcrawl/config/extract_tpl/extractor_pipe.jsonc
@@ -0,0 +1,30 @@
+{
+    "extractor_pipe": {
+            "enable": true,
+            "validate_input_format": false,
+            "pre_extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+                },
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                    "class_init_kwargs": {}
+                }
+            ],
+            "extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
+                    "class_init_kwargs": {}
+                }
+            ],
+            "post_extractor": [
+                {
+                    "enable": true,
+                    "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+                }
+            ]
+        }
+    }
diff --git a/realcrawl/extract/html_extract.py b/realcrawl/extract/html_extract.py
@@ -0,0 +1,43 @@
+
+from func_timeout import func_timeout
+from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
+from llm_web_kit.input.datajson import DataJson
+
+from realcrawl.cfg import load_pipe_tpl
+
+
+class HtmlExtract:
+    def __init__(self, html_file_path: str, output_format: str = 'md'):
+        self.config = load_pipe_tpl('extractor_pipe')
+        self.extractor_chain = ExtractSimpleFactory.create(self.config)
+        self.d = {
+            'track_id': '1',
+            'html': open(html_file_path, 'r').read(),
+            'url': 'https://www.google.com',
+            'domain': 'google.com',
+            'dataset_name':'cc',
+            'data_source_category':'HTML',
+            'file_bytes': 4096,
+            'page_layout_type': 'article',
+            'meta_info': {'input_datetime': '2020-01-01 00:00:00'}
+        }
+        self.output_format = output_format
+
+    def get_html_content(self):
+        print('self.d: ', self.d)
+        input_data = DataJson(self.d)
+        data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,))
+        print('data_e: ', data_e.get_content_list().to_json())
+        if self.output_format == 'md':
+            md_content = data_e.get_content_list().to_mm_md()
+        elif self.output_format == 'json':
+            md_content = data_e.get_content_list().to_mm_json()
+        else:
+            raise ValueError(f'Invalid output format: {self.output_format}')
+        return md_content
+
+    def get_main_html(self):
+        input_data = DataJson(self.d)
+        data_e: DataJson = func_timeout(10, self.extractor_chain.extract, args=(input_data,))
+        main_html = data_e.get_main_html()
+        return main_html
diff --git a/realcrawl/libs/path_lib.py b/realcrawl/libs/path_lib.py
@@ -0,0 +1,17 @@
+import os
+
+
+def get_proj_root_dir():
+    """获取项目的根目录.也就是含有.github, docs, llm_web_kit目录的那个目录."""
+    return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def get_py_pkg_root_dir():
+    """获取python包的根目录.也就是含有__init__.py的那个目录.
+
+    Args:
+        None
+    Returns:
+        str: 项目的根目录
+    """
+    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
@@ -1 +1,4 @@
 commentjson==0.9.0
+func-timeout
+git+https://github.com/ccprocessor/llm-webkit-mirror.git@dev
+loguru
diff --git a/tests/realcrawl/assets/1.html b/tests/realcrawl/assets/1.html
@@ -0,0 +1,122 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Title</title>
+</head>
+<body>
+
+<!-- Path: 2.html -->
+
+<h1>Heading 1</h1>
+<p>Paragraph 1</p>
+<div>
+    <img alt="image-alt" title="image-title" src="test.png" />
+    <p>Paragraph 2</p>
+</div>
+
+<!-- 简单table -->
+<table>
+    <tr>
+        <td>1</td>
+        <td>2</td>
+    </tr>
+    <tr>
+        <td>3</td>
+        <td>4</td>
+    </tr>
+</table>
+
+<div>
+    <span>
+        <!-- 复杂table -->
+    <table>
+        <tr>
+            <td rowspan="2">1</td>
+            <td>2</td>
+            <td>3</td>
+        </tr>
+        <tr>
+            <td colspan="2">4</td>
+        </tr>
+        <tr>
+            <td>5</td>
+            <td>6</td>
+            <td>7</td>
+        </tr>
+    </table>
+    </span>
+</div>
+
+<!-- 简单list -->
+<ul>
+    <li>1</li>
+    <li>2</li>
+</ul>
+
+<!-- 列表项里有子列表 -->
+<ul>
+    <li>1
+        <ul>
+            <li>1.1</li>
+            <li>1.2</li>
+        </ul>
+    </li>
+    <li>2
+        <ul>
+            <li>2.1</li>
+            <li>2.2</li>
+        </ul>
+    </li>
+</ul>
+
+<!-- 数学公式 -->
+<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
+    <mi>x</mi>
+    <mo>=</mo>
+    <mrow>
+      <mfrac>
+        <mrow>
+          <mo>&#x2212;</mo>
+          <mi>b</mi>
+          <mo>&#x00B1;</mo>
+          <msqrt>
+            <msup>
+              <mi>b</mi>
+              <mn>2</mn>
+            </msup>
+            <mo>&#x2212;</mo>
+            <mn>4</mn>
+            <mi>a</mi>
+            <mi>c</mi>
+          </msqrt>
+        </mrow>
+        <mrow>
+          <mn>2</mn>
+          <mi>a</mi>
+        </mrow>
+      </mfrac>
+    </mrow>
+    <mtext>.</mtext>
+  </math>
+
+<!-- 代码 -->
+<pre><code class="language-js">const Prism = require('prismjs');
+
+    // The code snippet you want to highlight, as a string
+    const code = `var data = 1;`;
+
+    // Returns a highlighted HTML string
+    const html = Prism.highlight(code, Prism.languages.javascript, 'javascript');</code></pre>
+
+<!-- 有序列表 -->
+<ol>
+    <li>100</li>
+    <li>200</li>
+</ol>
+
+<!-- 带链接的 inline code -->
+<p>reference: <code>#include&lt;<a href="xxxx.xxxx.com">xxxx.hpp</a>&gt;</code></p>
+
+</body>
+</html>
diff --git a/tests/realcrawl/test_html_extract.py b/tests/realcrawl/test_html_extract.py
@@ -0,0 +1,18 @@
+import os
+import unittest
+
+from realcrawl.extract.html_extract import HtmlExtract
+
+
+class TestHtmlExtract(unittest.TestCase):
+    def setUp(self):
+        self.base_path = os.path.dirname(os.path.abspath(__file__))
+
+    def test_html_extract(self):
+        html_extract = HtmlExtract(os.path.join(self.base_path, 'assets/1.html'))
+        html_content = html_extract.get_html_content()
+        assert len(html_content) > 0
+
+
+if __name__ == '__main__':
+    unittest.main()