Skip to content

Commit 7671b29

Browse files
authored
Merge pull request #4 from dt-yy/dev
add html extract
2 parents 18d1d0b + faa1b52 commit 7671b29

7 files changed

Lines changed: 249 additions & 0 deletions

File tree

realcrawl/cfg.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from loguru import logger
1111

1212
from realcrawl.exception.base import ConfigFileNotFoundException
13+
from realcrawl.libs.path_lib import get_py_pkg_root_dir
1314

1415

1516
def load_config(suppress_error: bool = False) -> dict:
@@ -60,3 +61,17 @@ def load_config(suppress_error: bool = False) -> dict:
6061
config = json.load(f)
6162

6263
return config
64+
65+
66+
def load_pipe_tpl(pipe_name: str) -> dict:
    """Load an extractor pipe template bundled with the package.

    The template is read from
    ``<package root>/config/extract_tpl/<pipe_name>.jsonc``, where the package
    root is the value returned by ``get_py_pkg_root_dir()``.

    Args:
        pipe_name(str): Template name, i.e. the file name without the
            ``.jsonc`` suffix.

    Returns:
        pipe_tpl(dict): The parsed pipe template.
    """
    # NOTE(review): the template uses the .jsonc extension; confirm that the
    # module-level `json` name is bound to a comment-tolerant parser.
    tpl_dir = os.path.join(get_py_pkg_root_dir(), 'config', 'extract_tpl')
    tpl_file = os.path.join(tpl_dir, f'{pipe_name}.jsonc')
    with open(tpl_file, 'r', encoding='utf-8') as fp:
        return json.load(fp)
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"extractor_pipe": {
3+
"enable": true,
4+
"validate_input_format": false,
5+
"pre_extractor": [
6+
{
7+
"enable": true,
8+
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
9+
},
10+
{
11+
"enable": true,
12+
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
13+
"class_init_kwargs": {}
14+
}
15+
],
16+
"extractor": [
17+
{
18+
"enable": true,
19+
"python_class": "llm_web_kit.extractor.html.extractor.HTMLFileFormatExtractor",
20+
"class_init_kwargs": {}
21+
}
22+
],
23+
"post_extractor": [
24+
{
25+
"enable": true,
26+
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
27+
}
28+
]
29+
}
30+
}

realcrawl/extract/html_extract.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
2+
from func_timeout import func_timeout
3+
from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
4+
from llm_web_kit.input.datajson import DataJson
5+
6+
from realcrawl.cfg import load_pipe_tpl
7+
8+
9+
class HtmlExtract:
    """Run the ``extractor_pipe`` extractor chain over one local HTML file.

    The constructor loads the ``extractor_pipe`` template, builds the
    extractor chain from it and reads the HTML file into the input record.
    Each ``get_*`` call runs the chain again on that record.

    Args:
        html_file_path(str): Path of the HTML file to extract. The file is
            decoded as UTF-8.
        output_format(str): Format returned by ``get_html_content``; either
            ``'md'`` (default) or ``'json'``.
    """

    # Upper bound, in seconds, for a single run of the extractor chain.
    EXTRACT_TIMEOUT_SECONDS = 10

    def __init__(self, html_file_path: str, output_format: str = 'md'):
        self.config = load_pipe_tpl('extractor_pipe')
        self.extractor_chain = ExtractSimpleFactory.create(self.config)

        # Read through a context manager so the handle is always closed, and
        # pin the encoding instead of relying on the platform default.
        with open(html_file_path, 'r', encoding='utf-8') as f:
            html = f.read()

        # Everything except 'html' is a fixed placeholder value.
        # NOTE(review): presumably these keys are required by DataJson - confirm.
        self.d = {
            'track_id': '1',
            'html': html,
            'url': 'https://www.google.com',
            'domain': 'google.com',
            'dataset_name': 'cc',
            'data_source_category': 'HTML',
            'file_bytes': 4096,
            'page_layout_type': 'article',
            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
        }
        self.output_format = output_format

    def _extract(self):
        """Run the extractor chain on the input record, bounded by the timeout."""
        input_data = DataJson(self.d)
        return func_timeout(
            self.EXTRACT_TIMEOUT_SECONDS,
            self.extractor_chain.extract,
            args=(input_data,),
        )

    def get_html_content(self):
        """Extract the page and render its content list.

        Returns:
            The result of ``to_mm_md()`` when ``output_format`` is ``'md'``,
            or of ``to_mm_json()`` when it is ``'json'``.

        Raises:
            ValueError: If ``output_format`` is neither ``'md'`` nor ``'json'``.
        """
        fmt = self.output_format
        # Reject an unsupported format before paying for the extraction.
        if fmt not in ('md', 'json'):
            raise ValueError(f'Invalid output format: {self.output_format}')

        content_list = self._extract().get_content_list()
        if fmt == 'md':
            return content_list.to_mm_md()
        return content_list.to_mm_json()

    def get_main_html(self):
        """Extract the page and return the chain result's ``get_main_html()`` value."""
        return self._extract().get_main_html()

realcrawl/libs/path_lib.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import os
2+
3+
4+
def get_proj_root_dir():
    """Return the project root directory.

    The result is the directory three levels above this file's absolute path.

    Returns:
        str: Absolute path of the project root directory.
    """
    path = os.path.abspath(__file__)
    # Climb: this file -> its directory -> package directory -> project root.
    for _ in range(3):
        path = os.path.dirname(path)
    return path
7+
8+
9+
def get_py_pkg_root_dir():
    """Return the root directory of the Python package.

    The result is the parent of the directory that holds this module.

    Returns:
        str: Absolute path of the package root directory.
    """
    module_path = os.path.abspath(__file__)
    module_dir = os.path.dirname(module_path)
    return os.path.dirname(module_dir)

requirements/runtime.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
11
commentjson==0.9.0
2+
commentjson
3+
func-timeout
4+
git+https://github.com/ccprocessor/llm-webkit-mirror.git@dev
5+
loguru

tests/realcrawl/assets/1.html

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Title</title>
6+
</head>
7+
<body>
8+
9+
<!-- Path: 2.html -->
10+
11+
<h1>Heading 1</h1>
12+
<p>Paragraph 1</p>
13+
<div>
14+
<img alt="image-alt" title="image-title" src="test.png" />
15+
<p>Paragraph 2</p>
16+
</div>
17+
18+
<!-- 简单table -->
19+
<table>
20+
<tr>
21+
<td>1</td>
22+
<td>2</td>
23+
</tr>
24+
<tr>
25+
<td>3</td>
26+
<td>4</td>
27+
</tr>
28+
</table>
29+
30+
<div>
31+
<span>
32+
<!-- 复杂table -->
33+
<table>
34+
<tr>
35+
<td rowspan="2">1</td>
36+
<td>2</td>
37+
<td>3</td>
38+
</tr>
39+
<tr>
40+
<td colspan="2">4</td>
41+
</tr>
42+
<tr>
43+
<td>5</td>
44+
<td>6</td>
45+
<td>7</td>
46+
</tr>
47+
</table>
48+
</span>
49+
</div>
50+
51+
<!-- 简单list -->
52+
<ul>
53+
<li>1</li>
54+
<li>2</li>
55+
</ul>
56+
57+
<!-- 列表项里有子列表 -->
58+
<ul>
59+
<li>1
60+
<ul>
61+
<li>1.1</li>
62+
<li>1.2</li>
63+
</ul>
64+
</li>
65+
<li>2
66+
<ul>
67+
<li>2.1</li>
68+
<li>2.2</li>
69+
</ul>
70+
</li>
71+
</ul>
72+
73+
<!-- 数学公式 -->
74+
<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
75+
<mi>x</mi>
76+
<mo>=</mo>
77+
<mrow>
78+
<mfrac>
79+
<mrow>
80+
<mo>&#x2212;</mo>
81+
<mi>b</mi>
82+
<mo>&#x00B1;</mo>
83+
<msqrt>
84+
<msup>
85+
<mi>b</mi>
86+
<mn>2</mn>
87+
</msup>
88+
<mo>&#x2212;</mo>
89+
<mn>4</mn>
90+
<mi>a</mi>
91+
<mi>c</mi>
92+
</msqrt>
93+
</mrow>
94+
<mrow>
95+
<mn>2</mn>
96+
<mi>a</mi>
97+
</mrow>
98+
</mfrac>
99+
</mrow>
100+
<mtext>.</mtext>
101+
</math>
102+
103+
<!-- 代码 -->
104+
<pre><code class="language-js">const Prism = require('prismjs');
105+
106+
// The code snippet you want to highlight, as a string
107+
const code = `var data = 1;`;
108+
109+
// Returns a highlighted HTML string
110+
const html = Prism.highlight(code, Prism.languages.javascript, 'javascript');</code></pre>
111+
112+
<!-- 有序列表 -->
113+
<ol>
114+
<li>100</li>
115+
<li>200</li>
116+
</ol>
117+
118+
<!-- 带链接的 inline code -->
119+
<p>reference: <code>#include&lt;<a href="xxxx.xxxx.com">xxxx.hpp</a>&gt;</code></p>
120+
121+
</body>
122+
</html>
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import os
2+
import unittest
3+
4+
from realcrawl.extract.html_extract import HtmlExtract
5+
6+
7+
class TestHtmlExtract(unittest.TestCase):
    """Smoke test for ``HtmlExtract`` using the bundled ``assets/1.html`` fixture."""

    def setUp(self):
        # Resolve fixtures relative to this file so the test does not depend
        # on the current working directory.
        self.base_path = os.path.dirname(os.path.abspath(__file__))

    def test_html_extract(self):
        """Extraction with the default output format yields non-empty content."""
        html_extract = HtmlExtract(os.path.join(self.base_path, 'assets', '1.html'))
        html_content = html_extract.get_html_content()
        # Use a unittest assertion: it is not stripped under ``python -O`` and
        # reports a descriptive failure message.
        self.assertGreater(len(html_content), 0, 'extracted content is empty')
15+
16+
17+
if __name__ == '__main__':
18+
unittest.main()

0 commit comments

Comments
 (0)