-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
112 lines (91 loc) · 3.57 KB
/
main.py
File metadata and controls
112 lines (91 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HTML 表格 → 结构化文本
支持命令行两种调用方式:
1. 位置参数: python script.py /path/to/table.html 3
2. 选项参数: python script.py -html /path/to/table.html -nrows 3
"""
import argparse
import io
import sys
from pathlib import Path

import pandas as pd
def merge_multi_header(header_rows):
    """Collapse stacked header rows into a single label per column.

    For every column the cells are read top to bottom, converted with
    ``str`` and joined with ``"-"``. A cell whose text equals the last
    part kept for that column is dropped, so a label repeated on
    consecutive header rows appears only once.

    Args:
        header_rows: List of header rows, each a list of cell values. The
            column count is taken from the first row.

    Returns:
        A list with one merged label per column; an empty list when
        ``header_rows`` is empty.
    """
    # No header rows means no columns to describe; avoid indexing row 0.
    if not header_rows:
        return []

    merged = []
    for col in range(len(header_rows[0])):
        parts = []
        for row in header_rows:
            cell = row[col]
            # Skip missing cells (None / NaN) instead of emitting the literal
            # text "None" or "nan". NaN is the only float unequal to itself.
            if cell is None or (isinstance(cell, float) and cell != cell):
                continue
            text = str(cell)
            # Drop a label identical to the one kept just above it.
            if parts and text == parts[-1]:
                continue
            parts.append(text)
        merged.append("-".join(parts))
    return merged
def _format_row(header, row):
    """Render one data row as ``"key: value; key: value"`` text.

    Missing values (per ``pd.isna``) are skipped. When the next kept cell
    has the same header label as the previous one, its value is appended
    to the previous entry with ``"-"`` unless it equals the value seen
    just before it, in which case it is dropped.
    """
    formatted = []
    last_key = None
    last_val = None
    for key, val in zip(header, row):
        if pd.isna(val):
            continue
        if formatted and key == last_key:
            # Same label as the previous kept cell: extend that entry, but
            # only when the value actually differs from the previous one.
            if val != last_val:
                formatted[-1] = f"{formatted[-1]}-{val}"
                last_val = val
        else:
            formatted.append(f"{key}: {val}")
            last_key, last_val = key, val
    return "; ".join(formatted)


def html_to_structured_text(html_input: str, header_rows_num=2):
    """Convert the first table of an HTML document into ``key: value`` lines.

    The first ``header_rows_num`` rows of the parsed table are merged into
    one label per column with ``merge_multi_header``; every remaining row
    becomes one text line.

    NOTE(review): pandas may promote ``<thead>``/``<th>`` rows to column
    labels, in which case they are not among the rows counted here --
    confirm against the expected input tables.

    Args:
        html_input: Either HTML markup (anything whose stripped text starts
            with ``"<"``) or a path to a UTF-8 encoded HTML file.
        header_rows_num: Number of leading rows that form the header.

    Returns:
        A tuple ``(new_header, lines)``: the merged column labels and the
        list of formatted text lines, one per data row.

    Raises:
        ValueError: If ``header_rows_num`` is smaller than 1 or no table
            is found.
        FileNotFoundError: If ``html_input`` is treated as a path and the
            file does not exist.
    """
    # Zero would leave no header and a negative count would slice from the
    # end of the table; reject both up front with a clear error.
    if header_rows_num < 1:
        raise ValueError(f"表头行数必须大于等于 1: {header_rows_num}")

    if html_input.strip().startswith("<"):
        markup = html_input
    else:
        html_file = Path(html_input)
        if not html_file.exists():
            raise FileNotFoundError(f"HTML文件不存在: {html_input}")
        markup = html_file.read_text(encoding="utf-8")

    # Passing literal markup to read_html is deprecated; hand it a file-like
    # object instead.
    tables = pd.read_html(io.StringIO(markup))
    if not tables:
        raise ValueError("未找到表格")

    df = tables[0]
    header_rows = df.iloc[:header_rows_num, :].values.tolist()
    # A table without any body rows has no header rows to merge.
    new_header = merge_multi_header(header_rows) if header_rows else []

    df_data = df.iloc[header_rows_num:, :]
    lines = [_format_row(new_header, row) for _, row in df_data.iterrows()]
    return new_header, lines
def save_text(lines, path: Path):
    """Write ``lines`` to ``path`` as UTF-8 text, one item per line.

    Missing parent directories are created first. Items are joined with
    ``"\\n"`` and no trailing newline is added.

    Args:
        lines: Iterable of strings to write.
        path: Destination file; overwritten if it already exists.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    content = "\n".join(lines)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
def main(html_file: str, nrows=2):
    """Convert an HTML file's table to text saved next to the input.

    The output path is the input path with its suffix replaced by
    ``.txt``. Progress and a short summary are printed to stdout. If the
    input file does not exist, a message is printed and nothing else is
    done.

    Args:
        html_file: Path to the HTML file to convert.
        nrows: Number of leading table rows that form the header.
    """
    source = Path(html_file)
    if source.exists():
        destination = source.with_suffix(".txt")
        print(f"开始处理: {html_file}")
        # The conversion receives the original string, not the Path object.
        result = html_to_structured_text(html_file, header_rows_num=nrows)
        header_names, rendered = result
        save_text(rendered, destination)
        header_count = len(header_names)
        row_count = len(rendered)
        print(f"处理完成。\n表头数量: {header_count}\n数据行数: {row_count}")
        print(f"输出文件: {destination}")
    else:
        print(f"HTML 文件不存在: {html_file}")
if __name__ == "__main__":
    # The input path and the header-row count can each be supplied either
    # positionally or through an option flag.
    parser = argparse.ArgumentParser(description="HTML 表格 → 结构化文本")

    # Positional arguments
    parser.add_argument(
        "html_file_pos",
        nargs="?",
        help="HTML 文件路径(位置参数方式)",
    )
    parser.add_argument(
        "nrows_pos",
        nargs="?",
        type=int,
        default=2,
        help="表头行数(位置参数方式)",
    )

    # Option arguments
    parser.add_argument(
        "-html",
        "--html_file",
        help="HTML 文件路径(选项参数方式)",
    )
    parser.add_argument(
        "-n",
        "--nrows",
        type=int,
        default=None,
        help="表头行数(选项参数方式)",
    )

    args = parser.parse_args()

    # Option values take precedence over positional ones.
    html_file = args.html_file if args.html_file else args.html_file_pos
    if args.nrows is None:
        nrows = args.nrows_pos
    else:
        nrows = args.nrows

    if html_file:
        main(html_file, nrows)
    else:
        # No path given in either form: show usage examples and fail.
        print("请提供 HTML 文件路径,例如:")
        print(" python script.py /path/to/table.html 3")
        print(" python script.py -html /path/to/table.html -n 3")
        sys.exit(1)