-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_data.py
More file actions
88 lines (67 loc) · 2.55 KB
/
process_data.py
File metadata and controls
88 lines (67 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import re
from pathlib import Path
from typing import List
import pandas as pd
from bs4 import BeautifulSoup
# Define the paths to the raw and processed data directories
# Both directories are siblings of this script.
BASE_DIR: Path = Path(__file__).resolve().parent
# Input directory: saved HTML pages to scrape tables from (must already exist).
RAW_DATA_PATH: Path = BASE_DIR / 'raw_data'
# Output directory: one CSV per filename-prefix group is written here.
DATA_PATH: Path = BASE_DIR / 'data'
def process_data(file_names: List[str]) -> pd.DataFrame:
    """
    Extract table data from a list of HTML files.

    Each file is searched for the <table> element whose markup contains the
    class fragment ``mfe-app-group-hub-react-fah17h``. The column names are
    taken from the <th> cells of the first file that has any; every
    subsequent <tr> with <td> cells contributes one data row.

    Parameters:
        file_names (List[str]): List of HTML filenames to process,
            resolved relative to RAW_DATA_PATH.

    Returns:
        pd.DataFrame: Combined data extracted from HTML tables. Empty if
            no file contained a matching table.
    """
    rows = []
    headers: List[str] = []
    # Compile the table-locating pattern once instead of per file;
    # DOTALL lets ``.`` span the newlines inside the table markup.
    table_pattern = re.compile(
        r"(<table.*?mfe-app-group-hub-react-fah17h.*?</table>)",
        re.DOTALL,
    )
    for file_name in file_names:
        file_path = RAW_DATA_PATH / file_name
        # Read HTML content from file (pathlib handles open/close for us).
        html = file_path.read_text(encoding="utf-8")
        # Extract the table with the specific class using the regex
        match = table_pattern.search(html)
        if not match:
            print(f"[Warning] Table not found in {file_name}")
            continue
        table_html = match.group(1)
        # Parse the HTML table with BeautifulSoup
        soup = BeautifulSoup(table_html, "html.parser")
        # Extract headers from the first <th> elements (only once, from
        # the first file that actually has a header row).
        if not headers:
            headers = [th.get_text(strip=True) for th in soup.find_all("th")]
        # Extract data rows (skip the header row); rows without <td>
        # cells are ignored.
        for row in soup.find_all("tr")[1:]:
            cells = row.find_all("td")
            if cells:
                row_data = [cell.get_text(strip=True) for cell in cells]
                # zip truncates to the shorter sequence, so a malformed
                # row with extra cells cannot raise — surplus is dropped.
                rows.append(dict(zip(headers, row_data)))
    # columns=headers preserves the original column order even when a
    # file contributed headers but no data rows.
    return pd.DataFrame(rows, columns=headers)
def main():
    """
    Main execution function. Groups the HTML files in RAW_DATA_PATH by
    prefix (the part of the filename before the first underscore),
    extracts their table data, and saves each group to DATA_PATH/<prefix>.csv.
    """
    # Get all HTML files in the raw data directory; sorted so runs are
    # deterministic regardless of filesystem ordering.
    all_html_files = sorted(path.name for path in RAW_DATA_PATH.glob('*.html'))
    # Group files by prefix (before the first underscore)
    grouped_files = {}
    for file in all_html_files:
        prefix = file.split('_')[0]
        grouped_files.setdefault(prefix, []).append(file)
    # Ensure the output directory exists — to_csv does not create parents
    # and would raise OSError on a fresh checkout otherwise.
    DATA_PATH.mkdir(parents=True, exist_ok=True)
    # Extract and export data for each group
    for name, file_group in grouped_files.items():
        df = process_data(file_group)
        output_csv = DATA_PATH / f"{name}.csv"
        df.to_csv(output_csv, index=False)
        print(f"[Info] Extracted {len(df)} rows from {name} data.")


if __name__ == "__main__":
    main()