-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_data.py
More file actions
88 lines (67 loc) · 2.55 KB
/
process_data.py
File metadata and controls
88 lines (67 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import re
from pathlib import Path
from typing import List
import pandas as pd
from bs4 import BeautifulSoup
# Define the paths to the raw and processed data directories
# Both directories are siblings of this script.
BASE_DIR: Path = Path(__file__).resolve().parent
# Input directory: saved HTML pages to scrape tables from (must already exist).
RAW_DATA_PATH: Path = BASE_DIR / 'raw_data'
# Output directory: one CSV per filename-prefix group is written here.
DATA_PATH: Path = BASE_DIR / 'data'
def process_data(file_names: List[str]) -> pd.DataFrame:
    """
    Extract table data from a list of HTML files.

    Each file is searched for the <table> element whose markup contains the
    class fragment ``mfe-app-group-hub-react-fah17h``. The column names are
    taken from the <th> cells of the first file that has any; every
    subsequent <tr> with <td> cells contributes one data row.

    Parameters:
        file_names (List[str]): List of HTML filenames to process,
            resolved relative to RAW_DATA_PATH.

    Returns:
        pd.DataFrame: Combined data extracted from HTML tables. Empty if
            no file contained a matching table.
    """
    rows = []
    headers: List[str] = []
    # Compile the table-locating pattern once instead of per file;
    # DOTALL lets ``.`` span the newlines inside the table markup.
    table_pattern = re.compile(
        r"(<table.*?mfe-app-group-hub-react-fah17h.*?</table>)",
        re.DOTALL,
    )
    for file_name in file_names:
        file_path = RAW_DATA_PATH / file_name
        # Read HTML content from file (pathlib handles open/close for us).
        html = file_path.read_text(encoding="utf-8")
        # Extract the table with the specific class using the regex
        match = table_pattern.search(html)
        if not match:
            print(f"[Warning] Table not found in {file_name}")
            continue
        table_html = match.group(1)
        # Parse the HTML table with BeautifulSoup
        soup = BeautifulSoup(table_html, "html.parser")
        # Extract headers from the first <th> elements (only once, from
        # the first file that actually has a header row).
        if not headers:
            headers = [th.get_text(strip=True) for th in soup.find_all("th")]
        # Extract data rows (skip the header row); rows without <td>
        # cells are ignored.
        for row in soup.find_all("tr")[1:]:
            cells = row.find_all("td")
            if cells:
                row_data = [cell.get_text(strip=True) for cell in cells]
                # zip truncates to the shorter sequence, so a malformed
                # row with extra cells cannot raise — surplus is dropped.
                rows.append(dict(zip(headers, row_data)))
    # columns=headers preserves the original column order even when a
    # file contributed headers but no data rows.
    return pd.DataFrame(rows, columns=headers)
def main():
    """
    Main execution function. Groups the HTML files in RAW_DATA_PATH by
    prefix (the part of the filename before the first underscore),
    extracts their table data, and saves each group to DATA_PATH/<prefix>.csv.
    """
    # Get all HTML files in the raw data directory; sorted so runs are
    # deterministic regardless of filesystem ordering.
    all_html_files = sorted(path.name for path in RAW_DATA_PATH.glob('*.html'))
    # Group files by prefix (before the first underscore)
    grouped_files = {}
    for file in all_html_files:
        prefix = file.split('_')[0]
        grouped_files.setdefault(prefix, []).append(file)
    # Ensure the output directory exists — to_csv does not create parents
    # and would raise OSError on a fresh checkout otherwise.
    DATA_PATH.mkdir(parents=True, exist_ok=True)
    # Extract and export data for each group
    for name, file_group in grouped_files.items():
        df = process_data(file_group)
        output_csv = DATA_PATH / f"{name}.csv"
        df.to_csv(output_csv, index=False)
        print(f"[Info] Extracted {len(df)} rows from {name} data.")


if __name__ == "__main__":
    main()