SatExplorer/data_loader.py at main · Harsh223/SatExplorer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import re
import pandas as pd
import streamlit as st
from datetime import datetime

try:
    import requests
except ImportError:
    requests = None

def get_satcat_update_date(filepath):
    """Return the date from the first parsable '# Updated' line of a file.

    The file is read as UTF-8, line by line. A line qualifies when it
    starts with '# Updated' and contains a date shaped like
    '<4-digit year> <3-letter month> <1-2 digit day>'.

    Args:
        filepath: Path of the file to scan.

    Returns:
        The matched date text (stripped), or None when no line qualifies
        or when reading the file raises any exception.
    """
    stamp_pattern = r'# Updated (\d{4} [A-Za-z]{3}\s+\d{1,2})'
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            for text_line in handle:
                if not text_line.startswith('# Updated'):
                    continue
                found = re.search(stamp_pattern, text_line)
                if found is not None:
                    return found.group(1).strip()
    except Exception:
        # Best-effort lookup: any failure is reported to the caller as None.
        return None
    return None

def get_satcat_update_date_from_content(content):
    """Return the date from the first parsable '# Updated' line in a string.

    Args:
        content: Text to scan; it is split into lines with splitlines().

    Returns:
        The date text (stripped) of the first line that starts with
        '# Updated' and carries a '<year> <Mon> <day>' date, or None when
        there is no such line.
    """
    stamp_pattern = re.compile(r'# Updated (\d{4} [A-Za-z]{3}\s+\d{1,2})')
    attempts = (
        stamp_pattern.search(row)
        for row in content.splitlines()
        if row.startswith('# Updated')
    )
    # Skip '# Updated' lines whose date portion does not parse.
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()

def parse_satcat_html(html_file):
    """Parse the fixed-width SATCAT table embedded in an HTML file.

    The file must contain at least two <PRE>...</PRE> sections. The first
    one is the header: each whitespace-separated token is a column name and
    its character offset marks where that column starts. The first
    non-empty later section holds the data; only lines whose stripped text
    starts with 'S' followed by digits are treated as records.

    Each record is cut at the header offsets. The last column runs to the
    end of the line, so values wider than the last header name are kept
    whole.

    Derived columns:
        * CoarseType, SatType_1..SatType_12, SatType_1_2 (when 'Type'
          exists): single characters / first two characters of the Type
          code. Blank or missing positions in SatType_N become '-'.
        * LaunchYear (when 'LDate' exists): first 4-digit run as float.
        * Mass, Perigee, Apogee, Inc are converted to numbers where
          present; unparsable values become NaN.

    Args:
        html_file: Path of the HTML file (read as UTF-8).

    Returns:
        A pandas DataFrame with one row per record. An empty DataFrame is
        returned (after reporting through st.error) when the expected PRE
        sections are missing.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        content = f.read()

    pre_matches = re.findall(r'<PRE>(.*?)</PRE>', content, re.DOTALL)
    if len(pre_matches) < 2:
        st.error("Could not find at least two PRE tags in the HTML file.")
        return pd.DataFrame()

    header_text = pre_matches[0].strip()
    # First non-empty PRE section after the header carries the records.
    data_text = next((m.strip() for m in pre_matches[1:] if m.strip()), None)
    if data_text is None:
        st.error("No data found in PRE tags after header.")
        return pd.DataFrame()

    header_tokens = list(re.finditer(r'\S+', header_text))
    column_names = [tok.group() for tok in header_tokens]
    col_starts = [tok.start() for tok in header_tokens]
    # A column ends where the next one starts. The last column is
    # open-ended (None) so its value is not truncated to the width of the
    # header name.
    col_ends = col_starts[1:] + [None]

    record_start = re.compile(r'^S\d+')
    data_rows = []
    for line in data_text.split('\n'):
        # Matching 'S<digits>' also excludes blank and '#' comment lines.
        if not record_start.match(line.strip()):
            continue
        # Slices clamp to the line length, so short lines yield '' fields.
        data_rows.append(
            [line[start:end].strip() for start, end in zip(col_starts, col_ends)]
        )

    df = pd.DataFrame(data_rows, columns=column_names)

    if 'Type' in df.columns:
        type_codes = df['Type'].astype(str)
        df['CoarseType'] = type_codes.str[0]
        for pos in range(12):
            # .str[pos] is NaN when the code is shorter than pos + 1
            # characters; treat that the same as a blank position.
            df[f'SatType_{pos+1}'] = (
                type_codes.str[pos].fillna('-').replace({'': '-', ' ': '-'})
            )
        df['SatType_1_2'] = type_codes.str[:2]

    if 'LDate' in df.columns:
        df['LaunchYear'] = (
            df['LDate'].astype(str).str.extract(r'(\d{4})', expand=False).astype(float)
        )

    for col in ['Mass', 'Perigee', 'Apogee', 'Inc']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

def fetch_and_update_satcat(data_file, web_url):
    """Download the SATCAT page, overwrite the local copy, and parse it.

    Progress and failures are reported through Streamlit messages.

    Args:
        data_file: Local path that receives the downloaded bytes.
        web_url: URL to fetch (30 second timeout).

    Returns:
        The result of parse_satcat_html(data_file) after a successful
        download. If anything in the download/parse sequence raises, the
        existing local file (when present) is parsed instead. None is
        returned when 'requests' is not installed, or when the download
        fails and no local file exists.
    """
    if requests is None:
        st.error("The 'requests' library is required to download the file. Please install it with 'pip install requests'.")
        return None

    try:
        response = requests.get(web_url, timeout=30)
        response.raise_for_status()

        remote_date = get_satcat_update_date_from_content(response.text)
        date_label = remote_date if remote_date else 'Unknown'
        st.info(f"Fetched SATCAT update date: **{date_label}**")

        # Raw bytes are written so the local copy matches the server's payload.
        with open(data_file, 'wb') as out:
            out.write(response.content)
        st.success("Downloaded and replaced local satcat.html with the latest SATCAT from the web.")
        return parse_satcat_html(data_file)
    except Exception as e:
        st.error(f"Failed to download SATCAT from web: {e}. Using local file if available.")
        if not os.path.exists(data_file):
            return None
        return parse_satcat_html(data_file)

def load_satcat_data(data_file):
    """Parse the local SATCAT file and look up its update date.

    Args:
        data_file: Path of the local SATCAT HTML file.

    Returns:
        A (DataFrame, update_date) tuple, where update_date is the string
        found by get_satcat_update_date (or None if no date was found).
        (None, None) is returned, after an st.error message, when the file
        does not exist or when parsing / date lookup raises.
    """
    failure = (None, None)

    if not os.path.exists(data_file):
        st.error(f"SATCAT file '{data_file}' not found.")
        return failure

    try:
        frame = parse_satcat_html(data_file)
        stamp = get_satcat_update_date(data_file)
    except Exception as e:
        st.error(f"Failed to load or parse '{data_file}': {e}")
        return failure
    else:
        return frame, stamp