SolarkCSVAggregate.py
131 lines (102 loc) · 5.16 KB
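"""Aggregate Sol-Ark CSV exports into a single file.

Scans the current directory for CSV files, extracts the Time,
GenToMiPower(W)/166 and LoadTotalPower(W)/178 columns from each,
concatenates the rows, sorts them chronologically, and writes the
result to the output CSV named on the command line.
"""
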
import pandas as pd
import os
import glob
import argparse


def find_header_row(filepath, time_col_name="Time"):
    """
    Reads the beginning of a CSV file to find the row index containing
    the specified time column name, assuming this is the header row.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                # Strip each cell so a trailing newline or stray spaces don't hide the match
                if time_col_name in [cell.strip() for cell in line.split(',')]:
                    return i
                if i > 20:  # Stop searching after 20 lines to avoid reading huge files
                    break
    except Exception as e:
        print(f"  Error reading start of {filepath} to find header: {e}")
    return None  # Header row not found
def process_csv_files(output_filename):
    """
    Finds all CSV files in the current directory, extracts specified columns,
    concatenates them, sorts by time, and saves to a single output file.
    """
    # Define the columns to extract based on the sample provided
    time_column = "Time"
    column1 = "GenToMiPower(W)/166"
    column2 = "LoadTotalPower(W)/178"
    columns_to_extract = [time_column, column1, column2]

    all_dataframes = []
    current_directory = '.'

    print(f"Searching for CSV files in: {os.path.abspath(current_directory)}")

    # Use glob to find all CSV files (case-insensitive) in the current directory
    csv_files = glob.glob(os.path.join(current_directory, '*.csv')) + \
                glob.glob(os.path.join(current_directory, '*.CSV'))

    if not csv_files:
        print("No CSV files found in the current directory.")
        return

    print(f"Found {len(csv_files)} CSV files. Processing...")

    for filepath in csv_files:
        filename = os.path.basename(filepath)
        print(f"\nProcessing file: {filename}")

        # Skip the output file itself if it already exists
        if filename == output_filename:
            print(f"  Skipping output file: {filename}")
            continue

        header_row_index = find_header_row(filepath, time_column)
        if header_row_index is None:
            print(f"  Could not find header row containing '{time_column}' in {filename}. Skipping.")
            continue
        print(f"  Detected header at row {header_row_index + 1}")

        try:
            # Read the CSV, skipping metadata rows and using the detected header row
            df = pd.read_csv(filepath, header=header_row_index, encoding='utf-8', low_memory=False)

            # Check if required columns exist
            missing_cols = [col for col in columns_to_extract if col not in df.columns]
            if missing_cols:
                print(f"  Warning: Missing columns {missing_cols} in {filename}. Skipping this file.")
                continue

            # Select only the required columns
            df_subset = df[columns_to_extract].copy()

            # Convert 'Time' column to datetime objects, coercing errors
            df_subset[time_column] = pd.to_datetime(df_subset[time_column], errors='coerce')

            # Drop rows where time conversion failed
            original_rows = len(df_subset)
            df_subset = df_subset.dropna(subset=[time_column])
            if len(df_subset) < original_rows:
                print(f"  Dropped {original_rows - len(df_subset)} rows due to invalid time format.")

            # Ensure 'Time' column is formatted consistently
            df_subset[time_column] = df_subset[time_column].dt.strftime('%Y-%m-%d %H:%M:%S')

            # Convert power columns to numeric, coercing errors to NaN
            df_subset[column1] = pd.to_numeric(df_subset[column1], errors='coerce')
            df_subset[column2] = pd.to_numeric(df_subset[column2], errors='coerce')

            all_dataframes.append(df_subset)
            print(f"  Successfully processed {len(df_subset)} rows.")

        except pd.errors.EmptyDataError:
            print(f"  Warning: File {filename} is empty or failed to parse after header. Skipping.")
        except Exception as e:
            print(f"  Error processing file {filename}: {e}")

    if not all_dataframes:
        print("\nNo data extracted from any CSV file.")
        return

    # Concatenate all dataframes
    print("\nConcatenating data...")
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"Total rows before sorting: {len(combined_df)}")

    # Sort by the 'Time' column
    print("Sorting data by time...")
    combined_df = combined_df.sort_values(by=time_column)

    # Save the combined and sorted dataframe to the output file
    try:
        combined_df.to_csv(output_filename, index=False, encoding='utf-8-sig')  # Use utf-8-sig for better compatibility
        print(f"\nSuccessfully saved combined data to: {output_filename}")
        print(f"Total rows in output: {len(combined_df)}")
    except Exception as e:
        print(f"\nError saving combined data to {output_filename}: {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Combine specific columns from CSV files in the current directory.")
    parser.add_argument("output_file", help="Name of the output CSV file (e.g., combined_data.csv)")
    args = parser.parse_args()
    process_csv_files(args.output_file)
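
# Example usage (run from the directory containing the Sol-Ark CSV exports;
# "combined_data.csv" is just an illustrative output name):
#   python SolarkCSVAggregate.py combined_data.csv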