Energy-Forecasting/energy_utils.py at main · andersdot/Energy-Forecasting · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import pandas as pd
import h5pyd
import numpy as np

"""
Utilities for reading in h5py files from NSRDB
"""

def find_index(meta, lat, lon):
    """
    Given a latitude and longitude,
    find the index of meta with the closest latitude and longitude
    Scales as logN instead of N
    Input:
        meta - dateframe containing header meta information
        lat - latidute
        lon - longitude
    Return:
        index -> int
    """
    from sklearn.neighbors import BallTree

    # Creates new columns converting coordinate degrees to radians.
    for column in ["latitude", "longitude"]:
        rad = np.deg2rad(meta[column].values)
        meta[f'{column}_rad'] = rad

    # convert input latitude and longitude to radians
    rad_lat, rad_lon = np.deg2rad(lat), np.deg2rad(lon)

    #Create balltree using haversine distances
    ball = BallTree(meta[["latitude_rad", "longitude_rad"]].values, metric='haversine')

    #leaf size of 2
    k = 2

    #query tree
    distances, indices = ball.query(np.array([rad_lat, rad_lon]).reshape(1, -1), k=k)

    #return the first index
    return indices[0][0]


def access_data(columns, f, time_series, index):
    """
    Given a set of columns,
    extract them from database and add them to time_series
    """
    for var in columns:
        # Get dataset
        ds = f[var]
        # Extract scale factor
        scale_factor = ds.attrs['psm_scale_factor']
        # Extract site index and add to DataFrame
        time_series[var] = ds[:, index] / scale_factor
    return time_series

def create_single_timeseries(f, columns, index):
    """
    Given a single file,
    generate a DataFrame with columns and index,
    indexed by time
    """
    time_index = pd.to_datetime(f['time_index'][...].astype(str)).tz_localize(None)
    time_series = pd.DataFrame(index=time_index)
    time_series = access_data(columns, f, time_series, index)
    return time_series


def timeseries(lon=-105, lat=40, years=None, columns=None):
    """
    Given a latitude, longitude, year
    create a DataFrame containing columns for that space-time
    Input:
        lon - longitude [degrees]
        lat - latitude [degrees]
        years - List [int]
        columns - List [str]
    Return:
        DataFrame
    """
    #set default values
    if columns is None:
        columns = ['ghi', 'air_temperature']
    if years is None:
        years = ['2020']

    #read in first file to access meta data
    # for finding our position's index in the file
    f = h5pyd.File(f"/nrel/nsrdb/v3/nsrdb_{years[0]}.h5", 'r')
    meta = pd.DataFrame(f['meta'][...])
    index = find_index(meta, lat, lon)

    #since the file's already read in, let's create it's time_series
    time_series = create_single_timeseries(f, columns, index)

    #append other years' timeseries
    # !!! this assumes that the meta data is the same for each file
    # true here, and expensive to read, but caution
    if len(years) > 1:
        for y in years[1:]:
            f = h5pyd.File(f"/nrel/nsrdb/v3/nsrdb_{y}.h5", 'r')
            ts_new = create_single_timeseries(f, columns, index)
            time_series = time_series.append(ts_new)
    return meta, time_series