Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d322e06
Add window-fetching feature
Shohail-Ismail Aug 25, 2025
0da5299
Add 5 year backfill process using data retrieval windows
Shohail-Ismail Aug 25, 2025
c2a937c
Add main function
Shohail-Ismail Aug 25, 2025
14be2e0
Add tests
Shohail-Ismail Aug 25, 2025
0baf6a6
Minor cosmetic changes
Shohail-Ismail Aug 25, 2025
7940aad
Add missing import
Shohail-Ismail Aug 26, 2025
25a26db
Fix imports to be cleaner
Shohail-Ismail Aug 26, 2025
d828798
Fix CI-related error
Shohail-Ismail Aug 26, 2025
f00ee15
Fix CI error not showing up in pytest
Shohail-Ismail Aug 26, 2025
f5dc2bc
Fix PSR code check for solar rows
Shohail-Ismail Aug 26, 2025
ca02289
Move solar export code into separate de_export.py script
Shohail-Ismail Sep 15, 2025
fa02153
Move solar export code into separate 'de_export.py' script
Shohail-Ismail Sep 15, 2025
dc45a28
Merge branch 'german-solar-csv' of https://github.com/Shohail-Ismail/…
Shohail-Ismail Sep 15, 2025
e5e5e90
Make XML more accurate to new ENTSOE API docs
Shohail-Ismail Sep 15, 2025
adc564c
Correct tests to align with new XML, and fix HTTP error path in inval…
Shohail-Ismail Sep 15, 2025
43fd709
Remove 361K line backfilled-CSV
Shohail-Ismail Sep 15, 2025
6fdefd6
Correct Ruff and Black errors
Shohail-Ismail Sep 15, 2025
7d7a46f
Delete solar_consumer/exports/de_5_year_repopulate.csv
Shohail-Ismail Sep 15, 2025
6f02f2b
Moved de_export.py to /scripts
Shohail-Ismail Sep 15, 2025
5970e2c
Add ENTSOE_API_KEY to example.env + fix formatting
Shohail-Ismail Nov 3, 2025
86c9551
Add entsoe-py as dependency + migrate DE solar data fetching to use i…
Shohail-Ismail Jan 20, 2026
d923a9f
Refactor tests to mock EntsoePandasClient, and add 2 new tests accord…
Shohail-Ismail Jan 20, 2026
95c5b3b
Clean up code/comments
Shohail-Ismail Jan 20, 2026
c4abc62
Merge branch 'main' into german-solar-csv
Shohail-Ismail Jan 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions .example.env
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@
DB_URL=postgresql://postgres:postgres@localhost:5432/neso_solar

# country code for fetching data. Other options are "nl", "de"
COUNTRY="gb"
COUNTRY="gb"

# API Key for ENTSOE data access
# Attainable from https://transparency.entsoe.eu/
ENTSOE_API_KEY="your-entsoe-api-key-here"

# ways to store the data. Other options are "csv", "site-db"
SAVE_METHOD="db"

# Directory to save CSV files if save_method is "csv".
# Directory to save CSV files if save_method is "csv"
CSV_DIR=None

# Optional Settings
LOG_LEVEL=INFO
BATCH_SIZE=1000

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dependencies = [
"pyproj",
"tqdm",
"loguru==0.7.3",
"entsoe-py",
"pvlive-api>=1.5.1",
"dp-sdk",
"betterproto==2.0.0b7",
Expand Down
31 changes: 31 additions & 0 deletions scripts/de_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
from datetime import datetime, timedelta, timezone
from solar_consumer.data.fetch_de_data import fetch_de_data_range


def main():
    """Backfill German solar generation from 2020-01-01 to yesterday and write it to a CSV."""
    out_path = os.path.join("solar_consumer", "exports", "de_5_year_repopulate.csv")

    # Ensure the output directory exists: it contains no tracked files, so a
    # fresh checkout may not have it and to_csv would fail on the temp path.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    now_utc = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
    end = now_utc - timedelta(days=1)

    # Start on 01/01/2020 for clean boundaries
    start = datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc)

    # Perform backfill using week-long chunks; lower chunk_hours if API limits are hit
    df = fetch_de_data_range(start, end, chunk_hours=168)

    # Write to a temp file then atomically rename, so a failure midway never
    # leaves a truncated CSV at the final path
    temp = out_path + ".tmp"
    df.to_csv(temp, index=False)
    os.replace(temp, out_path)
    print(
        f"FINISHED: WROTE {len(df)} ROWS OF SOLAR GENERATION DATA TO FILE: {out_path}"
    )


if __name__ == "__main__":
    main()
170 changes: 100 additions & 70 deletions solar_consumer/data/fetch_de_data.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,122 @@
import os
import pandas as pd
import dotenv
from datetime import datetime, timedelta, timezone
import requests
import xml.etree.ElementTree as ET
from entsoe import EntsoePandasClient
from loguru import logger

# Load environment variables
dotenv.load_dotenv()
# German bidding zone
DE_TSO_ZONE = "10Y1001A1001A82H"

def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
def fetch_de_data_range(start: datetime, end: datetime, chunk_hours: int = 168) -> pd.DataFrame:
    """
    Fetch German solar generation over a date range by chunking into windows
    (smaller payloads for the API and more robust behaviour for large ranges).

    Args:
        start: range start (UTC; a naive datetime is assumed to be UTC).
        end: range end (exclusive).
        chunk_hours: window size in hours, default 168 (7 days).

    Returns:
        DataFrame with columns:
        - target_datetime_utc (tz-aware UTC timestamps)
        - solar_generation_kw (generation in kW)
        - tso_zone (bidding-zone code, constant DE_TSO_ZONE)

    Raises:
        RuntimeError: if ENTSOE_API_KEY is not set in the environment.
        ValueError: if start is not strictly before end.
    """
    # API access handled by entsoe-py, not hand-rolled XML requests
    api_key = os.getenv("ENTSOE_API_KEY")
    if not api_key:
        raise RuntimeError("ENTSOE_API_KEY not set in environment")

    # Validate with a real exception: `assert` is stripped under `python -O`
    if start >= end:
        raise ValueError("Start date must be before end")

    def _norm(t: datetime) -> datetime:
        # Normalise to tz-aware UTC on an hour boundary
        if t.tzinfo is None:
            t = t.replace(tzinfo=timezone.utc)
        else:
            t = t.astimezone(timezone.utc)
        return t.replace(minute=0, second=0, microsecond=0)

    start = _norm(start)
    end = _norm(end)

    client = EntsoePandasClient(api_key=api_key)

    frames = []
    window = start
    step = timedelta(hours=chunk_hours)

    # Fetch one window from the ENTSOE API and collect non-empty solar gen results
    while window < end:
        nxt = min(window + step, end)

        # entsoe-py request (generation in MW). psr_type takes the ENTSOE PSR
        # code, not a human-readable name: "B16" == Solar.
        gen_mw = client.query_generation(
            country_code="DE",
            start=pd.Timestamp(window),
            end=pd.Timestamp(nxt),
            psr_type="B16",
        )

        # Convert to standard schema (UTC + kW)
        if gen_mw is not None and not gen_mw.empty:
            # With a psr_type filter the client may still return a (single-
            # column) DataFrame; reduce to a 1-D series so the column
            # assignment below cannot receive a 2-D array.
            series = gen_mw.iloc[:, 0] if isinstance(gen_mw, pd.DataFrame) else gen_mw

            idx = pd.to_datetime(series.index)

            # Ensure tz-aware UTC stamps
            if getattr(idx, "tz", None) is None:
                idx = idx.tz_localize("UTC")
            else:
                idx = idx.tz_convert("UTC")

            df_chunk = pd.DataFrame(
                {
                    "target_datetime_utc": idx,
                    "solar_generation_kw": (series.astype(float) * 1000.0).to_numpy(),
                    "tso_zone": DE_TSO_ZONE,
                }
            )
            df_chunk = df_chunk.sort_values("target_datetime_utc").reset_index(drop=True)
            frames.append(df_chunk)

        window = nxt

    # If all windows are completely empty, return an empty frame with the right shape
    if not frames:
        return pd.DataFrame(
            columns=["target_datetime_utc", "solar_generation_kw", "tso_zone"]
        )

    # Concatenate to a single table, dropping overlap duplicates at window edges
    df = pd.concat(frames, ignore_index=True)
    df = (
        df.drop_duplicates(subset=["target_datetime_utc", "tso_zone"])
        .sort_values("target_datetime_utc")
        .reset_index(drop=True)
    )
    logger.info("Assembled {} rows of German solar data over range.", len(df))
    return df


def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
    """
    Fetch solar generation data from German bidding zones via the
    ENTSOE API (24 HOUR FETCH)

    Only 'generation' mode is supported for now

    Args:
        historic_or_forecast: must be "generation" (the only supported mode).

    Returns:
        DataFrame with 3 columns:
        - target_datetime_utc (UTC date and time)
        - solar_generation_kw (generation in kilowatts)
        - tso_zone (bidding zone code)

    Raises:
        ValueError: if a mode other than "generation" is requested.
    """
    # Validate with a real exception: `assert` is stripped under `python -O`
    if historic_or_forecast != "generation":
        raise ValueError("Only 'generation' supported for the time being")

    # Window covers the 24 hours up to the current (hour-aligned) UTC time
    now = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
    start = now - timedelta(hours=24)

    # Keep behaviour stable: use the same range fetch and schema conversion path
    df = fetch_de_data_range(start, now, chunk_hours=24)
    logger.info("Assembled {} rows of German solar data", len(df))
    return df
101 changes: 0 additions & 101 deletions solar_consumer/test_fetch_de_data.py

This file was deleted.

Loading
Loading