diff --git a/src/eed_webscrapping_scripts/dwd/__init__.py b/src/eed_webscrapping_scripts/dwd/__init__.py index d22e303..f4c1ef4 100644 --- a/src/eed_webscrapping_scripts/dwd/__init__.py +++ b/src/eed_webscrapping_scripts/dwd/__init__.py @@ -2,6 +2,7 @@ "prepare_db", "download_json_to_duckdb", "get_config", + "read_sql_file", "pollenflug_gefahrenindex", ] from .db import prepare_db diff --git a/src/eed_webscrapping_scripts/dwd/config.yaml b/src/eed_webscrapping_scripts/dwd/config.yaml index f9505da..f8512d0 100644 --- a/src/eed_webscrapping_scripts/dwd/config.yaml +++ b/src/eed_webscrapping_scripts/dwd/config.yaml @@ -1,2 +1,6 @@ pollenflug_gefahrenindex: url: "https://opendata.dwd.de/climate_environment/health/alerts/s31fg.json" + db_infos: + db: "dwd" + datalake: "datalake" +git_root: "" diff --git a/src/eed_webscrapping_scripts/dwd/create_views.py b/src/eed_webscrapping_scripts/dwd/create_views.py new file mode 100644 index 0000000..9fd9151 --- /dev/null +++ b/src/eed_webscrapping_scripts/dwd/create_views.py @@ -0,0 +1,20 @@ +from eed_webscrapping_scripts.dwd import get_config, prepare_db +from eed_webscrapping_scripts.modules import read_sql_file + + +def create_views(): + cfg = get_config() + datalake = cfg["pollenflug_gefahrenindex"]["db_infos"]["datalake"] + sql = read_sql_file("dwd\sqls\list_available_tables.sql", cfg["git_root"]) + con = prepare_db() + _tables = con.sql(sql).fetchall() + tables = [item[0] for item in _tables] + sql_union_view_base = " UNION BY NAME ".join( + [f"FROM {datalake}.{table}" for table in tables] + ) + sql_union_view = f"CREATE OR REPLACE VIEW {datalake}.Pollenflug_Gefahrenindex AS ({sql_union_view_base})" + con.sql(sql_union_view) + + +if __name__ == "__main__": + create_views() diff --git a/src/eed_webscrapping_scripts/dwd/db.py b/src/eed_webscrapping_scripts/dwd/db.py index a23d5f0..a7706fd 100644 --- a/src/eed_webscrapping_scripts/dwd/db.py +++ b/src/eed_webscrapping_scripts/dwd/db.py @@ -1,17 +1,8 @@ -import os -import duckdb +from eed_webscrapping_scripts.modules import connect_to_db -home_dir = os.path.expanduser("~") - -def prepare_db(): - try: - with open(os.path.join(home_dir, ".motherduck_token")) as f: - md_token = f.read() - except Exception: - md_token = os.getenv("MD_TOKEN") - - con = duckdb.connect(f"md:?motherduck_token={md_token.strip()}") +def prepare_db(con=None): + con = con or connect_to_db() con.sql("CREATE DATABASE IF NOT EXISTS dwd") con.sql("USE dwd") con.sql("CREATE SCHEMA IF NOT EXISTS datalake") diff --git a/src/eed_webscrapping_scripts/dwd/sqls/list_available_tables.sql b/src/eed_webscrapping_scripts/dwd/sqls/list_available_tables.sql new file mode 100644 index 0000000..d77c311 --- /dev/null +++ b/src/eed_webscrapping_scripts/dwd/sqls/list_available_tables.sql @@ -0,0 +1,10 @@ +WITH _tables AS ( + SELECT table_name, TRUE AS in_dwd_db FROM duckdb_tables() + WHERE TRUE AND database_name = 'dwd' +) +, _tables_loaded AS ( + SELECT table_name, TRUE AS in_loaded_tables FROM datalake.loaded_tables +) +SELECT + * REPLACE(COALESCE(in_dwd_db, FALSE) AS in_dwd_db, COALESCE(in_loaded_tables, FALSE) AS in_loaded_tables ) +FROM _tables INNER JOIN _tables_loaded USING(table_name); diff --git a/src/eed_webscrapping_scripts/dwd/utils.py b/src/eed_webscrapping_scripts/dwd/utils.py index 534c9da..5ef7817 100644 --- a/src/eed_webscrapping_scripts/dwd/utils.py +++ b/src/eed_webscrapping_scripts/dwd/utils.py @@ -29,12 +29,12 @@ def download_json_to_duckdb(url, con): def get_config(): - path_to_config = os.path.join( - eed_webscrapping_scripts.__path__[0], "dwd", "config.yaml" - ) + git_root = eed_webscrapping_scripts.__path__[0] + path_to_config = os.path.join(git_root, "dwd", "config.yaml") with open(path_to_config, "r") as file: cfg = yaml.safe_load(file) + cfg["git_root"] = git_root return cfg diff --git a/src/eed_webscrapping_scripts/modules/__init__.py b/src/eed_webscrapping_scripts/modules/__init__.py new file mode 100644 index 0000000..1659447 --- /dev/null +++ b/src/eed_webscrapping_scripts/modules/__init__.py @@ -0,0 +1,5 @@ +__all__ = [ + "read_sql_file", + "connect_to_db", +] +from .modules import read_sql_file, connect_to_db diff --git a/src/eed_webscrapping_scripts/modules/modules.py b/src/eed_webscrapping_scripts/modules/modules.py new file mode 100644 index 0000000..9a09599 --- /dev/null +++ b/src/eed_webscrapping_scripts/modules/modules.py @@ -0,0 +1,23 @@ +import os +import duckdb + +home_dir = os.path.expanduser("~") + + +def read_sql_file(path_to_file, git_root=None): + git_root = git_root or "" + path_to_file = os.path.join(git_root, path_to_file) + with open(path_to_file, "r") as file: + sql = file.read() + return sql + + +def connect_to_db(): + try: + with open(os.path.join(home_dir, ".motherduck_token")) as f: + md_token = f.read() + except Exception: + md_token = os.getenv("MD_TOKEN") + + con = duckdb.connect(f"md:?motherduck_token={md_token.strip()}") + return con