From 5ed2010792c801aad9e16605f1df06e2c5a82fad Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:06:52 +0000 Subject: [PATCH 01/51] add convert to parquet function in and run in the pipeline --- digital_land/commands.py | 9 + digital_land/pipeline/__init__.py | 9 + digital_land/pipeline/main.py | 556 ++++++++++++++++++++++++++++++ digital_land/pipeline/process.py | 68 ++++ 4 files changed, 642 insertions(+) create mode 100644 digital_land/pipeline/__init__.py create mode 100644 digital_land/pipeline/main.py create mode 100644 digital_land/pipeline/process.py diff --git a/digital_land/commands.py b/digital_land/commands.py index 80619a612..e42b6a8aa 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -52,6 +52,7 @@ from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase from digital_land.phase.save import SavePhase from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.pipeline.process import convert_tranformed_csv_to_pq from digital_land.schema import Schema from digital_land.update import add_source_endpoint from digital_land.configuration.main import Config @@ -342,6 +343,14 @@ def pipeline_run( column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) converted_resource_log.save(os.path.join(converted_resource_dir, resource + ".csv")) + # create converted parquet in the var directory + cache_dir = Path(organisation_path).parent + transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset + transformed_parquet_dir.mkdir(exists_ok=True, parents=True) + convert_tranformed_csv_to_pq( + input_path=input_path, + output_path=transformed_parquet_dir / f"{resource}.parquet", + ) # diff --git a/digital_land/pipeline/__init__.py b/digital_land/pipeline/__init__.py new file mode 100644 index 000000000..66b49ea6b --- /dev/null +++ 
b/digital_land/pipeline/__init__.py @@ -0,0 +1,9 @@ +""" sub package containing code for processing resources into transformed resources""" + +from .main import ( # noqa: F401 + Pipeline, + Lookups, + chain_phases, + run_pipeline, + EntityNumGen, +) diff --git a/digital_land/pipeline/main.py b/digital_land/pipeline/main.py new file mode 100644 index 000000000..11749571e --- /dev/null +++ b/digital_land/pipeline/main.py @@ -0,0 +1,556 @@ +import os +import csv +import functools +import importlib.util +import logging +from pathlib import Path + +from .phase.map import normalise +from .phase.lookup import key as lookup_key +from .schema import Schema + + +def chain_phases(phases): + def add(f, g): + return lambda x: g.process(f(x)) + + return functools.reduce(add, phases, lambda phase: phase) + + +def run_pipeline(*args): + logging.debug(f"run_pipeline {args}") + chain = chain_phases([arg for arg in args if arg]) + + stream = chain(None) + for row in stream: + pass + + +# TODO should we remove loading from init? 
it makes it harder to test +# and what if you only wanted to load specific files +# TODO replace with config models which load is handled by them +class Pipeline: + def __init__(self, path, dataset): + self.dataset = dataset + self.name = dataset + self.path = path + self.column = {} + self.filter = {} + self.skip_pattern = {} + self.patch = {} + self.default_field = {} + self.default_value = {} + self.combine_field = {} + self.concat = {} + self.migrate = {} + self.lookup = {} + self.redirect_lookup = {} + + self.load_column() + self.load_skip_patterns() + self.load_patch() + self.load_default_fields() + self.load_default_values() + self.load_concat() + self.load_combine_fields() + self.load_migrate() + self.load_lookup() + self.load_redirect_lookup() + self.load_filter() + + def file_reader(self, filename): + # read a file from the pipeline path, ignore if missing + path = os.path.join(self.path, filename) + if not os.path.isfile(path): + return [] + logging.debug(f"load {path}") + return csv.DictReader(open(path)) + + def reader(self, filename): + for row in self.file_reader(filename): + row["dataset"] = row.get("dataset", "") or row.get("pipeline", "") + if row["dataset"] and row["dataset"] != self.name: + continue + yield row + + def load_column(self): + for row in self.reader("column.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.column.setdefault(resource, {}) + elif endpoint: + record = self.column.setdefault(endpoint, {}) + else: + record = self.column.setdefault("", {}) + + # migrate column.csv + row["column"] = row.get("column", "") or row["pattern"] + row["field"] = row.get("field", "") or row["value"] + + record[normalise(row["column"])] = row["field"] + + def load_filter(self): + for row in self.reader("filter.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.filter.setdefault(resource, {}) + elif endpoint: + record = 
self.filter.setdefault(endpoint, {}) + else: + record = self.filter.setdefault("", {}) + + record[row["field"]] = row["pattern"] + + def load_skip_patterns(self): + for row in self.reader("skip.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.skip_pattern.setdefault(resource, []) + elif endpoint: + record = self.skip_pattern.setdefault(endpoint, []) + else: + record = self.skip_pattern.setdefault("", []) + + record.append(row["pattern"]) + + def load_patch(self): + for row in self.reader("patch.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.patch.setdefault(resource, {}) + elif endpoint: + record = self.patch.setdefault(endpoint, {}) + else: + record = self.patch.setdefault("", {}) + + row["field"] = row.get("field", "") + row["pattern"] = row.get("pattern", "") + + record = record.setdefault(row["field"], {}) + record[row["pattern"]] = row["value"] + + def load_default_fields(self): + # TBD: rename default-field.csv + for row in self.reader("default.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.default_field.setdefault(resource, {}) + elif endpoint: + record = self.default_field.setdefault(endpoint, {}) + else: + record = self.default_field.setdefault("", {}) + + record[row["field"]] = row["default-field"] + + def load_default_values(self): + for row in self.reader("default-value.csv"): + record = self.default_value.setdefault(row.get("endpoint", ""), {}) + record[row["field"]] = row["value"] + + def load_combine_fields(self): + for row in self.reader("combine.csv"): + record = self.combine_field.setdefault(row.get("endpoint", ""), {}) + record[row["field"]] = row["separator"] + + def load_concat(self): + for row in self.reader("concat.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = 
self.concat.setdefault(resource, {}) + elif endpoint: + record = self.concat.setdefault(endpoint, {}) + else: + record = self.concat.setdefault("", {}) + + # record = self.concat.setdefault(row["resource"], {}) + record[row["field"]] = { + "fields": row["fields"].split(";"), + "separator": row["separator"], + "prepend": row.get("prepend", ""), + "append": row.get("append", ""), + } + + # TBD: remove this table, should come from specification replacement-field + def load_migrate(self): + for row in self.reader("transform.csv"): + if row["replacement-field"] == "": + continue + + if row["replacement-field"] in self.migrate: + raise ValueError( + "replacement-field %s has more than one entry" + % row["replacement-field"] + ) + + self.migrate[row["replacement-field"]] = row["field"] + + def load_lookup(self): + for row in self.file_reader("lookup.csv"): + # migrate old lookup.csv files + entry_number = row.get("entry-number", "") + prefix = ( + row.get("prefix", "") + or row.get("dataset", "") + or row.get("pipeline", "") + ) + reference = row.get("reference", "") or row.get("value", "") + + # composite key, ordered by specificity + resource_lookup = self.lookup.setdefault(row.get("resource", ""), {}) + resource_lookup[ + lookup_key( + entry_number=entry_number, + prefix=prefix, + reference=reference, + ) + ] = row["entity"] + + organisation = row.get("organisation", "") + # replace local-authority-eng while we migrate + organisation = organisation.replace( + "local-authority-eng", "local-authority" + ) + resource_lookup[ + lookup_key( + prefix=prefix, + reference=reference, + organisation=organisation, + ) + ] = row["entity"] + + def load_redirect_lookup(self): + for row in self.file_reader("old-entity.csv"): + old_entity = row.get("old-entity", "") + entity = row.get("entity", "") + status = row.get("status", "") + if old_entity and status: + self.redirect_lookup[old_entity] = {"entity": entity, "status": status} + + def filters(self, resource="", endpoints=[]): + d 
= self.filter.get("", {}).copy() + + for endpoint in endpoints: + endpoint_filters = self.filter.get(endpoint, {}) + d.update(endpoint_filters) + + if resource: + resource_filters = self.filter.get(resource, {}) + d.update(resource_filters) + + return d + + def columns(self, resource="", endpoints=[]): + general_columns = self.column.get("", {}) + if not resource: + return general_columns + + resource_columns = self.column.get(resource, {}) + endpoint_columns = {} + for endpoint in endpoints: + endpoint_columns = {**endpoint_columns, **self.column.get(endpoint, {})} + + result = {**endpoint_columns, **resource_columns} + + for key in general_columns: + if key in result: + continue + if ( + general_columns[key] in endpoint_columns.values() + or general_columns[key] in resource_columns.values() + ): + continue + result[key] = general_columns[key] + return result + + def skip_patterns(self, resource="", endpoints=[]): + if not resource: + return self.skip_pattern.get("", {}) + endpoint_patterns = [] + for endpoint in endpoints: + endpoint_patterns.extend(self.skip_pattern.get(endpoint, [])) + + return ( + self.skip_pattern.get(resource, []) + + self.skip_pattern.get("", []) + + endpoint_patterns + ) + + def patches(self, resource="", endpoints=[]): + general_patch = self.patch.get("", {}) + if not resource: + return general_patch + + resource_patch = self.patch.get(resource, {}) + endpoint_patch = {} + + for endpoint in endpoints: + endpoint_patch = {**endpoint_patch, **self.patch.get(endpoint, {})} + + result = {**endpoint_patch, **resource_patch} + + # Merge any remaining general defaults into the result + for field, patch in general_patch.items(): + if field not in result: + result[field] = patch + else: + result[field] = {**patch, **result[field]} + + return result + + def default_fields(self, resource=None, endpoints=[]): + config = self.default_field + + d = config.get("", {}) + + for key, value in config.get(resource, {}).items(): + d[key] = value + + for 
endpoint in endpoints: + for key, value in config.get(endpoint, {}).items(): + d[key] = value + return d + + def default_values(self, endpoints=None): + if endpoints is None: + endpoints = [] + config = self.default_value + d = config.get("", {}) + for endpoint in endpoints: + for key, value in config.get(endpoint, {}).items(): + d[key] = value + return d + + def combine_fields(self, endpoints=None): + if endpoints is None: + endpoints = [] + config = self.combine_field + d = config.get("", {}) + for endpoint in endpoints: + for key, value in config.get(endpoint, {}).items(): + d[key] = value + return d + + def concatenations(self, resource=None, endpoints=[]): + result = self.concat.get("", {}) + if resource: + result.update(self.concat.get(resource, {})) + + for endpoint in endpoints: + result.update(self.concat.get(endpoint, {})) + + return result + + def migrations(self): + return self.migrate + + def lookups(self, resource=None): + d = self.lookup.get("", {}) + if resource: + d.update(self.lookup.get(resource, {})) + return d + + def redirect_lookups(self): + return self.redirect_lookup + + def get_pipeline_callback(self): + file = os.path.join(self.path, "pipeline-callback.py") + spec = importlib.util.spec_from_file_location("pipeline-callback.py", file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.PipelineCallback + + @staticmethod + def compose(phases): + def add(f, g): + return lambda x: g.process(f(x)) + + return functools.reduce(add, phases, lambda phase: phase) + + def run(self, input_path, phases): + logging.debug(f"running {input_path} through {phases}") + chain = self.compose(phases) + for row in chain(input_path): + pass + + +class EntityNumGen: + def __init__(self, entity_num_state: dict = None): + if not entity_num_state: + entity_num_state = { + "range_min": 0, + "range_max": 100, + "current": 0, + } + + self.state = entity_num_state + + def next(self): + current = self.state["current"] + 
new_current = current + 1 + + if new_current > int(self.state["range_max"]): + new_current = int(self.state["range_min"]) + + if new_current < int(self.state["range_min"]): + new_current = int(self.state["range_min"]) + + self.state["current"] = new_current + + return new_current + + +class Lookups: + def __init__(self, directory=None) -> None: + self.directory = directory or "pipeline" + self.lookups_path = Path(directory) / "lookup.csv" + self.old_entity_path = Path(directory) / "old-entity.csv" + self.entries = [] + self.schema = Schema("lookup") + self.entity_num_gen = EntityNumGen() + + def add_entry(self, entry, is_new_entry=True): + """ + is_new_entry is an addition to allow for backward compatibility. + Older lookups may not be valid in accordance with the current + minimal column requirements + :param entry: + :param is_new_entry: + :return: + """ + if is_new_entry: + if not self.validate_entry(entry): + return + + self.entries.append(entry) + + def load_csv(self, lookups_path=None): + """ + load in lookups as df, not when we process pipeline but useful for other analysis + """ + lookups_path = lookups_path or self.lookups_path + reader = csv.DictReader(open(lookups_path, newline="")) + extra_fields = set(reader.fieldnames) - set(self.schema.fieldnames) + + if len(extra_fields): + raise RuntimeError( + f"{len(extra_fields)} extra fields founds in lookup.csv ({','.join(list(extra_fields))})" + ) + + for row in reader: + self.add_entry(row, is_new_entry=False) + + def get_max_entity(self, prefix, specification) -> int: + if len(self.entries) == 0: + return 0 + if not prefix: + return 0 + + dataset_prefix = specification.dataset_prefix(prefix) + try: + ret_val = max( + [ + int(entry["entity"]) + for entry in self.entries + if (entry["prefix"] == prefix or entry["prefix"] == dataset_prefix) + and (entry.get("entity", None)) + ] + ) + return ret_val + except ValueError: + return 0 + + def save_csv(self, lookups_path=None, entries=None, old_entity_path=None): + 
path = lookups_path or self.lookups_path + + entity_values = [] + if os.path.exists(path): + reader = csv.DictReader(open(path, newline="")) + for row in reader: + entity_values.append(row["entity"]) + + if entries is None: + entries = self.entries + + os.makedirs(os.path.dirname(path), exist_ok=True) + logging.debug("saving %s" % (path)) + f = open(path, "w", newline="") + writer = csv.DictWriter( + f, fieldnames=self.schema.fieldnames, extrasaction="ignore" + ) + writer.writeheader() + + old_entity_file_path = old_entity_path or self.old_entity_path + if os.path.exists(old_entity_file_path): + old_entity_path = self.old_entity_path + reader = csv.DictReader(open(old_entity_file_path, newline="")) + + for row in reader: + entity_values.append(row["old-entity"]) + entity_values.append(row["entity"]) + + new_entities = [] + get_entity = None + for idx, entry in enumerate(entries): + if not entry: + continue + else: + if not entry.get("entity"): + while True: + generated_entity = self.entity_num_gen.next() + + if generated_entity == get_entity: + print( + "There are no more entity numbers available within this dataset." 
+ ) + break + + if get_entity is None: + get_entity = generated_entity + + if str(generated_entity) not in entity_values: + entry["entity"] = generated_entity + new_entities.append(entry) + entity_values.append(str(generated_entity)) + writer.writerow(entry) + break + else: + writer.writerow(entry) + return new_entities + + # @staticmethod + def validate_entry(self, entry) -> bool: + # ensures minimum expected fields exist and are not empty strings + expected_fields = ["prefix", "organisation", "reference"] + for field in expected_fields: + if not entry.get(field, ""): + raise ValueError(f"ERROR: expected {field} not found in lookup entry") + + if len(self.entries) > 0: + # check entry does not already exist + existing_entries = len( + [ + 1 + for item in self.entries + if item["prefix"] == entry["prefix"] + and item["organisation"] == entry["organisation"] + and item["reference"] == entry["reference"] + ] + ) + + if existing_entries > 0: + # print(f">>> ERROR: lookup already exists - {entry['organisation']} {entry['reference']}") + return False + + return True diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py new file mode 100644 index 000000000..a0b52432f --- /dev/null +++ b/digital_land/pipeline/process.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +import os +import pandas as pd +import pyarrow.parquet as pq +import pyarrow as pa +from pathlib import Path + +# load in specification + + +def convert_tranformed_csv_to_pq(input, output): + """ + function to convert a transformed resource to a parrquet file. 
+ """ + input_path = Path(input) + output_path = Path(output) + if output_path.exists(): + os.remove(output_path) + + # Define the chunk size for reading the CSV file + chunk_size = 1000000 # Number of rows per chunk + + # Open a CSV reader with PyArrow + # csv_reader = pv.open_csv(input_path, read_options=pv.ReadOptions(block_size=chunk_size)) + csv_iterator = pd.read_csv( + input_path, chunksize=chunk_size, dtype=str, na_filter=False + ) + + # Initialize the Parquet writer with the schema from the first chunk + first_chunk = next(csv_iterator) + # size = 0 + # size +=len(first_chunk) + + fields = [ + ("end-date", pa.string()), + ("entity", pa.string()), + ("entry-date", pa.string()), + ("entry-number", pa.string()), + ("fact", pa.string()), + ("field", pa.string()), + ("priority", pa.string()), + ("reference-entity", pa.string()), + ("resource", pa.string()), + ("start-date", pa.string()), + ("value", pa.string()), + ] + schema = pa.schema(fields) + table = pa.Table.from_pandas(first_chunk, schema=schema) + + # Create a Parquet writer + parquet_writer = pq.ParquetWriter(output_path, table.schema) + + # Write the first chunk + parquet_writer.write_table(table) + + # Process and write the remaining chunks + while True: + try: + chunk = next(csv_iterator) + table = pa.Table.from_pandas(chunk) + parquet_writer.write_table(table) + # size += len(chunk) + except StopIteration: + break + + # Close the Parquet writer + parquet_writer.close() + # print(size) From 8622c43c86d2f4bacaa7cab32def9939413cde42 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:17:02 +0000 Subject: [PATCH 02/51] correct imports --- digital_land/pipeline/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/digital_land/pipeline/main.py b/digital_land/pipeline/main.py index 11749571e..cde83293b 100644 --- a/digital_land/pipeline/main.py +++ b/digital_land/pipeline/main.py @@ -5,9 +5,9 @@ import logging from 
pathlib import Path -from .phase.map import normalise -from .phase.lookup import key as lookup_key -from .schema import Schema +from digital_land.phase.map import normalise +from digital_land.phase.lookup import key as lookup_key +from digital_land.schema import Schema def chain_phases(phases): From ff82675d5ad94c3475eaa6af09c6e608b7ace9f0 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:21:42 +0000 Subject: [PATCH 03/51] correct arg --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index e42b6a8aa..dd0d9aeb6 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -346,7 +346,7 @@ def pipeline_run( # create converted parquet in the var directory cache_dir = Path(organisation_path).parent transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset - transformed_parquet_dir.mkdir(exists_ok=True, parents=True) + transformed_parquet_dir.mkdir(exist_ok=True, parents=True) convert_tranformed_csv_to_pq( input_path=input_path, output_path=transformed_parquet_dir / f"{resource}.parquet", From f93068fc8b6e021efa1f69f5959b01949ede8521 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:26:05 +0000 Subject: [PATCH 04/51] cureent arguement names --- digital_land/pipeline/process.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index a0b52432f..520081030 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -8,12 +8,12 @@ # load in specification -def convert_tranformed_csv_to_pq(input, output): +def convert_tranformed_csv_to_pq(input_path, output_path): """ function to convert a transformed resource to a parrquet file. 
""" - input_path = Path(input) - output_path = Path(output) + input_path = Path(input_path) + output_path = Path(output_path) if output_path.exists(): os.remove(output_path) From 271e66432e51ed3d69385b94cd3930b9e330bff6 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:36:18 +0000 Subject: [PATCH 05/51] correct input for csv to pq --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index dd0d9aeb6..70d6e1241 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -348,7 +348,7 @@ def pipeline_run( transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset transformed_parquet_dir.mkdir(exist_ok=True, parents=True) convert_tranformed_csv_to_pq( - input_path=input_path, + input_path=output_path, output_path=transformed_parquet_dir / f"{resource}.parquet", ) From cdd480e0f13805afb3b3230c371ec3ab5ce27606 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:43:54 +0000 Subject: [PATCH 06/51] add parquet package back in --- digital_land/commands.py | 27 +- digital_land/package/dataset_parquet.py | 345 ++++++++++++++++++++++++ 2 files changed, 370 insertions(+), 2 deletions(-) create mode 100644 digital_land/package/dataset_parquet.py diff --git a/digital_land/commands.py b/digital_land/commands.py index 70d6e1241..255700db0 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -26,6 +26,7 @@ ) from digital_land.organisation import Organisation from digital_land.package.dataset import DatasetPackage +from digital_land.package.dataset_parquet import DatasetParquetPackage from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase from digital_land.phase.convert import ConvertPhase, execute @@ -366,7 +367,11 @@ def dataset_create( issue_dir="issue", 
column_field_dir="var/column-field", dataset_resource_dir="var/dataset-resource", + cache_dir="var/cache/parquet", + resource_path="collection/resource.csv", ): + cache_dir = os.path.join(cache_dir, dataset) + if not output_path: print("missing output path", file=sys.stderr) sys.exit(2) @@ -386,10 +391,8 @@ def dataset_create( package.create() for path in input_paths: path_obj = Path(path) - package.load_transformed(path) package.load_column_fields(column_field_dir / dataset / path_obj.name) package.load_dataset_resource(dataset_resource_dir / dataset / path_obj.name) - package.load_entities() old_entity_path = os.path.join(pipeline.path, "old-entity.csv") if os.path.exists(old_entity_path): @@ -404,6 +407,26 @@ def dataset_create( package.add_counts() + # Repeat for parquet + # Set up cache directory to store parquet files. The sqlite files created from this will be saved in the dataset + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + pqpackage = DatasetParquetPackage( + dataset, + organisation=organisation, + path=output_path, + cache_dir=cache_dir, + resource_path=resource_path, + specification_dir=None, # TBD: package should use this specification object + ) + pqpackage.create_temp_table(input_paths) + pqpackage.load_facts() + pqpackage.load_fact_resource() + pqpackage.load_entities() + pqpackage.pq_to_sqlite() + pqpackage.close_conn() + def dataset_dump(input_path, output_path): cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py new file mode 100644 index 000000000..724e92da8 --- /dev/null +++ b/digital_land/package/dataset_parquet.py @@ -0,0 +1,345 @@ +import os +import logging +import duckdb +from .package import Package +import resource + +logger = logging.getLogger(__name__) + +# TBD: move to from specification datapackage definition +tables = { + "dataset-resource": None, + "column-field": None, + "issue": None, + 
"entity": None, + "fact": None, + "fact-resource": None, +} + +# TBD: infer from specification dataset +indexes = { + "fact": ["entity"], + "fact-resource": ["fact", "resource"], + "column-field": ["dataset", "resource", "column", "field"], + "issue": ["resource", "dataset", "field"], + "dataset-resource": ["resource"], +} + + +class DatasetParquetPackage(Package): + def __init__(self, dataset, organisation, cache_dir, resource_path, **kwargs): + self.suffix = ".parquet" + super().__init__(dataset, tables=tables, indexes=indexes, **kwargs) + self.dataset = dataset + self.organisation = organisation + self.cache_dir = cache_dir + self._spatialite = None + self.resource_path = resource_path + # Persistent connection for the class. Given name to ensure that table is stored on disk (not purely in memory) + os.makedirs(cache_dir, exist_ok=True) + self.duckdb_file = os.path.join(cache_dir, f"{dataset}.duckdb") + self.conn = duckdb.connect(self.duckdb_file) + self.schema = self.get_schema() + self.typology = self.specification.schema[dataset]["typology"] + + def get_schema(self): + schema = {} + + for field in sorted( + list( + set(self.specification.schema["fact"]["fields"]).union( + set(self.specification.schema["fact-resource"]["fields"]) + ) + ) + ): + datatype = self.specification.field[field]["datatype"] + schema[field] = "BIGINT" if datatype == "integer" else "VARCHAR" + + return schema + + def create_temp_table(self, input_paths): + # Create a temp table of the data from input_paths as we need the information stored there at various times + logging.info( + f"loading data into temp table from {os.path.dirname(input_paths[0])}" + ) + + input_paths_str = ", ".join([f"'{path}'" for path in input_paths]) + + # Initial max_line_size and increment step + max_size = 40000000 + # increment_step = 20000000 + # max_limit = 200000000 # Maximum allowable line size to attempt + + # increment = False + while True: + try: + self.conn.execute("DROP TABLE IF EXISTS temp_table") + 
query = f""" + CREATE TEMPORARY TABLE temp_table AS + SELECT * + FROM read_csv( + [{input_paths_str}], + columns = {self.schema}, + header = true, + force_not_null = {[field for field in self.schema.keys()]}, + max_line_size={max_size} + ) + """ + self.conn.execute(query) + break + except duckdb.Error as e: # Catch specific DuckDB error + if "Value with unterminated quote" in str(e): + hard_limit = int(resource.getrlimit(resource.RLIMIT_AS)[1]) + if max_size < hard_limit / 3: + logging.info( + f"Initial max_size did not work, setting it to {hard_limit / 2}" + ) + max_size = hard_limit / 2 + else: + raise + else: + logging.info(f"Failed to read in when max_size = {max_size}") + raise + + def load_facts(self): + logging.info("loading facts from temp table") + + fact_fields = self.specification.schema["fact"]["fields"] + fields_str = ", ".join( + [f'"{field}"' if "-" in field else field for field in fact_fields] + ) + + # query to extract data from the temp table (containing raw data), group by a fact, and get the highest + # priority or latest record + query = f""" + SELECT {fields_str} + FROM temp_table + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY fact ORDER BY priority, "entry-date" DESC, "entry-number" DESC + ) = 1 + """ + + self.conn.execute( + f""" + COPY ( + {query} + ) TO '{self.cache_dir}/fact{self.suffix}' (FORMAT PARQUET); + """ + ) + + def load_fact_resource(self): + logging.info("loading fact resources from temp table") + + fact_resource_fields = self.specification.schema["fact-resource"]["fields"] + fields_str = ", ".join( + [f'"{field}"' if "-" in field else field for field in fact_resource_fields] + ) + + # All CSV files have been loaded into a temporary table. 
Extract several columns and export + query = f""" + SELECT {fields_str} + FROM temp_table + """ + + self.conn.execute( + f""" + COPY ( + {query} + ) TO '{self.cache_dir}/fact_resource{self.suffix}' (FORMAT PARQUET); + """ + ) + + def load_entities(self): + organisation_path = self.organisation.organisation_path + + logging.info("loading entities from temp table") + + entity_fields = self.specification.schema["entity"]["fields"] + # Do this to match with later field names. + entity_fields = [e.replace("-", "_") for e in entity_fields] + input_paths_str = f"{self.cache_dir}/fact{self.suffix}" + + query = f""" + SELECT DISTINCT REPLACE(field,'-','_') + FROM parquet_scan('{str(input_paths_str)}') + """ + + # distinct_fields - list of fields in the field in fact + rows = self.conn.execute(query).fetchall() + distinct_fields = [row[0] for row in rows] + + # json fields - list of fields which are present in the fact table which + # do not exist separately in the entity table + # Need to ensure that 'organisation' is not included either + json_fields = [ + field + for field in distinct_fields + if field not in entity_fields + ["organisation"] + ] + + # null fields - list of fields which are not present in the fact tables which have + # to be in the entity table as a column + extra_fields = [ + "entity", + "dataset", + "typology", + "json", + "organisation_entity", + "organisation", + ] + null_fields = [ + field + for field in entity_fields + if field not in (distinct_fields + extra_fields) + ] + + # select fields - a list of fields which have to be selected directly from the pivoted table + # these are entity fields that are not null fields or a few special ones + extra_fields = [ + "json", + "organisation_entity", + "dataset", + "typology", + "organisation", + ] + select_fields = [ + field for field in entity_fields if field not in null_fields + extra_fields + ] + + # set fields + fields_to_include = ["entity", "field", "value"] + fields_str = ", ".join(fields_to_include) 
+ + # Take original data, group by entity & field, and order by highest priority then latest record. + # If there are still matches then pick the first resource (and fact, just to make sure) + query = f""" + SELECT {fields_str} FROM ( + SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date + FROM temp_table + LEFT JOIN read_csv_auto('{self.resource_path}', max_line_size=40000000) resource_csv + ON temp_table.resource = resource_csv.resource + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY entity, field + ORDER BY priority, "entry-date" DESC, "entry-number" DESC, resource_end_date DESC, temp_table.resource, fact + ) = 1 + ) + """ + + pivot_query = f""" + PIVOT ( + {query} + ) ON REPLACE(field,'-','_') + USING MAX(value) + """ + + # now use the field lists produced above to create specific statements to: + # add null columns which are missing + # include columns in the json statement + # Collate list of fields which don't exist but need to be in the final table + select_statement = ", ".join([f"t1.{field}" for field in select_fields]) + # Don't want to include anything that ends with "_geom" + null_fields_statement = ", ".join( + [ + f"''::VARCHAR AS \"{field}\"" + for field in null_fields + if not field.endswith("_geom") + ] + ) + json_statement = ", ".join( + [ + f"CASE WHEN t1.{field} IS NOT NULL THEN REPLACE('{field}', '_', '-') ELSE NULL END, t1.{field}" + for field in json_fields + ] + ) + + # define organisation query + org_csv = organisation_path + org_query = f""" + SELECT * FROM read_csv_auto('{org_csv}', max_line_size=40000000) + """ + + sql = f""" + INSTALL spatial; LOAD spatial; + COPY( + WITH computed_centroid AS ( + SELECT + * EXCLUDE (point), -- Calculate centroid point if not given + CASE + WHEN (geometry IS NOT NULL and geometry <> '') AND (point IS NULL OR point = '') + THEN ST_AsText(ST_ReducePrecision(ST_Centroid(ST_GeomFromText(geometry)),0.000001)) + ELSE point + END 
AS point + FROM ( + SELECT '{self.dataset}' as dataset, + '{self.typology}' as typology, + t2.entity as organisation_entity, + {select_statement}, + {null_fields_statement}, + json_object({json_statement}) as json, + FROM ({pivot_query}) as t1 + LEFT JOIN ({org_query}) as t2 + on t1.organisation = t2.organisation + ) + ) + SELECT + * EXCLUDE (json), + CASE WHEN json = '{{}}' THEN NULL ELSE json END AS json + FROM computed_centroid + ) TO '{self.cache_dir}/entity{self.suffix}' (FORMAT PARQUET); + """ + self.conn.execute(sql) + + def pq_to_sqlite(self): + # At present we are saving the parquet files in 'cache' but saving the sqlite files produced in 'dataset' + # In future when parquet files are saved to 'dataset' remove the 'cache_dir' in the function arguments and + # replace 'cache_dir' with 'output_path' in this function's code + logging.info( + f"loading sqlite3 tables in {self.path} from parquet files in {self.cache_dir}" + ) + query = "INSTALL sqlite; LOAD sqlite;" + self.conn.execute(query) + + parquet_files = [ + fn for fn in os.listdir(self.cache_dir) if fn.endswith(self.suffix) + ] + + for parquet_file in parquet_files: + table_name = os.path.splitext(os.path.basename(parquet_file))[0] + + # Load Parquet data into DuckDB temp table + self.conn.execute("DROP TABLE IF EXISTS temp_table;") + self.conn.execute( + f""" + CREATE TABLE temp_table AS + SELECT * FROM parquet_scan('{self.cache_dir}/{parquet_file}'); + """ + ) + + # Export the DuckDB table to the SQLite database + self.conn.execute( + f"ATTACH DATABASE '{self.path}' AS sqlite_db (TYPE SQLITE);" + ) + + # Fix the column names + for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): + if "-" in column[0]: + self.conn.execute( + f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" + ) + + # Copy the data + self.conn.execute( + f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" + ) + + self.conn.execute("DETACH DATABASE 
sqlite_db;") + + def close_conn(self): + logging.info("Close connection to duckdb database in session") + if self.conn is not None: + self.conn.close() + if os.path.exists(self.duckdb_file): + os.remove(self.duckdb_file) + + def load(self): + pass From e6ac9bfcca4e98b8550515459fca637b9db40385 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:28:29 +0000 Subject: [PATCH 07/51] tweak to remove temptable --- digital_land/commands.py | 8 +- digital_land/package/dataset_parquet.py | 113 +++++++++++++----------- 2 files changed, 67 insertions(+), 54 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 255700db0..8a49e7153 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -370,6 +370,8 @@ def dataset_create( cache_dir="var/cache/parquet", resource_path="collection/resource.csv", ): + # directories to be overwritten + transformed_parquet_dir = f"var/cache/transformed_parquet/{dataset}" cache_dir = os.path.join(cache_dir, dataset) if not output_path: @@ -420,9 +422,9 @@ def dataset_create( resource_path=resource_path, specification_dir=None, # TBD: package should use this specification object ) - pqpackage.create_temp_table(input_paths) - pqpackage.load_facts() - pqpackage.load_fact_resource() + # pqpackage.create_temp_table(input_paths) + pqpackage.load_facts(transformed_parquet_dir) + pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities() pqpackage.pq_to_sqlite() pqpackage.close_conn() diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 724e92da8..095af51cc 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -2,7 +2,6 @@ import logging import duckdb from .package import Package -import resource logger = logging.getLogger(__name__) @@ -57,51 +56,52 @@ def get_schema(self): return schema - def create_temp_table(self, input_paths): - # 
Create a temp table of the data from input_paths as we need the information stored there at various times - logging.info( - f"loading data into temp table from {os.path.dirname(input_paths[0])}" - ) - - input_paths_str = ", ".join([f"'{path}'" for path in input_paths]) - - # Initial max_line_size and increment step - max_size = 40000000 - # increment_step = 20000000 - # max_limit = 200000000 # Maximum allowable line size to attempt - - # increment = False - while True: - try: - self.conn.execute("DROP TABLE IF EXISTS temp_table") - query = f""" - CREATE TEMPORARY TABLE temp_table AS - SELECT * - FROM read_csv( - [{input_paths_str}], - columns = {self.schema}, - header = true, - force_not_null = {[field for field in self.schema.keys()]}, - max_line_size={max_size} - ) - """ - self.conn.execute(query) - break - except duckdb.Error as e: # Catch specific DuckDB error - if "Value with unterminated quote" in str(e): - hard_limit = int(resource.getrlimit(resource.RLIMIT_AS)[1]) - if max_size < hard_limit / 3: - logging.info( - f"Initial max_size did not work, setting it to {hard_limit / 2}" - ) - max_size = hard_limit / 2 - else: - raise - else: - logging.info(f"Failed to read in when max_size = {max_size}") - raise - - def load_facts(self): + # def create_temp_table(self, input_paths): + # # Create a temp table of the data from input_paths as we need the information stored there at various times + # logging.info( + # f"loading data into temp table from {os.path.dirname(input_paths[0])}" + # ) + + # input_paths_str = ", ".join([f"'{path}'" for path in input_paths]) + + # # Initial max_line_size and increment step + # max_size = 40000000 + # # increment_step = 20000000 + # # max_limit = 200000000 # Maximum allowable line size to attempt + + # # increment = False + # while True: + # try: + # self.conn.execute("DROP TABLE IF EXISTS temp_table") + # query = f""" + # CREATE TEMPORARY TABLE temp_table AS + # SELECT * + # FROM read_csv( + # [{input_paths_str}], + # columns = 
{self.schema}, + # header = true, + # force_not_null = {[field for field in self.schema.keys()]}, + # max_line_size={max_size} + # ) + # """ + # self.conn.execute(query) + # break + # except duckdb.Error as e: # Catch specific DuckDB error + # if "Value with unterminated quote" in str(e): + # hard_limit = int(resource.getrlimit(resource.RLIMIT_AS)[1]) + # if max_size < hard_limit / 3: + # logging.info( + # f"Initial max_size did not work, setting it to {hard_limit / 2}" + # ) + # max_size = hard_limit / 2 + # else: + # raise + # else: + # logging.info(f"Failed to read in when max_size = {max_size}") + # raise + + def load_facts(self, transformed_parquet_dir): + """ """ logging.info("loading facts from temp table") fact_fields = self.specification.schema["fact"]["fields"] @@ -113,7 +113,7 @@ def load_facts(self): # priority or latest record query = f""" SELECT {fields_str} - FROM temp_table + FROM {transformed_parquet_dir}/*.parquet QUALIFY ROW_NUMBER() OVER ( PARTITION BY fact ORDER BY priority, "entry-date" DESC, "entry-number" DESC ) = 1 @@ -127,7 +127,7 @@ def load_facts(self): """ ) - def load_fact_resource(self): + def load_fact_resource(self, transformed_parquet_dir): logging.info("loading fact resources from temp table") fact_resource_fields = self.specification.schema["fact-resource"]["fields"] @@ -138,7 +138,7 @@ def load_fact_resource(self): # All CSV files have been loaded into a temporary table. 
Extract several columns and export query = f""" SELECT {fields_str} - FROM temp_table + FROM {transformed_parquet_dir}/*.parquet """ self.conn.execute( @@ -150,6 +150,8 @@ def load_fact_resource(self): ) def load_entities(self): + fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" + # fact_parquet_path = f"{self.cache_dir}/fact{self.suffix}" organisation_path = self.organisation.organisation_path logging.info("loading entities from temp table") @@ -157,11 +159,11 @@ def load_entities(self): entity_fields = self.specification.schema["entity"]["fields"] # Do this to match with later field names. entity_fields = [e.replace("-", "_") for e in entity_fields] - input_paths_str = f"{self.cache_dir}/fact{self.suffix}" + # input_paths_str = f"{self.cache_dir}/fact{self.suffix}" query = f""" SELECT DISTINCT REPLACE(field,'-','_') - FROM parquet_scan('{str(input_paths_str)}') + FROM parquet_scan('{fact_resource_parquet_path}') """ # distinct_fields - list of fields in the field in fact @@ -212,6 +214,15 @@ def load_entities(self): # Take original data, group by entity & field, and order by highest priority then latest record. 
# If there are still matches then pick the first resource (and fact, just to make sure) + # changes to make + # not sure why this is bringing a raw resourcce AND the temp_table this data is essentially the same + # need the resource hash and entry number of the file, this is important for ordering + # between these two, the onlly other metric that isn't in the factt resource table is the start date of the resource + # query to get this info + # query to use this info to get the most recent facts + # query to turn the most recent facts into a pivot + # query to sort the final table + # query to create the file query = f""" SELECT {fields_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date From 37ad0c71ac9bd652c4bb0e78654c07f40d9d1611 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:09:56 +0000 Subject: [PATCH 08/51] add updated tests --- digital_land/package/dataset_parquet.py | 185 +++-- digital_land/pipeline/process.py | 9 +- tests/acceptance/test_dataset_create.py | 156 ++++ tests/integration/package/conftest.py | 34 + tests/integration/package/test_dataset.py | 30 - .../package/test_dataset_parquet.py | 772 ++++++++++++++++++ tests/integration/pipeline/test_process.py | 54 ++ 7 files changed, 1144 insertions(+), 96 deletions(-) create mode 100644 tests/acceptance/test_dataset_create.py create mode 100644 tests/integration/package/conftest.py create mode 100644 tests/integration/package/test_dataset_parquet.py create mode 100644 tests/integration/pipeline/test_process.py diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 095af51cc..f97b9a37a 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -1,6 +1,7 @@ import os import logging import duckdb +from pathlib import Path from .package import Package logger = 
logging.getLogger(__name__) @@ -26,21 +27,38 @@ class DatasetParquetPackage(Package): - def __init__(self, dataset, organisation, cache_dir, resource_path, **kwargs): + def __init__(self, dataset, path, duckdb_path=None, **kwargs): + """ + Initialisation method to set up information as needed + + args: + dataset (str): name of the dataset + dir (str): the directory to store the package in + duckdb_path (str): optional parameter to use a duckdb file instead of in memory db + """ + # this is a given at this point to not sure we need it the base package class might use this self.suffix = ".parquet" - super().__init__(dataset, tables=tables, indexes=indexes, **kwargs) + super().__init__(dataset, tables=tables, indexes=indexes, path=path, **kwargs) self.dataset = dataset - self.organisation = organisation - self.cache_dir = cache_dir - self._spatialite = None - self.resource_path = resource_path + # self.cache_dir = cache_dir # Persistent connection for the class. Given name to ensure that table is stored on disk (not purely in memory) - os.makedirs(cache_dir, exist_ok=True) - self.duckdb_file = os.path.join(cache_dir, f"{dataset}.duckdb") - self.conn = duckdb.connect(self.duckdb_file) + if duckdb_path is not None: + self.duckdb_path = Path(duckdb_path) + self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) + self.conn = duckdb.connect(self.duckdb_path) + else: + self.conn = duckdb.connect() + self.schema = self.get_schema() self.typology = self.specification.schema[dataset]["typology"] + # set up key file paths + self.fact_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + self.fact_resource_path = ( + self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + ) + self.entity_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + def get_schema(self): schema = {} @@ -56,6 +74,7 @@ def get_schema(self): return schema + # will be removed as we will remove the temp table from this logic # def create_temp_table(self, input_paths): # # 
Create a temp table of the data from input_paths as we need the information stored there at various times # logging.info( @@ -101,60 +120,64 @@ def get_schema(self): # raise def load_facts(self, transformed_parquet_dir): - """ """ + """ + This method loads facts into a fact table from a directory containing all transformed files as parquet files + """ + output_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) logging.info("loading facts from temp table") fact_fields = self.specification.schema["fact"]["fields"] - fields_str = ", ".join( - [f'"{field}"' if "-" in field else field for field in fact_fields] - ) + fields_str = ", ".join([field.replace("-", "_") for field in fact_fields]) # query to extract data from the temp table (containing raw data), group by a fact, and get the highest # priority or latest record + query = f""" SELECT {fields_str} - FROM {transformed_parquet_dir}/*.parquet + FROM '{str(transformed_parquet_dir)}/*.parquet' QUALIFY ROW_NUMBER() OVER ( - PARTITION BY fact ORDER BY priority, "entry-date" DESC, "entry-number" DESC + PARTITION BY fact ORDER BY priority, entry_date DESC, entry_number DESC ) = 1 """ - self.conn.execute( f""" COPY ( {query} - ) TO '{self.cache_dir}/fact{self.suffix}' (FORMAT PARQUET); + ) TO '{str(output_path)}' (FORMAT PARQUET); """ ) def load_fact_resource(self, transformed_parquet_dir): - logging.info("loading fact resources from temp table") - + logging.info(f"loading fact resources from {str(transformed_parquet_dir)}") + output_path = self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) fact_resource_fields = self.specification.schema["fact-resource"]["fields"] fields_str = ", ".join( - [f'"{field}"' if "-" in field else field for field in fact_resource_fields] + [field.replace("-", "_") for field in fact_resource_fields] ) # All CSV files have been loaded into a temporary table. 
Extract several columns and export query = f""" SELECT {fields_str} - FROM {transformed_parquet_dir}/*.parquet + FROM '{str(transformed_parquet_dir)}/*.parquet' """ self.conn.execute( f""" COPY ( {query} - ) TO '{self.cache_dir}/fact_resource{self.suffix}' (FORMAT PARQUET); + ) TO '{str(output_path)}' (FORMAT PARQUET); """ ) - def load_entities(self): - fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" - # fact_parquet_path = f"{self.cache_dir}/fact{self.suffix}" - organisation_path = self.organisation.organisation_path + def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): + output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) + # get the other paths + # fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" - logging.info("loading entities from temp table") + logging.info(f"loading entities from {transformed_parquet_dir}") entity_fields = self.specification.schema["entity"]["fields"] # Do this to match with later field names. 
@@ -163,7 +186,7 @@ def load_entities(self): query = f""" SELECT DISTINCT REPLACE(field,'-','_') - FROM parquet_scan('{fact_resource_parquet_path}') + FROM parquet_scan('{transformed_parquet_dir}/*.parquet') """ # distinct_fields - list of fields in the field in fact @@ -171,8 +194,8 @@ def load_entities(self): distinct_fields = [row[0] for row in rows] # json fields - list of fields which are present in the fact table which - # do not exist separately in the entity table - # Need to ensure that 'organisation' is not included either + # do not exist separately in the entity table so need to be included in the json field + # Need to ensure that 'organisation' is not included either so that it is excluded json_fields = [ field for field in distinct_fields @@ -223,18 +246,20 @@ def load_entities(self): # query to turn the most recent facts into a pivot # query to sort the final table # query to create the file + query = f""" SELECT {fields_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date - FROM temp_table - LEFT JOIN read_csv_auto('{self.resource_path}', max_line_size=40000000) resource_csv - ON temp_table.resource = resource_csv.resource + FROM parquet_scan('{transformed_parquet_dir}/*.parquet') tf + LEFT JOIN read_csv_auto('{resource_path}', max_line_size=40000000) resource_csv + ON tf.resource = resource_csv.resource QUALIFY ROW_NUMBER() OVER ( PARTITION BY entity, field - ORDER BY priority, "entry-date" DESC, "entry-number" DESC, resource_end_date DESC, temp_table.resource, fact + ORDER BY priority, entry_date DESC, entry_number DESC, resource_end_date DESC, tf.resource, fact ) = 1 ) """ + logging.error(query) pivot_query = f""" PIVOT ( @@ -269,6 +294,7 @@ def load_entities(self): SELECT * FROM read_csv_auto('{org_csv}', max_line_size=40000000) """ + # should installinng spatial be done here sql = f""" INSTALL spatial; LOAD spatial; COPY( @@ -296,54 +322,83 @@ def 
load_entities(self): * EXCLUDE (json), CASE WHEN json = '{{}}' THEN NULL ELSE json END AS json FROM computed_centroid - ) TO '{self.cache_dir}/entity{self.suffix}' (FORMAT PARQUET); + ) TO '{str(output_path)}' (FORMAT PARQUET); """ + # might need to un some fetch all toget result back self.conn.execute(sql) - def pq_to_sqlite(self): + def load_to_sqlite(self, sqlite_path): + """ + Convert parquet files to sqlite3 tables assumes the sqlite table already exist. There is an arguement to + say we want to improve the loading functionality of a sqlite package + """ # At present we are saving the parquet files in 'cache' but saving the sqlite files produced in 'dataset' # In future when parquet files are saved to 'dataset' remove the 'cache_dir' in the function arguments and # replace 'cache_dir' with 'output_path' in this function's code logging.info( - f"loading sqlite3 tables in {self.path} from parquet files in {self.cache_dir}" + f"loading sqlite3 tables in {sqlite_path} from parquet files in {self.path}" ) + # migrate to connection creation query = "INSTALL sqlite; LOAD sqlite;" self.conn.execute(query) - parquet_files = [ - fn for fn in os.listdir(self.cache_dir) if fn.endswith(self.suffix) - ] + # attache the sqlite db to duckdb + self.conn.execute( + f"ATTACH DATABASE '{sqlite_path}' AS sqlite_db (TYPE SQLITE);" + ) + + fact_resource_fields = self.specification.schema["fact-resource"]["fields"] + fields_str = ", ".join( + [field.replace("-", "_") for field in fact_resource_fields] + ) + # insert fact_resource data + self.conn.execute( + f""" + INSERT INTO sqlite_db.fact_resource + SELECT {fields_str} FROM parquet_scan('{self.fact_resource_path}') + """ + ) - for parquet_file in parquet_files: - table_name = os.path.splitext(os.path.basename(parquet_file))[0] + # insert fact data + fact_fields = self.specification.schema["fact"]["fields"] + fields_str = ", ".join([field.replace("-", "_") for field in fact_fields]) - # Load Parquet data into DuckDB temp table - 
self.conn.execute("DROP TABLE IF EXISTS temp_table;") - self.conn.execute( - f""" - CREATE TABLE temp_table AS - SELECT * FROM parquet_scan('{self.cache_dir}/{parquet_file}'); - """ - ) + self.conn.execute( + f""" + INSERT INTO sqlite_db.fact + SELECT {fields_str} FROM parquet_scan('{self.fact_path}') + """ + ) - # Export the DuckDB table to the SQLite database - self.conn.execute( - f"ATTACH DATABASE '{self.path}' AS sqlite_db (TYPE SQLITE);" - ) + # insert entity data + entity_fields = self.specification.schema["entity"]["fields"] + fields_str = ", ".join( + [ + field.replace("-", "_") + for field in entity_fields + if field not in ["geometry-geom", "point-geom"] + ] + ) + self.conn.execute( + f""" + INSERT INTO sqlite_db.entity + SELECT {fields_str} FROM parquet_scan('{self.entity_path}') + """ + ) - # Fix the column names - for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): - if "-" in column[0]: - self.conn.execute( - f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" - ) + # Fix the column names + # for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): + # if "-" in column[0]: + # self.conn.execute( + # f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" + # ) - # Copy the data - self.conn.execute( - f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" - ) + # Copy the data + # self.conn.execute( + # f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" + # ) - self.conn.execute("DETACH DATABASE sqlite_db;") + self.conn.execute("DETACH DATABASE sqlite_db;") def close_conn(self): logging.info("Close connection to duckdb database in session") diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 520081030..0a0fc3e61 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -47,6 +47,11 @@ def convert_tranformed_csv_to_pq(input_path, 
output_path): schema = pa.schema(fields) table = pa.Table.from_pandas(first_chunk, schema=schema) + # rename columns for parquet files to make querying easier in s3 + # Replace '-' with '_' in column names + new_column_names = [name.replace("-", "_") for name in table.column_names] + table = table.rename_columns(new_column_names) + # Create a Parquet writer parquet_writer = pq.ParquetWriter(output_path, table.schema) @@ -57,7 +62,9 @@ def convert_tranformed_csv_to_pq(input_path, output_path): while True: try: chunk = next(csv_iterator) - table = pa.Table.from_pandas(chunk) + table = pa.Table.from_pandas(chunk, schema=schema) + new_column_names = [name.replace("-", "_") for name in table.column_names] + table = table.rename_columns(new_column_names) parquet_writer.write_table(table) # size += len(chunk) except StopIteration: diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py new file mode 100644 index 000000000..de67328af --- /dev/null +++ b/tests/acceptance/test_dataset_create.py @@ -0,0 +1,156 @@ +""" +A set of tests to mimic a user (computational or otherwise) running tests against +a sqlite dataset. 
There are quite a few things to set up and this specifically +""" + +import pytest + +import numpy as np +import pandas as pd +import os +import sqlite3 +from tempfile import TemporaryDirectory +from pathlib import Path + +from click.testing import CliRunner + +from digital_land.cli import cli + +test_collection = "conservation-area" +test_dataset = "conservation-area" + + +@pytest.fixture(scope="session") +def session_tmp_path(): + with TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + +@pytest.fixture +def input_paths(): + input_paths = [] + directory = f"tests/data/{test_collection}/transformed/{test_dataset}/" + for root, dirs, files in os.walk(directory): + for file in files: + full_path = os.path.join(root, file) + input_paths.append(full_path) + + return input_paths + + +@pytest.fixture +def organisation_path(): + """ + build an organisations dataset to use + """ + orgs_path = f"tests/data/{test_collection}/organisation.csv" + return orgs_path + + +@pytest.fixture +def cache_path(session_tmp_path): + cache_path = session_tmp_path / "var" / "cache" + os.makedirs(cache_path, exist_ok=True) + return cache_path + + +@pytest.fixture +def dataset_dir(session_tmp_path): + dataset_dir = session_tmp_path / "dataset" + os.makedirs(dataset_dir, exist_ok=True) + return dataset_dir + + +@pytest.fixture +def issue_dir(session_tmp_path): + issue_dir = session_tmp_path / "issue" + os.makedirs(issue_dir, exist_ok=True) + return issue_dir + + +@pytest.fixture +def resource_path(session_tmp_path): + resource_path = session_tmp_path / "resource.csv" + columns = ["resource", "end-date"] + with open(resource_path, "w") as f: + f.write(",".join(columns) + "\n") + return resource_path + + +def test_acceptance_dataset_create( + session_tmp_path, + organisation_path, + input_paths, + issue_dir, + cache_path, + dataset_dir, + resource_path, +): + output_path = dataset_dir / f"{test_dataset}.sqlite3" + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "--dataset", + 
str(test_dataset), + "--pipeline-dir", + str(f"tests/data/{test_collection}/pipeline"), + "dataset-create", + "--output-path", + str(output_path), + "--organisation-path", + str(organisation_path), + "--column-field-dir", + str(f"tests/data/{test_collection}/var/column-field"), + "--dataset-resource-dir", + str(f"tests/data/{test_collection}/var/dataset-resource"), + "--issue-dir", + str(issue_dir), + "--cache-dir", + str(cache_path), + "--resource-path", + str(resource_path), + ] + + input_paths, + catch_exceptions=False, + ) + + # Check that the command exits with status code 0 (success) + if result.exit_code != 0: + # Print the command output if the test fails + print("Command failed with exit code:", result.exit_code) + print("Command output:") + print(result.output) + print("Command error output:") + print(result.exception) + + assert result.exit_code == 0, "error returned when building dataset" + pq_cache = os.path.join(cache_path, test_dataset) + pq_files = [file for file in os.listdir(pq_cache) if file.endswith(".parquet")] + assert len(pq_files) == 3, "Not all parquet files created" + assert np.all( + np.sort(pq_files) == ["entity.parquet", "fact.parquet", "fact_resource.parquet"] + ), "parquet file names not correct" + + # Check the sqlite file was created + assert os.path.exists(output_path), f"sqlite file {output_path} does not exists" + + conn = sqlite3.connect(output_path) + cursor = conn.cursor() + tables = cursor.execute( + "SELECT name FROM sqlite_master WHERE type='table';" + ).fetchall() + expected_tables = {"fact", "fact_resource", "entity"} + actual_tables = {table[0] for table in tables} + missing_tables = expected_tables - actual_tables + assert ( + len(missing_tables) == 0 + ), f"Missing following tables in sqlite database: {missing_tables}" + + for table in list(expected_tables): + pq_rows = len(pd.read_parquet(f"{pq_cache}/{table}.parquet")) + sql_rows = cursor.execute(f"SELECT COUNT(*) FROM {table};").fetchone()[0] + assert ( + pq_rows == 
sql_rows + ), f"Different rows between the parquet files and database table for {table}" diff --git a/tests/integration/package/conftest.py b/tests/integration/package/conftest.py new file mode 100644 index 000000000..408c3de1b --- /dev/null +++ b/tests/integration/package/conftest.py @@ -0,0 +1,34 @@ +import pytest +import os + +from urllib.request import urlretrieve + + +@pytest.fixture(scope="session") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + source_url = "https://raw.githubusercontent.com/digital-land/" + specification_csvs = [ + "attribution.csv", + "licence.csv", + "typology.csv", + "theme.csv", + "collection.csv", + "dataset.csv", + "dataset-field.csv", + "field.csv", + "datatype.csv", + "prefix.csv", + # deprecated .. + "pipeline.csv", + "dataset-schema.csv", + "schema.csv", + "schema-field.csv", + ] + for specification_csv in specification_csvs: + urlretrieve( + f"{source_url}/specification/main/specification/{specification_csv}", + os.path.join(specification_dir, specification_csv), + ) + + return specification_dir diff --git a/tests/integration/package/test_dataset.py b/tests/integration/package/test_dataset.py index a24c80063..96c3a7add 100644 --- a/tests/integration/package/test_dataset.py +++ b/tests/integration/package/test_dataset.py @@ -57,36 +57,6 @@ def transformed_fact_resources_with_blank(): return input_data -@pytest.fixture(scope="session") -def specification_dir(tmp_path_factory): - specification_dir = tmp_path_factory.mktemp("specification") - source_url = "https://raw.githubusercontent.com/digital-land/" - specification_csvs = [ - "attribution.csv", - "licence.csv", - "typology.csv", - "theme.csv", - "collection.csv", - "dataset.csv", - "dataset-field.csv", - "field.csv", - "datatype.csv", - "prefix.csv", - # deprecated .. 
- "pipeline.csv", - "dataset-schema.csv", - "schema.csv", - "schema-field.csv", - ] - for specification_csv in specification_csvs: - urllib.request.urlretrieve( - f"{source_url}/specification/main/specification/{specification_csv}", - os.path.join(specification_dir, specification_csv), - ) - - return specification_dir - - @pytest.fixture def organisation_csv(tmp_path): organisation_path = os.path.join(tmp_path, "organisation.csv") diff --git a/tests/integration/package/test_dataset_parquet.py b/tests/integration/package/test_dataset_parquet.py new file mode 100644 index 000000000..7318b979e --- /dev/null +++ b/tests/integration/package/test_dataset_parquet.py @@ -0,0 +1,772 @@ +import sqlite3 +import numpy as np +import pandas as pd +import logging +import pytest +import os +import json +import pyarrow.parquet as pq +import pyarrow as pa +from digital_land.package.dataset_parquet import DatasetParquetPackage + + +class MockOrganisation(object): + def __init__(self, organisation_path): + self.organisation_path = organisation_path + + +@pytest.fixture +def org_path(tmp_path): + org_path = tmp_path / "organisation.csv" + columns = ["organisation", "entity"] + # Test data for the tables. 
This checks that 'field' get pivoted + data = [ + ["local-authority:AAA", "1"], + ["local-authority:BBB", "2"], + ["local-authority:CCC", "3"], + ["local-authority:DDD", "4"], + ["local-authority:EEE", "5"], + ["local-authority:FFF", "6"], + ["local-authority:GGG", "7"], + ["local-authority:HHH", "8"], + ["local-authority:III", "9"], + ] + with open(org_path, "w") as f: + f.write(",".join(columns) + "\n") + for row in data: + f.write(",".join(map(str, row)) + "\n") + return org_path + + +# # Fixture to create a shared temporary directory +# @pytest.fixture(scope="session") +# def temp_dir(tmpdir_factory): +# temp_dir = tmpdir_factory.mktemp("shared_session_temp_dir") +# yield temp_dir + + +@pytest.fixture +def resource_path(tmp_path): + resource_path = tmp_path / "resource.csv" + resource_columns = ["resource", "end-date"] + with open(resource_path, "w") as f: + f.write(",".join(resource_columns) + "\n") + + return resource_path + + +# general use file to use for testing should focus on splitting down into individual test cases +test_geometry = "MULTIPOLYGON(((-0.49901924 53.81622,-0.5177418 53.76114,-0.4268378 53.78454,-0.49901924 53.81622)))" +transformed_1_data = { + "end_date": [np.nan] * 16, + "entity": [11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12], + "entry_date": [ + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + ], + "entry_number": [2] * 16, + "fact": [ + "abcdef1", + "abcdef2", + "abcdef3", + "abcdef4", + "abcdef5", + "abcdef6", + "abc1231", + "abc1232", + "abc1233", + "def4561", + "def4562", + "def4563", + "a1b2c31", + "a1b2c32", + "a1b2c33", + "a1b2c34", + ], + "field": [ + "entry-date", + "geometry", + "point", + "document-url", + "organisation", + "entry-date", + "geometry", + "organisation", + "entry-date", + "geometry", + 
"organisation", + "entry-date", + "geomtry", + "document-url", + "notes-checking", + "organisation", + ], + "priority": [2] * 16, + "reference_entity": [np.nan] * 16, + "resource": [ + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "yxwvut", + "yxwvut", + "zyxwvu", + "xwvuts", + "xwvuts", + "zyxwvu", + "wvutsr", + "wvutsr", + "wvutsr", + "wvutsr", + ], + "start_date": [np.nan] * 16, + "value": [ + "2023-01-01", + f"{test_geometry}", + '"POINT(-0.481 53.788)"', + "https://www.test.xyz", + "organisation:AAA", + "2023-01-01", + f"{test_geometry}", + "local-authority:BBB", + "2023-01-01", + f"{test_geometry}", + "local-authority:CCC", + "2023-01-01", + f"{test_geometry}", + "https://www.testing.yyz", + "Something random", + "local-authority:DDD", + ], +} + +transformed_2_data = { + "end_date": [np.nan] * 19, # 19 records + "entity": [ + 110, + 110, + 110, + 111, + 111, + 111, + 112, + 112, + 112, + 113, + 113, + 113, + 114, + 114, + 114, + 115, + 115, + 115, + 116, + ], + "entry_date": [ + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-04-01", + "2023-05-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + ], + "entry_number": [2, 2, 2, 2, 2, 2, 2, 12, 12, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + "fact": [ + "badcfe1", + "badcfe2", + "badcfe3", + "fedcba1", + "fedcba2", + "fedcba3", + "bcdefg1", + "bcdefg2", + "bcdefg3", + "cdefgh1", + "hgfedc1", + "cdefgh2", + "efghij1", + "efghij2", + "efghij3", + "defghi1", + "defghi2", + "defghi3", + "ihgfed1", + ], + "field": [ + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + 
"entry-date", + ], + "priority": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2], + "reference_entity": [np.nan] * 19, # 19 records + "resource": [ + "zyx123", + "zyx123", + "zyx123", + "zxy123", + "zxy123", + "zxy123", + "yxw456", + "yxw456", + "yxw456", + "xwv789", + "xwv789", + "xwv789", + "xyz123", + "xyz123", + "xyz123", + "uvw456", + "wvu654", + "uvw456", + "rta357", + ], + "start_date": [np.nan] * 19, # 19 records + "value": [ + "2023-01-01", + "2023-01-01", + "local-authority:DDD", + "2023-01-01", + "2023-02-01", + "local-authority:EEE", + "2023-02-01", + "2023-02-01", + "local-authority:FFF", + "2023-01-01", + "2023-01-01", + "local-authority:GGG", + "2023-04-01", + "2023-05-01", + "local-authority:HHH", + "2023-01-01", + "2023-01-01", + "local-authority:III", + "2023-01-01", + ], +} + + +@pytest.fixture +def dataset_sqlite_path(tmp_path): + """ + Should consider using a test spec to feed in to a dataset package instead, also functionality might need to be moved + """ + sqlite_path = tmp_path / "conservation-area.sqlite3" + conn = sqlite3.connect(sqlite_path) + conn.execute( + """ + CREATE TABLE entity( + dataset TEXT, + end_date TEXT, + entity INTEGER PRIMARY KEY, + entry_date TEXT, + geojson JSON, + geometry TEXT, + json JSON, + name TEXT, + organisation_entity TEXT, + point TEXT, + prefix TEXT, + reference TEXT, + start_date TEXT, + typology TEXT + ); + """ + ) + conn.execute( + """ + CREATE TABLE fact( + end_date TEXT, + entity INTEGER, + fact TEXT PRIMARY KEY, + field TEXT, + entry_date TEXT, + priority INTEGER, + reference_entity TEXT, + start_date TEXT, + value TEXT, + FOREIGN KEY(entity) REFERENCES entity(entity) + ); + """ + ) + conn.execute( + """ + CREATE TABLE fact_resource( + end_date TEXT, + fact TEXT, + entry_date TEXT, + entry_number INTEGER, + priority INTEGER, + resource TEXT, + start_date TEXT, + FOREIGN KEY(fact) REFERENCES fact(fact) + ); + """ + ) + + conn.commit() + conn.close() + + return sqlite_path + + 
+@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)])
+def test_load_facts_single_file(data: dict, expected: int, tmp_path):
+    """
+    tests loading from a directory when there is a single file, multiple files
+    make very little difference to duckdb so use to test out individual cases
+    """
+    # convert data to df and save to a file
+    df = pd.DataFrame.from_dict(data)
+    transformed_parquet_dir = tmp_path / "transformed"
+    transformed_parquet_dir.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False)
+
+    # instantiate package
+    package = DatasetParquetPackage(
+        dataset="conservation-area",
+        path=tmp_path / "conservation-area",
+        specification_dir=None,
+    )
+
+    # this method is explicitly designed to load facts from the temp table
+    # however it shouldn't need this, it's duplicating all of the same data in a temporary space
+    # we should try leveraging the power of duckdb and parquet.
+    package.load_facts(transformed_parquet_dir=transformed_parquet_dir)
+
+    output_file = (
+        tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet"
+    )
+    assert os.path.exists(output_file), "fact.parquet file does not exist"
+
+    df = pd.read_parquet(output_file)
+
+    assert len(df) > 0, "No data in fact.parquet file"
+    assert (
+        len(df) == expected
+    ), "No. 
of facts does not match expected" # No of unique facts + assert df.shape[1] == 9, "Not all columns saved in fact.parquet file" + + +@pytest.mark.parametrize( + "data1,data2,expected", [(transformed_1_data, transformed_2_data, 35)] +) +def test_load_facts_multiple_files(data1, data2, expected, tmp_path): + """ + test loading multiple files into the fact table when they're from a single directory + """ + # convert data to df and save to a file + df1 = pd.DataFrame.from_dict(data1) + df2 = pd.DataFrame.from_dict(data2) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df1.to_parquet( + transformed_parquet_dir / "transformed_resource_1.parquet", index=False + ) + df2.to_parquet( + transformed_parquet_dir / "transformed_resource_2.parquet", index=False + ) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + + package.load_facts(transformed_parquet_dir=transformed_parquet_dir) + + output_file = ( + tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + ) + assert os.path.exists(output_file), "fact.parquet file does not exist" + + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact.parquet file" + assert ( + len(df) == expected + ), "No. 
of facts does not match expected" # No of unique facts + assert df.shape[1] == 9, "Not all columns saved in fact.parquet file" + + +@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)]) +def test_load_facts_one_file_with_empty_file(data, expected, tmp_path): + """ + test loading one file into the fact table alongside an empty file + """ + + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + schema = pa.schema( + [ + ("end_date", pa.string()), + ("entity", pa.int64()), + ("entry_date", pa.string()), + ("entry_number", pa.int64()), + ("fact", pa.string()), + ("field", pa.string()), + ("priority", pa.int64()), + ("reference_entity", pa.int64()), + ("resource", pa.string()), + ("start_date", pa.string()), + ("value", pa.string()), + ] + ) + empty_arrays = [pa.array([], type=field.type) for field in schema] + empty_table = pa.Table.from_arrays(empty_arrays, schema=schema) + pq.write_table(empty_table, transformed_parquet_dir / "empty.parquet") + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + + package.load_facts(transformed_parquet_dir=transformed_parquet_dir) + + output_file = ( + tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + ) + assert os.path.exists(output_file), "fact.parquet file does not exist" + + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact.parquet file" + assert ( + len(df) == expected + ), "No. 
of facts does not match expected" # No of unique facts + assert df.shape[1] == 9, "Not all columns saved in fact.parquet file" + + +@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)]) +def test_load_fact_resource_single_file(data, expected, tmp_path): + + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + package.load_fact_resource(transformed_parquet_dir) + + # Check if the output parquet file exists and verify contents + output_file = ( + tmp_path + / "conservation-area" + / "dataset=conservation-area" + / "fact_resource.parquet" + ) + assert os.path.exists(output_file), "fact-resource.parquet file does not exist" + + # Load Parquet into a DataFrame to verify data correctness + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact-resource,parquet file" + assert len(df) == expected, "Not all data saved in fact-resource.parquet file" + + assert df.shape[1] == 7, "Not all columns saved in fact-resource.parquet file" + + +@pytest.mark.parametrize( + "data_1,data_2,expected", [(transformed_1_data, transformed_2_data, 35)] +) +def test_load_fact_resource_two_filea(data_1, data_2, expected, tmp_path): + df_1 = pd.DataFrame.from_dict(data_1) + df_2 = pd.DataFrame.from_dict(data_2) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df_1.to_parquet( + transformed_parquet_dir / "transformed_resource_1.parquet", index=False + ) + df_2.to_parquet( + transformed_parquet_dir / "transformed_resource_2.parquet", index=False + ) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + 
) + package.load_fact_resource(transformed_parquet_dir) + + # Check if the output parquet file exists and verify contents + output_file = ( + tmp_path + / "conservation-area" + / "dataset=conservation-area" + / "fact_resource.parquet" + ) + assert os.path.exists(output_file), "fact-resource.parquet file does not exist" + + # Load Parquet into a DataFrame to verify data correctness + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact-resource,parquet file" + assert len(df) == expected, "Not all data saved in fact-resource.parquet file" + + assert df.shape[1] == 7, "Not all columns saved in fact-resource.parquet file" + + +@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)]) +def test_load_fact_resource_empty_file_with_another(data, expected, tmp_path): + + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + # create empty file + schema = pa.schema( + [ + ("end_date", pa.string()), + ("entity", pa.int64()), + ("entry_date", pa.string()), + ("entry_number", pa.int64()), + ("fact", pa.string()), + ("field", pa.string()), + ("priority", pa.int64()), + ("reference_entity", pa.int64()), + ("resource", pa.string()), + ("start_date", pa.string()), + ("value", pa.string()), + ] + ) + empty_arrays = [pa.array([], type=field.type) for field in schema] + empty_table = pa.Table.from_arrays(empty_arrays, schema=schema) + pq.write_table(empty_table, transformed_parquet_dir / "empty.parquet") + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + package.load_fact_resource(transformed_parquet_dir) + + # Check if the output parquet file exists and verify contents + output_file = ( + tmp_path + / "conservation-area" + / "dataset=conservation-area" + / "fact_resource.parquet" + ) + 
assert os.path.exists(output_file), "fact-resource.parquet file does not exist" + + # Load Parquet into a DataFrame to verify data correctness + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact-resource,parquet file" + assert len(df) == expected, "Not all data saved in fact-resource.parquet file" + + assert df.shape[1] == 7, "Not all columns saved in fact-resource.parquet file" + + +@pytest.mark.parametrize( + "data,expected_count,expected_props", + [(transformed_1_data, 2, {11: {"end_date": ""}})], +) +def test_load_entities_single_file( + data, expected_count, expected_props, tmp_path, org_path, resource_path +): + # Create dummy organisation.csv file for use in 'load_entities' + # Test data for the tables. This checks that 'field' get pivoted + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + package.load_entities(transformed_parquet_dir, resource_path, org_path) + + output_file = ( + tmp_path / "conservation-area" / "dataset=conservation-area" / "entity.parquet" + ) + assert os.path.exists(output_file), "entity.parquet file does not exist" + + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in entity.parquet file" + assert len(df) == expected_count, "No. 
of entities is not correct" + assert df["entity"].nunique() == len(df), "Entity column contains duplicate values" + + for entity in expected_props: + for key, value in expected_props[entity].items(): + logging.info(f"entity={entity}, key={key}, value={value}") + assert ( + df[df["entity"] == entity][key].iloc[0] == value + ), f"Expected {key} to be {value} for entity {entity}" + + +# not great test as have to feed so much in, would be better to test each table loading at a time +@pytest.mark.parametrize( + "fact_data,fact_resource_data,entity_data", + [ + ( + { + "fact": [""], + "end_date": [1], + "entity": [1], + "field": [""], + "entry_date": [""], + "priority": [1], + "reference_entity": [""], + "start_date": [1], + "value": [""], + }, + { + "end_date": [""], + "fact": [1], + "entry_date": [""], + "entry_number": [1], + "priority": [1], + "resource": [""], + "start_date": [1], + }, + { + "entity": [1], + "dataset": ["conservation-area"], + "end_date": [""], + "entry_date": [""], + "geojson": [""], + "geometry": [""], + "json": [""], + "name": [""], + "organisation_entity": [""], + "point": [""], + "prefix": [""], + "reference": [""], + "start_date": [""], + "typology": [""], + }, + ) + ], +) +def test_load_pq_to_sqlite_basic( + fact_data, fact_resource_data, entity_data, dataset_sqlite_path, tmp_path +): + + dataset_parquet_path = tmp_path / "dataset" + (dataset_parquet_path / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + # write data to parquet files in the dataset path + fact_df = pd.DataFrame.from_dict(fact_data) + fact_resource_df = pd.DataFrame.from_dict(fact_resource_data) + entity_df = pd.DataFrame.from_dict(entity_data) + + fact_df.to_parquet( + dataset_parquet_path / "dataset=conservation-area" / "fact.parquet", index=False + ) + fact_resource_df.to_parquet( + dataset_parquet_path / "dataset=conservation-area" / "fact_resource.parquet", + index=False, + ) + entity_df.to_parquet( + dataset_parquet_path / 
"dataset=conservation-area" / "entity.parquet",
+        index=False,
+    )
+
+    output_path = dataset_sqlite_path
+
+    package = DatasetParquetPackage(
+        dataset="conservation-area",
+        path=tmp_path / "dataset",
+        specification_dir=None,
+    )
+
+    package.load_to_sqlite(output_path)
+
+    assert os.path.exists(dataset_sqlite_path), "sqlite3 file does not exist"
+
+    cnx = sqlite3.connect(output_path)
+    df_sql = pd.read_sql_query("SELECT * FROM fact_resource", cnx)
+    assert len(df_sql) > 0, "No data in fact_resource table"
+    assert len(df_sql) == len(
+        fact_resource_df
+    ), "Not all data saved in fact_resource table"
+    assert np.all(
+        len(df_sql["end_date"] == 0)  # BUG(review): len() of the boolean Series is the row count, so np.all(...) is vacuously true; presumably meant (df_sql["end_date"] == "").all() -- confirm stored values first
+    ), "Non-empty strings in end_date from fact_resource table"
+
+    df_sql = pd.read_sql_query("SELECT * FROM fact", cnx)
+    assert len(df_sql) > 0, "No data in fact table"
+    assert len(df_sql) == len(fact_df), "Not all data saved in fact table"
+    assert np.all(
+        len(df_sql["end_date"] == 0)  # BUG(review): vacuous for the same reason; note fact_data["end_date"] is [1], so a real emptiness check would fail -- fix the fixture data and this assertion together
+    ), "Non-empty strings in end_date from fact table"
+
+    df_sql = pd.read_sql_query("SELECT * FROM entity", cnx)
+    assert len(df_sql) > 0, "No data in entity table"
+    assert len(df_sql) == len(entity_df), "Not all data saved in entity table"
+    assert np.any(
+        len(df_sql["geometry"] == 0)  # BUG(review): vacuous -- len() is the row count; likely meant (df_sql["geometry"] == "").any()
+    ), "All geometries from entity table have values"
+    assert np.any(
+        len(df_sql["geometry"] == 0)  # NOTE(review): exact duplicate of the assertion above (only the failure message differs) -- one of the two should check something else or be removed
+    ), "All geometries from entity table have non-blank values"
+    assert not any(
+        [
+            (
+                any("_" in key for key in json.loads(row).keys())
+                if isinstance(row, str)
+                else False
+            )
+            for row in df_sql["json"]
+            if row != ""
+        ]
+    ), "Some json object have underscores in their 'keys'"
+
+    cnx.close()
diff --git a/tests/integration/pipeline/test_process.py b/tests/integration/pipeline/test_process.py
new file mode 100644
index 000000000..4f0e24932
--- /dev/null
+++ b/tests/integration/pipeline/test_process.py
@@ -0,0 +1,54 @@
+import pandas as pd
+
+from digital_land.pipeline.process import convert_tranformed_csv_to_pq
+
+
+def 
test_convert_transformed_csv_to_pq_converts_csv(tmp_path): + # creat csv that looks like a transformed csv + data = { + "end-date": [""], + "entity": [4220000], + "entry-date": ["2024-10-02"], + "entry-number": [1], + "fact": ["1be8ef923db61d62354f041718ea0b1795c5ae60b436ec74e90d9fd850919434"], + "field": ["name"], + "priority": [2], + "reference-entity": [""], + "resource": [ + "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7" + ], + "start-date": [""], + "value": "Arun District Council Local Plan 2011 - 2031", + } + df = pd.DataFrame(data) + data_path = ( + tmp_path + / "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7.csv" + ) + df.to_csv(data_path) + + # use process on it + output_path = ( + tmp_path + / "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7.parquet" + ) + convert_tranformed_csv_to_pq( + data_path, + tmp_path + / "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7.parquet", + ) + + # check resulting parquet file for: + assert ( + output_path.exists() + ), f"no parquet file created as expected at {str(output_path)}" + # headers and number of rows + parquet_df = pd.read_parquet(output_path) + for col in list(parquet_df.columns): + assert "-" not in col + + for col in list(df.columns): + assert col.replace("-", "_") in list(parquet_df.columns) + + +# check column types From 5a621aeb4984848a46601271dc74a5d9b9ee0584 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:20:35 +0000 Subject: [PATCH 09/51] make acceptance test work --- digital_land/cli.py | 16 + digital_land/commands.py | 45 +- digital_land/pipeline.py | 556 ------------------------ tests/acceptance/test_dataset_create.py | 230 +++++++++- 4 files changed, 250 insertions(+), 597 deletions(-) delete mode 100644 digital_land/pipeline.py diff --git a/digital_land/cli.py b/digital_land/cli.py index aae62b52f..0a08af6a6 100644 --- a/digital_land/cli.py +++ 
b/digital_land/cli.py @@ -139,6 +139,18 @@ def convert_cmd(input_path, output_path): @column_field_dir @dataset_resource_dir @issue_dir +@click.option( + "--cache-dir", + type=click.Path(), + default="var/cache", + help="link to a cache directory to store temporary data that can be deleted once process is finished", +) +@click.option( + "--resource-path", + type=click.Path(exists=True), + default="collection/resource.csv", + help="link to where the resource list is stored", +) @click.argument("input-paths", nargs=-1, type=click.Path(exists=True)) @click.pass_context def dataset_create_cmd( @@ -149,6 +161,8 @@ def dataset_create_cmd( column_field_dir, dataset_resource_dir, issue_dir, + cache_dir, + resource_path, ): return dataset_create( input_paths=input_paths, @@ -160,6 +174,8 @@ def dataset_create_cmd( column_field_dir=column_field_dir, dataset_resource_dir=dataset_resource_dir, issue_dir=issue_dir, + cache_dir=cache_dir, + resource_path=resource_path, ) diff --git a/digital_land/commands.py b/digital_land/commands.py index 8a49e7153..1fd065bcb 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -367,23 +367,32 @@ def dataset_create( issue_dir="issue", column_field_dir="var/column-field", dataset_resource_dir="var/dataset-resource", - cache_dir="var/cache/parquet", + cache_dir="var/cache", resource_path="collection/resource.csv", ): - # directories to be overwritten - transformed_parquet_dir = f"var/cache/transformed_parquet/{dataset}" - cache_dir = os.path.join(cache_dir, dataset) + # chek all paths are paths + issue_dir = Path(issue_dir) + column_field_dir = Path(column_field_dir) + dataset_resource_dir = Path(dataset_resource_dir) + cache_dir = Path(cache_dir) + resource_path = Path(resource_path) + + # get the transformed files from the cache directory this is assumed right now but we may want to be stricter in the future + transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset + + # creat directory for 
dataset_parquet_package + dataset_parquet_path = cache_dir / dataset if not output_path: print("missing output path", file=sys.stderr) sys.exit(2) # Set up initial objects - column_field_dir = Path(column_field_dir) - dataset_resource_dir = Path(dataset_resource_dir) organisation = Organisation( organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) ) + + # create sqlite dataset packageas before and load inn data that isn't in the parquetpackage yet package = DatasetPackage( dataset, organisation=organisation, @@ -393,15 +402,17 @@ def dataset_create( package.create() for path in input_paths: path_obj = Path(path) - package.load_column_fields(column_field_dir / dataset / path_obj.name) - package.load_dataset_resource(dataset_resource_dir / dataset / path_obj.name) + package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + package.load_dataset_resource( + dataset_resource_dir / dataset / f"{path_obj.stem}.csv" + ) - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): + old_entity_path = Path(pipeline.path) / "old-entity.csv" + if old_entity_path.exists(): package.load_old_entities(old_entity_path) - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): + issue_paths = issue_dir / dataset + if issue_paths.exists(): for issue_path in os.listdir(issue_paths): package.load_issues(os.path.join(issue_paths, issue_path)) else: @@ -416,18 +427,14 @@ def dataset_create( pqpackage = DatasetParquetPackage( dataset, - organisation=organisation, - path=output_path, - cache_dir=cache_dir, - resource_path=resource_path, + path=dataset_parquet_path, specification_dir=None, # TBD: package should use this specification object ) # pqpackage.create_temp_table(input_paths) pqpackage.load_facts(transformed_parquet_dir) pqpackage.load_fact_resource(transformed_parquet_dir) - pqpackage.load_entities() - pqpackage.pq_to_sqlite() - pqpackage.close_conn() + 
pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) + pqpackage.load_to_sqlite(output_path) def dataset_dump(input_path, output_path): diff --git a/digital_land/pipeline.py b/digital_land/pipeline.py deleted file mode 100644 index 11749571e..000000000 --- a/digital_land/pipeline.py +++ /dev/null @@ -1,556 +0,0 @@ -import os -import csv -import functools -import importlib.util -import logging -from pathlib import Path - -from .phase.map import normalise -from .phase.lookup import key as lookup_key -from .schema import Schema - - -def chain_phases(phases): - def add(f, g): - return lambda x: g.process(f(x)) - - return functools.reduce(add, phases, lambda phase: phase) - - -def run_pipeline(*args): - logging.debug(f"run_pipeline {args}") - chain = chain_phases([arg for arg in args if arg]) - - stream = chain(None) - for row in stream: - pass - - -# TODO should we remove loading from init? it makes it harder to test -# and what if you only wanted to load specific files -# TODO replace with config models which load is handled by them -class Pipeline: - def __init__(self, path, dataset): - self.dataset = dataset - self.name = dataset - self.path = path - self.column = {} - self.filter = {} - self.skip_pattern = {} - self.patch = {} - self.default_field = {} - self.default_value = {} - self.combine_field = {} - self.concat = {} - self.migrate = {} - self.lookup = {} - self.redirect_lookup = {} - - self.load_column() - self.load_skip_patterns() - self.load_patch() - self.load_default_fields() - self.load_default_values() - self.load_concat() - self.load_combine_fields() - self.load_migrate() - self.load_lookup() - self.load_redirect_lookup() - self.load_filter() - - def file_reader(self, filename): - # read a file from the pipeline path, ignore if missing - path = os.path.join(self.path, filename) - if not os.path.isfile(path): - return [] - logging.debug(f"load {path}") - return csv.DictReader(open(path)) - - def reader(self, filename): - 
for row in self.file_reader(filename): - row["dataset"] = row.get("dataset", "") or row.get("pipeline", "") - if row["dataset"] and row["dataset"] != self.name: - continue - yield row - - def load_column(self): - for row in self.reader("column.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.column.setdefault(resource, {}) - elif endpoint: - record = self.column.setdefault(endpoint, {}) - else: - record = self.column.setdefault("", {}) - - # migrate column.csv - row["column"] = row.get("column", "") or row["pattern"] - row["field"] = row.get("field", "") or row["value"] - - record[normalise(row["column"])] = row["field"] - - def load_filter(self): - for row in self.reader("filter.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.filter.setdefault(resource, {}) - elif endpoint: - record = self.filter.setdefault(endpoint, {}) - else: - record = self.filter.setdefault("", {}) - - record[row["field"]] = row["pattern"] - - def load_skip_patterns(self): - for row in self.reader("skip.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.skip_pattern.setdefault(resource, []) - elif endpoint: - record = self.skip_pattern.setdefault(endpoint, []) - else: - record = self.skip_pattern.setdefault("", []) - - record.append(row["pattern"]) - - def load_patch(self): - for row in self.reader("patch.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.patch.setdefault(resource, {}) - elif endpoint: - record = self.patch.setdefault(endpoint, {}) - else: - record = self.patch.setdefault("", {}) - - row["field"] = row.get("field", "") - row["pattern"] = row.get("pattern", "") - - record = record.setdefault(row["field"], {}) - record[row["pattern"]] = row["value"] - - def load_default_fields(self): - # TBD: rename default-field.csv - for 
row in self.reader("default.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.default_field.setdefault(resource, {}) - elif endpoint: - record = self.default_field.setdefault(endpoint, {}) - else: - record = self.default_field.setdefault("", {}) - - record[row["field"]] = row["default-field"] - - def load_default_values(self): - for row in self.reader("default-value.csv"): - record = self.default_value.setdefault(row.get("endpoint", ""), {}) - record[row["field"]] = row["value"] - - def load_combine_fields(self): - for row in self.reader("combine.csv"): - record = self.combine_field.setdefault(row.get("endpoint", ""), {}) - record[row["field"]] = row["separator"] - - def load_concat(self): - for row in self.reader("concat.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.concat.setdefault(resource, {}) - elif endpoint: - record = self.concat.setdefault(endpoint, {}) - else: - record = self.concat.setdefault("", {}) - - # record = self.concat.setdefault(row["resource"], {}) - record[row["field"]] = { - "fields": row["fields"].split(";"), - "separator": row["separator"], - "prepend": row.get("prepend", ""), - "append": row.get("append", ""), - } - - # TBD: remove this table, should come from specification replacement-field - def load_migrate(self): - for row in self.reader("transform.csv"): - if row["replacement-field"] == "": - continue - - if row["replacement-field"] in self.migrate: - raise ValueError( - "replacement-field %s has more than one entry" - % row["replacement-field"] - ) - - self.migrate[row["replacement-field"]] = row["field"] - - def load_lookup(self): - for row in self.file_reader("lookup.csv"): - # migrate old lookup.csv files - entry_number = row.get("entry-number", "") - prefix = ( - row.get("prefix", "") - or row.get("dataset", "") - or row.get("pipeline", "") - ) - reference = row.get("reference", "") or 
row.get("value", "") - - # composite key, ordered by specificity - resource_lookup = self.lookup.setdefault(row.get("resource", ""), {}) - resource_lookup[ - lookup_key( - entry_number=entry_number, - prefix=prefix, - reference=reference, - ) - ] = row["entity"] - - organisation = row.get("organisation", "") - # replace local-authority-eng while we migrate - organisation = organisation.replace( - "local-authority-eng", "local-authority" - ) - resource_lookup[ - lookup_key( - prefix=prefix, - reference=reference, - organisation=organisation, - ) - ] = row["entity"] - - def load_redirect_lookup(self): - for row in self.file_reader("old-entity.csv"): - old_entity = row.get("old-entity", "") - entity = row.get("entity", "") - status = row.get("status", "") - if old_entity and status: - self.redirect_lookup[old_entity] = {"entity": entity, "status": status} - - def filters(self, resource="", endpoints=[]): - d = self.filter.get("", {}).copy() - - for endpoint in endpoints: - endpoint_filters = self.filter.get(endpoint, {}) - d.update(endpoint_filters) - - if resource: - resource_filters = self.filter.get(resource, {}) - d.update(resource_filters) - - return d - - def columns(self, resource="", endpoints=[]): - general_columns = self.column.get("", {}) - if not resource: - return general_columns - - resource_columns = self.column.get(resource, {}) - endpoint_columns = {} - for endpoint in endpoints: - endpoint_columns = {**endpoint_columns, **self.column.get(endpoint, {})} - - result = {**endpoint_columns, **resource_columns} - - for key in general_columns: - if key in result: - continue - if ( - general_columns[key] in endpoint_columns.values() - or general_columns[key] in resource_columns.values() - ): - continue - result[key] = general_columns[key] - return result - - def skip_patterns(self, resource="", endpoints=[]): - if not resource: - return self.skip_pattern.get("", {}) - endpoint_patterns = [] - for endpoint in endpoints: - 
endpoint_patterns.extend(self.skip_pattern.get(endpoint, [])) - - return ( - self.skip_pattern.get(resource, []) - + self.skip_pattern.get("", []) - + endpoint_patterns - ) - - def patches(self, resource="", endpoints=[]): - general_patch = self.patch.get("", {}) - if not resource: - return general_patch - - resource_patch = self.patch.get(resource, {}) - endpoint_patch = {} - - for endpoint in endpoints: - endpoint_patch = {**endpoint_patch, **self.patch.get(endpoint, {})} - - result = {**endpoint_patch, **resource_patch} - - # Merge any remaining general defaults into the result - for field, patch in general_patch.items(): - if field not in result: - result[field] = patch - else: - result[field] = {**patch, **result[field]} - - return result - - def default_fields(self, resource=None, endpoints=[]): - config = self.default_field - - d = config.get("", {}) - - for key, value in config.get(resource, {}).items(): - d[key] = value - - for endpoint in endpoints: - for key, value in config.get(endpoint, {}).items(): - d[key] = value - return d - - def default_values(self, endpoints=None): - if endpoints is None: - endpoints = [] - config = self.default_value - d = config.get("", {}) - for endpoint in endpoints: - for key, value in config.get(endpoint, {}).items(): - d[key] = value - return d - - def combine_fields(self, endpoints=None): - if endpoints is None: - endpoints = [] - config = self.combine_field - d = config.get("", {}) - for endpoint in endpoints: - for key, value in config.get(endpoint, {}).items(): - d[key] = value - return d - - def concatenations(self, resource=None, endpoints=[]): - result = self.concat.get("", {}) - if resource: - result.update(self.concat.get(resource, {})) - - for endpoint in endpoints: - result.update(self.concat.get(endpoint, {})) - - return result - - def migrations(self): - return self.migrate - - def lookups(self, resource=None): - d = self.lookup.get("", {}) - if resource: - d.update(self.lookup.get(resource, {})) - return d - 
- def redirect_lookups(self): - return self.redirect_lookup - - def get_pipeline_callback(self): - file = os.path.join(self.path, "pipeline-callback.py") - spec = importlib.util.spec_from_file_location("pipeline-callback.py", file) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module.PipelineCallback - - @staticmethod - def compose(phases): - def add(f, g): - return lambda x: g.process(f(x)) - - return functools.reduce(add, phases, lambda phase: phase) - - def run(self, input_path, phases): - logging.debug(f"running {input_path} through {phases}") - chain = self.compose(phases) - for row in chain(input_path): - pass - - -class EntityNumGen: - def __init__(self, entity_num_state: dict = None): - if not entity_num_state: - entity_num_state = { - "range_min": 0, - "range_max": 100, - "current": 0, - } - - self.state = entity_num_state - - def next(self): - current = self.state["current"] - new_current = current + 1 - - if new_current > int(self.state["range_max"]): - new_current = int(self.state["range_min"]) - - if new_current < int(self.state["range_min"]): - new_current = int(self.state["range_min"]) - - self.state["current"] = new_current - - return new_current - - -class Lookups: - def __init__(self, directory=None) -> None: - self.directory = directory or "pipeline" - self.lookups_path = Path(directory) / "lookup.csv" - self.old_entity_path = Path(directory) / "old-entity.csv" - self.entries = [] - self.schema = Schema("lookup") - self.entity_num_gen = EntityNumGen() - - def add_entry(self, entry, is_new_entry=True): - """ - is_new_entry is an addition to allow for backward compatibility. 
- Older lookups may not be valid in accordance with the current - minimal column requirements - :param entry: - :param is_new_entry: - :return: - """ - if is_new_entry: - if not self.validate_entry(entry): - return - - self.entries.append(entry) - - def load_csv(self, lookups_path=None): - """ - load in lookups as df, not when we process pipeline but useful for other analysis - """ - lookups_path = lookups_path or self.lookups_path - reader = csv.DictReader(open(lookups_path, newline="")) - extra_fields = set(reader.fieldnames) - set(self.schema.fieldnames) - - if len(extra_fields): - raise RuntimeError( - f"{len(extra_fields)} extra fields founds in lookup.csv ({','.join(list(extra_fields))})" - ) - - for row in reader: - self.add_entry(row, is_new_entry=False) - - def get_max_entity(self, prefix, specification) -> int: - if len(self.entries) == 0: - return 0 - if not prefix: - return 0 - - dataset_prefix = specification.dataset_prefix(prefix) - try: - ret_val = max( - [ - int(entry["entity"]) - for entry in self.entries - if (entry["prefix"] == prefix or entry["prefix"] == dataset_prefix) - and (entry.get("entity", None)) - ] - ) - return ret_val - except ValueError: - return 0 - - def save_csv(self, lookups_path=None, entries=None, old_entity_path=None): - path = lookups_path or self.lookups_path - - entity_values = [] - if os.path.exists(path): - reader = csv.DictReader(open(path, newline="")) - for row in reader: - entity_values.append(row["entity"]) - - if entries is None: - entries = self.entries - - os.makedirs(os.path.dirname(path), exist_ok=True) - logging.debug("saving %s" % (path)) - f = open(path, "w", newline="") - writer = csv.DictWriter( - f, fieldnames=self.schema.fieldnames, extrasaction="ignore" - ) - writer.writeheader() - - old_entity_file_path = old_entity_path or self.old_entity_path - if os.path.exists(old_entity_file_path): - old_entity_path = self.old_entity_path - reader = csv.DictReader(open(old_entity_file_path, newline="")) - - for row 
in reader: - entity_values.append(row["old-entity"]) - entity_values.append(row["entity"]) - - new_entities = [] - get_entity = None - for idx, entry in enumerate(entries): - if not entry: - continue - else: - if not entry.get("entity"): - while True: - generated_entity = self.entity_num_gen.next() - - if generated_entity == get_entity: - print( - "There are no more entity numbers available within this dataset." - ) - break - - if get_entity is None: - get_entity = generated_entity - - if str(generated_entity) not in entity_values: - entry["entity"] = generated_entity - new_entities.append(entry) - entity_values.append(str(generated_entity)) - writer.writerow(entry) - break - else: - writer.writerow(entry) - return new_entities - - # @staticmethod - def validate_entry(self, entry) -> bool: - # ensures minimum expected fields exist and are not empty strings - expected_fields = ["prefix", "organisation", "reference"] - for field in expected_fields: - if not entry.get(field, ""): - raise ValueError(f"ERROR: expected {field} not found in lookup entry") - - if len(self.entries) > 0: - # check entry does not already exist - existing_entries = len( - [ - 1 - for item in self.entries - if item["prefix"] == entry["prefix"] - and item["organisation"] == entry["organisation"] - and item["reference"] == entry["reference"] - ] - ) - - if existing_entries > 0: - # print(f">>> ERROR: lookup already exists - {entry['organisation']} {entry['reference']}") - return False - - return True diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py index de67328af..cb038925d 100644 --- a/tests/acceptance/test_dataset_create.py +++ b/tests/acceptance/test_dataset_create.py @@ -4,7 +4,7 @@ """ import pytest - +import logging import numpy as np import pandas as pd import os @@ -27,31 +27,188 @@ def session_tmp_path(): @pytest.fixture -def input_paths(): +def cache_path(tmp_path): + cache_path = tmp_path / "var" / "cache" + cache_path.mkdir(parents=True, 
exist_ok=True) + return cache_path + + +test_geometry = "MULTIPOLYGON(((-0.49901924 53.81622,-0.5177418 53.76114,-0.4268378 53.78454,-0.49901924 53.81622)))" +transformed_1_data = { + "end_date": [np.nan] * 16, + "entity": [11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12], + "entry_date": [ + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + ], + "entry_number": [2] * 16, + "fact": [ + "abcdef1", + "abcdef2", + "abcdef3", + "abcdef4", + "abcdef5", + "abcdef6", + "abc1231", + "abc1232", + "abc1233", + "def4561", + "def4562", + "def4563", + "a1b2c31", + "a1b2c32", + "a1b2c33", + "a1b2c34", + ], + "field": [ + "entry-date", + "geometry", + "point", + "document-url", + "organisation", + "entry-date", + "geometry", + "organisation", + "entry-date", + "geometry", + "organisation", + "entry-date", + "geomtry", + "document-url", + "notes-checking", + "organisation", + ], + "priority": [2] * 16, + "reference_entity": [np.nan] * 16, + "resource": [ + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "yxwvut", + "yxwvut", + "zyxwvu", + "xwvuts", + "xwvuts", + "zyxwvu", + "wvutsr", + "wvutsr", + "wvutsr", + "wvutsr", + ], + "start_date": [np.nan] * 16, + "value": [ + "2023-01-01", + f"{test_geometry}", + '"POINT(-0.481 53.788)"', + "https://www.test.xyz", + "organisation:AAA", + "2023-01-01", + f"{test_geometry}", + "local-authority:BBB", + "2023-01-01", + f"{test_geometry}", + "local-authority:CCC", + "2023-01-01", + f"{test_geometry}", + "https://www.testing.yyz", + "Something random", + "local-authority:DDD", + ], +} + + +@pytest.fixture +def input_paths(cache_path): + data_dicts = {"resource_1": transformed_1_data} input_paths = [] - directory = f"tests/data/{test_collection}/transformed/{test_dataset}/" - for root, dirs, files in 
os.walk(directory): - for file in files: - full_path = os.path.join(root, file) - input_paths.append(full_path) + directory = cache_path / "transformed_parquet" / "conservation-area" + directory.mkdir(parents=True, exist_ok=True) + + for path, data in data_dicts.items(): + data = pd.DataFrame.from_dict(data) + input_path = directory / f"{path}.parquet" + data.to_parquet(input_path, index=False) + logging.error(str(input_path)) + input_paths.append(str(input_path)) return input_paths @pytest.fixture -def organisation_path(): +def organisation_path(tmp_path): """ build an organisations dataset to use """ - orgs_path = f"tests/data/{test_collection}/organisation.csv" + org_data = { + "entity": [101, 102], + "name": ["test", "test_2"], + "prefix": ["local-authority", "local-authority"], + "reference": ["test", "test_2"], + "dataset": ["local-authority", "local-authority"], + "organisation": ["local-authority:test", "local-authority:test_2"], + } + orgs_path = tmp_path / "organisation.csv" + + pd.DataFrame.from_dict(org_data).to_csv(orgs_path, index=False) return orgs_path @pytest.fixture -def cache_path(session_tmp_path): - cache_path = session_tmp_path / "var" / "cache" - os.makedirs(cache_path, exist_ok=True) - return cache_path +def column_field_path(tmp_path): + column_field_dir = tmp_path / "column-field" + dataset_cfd = column_field_dir / "conservation-area" + (dataset_cfd).mkdir(parents=True, exist_ok=True) + data = { + "end_date": [""], + "entry_date": [""], + "field": ["geometry"], + "dataset": ["conservation-area"], + "start_date": [""], + "resource": [""], + "column": ["WKT"], + } + pd.DataFrame.from_dict(data).to_csv(dataset_cfd / "resource_1.csv", index=False) + logging.error(str(dataset_cfd / "resource_1.csv")) + return column_field_dir + + +@pytest.fixture +def dataset_resource_path(tmp_path): + dataset_resource_path = tmp_path / "dataset-resource" + dataset_drd = dataset_resource_path / "conservation-area" + dataset_drd.mkdir(parents=True, 
exist_ok=True) + data = { + "end_date": [""], + "entry_date": [""], + "dataset": ["conservation-area"], + "entity_count": [""], + "entry_count": [1], + "line_count": [1], + "mime_type": [""], + "internal_path": [""], + "internal_mime_type": [""], + "resource": ["resource_1"], + "start_date": [""], + } + pd.DataFrame.from_dict(data).to_csv(dataset_drd / "resource_1.csv", index=False) + return dataset_resource_path @pytest.fixture @@ -85,6 +242,8 @@ def test_acceptance_dataset_create( cache_path, dataset_dir, resource_path, + column_field_path, + dataset_resource_path, ): output_path = dataset_dir / f"{test_dataset}.sqlite3" @@ -102,9 +261,9 @@ def test_acceptance_dataset_create( "--organisation-path", str(organisation_path), "--column-field-dir", - str(f"tests/data/{test_collection}/var/column-field"), + str(column_field_path), "--dataset-resource-dir", - str(f"tests/data/{test_collection}/var/dataset-resource"), + str(dataset_resource_path), "--issue-dir", str(issue_dir), "--cache-dir", @@ -125,13 +284,29 @@ def test_acceptance_dataset_create( print("Command error output:") print(result.exception) + files = [ + str(f.name) + for f in ( + cache_path / "conservation-area" / "dataset=conservation-area" + ).iterdir() + ] + logging.error(files) + for file in ["entity.parquet", "fact.parquet", "fact_resource.parquet"]: + assert file in files, f"file {file} not created. 
files found {', '.join(files)}" assert result.exit_code == 0, "error returned when building dataset" - pq_cache = os.path.join(cache_path, test_dataset) - pq_files = [file for file in os.listdir(pq_cache) if file.endswith(".parquet")] - assert len(pq_files) == 3, "Not all parquet files created" - assert np.all( - np.sort(pq_files) == ["entity.parquet", "fact.parquet", "fact_resource.parquet"] - ), "parquet file names not correct" + + # check that parquet files have been created correctlly in the cache directory + # may want to adjust this for how we structure a parquet package in the future + # also we are using the cache to store this for now but in the future we may want to store it in a specific directory + files = [ + str(f.name) + for f in ( + cache_path / "conservation-area" / "dataset=conservation-area" + ).iterdir() + ] + + for file in ["entity.parquet", "fact.parquet", "fact_resource.parquet"]: + assert file in files, f"file {file} not created. files found {', '.join(files)}" # Check the sqlite file was created assert os.path.exists(output_path), f"sqlite file {output_path} does not exists" @@ -149,8 +324,19 @@ def test_acceptance_dataset_create( ), f"Missing following tables in sqlite database: {missing_tables}" for table in list(expected_tables): - pq_rows = len(pd.read_parquet(f"{pq_cache}/{table}.parquet")) + + pq_rows = len( + pd.read_parquet( + cache_path + / "conservation-area" + / "dataset=conservation-area" + / f"{table}.parquet" + ) + ) + + assert pq_rows > 0, f"parquet file {table} is empty" sql_rows = cursor.execute(f"SELECT COUNT(*) FROM {table};").fetchone()[0] + assert sql_rows > 0, f"database table {table} is empty" assert ( pq_rows == sql_rows ), f"Different rows between the parquet files and database table for {table}" From 60ee6fe6d693f6cf7ac75219b217d5b480467167 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:11:00 +0000 Subject: [PATCH 10/51] ensure no error if all orgs 
are blank --- digital_land/package/dataset_parquet.py | 11 ++++++- tests/acceptance/test_dataset_create.py | 27 +++++++++++++++- .../package/test_dataset_parquet.py | 32 ++++++++++++++++++- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index f97b9a37a..6ed50b23b 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -218,6 +218,9 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat if field not in (distinct_fields + extra_fields) ] + if "organisation" not in distinct_fields: + null_fields.append("organisation") + # select fields - a list of fields which have to be selected directly from the pivoted table # these are entity fields that are not null fields or a few special ones extra_fields = [ @@ -235,6 +238,12 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat fields_to_include = ["entity", "field", "value"] fields_str = ", ".join(fields_to_include) + # create this statement to add a nul org column, this is needed when no entities have an associated organisation + if "organisation" not in distinct_fields: + optional_org_str = ",''::VARCHAR AS \"organisation\"" + else: + optional_org_str = "" + # Take original data, group by entity & field, and order by highest priority then latest record. 
# If there are still matches then pick the first resource (and fact, just to make sure) # changes to make @@ -248,7 +257,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # query to create the file query = f""" - SELECT {fields_str} FROM ( + SELECT {fields_str}{optional_org_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date FROM parquet_scan('{transformed_parquet_dir}/*.parquet') tf LEFT JOIN read_csv_auto('{resource_path}', max_line_size=40000000) resource_csv diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py index cb038925d..1508db528 100644 --- a/tests/acceptance/test_dataset_create.py +++ b/tests/acceptance/test_dataset_create.py @@ -290,7 +290,6 @@ def test_acceptance_dataset_create( cache_path / "conservation-area" / "dataset=conservation-area" ).iterdir() ] - logging.error(files) for file in ["entity.parquet", "fact.parquet", "fact_resource.parquet"]: assert file in files, f"file {file} not created. files found {', '.join(files)}" assert result.exit_code == 0, "error returned when building dataset" @@ -340,3 +339,29 @@ def test_acceptance_dataset_create( assert ( pq_rows == sql_rows ), f"Different rows between the parquet files and database table for {table}" + + # entity table specific tests to check how we expect the data to be used + + # json field checks + # where no json value is present we expect the value to be null. 
not blank or an empty json bracket + # so will ensure these aren't in the results of any test + sql = """ + SELECT * + FROM entity + WHERE json = '{}' + ;""" + + results = cursor.execute(sql).fetchall() + assert ( + len(results) == 0 + ), "there should be no rows where json is an empty json bracket" + + # check no json values are arrays + sql = """ + SELECT * + FROM entity + WHERE json_type(json) NOT IN ('object', NULL) + ;""" + + results = cursor.execute(sql).fetchall() + assert len(results) == 0, "all json values should be objects or null" diff --git a/tests/integration/package/test_dataset_parquet.py b/tests/integration/package/test_dataset_parquet.py index 7318b979e..0f0042958 100644 --- a/tests/integration/package/test_dataset_parquet.py +++ b/tests/integration/package/test_dataset_parquet.py @@ -612,7 +612,37 @@ def test_load_fact_resource_empty_file_with_another(data, expected, tmp_path): @pytest.mark.parametrize( "data,expected_count,expected_props", - [(transformed_1_data, 2, {11: {"end_date": ""}})], + # need to buid an example where organisation is blank + [ + (transformed_1_data, 2, {11: {"end_date": ""}}), + ( + { + "end_date": [np.nan], # 19 records + "entity": [ + 110, + ], + "entry_date": [ + "2023-01-01", + ], + "entry_number": [2], + "fact": [ + "badcfe1", + ], + "field": [ + "entry-date", + ], + "priority": [2], + "reference_entity": [np.nan], # 19 records + "resource": [ + "zyx123", + ], + "start_date": [np.nan], # 19 records + "value": ["2023-01-01"], + }, + 1, + {}, + ), + ], ) def test_load_entities_single_file( data, expected_count, expected_props, tmp_path, org_path, resource_path From 792b9a64e155869813664fb1d2c5b7f1cea21ed4 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sat, 11 Jan 2025 00:37:01 +0000 Subject: [PATCH 11/51] add converted resources to the cache directory --- digital_land/commands.py | 12 ++++++++---- digital_land/phase/convert.py | 18 ++++++++++-------- 2 files changed, 18 
insertions(+), 12 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 1fd065bcb..010c42c38 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -162,17 +162,17 @@ def collection_retire_endpoints_and_sources( # # pipeline commands # -def convert(input_path, output_path, custom_temp_dir=None): +def convert(input_path, output_path): if not output_path: output_path = default_output_path("converted", input_path) dataset_resource_log = DatasetResourceLog() converted_resource_log = ConvertedResourceLog() + # TBD this actualy duplictaes the data and does nothing else, should just convert it? run_pipeline( ConvertPhase( input_path, dataset_resource_log=dataset_resource_log, converted_resource_log=converted_resource_log, - custom_temp_dir=custom_temp_dir, ), DumpPhase(output_path), ) @@ -191,10 +191,11 @@ def pipeline_run( operational_issue_dir="performance/operational_issue/", organisation_path=None, save_harmonised=False, + # TBD save all logs in a log directory, this will mean only one path passed in. 
column_field_dir=None, dataset_resource_dir=None, converted_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" + cache_dir="var/cache", endpoints=[], organisations=[], entry_date="", @@ -202,6 +203,9 @@ def pipeline_run( resource=None, output_log_dir=None, ): + # set up paths + cache_dir = Path(cache_dir) + if resource is None: resource = resource_from_path(input_path) dataset = dataset @@ -265,7 +269,7 @@ def pipeline_run( path=input_path, dataset_resource_log=dataset_resource_log, converted_resource_log=converted_resource_log, - custom_temp_dir=custom_temp_dir, + output_path=cache_dir / "converted_resources" / dataset / f"{resource}.csv", ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 0f411f3b1..f0964e03f 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -187,20 +187,21 @@ def __init__( path=None, dataset_resource_log=None, converted_resource_log=None, - custom_temp_dir=None, output_path=None, ): + """ + given a fie/filepath will aim to convert it to a csv and return the path to a csv, if the file is aready a csv + + Args: + path (str): Path to the shapefile or geojson + dataset_resource_log (DatasetResourceLog): DatasetResourceLog object + converted_resource_log (ConvertedResourceLog): ConvertedResourceLog object + output_path (str): Optional output path for the converted csv + """ self.path = path self.dataset_resource_log = dataset_resource_log self.converted_resource_log = converted_resource_log self.charset = "" - # Allows for custom temporary directory to be specified - # This allows symlink creation in case of /tmp & path being on different partitions - if custom_temp_dir: - self.temp_file_extra_kwargs = {"dir": custom_temp_dir} - else: - self.temp_file_extra_kwargs = {} - self.output_path = output_path if output_path: output_dir = os.path.dirname(output_path) @@ -247,6 +248,7 @@ def 
process(self, stream=None): return Stream(input_path, f=iter(()), log=self.dataset_resource_log) + # should this be a method and not a function? I think we re-factor it into a function let's remove references to self def _read_text_file(self, input_path, encoding): f = read_csv(input_path, encoding) self.dataset_resource_log.mime_type = "text/csv" + self.charset From a0a445e5872523986dfe438a4dff9c7a580fb2a1 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sat, 11 Jan 2025 10:51:42 +0000 Subject: [PATCH 12/51] change to cache directory --- digital_land/cli.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index 0a08af6a6..5a3ab4c00 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -202,7 +202,9 @@ def dataset_dump_flattened_cmd(ctx, input_path, output_path): @click.option("--endpoints", help="list of endpoint hashes", default="") @click.option("--organisations", help="list of organisations", default="") @click.option("--entry-date", help="default entry-date value", default="") -@click.option("--custom-temp-dir", help="default temporary directory", default=None) +@click.option( + "--cache-dir", help="cache directory to store conveted files etc. 
in", default=None +) @click.option("--config-path", help="Path to a configuration sqlite", default=None) @click.option( "--resource", @@ -232,7 +234,7 @@ def pipeline_command( endpoints, organisations, entry_date, - custom_temp_dir, + cache_dir, collection_dir, operational_issue_dir, config_path, @@ -263,7 +265,7 @@ def pipeline_command( endpoints=endpoints, organisations=organisations, entry_date=entry_date, - custom_temp_dir=custom_temp_dir, + cache_dir=cache_dir, config_path=config_path, resource=resource, output_log_dir=output_log_dir, From b1d08c787526cafec1abb0f03564bccb1fe70398 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sat, 11 Jan 2025 11:32:46 +0000 Subject: [PATCH 13/51] change to cache directory default --- digital_land/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index 5a3ab4c00..0f51a1655 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -203,7 +203,9 @@ def dataset_dump_flattened_cmd(ctx, input_path, output_path): @click.option("--organisations", help="list of organisations", default="") @click.option("--entry-date", help="default entry-date value", default="") @click.option( - "--cache-dir", help="cache directory to store conveted files etc. in", default=None + "--cache-dir", + help="cache directory to store conveted files etc. 
in", + default="var/cache", ) @click.option("--config-path", help="Path to a configuration sqlite", default=None) @click.option( From 175f4655e74d9f2a880ed33b9f10a745119460d9 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sun, 12 Jan 2025 13:01:57 +0000 Subject: [PATCH 14/51] use output path in other function --- digital_land/phase/convert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index f0964e03f..d27fc5460 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -363,8 +363,9 @@ def _read_binary_file(self, input_path): if internal_path: self.dataset_resource_log.internal_path = internal_path self.dataset_resource_log.internal_mime_type = mime_type + # TODO erpace temp path with output path temp_path = tempfile.NamedTemporaryFile( - suffix=".zip", **self.temp_file_extra_kwargs + suffix=".zip", dir=self.output_path.parent ).name os.link(input_path, temp_path) zip_path = f"/vsizip/{temp_path}{internal_path}" From cbb5eec06ea1b6e18c966b052563326d73696157 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Tue, 14 Jan 2025 10:12:05 +0000 Subject: [PATCH 15/51] use a specific path for duck.db database to help with overflow --- digital_land/commands.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index 010c42c38..22da688a9 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -402,6 +402,7 @@ def dataset_create( organisation=organisation, path=output_path, specification_dir=None, # TBD: package should use this specification object + duckdb_path=cache_dir / "overflow.duckdb", ) package.create() for path in input_paths: From f56cf08d97300f410921fca694e51673862a7dda Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:37:23 +0000 
Subject: [PATCH 16/51] make query entity range specific --- digital_land/package/dataset_parquet.py | 92 +++++++++++++++++++++++-- digital_land/pipeline/process.py | 1 + 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 6ed50b23b..9f4e3ff6f 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -171,11 +171,51 @@ def load_fact_resource(self, transformed_parquet_dir): """ ) - def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): - output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" - output_path.parent.mkdir(parents=True, exist_ok=True) - # get the other paths - # fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" + # def combine_parquet_files(input_path,output_path): + # """ + # This method combines multiple parquet files into a single parquet file + # """ + # # check input path is a directory using Path + # if not Path(input_path).is_dir(): + # raise ValueError("Input path must be a directory") + + # # check output_path is a file that doesn't exist + # if not Path(output_path).is_file(): + # raise ValueError("Output path must be a file") + + # # use self.conn to use duckdb to combine files + # sql = f""" + # COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); + # """ + # self.conn.execute(sql) + + # # Combine all the parquet files into a single parquet file + # combined_df = pd.concat( + # [pd.read_parquet(f"{input_path}/{file}") for file in parquet_files] + # ) + + # # Save the combined dataframe to a parquet file + # combined_df.to_parquet(output_path, index=False) + + def load_entities_range( + self, + transformed_parquet_dir, + resource_path, + organisation_path, + output_path, + entity_range=None, + ): + # figure out which resources we actually need to do expensive queries on, store in parquet + # 
sql = f""" + # COPY( + # SELECT DISTINCT resource + # FROM parquet_scan('{transformed_parquet_dir}/*.parquet') + # QUALIFY ROW_NUMBER() OVER ( + # PARTITION BY enttity,field + # ORDER BY prioity, enttry_date DESC, entry_number DESC, resource, fact + # ) = 1 + # ) TO '{self.cache_path / 'duckdb_temp_files' / 'distinct_resource.parquet'}' (FORMAT PARQUET); + # """ logging.info(f"loading entities from {transformed_parquet_dir}") @@ -256,12 +296,20 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # query to sort the final table # query to create the file + # craft a where clause to limit entities in quetion, this chunking helps solve memory issues + if entity_range is not None: + entity_where_clause = ( + f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" + ) + else: + entity_where_clause = "" query = f""" SELECT {fields_str}{optional_org_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date FROM parquet_scan('{transformed_parquet_dir}/*.parquet') tf LEFT JOIN read_csv_auto('{resource_path}', max_line_size=40000000) resource_csv ON tf.resource = resource_csv.resource + {entity_where_clause} QUALIFY ROW_NUMBER() OVER ( PARTITION BY entity, field ORDER BY priority, entry_date DESC, entry_number DESC, resource_end_date DESC, tf.resource, fact @@ -336,6 +384,40 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # might need to un some fetch all toget result back self.conn.execute(sql) + def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): + output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) + + # retrieve entity counnts including and minimum + min_sql = f"select MIN(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" + min_entity = self.conn.execute(min_sql).fetchone() + max_sql 
= f"select MAX(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" + max_entity = self.conn.execute(max_sql).fetchone() + total_entities = max_entity - min_entity + entity_limit = 1000000 + if total_entities > entity_limit: + logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") + _ = 0 + file_count = 1 + while _ < max_entity: + output_path = ( + self.path + / f"dataset={self.dataset}" + / f"entity_{file_count}.parquet" + ) + entity_range = [_, entity_limit] + logger.info( + f"loading entities from {entity_range[0]} to {entity_range[1]}" + ) + self.load_entities_range( + transformed_parquet_dir, + resource_path, + organisation_path, + output_path, + entity_range, + ) + _ += entity_limit + def load_to_sqlite(self, sqlite_path): """ Convert parquet files to sqlite3 tables assumes the sqlite table already exist. There is an arguement to diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 0a0fc3e61..11955ac41 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -8,6 +8,7 @@ # load in specification +# TODO need to take in the correct data types for the columns def convert_tranformed_csv_to_pq(input_path, output_path): """ function to convert a transformed resource to a parrquet file. 
From 7c04686d4fdd513a31b1ba39b5484205e31f6c87 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:45:11 +0000 Subject: [PATCH 17/51] add condition for no range --- digital_land/package/dataset_parquet.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 9f4e3ff6f..17b5f645d 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -400,7 +400,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat _ = 0 file_count = 1 while _ < max_entity: - output_path = ( + temp_output_path = ( self.path / f"dataset={self.dataset}" / f"entity_{file_count}.parquet" @@ -413,11 +413,16 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat transformed_parquet_dir, resource_path, organisation_path, - output_path, + temp_output_path, entity_range, ) _ += entity_limit + else: + self.load_entities_range( + transformed_parquet_dir, resource_path, organisation_path, output_path + ) + def load_to_sqlite(self, sqlite_path): """ Convert parquet files to sqlite3 tables assumes the sqlite table already exist. 
There is an arguement to From ccd1ddaf2c1b13c0d64f4350b1df2f7182a96be3 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:27:09 +0000 Subject: [PATCH 18/51] update arguement in wrong place --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 22da688a9..010c42c38 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -402,7 +402,6 @@ def dataset_create( organisation=organisation, path=output_path, specification_dir=None, # TBD: package should use this specification object - duckdb_path=cache_dir / "overflow.duckdb", ) package.create() for path in input_paths: From 753b40a3b77ef104ad6b3e8fe58687f40d8dbdd1 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 18:51:31 +0000 Subject: [PATCH 19/51] start range in correct place --- digital_land/commands.py | 68 +++++++++++++------------ digital_land/package/dataset_parquet.py | 3 +- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 010c42c38..641e9c8f8 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -25,7 +25,8 @@ ConvertedResourceLog, ) from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage + +# from digital_land.package.dataset import DatasetPackage from digital_land.package.dataset_parquet import DatasetParquetPackage from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase @@ -392,37 +393,37 @@ def dataset_create( sys.exit(2) # Set up initial objects - organisation = Organisation( - organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) - ) + # organisation = Organisation( + # organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) + # ) # create sqlite dataset packageas 
before and load inn data that isn't in the parquetpackage yet - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - path_obj = Path(path) - package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") - package.load_dataset_resource( - dataset_resource_dir / dataset / f"{path_obj.stem}.csv" - ) - - old_entity_path = Path(pipeline.path) / "old-entity.csv" - if old_entity_path.exists(): - package.load_old_entities(old_entity_path) - - issue_paths = issue_dir / dataset - if issue_paths.exists(): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() + # package = DatasetPackage( + # dataset, + # organisation=organisation, + # path=output_path, + # specification_dir=None, # TBD: package should use this specification object + # ) + # package.create() + # for path in input_paths: + # path_obj = Path(path) + # package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + # package.load_dataset_resource( + # dataset_resource_dir / dataset / f"{path_obj.stem}.csv" + # ) + + # old_entity_path = Path(pipeline.path) / "old-entity.csv" + # if old_entity_path.exists(): + # package.load_old_entities(old_entity_path) + + # issue_paths = issue_dir / dataset + # if issue_paths.exists(): + # for issue_path in os.listdir(issue_paths): + # package.load_issues(os.path.join(issue_paths, issue_path)) + # else: + # logging.warning("No directory for this dataset in the provided issue_directory") + + # package.add_counts() # Repeat for parquet # Set up cache directory to store parquet files. 
The sqlite files created from this will be saved in the dataset @@ -433,12 +434,13 @@ def dataset_create( dataset, path=dataset_parquet_path, specification_dir=None, # TBD: package should use this specification object + duckdb_path=cache_dir / "overflow.duckdb", ) # pqpackage.create_temp_table(input_paths) - pqpackage.load_facts(transformed_parquet_dir) - pqpackage.load_fact_resource(transformed_parquet_dir) + # pqpackage.load_facts(transformed_parquet_dir) + # pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) - pqpackage.load_to_sqlite(output_path) + # pqpackage.load_to_sqlite(output_path) def dataset_dump(input_path, output_path): diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 17b5f645d..94a559070 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -227,6 +227,7 @@ def load_entities_range( query = f""" SELECT DISTINCT REPLACE(field,'-','_') FROM parquet_scan('{transformed_parquet_dir}/*.parquet') + WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]} """ # distinct_fields - list of fields in the field in fact @@ -397,7 +398,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat entity_limit = 1000000 if total_entities > entity_limit: logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") - _ = 0 + _ = min_entity file_count = 1 while _ < max_entity: temp_output_path = ( From 25f7618c9baa4617fb0d266be87f2747e0820866 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 19:09:15 +0000 Subject: [PATCH 20/51] get value from tuple --- digital_land/package/dataset_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 94a559070..ff3a8b5b4 100644 --- 
a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -391,9 +391,9 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # retrieve entity counnts including and minimum min_sql = f"select MIN(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" - min_entity = self.conn.execute(min_sql).fetchone() + min_entity = self.conn.execute(min_sql).fetchone()[0] max_sql = f"select MAX(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" - max_entity = self.conn.execute(max_sql).fetchone() + max_entity = self.conn.execute(max_sql).fetchone()[0] total_entities = max_entity - min_entity entity_limit = 1000000 if total_entities > entity_limit: From 1b8f7a4c62ba942397c73a356e00f9f7cc3505f7 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 19:12:49 +0000 Subject: [PATCH 21/51] correct max in for loop --- digital_land/package/dataset_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index ff3a8b5b4..4b2e53e2a 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -406,7 +406,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat / f"dataset={self.dataset}" / f"entity_{file_count}.parquet" ) - entity_range = [_, entity_limit] + entity_range = [_, _ + entity_limit] logger.info( f"loading entities from {entity_range[0]} to {entity_range[1]}" ) From 73835e988c1c25419fadd43b72f952c36c49ad5b Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 08:17:34 +0000 Subject: [PATCH 22/51] iterate file count --- digital_land/package/dataset_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 
4b2e53e2a..b1b070558 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -418,6 +418,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat entity_range, ) _ += entity_limit + file_count += 1 else: self.load_entities_range( From 8938a25135cd2dad9fd71498716efb4940ee8e5f Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 08:56:33 +0000 Subject: [PATCH 23/51] build a single file --- digital_land/package/dataset_parquet.py | 35 +++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index b1b070558..d19e553a7 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -385,6 +385,24 @@ def load_entities_range( # might need to un some fetch all toget result back self.conn.execute(sql) + def combine_parquet_files(self, input_path, output_path): + """ + This method combines multiple parquet files into a single parquet file + """ + # check input path is a directory using Path + if not Path(input_path).is_dir(): + raise ValueError("Input path must be a directory") + + # check output_path is a file that doesn't exist + if not Path(output_path).is_file(): + raise ValueError("Output path must be a file") + + # use self.conn to use duckdb to combine files + sql = f""" + COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); + """ + self.conn.execute(sql) + def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" output_path.parent.mkdir(parents=True, exist_ok=True) @@ -397,15 +415,18 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat total_entities = max_entity - min_entity entity_limit = 1000000 if total_entities > 
entity_limit: + # create a temparary output path to store separate entity file in + temp_dir = ( + self.cache_dir + / "temp_parquet_files" + / "title-boundaries" + / "entity_files" + ) logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") _ = min_entity file_count = 1 while _ < max_entity: - temp_output_path = ( - self.path - / f"dataset={self.dataset}" - / f"entity_{file_count}.parquet" - ) + temp_output_path = temp_dir / f"entity_{file_count}.parquet" entity_range = [_, _ + entity_limit] logger.info( f"loading entities from {entity_range[0]} to {entity_range[1]}" @@ -419,7 +440,11 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat ) _ += entity_limit file_count += 1 + # combine all the parquet files into a single parquet file + self.combine_parquet_files(temp_dir, output_path) + # remove temporary files + temp_dir.rmdir() else: self.load_entities_range( transformed_parquet_dir, resource_path, organisation_path, output_path From 0f590eb1ff9b58927ccaa811b44d47c3893eed94 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:11:03 +0000 Subject: [PATCH 24/51] remove cache_dir thats not provided --- digital_land/package/dataset_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index d19e553a7..85829d58f 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -417,7 +417,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat if total_entities > entity_limit: # create a temparary output path to store separate entity file in temp_dir = ( - self.cache_dir + output_path.parent / "temp_parquet_files" / "title-boundaries" / "entity_files" From e7a3f7f3a83154f07b418318032509a40285b020 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: 
Thu, 16 Jan 2025 09:16:26 +0000 Subject: [PATCH 25/51] make the right directory --- digital_land/package/dataset_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 85829d58f..363ee2373 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -422,6 +422,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat / "title-boundaries" / "entity_files" ) + temp_dir.mkdir(parents=True, exist_ok=True) logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") _ = min_entity file_count = 1 From f0722c5d3c23ecd6ecf84935efb9a0d3c759f098 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:34:39 +0000 Subject: [PATCH 26/51] need remove whole tree --- digital_land/package/dataset_parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 363ee2373..e9c5c4893 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -1,6 +1,7 @@ import os import logging import duckdb +import shutil from pathlib import Path from .package import Package @@ -445,7 +446,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat self.combine_parquet_files(temp_dir, output_path) # remove temporary files - temp_dir.rmdir() + shutil.rmtree(temp_dir) else: self.load_entities_range( transformed_parquet_dir, resource_path, organisation_path, output_path From b89079a113903f6d50531bfcab0153318f608c3b Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 10:15:17 +0000 Subject: [PATCH 27/51] run full command --- digital_land/commands.py | 67 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 34 
deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 641e9c8f8..bc91a4654 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -26,7 +26,7 @@ ) from digital_land.organisation import Organisation -# from digital_land.package.dataset import DatasetPackage +from digital_land.package.dataset import DatasetPackage from digital_land.package.dataset_parquet import DatasetParquetPackage from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase @@ -393,37 +393,37 @@ def dataset_create( sys.exit(2) # Set up initial objects - # organisation = Organisation( - # organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) - # ) + organisation = Organisation( + organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) + ) # create sqlite dataset packageas before and load inn data that isn't in the parquetpackage yet - # package = DatasetPackage( - # dataset, - # organisation=organisation, - # path=output_path, - # specification_dir=None, # TBD: package should use this specification object - # ) - # package.create() - # for path in input_paths: - # path_obj = Path(path) - # package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") - # package.load_dataset_resource( - # dataset_resource_dir / dataset / f"{path_obj.stem}.csv" - # ) - - # old_entity_path = Path(pipeline.path) / "old-entity.csv" - # if old_entity_path.exists(): - # package.load_old_entities(old_entity_path) - - # issue_paths = issue_dir / dataset - # if issue_paths.exists(): - # for issue_path in os.listdir(issue_paths): - # package.load_issues(os.path.join(issue_paths, issue_path)) - # else: - # logging.warning("No directory for this dataset in the provided issue_directory") - - # package.add_counts() + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + 
package.create() + for path in input_paths: + path_obj = Path(path) + package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + package.load_dataset_resource( + dataset_resource_dir / dataset / f"{path_obj.stem}.csv" + ) + + old_entity_path = Path(pipeline.path) / "old-entity.csv" + if old_entity_path.exists(): + package.load_old_entities(old_entity_path) + + issue_paths = issue_dir / dataset + if issue_paths.exists(): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() # Repeat for parquet # Set up cache directory to store parquet files. The sqlite files created from this will be saved in the dataset @@ -436,11 +436,10 @@ def dataset_create( specification_dir=None, # TBD: package should use this specification object duckdb_path=cache_dir / "overflow.duckdb", ) - # pqpackage.create_temp_table(input_paths) - # pqpackage.load_facts(transformed_parquet_dir) - # pqpackage.load_fact_resource(transformed_parquet_dir) + pqpackage.load_facts(transformed_parquet_dir) + pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) - # pqpackage.load_to_sqlite(output_path) + pqpackage.load_to_sqlite(output_path) def dataset_dump(input_path, output_path): From 57feeb8a0081f3e56411e382b1c8a1ab4ad42cd6 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 11:41:39 +0000 Subject: [PATCH 28/51] change logginng level --- digital_land/commands.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index bc91a4654..76bfe0d33 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -375,6 +375,9 @@ def dataset_create( cache_dir="var/cache", resource_path="collection/resource.csv", ): + # set 
level for logging to see what's going on + logger.setLevel(logging.INFO) + # chek all paths are paths issue_dir = Path(issue_dir) column_field_dir = Path(column_field_dir) @@ -439,6 +442,8 @@ def dataset_create( pqpackage.load_facts(transformed_parquet_dir) pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) + + logger.info("parquet files created") pqpackage.load_to_sqlite(output_path) From 2917f43c92a3fc2b3c4014431aec71e34dbf9713 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:08:13 +0000 Subject: [PATCH 29/51] make parquet package prinnt info logs --- digital_land/commands.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 76bfe0d33..54a39335d 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -377,6 +377,8 @@ def dataset_create( ): # set level for logging to see what's going on logger.setLevel(logging.INFO) + dataset_parquet_logger = logging.getLogger("dataset_parquet") + dataset_parquet_logger.setLevel(logging.INFO) # chek all paths are paths issue_dir = Path(issue_dir) @@ -410,15 +412,18 @@ def dataset_create( package.create() for path in input_paths: path_obj = Path(path) + logging.info(f"loading column field log into {output_path}") package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + logging.info(f"loading dataset resource log into {output_path}") package.load_dataset_resource( dataset_resource_dir / dataset / f"{path_obj.stem}.csv" ) - + logging.info(f"loading old entities into {output_path}") old_entity_path = Path(pipeline.path) / "old-entity.csv" if old_entity_path.exists(): package.load_old_entities(old_entity_path) + logging.info(f"loading issues into {output_path}") issue_paths = issue_dir / dataset if issue_paths.exists(): for issue_path in 
os.listdir(issue_paths): @@ -426,8 +431,6 @@ def dataset_create( else: logging.warning("No directory for this dataset in the provided issue_directory") - package.add_counts() - # Repeat for parquet # Set up cache directory to store parquet files. The sqlite files created from this will be saved in the dataset if not os.path.exists(cache_dir): @@ -443,9 +446,12 @@ def dataset_create( pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) - logger.info("parquet files created") + logger.info("loading fact,fact_resource and entity into {output_path}") pqpackage.load_to_sqlite(output_path) + logger.info(f"creating dataset package {output_path} counts") + package.add_counts() + def dataset_dump(input_path, output_path): cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" From 3d3abd9d3a2c04d49633eec93226eb8facfcb686 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:19:41 +0000 Subject: [PATCH 30/51] make intgeger --- digital_land/pipeline/process.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 11955ac41..93d558616 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -21,10 +21,21 @@ def convert_tranformed_csv_to_pq(input_path, output_path): # Define the chunk size for reading the CSV file chunk_size = 1000000 # Number of rows per chunk + # expand on column names # Open a CSV reader with PyArrow # csv_reader = pv.open_csv(input_path, read_options=pv.ReadOptions(block_size=chunk_size)) csv_iterator = pd.read_csv( - input_path, chunksize=chunk_size, dtype=str, na_filter=False + input_path, + chunksize=chunk_size, + dtype={ + "entity": int, + **{ + col: str + for col in pd.read_csv(input_path, nrows=1).columns + if col != "entity" + }, + }, + na_filter=False, 
) # Initialize the Parquet writer with the schema from the first chunk From 3453840bfa7b564ac11eeaaf89ac17980daf8b03 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:24:26 +0000 Subject: [PATCH 31/51] include entity field change --- digital_land/pipeline/process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 93d558616..7ad249c52 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -45,7 +45,7 @@ def convert_tranformed_csv_to_pq(input_path, output_path): fields = [ ("end-date", pa.string()), - ("entity", pa.string()), + ("entity", pa.int64()), ("entry-date", pa.string()), ("entry-number", pa.string()), ("fact", pa.string()), From f43907deb84142e20a1afdeef1dc307e5724eba1 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:33:53 +0000 Subject: [PATCH 32/51] ensure where clause isnt used if no range is supplied --- digital_land/package/dataset_parquet.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index e9c5c4893..73bef18ca 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -224,11 +224,17 @@ def load_entities_range( # Do this to match with later field names. 
entity_fields = [e.replace("-", "_") for e in entity_fields] # input_paths_str = f"{self.cache_dir}/fact{self.suffix}" + if entity_range is not None: + entity_where_clause = ( + f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" + ) + else: + entity_where_clause = "" query = f""" SELECT DISTINCT REPLACE(field,'-','_') FROM parquet_scan('{transformed_parquet_dir}/*.parquet') - WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]} + {entity_where_clause} """ # distinct_fields - list of fields in the field in fact @@ -299,12 +305,7 @@ def load_entities_range( # query to create the file # craft a where clause to limit entities in quetion, this chunking helps solve memory issues - if entity_range is not None: - entity_where_clause = ( - f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" - ) - else: - entity_where_clause = "" + query = f""" SELECT {fields_str}{optional_org_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date From c94d4f129c515f492ccfd6411f788165ba2c92dc Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:46:25 +0000 Subject: [PATCH 33/51] use better logging --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 54a39335d..80541c796 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -377,7 +377,7 @@ def dataset_create( ): # set level for logging to see what's going on logger.setLevel(logging.INFO) - dataset_parquet_logger = logging.getLogger("dataset_parquet") + dataset_parquet_logger = logging.getLogger("digital_land.package.dataset_parquet") dataset_parquet_logger.setLevel(logging.INFO) # chek all paths are paths From 8ec07f9424189b06aa764e685e24bcab894b4cb9 Mon Sep 17 00:00:00 2001 From: eveleighoj 
<35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:57:05 +0000 Subject: [PATCH 34/51] logging tweaks --- digital_land/commands.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 80541c796..e834ffb76 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -376,9 +376,7 @@ def dataset_create( resource_path="collection/resource.csv", ): # set level for logging to see what's going on - logger.setLevel(logging.INFO) - dataset_parquet_logger = logging.getLogger("digital_land.package.dataset_parquet") - dataset_parquet_logger.setLevel(logging.INFO) + logging.basicConfig(level=logging.INFO) # chek all paths are paths issue_dir = Path(issue_dir) From bd0749d61f6d53f82b470573eacc9cccb26221b4 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 17:32:20 +0000 Subject: [PATCH 35/51] add loading loggings --- digital_land/commands.py | 3 ++- digital_land/package/dataset_parquet.py | 21 +++++++-------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index e834ffb76..ef10edc08 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -376,7 +376,8 @@ def dataset_create( resource_path="collection/resource.csv", ): # set level for logging to see what's going on - logging.basicConfig(level=logging.INFO) + logger.setLevel(logging.INFO) + logging.getLogger("digital_land.package.dataset_parquet").setLevel(logging.INFO) # chek all paths are paths issue_dir = Path(issue_dir) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 73bef18ca..93e543db3 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -218,7 +218,7 @@ def load_entities_range( # ) TO '{self.cache_path / 'duckdb_temp_files' / 'distinct_resource.parquet'}' (FORMAT PARQUET); 
# """ - logging.info(f"loading entities from {transformed_parquet_dir}") + logger.info(f"loading entities from {transformed_parquet_dir}") entity_fields = self.specification.schema["entity"]["fields"] # Do this to match with later field names. @@ -391,6 +391,7 @@ def combine_parquet_files(self, input_path, output_path): """ This method combines multiple parquet files into a single parquet file """ + logger.info(f"combining parquet files from {input_path} into {output_path}") # check input path is a directory using Path if not Path(input_path).is_dir(): raise ValueError("Input path must be a directory") @@ -461,7 +462,7 @@ def load_to_sqlite(self, sqlite_path): # At present we are saving the parquet files in 'cache' but saving the sqlite files produced in 'dataset' # In future when parquet files are saved to 'dataset' remove the 'cache_dir' in the function arguments and # replace 'cache_dir' with 'output_path' in this function's code - logging.info( + logger.info( f"loading sqlite3 tables in {sqlite_path} from parquet files in {self.path}" ) # migrate to connection creation @@ -477,6 +478,8 @@ def load_to_sqlite(self, sqlite_path): fields_str = ", ".join( [field.replace("-", "_") for field in fact_resource_fields] ) + + logger.info("loading fact_resource data") # insert fact_resource data self.conn.execute( f""" @@ -485,6 +488,7 @@ def load_to_sqlite(self, sqlite_path): """ ) + logger.info("loading fact data") # insert fact data fact_fields = self.specification.schema["fact"]["fields"] fields_str = ", ".join([field.replace("-", "_") for field in fact_fields]) @@ -496,6 +500,7 @@ def load_to_sqlite(self, sqlite_path): """ ) + logger.info("loading entity data") # insert entity data entity_fields = self.specification.schema["entity"]["fields"] fields_str = ", ".join( @@ -512,18 +517,6 @@ def load_to_sqlite(self, sqlite_path): """ ) - # Fix the column names - # for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): - # if "-" in column[0]: - # 
self.conn.execute( - # f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" - # ) - - # Copy the data - # self.conn.execute( - # f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" - # ) - self.conn.execute("DETACH DATABASE sqlite_db;") def close_conn(self): From ba48403422770dbf73f0468e59ed728b3a7233d7 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 17:58:47 +0000 Subject: [PATCH 36/51] remove indexes before loading --- digital_land/commands.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index ef10edc08..5be9bc215 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -408,7 +408,9 @@ def dataset_create( path=output_path, specification_dir=None, # TBD: package should use this specification object ) - package.create() + # don'tt use create as we don't want to create the indexes + package.create_database() + package.disconnect() for path in input_paths: path_obj = Path(path) logging.info(f"loading column field log into {output_path}") @@ -448,6 +450,11 @@ def dataset_create( logger.info("loading fact,fact_resource and entity into {output_path}") pqpackage.load_to_sqlite(output_path) + logger.info(f"add indexes to {output_path}") + package.connect() + package.create_indexes() + package.disconnect() + logger.info(f"creating dataset package {output_path} counts") package.add_counts() From 0bf17f2d316c22c51d23ddbb11e6b1e7b136053d Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 18:04:07 +0000 Subject: [PATCH 37/51] add crreate cursur --- digital_land/commands.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index 5be9bc215..eabc02219 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -452,6 +452,7 @@ def 
dataset_create( logger.info(f"add indexes to {output_path}") package.connect() + package.create_cursor() package.create_indexes() package.disconnect() From 0183d9b44e4e9a3f5de80628c94ad9ec3d45a140 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 23 Jan 2025 13:59:59 +0000 Subject: [PATCH 38/51] remove error and some wrong validation --- digital_land/package/dataset_parquet.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 93e543db3..365c73ed4 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -319,7 +319,6 @@ def load_entities_range( ) = 1 ) """ - logging.error(query) pivot_query = f""" PIVOT ( @@ -392,14 +391,6 @@ def combine_parquet_files(self, input_path, output_path): This method combines multiple parquet files into a single parquet file """ logger.info(f"combining parquet files from {input_path} into {output_path}") - # check input path is a directory using Path - if not Path(input_path).is_dir(): - raise ValueError("Input path must be a directory") - - # check output_path is a file that doesn't exist - if not Path(output_path).is_file(): - raise ValueError("Output path must be a file") - # use self.conn to use duckdb to combine files sql = f""" COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); From 4a605ad6e1b814b5b69360c080775eeba59cca21 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Fri, 24 Jan 2025 13:54:42 +0000 Subject: [PATCH 39/51] chacnge package structure --- digital_land/package/dataset_parquet.py | 43 ++++---------- .../package/test_dataset_parquet.py | 56 +++++++++++++++---- 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 365c73ed4..54a62935e 100644 --- 
a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -54,11 +54,16 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.typology = self.specification.schema[dataset]["typology"] # set up key file paths - self.fact_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + self.fact_path = self.path / "fact" / f"dataset={self.dataset}" / "fact.parquet" self.fact_resource_path = ( - self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + self.path + / "fact-resource" + / f"dataset={self.dataset}" + / "fact-resource.parquet" + ) + self.entity_path = ( + self.path / "entity" / f"dataset={self.dataset}" / "entity.parquet" ) - self.entity_path = self.path / f"dataset={self.dataset}" / "entity.parquet" def get_schema(self): schema = {} @@ -124,7 +129,7 @@ def load_facts(self, transformed_parquet_dir): """ This method loads facts into a fact table from a directory containing all transformed files as parquet files """ - output_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + output_path = self.fact_path output_path.parent.mkdir(parents=True, exist_ok=True) logging.info("loading facts from temp table") @@ -151,7 +156,7 @@ def load_facts(self, transformed_parquet_dir): def load_fact_resource(self, transformed_parquet_dir): logging.info(f"loading fact resources from {str(transformed_parquet_dir)}") - output_path = self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + output_path = self.fact_resource_path output_path.parent.mkdir(parents=True, exist_ok=True) fact_resource_fields = self.specification.schema["fact-resource"]["fields"] fields_str = ", ".join( @@ -172,32 +177,6 @@ def load_fact_resource(self, transformed_parquet_dir): """ ) - # def combine_parquet_files(input_path,output_path): - # """ - # This method combines multiple parquet files into a single parquet file - # """ - # # check input path is a directory using Path - # if not Path(input_path).is_dir(): - # raise 
ValueError("Input path must be a directory") - - # # check output_path is a file that doesn't exist - # if not Path(output_path).is_file(): - # raise ValueError("Output path must be a file") - - # # use self.conn to use duckdb to combine files - # sql = f""" - # COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); - # """ - # self.conn.execute(sql) - - # # Combine all the parquet files into a single parquet file - # combined_df = pd.concat( - # [pd.read_parquet(f"{input_path}/{file}") for file in parquet_files] - # ) - - # # Save the combined dataframe to a parquet file - # combined_df.to_parquet(output_path, index=False) - def load_entities_range( self, transformed_parquet_dir, @@ -398,7 +377,7 @@ def combine_parquet_files(self, input_path, output_path): self.conn.execute(sql) def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): - output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + output_path = self.entity_path output_path.parent.mkdir(parents=True, exist_ok=True) # retrieve entity counnts including and minimum diff --git a/tests/integration/package/test_dataset_parquet.py b/tests/integration/package/test_dataset_parquet.py index 0f0042958..f94244dd1 100644 --- a/tests/integration/package/test_dataset_parquet.py +++ b/tests/integration/package/test_dataset_parquet.py @@ -380,7 +380,11 @@ def test_load_facts_single_file(data: dict, expected: int, tmp_path): package.load_facts(transformed_parquet_dir=transformed_parquet_dir) output_file = ( - tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + tmp_path + / "conservation-area" + / "fact" + / "dataset=conservation-area" + / "fact.parquet" ) assert os.path.exists(output_file), "fact.parquet file does not exist" @@ -421,7 +425,11 @@ def test_load_facts_multiple_files(data1, data2, expected, tmp_path): package.load_facts(transformed_parquet_dir=transformed_parquet_dir) output_file = ( - tmp_path / 
"conservation-area" / "dataset=conservation-area" / "fact.parquet" + tmp_path + / "conservation-area" + / "fact" + / "dataset=conservation-area" + / "fact.parquet" ) assert os.path.exists(output_file), "fact.parquet file does not exist" @@ -472,7 +480,11 @@ def test_load_facts_one_file_with_empty_file(data, expected, tmp_path): package.load_facts(transformed_parquet_dir=transformed_parquet_dir) output_file = ( - tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + tmp_path + / "conservation-area" + / "fact" + / "dataset=conservation-area" + / "fact.parquet" ) assert os.path.exists(output_file), "fact.parquet file does not exist" @@ -504,8 +516,9 @@ def test_load_fact_resource_single_file(data, expected, tmp_path): output_file = ( tmp_path / "conservation-area" + / "fact-resource" / "dataset=conservation-area" - / "fact_resource.parquet" + / "fact-resource.parquet" ) assert os.path.exists(output_file), "fact-resource.parquet file does not exist" @@ -544,8 +557,9 @@ def test_load_fact_resource_two_filea(data_1, data_2, expected, tmp_path): output_file = ( tmp_path / "conservation-area" + / "fact-resource" / "dataset=conservation-area" - / "fact_resource.parquet" + / "fact-resource.parquet" ) assert os.path.exists(output_file), "fact-resource.parquet file does not exist" @@ -596,8 +610,9 @@ def test_load_fact_resource_empty_file_with_another(data, expected, tmp_path): output_file = ( tmp_path / "conservation-area" + / "fact-resource" / "dataset=conservation-area" - / "fact_resource.parquet" + / "fact-resource.parquet" ) assert os.path.exists(output_file), "fact-resource.parquet file does not exist" @@ -662,7 +677,11 @@ def test_load_entities_single_file( package.load_entities(transformed_parquet_dir, resource_path, org_path) output_file = ( - tmp_path / "conservation-area" / "dataset=conservation-area" / "entity.parquet" + tmp_path + / "conservation-area" + / "entity" + / "dataset=conservation-area" + / "entity.parquet" ) assert 
os.path.exists(output_file), "entity.parquet file does not exist" @@ -737,15 +756,32 @@ def test_load_pq_to_sqlite_basic( fact_resource_df = pd.DataFrame.from_dict(fact_resource_data) entity_df = pd.DataFrame.from_dict(entity_data) + (dataset_parquet_path / "fact" / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + (dataset_parquet_path / "fact-resource" / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + (dataset_parquet_path / "entity" / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + fact_df.to_parquet( - dataset_parquet_path / "dataset=conservation-area" / "fact.parquet", index=False + dataset_parquet_path / "fact" / "dataset=conservation-area" / "fact.parquet", + index=False, ) fact_resource_df.to_parquet( - dataset_parquet_path / "dataset=conservation-area" / "fact_resource.parquet", + dataset_parquet_path + / "fact-resource" + / "dataset=conservation-area" + / "fact-resource.parquet", index=False, ) entity_df.to_parquet( - dataset_parquet_path / "dataset=conservation-area" / "entity.parquet", + dataset_parquet_path + / "entity" + / "dataset=conservation-area" + / "entity.parquet", index=False, ) From 1b34bd19472f9588d10fead2f5c146dcaa0a5121 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Fri, 24 Jan 2025 19:28:38 +0000 Subject: [PATCH 40/51] align with proper convert changes --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 954a7f0e7..898da4c60 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -283,7 +283,7 @@ def pipeline_run( path=input_path, dataset_resource_log=dataset_resource_log, converted_resource_log=converted_resource_log, - output_path=cache_dir / "converted_resources" / dataset / f"{resource}.csv", + output_path=converted_path, ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), 
From a856ae28562c99528dfde3ee883ab176bf5c3e31 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Tue, 4 Feb 2025 17:23:19 +0000 Subject: [PATCH 41/51] ran black --- digital_land/phase/convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index d27fc5460..d0d5aa051 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -188,6 +188,7 @@ def __init__( dataset_resource_log=None, converted_resource_log=None, output_path=None, + custom_temp_dir=None, ): """ given a fie/filepath will aim to convert it to a csv and return the path to a csv, if the file is aready a csv @@ -207,6 +208,9 @@ def __init__( output_dir = os.path.dirname(output_path) if not os.path.exists(output_dir): os.makedirs(output_dir) + self.custom_temp_dir = custom_temp_dir + if custom_temp_dir: + os.makedirs(custom_temp_dir, exist_ok=True) def process(self, stream=None): input_path = self.path From cd57b17d19ca5275646f256958b2034a6e612224 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 15:57:00 +0000 Subject: [PATCH 42/51] Reran test with duckdb 1.2.0 From 50fe50efcd03d2a0b6bea2ecebf12918c1ed85fa Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 16:11:40 +0000 Subject: [PATCH 43/51] Added 'INSTALL spatial' at class initialisation --- digital_land/package/dataset_parquet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 54a62935e..0b5085164 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,6 +47,8 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) + query = "INSTALL spatial; LOAD spatial;" + self.conn.execute(query) else: self.conn = duckdb.connect() From 
3bed06981673b89c9a012fe53d93955be7186ebd Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 16:56:00 +0000 Subject: [PATCH 44/51] Remove adding spatial parts in class initialisation --- digital_land/package/dataset_parquet.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 0b5085164..54a62935e 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,8 +47,6 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) - query = "INSTALL spatial; LOAD spatial;" - self.conn.execute(query) else: self.conn = duckdb.connect() From 6771c05d0ea852b0a0c05f61399256836da6a472 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 16:56:24 +0000 Subject: [PATCH 45/51] Set verrsion for duckdb --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0f4b77b0a..6c140a941 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def get_long_description(): "dataclasses-json", "pydantic", "json-stream", - "duckdb", + "duckdb==1.1.3", "dask", "pyarrow", "pygit2", From 974876226ad4b1e36161ecff7d6468b8e019fae9 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 17:28:05 +0000 Subject: [PATCH 46/51] Adding spatial install on class initialisation and setting duckdb version --- digital_land/package/dataset_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 54a62935e..88cac0db7 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,6 +47,7 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) 
self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) + self.conn.execute("INSTALL spatial; LOAD spatial;") else: self.conn = duckdb.connect() From a29b628850b22850086ecf42f290e4a641865fdd Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 17:39:46 +0000 Subject: [PATCH 47/51] Removed duckdb version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6c140a941..0f4b77b0a 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def get_long_description(): "dataclasses-json", "pydantic", "json-stream", - "duckdb==1.1.3", + "duckdb", "dask", "pyarrow", "pygit2", From 39457df8277c8e828c9222818bcf9db8ce05b6b7 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 10:11:57 +0000 Subject: [PATCH 48/51] Removed 'INSTALL spatial' from class initialisation --- digital_land/package/dataset_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 88cac0db7..b713f1e4c 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,7 +47,7 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) - self.conn.execute("INSTALL spatial; LOAD spatial;") + # self.conn.execute("INSTALL spatial; LOAD spatial;") else: self.conn = duckdb.connect() From 901148a18deebcbe4d35ada3e4e681ad8746b284 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 17:34:26 +0000 Subject: [PATCH 49/51] Reversed prior changes to check if get same error --- digital_land/phase/convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 20fbbabc0..b92e2d900 100644 --- 
a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -188,7 +188,7 @@ def __init__( dataset_resource_log=None, converted_resource_log=None, output_path=None, - custom_temp_dir=None, + # custom_temp_dir=None, ): """ given a fie/filepath will aim to convert it to a csv and return the path to a csv, if the file is already a csv @@ -208,9 +208,9 @@ def __init__( output_dir = os.path.dirname(output_path) if not os.path.exists(output_dir): os.makedirs(output_dir) - self.custom_temp_dir = custom_temp_dir - if custom_temp_dir: - os.makedirs(custom_temp_dir, exist_ok=True) + # self.custom_temp_dir = custom_temp_dir + # if custom_temp_dir: + # os.makedirs(custom_temp_dir, exist_ok=True) def process(self, stream=None): input_path = self.path From 937f3f03b8d2bf74fe940d1dbfb28afb704fb8a9 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 17:41:52 +0000 Subject: [PATCH 50/51] Reversed prior changes to check if get same error --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 067f149ea..996bf1981 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1265,7 +1265,6 @@ def get_resource_unidentified_lookups( ConvertPhase( path=input_path, dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), From 1db57db8214f662e78ad2ec971d721ddc4f037bd Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 17:47:50 +0000 Subject: [PATCH 51/51] Reversed prior changes to check if get same error --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 996bf1981..963e2e6c7 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1211,7 +1211,7 @@ def get_resource_unidentified_lookups( # could alter resource_from_path to file from path and 
promote to a utils folder resource = resource_from_path(input_path) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' + # custom_temp_dir = tmp_dir # './var' print("") print("----------------------------------------------------------------------")