From 5ed2010792c801aad9e16605f1df06e2c5a82fad Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:06:52 +0000 Subject: [PATCH 01/51] add convert to parquet function in and run in the pipeline --- digital_land/commands.py | 9 + digital_land/pipeline/__init__.py | 9 + digital_land/pipeline/main.py | 556 ++++++++++++++++++++++++++++++ digital_land/pipeline/process.py | 68 ++++ 4 files changed, 642 insertions(+) create mode 100644 digital_land/pipeline/__init__.py create mode 100644 digital_land/pipeline/main.py create mode 100644 digital_land/pipeline/process.py diff --git a/digital_land/commands.py b/digital_land/commands.py index 80619a612..e42b6a8aa 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -52,6 +52,7 @@ from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase from digital_land.phase.save import SavePhase from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.pipeline.process import convert_tranformed_csv_to_pq from digital_land.schema import Schema from digital_land.update import add_source_endpoint from digital_land.configuration.main import Config @@ -342,6 +343,14 @@ def pipeline_run( column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) converted_resource_log.save(os.path.join(converted_resource_dir, resource + ".csv")) + # create converted parquet in the var directory + cache_dir = Path(organisation_path).parent + transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset + transformed_parquet_dir.mkdir(exists_ok=True, parents=True) + convert_tranformed_csv_to_pq( + input_path=input_path, + output_path=transformed_parquet_dir / f"{resource}.parquet", + ) # diff --git a/digital_land/pipeline/__init__.py b/digital_land/pipeline/__init__.py new file mode 100644 index 000000000..66b49ea6b --- /dev/null +++ 
b/digital_land/pipeline/__init__.py @@ -0,0 +1,9 @@ +""" sub package containing code for processing resources into transformed resources""" + +from .main import ( # noqa: F401 + Pipeline, + Lookups, + chain_phases, + run_pipeline, + EntityNumGen, +) diff --git a/digital_land/pipeline/main.py b/digital_land/pipeline/main.py new file mode 100644 index 000000000..11749571e --- /dev/null +++ b/digital_land/pipeline/main.py @@ -0,0 +1,556 @@ +import os +import csv +import functools +import importlib.util +import logging +from pathlib import Path + +from .phase.map import normalise +from .phase.lookup import key as lookup_key +from .schema import Schema + + +def chain_phases(phases): + def add(f, g): + return lambda x: g.process(f(x)) + + return functools.reduce(add, phases, lambda phase: phase) + + +def run_pipeline(*args): + logging.debug(f"run_pipeline {args}") + chain = chain_phases([arg for arg in args if arg]) + + stream = chain(None) + for row in stream: + pass + + +# TODO should we remove loading from init? 
it makes it harder to test +# and what if you only wanted to load specific files +# TODO replace with config models which load is handled by them +class Pipeline: + def __init__(self, path, dataset): + self.dataset = dataset + self.name = dataset + self.path = path + self.column = {} + self.filter = {} + self.skip_pattern = {} + self.patch = {} + self.default_field = {} + self.default_value = {} + self.combine_field = {} + self.concat = {} + self.migrate = {} + self.lookup = {} + self.redirect_lookup = {} + + self.load_column() + self.load_skip_patterns() + self.load_patch() + self.load_default_fields() + self.load_default_values() + self.load_concat() + self.load_combine_fields() + self.load_migrate() + self.load_lookup() + self.load_redirect_lookup() + self.load_filter() + + def file_reader(self, filename): + # read a file from the pipeline path, ignore if missing + path = os.path.join(self.path, filename) + if not os.path.isfile(path): + return [] + logging.debug(f"load {path}") + return csv.DictReader(open(path)) + + def reader(self, filename): + for row in self.file_reader(filename): + row["dataset"] = row.get("dataset", "") or row.get("pipeline", "") + if row["dataset"] and row["dataset"] != self.name: + continue + yield row + + def load_column(self): + for row in self.reader("column.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.column.setdefault(resource, {}) + elif endpoint: + record = self.column.setdefault(endpoint, {}) + else: + record = self.column.setdefault("", {}) + + # migrate column.csv + row["column"] = row.get("column", "") or row["pattern"] + row["field"] = row.get("field", "") or row["value"] + + record[normalise(row["column"])] = row["field"] + + def load_filter(self): + for row in self.reader("filter.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.filter.setdefault(resource, {}) + elif endpoint: + record = 
self.filter.setdefault(endpoint, {}) + else: + record = self.filter.setdefault("", {}) + + record[row["field"]] = row["pattern"] + + def load_skip_patterns(self): + for row in self.reader("skip.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.skip_pattern.setdefault(resource, []) + elif endpoint: + record = self.skip_pattern.setdefault(endpoint, []) + else: + record = self.skip_pattern.setdefault("", []) + + record.append(row["pattern"]) + + def load_patch(self): + for row in self.reader("patch.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.patch.setdefault(resource, {}) + elif endpoint: + record = self.patch.setdefault(endpoint, {}) + else: + record = self.patch.setdefault("", {}) + + row["field"] = row.get("field", "") + row["pattern"] = row.get("pattern", "") + + record = record.setdefault(row["field"], {}) + record[row["pattern"]] = row["value"] + + def load_default_fields(self): + # TBD: rename default-field.csv + for row in self.reader("default.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = self.default_field.setdefault(resource, {}) + elif endpoint: + record = self.default_field.setdefault(endpoint, {}) + else: + record = self.default_field.setdefault("", {}) + + record[row["field"]] = row["default-field"] + + def load_default_values(self): + for row in self.reader("default-value.csv"): + record = self.default_value.setdefault(row.get("endpoint", ""), {}) + record[row["field"]] = row["value"] + + def load_combine_fields(self): + for row in self.reader("combine.csv"): + record = self.combine_field.setdefault(row.get("endpoint", ""), {}) + record[row["field"]] = row["separator"] + + def load_concat(self): + for row in self.reader("concat.csv"): + resource = row.get("resource", "") + endpoint = row.get("endpoint", "") + + if resource: + record = 
self.concat.setdefault(resource, {}) + elif endpoint: + record = self.concat.setdefault(endpoint, {}) + else: + record = self.concat.setdefault("", {}) + + # record = self.concat.setdefault(row["resource"], {}) + record[row["field"]] = { + "fields": row["fields"].split(";"), + "separator": row["separator"], + "prepend": row.get("prepend", ""), + "append": row.get("append", ""), + } + + # TBD: remove this table, should come from specification replacement-field + def load_migrate(self): + for row in self.reader("transform.csv"): + if row["replacement-field"] == "": + continue + + if row["replacement-field"] in self.migrate: + raise ValueError( + "replacement-field %s has more than one entry" + % row["replacement-field"] + ) + + self.migrate[row["replacement-field"]] = row["field"] + + def load_lookup(self): + for row in self.file_reader("lookup.csv"): + # migrate old lookup.csv files + entry_number = row.get("entry-number", "") + prefix = ( + row.get("prefix", "") + or row.get("dataset", "") + or row.get("pipeline", "") + ) + reference = row.get("reference", "") or row.get("value", "") + + # composite key, ordered by specificity + resource_lookup = self.lookup.setdefault(row.get("resource", ""), {}) + resource_lookup[ + lookup_key( + entry_number=entry_number, + prefix=prefix, + reference=reference, + ) + ] = row["entity"] + + organisation = row.get("organisation", "") + # replace local-authority-eng while we migrate + organisation = organisation.replace( + "local-authority-eng", "local-authority" + ) + resource_lookup[ + lookup_key( + prefix=prefix, + reference=reference, + organisation=organisation, + ) + ] = row["entity"] + + def load_redirect_lookup(self): + for row in self.file_reader("old-entity.csv"): + old_entity = row.get("old-entity", "") + entity = row.get("entity", "") + status = row.get("status", "") + if old_entity and status: + self.redirect_lookup[old_entity] = {"entity": entity, "status": status} + + def filters(self, resource="", endpoints=[]): + d 
= self.filter.get("", {}).copy() + + for endpoint in endpoints: + endpoint_filters = self.filter.get(endpoint, {}) + d.update(endpoint_filters) + + if resource: + resource_filters = self.filter.get(resource, {}) + d.update(resource_filters) + + return d + + def columns(self, resource="", endpoints=[]): + general_columns = self.column.get("", {}) + if not resource: + return general_columns + + resource_columns = self.column.get(resource, {}) + endpoint_columns = {} + for endpoint in endpoints: + endpoint_columns = {**endpoint_columns, **self.column.get(endpoint, {})} + + result = {**endpoint_columns, **resource_columns} + + for key in general_columns: + if key in result: + continue + if ( + general_columns[key] in endpoint_columns.values() + or general_columns[key] in resource_columns.values() + ): + continue + result[key] = general_columns[key] + return result + + def skip_patterns(self, resource="", endpoints=[]): + if not resource: + return self.skip_pattern.get("", {}) + endpoint_patterns = [] + for endpoint in endpoints: + endpoint_patterns.extend(self.skip_pattern.get(endpoint, [])) + + return ( + self.skip_pattern.get(resource, []) + + self.skip_pattern.get("", []) + + endpoint_patterns + ) + + def patches(self, resource="", endpoints=[]): + general_patch = self.patch.get("", {}) + if not resource: + return general_patch + + resource_patch = self.patch.get(resource, {}) + endpoint_patch = {} + + for endpoint in endpoints: + endpoint_patch = {**endpoint_patch, **self.patch.get(endpoint, {})} + + result = {**endpoint_patch, **resource_patch} + + # Merge any remaining general defaults into the result + for field, patch in general_patch.items(): + if field not in result: + result[field] = patch + else: + result[field] = {**patch, **result[field]} + + return result + + def default_fields(self, resource=None, endpoints=[]): + config = self.default_field + + d = config.get("", {}) + + for key, value in config.get(resource, {}).items(): + d[key] = value + + for 
endpoint in endpoints: + for key, value in config.get(endpoint, {}).items(): + d[key] = value + return d + + def default_values(self, endpoints=None): + if endpoints is None: + endpoints = [] + config = self.default_value + d = config.get("", {}) + for endpoint in endpoints: + for key, value in config.get(endpoint, {}).items(): + d[key] = value + return d + + def combine_fields(self, endpoints=None): + if endpoints is None: + endpoints = [] + config = self.combine_field + d = config.get("", {}) + for endpoint in endpoints: + for key, value in config.get(endpoint, {}).items(): + d[key] = value + return d + + def concatenations(self, resource=None, endpoints=[]): + result = self.concat.get("", {}) + if resource: + result.update(self.concat.get(resource, {})) + + for endpoint in endpoints: + result.update(self.concat.get(endpoint, {})) + + return result + + def migrations(self): + return self.migrate + + def lookups(self, resource=None): + d = self.lookup.get("", {}) + if resource: + d.update(self.lookup.get(resource, {})) + return d + + def redirect_lookups(self): + return self.redirect_lookup + + def get_pipeline_callback(self): + file = os.path.join(self.path, "pipeline-callback.py") + spec = importlib.util.spec_from_file_location("pipeline-callback.py", file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.PipelineCallback + + @staticmethod + def compose(phases): + def add(f, g): + return lambda x: g.process(f(x)) + + return functools.reduce(add, phases, lambda phase: phase) + + def run(self, input_path, phases): + logging.debug(f"running {input_path} through {phases}") + chain = self.compose(phases) + for row in chain(input_path): + pass + + +class EntityNumGen: + def __init__(self, entity_num_state: dict = None): + if not entity_num_state: + entity_num_state = { + "range_min": 0, + "range_max": 100, + "current": 0, + } + + self.state = entity_num_state + + def next(self): + current = self.state["current"] + 
new_current = current + 1 + + if new_current > int(self.state["range_max"]): + new_current = int(self.state["range_min"]) + + if new_current < int(self.state["range_min"]): + new_current = int(self.state["range_min"]) + + self.state["current"] = new_current + + return new_current + + +class Lookups: + def __init__(self, directory=None) -> None: + self.directory = directory or "pipeline" + self.lookups_path = Path(directory) / "lookup.csv" + self.old_entity_path = Path(directory) / "old-entity.csv" + self.entries = [] + self.schema = Schema("lookup") + self.entity_num_gen = EntityNumGen() + + def add_entry(self, entry, is_new_entry=True): + """ + is_new_entry is an addition to allow for backward compatibility. + Older lookups may not be valid in accordance with the current + minimal column requirements + :param entry: + :param is_new_entry: + :return: + """ + if is_new_entry: + if not self.validate_entry(entry): + return + + self.entries.append(entry) + + def load_csv(self, lookups_path=None): + """ + load in lookups as df, not when we process pipeline but useful for other analysis + """ + lookups_path = lookups_path or self.lookups_path + reader = csv.DictReader(open(lookups_path, newline="")) + extra_fields = set(reader.fieldnames) - set(self.schema.fieldnames) + + if len(extra_fields): + raise RuntimeError( + f"{len(extra_fields)} extra fields founds in lookup.csv ({','.join(list(extra_fields))})" + ) + + for row in reader: + self.add_entry(row, is_new_entry=False) + + def get_max_entity(self, prefix, specification) -> int: + if len(self.entries) == 0: + return 0 + if not prefix: + return 0 + + dataset_prefix = specification.dataset_prefix(prefix) + try: + ret_val = max( + [ + int(entry["entity"]) + for entry in self.entries + if (entry["prefix"] == prefix or entry["prefix"] == dataset_prefix) + and (entry.get("entity", None)) + ] + ) + return ret_val + except ValueError: + return 0 + + def save_csv(self, lookups_path=None, entries=None, old_entity_path=None): + 
path = lookups_path or self.lookups_path + + entity_values = [] + if os.path.exists(path): + reader = csv.DictReader(open(path, newline="")) + for row in reader: + entity_values.append(row["entity"]) + + if entries is None: + entries = self.entries + + os.makedirs(os.path.dirname(path), exist_ok=True) + logging.debug("saving %s" % (path)) + f = open(path, "w", newline="") + writer = csv.DictWriter( + f, fieldnames=self.schema.fieldnames, extrasaction="ignore" + ) + writer.writeheader() + + old_entity_file_path = old_entity_path or self.old_entity_path + if os.path.exists(old_entity_file_path): + old_entity_path = self.old_entity_path + reader = csv.DictReader(open(old_entity_file_path, newline="")) + + for row in reader: + entity_values.append(row["old-entity"]) + entity_values.append(row["entity"]) + + new_entities = [] + get_entity = None + for idx, entry in enumerate(entries): + if not entry: + continue + else: + if not entry.get("entity"): + while True: + generated_entity = self.entity_num_gen.next() + + if generated_entity == get_entity: + print( + "There are no more entity numbers available within this dataset." 
+ ) + break + + if get_entity is None: + get_entity = generated_entity + + if str(generated_entity) not in entity_values: + entry["entity"] = generated_entity + new_entities.append(entry) + entity_values.append(str(generated_entity)) + writer.writerow(entry) + break + else: + writer.writerow(entry) + return new_entities + + # @staticmethod + def validate_entry(self, entry) -> bool: + # ensures minimum expected fields exist and are not empty strings + expected_fields = ["prefix", "organisation", "reference"] + for field in expected_fields: + if not entry.get(field, ""): + raise ValueError(f"ERROR: expected {field} not found in lookup entry") + + if len(self.entries) > 0: + # check entry does not already exist + existing_entries = len( + [ + 1 + for item in self.entries + if item["prefix"] == entry["prefix"] + and item["organisation"] == entry["organisation"] + and item["reference"] == entry["reference"] + ] + ) + + if existing_entries > 0: + # print(f">>> ERROR: lookup already exists - {entry['organisation']} {entry['reference']}") + return False + + return True diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py new file mode 100644 index 000000000..a0b52432f --- /dev/null +++ b/digital_land/pipeline/process.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +import os +import pandas as pd +import pyarrow.parquet as pq +import pyarrow as pa +from pathlib import Path + +# load in specification + + +def convert_tranformed_csv_to_pq(input, output): + """ + function to convert a transformed resource to a parrquet file. 
+ """ + input_path = Path(input) + output_path = Path(output) + if output_path.exists(): + os.remove(output_path) + + # Define the chunk size for reading the CSV file + chunk_size = 1000000 # Number of rows per chunk + + # Open a CSV reader with PyArrow + # csv_reader = pv.open_csv(input_path, read_options=pv.ReadOptions(block_size=chunk_size)) + csv_iterator = pd.read_csv( + input_path, chunksize=chunk_size, dtype=str, na_filter=False + ) + + # Initialize the Parquet writer with the schema from the first chunk + first_chunk = next(csv_iterator) + # size = 0 + # size +=len(first_chunk) + + fields = [ + ("end-date", pa.string()), + ("entity", pa.string()), + ("entry-date", pa.string()), + ("entry-number", pa.string()), + ("fact", pa.string()), + ("field", pa.string()), + ("priority", pa.string()), + ("reference-entity", pa.string()), + ("resource", pa.string()), + ("start-date", pa.string()), + ("value", pa.string()), + ] + schema = pa.schema(fields) + table = pa.Table.from_pandas(first_chunk, schema=schema) + + # Create a Parquet writer + parquet_writer = pq.ParquetWriter(output_path, table.schema) + + # Write the first chunk + parquet_writer.write_table(table) + + # Process and write the remaining chunks + while True: + try: + chunk = next(csv_iterator) + table = pa.Table.from_pandas(chunk) + parquet_writer.write_table(table) + # size += len(chunk) + except StopIteration: + break + + # Close the Parquet writer + parquet_writer.close() + # print(size) From 8622c43c86d2f4bacaa7cab32def9939413cde42 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:17:02 +0000 Subject: [PATCH 02/51] correct imports --- digital_land/pipeline/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/digital_land/pipeline/main.py b/digital_land/pipeline/main.py index 11749571e..cde83293b 100644 --- a/digital_land/pipeline/main.py +++ b/digital_land/pipeline/main.py @@ -5,9 +5,9 @@ import logging from 
pathlib import Path -from .phase.map import normalise -from .phase.lookup import key as lookup_key -from .schema import Schema +from digital_land.phase.map import normalise +from digital_land.phase.lookup import key as lookup_key +from digital_land.schema import Schema def chain_phases(phases): From ff82675d5ad94c3475eaa6af09c6e608b7ace9f0 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:21:42 +0000 Subject: [PATCH 03/51] correct arg --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index e42b6a8aa..dd0d9aeb6 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -346,7 +346,7 @@ def pipeline_run( # create converted parquet in the var directory cache_dir = Path(organisation_path).parent transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset - transformed_parquet_dir.mkdir(exists_ok=True, parents=True) + transformed_parquet_dir.mkdir(exist_ok=True, parents=True) convert_tranformed_csv_to_pq( input_path=input_path, output_path=transformed_parquet_dir / f"{resource}.parquet", From f93068fc8b6e021efa1f69f5959b01949ede8521 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:26:05 +0000 Subject: [PATCH 04/51] cureent arguement names --- digital_land/pipeline/process.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index a0b52432f..520081030 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -8,12 +8,12 @@ # load in specification -def convert_tranformed_csv_to_pq(input, output): +def convert_tranformed_csv_to_pq(input_path, output_path): """ function to convert a transformed resource to a parrquet file. 
""" - input_path = Path(input) - output_path = Path(output) + input_path = Path(input_path) + output_path = Path(output_path) if output_path.exists(): os.remove(output_path) From 271e66432e51ed3d69385b94cd3930b9e330bff6 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:36:18 +0000 Subject: [PATCH 05/51] correct input for csv to pq --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index dd0d9aeb6..70d6e1241 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -348,7 +348,7 @@ def pipeline_run( transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset transformed_parquet_dir.mkdir(exist_ok=True, parents=True) convert_tranformed_csv_to_pq( - input_path=input_path, + input_path=output_path, output_path=transformed_parquet_dir / f"{resource}.parquet", ) From cdd480e0f13805afb3b3230c371ec3ab5ce27606 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:43:54 +0000 Subject: [PATCH 06/51] add parquet package back in --- digital_land/commands.py | 27 +- digital_land/package/dataset_parquet.py | 345 ++++++++++++++++++++++++ 2 files changed, 370 insertions(+), 2 deletions(-) create mode 100644 digital_land/package/dataset_parquet.py diff --git a/digital_land/commands.py b/digital_land/commands.py index 70d6e1241..255700db0 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -26,6 +26,7 @@ ) from digital_land.organisation import Organisation from digital_land.package.dataset import DatasetPackage +from digital_land.package.dataset_parquet import DatasetParquetPackage from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase from digital_land.phase.convert import ConvertPhase, execute @@ -366,7 +367,11 @@ def dataset_create( issue_dir="issue", 
column_field_dir="var/column-field", dataset_resource_dir="var/dataset-resource", + cache_dir="var/cache/parquet", + resource_path="collection/resource.csv", ): + cache_dir = os.path.join(cache_dir, dataset) + if not output_path: print("missing output path", file=sys.stderr) sys.exit(2) @@ -386,10 +391,8 @@ def dataset_create( package.create() for path in input_paths: path_obj = Path(path) - package.load_transformed(path) package.load_column_fields(column_field_dir / dataset / path_obj.name) package.load_dataset_resource(dataset_resource_dir / dataset / path_obj.name) - package.load_entities() old_entity_path = os.path.join(pipeline.path, "old-entity.csv") if os.path.exists(old_entity_path): @@ -404,6 +407,26 @@ def dataset_create( package.add_counts() + # Repeat for parquet + # Set up cache directory to store parquet files. The sqlite files created from this will be saved in the dataset + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + pqpackage = DatasetParquetPackage( + dataset, + organisation=organisation, + path=output_path, + cache_dir=cache_dir, + resource_path=resource_path, + specification_dir=None, # TBD: package should use this specification object + ) + pqpackage.create_temp_table(input_paths) + pqpackage.load_facts() + pqpackage.load_fact_resource() + pqpackage.load_entities() + pqpackage.pq_to_sqlite() + pqpackage.close_conn() + def dataset_dump(input_path, output_path): cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py new file mode 100644 index 000000000..724e92da8 --- /dev/null +++ b/digital_land/package/dataset_parquet.py @@ -0,0 +1,345 @@ +import os +import logging +import duckdb +from .package import Package +import resource + +logger = logging.getLogger(__name__) + +# TBD: move to from specification datapackage definition +tables = { + "dataset-resource": None, + "column-field": None, + "issue": None, + 
"entity": None, + "fact": None, + "fact-resource": None, +} + +# TBD: infer from specification dataset +indexes = { + "fact": ["entity"], + "fact-resource": ["fact", "resource"], + "column-field": ["dataset", "resource", "column", "field"], + "issue": ["resource", "dataset", "field"], + "dataset-resource": ["resource"], +} + + +class DatasetParquetPackage(Package): + def __init__(self, dataset, organisation, cache_dir, resource_path, **kwargs): + self.suffix = ".parquet" + super().__init__(dataset, tables=tables, indexes=indexes, **kwargs) + self.dataset = dataset + self.organisation = organisation + self.cache_dir = cache_dir + self._spatialite = None + self.resource_path = resource_path + # Persistent connection for the class. Given name to ensure that table is stored on disk (not purely in memory) + os.makedirs(cache_dir, exist_ok=True) + self.duckdb_file = os.path.join(cache_dir, f"{dataset}.duckdb") + self.conn = duckdb.connect(self.duckdb_file) + self.schema = self.get_schema() + self.typology = self.specification.schema[dataset]["typology"] + + def get_schema(self): + schema = {} + + for field in sorted( + list( + set(self.specification.schema["fact"]["fields"]).union( + set(self.specification.schema["fact-resource"]["fields"]) + ) + ) + ): + datatype = self.specification.field[field]["datatype"] + schema[field] = "BIGINT" if datatype == "integer" else "VARCHAR" + + return schema + + def create_temp_table(self, input_paths): + # Create a temp table of the data from input_paths as we need the information stored there at various times + logging.info( + f"loading data into temp table from {os.path.dirname(input_paths[0])}" + ) + + input_paths_str = ", ".join([f"'{path}'" for path in input_paths]) + + # Initial max_line_size and increment step + max_size = 40000000 + # increment_step = 20000000 + # max_limit = 200000000 # Maximum allowable line size to attempt + + # increment = False + while True: + try: + self.conn.execute("DROP TABLE IF EXISTS temp_table") + 
query = f""" + CREATE TEMPORARY TABLE temp_table AS + SELECT * + FROM read_csv( + [{input_paths_str}], + columns = {self.schema}, + header = true, + force_not_null = {[field for field in self.schema.keys()]}, + max_line_size={max_size} + ) + """ + self.conn.execute(query) + break + except duckdb.Error as e: # Catch specific DuckDB error + if "Value with unterminated quote" in str(e): + hard_limit = int(resource.getrlimit(resource.RLIMIT_AS)[1]) + if max_size < hard_limit / 3: + logging.info( + f"Initial max_size did not work, setting it to {hard_limit / 2}" + ) + max_size = hard_limit / 2 + else: + raise + else: + logging.info(f"Failed to read in when max_size = {max_size}") + raise + + def load_facts(self): + logging.info("loading facts from temp table") + + fact_fields = self.specification.schema["fact"]["fields"] + fields_str = ", ".join( + [f'"{field}"' if "-" in field else field for field in fact_fields] + ) + + # query to extract data from the temp table (containing raw data), group by a fact, and get the highest + # priority or latest record + query = f""" + SELECT {fields_str} + FROM temp_table + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY fact ORDER BY priority, "entry-date" DESC, "entry-number" DESC + ) = 1 + """ + + self.conn.execute( + f""" + COPY ( + {query} + ) TO '{self.cache_dir}/fact{self.suffix}' (FORMAT PARQUET); + """ + ) + + def load_fact_resource(self): + logging.info("loading fact resources from temp table") + + fact_resource_fields = self.specification.schema["fact-resource"]["fields"] + fields_str = ", ".join( + [f'"{field}"' if "-" in field else field for field in fact_resource_fields] + ) + + # All CSV files have been loaded into a temporary table. 
Extract several columns and export + query = f""" + SELECT {fields_str} + FROM temp_table + """ + + self.conn.execute( + f""" + COPY ( + {query} + ) TO '{self.cache_dir}/fact_resource{self.suffix}' (FORMAT PARQUET); + """ + ) + + def load_entities(self): + organisation_path = self.organisation.organisation_path + + logging.info("loading entities from temp table") + + entity_fields = self.specification.schema["entity"]["fields"] + # Do this to match with later field names. + entity_fields = [e.replace("-", "_") for e in entity_fields] + input_paths_str = f"{self.cache_dir}/fact{self.suffix}" + + query = f""" + SELECT DISTINCT REPLACE(field,'-','_') + FROM parquet_scan('{str(input_paths_str)}') + """ + + # distinct_fields - list of fields in the field in fact + rows = self.conn.execute(query).fetchall() + distinct_fields = [row[0] for row in rows] + + # json fields - list of fields which are present in the fact table which + # do not exist separately in the entity table + # Need to ensure that 'organisation' is not included either + json_fields = [ + field + for field in distinct_fields + if field not in entity_fields + ["organisation"] + ] + + # null fields - list of fields which are not present in the fact tables which have + # to be in the entity table as a column + extra_fields = [ + "entity", + "dataset", + "typology", + "json", + "organisation_entity", + "organisation", + ] + null_fields = [ + field + for field in entity_fields + if field not in (distinct_fields + extra_fields) + ] + + # select fields - a list of fields which have to be selected directly from the pivoted table + # these are entity fields that are not null fields or a few special ones + extra_fields = [ + "json", + "organisation_entity", + "dataset", + "typology", + "organisation", + ] + select_fields = [ + field for field in entity_fields if field not in null_fields + extra_fields + ] + + # set fields + fields_to_include = ["entity", "field", "value"] + fields_str = ", ".join(fields_to_include) 
+ + # Take original data, group by entity & field, and order by highest priority then latest record. + # If there are still matches then pick the first resource (and fact, just to make sure) + query = f""" + SELECT {fields_str} FROM ( + SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date + FROM temp_table + LEFT JOIN read_csv_auto('{self.resource_path}', max_line_size=40000000) resource_csv + ON temp_table.resource = resource_csv.resource + QUALIFY ROW_NUMBER() OVER ( + PARTITION BY entity, field + ORDER BY priority, "entry-date" DESC, "entry-number" DESC, resource_end_date DESC, temp_table.resource, fact + ) = 1 + ) + """ + + pivot_query = f""" + PIVOT ( + {query} + ) ON REPLACE(field,'-','_') + USING MAX(value) + """ + + # now use the field lists produced above to create specific statements to: + # add null columns which are missing + # include columns in the json statement + # Collate list of fields which don't exist but need to be in the final table + select_statement = ", ".join([f"t1.{field}" for field in select_fields]) + # Don't want to include anything that ends with "_geom" + null_fields_statement = ", ".join( + [ + f"''::VARCHAR AS \"{field}\"" + for field in null_fields + if not field.endswith("_geom") + ] + ) + json_statement = ", ".join( + [ + f"CASE WHEN t1.{field} IS NOT NULL THEN REPLACE('{field}', '_', '-') ELSE NULL END, t1.{field}" + for field in json_fields + ] + ) + + # define organisation query + org_csv = organisation_path + org_query = f""" + SELECT * FROM read_csv_auto('{org_csv}', max_line_size=40000000) + """ + + sql = f""" + INSTALL spatial; LOAD spatial; + COPY( + WITH computed_centroid AS ( + SELECT + * EXCLUDE (point), -- Calculate centroid point if not given + CASE + WHEN (geometry IS NOT NULL and geometry <> '') AND (point IS NULL OR point = '') + THEN ST_AsText(ST_ReducePrecision(ST_Centroid(ST_GeomFromText(geometry)),0.000001)) + ELSE point + END 
AS point + FROM ( + SELECT '{self.dataset}' as dataset, + '{self.typology}' as typology, + t2.entity as organisation_entity, + {select_statement}, + {null_fields_statement}, + json_object({json_statement}) as json, + FROM ({pivot_query}) as t1 + LEFT JOIN ({org_query}) as t2 + on t1.organisation = t2.organisation + ) + ) + SELECT + * EXCLUDE (json), + CASE WHEN json = '{{}}' THEN NULL ELSE json END AS json + FROM computed_centroid + ) TO '{self.cache_dir}/entity{self.suffix}' (FORMAT PARQUET); + """ + self.conn.execute(sql) + + def pq_to_sqlite(self): + # At present we are saving the parquet files in 'cache' but saving the sqlite files produced in 'dataset' + # In future when parquet files are saved to 'dataset' remove the 'cache_dir' in the function arguments and + # replace 'cache_dir' with 'output_path' in this function's code + logging.info( + f"loading sqlite3 tables in {self.path} from parquet files in {self.cache_dir}" + ) + query = "INSTALL sqlite; LOAD sqlite;" + self.conn.execute(query) + + parquet_files = [ + fn for fn in os.listdir(self.cache_dir) if fn.endswith(self.suffix) + ] + + for parquet_file in parquet_files: + table_name = os.path.splitext(os.path.basename(parquet_file))[0] + + # Load Parquet data into DuckDB temp table + self.conn.execute("DROP TABLE IF EXISTS temp_table;") + self.conn.execute( + f""" + CREATE TABLE temp_table AS + SELECT * FROM parquet_scan('{self.cache_dir}/{parquet_file}'); + """ + ) + + # Export the DuckDB table to the SQLite database + self.conn.execute( + f"ATTACH DATABASE '{self.path}' AS sqlite_db (TYPE SQLITE);" + ) + + # Fix the column names + for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): + if "-" in column[0]: + self.conn.execute( + f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" + ) + + # Copy the data + self.conn.execute( + f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" + ) + + self.conn.execute("DETACH DATABASE 
sqlite_db;") + + def close_conn(self): + logging.info("Close connection to duckdb database in session") + if self.conn is not None: + self.conn.close() + if os.path.exists(self.duckdb_file): + os.remove(self.duckdb_file) + + def load(self): + pass From e6ac9bfcca4e98b8550515459fca637b9db40385 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:28:29 +0000 Subject: [PATCH 07/51] tweak to remove temptable --- digital_land/commands.py | 8 +- digital_land/package/dataset_parquet.py | 113 +++++++++++++----------- 2 files changed, 67 insertions(+), 54 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 255700db0..8a49e7153 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -370,6 +370,8 @@ def dataset_create( cache_dir="var/cache/parquet", resource_path="collection/resource.csv", ): + # directories to be overwritten + transformed_parquet_dir = f"var/cache/transformed_parquet/{dataset}" cache_dir = os.path.join(cache_dir, dataset) if not output_path: @@ -420,9 +422,9 @@ def dataset_create( resource_path=resource_path, specification_dir=None, # TBD: package should use this specification object ) - pqpackage.create_temp_table(input_paths) - pqpackage.load_facts() - pqpackage.load_fact_resource() + # pqpackage.create_temp_table(input_paths) + pqpackage.load_facts(transformed_parquet_dir) + pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities() pqpackage.pq_to_sqlite() pqpackage.close_conn() diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 724e92da8..095af51cc 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -2,7 +2,6 @@ import logging import duckdb from .package import Package -import resource logger = logging.getLogger(__name__) @@ -57,51 +56,52 @@ def get_schema(self): return schema - def create_temp_table(self, input_paths): - # 
Create a temp table of the data from input_paths as we need the information stored there at various times - logging.info( - f"loading data into temp table from {os.path.dirname(input_paths[0])}" - ) - - input_paths_str = ", ".join([f"'{path}'" for path in input_paths]) - - # Initial max_line_size and increment step - max_size = 40000000 - # increment_step = 20000000 - # max_limit = 200000000 # Maximum allowable line size to attempt - - # increment = False - while True: - try: - self.conn.execute("DROP TABLE IF EXISTS temp_table") - query = f""" - CREATE TEMPORARY TABLE temp_table AS - SELECT * - FROM read_csv( - [{input_paths_str}], - columns = {self.schema}, - header = true, - force_not_null = {[field for field in self.schema.keys()]}, - max_line_size={max_size} - ) - """ - self.conn.execute(query) - break - except duckdb.Error as e: # Catch specific DuckDB error - if "Value with unterminated quote" in str(e): - hard_limit = int(resource.getrlimit(resource.RLIMIT_AS)[1]) - if max_size < hard_limit / 3: - logging.info( - f"Initial max_size did not work, setting it to {hard_limit / 2}" - ) - max_size = hard_limit / 2 - else: - raise - else: - logging.info(f"Failed to read in when max_size = {max_size}") - raise - - def load_facts(self): + # def create_temp_table(self, input_paths): + # # Create a temp table of the data from input_paths as we need the information stored there at various times + # logging.info( + # f"loading data into temp table from {os.path.dirname(input_paths[0])}" + # ) + + # input_paths_str = ", ".join([f"'{path}'" for path in input_paths]) + + # # Initial max_line_size and increment step + # max_size = 40000000 + # # increment_step = 20000000 + # # max_limit = 200000000 # Maximum allowable line size to attempt + + # # increment = False + # while True: + # try: + # self.conn.execute("DROP TABLE IF EXISTS temp_table") + # query = f""" + # CREATE TEMPORARY TABLE temp_table AS + # SELECT * + # FROM read_csv( + # [{input_paths_str}], + # columns = 
{self.schema}, + # header = true, + # force_not_null = {[field for field in self.schema.keys()]}, + # max_line_size={max_size} + # ) + # """ + # self.conn.execute(query) + # break + # except duckdb.Error as e: # Catch specific DuckDB error + # if "Value with unterminated quote" in str(e): + # hard_limit = int(resource.getrlimit(resource.RLIMIT_AS)[1]) + # if max_size < hard_limit / 3: + # logging.info( + # f"Initial max_size did not work, setting it to {hard_limit / 2}" + # ) + # max_size = hard_limit / 2 + # else: + # raise + # else: + # logging.info(f"Failed to read in when max_size = {max_size}") + # raise + + def load_facts(self, transformed_parquet_dir): + """ """ logging.info("loading facts from temp table") fact_fields = self.specification.schema["fact"]["fields"] @@ -113,7 +113,7 @@ def load_facts(self): # priority or latest record query = f""" SELECT {fields_str} - FROM temp_table + FROM {transformed_parquet_dir}/*.parquet QUALIFY ROW_NUMBER() OVER ( PARTITION BY fact ORDER BY priority, "entry-date" DESC, "entry-number" DESC ) = 1 @@ -127,7 +127,7 @@ def load_facts(self): """ ) - def load_fact_resource(self): + def load_fact_resource(self, transformed_parquet_dir): logging.info("loading fact resources from temp table") fact_resource_fields = self.specification.schema["fact-resource"]["fields"] @@ -138,7 +138,7 @@ def load_fact_resource(self): # All CSV files have been loaded into a temporary table. 
Extract several columns and export query = f""" SELECT {fields_str} - FROM temp_table + FROM {transformed_parquet_dir}/*.parquet """ self.conn.execute( @@ -150,6 +150,8 @@ def load_fact_resource(self): ) def load_entities(self): + fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" + # fact_parquet_path = f"{self.cache_dir}/fact{self.suffix}" organisation_path = self.organisation.organisation_path logging.info("loading entities from temp table") @@ -157,11 +159,11 @@ def load_entities(self): entity_fields = self.specification.schema["entity"]["fields"] # Do this to match with later field names. entity_fields = [e.replace("-", "_") for e in entity_fields] - input_paths_str = f"{self.cache_dir}/fact{self.suffix}" + # input_paths_str = f"{self.cache_dir}/fact{self.suffix}" query = f""" SELECT DISTINCT REPLACE(field,'-','_') - FROM parquet_scan('{str(input_paths_str)}') + FROM parquet_scan('{fact_resource_parquet_path}') """ # distinct_fields - list of fields in the field in fact @@ -212,6 +214,15 @@ def load_entities(self): # Take original data, group by entity & field, and order by highest priority then latest record. 
# If there are still matches then pick the first resource (and fact, just to make sure) + # changes to make + # not sure why this is bringing a raw resourcce AND the temp_table this data is essentially the same + # need the resource hash and entry number of the file, this is important for ordering + # between these two, the onlly other metric that isn't in the factt resource table is the start date of the resource + # query to get this info + # query to use this info to get the most recent facts + # query to turn the most recent facts into a pivot + # query to sort the final table + # query to create the file query = f""" SELECT {fields_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date From 37ad0c71ac9bd652c4bb0e78654c07f40d9d1611 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:09:56 +0000 Subject: [PATCH 08/51] add updated tests --- digital_land/package/dataset_parquet.py | 185 +++-- digital_land/pipeline/process.py | 9 +- tests/acceptance/test_dataset_create.py | 156 ++++ tests/integration/package/conftest.py | 34 + tests/integration/package/test_dataset.py | 30 - .../package/test_dataset_parquet.py | 772 ++++++++++++++++++ tests/integration/pipeline/test_process.py | 54 ++ 7 files changed, 1144 insertions(+), 96 deletions(-) create mode 100644 tests/acceptance/test_dataset_create.py create mode 100644 tests/integration/package/conftest.py create mode 100644 tests/integration/package/test_dataset_parquet.py create mode 100644 tests/integration/pipeline/test_process.py diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 095af51cc..f97b9a37a 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -1,6 +1,7 @@ import os import logging import duckdb +from pathlib import Path from .package import Package logger = 
logging.getLogger(__name__) @@ -26,21 +27,38 @@ class DatasetParquetPackage(Package): - def __init__(self, dataset, organisation, cache_dir, resource_path, **kwargs): + def __init__(self, dataset, path, duckdb_path=None, **kwargs): + """ + Initialisation method to set up information as needed + + args: + dataset (str): name of the dataset + dir (str): the directory to store the package in + duckdb_path (str): optional parameter to use a duckdb file instead of in memory db + """ + # this is a given at this point to not sure we need it the base package class might use this self.suffix = ".parquet" - super().__init__(dataset, tables=tables, indexes=indexes, **kwargs) + super().__init__(dataset, tables=tables, indexes=indexes, path=path, **kwargs) self.dataset = dataset - self.organisation = organisation - self.cache_dir = cache_dir - self._spatialite = None - self.resource_path = resource_path + # self.cache_dir = cache_dir # Persistent connection for the class. Given name to ensure that table is stored on disk (not purely in memory) - os.makedirs(cache_dir, exist_ok=True) - self.duckdb_file = os.path.join(cache_dir, f"{dataset}.duckdb") - self.conn = duckdb.connect(self.duckdb_file) + if duckdb_path is not None: + self.duckdb_path = Path(duckdb_path) + self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) + self.conn = duckdb.connect(self.duckdb_path) + else: + self.conn = duckdb.connect() + self.schema = self.get_schema() self.typology = self.specification.schema[dataset]["typology"] + # set up key file paths + self.fact_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + self.fact_resource_path = ( + self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + ) + self.entity_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + def get_schema(self): schema = {} @@ -56,6 +74,7 @@ def get_schema(self): return schema + # will be removed as we will remove the temp table from this logic # def create_temp_table(self, input_paths): # # 
Create a temp table of the data from input_paths as we need the information stored there at various times # logging.info( @@ -101,60 +120,64 @@ def get_schema(self): # raise def load_facts(self, transformed_parquet_dir): - """ """ + """ + This method loads facts into a fact table from a directory containing all transformed files as parquet files + """ + output_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) logging.info("loading facts from temp table") fact_fields = self.specification.schema["fact"]["fields"] - fields_str = ", ".join( - [f'"{field}"' if "-" in field else field for field in fact_fields] - ) + fields_str = ", ".join([field.replace("-", "_") for field in fact_fields]) # query to extract data from the temp table (containing raw data), group by a fact, and get the highest # priority or latest record + query = f""" SELECT {fields_str} - FROM {transformed_parquet_dir}/*.parquet + FROM '{str(transformed_parquet_dir)}/*.parquet' QUALIFY ROW_NUMBER() OVER ( - PARTITION BY fact ORDER BY priority, "entry-date" DESC, "entry-number" DESC + PARTITION BY fact ORDER BY priority, entry_date DESC, entry_number DESC ) = 1 """ - self.conn.execute( f""" COPY ( {query} - ) TO '{self.cache_dir}/fact{self.suffix}' (FORMAT PARQUET); + ) TO '{str(output_path)}' (FORMAT PARQUET); """ ) def load_fact_resource(self, transformed_parquet_dir): - logging.info("loading fact resources from temp table") - + logging.info(f"loading fact resources from {str(transformed_parquet_dir)}") + output_path = self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) fact_resource_fields = self.specification.schema["fact-resource"]["fields"] fields_str = ", ".join( - [f'"{field}"' if "-" in field else field for field in fact_resource_fields] + [field.replace("-", "_") for field in fact_resource_fields] ) # All CSV files have been loaded into a temporary table. 
Extract several columns and export query = f""" SELECT {fields_str} - FROM {transformed_parquet_dir}/*.parquet + FROM '{str(transformed_parquet_dir)}/*.parquet' """ self.conn.execute( f""" COPY ( {query} - ) TO '{self.cache_dir}/fact_resource{self.suffix}' (FORMAT PARQUET); + ) TO '{str(output_path)}' (FORMAT PARQUET); """ ) - def load_entities(self): - fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" - # fact_parquet_path = f"{self.cache_dir}/fact{self.suffix}" - organisation_path = self.organisation.organisation_path + def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): + output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) + # get the other paths + # fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" - logging.info("loading entities from temp table") + logging.info(f"loading entities from {transformed_parquet_dir}") entity_fields = self.specification.schema["entity"]["fields"] # Do this to match with later field names. 
@@ -163,7 +186,7 @@ def load_entities(self): query = f""" SELECT DISTINCT REPLACE(field,'-','_') - FROM parquet_scan('{fact_resource_parquet_path}') + FROM parquet_scan('{transformed_parquet_dir}/*.parquet') """ # distinct_fields - list of fields in the field in fact @@ -171,8 +194,8 @@ def load_entities(self): distinct_fields = [row[0] for row in rows] # json fields - list of fields which are present in the fact table which - # do not exist separately in the entity table - # Need to ensure that 'organisation' is not included either + # do not exist separately in the entity table so need to be included in the json field + # Need to ensure that 'organisation' is not included either so that it is excluded json_fields = [ field for field in distinct_fields @@ -223,18 +246,20 @@ def load_entities(self): # query to turn the most recent facts into a pivot # query to sort the final table # query to create the file + query = f""" SELECT {fields_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date - FROM temp_table - LEFT JOIN read_csv_auto('{self.resource_path}', max_line_size=40000000) resource_csv - ON temp_table.resource = resource_csv.resource + FROM parquet_scan('{transformed_parquet_dir}/*.parquet') tf + LEFT JOIN read_csv_auto('{resource_path}', max_line_size=40000000) resource_csv + ON tf.resource = resource_csv.resource QUALIFY ROW_NUMBER() OVER ( PARTITION BY entity, field - ORDER BY priority, "entry-date" DESC, "entry-number" DESC, resource_end_date DESC, temp_table.resource, fact + ORDER BY priority, entry_date DESC, entry_number DESC, resource_end_date DESC, tf.resource, fact ) = 1 ) """ + logging.error(query) pivot_query = f""" PIVOT ( @@ -269,6 +294,7 @@ def load_entities(self): SELECT * FROM read_csv_auto('{org_csv}', max_line_size=40000000) """ + # should installinng spatial be done here sql = f""" INSTALL spatial; LOAD spatial; COPY( @@ -296,54 +322,83 @@ def 
load_entities(self): * EXCLUDE (json), CASE WHEN json = '{{}}' THEN NULL ELSE json END AS json FROM computed_centroid - ) TO '{self.cache_dir}/entity{self.suffix}' (FORMAT PARQUET); + ) TO '{str(output_path)}' (FORMAT PARQUET); """ + # might need to un some fetch all toget result back self.conn.execute(sql) - def pq_to_sqlite(self): + def load_to_sqlite(self, sqlite_path): + """ + Convert parquet files to sqlite3 tables assumes the sqlite table already exist. There is an arguement to + say we want to improve the loading functionality of a sqlite package + """ # At present we are saving the parquet files in 'cache' but saving the sqlite files produced in 'dataset' # In future when parquet files are saved to 'dataset' remove the 'cache_dir' in the function arguments and # replace 'cache_dir' with 'output_path' in this function's code logging.info( - f"loading sqlite3 tables in {self.path} from parquet files in {self.cache_dir}" + f"loading sqlite3 tables in {sqlite_path} from parquet files in {self.path}" ) + # migrate to connection creation query = "INSTALL sqlite; LOAD sqlite;" self.conn.execute(query) - parquet_files = [ - fn for fn in os.listdir(self.cache_dir) if fn.endswith(self.suffix) - ] + # attache the sqlite db to duckdb + self.conn.execute( + f"ATTACH DATABASE '{sqlite_path}' AS sqlite_db (TYPE SQLITE);" + ) + + fact_resource_fields = self.specification.schema["fact-resource"]["fields"] + fields_str = ", ".join( + [field.replace("-", "_") for field in fact_resource_fields] + ) + # insert fact_resource data + self.conn.execute( + f""" + INSERT INTO sqlite_db.fact_resource + SELECT {fields_str} FROM parquet_scan('{self.fact_resource_path}') + """ + ) - for parquet_file in parquet_files: - table_name = os.path.splitext(os.path.basename(parquet_file))[0] + # insert fact data + fact_fields = self.specification.schema["fact"]["fields"] + fields_str = ", ".join([field.replace("-", "_") for field in fact_fields]) - # Load Parquet data into DuckDB temp table - 
self.conn.execute("DROP TABLE IF EXISTS temp_table;") - self.conn.execute( - f""" - CREATE TABLE temp_table AS - SELECT * FROM parquet_scan('{self.cache_dir}/{parquet_file}'); - """ - ) + self.conn.execute( + f""" + INSERT INTO sqlite_db.fact + SELECT {fields_str} FROM parquet_scan('{self.fact_path}') + """ + ) - # Export the DuckDB table to the SQLite database - self.conn.execute( - f"ATTACH DATABASE '{self.path}' AS sqlite_db (TYPE SQLITE);" - ) + # insert entity data + entity_fields = self.specification.schema["entity"]["fields"] + fields_str = ", ".join( + [ + field.replace("-", "_") + for field in entity_fields + if field not in ["geometry-geom", "point-geom"] + ] + ) + self.conn.execute( + f""" + INSERT INTO sqlite_db.entity + SELECT {fields_str} FROM parquet_scan('{self.entity_path}') + """ + ) - # Fix the column names - for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): - if "-" in column[0]: - self.conn.execute( - f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" - ) + # Fix the column names + # for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): + # if "-" in column[0]: + # self.conn.execute( + # f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" + # ) - # Copy the data - self.conn.execute( - f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" - ) + # Copy the data + # self.conn.execute( + # f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" + # ) - self.conn.execute("DETACH DATABASE sqlite_db;") + self.conn.execute("DETACH DATABASE sqlite_db;") def close_conn(self): logging.info("Close connection to duckdb database in session") diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 520081030..0a0fc3e61 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -47,6 +47,11 @@ def convert_tranformed_csv_to_pq(input_path, 
output_path): schema = pa.schema(fields) table = pa.Table.from_pandas(first_chunk, schema=schema) + # rename columns for parquet files to make querying easier in s3 + # Replace '-' with '_' in column names + new_column_names = [name.replace("-", "_") for name in table.column_names] + table = table.rename_columns(new_column_names) + # Create a Parquet writer parquet_writer = pq.ParquetWriter(output_path, table.schema) @@ -57,7 +62,9 @@ def convert_tranformed_csv_to_pq(input_path, output_path): while True: try: chunk = next(csv_iterator) - table = pa.Table.from_pandas(chunk) + table = pa.Table.from_pandas(chunk, schema=schema) + new_column_names = [name.replace("-", "_") for name in table.column_names] + table = table.rename_columns(new_column_names) parquet_writer.write_table(table) # size += len(chunk) except StopIteration: diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py new file mode 100644 index 000000000..de67328af --- /dev/null +++ b/tests/acceptance/test_dataset_create.py @@ -0,0 +1,156 @@ +""" +A set of tests to mimic a user (computational or otherwise) running tests against +a sqlite dataset. 
There are quite a few things to set up and this specifically +""" + +import pytest + +import numpy as np +import pandas as pd +import os +import sqlite3 +from tempfile import TemporaryDirectory +from pathlib import Path + +from click.testing import CliRunner + +from digital_land.cli import cli + +test_collection = "conservation-area" +test_dataset = "conservation-area" + + +@pytest.fixture(scope="session") +def session_tmp_path(): + with TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + +@pytest.fixture +def input_paths(): + input_paths = [] + directory = f"tests/data/{test_collection}/transformed/{test_dataset}/" + for root, dirs, files in os.walk(directory): + for file in files: + full_path = os.path.join(root, file) + input_paths.append(full_path) + + return input_paths + + +@pytest.fixture +def organisation_path(): + """ + build an organisations dataset to use + """ + orgs_path = f"tests/data/{test_collection}/organisation.csv" + return orgs_path + + +@pytest.fixture +def cache_path(session_tmp_path): + cache_path = session_tmp_path / "var" / "cache" + os.makedirs(cache_path, exist_ok=True) + return cache_path + + +@pytest.fixture +def dataset_dir(session_tmp_path): + dataset_dir = session_tmp_path / "dataset" + os.makedirs(dataset_dir, exist_ok=True) + return dataset_dir + + +@pytest.fixture +def issue_dir(session_tmp_path): + issue_dir = session_tmp_path / "issue" + os.makedirs(issue_dir, exist_ok=True) + return issue_dir + + +@pytest.fixture +def resource_path(session_tmp_path): + resource_path = session_tmp_path / "resource.csv" + columns = ["resource", "end-date"] + with open(resource_path, "w") as f: + f.write(",".join(columns) + "\n") + return resource_path + + +def test_acceptance_dataset_create( + session_tmp_path, + organisation_path, + input_paths, + issue_dir, + cache_path, + dataset_dir, + resource_path, +): + output_path = dataset_dir / f"{test_dataset}.sqlite3" + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "--dataset", + 
str(test_dataset), + "--pipeline-dir", + str(f"tests/data/{test_collection}/pipeline"), + "dataset-create", + "--output-path", + str(output_path), + "--organisation-path", + str(organisation_path), + "--column-field-dir", + str(f"tests/data/{test_collection}/var/column-field"), + "--dataset-resource-dir", + str(f"tests/data/{test_collection}/var/dataset-resource"), + "--issue-dir", + str(issue_dir), + "--cache-dir", + str(cache_path), + "--resource-path", + str(resource_path), + ] + + input_paths, + catch_exceptions=False, + ) + + # Check that the command exits with status code 0 (success) + if result.exit_code != 0: + # Print the command output if the test fails + print("Command failed with exit code:", result.exit_code) + print("Command output:") + print(result.output) + print("Command error output:") + print(result.exception) + + assert result.exit_code == 0, "error returned when building dataset" + pq_cache = os.path.join(cache_path, test_dataset) + pq_files = [file for file in os.listdir(pq_cache) if file.endswith(".parquet")] + assert len(pq_files) == 3, "Not all parquet files created" + assert np.all( + np.sort(pq_files) == ["entity.parquet", "fact.parquet", "fact_resource.parquet"] + ), "parquet file names not correct" + + # Check the sqlite file was created + assert os.path.exists(output_path), f"sqlite file {output_path} does not exists" + + conn = sqlite3.connect(output_path) + cursor = conn.cursor() + tables = cursor.execute( + "SELECT name FROM sqlite_master WHERE type='table';" + ).fetchall() + expected_tables = {"fact", "fact_resource", "entity"} + actual_tables = {table[0] for table in tables} + missing_tables = expected_tables - actual_tables + assert ( + len(missing_tables) == 0 + ), f"Missing following tables in sqlite database: {missing_tables}" + + for table in list(expected_tables): + pq_rows = len(pd.read_parquet(f"{pq_cache}/{table}.parquet")) + sql_rows = cursor.execute(f"SELECT COUNT(*) FROM {table};").fetchone()[0] + assert ( + pq_rows == 
sql_rows + ), f"Different rows between the parquet files and database table for {table}" diff --git a/tests/integration/package/conftest.py b/tests/integration/package/conftest.py new file mode 100644 index 000000000..408c3de1b --- /dev/null +++ b/tests/integration/package/conftest.py @@ -0,0 +1,34 @@ +import pytest +import os + +from urllib.request import urlretrieve + + +@pytest.fixture(scope="session") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + source_url = "https://raw.githubusercontent.com/digital-land/" + specification_csvs = [ + "attribution.csv", + "licence.csv", + "typology.csv", + "theme.csv", + "collection.csv", + "dataset.csv", + "dataset-field.csv", + "field.csv", + "datatype.csv", + "prefix.csv", + # deprecated .. + "pipeline.csv", + "dataset-schema.csv", + "schema.csv", + "schema-field.csv", + ] + for specification_csv in specification_csvs: + urlretrieve( + f"{source_url}/specification/main/specification/{specification_csv}", + os.path.join(specification_dir, specification_csv), + ) + + return specification_dir diff --git a/tests/integration/package/test_dataset.py b/tests/integration/package/test_dataset.py index a24c80063..96c3a7add 100644 --- a/tests/integration/package/test_dataset.py +++ b/tests/integration/package/test_dataset.py @@ -57,36 +57,6 @@ def transformed_fact_resources_with_blank(): return input_data -@pytest.fixture(scope="session") -def specification_dir(tmp_path_factory): - specification_dir = tmp_path_factory.mktemp("specification") - source_url = "https://raw.githubusercontent.com/digital-land/" - specification_csvs = [ - "attribution.csv", - "licence.csv", - "typology.csv", - "theme.csv", - "collection.csv", - "dataset.csv", - "dataset-field.csv", - "field.csv", - "datatype.csv", - "prefix.csv", - # deprecated .. 
- "pipeline.csv", - "dataset-schema.csv", - "schema.csv", - "schema-field.csv", - ] - for specification_csv in specification_csvs: - urllib.request.urlretrieve( - f"{source_url}/specification/main/specification/{specification_csv}", - os.path.join(specification_dir, specification_csv), - ) - - return specification_dir - - @pytest.fixture def organisation_csv(tmp_path): organisation_path = os.path.join(tmp_path, "organisation.csv") diff --git a/tests/integration/package/test_dataset_parquet.py b/tests/integration/package/test_dataset_parquet.py new file mode 100644 index 000000000..7318b979e --- /dev/null +++ b/tests/integration/package/test_dataset_parquet.py @@ -0,0 +1,772 @@ +import sqlite3 +import numpy as np +import pandas as pd +import logging +import pytest +import os +import json +import pyarrow.parquet as pq +import pyarrow as pa +from digital_land.package.dataset_parquet import DatasetParquetPackage + + +class MockOrganisation(object): + def __init__(self, organisation_path): + self.organisation_path = organisation_path + + +@pytest.fixture +def org_path(tmp_path): + org_path = tmp_path / "organisation.csv" + columns = ["organisation", "entity"] + # Test data for the tables. 
This checks that 'field' get pivoted + data = [ + ["local-authority:AAA", "1"], + ["local-authority:BBB", "2"], + ["local-authority:CCC", "3"], + ["local-authority:DDD", "4"], + ["local-authority:EEE", "5"], + ["local-authority:FFF", "6"], + ["local-authority:GGG", "7"], + ["local-authority:HHH", "8"], + ["local-authority:III", "9"], + ] + with open(org_path, "w") as f: + f.write(",".join(columns) + "\n") + for row in data: + f.write(",".join(map(str, row)) + "\n") + return org_path + + +# # Fixture to create a shared temporary directory +# @pytest.fixture(scope="session") +# def temp_dir(tmpdir_factory): +# temp_dir = tmpdir_factory.mktemp("shared_session_temp_dir") +# yield temp_dir + + +@pytest.fixture +def resource_path(tmp_path): + resource_path = tmp_path / "resource.csv" + resource_columns = ["resource", "end-date"] + with open(resource_path, "w") as f: + f.write(",".join(resource_columns) + "\n") + + return resource_path + + +# general use file to use for testing should focus on splitting down into individual test cases +test_geometry = "MULTIPOLYGON(((-0.49901924 53.81622,-0.5177418 53.76114,-0.4268378 53.78454,-0.49901924 53.81622)))" +transformed_1_data = { + "end_date": [np.nan] * 16, + "entity": [11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12], + "entry_date": [ + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + ], + "entry_number": [2] * 16, + "fact": [ + "abcdef1", + "abcdef2", + "abcdef3", + "abcdef4", + "abcdef5", + "abcdef6", + "abc1231", + "abc1232", + "abc1233", + "def4561", + "def4562", + "def4563", + "a1b2c31", + "a1b2c32", + "a1b2c33", + "a1b2c34", + ], + "field": [ + "entry-date", + "geometry", + "point", + "document-url", + "organisation", + "entry-date", + "geometry", + "organisation", + "entry-date", + "geometry", + 
"organisation", + "entry-date", + "geomtry", + "document-url", + "notes-checking", + "organisation", + ], + "priority": [2] * 16, + "reference_entity": [np.nan] * 16, + "resource": [ + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "yxwvut", + "yxwvut", + "zyxwvu", + "xwvuts", + "xwvuts", + "zyxwvu", + "wvutsr", + "wvutsr", + "wvutsr", + "wvutsr", + ], + "start_date": [np.nan] * 16, + "value": [ + "2023-01-01", + f"{test_geometry}", + '"POINT(-0.481 53.788)"', + "https://www.test.xyz", + "organisation:AAA", + "2023-01-01", + f"{test_geometry}", + "local-authority:BBB", + "2023-01-01", + f"{test_geometry}", + "local-authority:CCC", + "2023-01-01", + f"{test_geometry}", + "https://www.testing.yyz", + "Something random", + "local-authority:DDD", + ], +} + +transformed_2_data = { + "end_date": [np.nan] * 19, # 19 records + "entity": [ + 110, + 110, + 110, + 111, + 111, + 111, + 112, + 112, + 112, + 113, + 113, + 113, + 114, + 114, + 114, + 115, + 115, + 115, + 116, + ], + "entry_date": [ + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-04-01", + "2023-05-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + ], + "entry_number": [2, 2, 2, 2, 2, 2, 2, 12, 12, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + "fact": [ + "badcfe1", + "badcfe2", + "badcfe3", + "fedcba1", + "fedcba2", + "fedcba3", + "bcdefg1", + "bcdefg2", + "bcdefg3", + "cdefgh1", + "hgfedc1", + "cdefgh2", + "efghij1", + "efghij2", + "efghij3", + "defghi1", + "defghi2", + "defghi3", + "ihgfed1", + ], + "field": [ + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + "entry-date", + "entry-date", + "organisation", + 
"entry-date", + ], + "priority": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2], + "reference_entity": [np.nan] * 19, # 19 records + "resource": [ + "zyx123", + "zyx123", + "zyx123", + "zxy123", + "zxy123", + "zxy123", + "yxw456", + "yxw456", + "yxw456", + "xwv789", + "xwv789", + "xwv789", + "xyz123", + "xyz123", + "xyz123", + "uvw456", + "wvu654", + "uvw456", + "rta357", + ], + "start_date": [np.nan] * 19, # 19 records + "value": [ + "2023-01-01", + "2023-01-01", + "local-authority:DDD", + "2023-01-01", + "2023-02-01", + "local-authority:EEE", + "2023-02-01", + "2023-02-01", + "local-authority:FFF", + "2023-01-01", + "2023-01-01", + "local-authority:GGG", + "2023-04-01", + "2023-05-01", + "local-authority:HHH", + "2023-01-01", + "2023-01-01", + "local-authority:III", + "2023-01-01", + ], +} + + +@pytest.fixture +def dataset_sqlite_path(tmp_path): + """ + Should consider using a test spec to feed in to a dataset package instead, also functionality might need to be moved + """ + sqlite_path = tmp_path / "conservation-area.sqlite3" + conn = sqlite3.connect(sqlite_path) + conn.execute( + """ + CREATE TABLE entity( + dataset TEXT, + end_date TEXT, + entity INTEGER PRIMARY KEY, + entry_date TEXT, + geojson JSON, + geometry TEXT, + json JSON, + name TEXT, + organisation_entity TEXT, + point TEXT, + prefix TEXT, + reference TEXT, + start_date TEXT, + typology TEXT + ); + """ + ) + conn.execute( + """ + CREATE TABLE fact( + end_date TEXT, + entity INTEGER, + fact TEXT PRIMARY KEY, + field TEXT, + entry_date TEXT, + priority INTEGER, + reference_entity TEXT, + start_date TEXT, + value TEXT, + FOREIGN KEY(entity) REFERENCES entity(entity) + ); + """ + ) + conn.execute( + """ + CREATE TABLE fact_resource( + end_date TEXT, + fact TEXT, + entry_date TEXT, + entry_number INTEGER, + priority INTEGER, + resource TEXT, + start_date TEXT, + FOREIGN KEY(fact) REFERENCES fact(fact) + ); + """ + ) + + conn.commit() + conn.close() + + return sqlite_path + + 
+@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)])
+def test_load_facts_single_file(data: dict, expected: int, tmp_path):
+    """
+    tests loading from a directory when there is a single file, multiple files
+    make very little difference to duckdb so use to test out individual cases
+    """
+    # convert data to df and save to a file
+    df = pd.DataFrame.from_dict(data)
+    transformed_parquet_dir = tmp_path / "transformed"
+    transformed_parquet_dir.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False)
+
+    # instantiate package
+    package = DatasetParquetPackage(
+        dataset="conservation-area",
+        path=tmp_path / "conservation-area",
+        specification_dir=None,
+    )
+
+    # this method is explicitly designed to load facts from the temp table
+    # however it shouldn't need this, it's duplicating all of the same data in a temporary space
+    # we should try leveraging the power of duckdb and parquet.
+    package.load_facts(transformed_parquet_dir=transformed_parquet_dir)
+
+    output_file = (
+        tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet"
+    )
+    assert os.path.exists(output_file), "fact.parquet file does not exist"
+
+    df = pd.read_parquet(output_file)
+
+    assert len(df) > 0, "No data in fact.parquet file"
+    assert (
+        len(df) == expected
+    ), "No. 
of facts does not match expected" # No of unique facts + assert df.shape[1] == 9, "Not all columns saved in fact.parquet file" + + +@pytest.mark.parametrize( + "data1,data2,expected", [(transformed_1_data, transformed_2_data, 35)] +) +def test_load_facts_multiple_files(data1, data2, expected, tmp_path): + """ + test loading multiple files into the fact table when they're from a single directory + """ + # convert data to df and save to a file + df1 = pd.DataFrame.from_dict(data1) + df2 = pd.DataFrame.from_dict(data2) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df1.to_parquet( + transformed_parquet_dir / "transformed_resource_1.parquet", index=False + ) + df2.to_parquet( + transformed_parquet_dir / "transformed_resource_2.parquet", index=False + ) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + + package.load_facts(transformed_parquet_dir=transformed_parquet_dir) + + output_file = ( + tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + ) + assert os.path.exists(output_file), "fact.parquet file does not exist" + + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact.parquet file" + assert ( + len(df) == expected + ), "No. 
of facts does not match expected" # No of unique facts + assert df.shape[1] == 9, "Not all columns saved in fact.parquet file" + + +@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)]) +def test_load_facts_one_file_with_empty_file(data, expected, tmp_path): + """ + test loading one file into the fact table alongside an empty file + """ + + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + schema = pa.schema( + [ + ("end_date", pa.string()), + ("entity", pa.int64()), + ("entry_date", pa.string()), + ("entry_number", pa.int64()), + ("fact", pa.string()), + ("field", pa.string()), + ("priority", pa.int64()), + ("reference_entity", pa.int64()), + ("resource", pa.string()), + ("start_date", pa.string()), + ("value", pa.string()), + ] + ) + empty_arrays = [pa.array([], type=field.type) for field in schema] + empty_table = pa.Table.from_arrays(empty_arrays, schema=schema) + pq.write_table(empty_table, transformed_parquet_dir / "empty.parquet") + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + + package.load_facts(transformed_parquet_dir=transformed_parquet_dir) + + output_file = ( + tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + ) + assert os.path.exists(output_file), "fact.parquet file does not exist" + + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact.parquet file" + assert ( + len(df) == expected + ), "No. 
of facts does not match expected" # No of unique facts + assert df.shape[1] == 9, "Not all columns saved in fact.parquet file" + + +@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)]) +def test_load_fact_resource_single_file(data, expected, tmp_path): + + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + package.load_fact_resource(transformed_parquet_dir) + + # Check if the output parquet file exists and verify contents + output_file = ( + tmp_path + / "conservation-area" + / "dataset=conservation-area" + / "fact_resource.parquet" + ) + assert os.path.exists(output_file), "fact-resource.parquet file does not exist" + + # Load Parquet into a DataFrame to verify data correctness + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact-resource,parquet file" + assert len(df) == expected, "Not all data saved in fact-resource.parquet file" + + assert df.shape[1] == 7, "Not all columns saved in fact-resource.parquet file" + + +@pytest.mark.parametrize( + "data_1,data_2,expected", [(transformed_1_data, transformed_2_data, 35)] +) +def test_load_fact_resource_two_filea(data_1, data_2, expected, tmp_path): + df_1 = pd.DataFrame.from_dict(data_1) + df_2 = pd.DataFrame.from_dict(data_2) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df_1.to_parquet( + transformed_parquet_dir / "transformed_resource_1.parquet", index=False + ) + df_2.to_parquet( + transformed_parquet_dir / "transformed_resource_2.parquet", index=False + ) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + 
) + package.load_fact_resource(transformed_parquet_dir) + + # Check if the output parquet file exists and verify contents + output_file = ( + tmp_path + / "conservation-area" + / "dataset=conservation-area" + / "fact_resource.parquet" + ) + assert os.path.exists(output_file), "fact-resource.parquet file does not exist" + + # Load Parquet into a DataFrame to verify data correctness + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact-resource,parquet file" + assert len(df) == expected, "Not all data saved in fact-resource.parquet file" + + assert df.shape[1] == 7, "Not all columns saved in fact-resource.parquet file" + + +@pytest.mark.parametrize("data,expected", [(transformed_1_data, 16)]) +def test_load_fact_resource_empty_file_with_another(data, expected, tmp_path): + + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + # create empty file + schema = pa.schema( + [ + ("end_date", pa.string()), + ("entity", pa.int64()), + ("entry_date", pa.string()), + ("entry_number", pa.int64()), + ("fact", pa.string()), + ("field", pa.string()), + ("priority", pa.int64()), + ("reference_entity", pa.int64()), + ("resource", pa.string()), + ("start_date", pa.string()), + ("value", pa.string()), + ] + ) + empty_arrays = [pa.array([], type=field.type) for field in schema] + empty_table = pa.Table.from_arrays(empty_arrays, schema=schema) + pq.write_table(empty_table, transformed_parquet_dir / "empty.parquet") + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + package.load_fact_resource(transformed_parquet_dir) + + # Check if the output parquet file exists and verify contents + output_file = ( + tmp_path + / "conservation-area" + / "dataset=conservation-area" + / "fact_resource.parquet" + ) + 
assert os.path.exists(output_file), "fact-resource.parquet file does not exist" + + # Load Parquet into a DataFrame to verify data correctness + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in fact-resource,parquet file" + assert len(df) == expected, "Not all data saved in fact-resource.parquet file" + + assert df.shape[1] == 7, "Not all columns saved in fact-resource.parquet file" + + +@pytest.mark.parametrize( + "data,expected_count,expected_props", + [(transformed_1_data, 2, {11: {"end_date": ""}})], +) +def test_load_entities_single_file( + data, expected_count, expected_props, tmp_path, org_path, resource_path +): + # Create dummy organisation.csv file for use in 'load_entities' + # Test data for the tables. This checks that 'field' get pivoted + df = pd.DataFrame.from_dict(data) + transformed_parquet_dir = tmp_path / "transformed" + transformed_parquet_dir.mkdir(parents=True, exist_ok=True) + df.to_parquet(transformed_parquet_dir / "transformed_resouce.parquet", index=False) + + package = DatasetParquetPackage( + dataset="conservation-area", + path=tmp_path / "conservation-area", + specification_dir=None, + ) + package.load_entities(transformed_parquet_dir, resource_path, org_path) + + output_file = ( + tmp_path / "conservation-area" / "dataset=conservation-area" / "entity.parquet" + ) + assert os.path.exists(output_file), "entity.parquet file does not exist" + + df = pd.read_parquet(output_file) + + assert len(df) > 0, "No data in entity.parquet file" + assert len(df) == expected_count, "No. 
of entities is not correct" + assert df["entity"].nunique() == len(df), "Entity column contains duplicate values" + + for entity in expected_props: + for key, value in expected_props[entity].items(): + logging.info(f"entity={entity}, key={key}, value={value}") + assert ( + df[df["entity"] == entity][key].iloc[0] == value + ), f"Expected {key} to be {value} for entity {entity}" + + +# not great test as have to feed so much in, would be better to test each table loading at a time +@pytest.mark.parametrize( + "fact_data,fact_resource_data,entity_data", + [ + ( + { + "fact": [""], + "end_date": [1], + "entity": [1], + "field": [""], + "entry_date": [""], + "priority": [1], + "reference_entity": [""], + "start_date": [1], + "value": [""], + }, + { + "end_date": [""], + "fact": [1], + "entry_date": [""], + "entry_number": [1], + "priority": [1], + "resource": [""], + "start_date": [1], + }, + { + "entity": [1], + "dataset": ["conservation-area"], + "end_date": [""], + "entry_date": [""], + "geojson": [""], + "geometry": [""], + "json": [""], + "name": [""], + "organisation_entity": [""], + "point": [""], + "prefix": [""], + "reference": [""], + "start_date": [""], + "typology": [""], + }, + ) + ], +) +def test_load_pq_to_sqlite_basic( + fact_data, fact_resource_data, entity_data, dataset_sqlite_path, tmp_path +): + + dataset_parquet_path = tmp_path / "dataset" + (dataset_parquet_path / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + # write data to parquet files in the dataset path + fact_df = pd.DataFrame.from_dict(fact_data) + fact_resource_df = pd.DataFrame.from_dict(fact_resource_data) + entity_df = pd.DataFrame.from_dict(entity_data) + + fact_df.to_parquet( + dataset_parquet_path / "dataset=conservation-area" / "fact.parquet", index=False + ) + fact_resource_df.to_parquet( + dataset_parquet_path / "dataset=conservation-area" / "fact_resource.parquet", + index=False, + ) + entity_df.to_parquet( + dataset_parquet_path / 
"dataset=conservation-area" / "entity.parquet",
+        index=False,
+    )
+
+    output_path = dataset_sqlite_path
+
+    package = DatasetParquetPackage(
+        dataset="conservation-area",
+        path=tmp_path / "dataset",
+        specification_dir=None,
+    )
+
+    package.load_to_sqlite(output_path)
+
+    assert os.path.exists(dataset_sqlite_path), "sqlite3 file does not exist"
+
+    cnx = sqlite3.connect(output_path)
+    df_sql = pd.read_sql_query("SELECT * FROM fact_resource", cnx)
+    assert len(df_sql) > 0, "No data in fact_resource table"
+    assert len(df_sql) == len(
+        fact_resource_df
+    ), "Not all data saved in fact_resource table"
+    assert np.all(
+        len(df_sql["end_date"] == 0)  # BUG(review): len() of the boolean Series is the row count, so np.all(...) is vacuously true; presumably meant (df_sql["end_date"] == "").all() -- confirm stored values first
+    ), "Non-empty strings in end_date from fact_resource table"
+
+    df_sql = pd.read_sql_query("SELECT * FROM fact", cnx)
+    assert len(df_sql) > 0, "No data in fact table"
+    assert len(df_sql) == len(fact_df), "Not all data saved in fact table"
+    assert np.all(
+        len(df_sql["end_date"] == 0)  # BUG(review): vacuous for the same reason; note fact_data["end_date"] is [1], so a real emptiness check would fail -- fix the fixture data and this assertion together
+    ), "Non-empty strings in end_date from fact table"
+
+    df_sql = pd.read_sql_query("SELECT * FROM entity", cnx)
+    assert len(df_sql) > 0, "No data in entity table"
+    assert len(df_sql) == len(entity_df), "Not all data saved in entity table"
+    assert np.any(
+        len(df_sql["geometry"] == 0)  # BUG(review): vacuous -- len() is the row count; likely meant (df_sql["geometry"] == "").any()
+    ), "All geometries from entity table have values"
+    assert np.any(
+        len(df_sql["geometry"] == 0)  # NOTE(review): exact duplicate of the assertion above (only the failure message differs) -- one of the two should check something else or be removed
+    ), "All geometries from entity table have non-blank values"
+    assert not any(
+        [
+            (
+                any("_" in key for key in json.loads(row).keys())
+                if isinstance(row, str)
+                else False
+            )
+            for row in df_sql["json"]
+            if row != ""
+        ]
+    ), "Some json object have underscores in their 'keys'"
+
+    cnx.close()
diff --git a/tests/integration/pipeline/test_process.py b/tests/integration/pipeline/test_process.py
new file mode 100644
index 000000000..4f0e24932
--- /dev/null
+++ b/tests/integration/pipeline/test_process.py
@@ -0,0 +1,54 @@
+import pandas as pd
+
+from digital_land.pipeline.process import convert_tranformed_csv_to_pq
+
+
+def 
test_convert_transformed_csv_to_pq_converts_csv(tmp_path): + # creat csv that looks like a transformed csv + data = { + "end-date": [""], + "entity": [4220000], + "entry-date": ["2024-10-02"], + "entry-number": [1], + "fact": ["1be8ef923db61d62354f041718ea0b1795c5ae60b436ec74e90d9fd850919434"], + "field": ["name"], + "priority": [2], + "reference-entity": [""], + "resource": [ + "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7" + ], + "start-date": [""], + "value": "Arun District Council Local Plan 2011 - 2031", + } + df = pd.DataFrame(data) + data_path = ( + tmp_path + / "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7.csv" + ) + df.to_csv(data_path) + + # use process on it + output_path = ( + tmp_path + / "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7.parquet" + ) + convert_tranformed_csv_to_pq( + data_path, + tmp_path + / "0d1f06295866286d290d831b4569fe862ab38ca72cd23d541de2c9f20ff44ed7.parquet", + ) + + # check resulting parquet file for: + assert ( + output_path.exists() + ), f"no parquet file created as expected at {str(output_path)}" + # headers and number of rows + parquet_df = pd.read_parquet(output_path) + for col in list(parquet_df.columns): + assert "-" not in col + + for col in list(df.columns): + assert col.replace("-", "_") in list(parquet_df.columns) + + +# check column types From 5a621aeb4984848a46601271dc74a5d9b9ee0584 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:20:35 +0000 Subject: [PATCH 09/51] make acceptance test work --- digital_land/cli.py | 16 + digital_land/commands.py | 45 +- digital_land/pipeline.py | 556 ------------------------ tests/acceptance/test_dataset_create.py | 230 +++++++++- 4 files changed, 250 insertions(+), 597 deletions(-) delete mode 100644 digital_land/pipeline.py diff --git a/digital_land/cli.py b/digital_land/cli.py index aae62b52f..0a08af6a6 100644 --- a/digital_land/cli.py +++ 
b/digital_land/cli.py @@ -139,6 +139,18 @@ def convert_cmd(input_path, output_path): @column_field_dir @dataset_resource_dir @issue_dir +@click.option( + "--cache-dir", + type=click.Path(), + default="var/cache", + help="link to a cache directory to store temporary data that can be deleted once process is finished", +) +@click.option( + "--resource-path", + type=click.Path(exists=True), + default="collection/resource.csv", + help="link to where the resource list is stored", +) @click.argument("input-paths", nargs=-1, type=click.Path(exists=True)) @click.pass_context def dataset_create_cmd( @@ -149,6 +161,8 @@ def dataset_create_cmd( column_field_dir, dataset_resource_dir, issue_dir, + cache_dir, + resource_path, ): return dataset_create( input_paths=input_paths, @@ -160,6 +174,8 @@ def dataset_create_cmd( column_field_dir=column_field_dir, dataset_resource_dir=dataset_resource_dir, issue_dir=issue_dir, + cache_dir=cache_dir, + resource_path=resource_path, ) diff --git a/digital_land/commands.py b/digital_land/commands.py index 8a49e7153..1fd065bcb 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -367,23 +367,32 @@ def dataset_create( issue_dir="issue", column_field_dir="var/column-field", dataset_resource_dir="var/dataset-resource", - cache_dir="var/cache/parquet", + cache_dir="var/cache", resource_path="collection/resource.csv", ): - # directories to be overwritten - transformed_parquet_dir = f"var/cache/transformed_parquet/{dataset}" - cache_dir = os.path.join(cache_dir, dataset) + # chek all paths are paths + issue_dir = Path(issue_dir) + column_field_dir = Path(column_field_dir) + dataset_resource_dir = Path(dataset_resource_dir) + cache_dir = Path(cache_dir) + resource_path = Path(resource_path) + + # get the transformed files from the cache directory this is assumed right now but we may want to be stricter in the future + transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset + + # creat directory for 
dataset_parquet_package + dataset_parquet_path = cache_dir / dataset if not output_path: print("missing output path", file=sys.stderr) sys.exit(2) # Set up initial objects - column_field_dir = Path(column_field_dir) - dataset_resource_dir = Path(dataset_resource_dir) organisation = Organisation( organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) ) + + # create sqlite dataset packageas before and load inn data that isn't in the parquetpackage yet package = DatasetPackage( dataset, organisation=organisation, @@ -393,15 +402,17 @@ def dataset_create( package.create() for path in input_paths: path_obj = Path(path) - package.load_column_fields(column_field_dir / dataset / path_obj.name) - package.load_dataset_resource(dataset_resource_dir / dataset / path_obj.name) + package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + package.load_dataset_resource( + dataset_resource_dir / dataset / f"{path_obj.stem}.csv" + ) - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): + old_entity_path = Path(pipeline.path) / "old-entity.csv" + if old_entity_path.exists(): package.load_old_entities(old_entity_path) - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): + issue_paths = issue_dir / dataset + if issue_paths.exists(): for issue_path in os.listdir(issue_paths): package.load_issues(os.path.join(issue_paths, issue_path)) else: @@ -416,18 +427,14 @@ def dataset_create( pqpackage = DatasetParquetPackage( dataset, - organisation=organisation, - path=output_path, - cache_dir=cache_dir, - resource_path=resource_path, + path=dataset_parquet_path, specification_dir=None, # TBD: package should use this specification object ) # pqpackage.create_temp_table(input_paths) pqpackage.load_facts(transformed_parquet_dir) pqpackage.load_fact_resource(transformed_parquet_dir) - pqpackage.load_entities() - pqpackage.pq_to_sqlite() - pqpackage.close_conn() + 
pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) + pqpackage.load_to_sqlite(output_path) def dataset_dump(input_path, output_path): diff --git a/digital_land/pipeline.py b/digital_land/pipeline.py deleted file mode 100644 index 11749571e..000000000 --- a/digital_land/pipeline.py +++ /dev/null @@ -1,556 +0,0 @@ -import os -import csv -import functools -import importlib.util -import logging -from pathlib import Path - -from .phase.map import normalise -from .phase.lookup import key as lookup_key -from .schema import Schema - - -def chain_phases(phases): - def add(f, g): - return lambda x: g.process(f(x)) - - return functools.reduce(add, phases, lambda phase: phase) - - -def run_pipeline(*args): - logging.debug(f"run_pipeline {args}") - chain = chain_phases([arg for arg in args if arg]) - - stream = chain(None) - for row in stream: - pass - - -# TODO should we remove loading from init? it makes it harder to test -# and what if you only wanted to load specific files -# TODO replace with config models which load is handled by them -class Pipeline: - def __init__(self, path, dataset): - self.dataset = dataset - self.name = dataset - self.path = path - self.column = {} - self.filter = {} - self.skip_pattern = {} - self.patch = {} - self.default_field = {} - self.default_value = {} - self.combine_field = {} - self.concat = {} - self.migrate = {} - self.lookup = {} - self.redirect_lookup = {} - - self.load_column() - self.load_skip_patterns() - self.load_patch() - self.load_default_fields() - self.load_default_values() - self.load_concat() - self.load_combine_fields() - self.load_migrate() - self.load_lookup() - self.load_redirect_lookup() - self.load_filter() - - def file_reader(self, filename): - # read a file from the pipeline path, ignore if missing - path = os.path.join(self.path, filename) - if not os.path.isfile(path): - return [] - logging.debug(f"load {path}") - return csv.DictReader(open(path)) - - def reader(self, filename): - 
for row in self.file_reader(filename): - row["dataset"] = row.get("dataset", "") or row.get("pipeline", "") - if row["dataset"] and row["dataset"] != self.name: - continue - yield row - - def load_column(self): - for row in self.reader("column.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.column.setdefault(resource, {}) - elif endpoint: - record = self.column.setdefault(endpoint, {}) - else: - record = self.column.setdefault("", {}) - - # migrate column.csv - row["column"] = row.get("column", "") or row["pattern"] - row["field"] = row.get("field", "") or row["value"] - - record[normalise(row["column"])] = row["field"] - - def load_filter(self): - for row in self.reader("filter.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.filter.setdefault(resource, {}) - elif endpoint: - record = self.filter.setdefault(endpoint, {}) - else: - record = self.filter.setdefault("", {}) - - record[row["field"]] = row["pattern"] - - def load_skip_patterns(self): - for row in self.reader("skip.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.skip_pattern.setdefault(resource, []) - elif endpoint: - record = self.skip_pattern.setdefault(endpoint, []) - else: - record = self.skip_pattern.setdefault("", []) - - record.append(row["pattern"]) - - def load_patch(self): - for row in self.reader("patch.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.patch.setdefault(resource, {}) - elif endpoint: - record = self.patch.setdefault(endpoint, {}) - else: - record = self.patch.setdefault("", {}) - - row["field"] = row.get("field", "") - row["pattern"] = row.get("pattern", "") - - record = record.setdefault(row["field"], {}) - record[row["pattern"]] = row["value"] - - def load_default_fields(self): - # TBD: rename default-field.csv - for 
row in self.reader("default.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.default_field.setdefault(resource, {}) - elif endpoint: - record = self.default_field.setdefault(endpoint, {}) - else: - record = self.default_field.setdefault("", {}) - - record[row["field"]] = row["default-field"] - - def load_default_values(self): - for row in self.reader("default-value.csv"): - record = self.default_value.setdefault(row.get("endpoint", ""), {}) - record[row["field"]] = row["value"] - - def load_combine_fields(self): - for row in self.reader("combine.csv"): - record = self.combine_field.setdefault(row.get("endpoint", ""), {}) - record[row["field"]] = row["separator"] - - def load_concat(self): - for row in self.reader("concat.csv"): - resource = row.get("resource", "") - endpoint = row.get("endpoint", "") - - if resource: - record = self.concat.setdefault(resource, {}) - elif endpoint: - record = self.concat.setdefault(endpoint, {}) - else: - record = self.concat.setdefault("", {}) - - # record = self.concat.setdefault(row["resource"], {}) - record[row["field"]] = { - "fields": row["fields"].split(";"), - "separator": row["separator"], - "prepend": row.get("prepend", ""), - "append": row.get("append", ""), - } - - # TBD: remove this table, should come from specification replacement-field - def load_migrate(self): - for row in self.reader("transform.csv"): - if row["replacement-field"] == "": - continue - - if row["replacement-field"] in self.migrate: - raise ValueError( - "replacement-field %s has more than one entry" - % row["replacement-field"] - ) - - self.migrate[row["replacement-field"]] = row["field"] - - def load_lookup(self): - for row in self.file_reader("lookup.csv"): - # migrate old lookup.csv files - entry_number = row.get("entry-number", "") - prefix = ( - row.get("prefix", "") - or row.get("dataset", "") - or row.get("pipeline", "") - ) - reference = row.get("reference", "") or 
row.get("value", "") - - # composite key, ordered by specificity - resource_lookup = self.lookup.setdefault(row.get("resource", ""), {}) - resource_lookup[ - lookup_key( - entry_number=entry_number, - prefix=prefix, - reference=reference, - ) - ] = row["entity"] - - organisation = row.get("organisation", "") - # replace local-authority-eng while we migrate - organisation = organisation.replace( - "local-authority-eng", "local-authority" - ) - resource_lookup[ - lookup_key( - prefix=prefix, - reference=reference, - organisation=organisation, - ) - ] = row["entity"] - - def load_redirect_lookup(self): - for row in self.file_reader("old-entity.csv"): - old_entity = row.get("old-entity", "") - entity = row.get("entity", "") - status = row.get("status", "") - if old_entity and status: - self.redirect_lookup[old_entity] = {"entity": entity, "status": status} - - def filters(self, resource="", endpoints=[]): - d = self.filter.get("", {}).copy() - - for endpoint in endpoints: - endpoint_filters = self.filter.get(endpoint, {}) - d.update(endpoint_filters) - - if resource: - resource_filters = self.filter.get(resource, {}) - d.update(resource_filters) - - return d - - def columns(self, resource="", endpoints=[]): - general_columns = self.column.get("", {}) - if not resource: - return general_columns - - resource_columns = self.column.get(resource, {}) - endpoint_columns = {} - for endpoint in endpoints: - endpoint_columns = {**endpoint_columns, **self.column.get(endpoint, {})} - - result = {**endpoint_columns, **resource_columns} - - for key in general_columns: - if key in result: - continue - if ( - general_columns[key] in endpoint_columns.values() - or general_columns[key] in resource_columns.values() - ): - continue - result[key] = general_columns[key] - return result - - def skip_patterns(self, resource="", endpoints=[]): - if not resource: - return self.skip_pattern.get("", {}) - endpoint_patterns = [] - for endpoint in endpoints: - 
endpoint_patterns.extend(self.skip_pattern.get(endpoint, [])) - - return ( - self.skip_pattern.get(resource, []) - + self.skip_pattern.get("", []) - + endpoint_patterns - ) - - def patches(self, resource="", endpoints=[]): - general_patch = self.patch.get("", {}) - if not resource: - return general_patch - - resource_patch = self.patch.get(resource, {}) - endpoint_patch = {} - - for endpoint in endpoints: - endpoint_patch = {**endpoint_patch, **self.patch.get(endpoint, {})} - - result = {**endpoint_patch, **resource_patch} - - # Merge any remaining general defaults into the result - for field, patch in general_patch.items(): - if field not in result: - result[field] = patch - else: - result[field] = {**patch, **result[field]} - - return result - - def default_fields(self, resource=None, endpoints=[]): - config = self.default_field - - d = config.get("", {}) - - for key, value in config.get(resource, {}).items(): - d[key] = value - - for endpoint in endpoints: - for key, value in config.get(endpoint, {}).items(): - d[key] = value - return d - - def default_values(self, endpoints=None): - if endpoints is None: - endpoints = [] - config = self.default_value - d = config.get("", {}) - for endpoint in endpoints: - for key, value in config.get(endpoint, {}).items(): - d[key] = value - return d - - def combine_fields(self, endpoints=None): - if endpoints is None: - endpoints = [] - config = self.combine_field - d = config.get("", {}) - for endpoint in endpoints: - for key, value in config.get(endpoint, {}).items(): - d[key] = value - return d - - def concatenations(self, resource=None, endpoints=[]): - result = self.concat.get("", {}) - if resource: - result.update(self.concat.get(resource, {})) - - for endpoint in endpoints: - result.update(self.concat.get(endpoint, {})) - - return result - - def migrations(self): - return self.migrate - - def lookups(self, resource=None): - d = self.lookup.get("", {}) - if resource: - d.update(self.lookup.get(resource, {})) - return d - 
- def redirect_lookups(self): - return self.redirect_lookup - - def get_pipeline_callback(self): - file = os.path.join(self.path, "pipeline-callback.py") - spec = importlib.util.spec_from_file_location("pipeline-callback.py", file) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module.PipelineCallback - - @staticmethod - def compose(phases): - def add(f, g): - return lambda x: g.process(f(x)) - - return functools.reduce(add, phases, lambda phase: phase) - - def run(self, input_path, phases): - logging.debug(f"running {input_path} through {phases}") - chain = self.compose(phases) - for row in chain(input_path): - pass - - -class EntityNumGen: - def __init__(self, entity_num_state: dict = None): - if not entity_num_state: - entity_num_state = { - "range_min": 0, - "range_max": 100, - "current": 0, - } - - self.state = entity_num_state - - def next(self): - current = self.state["current"] - new_current = current + 1 - - if new_current > int(self.state["range_max"]): - new_current = int(self.state["range_min"]) - - if new_current < int(self.state["range_min"]): - new_current = int(self.state["range_min"]) - - self.state["current"] = new_current - - return new_current - - -class Lookups: - def __init__(self, directory=None) -> None: - self.directory = directory or "pipeline" - self.lookups_path = Path(directory) / "lookup.csv" - self.old_entity_path = Path(directory) / "old-entity.csv" - self.entries = [] - self.schema = Schema("lookup") - self.entity_num_gen = EntityNumGen() - - def add_entry(self, entry, is_new_entry=True): - """ - is_new_entry is an addition to allow for backward compatibility. 
- Older lookups may not be valid in accordance with the current - minimal column requirements - :param entry: - :param is_new_entry: - :return: - """ - if is_new_entry: - if not self.validate_entry(entry): - return - - self.entries.append(entry) - - def load_csv(self, lookups_path=None): - """ - load in lookups as df, not when we process pipeline but useful for other analysis - """ - lookups_path = lookups_path or self.lookups_path - reader = csv.DictReader(open(lookups_path, newline="")) - extra_fields = set(reader.fieldnames) - set(self.schema.fieldnames) - - if len(extra_fields): - raise RuntimeError( - f"{len(extra_fields)} extra fields founds in lookup.csv ({','.join(list(extra_fields))})" - ) - - for row in reader: - self.add_entry(row, is_new_entry=False) - - def get_max_entity(self, prefix, specification) -> int: - if len(self.entries) == 0: - return 0 - if not prefix: - return 0 - - dataset_prefix = specification.dataset_prefix(prefix) - try: - ret_val = max( - [ - int(entry["entity"]) - for entry in self.entries - if (entry["prefix"] == prefix or entry["prefix"] == dataset_prefix) - and (entry.get("entity", None)) - ] - ) - return ret_val - except ValueError: - return 0 - - def save_csv(self, lookups_path=None, entries=None, old_entity_path=None): - path = lookups_path or self.lookups_path - - entity_values = [] - if os.path.exists(path): - reader = csv.DictReader(open(path, newline="")) - for row in reader: - entity_values.append(row["entity"]) - - if entries is None: - entries = self.entries - - os.makedirs(os.path.dirname(path), exist_ok=True) - logging.debug("saving %s" % (path)) - f = open(path, "w", newline="") - writer = csv.DictWriter( - f, fieldnames=self.schema.fieldnames, extrasaction="ignore" - ) - writer.writeheader() - - old_entity_file_path = old_entity_path or self.old_entity_path - if os.path.exists(old_entity_file_path): - old_entity_path = self.old_entity_path - reader = csv.DictReader(open(old_entity_file_path, newline="")) - - for row 
in reader: - entity_values.append(row["old-entity"]) - entity_values.append(row["entity"]) - - new_entities = [] - get_entity = None - for idx, entry in enumerate(entries): - if not entry: - continue - else: - if not entry.get("entity"): - while True: - generated_entity = self.entity_num_gen.next() - - if generated_entity == get_entity: - print( - "There are no more entity numbers available within this dataset." - ) - break - - if get_entity is None: - get_entity = generated_entity - - if str(generated_entity) not in entity_values: - entry["entity"] = generated_entity - new_entities.append(entry) - entity_values.append(str(generated_entity)) - writer.writerow(entry) - break - else: - writer.writerow(entry) - return new_entities - - # @staticmethod - def validate_entry(self, entry) -> bool: - # ensures minimum expected fields exist and are not empty strings - expected_fields = ["prefix", "organisation", "reference"] - for field in expected_fields: - if not entry.get(field, ""): - raise ValueError(f"ERROR: expected {field} not found in lookup entry") - - if len(self.entries) > 0: - # check entry does not already exist - existing_entries = len( - [ - 1 - for item in self.entries - if item["prefix"] == entry["prefix"] - and item["organisation"] == entry["organisation"] - and item["reference"] == entry["reference"] - ] - ) - - if existing_entries > 0: - # print(f">>> ERROR: lookup already exists - {entry['organisation']} {entry['reference']}") - return False - - return True diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py index de67328af..cb038925d 100644 --- a/tests/acceptance/test_dataset_create.py +++ b/tests/acceptance/test_dataset_create.py @@ -4,7 +4,7 @@ """ import pytest - +import logging import numpy as np import pandas as pd import os @@ -27,31 +27,188 @@ def session_tmp_path(): @pytest.fixture -def input_paths(): +def cache_path(tmp_path): + cache_path = tmp_path / "var" / "cache" + cache_path.mkdir(parents=True, 
exist_ok=True) + return cache_path + + +test_geometry = "MULTIPOLYGON(((-0.49901924 53.81622,-0.5177418 53.76114,-0.4268378 53.78454,-0.49901924 53.81622)))" +transformed_1_data = { + "end_date": [np.nan] * 16, + "entity": [11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12], + "entry_date": [ + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-01-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + "2023-02-01", + ], + "entry_number": [2] * 16, + "fact": [ + "abcdef1", + "abcdef2", + "abcdef3", + "abcdef4", + "abcdef5", + "abcdef6", + "abc1231", + "abc1232", + "abc1233", + "def4561", + "def4562", + "def4563", + "a1b2c31", + "a1b2c32", + "a1b2c33", + "a1b2c34", + ], + "field": [ + "entry-date", + "geometry", + "point", + "document-url", + "organisation", + "entry-date", + "geometry", + "organisation", + "entry-date", + "geometry", + "organisation", + "entry-date", + "geomtry", + "document-url", + "notes-checking", + "organisation", + ], + "priority": [2] * 16, + "reference_entity": [np.nan] * 16, + "resource": [ + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "zyxwvu", + "yxwvut", + "yxwvut", + "zyxwvu", + "xwvuts", + "xwvuts", + "zyxwvu", + "wvutsr", + "wvutsr", + "wvutsr", + "wvutsr", + ], + "start_date": [np.nan] * 16, + "value": [ + "2023-01-01", + f"{test_geometry}", + '"POINT(-0.481 53.788)"', + "https://www.test.xyz", + "organisation:AAA", + "2023-01-01", + f"{test_geometry}", + "local-authority:BBB", + "2023-01-01", + f"{test_geometry}", + "local-authority:CCC", + "2023-01-01", + f"{test_geometry}", + "https://www.testing.yyz", + "Something random", + "local-authority:DDD", + ], +} + + +@pytest.fixture +def input_paths(cache_path): + data_dicts = {"resource_1": transformed_1_data} input_paths = [] - directory = f"tests/data/{test_collection}/transformed/{test_dataset}/" - for root, dirs, files in 
os.walk(directory): - for file in files: - full_path = os.path.join(root, file) - input_paths.append(full_path) + directory = cache_path / "transformed_parquet" / "conservation-area" + directory.mkdir(parents=True, exist_ok=True) + + for path, data in data_dicts.items(): + data = pd.DataFrame.from_dict(data) + input_path = directory / f"{path}.parquet" + data.to_parquet(input_path, index=False) + logging.error(str(input_path)) + input_paths.append(str(input_path)) return input_paths @pytest.fixture -def organisation_path(): +def organisation_path(tmp_path): """ build an organisations dataset to use """ - orgs_path = f"tests/data/{test_collection}/organisation.csv" + org_data = { + "entity": [101, 102], + "name": ["test", "test_2"], + "prefix": ["local-authority", "local-authority"], + "reference": ["test", "test_2"], + "dataset": ["local-authority", "local-authority"], + "organisation": ["local-authority:test", "local-authority:test_2"], + } + orgs_path = tmp_path / "organisation.csv" + + pd.DataFrame.from_dict(org_data).to_csv(orgs_path, index=False) return orgs_path @pytest.fixture -def cache_path(session_tmp_path): - cache_path = session_tmp_path / "var" / "cache" - os.makedirs(cache_path, exist_ok=True) - return cache_path +def column_field_path(tmp_path): + column_field_dir = tmp_path / "column-field" + dataset_cfd = column_field_dir / "conservation-area" + (dataset_cfd).mkdir(parents=True, exist_ok=True) + data = { + "end_date": [""], + "entry_date": [""], + "field": ["geometry"], + "dataset": ["conservation-area"], + "start_date": [""], + "resource": [""], + "column": ["WKT"], + } + pd.DataFrame.from_dict(data).to_csv(dataset_cfd / "resource_1.csv", index=False) + logging.error(str(dataset_cfd / "resource_1.csv")) + return column_field_dir + + +@pytest.fixture +def dataset_resource_path(tmp_path): + dataset_resource_path = tmp_path / "dataset-resource" + dataset_drd = dataset_resource_path / "conservation-area" + dataset_drd.mkdir(parents=True, 
exist_ok=True) + data = { + "end_date": [""], + "entry_date": [""], + "dataset": ["conservation-area"], + "entity_count": [""], + "entry_count": [1], + "line_count": [1], + "mime_type": [""], + "internal_path": [""], + "internal_mime_type": [""], + "resource": ["resource_1"], + "start_date": [""], + } + pd.DataFrame.from_dict(data).to_csv(dataset_drd / "resource_1.csv", index=False) + return dataset_resource_path @pytest.fixture @@ -85,6 +242,8 @@ def test_acceptance_dataset_create( cache_path, dataset_dir, resource_path, + column_field_path, + dataset_resource_path, ): output_path = dataset_dir / f"{test_dataset}.sqlite3" @@ -102,9 +261,9 @@ def test_acceptance_dataset_create( "--organisation-path", str(organisation_path), "--column-field-dir", - str(f"tests/data/{test_collection}/var/column-field"), + str(column_field_path), "--dataset-resource-dir", - str(f"tests/data/{test_collection}/var/dataset-resource"), + str(dataset_resource_path), "--issue-dir", str(issue_dir), "--cache-dir", @@ -125,13 +284,29 @@ def test_acceptance_dataset_create( print("Command error output:") print(result.exception) + files = [ + str(f.name) + for f in ( + cache_path / "conservation-area" / "dataset=conservation-area" + ).iterdir() + ] + logging.error(files) + for file in ["entity.parquet", "fact.parquet", "fact_resource.parquet"]: + assert file in files, f"file {file} not created. 
files found {', '.join(files)}" assert result.exit_code == 0, "error returned when building dataset" - pq_cache = os.path.join(cache_path, test_dataset) - pq_files = [file for file in os.listdir(pq_cache) if file.endswith(".parquet")] - assert len(pq_files) == 3, "Not all parquet files created" - assert np.all( - np.sort(pq_files) == ["entity.parquet", "fact.parquet", "fact_resource.parquet"] - ), "parquet file names not correct" + + # check that parquet files have been created correctlly in the cache directory + # may want to adjust this for how we structure a parquet package in the future + # also we are using the cache to store this for now but in the future we may want to store it in a specific directory + files = [ + str(f.name) + for f in ( + cache_path / "conservation-area" / "dataset=conservation-area" + ).iterdir() + ] + + for file in ["entity.parquet", "fact.parquet", "fact_resource.parquet"]: + assert file in files, f"file {file} not created. files found {', '.join(files)}" # Check the sqlite file was created assert os.path.exists(output_path), f"sqlite file {output_path} does not exists" @@ -149,8 +324,19 @@ def test_acceptance_dataset_create( ), f"Missing following tables in sqlite database: {missing_tables}" for table in list(expected_tables): - pq_rows = len(pd.read_parquet(f"{pq_cache}/{table}.parquet")) + + pq_rows = len( + pd.read_parquet( + cache_path + / "conservation-area" + / "dataset=conservation-area" + / f"{table}.parquet" + ) + ) + + assert pq_rows > 0, f"parquet file {table} is empty" sql_rows = cursor.execute(f"SELECT COUNT(*) FROM {table};").fetchone()[0] + assert sql_rows > 0, f"database table {table} is empty" assert ( pq_rows == sql_rows ), f"Different rows between the parquet files and database table for {table}" From 60ee6fe6d693f6cf7ac75219b217d5b480467167 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:11:00 +0000 Subject: [PATCH 10/51] ensure no error if all orgs 
are blank --- digital_land/package/dataset_parquet.py | 11 ++++++- tests/acceptance/test_dataset_create.py | 27 +++++++++++++++- .../package/test_dataset_parquet.py | 32 ++++++++++++++++++- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index f97b9a37a..6ed50b23b 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -218,6 +218,9 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat if field not in (distinct_fields + extra_fields) ] + if "organisation" not in distinct_fields: + null_fields.append("organisation") + # select fields - a list of fields which have to be selected directly from the pivoted table # these are entity fields that are not null fields or a few special ones extra_fields = [ @@ -235,6 +238,12 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat fields_to_include = ["entity", "field", "value"] fields_str = ", ".join(fields_to_include) + # create this statement to add a nul org column, this is needed when no entities have an associated organisation + if "organisation" not in distinct_fields: + optional_org_str = ",''::VARCHAR AS \"organisation\"" + else: + optional_org_str = "" + # Take original data, group by entity & field, and order by highest priority then latest record. 
# If there are still matches then pick the first resource (and fact, just to make sure) # changes to make @@ -248,7 +257,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # query to create the file query = f""" - SELECT {fields_str} FROM ( + SELECT {fields_str}{optional_org_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date FROM parquet_scan('{transformed_parquet_dir}/*.parquet') tf LEFT JOIN read_csv_auto('{resource_path}', max_line_size=40000000) resource_csv diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py index cb038925d..1508db528 100644 --- a/tests/acceptance/test_dataset_create.py +++ b/tests/acceptance/test_dataset_create.py @@ -290,7 +290,6 @@ def test_acceptance_dataset_create( cache_path / "conservation-area" / "dataset=conservation-area" ).iterdir() ] - logging.error(files) for file in ["entity.parquet", "fact.parquet", "fact_resource.parquet"]: assert file in files, f"file {file} not created. files found {', '.join(files)}" assert result.exit_code == 0, "error returned when building dataset" @@ -340,3 +339,29 @@ def test_acceptance_dataset_create( assert ( pq_rows == sql_rows ), f"Different rows between the parquet files and database table for {table}" + + # entity table specific tests to check how we expect the data to be used + + # json field checks + # where no json value is present we expect the value to be null. 
not blank or an empty json bracket + # so will ensure these aren't in the results of any test + sql = """ + SELECT * + FROM entity + WHERE json = '{}' + ;""" + + results = cursor.execute(sql).fetchall() + assert ( + len(results) == 0 + ), "there should be no rows where json is an empty json bracket" + + # check no json values are arrays + sql = """ + SELECT * + FROM entity + WHERE json_type(json) NOT IN ('object', NULL) + ;""" + + results = cursor.execute(sql).fetchall() + assert len(results) == 0, "all json values should be objects or null" diff --git a/tests/integration/package/test_dataset_parquet.py b/tests/integration/package/test_dataset_parquet.py index 7318b979e..0f0042958 100644 --- a/tests/integration/package/test_dataset_parquet.py +++ b/tests/integration/package/test_dataset_parquet.py @@ -612,7 +612,37 @@ def test_load_fact_resource_empty_file_with_another(data, expected, tmp_path): @pytest.mark.parametrize( "data,expected_count,expected_props", - [(transformed_1_data, 2, {11: {"end_date": ""}})], + # need to buid an example where organisation is blank + [ + (transformed_1_data, 2, {11: {"end_date": ""}}), + ( + { + "end_date": [np.nan], # 19 records + "entity": [ + 110, + ], + "entry_date": [ + "2023-01-01", + ], + "entry_number": [2], + "fact": [ + "badcfe1", + ], + "field": [ + "entry-date", + ], + "priority": [2], + "reference_entity": [np.nan], # 19 records + "resource": [ + "zyx123", + ], + "start_date": [np.nan], # 19 records + "value": ["2023-01-01"], + }, + 1, + {}, + ), + ], ) def test_load_entities_single_file( data, expected_count, expected_props, tmp_path, org_path, resource_path From 792b9a64e155869813664fb1d2c5b7f1cea21ed4 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sat, 11 Jan 2025 00:37:01 +0000 Subject: [PATCH 11/51] add converted resources to the cache directory --- digital_land/commands.py | 12 ++++++++---- digital_land/phase/convert.py | 18 ++++++++++-------- 2 files changed, 18 
insertions(+), 12 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 1fd065bcb..010c42c38 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -162,17 +162,17 @@ def collection_retire_endpoints_and_sources( # # pipeline commands # -def convert(input_path, output_path, custom_temp_dir=None): +def convert(input_path, output_path): if not output_path: output_path = default_output_path("converted", input_path) dataset_resource_log = DatasetResourceLog() converted_resource_log = ConvertedResourceLog() + # TBD this actualy duplictaes the data and does nothing else, should just convert it? run_pipeline( ConvertPhase( input_path, dataset_resource_log=dataset_resource_log, converted_resource_log=converted_resource_log, - custom_temp_dir=custom_temp_dir, ), DumpPhase(output_path), ) @@ -191,10 +191,11 @@ def pipeline_run( operational_issue_dir="performance/operational_issue/", organisation_path=None, save_harmonised=False, + # TBD save all logs in a log directory, this will mean only one path passed in. 
column_field_dir=None, dataset_resource_dir=None, converted_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" + cache_dir="var/cache", endpoints=[], organisations=[], entry_date="", @@ -202,6 +203,9 @@ def pipeline_run( resource=None, output_log_dir=None, ): + # set up paths + cache_dir = Path(cache_dir) + if resource is None: resource = resource_from_path(input_path) dataset = dataset @@ -265,7 +269,7 @@ def pipeline_run( path=input_path, dataset_resource_log=dataset_resource_log, converted_resource_log=converted_resource_log, - custom_temp_dir=custom_temp_dir, + output_path=cache_dir / "converted_resources" / dataset / f"{resource}.csv", ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 0f411f3b1..f0964e03f 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -187,20 +187,21 @@ def __init__( path=None, dataset_resource_log=None, converted_resource_log=None, - custom_temp_dir=None, output_path=None, ): + """ + given a fie/filepath will aim to convert it to a csv and return the path to a csv, if the file is aready a csv + + Args: + path (str): Path to the shapefile or geojson + dataset_resource_log (DatasetResourceLog): DatasetResourceLog object + converted_resource_log (ConvertedResourceLog): ConvertedResourceLog object + output_path (str): Optional output path for the converted csv + """ self.path = path self.dataset_resource_log = dataset_resource_log self.converted_resource_log = converted_resource_log self.charset = "" - # Allows for custom temporary directory to be specified - # This allows symlink creation in case of /tmp & path being on different partitions - if custom_temp_dir: - self.temp_file_extra_kwargs = {"dir": custom_temp_dir} - else: - self.temp_file_extra_kwargs = {} - self.output_path = output_path if output_path: output_dir = os.path.dirname(output_path) @@ -247,6 +248,7 @@ def 
process(self, stream=None): return Stream(input_path, f=iter(()), log=self.dataset_resource_log) + # should this be a method and not a function? I think we re-factor it into a function let's remove references to self def _read_text_file(self, input_path, encoding): f = read_csv(input_path, encoding) self.dataset_resource_log.mime_type = "text/csv" + self.charset From a0a445e5872523986dfe438a4dff9c7a580fb2a1 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sat, 11 Jan 2025 10:51:42 +0000 Subject: [PATCH 12/51] change to cache directory --- digital_land/cli.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index 0a08af6a6..5a3ab4c00 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -202,7 +202,9 @@ def dataset_dump_flattened_cmd(ctx, input_path, output_path): @click.option("--endpoints", help="list of endpoint hashes", default="") @click.option("--organisations", help="list of organisations", default="") @click.option("--entry-date", help="default entry-date value", default="") -@click.option("--custom-temp-dir", help="default temporary directory", default=None) +@click.option( + "--cache-dir", help="cache directory to store conveted files etc. 
in", default=None +) @click.option("--config-path", help="Path to a configuration sqlite", default=None) @click.option( "--resource", @@ -232,7 +234,7 @@ def pipeline_command( endpoints, organisations, entry_date, - custom_temp_dir, + cache_dir, collection_dir, operational_issue_dir, config_path, @@ -263,7 +265,7 @@ def pipeline_command( endpoints=endpoints, organisations=organisations, entry_date=entry_date, - custom_temp_dir=custom_temp_dir, + cache_dir=cache_dir, config_path=config_path, resource=resource, output_log_dir=output_log_dir, From b1d08c787526cafec1abb0f03564bccb1fe70398 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sat, 11 Jan 2025 11:32:46 +0000 Subject: [PATCH 13/51] change to cache directory default --- digital_land/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index 5a3ab4c00..0f51a1655 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -203,7 +203,9 @@ def dataset_dump_flattened_cmd(ctx, input_path, output_path): @click.option("--organisations", help="list of organisations", default="") @click.option("--entry-date", help="default entry-date value", default="") @click.option( - "--cache-dir", help="cache directory to store conveted files etc. in", default=None + "--cache-dir", + help="cache directory to store conveted files etc. 
in", + default="var/cache", ) @click.option("--config-path", help="Path to a configuration sqlite", default=None) @click.option( From 175f4655e74d9f2a880ed33b9f10a745119460d9 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Sun, 12 Jan 2025 13:01:57 +0000 Subject: [PATCH 14/51] use output path in other function --- digital_land/phase/convert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index f0964e03f..d27fc5460 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -363,8 +363,9 @@ def _read_binary_file(self, input_path): if internal_path: self.dataset_resource_log.internal_path = internal_path self.dataset_resource_log.internal_mime_type = mime_type + # TODO erpace temp path with output path temp_path = tempfile.NamedTemporaryFile( - suffix=".zip", **self.temp_file_extra_kwargs + suffix=".zip", dir=self.output_path.parent ).name os.link(input_path, temp_path) zip_path = f"/vsizip/{temp_path}{internal_path}" From cbb5eec06ea1b6e18c966b052563326d73696157 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Tue, 14 Jan 2025 10:12:05 +0000 Subject: [PATCH 15/51] use a specific path for duck.db database to help with overflow --- digital_land/commands.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index 010c42c38..22da688a9 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -402,6 +402,7 @@ def dataset_create( organisation=organisation, path=output_path, specification_dir=None, # TBD: package should use this specification object + duckdb_path=cache_dir / "overflow.duckdb", ) package.create() for path in input_paths: From f56cf08d97300f410921fca694e51673862a7dda Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:37:23 +0000 
Subject: [PATCH 16/51] make query entity range specific --- digital_land/package/dataset_parquet.py | 92 +++++++++++++++++++++++-- digital_land/pipeline/process.py | 1 + 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 6ed50b23b..9f4e3ff6f 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -171,11 +171,51 @@ def load_fact_resource(self, transformed_parquet_dir): """ ) - def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): - output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" - output_path.parent.mkdir(parents=True, exist_ok=True) - # get the other paths - # fact_resource_parquet_path = f"{self.cache_dir}/fact_resource{self.suffix}" + # def combine_parquet_files(input_path,output_path): + # """ + # This method combines multiple parquet files into a single parquet file + # """ + # # check input path is a directory using Path + # if not Path(input_path).is_dir(): + # raise ValueError("Input path must be a directory") + + # # check output_path is a file that doesn't exist + # if not Path(output_path).is_file(): + # raise ValueError("Output path must be a file") + + # # use self.conn to use duckdb to combine files + # sql = f""" + # COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); + # """ + # self.conn.execute(sql) + + # # Combine all the parquet files into a single parquet file + # combined_df = pd.concat( + # [pd.read_parquet(f"{input_path}/{file}") for file in parquet_files] + # ) + + # # Save the combined dataframe to a parquet file + # combined_df.to_parquet(output_path, index=False) + + def load_entities_range( + self, + transformed_parquet_dir, + resource_path, + organisation_path, + output_path, + entity_range=None, + ): + # figure out which resources we actually need to do expensive queries on, store in parquet + # 
sql = f""" + # COPY( + # SELECT DISTINCT resource + # FROM parquet_scan('{transformed_parquet_dir}/*.parquet') + # QUALIFY ROW_NUMBER() OVER ( + # PARTITION BY enttity,field + # ORDER BY prioity, enttry_date DESC, entry_number DESC, resource, fact + # ) = 1 + # ) TO '{self.cache_path / 'duckdb_temp_files' / 'distinct_resource.parquet'}' (FORMAT PARQUET); + # """ logging.info(f"loading entities from {transformed_parquet_dir}") @@ -256,12 +296,20 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # query to sort the final table # query to create the file + # craft a where clause to limit entities in quetion, this chunking helps solve memory issues + if entity_range is not None: + entity_where_clause = ( + f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" + ) + else: + entity_where_clause = "" query = f""" SELECT {fields_str}{optional_org_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date FROM parquet_scan('{transformed_parquet_dir}/*.parquet') tf LEFT JOIN read_csv_auto('{resource_path}', max_line_size=40000000) resource_csv ON tf.resource = resource_csv.resource + {entity_where_clause} QUALIFY ROW_NUMBER() OVER ( PARTITION BY entity, field ORDER BY priority, entry_date DESC, entry_number DESC, resource_end_date DESC, tf.resource, fact @@ -336,6 +384,40 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # might need to un some fetch all toget result back self.conn.execute(sql) + def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): + output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + output_path.parent.mkdir(parents=True, exist_ok=True) + + # retrieve entity counnts including and minimum + min_sql = f"select MIN(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" + min_entity = self.conn.execute(min_sql).fetchone() + max_sql 
= f"select MAX(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" + max_entity = self.conn.execute(max_sql).fetchone() + total_entities = max_entity - min_entity + entity_limit = 1000000 + if total_entities > entity_limit: + logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") + _ = 0 + file_count = 1 + while _ < max_entity: + output_path = ( + self.path + / f"dataset={self.dataset}" + / f"entity_{file_count}.parquet" + ) + entity_range = [_, entity_limit] + logger.info( + f"loading entities from {entity_range[0]} to {entity_range[1]}" + ) + self.load_entities_range( + transformed_parquet_dir, + resource_path, + organisation_path, + output_path, + entity_range, + ) + _ += entity_limit + def load_to_sqlite(self, sqlite_path): """ Convert parquet files to sqlite3 tables assumes the sqlite table already exist. There is an arguement to diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 0a0fc3e61..11955ac41 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -8,6 +8,7 @@ # load in specification +# TODO need to take in the correct data types for the columns def convert_tranformed_csv_to_pq(input_path, output_path): """ function to convert a transformed resource to a parrquet file. 
From 7c04686d4fdd513a31b1ba39b5484205e31f6c87 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:45:11 +0000 Subject: [PATCH 17/51] add condition for no range --- digital_land/package/dataset_parquet.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 9f4e3ff6f..17b5f645d 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -400,7 +400,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat _ = 0 file_count = 1 while _ < max_entity: - output_path = ( + temp_output_path = ( self.path / f"dataset={self.dataset}" / f"entity_{file_count}.parquet" @@ -413,11 +413,16 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat transformed_parquet_dir, resource_path, organisation_path, - output_path, + temp_output_path, entity_range, ) _ += entity_limit + else: + self.load_entities_range( + transformed_parquet_dir, resource_path, organisation_path, output_path + ) + def load_to_sqlite(self, sqlite_path): """ Convert parquet files to sqlite3 tables assumes the sqlite table already exist. 
There is an arguement to From ccd1ddaf2c1b13c0d64f4350b1df2f7182a96be3 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:27:09 +0000 Subject: [PATCH 18/51] update arguement in wrong place --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 22da688a9..010c42c38 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -402,7 +402,6 @@ def dataset_create( organisation=organisation, path=output_path, specification_dir=None, # TBD: package should use this specification object - duckdb_path=cache_dir / "overflow.duckdb", ) package.create() for path in input_paths: From 753b40a3b77ef104ad6b3e8fe58687f40d8dbdd1 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 18:51:31 +0000 Subject: [PATCH 19/51] start range in correct place --- digital_land/commands.py | 68 +++++++++++++------------ digital_land/package/dataset_parquet.py | 3 +- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 010c42c38..641e9c8f8 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -25,7 +25,8 @@ ConvertedResourceLog, ) from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage + +# from digital_land.package.dataset import DatasetPackage from digital_land.package.dataset_parquet import DatasetParquetPackage from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase @@ -392,37 +393,37 @@ def dataset_create( sys.exit(2) # Set up initial objects - organisation = Organisation( - organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) - ) + # organisation = Organisation( + # organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) + # ) # create sqlite dataset packageas 
before and load inn data that isn't in the parquetpackage yet - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - path_obj = Path(path) - package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") - package.load_dataset_resource( - dataset_resource_dir / dataset / f"{path_obj.stem}.csv" - ) - - old_entity_path = Path(pipeline.path) / "old-entity.csv" - if old_entity_path.exists(): - package.load_old_entities(old_entity_path) - - issue_paths = issue_dir / dataset - if issue_paths.exists(): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() + # package = DatasetPackage( + # dataset, + # organisation=organisation, + # path=output_path, + # specification_dir=None, # TBD: package should use this specification object + # ) + # package.create() + # for path in input_paths: + # path_obj = Path(path) + # package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + # package.load_dataset_resource( + # dataset_resource_dir / dataset / f"{path_obj.stem}.csv" + # ) + + # old_entity_path = Path(pipeline.path) / "old-entity.csv" + # if old_entity_path.exists(): + # package.load_old_entities(old_entity_path) + + # issue_paths = issue_dir / dataset + # if issue_paths.exists(): + # for issue_path in os.listdir(issue_paths): + # package.load_issues(os.path.join(issue_paths, issue_path)) + # else: + # logging.warning("No directory for this dataset in the provided issue_directory") + + # package.add_counts() # Repeat for parquet # Set up cache directory to store parquet files. 
The sqlite files created from this will be saved in the dataset @@ -433,12 +434,13 @@ def dataset_create( dataset, path=dataset_parquet_path, specification_dir=None, # TBD: package should use this specification object + duckdb_path=cache_dir / "overflow.duckdb", ) # pqpackage.create_temp_table(input_paths) - pqpackage.load_facts(transformed_parquet_dir) - pqpackage.load_fact_resource(transformed_parquet_dir) + # pqpackage.load_facts(transformed_parquet_dir) + # pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) - pqpackage.load_to_sqlite(output_path) + # pqpackage.load_to_sqlite(output_path) def dataset_dump(input_path, output_path): diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 17b5f645d..94a559070 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -227,6 +227,7 @@ def load_entities_range( query = f""" SELECT DISTINCT REPLACE(field,'-','_') FROM parquet_scan('{transformed_parquet_dir}/*.parquet') + WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]} """ # distinct_fields - list of fields in the field in fact @@ -397,7 +398,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat entity_limit = 1000000 if total_entities > entity_limit: logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") - _ = 0 + _ = min_entity file_count = 1 while _ < max_entity: temp_output_path = ( From 25f7618c9baa4617fb0d266be87f2747e0820866 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 19:09:15 +0000 Subject: [PATCH 20/51] get value from tuple --- digital_land/package/dataset_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 94a559070..ff3a8b5b4 100644 --- 
a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -391,9 +391,9 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat # retrieve entity counnts including and minimum min_sql = f"select MIN(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" - min_entity = self.conn.execute(min_sql).fetchone() + min_entity = self.conn.execute(min_sql).fetchone()[0] max_sql = f"select MAX(entity) FROM parquet_scan('{transformed_parquet_dir}/*.parquet');" - max_entity = self.conn.execute(max_sql).fetchone() + max_entity = self.conn.execute(max_sql).fetchone()[0] total_entities = max_entity - min_entity entity_limit = 1000000 if total_entities > entity_limit: From 1b8f7a4c62ba942397c73a356e00f9f7cc3505f7 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Wed, 15 Jan 2025 19:12:49 +0000 Subject: [PATCH 21/51] correct max in for loop --- digital_land/package/dataset_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index ff3a8b5b4..4b2e53e2a 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -406,7 +406,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat / f"dataset={self.dataset}" / f"entity_{file_count}.parquet" ) - entity_range = [_, entity_limit] + entity_range = [_, _ + entity_limit] logger.info( f"loading entities from {entity_range[0]} to {entity_range[1]}" ) From 73835e988c1c25419fadd43b72f952c36c49ad5b Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 08:17:34 +0000 Subject: [PATCH 22/51] iterate file count --- digital_land/package/dataset_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 
4b2e53e2a..b1b070558 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -418,6 +418,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat entity_range, ) _ += entity_limit + file_count += 1 else: self.load_entities_range( From 8938a25135cd2dad9fd71498716efb4940ee8e5f Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 08:56:33 +0000 Subject: [PATCH 23/51] build a single file --- digital_land/package/dataset_parquet.py | 35 +++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index b1b070558..d19e553a7 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -385,6 +385,24 @@ def load_entities_range( # might need to un some fetch all toget result back self.conn.execute(sql) + def combine_parquet_files(self, input_path, output_path): + """ + This method combines multiple parquet files into a single parquet file + """ + # check input path is a directory using Path + if not Path(input_path).is_dir(): + raise ValueError("Input path must be a directory") + + # check output_path is a file that doesn't exist + if not Path(output_path).is_file(): + raise ValueError("Output path must be a file") + + # use self.conn to use duckdb to combine files + sql = f""" + COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); + """ + self.conn.execute(sql) + def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" output_path.parent.mkdir(parents=True, exist_ok=True) @@ -397,15 +415,18 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat total_entities = max_entity - min_entity entity_limit = 1000000 if total_entities > 
entity_limit: + # create a temparary output path to store separate entity file in + temp_dir = ( + self.cache_dir + / "temp_parquet_files" + / "title-boundaries" + / "entity_files" + ) logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") _ = min_entity file_count = 1 while _ < max_entity: - temp_output_path = ( - self.path - / f"dataset={self.dataset}" - / f"entity_{file_count}.parquet" - ) + temp_output_path = temp_dir / f"entity_{file_count}.parquet" entity_range = [_, _ + entity_limit] logger.info( f"loading entities from {entity_range[0]} to {entity_range[1]}" @@ -419,7 +440,11 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat ) _ += entity_limit file_count += 1 + # combine all the parquet files into a single parquet file + self.combine_parquet_files(temp_dir, output_path) + # remove temporary files + temp_dir.rmdir() else: self.load_entities_range( transformed_parquet_dir, resource_path, organisation_path, output_path From 0f590eb1ff9b58927ccaa811b44d47c3893eed94 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:11:03 +0000 Subject: [PATCH 24/51] remove cache_dir thats not provided --- digital_land/package/dataset_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index d19e553a7..85829d58f 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -417,7 +417,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat if total_entities > entity_limit: # create a temparary output path to store separate entity file in temp_dir = ( - self.cache_dir + output_path.parent / "temp_parquet_files" / "title-boundaries" / "entity_files" From e7a3f7f3a83154f07b418318032509a40285b020 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: 
Thu, 16 Jan 2025 09:16:26 +0000 Subject: [PATCH 25/51] make the right directory --- digital_land/package/dataset_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 85829d58f..363ee2373 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -422,6 +422,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat / "title-boundaries" / "entity_files" ) + temp_dir.mkdir(parents=True, exist_ok=True) logger.info(f"total entities {total_entities} exceeds limit {entity_limit}") _ = min_entity file_count = 1 From f0722c5d3c23ecd6ecf84935efb9a0d3c759f098 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:34:39 +0000 Subject: [PATCH 26/51] need remove whole tree --- digital_land/package/dataset_parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 363ee2373..e9c5c4893 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -1,6 +1,7 @@ import os import logging import duckdb +import shutil from pathlib import Path from .package import Package @@ -445,7 +446,7 @@ def load_entities(self, transformed_parquet_dir, resource_path, organisation_pat self.combine_parquet_files(temp_dir, output_path) # remove temporary files - temp_dir.rmdir() + shutil.rmtree(temp_dir) else: self.load_entities_range( transformed_parquet_dir, resource_path, organisation_path, output_path From b89079a113903f6d50531bfcab0153318f608c3b Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 10:15:17 +0000 Subject: [PATCH 27/51] run full command --- digital_land/commands.py | 67 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 34 
deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 641e9c8f8..bc91a4654 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -26,7 +26,7 @@ ) from digital_land.organisation import Organisation -# from digital_land.package.dataset import DatasetPackage +from digital_land.package.dataset import DatasetPackage from digital_land.package.dataset_parquet import DatasetParquetPackage from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase @@ -393,37 +393,37 @@ def dataset_create( sys.exit(2) # Set up initial objects - # organisation = Organisation( - # organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) - # ) + organisation = Organisation( + organisation_path=organisation_path, pipeline_dir=Path(pipeline.path) + ) # create sqlite dataset packageas before and load inn data that isn't in the parquetpackage yet - # package = DatasetPackage( - # dataset, - # organisation=organisation, - # path=output_path, - # specification_dir=None, # TBD: package should use this specification object - # ) - # package.create() - # for path in input_paths: - # path_obj = Path(path) - # package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") - # package.load_dataset_resource( - # dataset_resource_dir / dataset / f"{path_obj.stem}.csv" - # ) - - # old_entity_path = Path(pipeline.path) / "old-entity.csv" - # if old_entity_path.exists(): - # package.load_old_entities(old_entity_path) - - # issue_paths = issue_dir / dataset - # if issue_paths.exists(): - # for issue_path in os.listdir(issue_paths): - # package.load_issues(os.path.join(issue_paths, issue_path)) - # else: - # logging.warning("No directory for this dataset in the provided issue_directory") - - # package.add_counts() + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + 
package.create() + for path in input_paths: + path_obj = Path(path) + package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + package.load_dataset_resource( + dataset_resource_dir / dataset / f"{path_obj.stem}.csv" + ) + + old_entity_path = Path(pipeline.path) / "old-entity.csv" + if old_entity_path.exists(): + package.load_old_entities(old_entity_path) + + issue_paths = issue_dir / dataset + if issue_paths.exists(): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() # Repeat for parquet # Set up cache directory to store parquet files. The sqlite files created from this will be saved in the dataset @@ -436,11 +436,10 @@ def dataset_create( specification_dir=None, # TBD: package should use this specification object duckdb_path=cache_dir / "overflow.duckdb", ) - # pqpackage.create_temp_table(input_paths) - # pqpackage.load_facts(transformed_parquet_dir) - # pqpackage.load_fact_resource(transformed_parquet_dir) + pqpackage.load_facts(transformed_parquet_dir) + pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) - # pqpackage.load_to_sqlite(output_path) + pqpackage.load_to_sqlite(output_path) def dataset_dump(input_path, output_path): From 57feeb8a0081f3e56411e382b1c8a1ab4ad42cd6 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 11:41:39 +0000 Subject: [PATCH 28/51] change logginng level --- digital_land/commands.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index bc91a4654..76bfe0d33 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -375,6 +375,9 @@ def dataset_create( cache_dir="var/cache", resource_path="collection/resource.csv", ): + # set 
level for logging to see what's going on + logger.setLevel(logging.INFO) + # chek all paths are paths issue_dir = Path(issue_dir) column_field_dir = Path(column_field_dir) @@ -439,6 +442,8 @@ def dataset_create( pqpackage.load_facts(transformed_parquet_dir) pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) + + logger.info("parquet files created") pqpackage.load_to_sqlite(output_path) From 2917f43c92a3fc2b3c4014431aec71e34dbf9713 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:08:13 +0000 Subject: [PATCH 29/51] make parquet package prinnt info logs --- digital_land/commands.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 76bfe0d33..54a39335d 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -377,6 +377,8 @@ def dataset_create( ): # set level for logging to see what's going on logger.setLevel(logging.INFO) + dataset_parquet_logger = logging.getLogger("dataset_parquet") + dataset_parquet_logger.setLevel(logging.INFO) # chek all paths are paths issue_dir = Path(issue_dir) @@ -410,15 +412,18 @@ def dataset_create( package.create() for path in input_paths: path_obj = Path(path) + logging.info(f"loading column field log into {output_path}") package.load_column_fields(column_field_dir / dataset / f"{path_obj.stem}.csv") + logging.info(f"loading dataset resource log into {output_path}") package.load_dataset_resource( dataset_resource_dir / dataset / f"{path_obj.stem}.csv" ) - + logging.info(f"loading old entities into {output_path}") old_entity_path = Path(pipeline.path) / "old-entity.csv" if old_entity_path.exists(): package.load_old_entities(old_entity_path) + logging.info(f"loading issues into {output_path}") issue_paths = issue_dir / dataset if issue_paths.exists(): for issue_path in 
os.listdir(issue_paths): @@ -426,8 +431,6 @@ def dataset_create( else: logging.warning("No directory for this dataset in the provided issue_directory") - package.add_counts() - # Repeat for parquet # Set up cache directory to store parquet files. The sqlite files created from this will be saved in the dataset if not os.path.exists(cache_dir): @@ -443,9 +446,12 @@ def dataset_create( pqpackage.load_fact_resource(transformed_parquet_dir) pqpackage.load_entities(transformed_parquet_dir, resource_path, organisation_path) - logger.info("parquet files created") + logger.info("loading fact,fact_resource and entity into {output_path}") pqpackage.load_to_sqlite(output_path) + logger.info(f"creating dataset package {output_path} counts") + package.add_counts() + def dataset_dump(input_path, output_path): cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" From 3d3abd9d3a2c04d49633eec93226eb8facfcb686 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:19:41 +0000 Subject: [PATCH 30/51] make intgeger --- digital_land/pipeline/process.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 11955ac41..93d558616 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -21,10 +21,21 @@ def convert_tranformed_csv_to_pq(input_path, output_path): # Define the chunk size for reading the CSV file chunk_size = 1000000 # Number of rows per chunk + # expand on column names # Open a CSV reader with PyArrow # csv_reader = pv.open_csv(input_path, read_options=pv.ReadOptions(block_size=chunk_size)) csv_iterator = pd.read_csv( - input_path, chunksize=chunk_size, dtype=str, na_filter=False + input_path, + chunksize=chunk_size, + dtype={ + "entity": int, + **{ + col: str + for col in pd.read_csv(input_path, nrows=1).columns + if col != "entity" + }, + }, + na_filter=False, 
) # Initialize the Parquet writer with the schema from the first chunk From 3453840bfa7b564ac11eeaaf89ac17980daf8b03 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:24:26 +0000 Subject: [PATCH 31/51] include entity field change --- digital_land/pipeline/process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/pipeline/process.py b/digital_land/pipeline/process.py index 93d558616..7ad249c52 100644 --- a/digital_land/pipeline/process.py +++ b/digital_land/pipeline/process.py @@ -45,7 +45,7 @@ def convert_tranformed_csv_to_pq(input_path, output_path): fields = [ ("end-date", pa.string()), - ("entity", pa.string()), + ("entity", pa.int64()), ("entry-date", pa.string()), ("entry-number", pa.string()), ("fact", pa.string()), From f43907deb84142e20a1afdeef1dc307e5724eba1 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:33:53 +0000 Subject: [PATCH 32/51] ensure where clause isnt used if no range is supplied --- digital_land/package/dataset_parquet.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index e9c5c4893..73bef18ca 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -224,11 +224,17 @@ def load_entities_range( # Do this to match with later field names. 
entity_fields = [e.replace("-", "_") for e in entity_fields] # input_paths_str = f"{self.cache_dir}/fact{self.suffix}" + if entity_range is not None: + entity_where_clause = ( + f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" + ) + else: + entity_where_clause = "" query = f""" SELECT DISTINCT REPLACE(field,'-','_') FROM parquet_scan('{transformed_parquet_dir}/*.parquet') - WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]} + {entity_where_clause} """ # distinct_fields - list of fields in the field in fact @@ -299,12 +305,7 @@ def load_entities_range( # query to create the file # craft a where clause to limit entities in quetion, this chunking helps solve memory issues - if entity_range is not None: - entity_where_clause = ( - f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" - ) - else: - entity_where_clause = "" + query = f""" SELECT {fields_str}{optional_org_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date From c94d4f129c515f492ccfd6411f788165ba2c92dc Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:46:25 +0000 Subject: [PATCH 33/51] use better logging --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 54a39335d..80541c796 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -377,7 +377,7 @@ def dataset_create( ): # set level for logging to see what's going on logger.setLevel(logging.INFO) - dataset_parquet_logger = logging.getLogger("dataset_parquet") + dataset_parquet_logger = logging.getLogger("digital_land.package.dataset_parquet") dataset_parquet_logger.setLevel(logging.INFO) # chek all paths are paths From 8ec07f9424189b06aa764e685e24bcab894b4cb9 Mon Sep 17 00:00:00 2001 From: eveleighoj 
<35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:57:05 +0000 Subject: [PATCH 34/51] logging tweaks --- digital_land/commands.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 80541c796..e834ffb76 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -376,9 +376,7 @@ def dataset_create( resource_path="collection/resource.csv", ): # set level for logging to see what's going on - logger.setLevel(logging.INFO) - dataset_parquet_logger = logging.getLogger("digital_land.package.dataset_parquet") - dataset_parquet_logger.setLevel(logging.INFO) + logging.basicConfig(level=logging.INFO) # chek all paths are paths issue_dir = Path(issue_dir) From bd0749d61f6d53f82b470573eacc9cccb26221b4 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 17:32:20 +0000 Subject: [PATCH 35/51] add loading loggings --- digital_land/commands.py | 3 ++- digital_land/package/dataset_parquet.py | 21 +++++++-------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index e834ffb76..ef10edc08 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -376,7 +376,8 @@ def dataset_create( resource_path="collection/resource.csv", ): # set level for logging to see what's going on - logging.basicConfig(level=logging.INFO) + logger.setLevel(logging.INFO) + logging.getLogger("digital_land.package.dataset_parquet").setLevel(logging.INFO) # chek all paths are paths issue_dir = Path(issue_dir) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 73bef18ca..93e543db3 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -218,7 +218,7 @@ def load_entities_range( # ) TO '{self.cache_path / 'duckdb_temp_files' / 'distinct_resource.parquet'}' (FORMAT PARQUET); 
# """ - logging.info(f"loading entities from {transformed_parquet_dir}") + logger.info(f"loading entities from {transformed_parquet_dir}") entity_fields = self.specification.schema["entity"]["fields"] # Do this to match with later field names. @@ -391,6 +391,7 @@ def combine_parquet_files(self, input_path, output_path): """ This method combines multiple parquet files into a single parquet file """ + logger.info(f"combining parquet files from {input_path} into {output_path}") # check input path is a directory using Path if not Path(input_path).is_dir(): raise ValueError("Input path must be a directory") @@ -461,7 +462,7 @@ def load_to_sqlite(self, sqlite_path): # At present we are saving the parquet files in 'cache' but saving the sqlite files produced in 'dataset' # In future when parquet files are saved to 'dataset' remove the 'cache_dir' in the function arguments and # replace 'cache_dir' with 'output_path' in this function's code - logging.info( + logger.info( f"loading sqlite3 tables in {sqlite_path} from parquet files in {self.path}" ) # migrate to connection creation @@ -477,6 +478,8 @@ def load_to_sqlite(self, sqlite_path): fields_str = ", ".join( [field.replace("-", "_") for field in fact_resource_fields] ) + + logger.info("loading fact_resource data") # insert fact_resource data self.conn.execute( f""" @@ -485,6 +488,7 @@ def load_to_sqlite(self, sqlite_path): """ ) + logger.info("loading fact data") # insert fact data fact_fields = self.specification.schema["fact"]["fields"] fields_str = ", ".join([field.replace("-", "_") for field in fact_fields]) @@ -496,6 +500,7 @@ def load_to_sqlite(self, sqlite_path): """ ) + logger.info("loading entity data") # insert entity data entity_fields = self.specification.schema["entity"]["fields"] fields_str = ", ".join( @@ -512,18 +517,6 @@ def load_to_sqlite(self, sqlite_path): """ ) - # Fix the column names - # for column in self.conn.execute("DESCRIBE TABLE temp_table;").fetchall(): - # if "-" in column[0]: - # 
self.conn.execute( - # f"ALTER TABLE temp_table RENAME COLUMN '{column[0]}' TO '{column[0].replace('-','_')}';" - # ) - - # Copy the data - # self.conn.execute( - # f"INSERT INTO sqlite_db.{table_name} BY NAME (SELECT * FROM temp_table);" - # ) - self.conn.execute("DETACH DATABASE sqlite_db;") def close_conn(self): From ba48403422770dbf73f0468e59ed728b3a7233d7 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 17:58:47 +0000 Subject: [PATCH 36/51] remove indexes before loading --- digital_land/commands.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index ef10edc08..5be9bc215 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -408,7 +408,9 @@ def dataset_create( path=output_path, specification_dir=None, # TBD: package should use this specification object ) - package.create() + # don'tt use create as we don't want to create the indexes + package.create_database() + package.disconnect() for path in input_paths: path_obj = Path(path) logging.info(f"loading column field log into {output_path}") @@ -448,6 +450,11 @@ def dataset_create( logger.info("loading fact,fact_resource and entity into {output_path}") pqpackage.load_to_sqlite(output_path) + logger.info(f"add indexes to {output_path}") + package.connect() + package.create_indexes() + package.disconnect() + logger.info(f"creating dataset package {output_path} counts") package.add_counts() From 0bf17f2d316c22c51d23ddbb11e6b1e7b136053d Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 18:04:07 +0000 Subject: [PATCH 37/51] add crreate cursur --- digital_land/commands.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index 5be9bc215..eabc02219 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -452,6 +452,7 @@ def 
dataset_create( logger.info(f"add indexes to {output_path}") package.connect() + package.create_cursor() package.create_indexes() package.disconnect() From 0183d9b44e4e9a3f5de80628c94ad9ec3d45a140 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 23 Jan 2025 13:59:59 +0000 Subject: [PATCH 38/51] remove error and some wrong validation --- digital_land/package/dataset_parquet.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 93e543db3..365c73ed4 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -319,7 +319,6 @@ def load_entities_range( ) = 1 ) """ - logging.error(query) pivot_query = f""" PIVOT ( @@ -392,14 +391,6 @@ def combine_parquet_files(self, input_path, output_path): This method combines multiple parquet files into a single parquet file """ logger.info(f"combining parquet files from {input_path} into {output_path}") - # check input path is a directory using Path - if not Path(input_path).is_dir(): - raise ValueError("Input path must be a directory") - - # check output_path is a file that doesn't exist - if not Path(output_path).is_file(): - raise ValueError("Output path must be a file") - # use self.conn to use duckdb to combine files sql = f""" COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); From 4a605ad6e1b814b5b69360c080775eeba59cca21 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Fri, 24 Jan 2025 13:54:42 +0000 Subject: [PATCH 39/51] chacnge package structure --- digital_land/package/dataset_parquet.py | 43 ++++---------- .../package/test_dataset_parquet.py | 56 +++++++++++++++---- 2 files changed, 57 insertions(+), 42 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 365c73ed4..54a62935e 100644 --- 
a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -54,11 +54,16 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.typology = self.specification.schema[dataset]["typology"] # set up key file paths - self.fact_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + self.fact_path = self.path / "fact" / f"dataset={self.dataset}" / "fact.parquet" self.fact_resource_path = ( - self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + self.path + / "fact-resource" + / f"dataset={self.dataset}" + / "fact-resource.parquet" + ) + self.entity_path = ( + self.path / "entity" / f"dataset={self.dataset}" / "entity.parquet" ) - self.entity_path = self.path / f"dataset={self.dataset}" / "entity.parquet" def get_schema(self): schema = {} @@ -124,7 +129,7 @@ def load_facts(self, transformed_parquet_dir): """ This method loads facts into a fact table from a directory containing all transformed files as parquet files """ - output_path = self.path / f"dataset={self.dataset}" / "fact.parquet" + output_path = self.fact_path output_path.parent.mkdir(parents=True, exist_ok=True) logging.info("loading facts from temp table") @@ -151,7 +156,7 @@ def load_facts(self, transformed_parquet_dir): def load_fact_resource(self, transformed_parquet_dir): logging.info(f"loading fact resources from {str(transformed_parquet_dir)}") - output_path = self.path / f"dataset={self.dataset}" / "fact_resource.parquet" + output_path = self.fact_resource_path output_path.parent.mkdir(parents=True, exist_ok=True) fact_resource_fields = self.specification.schema["fact-resource"]["fields"] fields_str = ", ".join( @@ -172,32 +177,6 @@ def load_fact_resource(self, transformed_parquet_dir): """ ) - # def combine_parquet_files(input_path,output_path): - # """ - # This method combines multiple parquet files into a single parquet file - # """ - # # check input path is a directory using Path - # if not Path(input_path).is_dir(): - # raise 
ValueError("Input path must be a directory") - - # # check output_path is a file that doesn't exist - # if not Path(output_path).is_file(): - # raise ValueError("Output path must be a file") - - # # use self.conn to use duckdb to combine files - # sql = f""" - # COPY (select * from parquet_scan('{input_path}/*.parquet')) TO '{output_path}' (FORMAT PARQUET); - # """ - # self.conn.execute(sql) - - # # Combine all the parquet files into a single parquet file - # combined_df = pd.concat( - # [pd.read_parquet(f"{input_path}/{file}") for file in parquet_files] - # ) - - # # Save the combined dataframe to a parquet file - # combined_df.to_parquet(output_path, index=False) - def load_entities_range( self, transformed_parquet_dir, @@ -398,7 +377,7 @@ def combine_parquet_files(self, input_path, output_path): self.conn.execute(sql) def load_entities(self, transformed_parquet_dir, resource_path, organisation_path): - output_path = self.path / f"dataset={self.dataset}" / "entity.parquet" + output_path = self.entity_path output_path.parent.mkdir(parents=True, exist_ok=True) # retrieve entity counnts including and minimum diff --git a/tests/integration/package/test_dataset_parquet.py b/tests/integration/package/test_dataset_parquet.py index 0f0042958..f94244dd1 100644 --- a/tests/integration/package/test_dataset_parquet.py +++ b/tests/integration/package/test_dataset_parquet.py @@ -380,7 +380,11 @@ def test_load_facts_single_file(data: dict, expected: int, tmp_path): package.load_facts(transformed_parquet_dir=transformed_parquet_dir) output_file = ( - tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + tmp_path + / "conservation-area" + / "fact" + / "dataset=conservation-area" + / "fact.parquet" ) assert os.path.exists(output_file), "fact.parquet file does not exist" @@ -421,7 +425,11 @@ def test_load_facts_multiple_files(data1, data2, expected, tmp_path): package.load_facts(transformed_parquet_dir=transformed_parquet_dir) output_file = ( - tmp_path / 
"conservation-area" / "dataset=conservation-area" / "fact.parquet" + tmp_path + / "conservation-area" + / "fact" + / "dataset=conservation-area" + / "fact.parquet" ) assert os.path.exists(output_file), "fact.parquet file does not exist" @@ -472,7 +480,11 @@ def test_load_facts_one_file_with_empty_file(data, expected, tmp_path): package.load_facts(transformed_parquet_dir=transformed_parquet_dir) output_file = ( - tmp_path / "conservation-area" / "dataset=conservation-area" / "fact.parquet" + tmp_path + / "conservation-area" + / "fact" + / "dataset=conservation-area" + / "fact.parquet" ) assert os.path.exists(output_file), "fact.parquet file does not exist" @@ -504,8 +516,9 @@ def test_load_fact_resource_single_file(data, expected, tmp_path): output_file = ( tmp_path / "conservation-area" + / "fact-resource" / "dataset=conservation-area" - / "fact_resource.parquet" + / "fact-resource.parquet" ) assert os.path.exists(output_file), "fact-resource.parquet file does not exist" @@ -544,8 +557,9 @@ def test_load_fact_resource_two_filea(data_1, data_2, expected, tmp_path): output_file = ( tmp_path / "conservation-area" + / "fact-resource" / "dataset=conservation-area" - / "fact_resource.parquet" + / "fact-resource.parquet" ) assert os.path.exists(output_file), "fact-resource.parquet file does not exist" @@ -596,8 +610,9 @@ def test_load_fact_resource_empty_file_with_another(data, expected, tmp_path): output_file = ( tmp_path / "conservation-area" + / "fact-resource" / "dataset=conservation-area" - / "fact_resource.parquet" + / "fact-resource.parquet" ) assert os.path.exists(output_file), "fact-resource.parquet file does not exist" @@ -662,7 +677,11 @@ def test_load_entities_single_file( package.load_entities(transformed_parquet_dir, resource_path, org_path) output_file = ( - tmp_path / "conservation-area" / "dataset=conservation-area" / "entity.parquet" + tmp_path + / "conservation-area" + / "entity" + / "dataset=conservation-area" + / "entity.parquet" ) assert 
os.path.exists(output_file), "entity.parquet file does not exist" @@ -737,15 +756,32 @@ def test_load_pq_to_sqlite_basic( fact_resource_df = pd.DataFrame.from_dict(fact_resource_data) entity_df = pd.DataFrame.from_dict(entity_data) + (dataset_parquet_path / "fact" / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + (dataset_parquet_path / "fact-resource" / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + (dataset_parquet_path / "entity" / "dataset=conservation-area").mkdir( + parents=True, exist_ok=True + ) + fact_df.to_parquet( - dataset_parquet_path / "dataset=conservation-area" / "fact.parquet", index=False + dataset_parquet_path / "fact" / "dataset=conservation-area" / "fact.parquet", + index=False, ) fact_resource_df.to_parquet( - dataset_parquet_path / "dataset=conservation-area" / "fact_resource.parquet", + dataset_parquet_path + / "fact-resource" + / "dataset=conservation-area" + / "fact-resource.parquet", index=False, ) entity_df.to_parquet( - dataset_parquet_path / "dataset=conservation-area" / "entity.parquet", + dataset_parquet_path + / "entity" + / "dataset=conservation-area" + / "entity.parquet", index=False, ) From 1b34bd19472f9588d10fead2f5c146dcaa0a5121 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Fri, 24 Jan 2025 19:28:38 +0000 Subject: [PATCH 40/51] align with proper convert changes --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 954a7f0e7..898da4c60 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -283,7 +283,7 @@ def pipeline_run( path=input_path, dataset_resource_log=dataset_resource_log, converted_resource_log=converted_resource_log, - output_path=cache_dir / "converted_resources" / dataset / f"{resource}.csv", + output_path=converted_path, ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), 
From a856ae28562c99528dfde3ee883ab176bf5c3e31 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Tue, 4 Feb 2025 17:23:19 +0000 Subject: [PATCH 41/51] ran black --- digital_land/phase/convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index d27fc5460..d0d5aa051 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -188,6 +188,7 @@ def __init__( dataset_resource_log=None, converted_resource_log=None, output_path=None, + custom_temp_dir=None, ): """ given a fie/filepath will aim to convert it to a csv and return the path to a csv, if the file is aready a csv @@ -207,6 +208,9 @@ def __init__( output_dir = os.path.dirname(output_path) if not os.path.exists(output_dir): os.makedirs(output_dir) + self.custom_temp_dir = custom_temp_dir + if custom_temp_dir: + os.makedirs(custom_temp_dir, exist_ok=True) def process(self, stream=None): input_path = self.path From cd57b17d19ca5275646f256958b2034a6e612224 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 15:57:00 +0000 Subject: [PATCH 42/51] Reran test with duckdb 1.2.0 From 50fe50efcd03d2a0b6bea2ecebf12918c1ed85fa Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 16:11:40 +0000 Subject: [PATCH 43/51] Added 'INSTALL spatial' at class initialisation --- digital_land/package/dataset_parquet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 54a62935e..0b5085164 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,6 +47,8 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) + query = "INSTALL spatial; LOAD spatial;" + self.conn.execute(query) else: self.conn = duckdb.connect() From 
3bed06981673b89c9a012fe53d93955be7186ebd Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 16:56:00 +0000 Subject: [PATCH 44/51] Remove adding spatial parts in class initialisation --- digital_land/package/dataset_parquet.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 0b5085164..54a62935e 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,8 +47,6 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) - query = "INSTALL spatial; LOAD spatial;" - self.conn.execute(query) else: self.conn = duckdb.connect() From 6771c05d0ea852b0a0c05f61399256836da6a472 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 16:56:24 +0000 Subject: [PATCH 45/51] Set verrsion for duckdb --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0f4b77b0a..6c140a941 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def get_long_description(): "dataclasses-json", "pydantic", "json-stream", - "duckdb", + "duckdb==1.1.3", "dask", "pyarrow", "pygit2", From 974876226ad4b1e36161ecff7d6468b8e019fae9 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 17:28:05 +0000 Subject: [PATCH 46/51] Adding spatial install on class initialisation and setting duckdb version --- digital_land/package/dataset_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 54a62935e..88cac0db7 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,6 +47,7 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) 
self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) + self.conn.execute("INSTALL spatial; LOAD spatial;") else: self.conn = duckdb.connect() From a29b628850b22850086ecf42f290e4a641865fdd Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Wed, 5 Feb 2025 17:39:46 +0000 Subject: [PATCH 47/51] Removed duckdb version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6c140a941..0f4b77b0a 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def get_long_description(): "dataclasses-json", "pydantic", "json-stream", - "duckdb==1.1.3", + "duckdb", "dask", "pyarrow", "pygit2", From 39457df8277c8e828c9222818bcf9db8ce05b6b7 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 10:11:57 +0000 Subject: [PATCH 48/51] Removed 'INSTALL spatial' from class initialisation --- digital_land/package/dataset_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index 88cac0db7..b713f1e4c 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -47,7 +47,7 @@ def __init__(self, dataset, path, duckdb_path=None, **kwargs): self.duckdb_path = Path(duckdb_path) self.duckdb_path.parent.mkdir(parents=True, exist_ok=True) self.conn = duckdb.connect(self.duckdb_path) - self.conn.execute("INSTALL spatial; LOAD spatial;") + # self.conn.execute("INSTALL spatial; LOAD spatial;") else: self.conn = duckdb.connect() From 901148a18deebcbe4d35ada3e4e681ad8746b284 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 17:34:26 +0000 Subject: [PATCH 49/51] Reversed prior changes to check if get same error --- digital_land/phase/convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 20fbbabc0..b92e2d900 100644 --- 
a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -188,7 +188,7 @@ def __init__( dataset_resource_log=None, converted_resource_log=None, output_path=None, - custom_temp_dir=None, + # custom_temp_dir=None, ): """ given a fie/filepath will aim to convert it to a csv and return the path to a csv, if the file is already a csv @@ -208,9 +208,9 @@ def __init__( output_dir = os.path.dirname(output_path) if not os.path.exists(output_dir): os.makedirs(output_dir) - self.custom_temp_dir = custom_temp_dir - if custom_temp_dir: - os.makedirs(custom_temp_dir, exist_ok=True) + # self.custom_temp_dir = custom_temp_dir + # if custom_temp_dir: + # os.makedirs(custom_temp_dir, exist_ok=True) def process(self, stream=None): input_path = self.path From 937f3f03b8d2bf74fe940d1dbfb28afb704fb8a9 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 17:41:52 +0000 Subject: [PATCH 50/51] Reversed prior changes to check if get same error --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 067f149ea..996bf1981 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1265,7 +1265,6 @@ def get_resource_unidentified_lookups( ConvertPhase( path=input_path, dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), From 1db57db8214f662e78ad2ec971d721ddc4f037bd Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Thu, 6 Feb 2025 17:47:50 +0000 Subject: [PATCH 51/51] Reversed prior changes to check if get same error --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 996bf1981..963e2e6c7 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1211,7 +1211,7 @@ def get_resource_unidentified_lookups( # could alter resource_from_path to file from path and 
promote to a utils folder resource = resource_from_path(input_path) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' + # custom_temp_dir = tmp_dir # './var' print("") print("----------------------------------------------------------------------")