Skip to content

Commit 9ed6b51

Browse files
authored
#29: add task for nature import to layer 2 (#397)
* #29: add a layer-2 task for the nature import. Also: add an orphaned-PGCs test; update tests; style checks; add a dry-run mode; move the log-level option to the main CLI group; add a PGC index; improve logging; use executemany for batched insertions; combine migrations.
1 parent cadf7dd commit 9ed6b51

15 files changed

Lines changed: 427 additions & 20 deletions

app/data/model/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
CIResultObjectCollision,
1616
CIResultObjectExisting,
1717
CIResultObjectNew,
18+
NatureRecord,
1819
Record,
1920
RecordCrossmatch,
2021
RecordWithPGC,
@@ -45,8 +46,9 @@
4546
"CIResultObjectCollision",
4647
"CIResultObjectExisting",
4748
"CIResultObjectNew",
48-
"RecordWithPGC",
49+
"NatureRecord",
4950
"Record",
51+
"RecordWithPGC",
5052
"Layer2CatalogObject",
5153
"Layer2Object",
5254
"RawCatalog",

app/data/model/interface.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class RawCatalog(enum.Enum):
3030
aggregated data on layer 2.
3131
"""
3232

33+
ALL = "all"
3334
ICRS = "icrs"
3435
DESIGNATION = "designation"
3536
REDSHIFT = "redshift"

app/data/model/records.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,10 @@ class RecordCrossmatch:
4646
class RecordWithPGC:
4747
pgc: int
4848
record: Record
49+
50+
51+
@dataclass
class NatureRecord:
    """A single layer-1 nature (object type) observation tied to a PGC object.

    Attributes:
        pgc: PGC number of the object this record belongs to.
        record_id: identifier of the originating layer-0 record.
        type_name: object type reported by that record.
    """

    pgc: int
    record_id: str
    type_name: str

app/data/repositories/layer1.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ def save_structured_data(self, table: str, columns: list[str], ids: list[str], d
3333
)
3434
rows = [[rid] + vals for rid, vals in zip(ids, data, strict=True)]
3535
with self.with_tx():
36-
cursor = self._storage.get_connection().cursor()
37-
cursor.executemany(query, rows)
36+
self._storage.execute_batch(query, rows)
3837

3938
def save_data(self, records: list[model.Record]) -> None:
4039
all_catalog_objects = []
@@ -126,6 +125,22 @@ def get_new_observations(
126125

127126
return records
128127

128+
def get_new_nature_records(self, dt: datetime.datetime, limit: int, offset: int) -> list[model.NatureRecord]:
    """Fetch nature records for objects whose layer-0 records changed after *dt*.

    Pagination is cursor-based: the inner query picks the next *limit*
    distinct PGC numbers greater than *offset*; the outer query then returns
    every nature record for those PGCs so the caller can aggregate per object.
    """
    query = """SELECT o.pgc, l1.record_id, l1.type_name
        FROM nature.data AS l1
        JOIN layer0.records AS o ON l1.record_id = o.id
        WHERE o.pgc IN (
            SELECT DISTINCT o.pgc
            FROM nature.data AS l1
            JOIN layer0.records AS o ON l1.record_id = o.id
            WHERE o.modification_time > %s AND o.pgc > %s
            ORDER BY o.pgc
            LIMIT %s
        )
        ORDER BY o.pgc ASC"""

    result: list[model.NatureRecord] = []
    for row in self._storage.query(query, params=[dt, offset, limit]):
        result.append(
            model.NatureRecord(pgc=int(row["pgc"]), record_id=row["record_id"], type_name=row["type_name"])
        )
    return result
143+
129144
def query_records(
130145
self,
131146
catalogs: list[model.RawCatalog],

app/data/repositories/layer2/repository.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,15 @@ def __init__(self, storage: postgres.PgStorage, logger: structlog.stdlib.BoundLo
2424
self._logger = logger
2525
self._storage = storage
2626

27-
def get_last_update_time(self) -> datetime.datetime:
28-
return self._storage.query_one("SELECT dt FROM layer2.last_update WHERE catalog = %s", params=["all"])["dt"]
27+
def get_last_update_time(self, catalog: model.RawCatalog) -> datetime.datetime:
    """Return the timestamp of the last layer-2 update for *catalog*."""
    row = self._storage.query_one(
        "SELECT dt FROM layer2.last_update WHERE catalog = %s",
        params=[catalog.value],
    )
    return row["dt"]
2931

30-
def update_last_update_time(self, dt: datetime.datetime):
32+
def update_last_update_time(self, dt: datetime.datetime, catalog: model.RawCatalog) -> None:
    """Persist *dt* as the last layer-2 update time for *catalog*."""
    query = "UPDATE layer2.last_update SET dt = %s WHERE catalog = %s"
    self._storage.exec(query, params=[dt, catalog.value])
3537

3638
def get_orphaned_pgcs(self, catalogs: list[model.RawCatalog]) -> dict[str, list[int]]:
@@ -96,6 +98,20 @@ def save_data(self, objects: list[model.Layer2CatalogObject]):
9698

9799
self._storage.exec(query, params=params)
98100

101+
def save(self, table: str, columns: list[str], pgcs: list[int], data: list[list[Any]]) -> None:
    """Upsert one row per PGC into *table* as a single batched statement.

    Each row is ``[pgc, *values]``; on a pgc conflict every column is
    overwritten with the new value. No-op when *pgcs* is empty.
    Raises ValueError (via zip strict) if *pgcs* and *data* differ in length.
    """
    if not pgcs:
        return

    cols = ["pgc", *columns]
    values_clause = ",".join("%s" for _ in cols)
    update_clause = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols)
    query = (
        f"INSERT INTO {table} ({', '.join(cols)}) VALUES ({values_clause}) "
        f"ON CONFLICT (pgc) DO UPDATE SET {update_clause}"
    )

    batch = [[pgc, *values] for pgc, values in zip(pgcs, data, strict=True)]
    with self.with_tx():
        self._storage.execute_batch(query, batch)
114+
99115
def _construct_batch_query(
100116
self,
101117
catalogs: list[model.RawCatalog],

app/lib/storage/postgres/postgres_storage.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from collections.abc import Sequence
12
from typing import Any
23

34
import numpy as np
@@ -101,6 +102,15 @@ def exec(self, query: str | sql.SQL | sql.Composed, *, params: list[Any] | None
101102
cursor = self._connection.cursor()
102103
cursor.execute(query, params)
103104

105+
def execute_batch(self, query: str, rows: Sequence[Sequence[Any]]) -> None:
    """Run *query* once per row in *rows* on a single cursor (executemany).

    Raises RuntimeError if the connection has not been established.
    """
    if self._connection is None:
        raise RuntimeError("Unable to execute query: connection to Postgres was not established")

    log.debug("SQL execute batch", query=query.replace("\n", " "), num_rows=len(rows))

    cur = self._connection.cursor()
    cur.executemany(query, rows)
113+
104114
def query(self, query: str | sql.SQL | sql.Composed, *, params: list[Any] | None = None) -> list[rows.DictRow]:
105115
if params is None:
106116
params = []

app/tasks/layer2_import.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def prepare(self, config: interface.Config):
3737
self.layer2_repository = repositories.Layer2Repository(self.pg_storage, self.log)
3838

3939
def run(self):
40-
last_update_dt = self.layer2_repository.get_last_update_time()
40+
last_update_dt = self.layer2_repository.get_last_update_time(model.RawCatalog.ALL)
4141

4242
self.log.info("Starting Layer 2 import", last_update=last_update_dt.ctime())
4343

@@ -77,7 +77,7 @@ def run(self):
7777

7878
self.log.info("Updated catalog", catalog=catalog.value)
7979

80-
self.layer2_repository.update_last_update_time(datetime.datetime.now(tz=datetime.UTC))
80+
self.layer2_repository.update_last_update_time(datetime.datetime.now(tz=datetime.UTC), model.RawCatalog.ALL)
8181
self.log.info("Layer 2 import completed", last_update=last_update_dt.ctime())
8282

8383
def cleanup(self):

app/tasks/layer2_import_nature.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import datetime
2+
from typing import final
3+
4+
import structlog
5+
6+
from app.data import model, repositories
7+
from app.lib import containers
8+
from app.lib.storage import postgres
9+
from app.tasks import interface
10+
11+
12+
@final
class Layer2ImportNatureTask(interface.Task):
    """Aggregate layer-1 object-nature (type) records into layer 2.

    For every PGC object whose nature records changed since the last import,
    the most frequent ``type_name`` among its layer-1 records is chosen and
    upserted into ``layer2.nature``. Orphaned PGCs are removed afterwards.
    In dry-run mode nothing is written; a summary table is printed instead.
    """

    def __init__(
        self,
        logger: structlog.stdlib.BoundLogger,
        batch_size: int = 100000,
        dry_run: bool = False,
    ) -> None:
        self.log = logger
        self.batch_size = batch_size
        self.dry_run = dry_run

    @classmethod
    def name(cls) -> str:
        return "layer2-import-nature"

    def prepare(self, config: interface.Config) -> None:
        """Open the Postgres connection and build the layer repositories."""
        self.pg_storage = postgres.PgStorage(config.storage, self.log)
        self.pg_storage.connect()
        self.layer1_repository = repositories.Layer1Repository(self.pg_storage, self.log)
        self.layer2_repository = repositories.Layer2Repository(self.pg_storage, self.log)

    def run(self) -> None:
        """Import changed nature records, prune orphans, advance the watermark."""
        last_update_dt = self.layer2_repository.get_last_update_time(model.RawCatalog.NATURE)
        self.log.info(
            "Starting Layer 2 nature import",
            last_update=last_update_dt.ctime(),
            dry_run=self.dry_run,
        )

        objects_to_save = 0
        type_distribution: dict[str, int] = {}
        for offset, records in containers.read_batches(
            self.layer1_repository.get_new_nature_records,
            lambda data: len(data) == 0,
            0,
            # Cursor for the next page: largest pgc in the current batch.
            lambda d, _: d[-1].pgc,
            last_update_dt,
            batch_size=self.batch_size,
        ):
            records_by_pgc = containers.group_by(records, key_func=lambda r: r.pgc)
            pgcs: list[int] = []
            data: list[list[str]] = []
            for pgc, pgc_records in records_by_pgc.items():
                # Majority vote over this object's records; ties are broken by
                # first occurrence, matching dict insertion order.
                type_counts: dict[str, int] = {}
                for rec in pgc_records:
                    type_counts[rec.type_name] = type_counts.get(rec.type_name, 0) + 1
                max_type = max(type_counts, key=type_counts.__getitem__)
                type_distribution[max_type] = type_distribution.get(max_type, 0) + 1
                pgcs.append(pgc)
                data.append([max_type])
            if pgcs:
                objects_to_save += len(pgcs)
                if not self.dry_run:
                    self.layer2_repository.save("layer2.nature", ["type_name"], pgcs, data)
            self.log.info(
                "Processed batch",
                last_pgc=offset,
                batch_size=len(records),
                total_processed=objects_to_save,
            )

        orphaned = self.layer2_repository.get_orphaned_pgcs([model.RawCatalog.NATURE])
        pgcs_to_remove = [pgc for pgcs in orphaned.values() for pgc in pgcs]
        orphans_to_delete = len(pgcs_to_remove)
        if pgcs_to_remove and not self.dry_run:
            self.layer2_repository.remove_pgcs([model.RawCatalog.NATURE], pgcs_to_remove)

        if not self.dry_run:
            self.layer2_repository.update_last_update_time(
                datetime.datetime.now(tz=datetime.UTC), model.RawCatalog.NATURE
            )
        self.log.info("Layer 2 nature import completed", last_update=last_update_dt.ctime())

        if self.dry_run:
            self._print_summary(objects_to_save, orphans_to_delete, type_distribution)

    def _print_summary(
        self,
        objects_to_save: int,
        orphans_to_delete: int,
        type_distribution: dict[str, int],
    ) -> None:
        """Print an ASCII table describing what a non-dry run would change."""
        col_desc = "Description"
        col_count = "Count"
        type_rows = sorted(type_distribution.items())

        # Column widths: wide enough for the headers, a fixed minimum of 30,
        # and every cell that will be rendered below.
        width_desc = max(
            [len(col_desc), 30, len("Distribution by type")]
            + [len(f" {t}") for t, _ in type_rows]
        )
        width_count = max(
            [len(col_count), len(str(objects_to_save)), len(str(orphans_to_delete))]
            + [len(str(c)) for _, c in type_rows]
        )

        sep = f"+{'-' * (width_desc + 2)}+{'-' * (width_count + 2)}+"
        lines = [
            sep,
            f"| {col_desc:<{width_desc}} | {col_count:>{width_count}} |",
            sep,
            f"| {'Objects to be saved':<{width_desc}} | {objects_to_save:>{width_count}} |",
            f"| {'Orphans to be deleted':<{width_desc}} | {orphans_to_delete:>{width_count}} |",
            sep,
        ]
        if type_rows:
            lines.append(f"| {'Distribution by type':<{width_desc}} | {'':>{width_count}} |")
            lines.extend(f"| {f' {t}':<{width_desc}} | {c:>{width_count}} |" for t, c in type_rows)
            lines.append(sep)
        print("\n".join(lines))

    def cleanup(self) -> None:
        """Close the Postgres connection opened in prepare()."""
        self.pg_storage.disconnect()

app/tasks/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
interface,
88
layer0_marking,
99
layer2_import,
10+
layer2_import_nature,
1011
layer2_orphan_cleanup,
1112
submit_crossmatch,
1213
)
@@ -16,6 +17,7 @@
1617
layer0_marking.Layer0MarkingTask,
1718
submit_crossmatch.SubmitCrossmatchTask,
1819
layer2_import.Layer2ImportTask,
20+
layer2_import_nature.Layer2ImportNatureTask,
1921
layer2_orphan_cleanup.Layer2OrphanCleanupTask,
2022
]
2123

main.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,16 @@
1010

1111

1212
@click.group()
13-
def cli():
14-
pass
13+
@click.option(
14+
"--log-level",
15+
type=click.Choice(["debug", "info", "warning", "error", "critical"], case_sensitive=False),
16+
default="info",
17+
help="Set the logging level (for runtask and other commands that use it)",
18+
)
19+
@click.pass_context
20+
def cli(ctx: click.Context, log_level: str) -> None:
    """Root command group; records the chosen log level for subcommands."""
    ctx.ensure_object(dict)
    ctx.obj["log_level"] = log_level
1523

1624

1725
@cli.command(short_help=AdminAPICommand.help())
@@ -50,14 +58,15 @@ def dataapi(config: str):
5058
type=str,
5159
help="Path to input data file",
5260
)
53-
@click.option(
54-
"--log-level",
55-
type=click.Choice(["debug", "info", "warning", "error", "critical"], case_sensitive=False),
56-
default="info",
57-
help="Set the logging level",
58-
)
61+
@click.pass_context
5962
@click.argument("task_args", nargs=-1, type=click.UNPROCESSED)
60-
def runtask(task_name: str, input_data: str | None, log_level: str, task_args: tuple[str, ...]):
63+
def runtask(
    ctx: click.Context,
    task_name: str,
    input_data: str | None,
    task_args: tuple[str, ...],
) -> None:
    """Dispatch the named task, using the group-level log level if present."""
    group_opts = ctx.obj or {}
    log_level = group_opts.get("log_level", "info")
    commands.run(RunTaskCommand(task_name, input_data, None, task_args, log_level))
6271

6372

0 commit comments

Comments
 (0)