import datetime
from typing import final

import structlog

from app.data import model, repositories
from app.lib import containers
from app.lib.storage import postgres
from app.tasks import interface


@final
class Layer2ImportNatureTask(interface.Task):
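    """Aggregate layer 1 nature (object type) records into layer 2.

    For every PGC that has new records in layer 1, the most frequent type name
    is written to the "layer2.nature" table; PGCs reported as orphaned for the
    nature catalog are removed from layer 2.
    """
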
    def __init__(
        self,
        logger: structlog.stdlib.BoundLogger,
        batch_size: int = 100000,
        dry_run: bool = False,
    ) -> None:
        self.log = logger
        self.batch_size = batch_size
        self.dry_run = dry_run

    @classmethod
    def name(cls) -> str:
        return "layer2-import-nature"

    def prepare(self, config: interface.Config) -> None:
        self.pg_storage = postgres.PgStorage(config.storage, self.log)
        self.pg_storage.connect()
        self.layer1_repository = repositories.Layer1Repository(self.pg_storage, self.log)
        self.layer2_repository = repositories.Layer2Repository(self.pg_storage, self.log)

    def run(self) -> None:
        last_update_dt = self.layer2_repository.get_last_update_time(model.RawCatalog.NATURE)
        self.log.info(
            "Starting Layer 2 nature import",
            last_update=last_update_dt.ctime(),
            dry_run=self.dry_run,
        )

        objects_to_save = 0
        type_distribution: dict[str, int] = {}
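        # Read new layer 1 nature records in batches. Iteration starts at PGC
        # offset 0, stops when a batch comes back empty, and uses the PGC of
        # the last record in each batch as the next offset.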
        for offset, records in containers.read_batches(
            self.layer1_repository.get_new_nature_records,
            lambda data: len(data) == 0,
            0,
            lambda d, _: d[-1].pgc,
            last_update_dt,
            batch_size=self.batch_size,
        ):
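            # Group the batch by PGC and keep the most frequent type name for
            # each object.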
            records_by_pgc = containers.group_by(records, key_func=lambda r: r.pgc)
            pgcs: list[int] = []
            data: list[list[str]] = []
            for pgc, pgc_records in records_by_pgc.items():
                type_counts: dict[str, int] = {}
                for rec in pgc_records:
                    type_counts[rec.type_name] = type_counts.get(rec.type_name, 0) + 1
                max_type = max(type_counts, key=lambda k: type_counts[k])
                type_distribution[max_type] = type_distribution.get(max_type, 0) + 1
                pgcs.append(pgc)
                data.append([max_type])
            if pgcs:
                objects_to_save += len(pgcs)
                if not self.dry_run:
                    self.layer2_repository.save("layer2.nature", ["type_name"], pgcs, data)
            self.log.info(
                "Processed batch",
                last_pgc=offset,
                batch_size=len(records),
                total_processed=objects_to_save,
            )

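        # PGCs that are orphaned for the nature catalog are removed from
        # layer 2; as with saving, deletion is skipped in dry-run mode.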
        orphaned = self.layer2_repository.get_orphaned_pgcs([model.RawCatalog.NATURE])
        pgcs_to_remove = [pgc for pgcs in orphaned.values() for pgc in pgcs]
        orphans_to_delete = len(pgcs_to_remove)
        if pgcs_to_remove and not self.dry_run:
            self.layer2_repository.remove_pgcs([model.RawCatalog.NATURE], pgcs_to_remove)

        if not self.dry_run:
            self.layer2_repository.update_last_update_time(
                datetime.datetime.now(tz=datetime.UTC), model.RawCatalog.NATURE
            )
            self.log.info("Layer 2 nature import completed", last_update=last_update_dt.ctime())

        if self.dry_run:
            self._print_summary(objects_to_save, orphans_to_delete, type_distribution)

    def _print_summary(
        self,
        objects_to_save: int,
        orphans_to_delete: int,
        type_distribution: dict[str, int],
    ) -> None:
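        """Print an ASCII table describing what a non-dry run would have done.

        Example output (the counts and type names are illustrative only):

        +--------------------------------+-------+
        | Description                    | Count |
        +--------------------------------+-------+
        | Objects to be saved            |   154 |
        | Orphans to be deleted          |     7 |
        +--------------------------------+-------+
        | Distribution by type           |       |
        |  G                             |   120 |
        |  S                             |    34 |
        +--------------------------------+-------+
        """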
        col_desc = "Description"
        col_count = "Count"
        type_rows = [(t, c) for t, c in sorted(type_distribution.items())]
        width_desc = max(
            len(col_desc),
            30,
            len("Distribution by type"),
            *(len(f" {t}") for t, _ in type_rows) if type_rows else [0],
        )
        width_count = max(
            len(col_count),
            len(str(objects_to_save)),
            len(str(orphans_to_delete)),
            *(len(str(c)) for _, c in type_rows) if type_rows else [0],
        )
        sep = f"+{'-' * (width_desc + 2)}+{'-' * (width_count + 2)}+"
        lines = [
            sep,
            f"| {col_desc:<{width_desc}} | {col_count:>{width_count}} |",
            sep,
            f"| {'Objects to be saved':<{width_desc}} | {objects_to_save:>{width_count}} |",
            f"| {'Orphans to be deleted':<{width_desc}} | {orphans_to_delete:>{width_count}} |",
            sep,
        ]
        if type_rows:
            lines.append(f"| {'Distribution by type':<{width_desc}} | {'':>{width_count}} |")
            lines.extend([f"| {f' {t}':<{width_desc}} | {c:>{width_count}} |" for t, c in type_rows])
            lines.append(sep)
        for line in lines:
            print(line)

    def cleanup(self) -> None:
        self.pg_storage.disconnect()
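

# A minimal usage sketch (hypothetical wiring: how the task is registered and
# how the interface.Config instance is built live elsewhere in the application):
#
#   task = Layer2ImportNatureTask(structlog.get_logger(), dry_run=True)
#   task.prepare(config)
#   try:
#       task.run()
#   finally:
#       task.cleanup()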