From 1d7d684f541ffa6738d7ab6625e6f3744a886cd4 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Thu, 5 Mar 2026 17:52:53 +0000 Subject: [PATCH 01/18] Refactor metadata collection to be per property --- src/ldlite/database/_expansion/nodes.py | 79 ++++++++++++++----------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 7883f51..e9f41d3 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -131,74 +131,83 @@ def unnest( cur.execute( sql.SQL( """ +SELECT ldlite_system.jobject_keys(j) FROM ( + SELECT {json_col} AS j + FROM {table} + WHERE NOT ldlite_system.jis_null({json_col}) + LIMIT 1 +) +""", + ) + .format(table=source_table, json_col=self.identifier) + .as_string(), + ) + props = [prop[0] for prop in cur.fetchall()] + + for prop in props: + with conn.cursor() as cur: + cur.execute( + sql.SQL( + """ WITH - one_object AS ( - SELECT {json_col} AS json - FROM {table} - WHERE NOT ldlite_system.jis_null({json_col}) - LIMIT 1 - ), - props AS (SELECT ldlite_system.jobject_keys(json) AS prop FROM one_object), values AS ( - SELECT - prop - ,ldlite_system.jextract({json_col}, prop) as ld_value - FROM {table}, props + SELECT ldlite_system.jextract({json_col}, $1) as ld_value + FROM {table} ), value_and_types AS ( SELECT - prop - ,ldlite_system.jtype_of(ld_value) AS json_type + ldlite_system.jtype_of(ld_value) AS json_type ,ld_value FROM values WHERE NOT ldlite_system.jis_null(ld_value) ), array_values AS ( SELECT - v.prop - ,ldlite_system.jtype_of(a.ld_value) AS json_type - ,v.ld_value - FROM value_and_types v, ldlite_system.jexplode(v.ld_value) a + ldlite_system.jtype_of(a.ld_value) AS json_type + ,a.ld_value + FROM value_and_types v + CROSS JOIN LATERAL ( + SELECT s.ld_value + FROM ldlite_system.jexplode(v.ld_value) AS s + WHERE NOT ldlite_system.jis_null(s.ld_value) + LIMIT 3 + ) a WHERE v.json_type = 'array' ), all_values AS ( SELECT - prop - ,json_type + json_type ,ld_value ,FALSE AS is_array FROM value_and_types WHERE json_type <> 'array' UNION SELECT - prop - ,json_type + json_type ,ld_value ,TRUE AS is_array FROM array_values WHERE NOT ldlite_system.jis_null(ld_value) ) SELECT - prop - ,STRING_AGG(DISTINCT json_type, '|') AS json_type + STRING_AGG(DISTINCT json_type, '|') AS json_type ,bool_and(is_array) AS is_array ,bool_and(ldlite_system.jis_uuid(ld_value)) AS is_uuid ,bool_and(ldlite_system.jis_datetime(ld_value)) AS is_datetime ,bool_and(ldlite_system.jis_float(ld_value)) AS is_float FROM all_values -GROUP BY prop +HAVING COUNT(*) > 0 """, + ) + .format(table=source_table, json_col=self.identifier) + .as_string(), + (prop,), ) - .format(table=source_table, json_col=self.identifier) - .as_string(), - ) - - create_columns.extend( - [ - row.select_column(self.identifier, self.add(row)) - for row in [Metadata(*r) for r in cur.fetchall()] - ], - ) + if (row := cur.fetchone()) is not None: + meta = Metadata(prop, *row) + create_columns.append( + meta.select_column(self.identifier, self.add(meta)), + ) with conn.cursor() as cur: cur.execute( @@ -264,7 +273,7 @@ def explode( o_col = self.name + "__o" create_columns: list[sql.Composable] = [ sql.SQL( - "(ROW_NUMBER() OVER (ORDER BY (SELECT NULL)))::integer AS __id" + "(ROW_NUMBER() OVER (ORDER BY (SELECT NULL)))::integer AS __id", ), *[sql.Identifier(v) for v in self.carryover], sql.SQL( From 1fee373a23539cbdf8fa272022d619b8bb9033ef Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Thu, 5 Mar 2026 18:27:17 +0000 Subject: [PATCH 02/18] Monitor progress of transformation --- src/ldlite/__init__.py | 16 +++++++++++++++- src/ldlite/database/__init__.py | 8 +++++++- src/ldlite/database/_expansion/__init__.py | 8 +++++++- src/ldlite/database/_expansion/nodes.py | 17 +++++++++++++++++ src/ldlite/database/_typed_database.py | 10 +++++++++- 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py index 1c118ae..e3b35be 100644 --- a/src/ldlite/__init__.py +++ b/src/ldlite/__init__.py @@ -350,7 +350,21 @@ def query( # noqa: C901, PLR0912, PLR0913, PLR0915 transform_started = datetime.now(timezone.utc) if not use_legacy_transform: - newtables = self._database.expand_prefix(table, json_depth, keep_raw) + with tqdm( + total=0, + desc="transforming", + leave=False, + mininterval=5, + disable=self._quiet, + unit="ops", + delay=5, + ) as progress: + newtables = self._database.expand_prefix( + table, + json_depth, + keep_raw, + progress, + ) if keep_raw: newtables = [table, *newtables] transform_elapsed = datetime.now(timezone.utc) - transform_started diff --git a/src/ldlite/database/__init__.py b/src/ldlite/database/__init__.py index 9acd7b8..8ed5875 100644 --- a/src/ldlite/database/__init__.py +++ b/src/ldlite/database/__init__.py @@ -54,7 +54,13 @@ def ingest_records(self, prefix: str, records: Iterator[bytes]) -> int: """Ingests a stream of records dowloaded from FOLIO to the raw table.""" @abstractmethod - def expand_prefix(self, prefix: str, json_depth: int, keep_raw: bool) -> list[str]: + def expand_prefix( + self, + prefix: str, + json_depth: int, + keep_raw: bool, + progress: tqdm[NoReturn] | None = None, + ) -> list[str]: """Unnests and explodes the raw data at the given prefix.""" @abstractmethod diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py index d6f8bd5..4b22070 100644 --- a/src/ldlite/database/_expansion/__init__.py +++ b/src/ldlite/database/_expansion/__init__.py @@ -2,7 +2,7 @@ from collections import deque from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, NoReturn from psycopg import sql @@ -13,6 +13,7 @@ import duckdb import psycopg + from tqdm import tqdm @dataclass @@ -34,6 +35,7 @@ class ExpandContext: ] # source_cte will go away when DuckDB implements CTAS RETURNING source_cte: Callable[[bool], str] + progress: tqdm[NoReturn] | None def array_context( self, @@ -48,6 +50,7 @@ def array_context( self.get_output_table, self.preprocess, self.source_cte, + self.progress, ) @@ -76,6 +79,7 @@ def _expand_nonmarc( ctx.source_table, ctx.get_transform_table(count), ctx.source_cte(False), + ctx.progress, ) expand_children_of = deque([root]) @@ -92,6 +96,7 @@ def _expand_nonmarc( ctx.get_transform_table(count), ctx.get_transform_table(count + 1), ctx.source_cte(False), + ctx.progress, ) expand_children_of.append(c) count += 1 @@ -109,6 +114,7 @@ def _expand_nonmarc( new_source_table, ctx.get_transform_table(count + 1), ctx.source_cte(True), + ctx.progress, ) count += 1 diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index e9f41d3..7f62729 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -7,9 +7,11 @@ if TYPE_CHECKING: from collections.abc import Iterator + from typing import NoReturn import duckdb import psycopg + from tqdm import tqdm from .metadata import Metadata @@ -121,6 +123,7 @@ def unnest( source_table: sql.Identifier, dest_table: sql.Identifier, source_cte: str, + progress: tqdm[NoReturn] | None, ) -> None: self.unnested = True create_columns: list[sql.Composable] = [ @@ -144,6 +147,10 @@ def unnest( ) props = [prop[0] for prop in cur.fetchall()] + prop_count = len(props) + if progress is not None: + progress.total += prop_count * 2 + for prop in props: with conn.cursor() as cur: cur.execute( @@ -208,6 +215,11 @@ def unnest( create_columns.append( meta.select_column(self.identifier, self.add(meta)), ) + if progress is not None and meta.is_array: + progress.total += 1 + + if progress is not None: + progress.update(1) with conn.cursor() as cur: cur.execute( @@ -229,6 +241,8 @@ def unnest( ) .as_string(), ) + if progress is not None: + progress.update(prop_count) def _carryover(self) -> Iterator[str]: for n in self.root.descendents: @@ -268,6 +282,7 @@ def explode( source_table: sql.Identifier, dest_table: sql.Identifier, source_cte: str, + progress: tqdm[NoReturn] | None, ) -> list[str]: with conn.cursor() as cur: o_col = self.name + "__o" @@ -310,6 +325,8 @@ def explode( ) .as_string(), ) + if progress is not None: + progress.update(1) return ["__id", *self.carryover, o_col] diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py index eb4a1f3..a9197ad 100644 --- a/src/ldlite/database/_typed_database.py +++ b/src/ldlite/database/_typed_database.py @@ -16,6 +16,7 @@ if TYPE_CHECKING: from collections.abc import Callable, Sequence + from typing import NoReturn import duckdb from tqdm import tqdm @@ -184,7 +185,13 @@ def preprocess_source_table( @abstractmethod def source_table_cte_stmt(self, keep_source: bool) -> str: ... - def expand_prefix(self, prefix: str, json_depth: int, keep_raw: bool) -> list[str]: + def expand_prefix( + self, + prefix: str, + json_depth: int, + keep_raw: bool, + progress: tqdm[NoReturn] | None = None, + ) -> list[str]: pfx = Prefix(prefix) with closing(self._conn_factory()) as conn: self._drop_extracted_tables(conn, pfx) @@ -224,6 +231,7 @@ def expand_prefix(self, prefix: str, json_depth: int, keep_raw: bool) -> list[st pfx.output_table, self.preprocess_source_table, # type: ignore [arg-type] self.source_table_cte_stmt, + progress, ), ) From b37dd1f0645c8cbc1cc51fea5f5e5316ab2ab587 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Thu, 5 Mar 2026 21:07:53 +0000 Subject: [PATCH 03/18] Optimize the definitions of postgres json operators --- src/ldlite/database/_postgres.py | 192 +++++++++++++++++++++++++------ 1 file changed, 156 insertions(+), 36 deletions(-) diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py index a9b4b81..e4de4ff 100644 --- a/src/ldlite/database/_postgres.py +++ b/src/ldlite/database/_postgres.py @@ -42,49 +42,53 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: PARALLEL SAFE; CREATE OR REPLACE FUNCTION ldlite_system.jextract(j JSONB, p TEXT) RETURNS JSONB AS $$ -WITH jp AS ( - -- This is somewhat of a hack. - -- There isn't a really good way to get the element unchanged - -- which works for duckdb and postgres AND CRUCIALLY - -- has a similar syntax to everything else so that we don't - -- have to have special cases for exploding the array and it - -- can share all the same type checking / statement generation code. - -- We're pretending that postgres supports -> '$' style syntax like duckdb. - SELECT - CASE - WHEN p = '$' THEN j #> '{}' - ELSE j->p - END AS val - ,CASE - WHEN p = '$' THEN j #>> '{}' - ELSE j->>p - END AS str -) SELECT - CASE - WHEN ldlite_system.jtype_of(jp.val) = 'string' THEN + CASE jsonb_typeof(val) + WHEN 'string' THEN CASE - WHEN lower(jp.str) = 'null' THEN 'null'::JSONB - WHEN length(jp.str) = 0 THEN 'null'::JSONB - ELSE jp.val + WHEN lower(val #>> '{}') IN ('null', '') THEN 'null'::jsonb + ELSE val END - WHEN ldlite_system.jtype_of(jp.val) = 'array' THEN + WHEN 'array' THEN CASE - WHEN jsonb_array_length(jsonb_path_query_array(jp.val, '$[*] ? (@ != null)')) = 0 THEN 'null'::JSONB - ELSE jsonb_path_query_array(jp.val, '$[*] ? (@ != null)') + WHEN jsonb_array_length(val) = 0 THEN 'null'::jsonb + WHEN NOT EXISTS ( + SELECT 1 + FROM jsonb_array_elements(val) AS e(elem) + WHERE elem = 'null'::jsonb + LIMIT 1 + ) THEN val + ELSE COALESCE( + ( + SELECT jsonb_agg(e) + FROM jsonb_array_elements(val) AS a(e) + WHERE e <> 'null'::jsonb + ), + 'null'::jsonb + ) END - WHEN ldlite_system.jtype_of(jp.val) = 'object' THEN + WHEN 'object' THEN CASE - WHEN jp.str = '{}' THEN 'null'::JSONB - ELSE jp.val + WHEN val = '{}'::jsonb THEN 'null'::jsonb + ELSE val END - ELSE jp.val + ELSE val END -FROM jp; +FROM ( + -- This is somewhat of a hack. + -- There isn't a really good way to get the element unchanged + -- which works for duckdb and postgres AND CRUCIALLY + -- has a similar syntax to everything else so that we don't + -- have to have special cases for exploding the array and it + -- can share all the same type checking / statement generation code. + -- We're pretending that postgres supports -> '$' style syntax like duckdb. + SELECT CASE WHEN p = '$' THEN j ELSE j->p END AS val +) s; $$ LANGUAGE sql IMMUTABLE -PARALLEL SAFE; +PARALLEL SAFE +STRICT; CREATE OR REPLACE FUNCTION ldlite_system.jextract_string(j JSONB, p TEXT) RETURNS TEXT AS $$ SELECT ldlite_system.jextract(j, p) #>> '{}' @@ -103,7 +107,67 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j JSONB) RETURNS BOOLEAN AS $$ SELECT CASE - WHEN ldlite_system.jtype_of(j) = 'string' THEN j->>0 ~ '^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[1-5][a-fA-F0-9]{3}-[89abAB][a-fA-F0-9]{3}-[a-fA-F0-9]{12}$' + WHEN jsonb_typeof(j) = 'string' THEN + ( + WITH v AS (SELECT $1 #>> '{}' AS s) + SELECT + LENGTH(s) = 36 + -- Hyphens at canonical positions: 9,14,19,24 + AND substr(s, 9,1) = '-' + AND substr(s, 14,1) = '-' + AND substr(s, 19,1) = '-' + AND substr(s, 24,1) = '-' + + -- Version M at pos 15 must be 1..5 + AND substr(s, 15,1) BETWEEN '1' AND '5' + + -- Variant N at pos 20 must be 8,9,a,b,A,B + AND substr(s, 20,1) IN ('8','9','a','b','A','B') + + -- All other non-hyphen characters must be hex [0-9a-fA-F] + AND ( + -- positions 1..8 + (substr(s, 1,1) BETWEEN '0' AND '9' OR substr(s, 1,1) BETWEEN 'a' AND 'f' OR substr(s, 1,1) BETWEEN 'A' AND 'F') AND + (substr(s, 2,1) BETWEEN '0' AND '9' OR substr(s, 2,1) BETWEEN 'a' AND 'f' OR substr(s, 2,1) BETWEEN 'A' AND 'F') AND + (substr(s, 3,1) BETWEEN '0' AND '9' OR substr(s, 3,1) BETWEEN 'a' AND 'f' OR substr(s, 3,1) BETWEEN 'A' AND 'F') AND + (substr(s, 4,1) BETWEEN '0' AND '9' OR substr(s, 4,1) BETWEEN 'a' AND 'f' OR substr(s, 4,1) BETWEEN 'A' AND 'F') AND + (substr(s, 5,1) BETWEEN '0' AND '9' OR substr(s, 5,1) BETWEEN 'a' AND 'f' OR substr(s, 5,1) BETWEEN 'A' AND 'F') AND + (substr(s, 6,1) BETWEEN '0' AND '9' OR substr(s, 6,1) BETWEEN 'a' AND 'f' OR substr(s, 6,1) BETWEEN 'A' AND 'F') AND + (substr(s, 7,1) BETWEEN '0' AND '9' OR substr(s, 7,1) BETWEEN 'a' AND 'f' OR substr(s, 7,1) BETWEEN 'A' AND 'F') AND + (substr(s, 8,1) BETWEEN '0' AND '9' OR substr(s, 8,1) BETWEEN 'a' AND 'f' OR substr(s, 8,1) BETWEEN 'A' AND 'F') AND + + -- positions 10..13 + (substr(s, 10,1) BETWEEN '0' AND '9' OR substr(s, 10,1) BETWEEN 'a' AND 'f' OR substr(s, 10,1) BETWEEN 'A' AND 'F') AND + (substr(s, 11,1) BETWEEN '0' AND '9' OR substr(s, 11,1) BETWEEN 'a' AND 'f' OR substr(s, 11,1) BETWEEN 'A' AND 'F') AND + (substr(s, 12,1) BETWEEN '0' AND '9' OR substr(s, 12,1) BETWEEN 'a' AND 'f' OR substr(s, 12,1) BETWEEN 'A' AND 'F') AND + (substr(s, 13,1) BETWEEN '0' AND '9' OR substr(s, 13,1) BETWEEN 'a' AND 'f' OR substr(s, 13,1) BETWEEN 'A' AND 'F') AND + + -- positions 16..18 (pos 15 is version, already checked) + (substr(s, 16,1) BETWEEN '0' AND '9' OR substr(s, 16,1) BETWEEN 'a' AND 'f' OR substr(s, 16,1) BETWEEN 'A' AND 'F') AND + (substr(s, 17,1) BETWEEN '0' AND '9' OR substr(s, 17,1) BETWEEN 'a' AND 'f' OR substr(s, 17,1) BETWEEN 'A' AND 'F') AND + (substr(s, 18,1) BETWEEN '0' AND '9' OR substr(s, 18,1) BETWEEN 'a' AND 'f' OR substr(s, 18,1) BETWEEN 'A' AND 'F') AND + + -- positions 21..23 (pos 20 is variant, already checked) + (substr(s, 21,1) BETWEEN '0' AND '9' OR substr(s, 21,1) BETWEEN 'a' AND 'f' OR substr(s, 21,1) BETWEEN 'A' AND 'F') AND + (substr(s, 22,1) BETWEEN '0' AND '9' OR substr(s, 22,1) BETWEEN 'a' AND 'f' OR substr(s, 22,1) BETWEEN 'A' AND 'F') AND + (substr(s, 23,1) BETWEEN '0' AND '9' OR substr(s, 23,1) BETWEEN 'a' AND 'f' OR substr(s, 23,1) BETWEEN 'A' AND 'F') AND + + -- positions 25..36 + (substr(s, 25,1) BETWEEN '0' AND '9' OR substr(s, 25,1) BETWEEN 'a' AND 'f' OR substr(s, 25,1) BETWEEN 'A' AND 'F') AND + (substr(s, 26,1) BETWEEN '0' AND '9' OR substr(s, 26,1) BETWEEN 'a' AND 'f' OR substr(s, 26,1) BETWEEN 'A' AND 'F') AND + (substr(s, 27,1) BETWEEN '0' AND '9' OR substr(s, 27,1) BETWEEN 'a' AND 'f' OR substr(s, 27,1) BETWEEN 'A' AND 'F') AND + (substr(s, 28,1) BETWEEN '0' AND '9' OR substr(s, 28,1) BETWEEN 'a' AND 'f' OR substr(s, 28,1) BETWEEN 'A' AND 'F') AND + (substr(s, 29,1) BETWEEN '0' AND '9' OR substr(s, 29,1) BETWEEN 'a' AND 'f' OR substr(s, 29,1) BETWEEN 'A' AND 'F') AND + (substr(s, 30,1) BETWEEN '0' AND '9' OR substr(s, 30,1) BETWEEN 'a' AND 'f' OR substr(s, 30,1) BETWEEN 'A' AND 'F') AND + (substr(s, 31,1) BETWEEN '0' AND '9' OR substr(s, 31,1) BETWEEN 'a' AND 'f' OR substr(s, 31,1) BETWEEN 'A' AND 'F') AND + (substr(s, 32,1) BETWEEN '0' AND '9' OR substr(s, 32,1) BETWEEN 'a' AND 'f' OR substr(s, 32,1) BETWEEN 'A' AND 'F') AND + (substr(s, 33,1) BETWEEN '0' AND '9' OR substr(s, 33,1) BETWEEN 'a' AND 'f' OR substr(s, 33,1) BETWEEN 'A' AND 'F') AND + (substr(s, 34,1) BETWEEN '0' AND '9' OR substr(s, 34,1) BETWEEN 'a' AND 'f' OR substr(s, 34,1) BETWEEN 'A' AND 'F') AND + (substr(s, 35,1) BETWEEN '0' AND '9' OR substr(s, 35,1) BETWEEN 'a' AND 'f' OR substr(s, 35,1) BETWEEN 'A' AND 'F') AND + (substr(s, 36,1) BETWEEN '0' AND '9' OR substr(s, 36,1) BETWEEN 'a' AND 'f' OR substr(s, 36,1) BETWEEN 'A' AND 'F') + ) + FROM v + ) ELSE FALSE END; $$ @@ -114,7 +178,63 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: CREATE OR REPLACE FUNCTION ldlite_system.jis_datetime(j JSONB) RETURNS BOOLEAN AS $$ SELECT CASE - WHEN ldlite_system.jtype_of(j) = 'string' THEN j->>0 ~ '^\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?$' + WHEN jsonb_typeof(j) = 'string' THEN + ( + WITH v AS (SELECT $1 #>> '{}' AS s) + SELECT + -- Length must be exactly 29 characters + length(s) = 29 + + -- Fixed punctuation positions + AND substr(s, 5, 1) = '-' + AND substr(s, 8, 1) = '-' + AND substr(s, 11, 1) = 'T' + AND substr(s, 14, 1) = ':' + AND substr(s, 17, 1) = ':' + AND substr(s, 20, 1) = '.' + AND substr(s, 24, 1) = '+' + AND substr(s, 27, 1) = ':' + + -- YYYY + AND substr(s, 1, 1) BETWEEN '0' AND '9' + AND substr(s, 2, 1) BETWEEN '0' AND '9' + AND substr(s, 3, 1) BETWEEN '0' AND '9' + AND substr(s, 4, 1) BETWEEN '0' AND '9' + + -- MM + AND substr(s, 6, 1) BETWEEN '0' AND '9' + AND substr(s, 7, 1) BETWEEN '0' AND '9' + + -- DD + AND substr(s, 9, 1) BETWEEN '0' AND '9' + AND substr(s, 10, 1) BETWEEN '0' AND '9' + + -- HH + AND substr(s, 12, 1) BETWEEN '0' AND '9' + AND substr(s, 13, 1) BETWEEN '0' AND '9' + + -- mm + AND substr(s, 15, 1) BETWEEN '0' AND '9' + AND substr(s, 16, 1) BETWEEN '0' AND '9' + + -- SS + AND substr(s, 18, 1) BETWEEN '0' AND '9' + AND substr(s, 19, 1) BETWEEN '0' AND '9' + + -- mmm + AND substr(s, 21, 1) BETWEEN '0' AND '9' + AND substr(s, 22, 1) BETWEEN '0' AND '9' + AND substr(s, 23, 1) BETWEEN '0' AND '9' + + -- Timezone HH + AND substr(s, 25, 1) BETWEEN '0' AND '9' + AND substr(s, 26, 1) BETWEEN '0' AND '9' + + -- Timezone MM + AND substr(s, 28, 1) BETWEEN '0' AND '9' + AND substr(s, 29, 1) BETWEEN '0' AND '9' + FROM v + ) ELSE FALSE END; $$ @@ -125,7 +245,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: CREATE OR REPLACE FUNCTION ldlite_system.jis_float(j JSONB) RETURNS BOOLEAN AS $$ SELECT CASE - WHEN ldlite_system.jtype_of(j) = 'number' THEN j->>0 LIKE '%.%' + WHEN jsonb_typeof(j) = 'number' THEN scale((j)::numeric) > 0 ELSE FALSE END; $$ @@ -134,7 +254,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: PARALLEL SAFE; CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j JSONB) RETURNS BOOLEAN AS $$ -SELECT j IS NULL OR j = 'null'::JSONB; +SELECT COALESCE(j = 'null'::jsonb, TRUE); $$ LANGUAGE sql IMMUTABLE From 0fe59864a768f26e30015ec7cd62c1fa515e04d5 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 15:47:48 +0000 Subject: [PATCH 04/18] Simplify checks for specific data types --- src/ldlite/database/_expansion/nodes.py | 8 +- src/ldlite/database/_postgres.py | 143 ++---------------------- tests/test_json_operators.py | 11 -- 3 files changed, 13 insertions(+), 149 deletions(-) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 7f62729..6659784 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -199,12 +199,12 @@ def unnest( SELECT STRING_AGG(DISTINCT json_type, '|') AS json_type ,bool_and(is_array) AS is_array - ,bool_and(ldlite_system.jis_uuid(ld_value)) AS is_uuid - ,bool_and(ldlite_system.jis_datetime(ld_value)) AS is_datetime - ,bool_and(ldlite_system.jis_float(ld_value)) AS is_float + ,bool_and(json_type = 'string' AND ldlite_system.jis_uuid(ld_value)) AS is_uuid + ,bool_and(json_type = 'string' AND ldlite_system.jis_datetime(ld_value)) AS is_datetime + ,bool_and(json_type = 'number' AND ldlite_system.jis_float(ld_value)) AS is_float FROM all_values HAVING COUNT(*) > 0 -""", +""", # noqa: E501 ) .format(table=source_table, json_col=self.identifier) .as_string(), diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py index e4de4ff..0bebfa3 100644 --- a/src/ldlite/database/_postgres.py +++ b/src/ldlite/database/_postgres.py @@ -105,153 +105,28 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: PARALLEL SAFE; CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j JSONB) RETURNS BOOLEAN AS $$ -SELECT - CASE - WHEN jsonb_typeof(j) = 'string' THEN - ( - WITH v AS (SELECT $1 #>> '{}' AS s) - SELECT - LENGTH(s) = 36 - -- Hyphens at canonical positions: 9,14,19,24 - AND substr(s, 9,1) = '-' - AND substr(s, 14,1) = '-' - AND substr(s, 19,1) = '-' - AND substr(s, 24,1) = '-' - - -- Version M at pos 15 must be 1..5 - AND substr(s, 15,1) BETWEEN '1' AND '5' - - -- Variant N at pos 20 must be 8,9,a,b,A,B - AND substr(s, 20,1) IN ('8','9','a','b','A','B') - - -- All other non-hyphen characters must be hex [0-9a-fA-F] - AND ( - -- positions 1..8 - (substr(s, 1,1) BETWEEN '0' AND '9' OR substr(s, 1,1) BETWEEN 'a' AND 'f' OR substr(s, 1,1) BETWEEN 'A' AND 'F') AND - (substr(s, 2,1) BETWEEN '0' AND '9' OR substr(s, 2,1) BETWEEN 'a' AND 'f' OR substr(s, 2,1) BETWEEN 'A' AND 'F') AND - (substr(s, 3,1) BETWEEN '0' AND '9' OR substr(s, 3,1) BETWEEN 'a' AND 'f' OR substr(s, 3,1) BETWEEN 'A' AND 'F') AND - (substr(s, 4,1) BETWEEN '0' AND '9' OR substr(s, 4,1) BETWEEN 'a' AND 'f' OR substr(s, 4,1) BETWEEN 'A' AND 'F') AND - (substr(s, 5,1) BETWEEN '0' AND '9' OR substr(s, 5,1) BETWEEN 'a' AND 'f' OR substr(s, 5,1) BETWEEN 'A' AND 'F') AND - (substr(s, 6,1) BETWEEN '0' AND '9' OR substr(s, 6,1) BETWEEN 'a' AND 'f' OR substr(s, 6,1) BETWEEN 'A' AND 'F') AND - (substr(s, 7,1) BETWEEN '0' AND '9' OR substr(s, 7,1) BETWEEN 'a' AND 'f' OR substr(s, 7,1) BETWEEN 'A' AND 'F') AND - (substr(s, 8,1) BETWEEN '0' AND '9' OR substr(s, 8,1) BETWEEN 'a' AND 'f' OR substr(s, 8,1) BETWEEN 'A' AND 'F') AND - - -- positions 10..13 - (substr(s, 10,1) BETWEEN '0' AND '9' OR substr(s, 10,1) BETWEEN 'a' AND 'f' OR substr(s, 10,1) BETWEEN 'A' AND 'F') AND - (substr(s, 11,1) BETWEEN '0' AND '9' OR substr(s, 11,1) BETWEEN 'a' AND 'f' OR substr(s, 11,1) BETWEEN 'A' AND 'F') AND - (substr(s, 12,1) BETWEEN '0' AND '9' OR substr(s, 12,1) BETWEEN 'a' AND 'f' OR substr(s, 12,1) BETWEEN 'A' AND 'F') AND - (substr(s, 13,1) BETWEEN '0' AND '9' OR substr(s, 13,1) BETWEEN 'a' AND 'f' OR substr(s, 13,1) BETWEEN 'A' AND 'F') AND - - -- positions 16..18 (pos 15 is version, already checked) - (substr(s, 16,1) BETWEEN '0' AND '9' OR substr(s, 16,1) BETWEEN 'a' AND 'f' OR substr(s, 16,1) BETWEEN 'A' AND 'F') AND - (substr(s, 17,1) BETWEEN '0' AND '9' OR substr(s, 17,1) BETWEEN 'a' AND 'f' OR substr(s, 17,1) BETWEEN 'A' AND 'F') AND - (substr(s, 18,1) BETWEEN '0' AND '9' OR substr(s, 18,1) BETWEEN 'a' AND 'f' OR substr(s, 18,1) BETWEEN 'A' AND 'F') AND - - -- positions 21..23 (pos 20 is variant, already checked) - (substr(s, 21,1) BETWEEN '0' AND '9' OR substr(s, 21,1) BETWEEN 'a' AND 'f' OR substr(s, 21,1) BETWEEN 'A' AND 'F') AND - (substr(s, 22,1) BETWEEN '0' AND '9' OR substr(s, 22,1) BETWEEN 'a' AND 'f' OR substr(s, 22,1) BETWEEN 'A' AND 'F') AND - (substr(s, 23,1) BETWEEN '0' AND '9' OR substr(s, 23,1) BETWEEN 'a' AND 'f' OR substr(s, 23,1) BETWEEN 'A' AND 'F') AND - - -- positions 25..36 - (substr(s, 25,1) BETWEEN '0' AND '9' OR substr(s, 25,1) BETWEEN 'a' AND 'f' OR substr(s, 25,1) BETWEEN 'A' AND 'F') AND - (substr(s, 26,1) BETWEEN '0' AND '9' OR substr(s, 26,1) BETWEEN 'a' AND 'f' OR substr(s, 26,1) BETWEEN 'A' AND 'F') AND - (substr(s, 27,1) BETWEEN '0' AND '9' OR substr(s, 27,1) BETWEEN 'a' AND 'f' OR substr(s, 27,1) BETWEEN 'A' AND 'F') AND - (substr(s, 28,1) BETWEEN '0' AND '9' OR substr(s, 28,1) BETWEEN 'a' AND 'f' OR substr(s, 28,1) BETWEEN 'A' AND 'F') AND - (substr(s, 29,1) BETWEEN '0' AND '9' OR substr(s, 29,1) BETWEEN 'a' AND 'f' OR substr(s, 29,1) BETWEEN 'A' AND 'F') AND - (substr(s, 30,1) BETWEEN '0' AND '9' OR substr(s, 30,1) BETWEEN 'a' AND 'f' OR substr(s, 30,1) BETWEEN 'A' AND 'F') AND - (substr(s, 31,1) BETWEEN '0' AND '9' OR substr(s, 31,1) BETWEEN 'a' AND 'f' OR substr(s, 31,1) BETWEEN 'A' AND 'F') AND - (substr(s, 32,1) BETWEEN '0' AND '9' OR substr(s, 32,1) BETWEEN 'a' AND 'f' OR substr(s, 32,1) BETWEEN 'A' AND 'F') AND - (substr(s, 33,1) BETWEEN '0' AND '9' OR substr(s, 33,1) BETWEEN 'a' AND 'f' OR substr(s, 33,1) BETWEEN 'A' AND 'F') AND - (substr(s, 34,1) BETWEEN '0' AND '9' OR substr(s, 34,1) BETWEEN 'a' AND 'f' OR substr(s, 34,1) BETWEEN 'A' AND 'F') AND - (substr(s, 35,1) BETWEEN '0' AND '9' OR substr(s, 35,1) BETWEEN 'a' AND 'f' OR substr(s, 35,1) BETWEEN 'A' AND 'F') AND - (substr(s, 36,1) BETWEEN '0' AND '9' OR substr(s, 36,1) BETWEEN 'a' AND 'f' OR substr(s, 36,1) BETWEEN 'A' AND 'F') - ) - FROM v - ) - ELSE FALSE - END; +SELECT j::text ~* '^"[a-f0-9]{8}-[a-f0-9]{4}-[1-5][a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}"$'; $$ LANGUAGE sql IMMUTABLE -PARALLEL SAFE; +PARALLEL SAFE +STRICT; CREATE OR REPLACE FUNCTION ldlite_system.jis_datetime(j JSONB) RETURNS BOOLEAN AS $$ -SELECT - CASE - WHEN jsonb_typeof(j) = 'string' THEN - ( - WITH v AS (SELECT $1 #>> '{}' AS s) - SELECT - -- Length must be exactly 29 characters - length(s) = 29 - - -- Fixed punctuation positions - AND substr(s, 5, 1) = '-' - AND substr(s, 8, 1) = '-' - AND substr(s, 11, 1) = 'T' - AND substr(s, 14, 1) = ':' - AND substr(s, 17, 1) = ':' - AND substr(s, 20, 1) = '.' - AND substr(s, 24, 1) = '+' - AND substr(s, 27, 1) = ':' - - -- YYYY - AND substr(s, 1, 1) BETWEEN '0' AND '9' - AND substr(s, 2, 1) BETWEEN '0' AND '9' - AND substr(s, 3, 1) BETWEEN '0' AND '9' - AND substr(s, 4, 1) BETWEEN '0' AND '9' - - -- MM - AND substr(s, 6, 1) BETWEEN '0' AND '9' - AND substr(s, 7, 1) BETWEEN '0' AND '9' - - -- DD - AND substr(s, 9, 1) BETWEEN '0' AND '9' - AND substr(s, 10, 1) BETWEEN '0' AND '9' - - -- HH - AND substr(s, 12, 1) BETWEEN '0' AND '9' - AND substr(s, 13, 1) BETWEEN '0' AND '9' - - -- mm - AND substr(s, 15, 1) BETWEEN '0' AND '9' - AND substr(s, 16, 1) BETWEEN '0' AND '9' - - -- SS - AND substr(s, 18, 1) BETWEEN '0' AND '9' - AND substr(s, 19, 1) BETWEEN '0' AND '9' - - -- mmm - AND substr(s, 21, 1) BETWEEN '0' AND '9' - AND substr(s, 22, 1) BETWEEN '0' AND '9' - AND substr(s, 23, 1) BETWEEN '0' AND '9' - - -- Timezone HH - AND substr(s, 25, 1) BETWEEN '0' AND '9' - AND substr(s, 26, 1) BETWEEN '0' AND '9' - - -- Timezone MM - AND substr(s, 28, 1) BETWEEN '0' AND '9' - AND substr(s, 29, 1) BETWEEN '0' AND '9' - FROM v - ) - ELSE FALSE - END; +SELECT j::text ~ '^"\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?"$' $$ LANGUAGE sql IMMUTABLE -PARALLEL SAFE; +PARALLEL SAFE +STRICT; CREATE OR REPLACE FUNCTION ldlite_system.jis_float(j JSONB) RETURNS BOOLEAN AS $$ -SELECT - CASE - WHEN jsonb_typeof(j) = 'number' THEN scale((j)::numeric) > 0 - ELSE FALSE - END; +SELECT SCALE((j)::numeric) > 0 $$ LANGUAGE sql IMMUTABLE -PARALLEL SAFE; +PARALLEL SAFE +STRICT; CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j JSONB) RETURNS BOOLEAN AS $$ SELECT COALESCE(j = 'null'::jsonb, TRUE); diff --git a/tests/test_json_operators.py b/tests/test_json_operators.py index d89cf44..2ddfe13 100644 --- a/tests/test_json_operators.py +++ b/tests/test_json_operators.py @@ -177,7 +177,6 @@ def case_jexplode(p: tuple[Any, ...]) -> JsonTC: ("str", False), ("str_empty", False), ("num", False), - ("na", False), ("na_str1", False), ("na_str2", False), ("uuid_nof", False), @@ -218,7 +217,6 @@ def case_jis_null(p: tuple[Any, ...]) -> JsonTC: ("str", False), ("str_empty", False), ("num", False), - ("na", False), ("na_str1", False), ("na_str2", False), ("uuid_nof", False), @@ -239,15 +237,6 @@ def case_jis_datetime(p: tuple[Any, ...]) -> JsonTC: @parametrize( p=[ - ("str", False), - ("str_empty", False), - ("num", False), - ("na", False), - ("na_str1", False), - ("na_str2", False), - ("uuid_nof", False), - ("uuid", False), - ("dt", False), ("num", False), ("float", True), ], From f6a59783bedc6199ff9ed39ea130d3f06b8fc582 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 16:31:01 +0000 Subject: [PATCH 05/18] Allow postgres to short circuit data type checks --- src/ldlite/database/_expansion/nodes.py | 30 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 6659784..7c82077 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -197,20 +197,32 @@ def unnest( WHERE NOT ldlite_system.jis_null(ld_value) ) SELECT - STRING_AGG(DISTINCT json_type, '|') AS json_type - ,bool_and(is_array) AS is_array - ,bool_and(json_type = 'string' AND ldlite_system.jis_uuid(ld_value)) AS is_uuid - ,bool_and(json_type = 'string' AND ldlite_system.jis_datetime(ld_value)) AS is_datetime - ,bool_and(json_type = 'number' AND ldlite_system.jis_float(ld_value)) AS is_float -FROM all_values -HAVING COUNT(*) > 0 -""", # noqa: E501 + (SELECT STRING_AGG(DISTINCT json_type, '|') FROM all_values) AS json_type + ,(SELECT BOOL_AND(is_array) FROM all_values) AS is_array + ,NOT EXISTS + ( + SELECT 1 FROM all_values + WHERE json_type = 'string' AND NOT ldlite_system.jis_uuid(ld_value) + ) AS is_uuid + ,NOT EXISTS + ( + SELECT 1 FROM all_values + WHERE json_type = 'string' AND NOT ldlite_system.jis_datetime(ld_value) + ) AS is_datetime + ,NOT EXISTS + ( + SELECT 1 FROM all_values + WHERE json_type = 'number' AND NOT ldlite_system.jis_float(ld_value) + ) AS is_float +""", ) .format(table=source_table, json_col=self.identifier) .as_string(), (prop,), ) - if (row := cur.fetchone()) is not None: + if (row := cur.fetchone()) is not None and all( + c is not None for c in row + ): meta = Metadata(prop, *row) create_columns.append( meta.select_column(self.identifier, self.add(meta)), From 4706f98adabdfea75341f741ac8960ee93a228bd Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 16:50:41 +0000 Subject: [PATCH 06/18] Sample large tables for column metadata --- src/ldlite/database/_duckdb.py | 4 ++ src/ldlite/database/_expansion/__init__.py | 56 ++-------------------- src/ldlite/database/_expansion/context.py | 52 ++++++++++++++++++++ src/ldlite/database/_expansion/nodes.py | 49 ++++++++++++------- src/ldlite/database/_postgres.py | 4 ++ src/ldlite/database/_typed_database.py | 8 +++- 6 files changed, 104 insertions(+), 69 deletions(-) create mode 100644 src/ldlite/database/_expansion/context.py diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py index 98ac5bb..f4f6035 100644 --- a/src/ldlite/database/_duckdb.py +++ b/src/ldlite/database/_duckdb.py @@ -134,6 +134,10 @@ def ingest_records( def source_table_cte_stmt(self, keep_source: bool) -> str: # noqa: ARG002 return "WITH ld_source AS (SELECT * FROM {source_table})" + @property + def tablesample(self) -> str: + return "USING SAMPLE {sample} PERCENT (bernoulli)" + # DuckDB has some strong opinions about cursors that are different than postgres # https://github.com/duckdb/duckdb/issues/11018 diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py index 4b22070..a62b004 100644 --- a/src/ldlite/database/_expansion/__init__.py +++ b/src/ldlite/database/_expansion/__init__.py @@ -1,57 +1,14 @@ from __future__ import annotations from collections import deque -from dataclasses import dataclass -from typing import TYPE_CHECKING, NoReturn +from typing import TYPE_CHECKING from psycopg import sql from .nodes import ArrayNode, ObjectNode if TYPE_CHECKING: - from collections.abc import Callable - - import duckdb - import psycopg - from tqdm import tqdm - - -@dataclass -class ExpandContext: - conn: duckdb.DuckDBPyConnection | psycopg.Connection - source_table: sql.Identifier - json_depth: int - get_transform_table: Callable[[int], sql.Identifier] - get_output_table: Callable[[str], tuple[str, sql.Identifier]] - # This is necessary for Analyzing the table in pg before querying it - # I don't love how this is implemented - preprocess: Callable[ - [ - duckdb.DuckDBPyConnection | psycopg.Connection, - sql.Identifier, - list[sql.Identifier], - ], - None, - ] - # source_cte will go away when DuckDB implements CTAS RETURNING - source_cte: Callable[[bool], str] - progress: tqdm[NoReturn] | None - - def array_context( - self, - new_source_table: sql.Identifier, - new_json_depth: int, - ) -> ExpandContext: - return ExpandContext( - self.conn, - new_source_table, - new_json_depth, - self.get_transform_table, - self.get_output_table, - self.preprocess, - self.source_cte, - self.progress, - ) + from .context import ExpandContext def expand_nonmarc( @@ -75,11 +32,7 @@ def _expand_nonmarc( initial_count = count ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier]) root.unnest( - ctx.conn, - ctx.source_table, - ctx.get_transform_table(count), - ctx.source_cte(False), - ctx.progress, + ctx, ctx.source_table, ctx.get_transform_table(count), ctx.source_cte(False) ) expand_children_of = deque([root]) @@ -92,11 +45,10 @@ def _expand_nonmarc( continue ctx.preprocess(ctx.conn, ctx.get_transform_table(count), [c.identifier]) c.unnest( - ctx.conn, + ctx, ctx.get_transform_table(count), ctx.get_transform_table(count + 1), ctx.source_cte(False), - ctx.progress, ) expand_children_of.append(c) count += 1 diff --git a/src/ldlite/database/_expansion/context.py b/src/ldlite/database/_expansion/context.py new file mode 100644 index 0000000..0e6cec0 --- /dev/null +++ b/src/ldlite/database/_expansion/context.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, NoReturn + +if TYPE_CHECKING: + from collections.abc import Callable + + import duckdb + import psycopg + from psycopg import sql + from tqdm import tqdm + + +@dataclass +class ExpandContext: + conn: duckdb.DuckDBPyConnection | psycopg.Connection + source_table: sql.Identifier + json_depth: int + get_transform_table: Callable[[int], sql.Identifier] + get_output_table: Callable[[str], tuple[str, sql.Identifier]] + # This is necessary for Analyzing the table in pg before querying it + # I don't love how this is implemented + preprocess: Callable[ + [ + duckdb.DuckDBPyConnection | psycopg.Connection, + sql.Identifier, + list[sql.Identifier], + ], + None, + ] + # source_cte will go away when DuckDB implements CTAS RETURNING + source_cte: Callable[[bool], str] + tablesample: str + progress: tqdm[NoReturn] | None + + def array_context( + self, + new_source_table: sql.Identifier, + new_json_depth: int, + ) -> ExpandContext: + return ExpandContext( + self.conn, + new_source_table, + new_json_depth, + self.get_transform_table, + self.get_output_table, + self.preprocess, + self.source_cte, + self.tablesample, + self.progress, + ) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 7c82077..a70383a 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import deque -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, TypeVar, cast from psycopg import sql @@ -13,6 +13,8 @@ import psycopg from tqdm import tqdm + from .context import ExpandContext + from .metadata import Metadata TNode = TypeVar("TNode", bound="ExpansionNode") @@ -119,18 +121,27 @@ def object_children(self) -> list[ObjectNode]: def unnest( self, - conn: duckdb.DuckDBPyConnection | psycopg.Connection, + ctx: ExpandContext, source_table: sql.Identifier, dest_table: sql.Identifier, source_cte: str, - progress: tqdm[NoReturn] | None, ) -> None: self.unnested = True create_columns: list[sql.Composable] = [ sql.Identifier(v) for v in self.carryover ] - with conn.cursor() as cur: + with ctx.conn.cursor() as cur: + cur.execute( + sql.SQL("SELECT COUNT(*) FROM {table}") + .format(table=source_table) + .as_string(), + ) + total = cast("tuple[int]", cur.fetchone())[0] + if total == 0: + return + + with ctx.conn.cursor() as cur: cur.execute( sql.SQL( """ @@ -148,18 +159,20 @@ def unnest( props = [prop[0] for prop in cur.fetchall()] prop_count = len(props) - if progress is not None: - progress.total += prop_count * 2 + if ctx.progress is not None: + ctx.progress.total += prop_count * 2 for prop in props: - with conn.cursor() as cur: + with ctx.conn.cursor() as cur: cur.execute( sql.SQL( """ WITH values AS ( SELECT ldlite_system.jextract({json_col}, $1) as ld_value - FROM {table} + FROM {table} """ # noqa: S608 + + ctx.tablesample + + """ ), value_and_types AS ( SELECT @@ -216,7 +229,11 @@ def unnest( ) AS is_float """, ) - .format(table=source_table, json_col=self.identifier) + .format( + table=source_table, + json_col=self.identifier, + sample=sql.Literal(min(100, 100000 // total * 100)), + ) .as_string(), (prop,), ) @@ -227,13 +244,13 @@ def unnest( create_columns.append( meta.select_column(self.identifier, self.add(meta)), ) - if progress is not None and meta.is_array: - progress.total += 1 + if ctx.progress is not None and meta.is_array: + ctx.progress.total += 1 - if progress is not None: - progress.update(1) + if ctx.progress is not None: + ctx.progress.update(1) - with conn.cursor() as cur: + with ctx.conn.cursor() as cur: cur.execute( sql.SQL( """ @@ -253,8 +270,8 @@ def unnest( ) .as_string(), ) - if progress is not None: - progress.update(prop_count) + if ctx.progress is not None: + ctx.progress.update(prop_count) def _carryover(self) -> Iterator[str]: for n in self.root.descendents: diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py index 0bebfa3..43e5b49 100644 --- a/src/ldlite/database/_postgres.py +++ b/src/ldlite/database/_postgres.py @@ -205,3 +205,7 @@ def source_table_cte_stmt(self, keep_source: bool) -> str: if keep_source: return "WITH ld_source AS (SELECT * FROM {source_table})" return "WITH ld_source AS (DELETE FROM {source_table} RETURNING *)" + + @property + def tablesample(self) -> str: + return "TABLESAMPLE BERNOULLI ({sample})" diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py index a9197ad..3c3d84b 100644 --- a/src/ldlite/database/_typed_database.py +++ b/src/ldlite/database/_typed_database.py @@ -11,7 +11,8 @@ from psycopg import sql from . import Database, LoadHistory -from ._expansion import ExpandContext, expand_nonmarc +from ._expansion import expand_nonmarc +from ._expansion.context import ExpandContext from ._prefix import Prefix if TYPE_CHECKING: @@ -185,6 +186,10 @@ def preprocess_source_table( @abstractmethod def source_table_cte_stmt(self, keep_source: bool) -> str: ... + @property + @abstractmethod + def tablesample(self) -> str: ... + def expand_prefix( self, prefix: str, @@ -231,6 +236,7 @@ def expand_prefix( pfx.output_table, self.preprocess_source_table, # type: ignore [arg-type] self.source_table_cte_stmt, + self.tablesample, progress, ), ) From b07ec9f4207e7e8ff0079053f548ed9d18dbeaa4 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 17:38:53 +0000 Subject: [PATCH 07/18] Split progress bars between scanning and transforming --- src/ldlite/__init__.py | 33 +++++++++++++++------- src/ldlite/database/__init__.py | 3 +- src/ldlite/database/_expansion/__init__.py | 17 +++++++++-- src/ldlite/database/_expansion/context.py | 6 ++-- src/ldlite/database/_expansion/nodes.py | 21 +++++--------- src/ldlite/database/_typed_database.py | 10 +++++-- 6 files changed, 58 insertions(+), 32 deletions(-) diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py index e3b35be..9e8e757 100644 --- a/src/ldlite/__init__.py +++ b/src/ldlite/__init__.py @@ -350,20 +350,33 @@ def query( # noqa: C901, PLR0912, PLR0913, PLR0915 transform_started = datetime.now(timezone.utc) if not use_legacy_transform: - with tqdm( - total=0, - desc="transforming", - leave=False, - mininterval=5, - disable=self._quiet, - unit="ops", - delay=5, - ) as progress: + no_iters_format = ( + "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]" + ) + with ( + tqdm( + desc="scanning", + leave=False, + mininterval=5, + disable=self._quiet, + delay=5, + bar_format=no_iters_format, + ) as scan_progress, + tqdm( + desc="transforming", + leave=False, + mininterval=5, + disable=self._quiet, + delay=5, + bar_format=no_iters_format, + ) as transform_progress, + ): newtables = self._database.expand_prefix( table, json_depth, keep_raw, - progress, + scan_progress, + transform_progress, ) if keep_raw: newtables = [table, *newtables] diff --git a/src/ldlite/database/__init__.py b/src/ldlite/database/__init__.py index 8ed5875..e1df517 100644 --- a/src/ldlite/database/__init__.py +++ b/src/ldlite/database/__init__.py @@ -59,7 +59,8 @@ def expand_prefix( prefix: str, json_depth: int, keep_raw: bool, - progress: tqdm[NoReturn] | None = None, + scan_progress: tqdm[NoReturn] | None = None, + transform_progress: tqdm[NoReturn] | None = None, ) -> list[str]: """Unnests and explodes the raw data at the given prefix.""" diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py index a62b004..585f956 100644 --- a/src/ldlite/database/_expansion/__init__.py +++ b/src/ldlite/database/_expansion/__init__.py @@ -29,15 +29,25 @@ def _expand_nonmarc( count: int, ctx: ExpandContext, ) -> tuple[int, list[str]]: + ctx.scan_progress.total = (ctx.scan_progress.total or 0) + 1 + ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1 + ctx.transform_progress.refresh() initial_count = count ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier]) root.unnest( - ctx, ctx.source_table, ctx.get_transform_table(count), ctx.source_cte(False) + ctx, + ctx.source_table, + ctx.get_transform_table(count), + ctx.source_cte(False), ) + ctx.transform_progress.update(1) expand_children_of = deque([root]) while expand_children_of: on = expand_children_of.popleft() + if ctx.transform_progress: + ctx.transform_progress.total += len(on.object_children) + ctx.transform_progress.refresh() for c in on.object_children: if len(c.parents) >= ctx.json_depth: if c.parent is not None: @@ -52,11 +62,14 @@ def _expand_nonmarc( ) expand_children_of.append(c) count += 1 + ctx.transform_progress.update(1) created_tables = [] new_source_table = ctx.get_transform_table(count) arrays = root.descendents_oftype(ArrayNode) + ctx.transform_progress.total += len(arrays) + ctx.transform_progress.refresh() ctx.preprocess(ctx.conn, new_source_table, [a.identifier for a in arrays]) for an in arrays: if len(an.parents) >= ctx.json_depth: @@ -66,9 +79,9 @@ def _expand_nonmarc( new_source_table, ctx.get_transform_table(count + 1), ctx.source_cte(True), - ctx.progress, ) count += 1 + ctx.transform_progress.update(1) if an.meta.is_object: (sub_index, array_tables) = _expand_nonmarc( diff --git a/src/ldlite/database/_expansion/context.py b/src/ldlite/database/_expansion/context.py index 0e6cec0..257ab87 100644 --- a/src/ldlite/database/_expansion/context.py +++ b/src/ldlite/database/_expansion/context.py @@ -32,7 +32,8 @@ class ExpandContext: # source_cte will go away when DuckDB implements CTAS RETURNING source_cte: Callable[[bool], str] tablesample: str - progress: tqdm[NoReturn] | None + scan_progress: tqdm[NoReturn] + transform_progress: tqdm[NoReturn] def array_context( self, @@ -48,5 +49,6 @@ def array_context( self.preprocess, self.source_cte, self.tablesample, - self.progress, + self.scan_progress, + self.transform_progress, ) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index a70383a..ab6dfec 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -1,17 +1,16 @@ from __future__ import annotations from collections import deque +from math import floor from typing import TYPE_CHECKING, TypeVar, cast from psycopg import sql if TYPE_CHECKING: from collections.abc import Iterator - from typing import NoReturn import duckdb import psycopg - from tqdm import tqdm from .context import ExpandContext @@ -159,8 +158,8 @@ def unnest( props = [prop[0] for prop in cur.fetchall()] prop_count = len(props) - if ctx.progress is not None: - ctx.progress.total += prop_count * 2 + ctx.scan_progress.total += prop_count + ctx.scan_progress.update(1) for prop in props: with ctx.conn.cursor() as cur: @@ -232,7 +231,7 @@ def unnest( .format( table=source_table, json_col=self.identifier, - sample=sql.Literal(min(100, 100000 // total * 100)), + sample=sql.Literal(min(100, floor((100000 / total) * 100))), ) .as_string(), (prop,), @@ -244,11 +243,10 @@ def unnest( create_columns.append( meta.select_column(self.identifier, self.add(meta)), ) - if ctx.progress is not None and meta.is_array: - ctx.progress.total += 1 + if meta.is_object: + ctx.scan_progress.total += 1 - if ctx.progress is not None: - ctx.progress.update(1) + ctx.scan_progress.update(1) with ctx.conn.cursor() as cur: cur.execute( @@ -270,8 +268,6 @@ def unnest( ) .as_string(), ) - if ctx.progress is not None: - ctx.progress.update(prop_count) def _carryover(self) -> Iterator[str]: for n in self.root.descendents: @@ -311,7 +307,6 @@ def explode( source_table: sql.Identifier, dest_table: sql.Identifier, source_cte: str, - progress: tqdm[NoReturn] | None, ) -> list[str]: with conn.cursor() as cur: o_col = self.name + "__o" @@ -354,8 +349,6 @@ def explode( ) .as_string(), ) - if progress is not None: - progress.update(1) return ["__id", *self.carryover, o_col] diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py index 3c3d84b..f66b464 100644 --- a/src/ldlite/database/_typed_database.py +++ b/src/ldlite/database/_typed_database.py @@ -9,6 +9,7 @@ import psycopg from psycopg import sql +from tqdm import tqdm from . import Database, LoadHistory from ._expansion import expand_nonmarc @@ -20,7 +21,6 @@ from typing import NoReturn import duckdb - from tqdm import tqdm DB = TypeVar("DB", bound="duckdb.DuckDBPyConnection | psycopg.Connection") @@ -195,7 +195,8 @@ def expand_prefix( prefix: str, json_depth: int, keep_raw: bool, - progress: tqdm[NoReturn] | None = None, + scan_progress: tqdm[NoReturn] | None = None, + transform_progress: tqdm[NoReturn] | None = None, ) -> list[str]: pfx = Prefix(prefix) with closing(self._conn_factory()) as conn: @@ -237,7 +238,10 @@ def expand_prefix( self.preprocess_source_table, # type: ignore [arg-type] self.source_table_cte_stmt, self.tablesample, - progress, + scan_progress if scan_progress is not None else tqdm(disable=True), + transform_progress + if transform_progress is not None + else tqdm(disable=True), ), ) From c6081bf5c8709f60eefce2318481b31d21259eda Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 19:49:29 +0000 Subject: [PATCH 08/18] Remove slow jextract function in favor of more native functionality --- src/ldlite/database/_duckdb.py | 43 ++--------- src/ldlite/database/_expansion/metadata.py | 30 ++++++-- src/ldlite/database/_expansion/nodes.py | 10 ++- src/ldlite/database/_postgres.py | 66 +++------------- tests/test_json_operators.py | 90 +--------------------- 5 files changed, 50 insertions(+), 189 deletions(-) diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py index f4f6035..db48527 100644 --- a/src/ldlite/database/_duckdb.py +++ b/src/ldlite/database/_duckdb.py @@ -41,62 +41,33 @@ def _setup_jfuncs(conn: duckdb.DuckDBPyConnection) -> None: END ; -CREATE OR REPLACE FUNCTION ldlite_system.jextract(j, p) AS - CASE ldlite_system.jtype_of(main.json_extract(j, p)) - WHEN 'string' THEN - CASE - WHEN lower(main.json_extract_string(j, p)) = 'null' THEN 'null'::JSON - WHEN length(main.json_extract_string(j, p)) = 0 THEN 'null'::JSON - ELSE main.json_extract(j, p) - END - WHEN 'object' THEN - CASE - WHEN main.json_extract_string(j, p) = '{}' THEN 'null'::JSON - ELSE main.json_extract(j, p) - END - WHEN 'array' THEN - CASE - WHEN length(list_filter((main.json_extract(j, p))::JSON[], lambda x : x != 'null'::JSON)) = 0 THEN 'null'::JSON - ELSE list_filter((main.json_extract(j, p))::JSON[], lambda x : x != 'null'::JSON) - END - ELSE coalesce(main.json_extract(j, p), 'null'::JSON) - END -; - -CREATE OR REPLACE FUNCTION ldlite_system.jextract_string(j, p) AS - main.json_extract_string(ldlite_system.jextract(j, p), '$') -; - CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j) AS unnest(main.json_keys(j)) ; CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j) AS - CASE ldlite_system.jtype_of(j) - WHEN 'string' THEN regexp_full_match(main.json_extract_string(j, '$'), '^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[1-5][a-fA-F0-9]{3}-[89abAB][a-fA-F0-9]{3}-[a-fA-F0-9]{12}$') - ELSE FALSE - END + regexp_full_match(main.json_extract_string(j, '$'), '(?i)^[a-f0-9]{8}-[a-f0-9]{4}-[1-5][a-f0-9]{3}-[89abAB][a-f0-9]{3}-[a-f0-9]{12}$') ; CREATE OR REPLACE FUNCTION ldlite_system.jis_datetime(j) AS - CASE ldlite_system.jtype_of(j) - WHEN 'string' THEN regexp_full_match(main.json_extract_string(j, '$'), '^\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?$') - ELSE FALSE - END + regexp_full_match(main.json_extract_string(j, '$'), '^\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?$') ; CREATE OR REPLACE FUNCTION ldlite_system.jis_float(j) AS - coalesce(main.json_type(j), 'NULL') == 'DOUBLE' + main.json_type(j) == 'DOUBLE' ; CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j) AS - j is NULL or j == 'null'::JSON + j IS NULL OR j == 'null'::JSON OR main.json_extract_string(j, '$') IN ('NULL', 'null', '', '{}', '[]') ; CREATE OR REPLACE FUNCTION ldlite_system.jexplode(j) AS TABLE ( SELECT value as ld_value FROM main.json_each(j) ); +CREATE OR REPLACE FUNCTION ldlite_system.jself_string(j) AS + main.json_extract_string(j, '$') +; """, # noqa: E501 ) diff --git a/src/ldlite/database/_expansion/metadata.py b/src/ldlite/database/_expansion/metadata.py index 15f75f7..e18b854 100644 --- a/src/ldlite/database/_expansion/metadata.py +++ b/src/ldlite/database/_expansion/metadata.py @@ -40,26 +40,44 @@ def select_column( json_col: sql.Identifier, alias: str, ) -> sql.Composed: + # '$' is a special character that means the root of the json + # I couldn't figure out how to make the array expansion work + # without it if self.is_array or self.is_object: stmt = sql.SQL( - "(ldlite_system.jextract({json_col}, {prop})) AS {alias}", + "{json_col}->{prop} AS {alias}" + if self.prop != "$" + else "{json_col} AS {alias}", ) elif self.json_type == "number": stmt = sql.SQL( - "(ldlite_system.jextract_string({json_col}, {prop}))" - "::numeric AS {alias}", + "({json_col}->>{prop})::numeric AS {alias}" + if self.prop != "$" + else "ldlite_system.jself_string({json_col})::numeric AS {alias}", ) elif self.json_type == "boolean": stmt = sql.SQL( - "(ldlite_system.jextract_string({json_col}, {prop}))::bool AS {alias}", + "NULLIF(NULLIF({json_col}->>{prop}, ''), 'null')::bool AS {alias}" + if self.prop != "$" + else "NULLIF(NULLIF(" + "ldlite_system.jself_string({json_col})" + ", ''), 'null')::bool AS {alias}", ) elif self.json_type == "string" and self.is_uuid: stmt = sql.SQL( - "(ldlite_system.jextract_string({json_col}, {prop}))::uuid AS {alias}", + "NULLIF(NULLIF({json_col}->>{prop}, ''), 'null')::uuid AS {alias}" + if self.prop != "$" + else "NULLIF(NULLIF(" + "ldlite_system.jself_string({json_col})" + ", ''), 'null')::uuid AS {alias}", ) else: stmt = sql.SQL( - "(ldlite_system.jextract_string({json_col}, {prop})) AS {alias}", + "NULLIF(NULLIF({json_col}->>{prop}, ''), 'null') AS {alias}" + if self.prop != "$" + else "NULLIF(NULLIF(" + "ldlite_system.jself_string({json_col})" + ", ''), 'null') AS {alias}", ) return stmt.format( diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index ab6dfec..4b9c466 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -168,7 +168,7 @@ def unnest( """ WITH values AS ( - SELECT ldlite_system.jextract({json_col}, $1) as ld_value + SELECT {json_col}->$1 as ld_value FROM {table} """ # noqa: S608 + ctx.tablesample + """ @@ -258,12 +258,14 @@ def unnest( + """ SELECT {cols} -FROM ld_source; +FROM ld_source +WHERE NOT ldlite_system.jis_null({json_col}) """, ) .format( source_table=source_table, dest_table=dest_table, + json_col=self.identifier, cols=sql.SQL("\n ,").join(create_columns), ) .as_string(), @@ -275,7 +277,7 @@ def _carryover(self) -> Iterator[str]: yield n.name else: yield n.name - yield from n.values + yield from [v for v in n.values if v != "jsonb"] @property def carryover(self) -> list[str]: @@ -354,7 +356,7 @@ def explode( def _carryover(self) -> Iterator[str]: for n in reversed(self.parents): - yield from [v for v in n.values if v != "__id"] + yield from [v for v in n.values if v not in ("__id", "jsonb")] @property def carryover(self) -> list[str]: diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py index 43e5b49..ae23631 100644 --- a/src/ldlite/database/_postgres.py +++ b/src/ldlite/database/_postgres.py @@ -41,62 +41,6 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: IMMUTABLE PARALLEL SAFE; -CREATE OR REPLACE FUNCTION ldlite_system.jextract(j JSONB, p TEXT) RETURNS JSONB AS $$ -SELECT - CASE jsonb_typeof(val) - WHEN 'string' THEN - CASE - WHEN lower(val #>> '{}') IN ('null', '') THEN 'null'::jsonb - ELSE val - END - WHEN 'array' THEN - CASE - WHEN jsonb_array_length(val) = 0 THEN 'null'::jsonb - WHEN NOT EXISTS ( - SELECT 1 - FROM jsonb_array_elements(val) AS e(elem) - WHERE elem = 'null'::jsonb - LIMIT 1 - ) THEN val - ELSE COALESCE( - ( - SELECT jsonb_agg(e) - FROM jsonb_array_elements(val) AS a(e) - WHERE e <> 'null'::jsonb - ), - 'null'::jsonb - ) - END - WHEN 'object' THEN - CASE - WHEN val = '{}'::jsonb THEN 'null'::jsonb - ELSE val - END - ELSE val - END -FROM ( - -- This is somewhat of a hack. - -- There isn't a really good way to get the element unchanged - -- which works for duckdb and postgres AND CRUCIALLY - -- has a similar syntax to everything else so that we don't - -- have to have special cases for exploding the array and it - -- can share all the same type checking / statement generation code. - -- We're pretending that postgres supports -> '$' style syntax like duckdb. - SELECT CASE WHEN p = '$' THEN j ELSE j->p END AS val -) s; -$$ -LANGUAGE sql -IMMUTABLE -PARALLEL SAFE -STRICT; - -CREATE OR REPLACE FUNCTION ldlite_system.jextract_string(j JSONB, p TEXT) RETURNS TEXT AS $$ -SELECT ldlite_system.jextract(j, p) #>> '{}' -$$ -LANGUAGE sql -IMMUTABLE -PARALLEL SAFE; - CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j JSONB) RETURNS SETOF TEXT AS $$ SELECT jsonb_object_keys(j); $$ @@ -129,7 +73,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: STRICT; CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j JSONB) RETURNS BOOLEAN AS $$ -SELECT COALESCE(j = 'null'::jsonb, TRUE); +SELECT j IS NULL OR j = 'null'::jsonb OR j #>> '{}' IN ('NULL', 'null', '', '{}', '[]') $$ LANGUAGE sql IMMUTABLE @@ -141,6 +85,14 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: LANGUAGE sql IMMUTABLE PARALLEL SAFE; + + +CREATE OR REPLACE FUNCTION ldlite_system.jself_string(j JSONB) RETURNS TEXT AS $$ +SELECT j #>> '{}' +$$ +LANGUAGE sql +IMMUTABLE +PARALLEL SAFE; """, # noqa: E501 ) diff --git a/tests/test_json_operators.py b/tests/test_json_operators.py index 2ddfe13..987a8e0 100644 --- a/tests/test_json_operators.py +++ b/tests/test_json_operators.py @@ -27,88 +27,6 @@ class JsonTC: assertion_params: tuple[Any, ...] -@parametrize( - p=[ - ("str", '"str_val"'), - ("str_empty", "null"), - ("num", "12"), - ("float", "16.3"), - ("bool", "true"), - ("obj", '{"k1":"v1","k2":"v2"}'), - ("obj_some", '{"k1":"v1","k2":null}'), - ("obj_empty", "null"), - ("arr_zero", "null"), - ("na", "null"), - ("na_str1", "null"), - ("na_str2", "null"), - ], -) -def case_jextract(p: tuple[Any, ...]) -> JsonTC: - return JsonTC( - """SELECT ldlite_system.jextract(jc, $1){assertion} FROM j;""", - p[:1], - """= $2::{jtype}""", - p[1:], - ) - - -# Duckdb through 1.3 and 1.4 have different json comparison behavior here -# Whitespace matters in 1.4 and not 1.3 -# This makes the arrays text and compares the values as a workaround -@parametrize( - p=[ - ("arr_str", '["s1", "s2", "s3"]'), - ("arr_str_some", '["s1", "s2"]'), - ("arr_obj_some", '[{"k1":"v1"}]'), - ], -) -def case_jextract_duckdb(p: tuple[Any, ...]) -> JsonTC: - return JsonTC( - """SELECT ldlite_system.jextract(jc, $1){assertion} FROM j;""", - p[:1], - """::text[] = $2::JSON::text[]""", - p[1:], - ) - - -# The differences betweeen postgres/duckdb here only matters for tests -# This can all be rectified when duckdb 1.4 is the minimum version -@parametrize( - p=[ - ("arr_str", '["s1", "s2", "s3"]'), - ("arr_str_some", '["s1", "s2"]'), - ("arr_obj_some", '[{"k1":"v1"}]'), - ], -) -def case_jextract_postgres(p: tuple[Any, ...]) -> JsonTC: - return JsonTC( - """SELECT ldlite_system.jextract(jc, $1){assertion} FROM j;""", - p[:1], - """ = $2::JSONB""", - p[1:], - ) - - -@parametrize( - p=[ - ("str", "str_val"), - ("num", "12"), - ("float", "16.3"), - ("bool", "true"), - ("na",), - ("na_str1",), - ("na_str2",), - ], -) -def case_jextract_string(p: tuple[Any, ...]) -> JsonTC: - return JsonTC( - """SELECT ldlite_system.jextract_string(jc, $1){assertion} FROM j;""", - p[:1], - """ = $2""" if len(p) == 2 else """ IS NULL""", - p[1:], - ) - - def case_jobject_keys() -> JsonTC: return JsonTC( """ @@ -197,10 +115,10 @@ def case_jis_uuid(p: tuple[Any, ...]) -> JsonTC: @parametrize( p=[ ("na", True), - ("obj_empty", False), - ("arr_zero", False), - ("na_str1", False), - ("na_str2", False), + ("obj_empty", True), + ("arr_zero", True), + ("na_str1", True), + ("na_str2", True), ], ) def case_jis_null(p: tuple[Any, ...]) -> JsonTC: From 8d9a61cf11aace6ba7a9f07925949593a950d78a Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 20:32:47 +0000 Subject: [PATCH 09/18] Find all possible keys in json --- src/ldlite/database/_expansion/nodes.py | 33 +++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 4b9c466..916fae2 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -144,15 +144,34 @@ def unnest( cur.execute( sql.SQL( """ -SELECT ldlite_system.jobject_keys(j) FROM ( - SELECT {json_col} AS j - FROM {table} - WHERE NOT ldlite_system.jis_null({json_col}) - LIMIT 1 -) +WITH + all_objects AS ( + SELECT {json_col} AS j + FROM {source_table} + WHERE NOT ldlite_system.jis_null({json_col}) + ), + first_object AS ( + SELECT * FROM all_objects + LIMIT 1 + ), + first_keys AS ( + SELECT ldlite_system.jobject_keys(j) AS k + FROM first_object + ), + ordered_first_keys AS ( + SELECT (ROW_NUMBER() OVER (ORDER BY (SELECT NULL))) AS idx, k + FROM first_keys + ), all_keys AS ( + SELECT DISTINCT ldlite_system.jobject_keys(j) k + FROM all_objects + ) +SELECT ak.k +FROM all_keys ak +LEFT JOIN ordered_first_keys ofk ON ak.k = ofk.k +ORDER BY ofk.idx NULLS LAST """, ) - .format(table=source_table, json_col=self.identifier) + .format(source_table=source_table, json_col=self.identifier) .as_string(), ) props = [prop[0] for prop in cur.fetchall()] From b63f7e26221a705bd1cfbbab73e8b079faf8bc2c Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 20:37:36 +0000 Subject: [PATCH 10/18] Remove sampling --- src/ldlite/database/_duckdb.py | 4 ---- src/ldlite/database/_expansion/context.py | 2 -- src/ldlite/database/_expansion/nodes.py | 4 +--- src/ldlite/database/_postgres.py | 4 ---- src/ldlite/database/_typed_database.py | 5 ----- 5 files changed, 1 insertion(+), 18 deletions(-) diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py index db48527..4754e52 100644 --- a/src/ldlite/database/_duckdb.py +++ b/src/ldlite/database/_duckdb.py @@ -105,10 +105,6 @@ def ingest_records( def source_table_cte_stmt(self, keep_source: bool) -> str: # noqa: ARG002 return "WITH ld_source AS (SELECT * FROM {source_table})" - @property - def tablesample(self) -> str: - return "USING SAMPLE {sample} PERCENT (bernoulli)" - # DuckDB has some strong opinions about cursors that are different than postgres # https://github.com/duckdb/duckdb/issues/11018 diff --git a/src/ldlite/database/_expansion/context.py b/src/ldlite/database/_expansion/context.py index 257ab87..7b9e29a 100644 --- a/src/ldlite/database/_expansion/context.py +++ b/src/ldlite/database/_expansion/context.py @@ -31,7 +31,6 @@ class ExpandContext: ] # source_cte will go away when DuckDB implements CTAS RETURNING source_cte: Callable[[bool], str] - tablesample: str scan_progress: tqdm[NoReturn] transform_progress: tqdm[NoReturn] @@ -48,7 +47,6 @@ def array_context( self.get_output_table, self.preprocess, self.source_cte, - self.tablesample, self.scan_progress, self.transform_progress, ) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 916fae2..dc83a2e 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -188,9 +188,7 @@ def unnest( WITH values AS ( SELECT {json_col}->$1 as ld_value - FROM {table} """ # noqa: S608 - + ctx.tablesample - + """ + FROM {table} ), value_and_types AS ( SELECT diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py index ae23631..4d2f19a 100644 --- a/src/ldlite/database/_postgres.py +++ b/src/ldlite/database/_postgres.py @@ -157,7 +157,3 @@ def source_table_cte_stmt(self, keep_source: bool) -> str: if keep_source: return "WITH ld_source AS (SELECT * FROM {source_table})" return "WITH ld_source AS (DELETE FROM {source_table} RETURNING *)" - - @property - def tablesample(self) -> str: - return "TABLESAMPLE BERNOULLI ({sample})" diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py index f66b464..a3f4eb4 100644 --- a/src/ldlite/database/_typed_database.py +++ b/src/ldlite/database/_typed_database.py @@ -186,10 +186,6 @@ def preprocess_source_table( @abstractmethod def source_table_cte_stmt(self, keep_source: bool) -> str: ... - @property - @abstractmethod - def tablesample(self) -> str: ... - def expand_prefix( self, prefix: str, @@ -237,7 +233,6 @@ def expand_prefix( pfx.output_table, self.preprocess_source_table, # type: ignore [arg-type] self.source_table_cte_stmt, - self.tablesample, scan_progress if scan_progress is not None else tqdm(disable=True), transform_progress if transform_progress is not None From c23c6af974aec775dde7757556258413e0fbdac3 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 20:48:48 +0000 Subject: [PATCH 11/18] Actually drop the tables in the prefix --- src/ldlite/database/_typed_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py index a3f4eb4..2ddd2b3 100644 --- a/src/ldlite/database/_typed_database.py +++ b/src/ldlite/database/_typed_database.py @@ -136,7 +136,7 @@ def _drop_extracted_tables( for (et,) in tables: cur.execute( sql.SQL("DROP TABLE IF EXISTS {table};") - .format(table=sql.Identifier(cast("str", et))) + .format(table=sql.Identifier(*cast("str", et).split("."))) .as_string(), ) cur.execute( From 7d04ee30c0f9e238034483d6bb08c4fee73652f0 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Fri, 6 Mar 2026 20:48:54 +0000 Subject: [PATCH 12/18] Dont carry over jsonb column --- src/ldlite/database/_expansion/__init__.py | 1 - src/ldlite/database/_expansion/nodes.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py index 585f956..5ab4e47 100644 --- a/src/ldlite/database/_expansion/__init__.py +++ b/src/ldlite/database/_expansion/__init__.py @@ -31,7 +31,6 @@ def _expand_nonmarc( ) -> tuple[int, list[str]]: ctx.scan_progress.total = (ctx.scan_progress.total or 0) + 1 ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1 - ctx.transform_progress.refresh() initial_count = count ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier]) root.unnest( diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index dc83a2e..695649d 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -290,11 +290,9 @@ def unnest( def _carryover(self) -> Iterator[str]: for n in self.root.descendents: - if isinstance(n, ObjectNode) and not n.unnested: + if (isinstance(n, ObjectNode) and not n.unnested) or n.name != "jsonb": yield n.name - else: - yield n.name - yield from [v for v in n.values if v != "jsonb"] + yield from n.values @property def carryover(self) -> list[str]: From f619eb62e58711306c1fd2b4f7f126c8e76df37c Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Mon, 9 Mar 2026 16:34:35 +0000 Subject: [PATCH 13/18] Simplify the query to get property names --- src/ldlite/database/_duckdb.py | 4 +-- src/ldlite/database/_expansion/nodes.py | 38 +++++++++++-------------- src/ldlite/database/_postgres.py | 2 +- tests/test_json_operators.py | 5 +++- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py index 4754e52..1ca8508 100644 --- a/src/ldlite/database/_duckdb.py +++ b/src/ldlite/database/_duckdb.py @@ -41,8 +41,8 @@ def _setup_jfuncs(conn: duckdb.DuckDBPyConnection) -> None: END ; -CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j) AS - unnest(main.json_keys(j)) +CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j) AS TABLE + SELECT je.key as ld_key FROM json_each(j) je ORDER BY je.id ; CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j) AS diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 695649d..1a4c631 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -145,30 +145,24 @@ def unnest( sql.SQL( """ WITH - all_objects AS ( - SELECT {json_col} AS j - FROM {source_table} - WHERE NOT ldlite_system.jis_null({json_col}) - ), - first_object AS ( - SELECT * FROM all_objects - LIMIT 1 - ), - first_keys AS ( - SELECT ldlite_system.jobject_keys(j) AS k - FROM first_object + keys AS ( + SELECT + keys.ld_key AS k + ,ROW_NUMBER() OVER (PARTITION BY t.__id) AS idx + FROM {source_table} t, ldlite_system.jobject_keys(t.{json_col}) keys + WHERE {json_col} IS NOT NULL AND ldlite_system.jtype_of(t.{json_col}) = 'object' ), - ordered_first_keys AS ( - SELECT (ROW_NUMBER() OVER (ORDER BY (SELECT NULL))) AS idx, k - FROM first_keys - ), all_keys AS ( - SELECT DISTINCT ldlite_system.jobject_keys(j) k - FROM all_objects + ordered_keys AS ( + SELECT + k + ,MAX(idx) idx + ,COUNT(idx) freq + FROM keys + GROUP BY k ) -SELECT ak.k -FROM all_keys ak -LEFT JOIN ordered_first_keys ofk ON ak.k = ofk.k -ORDER BY ofk.idx NULLS LAST +SELECT k +FROM ordered_keys +ORDER BY idx, freq DESC """, ) .format(source_table=source_table, json_col=self.identifier) diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py index 4d2f19a..2b51f9c 100644 --- a/src/ldlite/database/_postgres.py +++ b/src/ldlite/database/_postgres.py @@ -41,7 +41,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None: IMMUTABLE PARALLEL SAFE; -CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j JSONB) RETURNS SETOF TEXT AS $$ +CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j JSONB) RETURNS TABLE (ld_key TEXT) AS $$ SELECT jsonb_object_keys(j); $$ LANGUAGE sql diff --git a/tests/test_json_operators.py b/tests/test_json_operators.py index 987a8e0..8b1f8c8 100644 --- a/tests/test_json_operators.py +++ b/tests/test_json_operators.py @@ -33,7 +33,10 @@ def case_jobject_keys() -> JsonTC: {assertion} (SELECT e.jkey, a.jkey FROM (SELECT 'k1' jkey UNION SELECT 'k2' jkey) as e -FULL OUTER JOIN (SELECT ldlite_system.jobject_keys(jc->'obj') jkey FROM j) as a +FULL OUTER JOIN ( + SELECT k.ld_key as jkey + FROM j, ldlite_system.jobject_keys(j.jc->'obj') k +) as a USING (jkey) WHERE e.jkey IS NULL or a.jkey IS NULL) as q;""", (), From 00775d17b286297cd72f2bc25527256000496474 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Mon, 9 Mar 2026 20:06:31 +0000 Subject: [PATCH 14/18] Ignore empty source tables --- src/ldlite/database/_expansion/__init__.py | 4 +++- src/ldlite/database/_expansion/nodes.py | 6 ++++-- src/ldlite/database/_typed_database.py | 15 ++++++++------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py index 5ab4e47..6388a3d 100644 --- a/src/ldlite/database/_expansion/__init__.py +++ b/src/ldlite/database/_expansion/__init__.py @@ -33,13 +33,15 @@ def _expand_nonmarc( ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1 initial_count = count ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier]) - root.unnest( + has_rows = root.unnest( ctx, ctx.source_table, ctx.get_transform_table(count), ctx.source_cte(False), ) ctx.transform_progress.update(1) + if not has_rows: + return (0, []) expand_children_of = deque([root]) while expand_children_of: diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index 1a4c631..dc3dfb6 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -124,7 +124,7 @@ def unnest( source_table: sql.Identifier, dest_table: sql.Identifier, source_cte: str, - ) -> None: + ) -> bool: self.unnested = True create_columns: list[sql.Composable] = [ sql.Identifier(v) for v in self.carryover @@ -138,7 +138,7 @@ def unnest( ) total = cast("tuple[int]", cur.fetchone())[0] if total == 0: - return + return False with ctx.conn.cursor() as cur: cur.execute( @@ -282,6 +282,8 @@ def unnest( .as_string(), ) + return True + def _carryover(self) -> Iterator[str]: for n in self.root.descendents: if (isinstance(n, ObjectNode) and not n.unnested) or n.name != "jsonb": diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py index 2ddd2b3..efaa57c 100644 --- a/src/ldlite/database/_typed_database.py +++ b/src/ldlite/database/_typed_database.py @@ -252,14 +252,15 @@ def expand_prefix( .format(catalog_table=pfx.catalog_table.id) .as_string(), ) - cur.executemany( - sql.SQL("INSERT INTO {catalog_table} VALUES ($1)") - .format( - catalog_table=pfx.catalog_table.id, + if len(created_tables) > 0: + cur.executemany( + sql.SQL("INSERT INTO {catalog_table} VALUES ($1)") + .format( + catalog_table=pfx.catalog_table.id, + ) + .as_string(), + [(pfx.catalog_table_row(t),) for t in created_tables], ) - .as_string(), - [(pfx.catalog_table_row(t),) for t in created_tables], - ) conn.commit() From 523bd2c01db1cc2fd46fecfcfa658cb441d5e6fd Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Mon, 9 Mar 2026 20:33:23 +0000 Subject: [PATCH 15/18] Better display of iterators during transform --- src/ldlite/__init__.py | 12 +++++------- src/ldlite/database/_expansion/__init__.py | 2 ++ src/ldlite/database/_expansion/nodes.py | 2 ++ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py index 9e8e757..1675fc7 100644 --- a/src/ldlite/__init__.py +++ b/src/ldlite/__init__.py @@ -356,18 +356,12 @@ def query( # noqa: C901, PLR0912, PLR0913, PLR0915 with ( tqdm( desc="scanning", - leave=False, - mininterval=5, disable=self._quiet, - delay=5, bar_format=no_iters_format, ) as scan_progress, tqdm( desc="transforming", - leave=False, - mininterval=5, disable=self._quiet, - delay=5, bar_format=no_iters_format, ) as transform_progress, ): @@ -382,7 +376,11 @@ def query( # noqa: C901, PLR0912, PLR0913, PLR0915 newtables = [table, *newtables] transform_elapsed = datetime.now(timezone.utc) - transform_started - with tqdm(desc="indexing", disable=self._quiet) as progress: + with tqdm( + desc="indexing", + disable=self._quiet, + bar_format=no_iters_format, + ) as progress: index_started = datetime.now(timezone.utc) self._database.index_prefix(table, progress) diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py index 6388a3d..ae78c48 100644 --- a/src/ldlite/database/_expansion/__init__.py +++ b/src/ldlite/database/_expansion/__init__.py @@ -30,7 +30,9 @@ def _expand_nonmarc( ctx: ExpandContext, ) -> tuple[int, list[str]]: ctx.scan_progress.total = (ctx.scan_progress.total or 0) + 1 + ctx.scan_progress.refresh() ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1 + ctx.transform_progress.refresh() initial_count = count ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier]) has_rows = root.unnest( diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index dc3dfb6..afb2ca9 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -172,6 +172,7 @@ def unnest( prop_count = len(props) ctx.scan_progress.total += prop_count + ctx.scan_progress.refresh() ctx.scan_progress.update(1) for prop in props: @@ -256,6 +257,7 @@ def unnest( ) if meta.is_object: ctx.scan_progress.total += 1 + ctx.scan_progress.refresh() ctx.scan_progress.update(1) From 76d92b206c5c515e17fb1331ae78fdc624e09d78 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Tue, 10 Mar 2026 13:05:29 +0000 Subject: [PATCH 16/18] Actually fix the indexing --- environment.yaml | 2 +- src/ldlite/database/_typed_database.py | 8 ++-- tests/test_query.py | 66 +++++++++++++------------- 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/environment.yaml b/environment.yaml index 22c24ca..ea467a4 100644 --- a/environment.yaml +++ b/environment.yaml @@ -7,4 +7,4 @@ dependencies: - python>=3.10,<3.11 - pdm==2.26.6 - precious==0.10.2 - - libpq>=13.0 + - libpq>=14.0 diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py index efaa57c..1303931 100644 --- a/src/ldlite/database/_typed_database.py +++ b/src/ldlite/database/_typed_database.py @@ -286,10 +286,10 @@ def index_prefix(self, prefix: str, progress: tqdm[NoReturn] | None = None) -> N cur.execute( sql.SQL( r""" -SELECT TABLE_NAME, COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS +SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = $1 AND - TABLE_NAME IN (SELECT TABLE_NAME FROM {catalog}) AND + TABLE_NAME IN (SELECT SPLIT_PART(TABLE_NAME, '.', -1) FROM {catalog}) AND ( DATA_TYPE IN ('UUID', 'uuid') OR COLUMN_NAME = 'id' OR @@ -313,8 +313,8 @@ def index_prefix(self, prefix: str, progress: tqdm[NoReturn] | None = None) -> N sql.SQL("CREATE INDEX {name} ON {table} ({column});") .format( name=sql.Identifier(str(uuid4()).split("-")[0]), - table=sql.Identifier(*index[0].split(".")), - column=sql.Identifier(index[1]), + table=sql.Identifier(index[0], index[1]), + column=sql.Identifier(index[2]), ) .as_string(), ) diff --git a/tests/test_query.py b/tests/test_query.py index 41ad44e..8993e7d 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -31,7 +31,7 @@ class QueryTC(MockedResponseTestCase): def case_one_table(json_depth: int) -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=json_depth, returns={ "purchaseOrders": [ @@ -48,7 +48,7 @@ def case_one_table(json_depth: int) -> QueryTC: ["id", "value"], [("b096504a-3d54-4664-9bf5-1b872466fd66", "value")], ), - "prefix__tcatalog": (["table_name"], [("prefix__t",)]), + "prefix__tcatalog": (["table_name"], [("tests.prefix__t",)]), }, expected_indexes=[ ("prefix__t", "id"), @@ -60,7 +60,7 @@ def case_one_table(json_depth: int) -> QueryTC: def case_two_tables(json_depth: int) -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=json_depth, returns={ "purchaseOrders": [ @@ -109,7 +109,7 @@ def case_two_tables(json_depth: int) -> QueryTC: ), "prefix__tcatalog": ( ["table_name"], - [("prefix__t",), ("prefix__t__sub_objects",)], + [("tests.prefix__t",), ("tests.prefix__t__sub_objects",)], ), }, expected_indexes=[ @@ -124,7 +124,7 @@ def case_two_tables(json_depth: int) -> QueryTC: def case_table_no_expansion(json_depth: int) -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=json_depth, returns={ "purchaseOrders": [ @@ -149,7 +149,7 @@ def case_table_no_expansion(json_depth: int) -> QueryTC: def case_table_underexpansion() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=2, returns={ "purchaseOrders": [ @@ -194,7 +194,7 @@ def case_table_underexpansion() -> QueryTC: ), "prefix__tcatalog": ( ["table_name"], - [("prefix__t",), ("prefix__t__sub_objects",)], + [("tests.prefix__t",), ("tests.prefix__t__sub_objects",)], ), }, ) @@ -204,7 +204,7 @@ def case_table_underexpansion() -> QueryTC: def case_three_tables(json_depth: int) -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=json_depth, returns={ "purchaseOrders": [ @@ -254,9 +254,9 @@ def case_three_tables(json_depth: int) -> QueryTC: "prefix__tcatalog": ( ["table_name"], [ - ("prefix__t",), - ("prefix__t__sub_objects",), - ("prefix__t__sub_objects__sub_sub_objects",), + ("tests.prefix__t",), + ("tests.prefix__t__sub_objects",), + ("tests.prefix__t__sub_objects__sub_sub_objects",), ], ), }, @@ -277,7 +277,7 @@ def case_three_tables(json_depth: int) -> QueryTC: def case_nested_object() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=2, returns={ "purchaseOrders": [ @@ -307,7 +307,7 @@ def case_nested_object() -> QueryTC: ), "prefix__tcatalog": ( ["table_name"], - [("prefix__t",)], + [("tests.prefix__t",)], ), }, expected_indexes=[ @@ -320,7 +320,7 @@ def case_nested_object() -> QueryTC: def case_doubly_nested_object() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=3, returns={ "purchaseOrders": [ @@ -361,7 +361,7 @@ def case_doubly_nested_object() -> QueryTC: ), "prefix__tcatalog": ( ["table_name"], - [("prefix__t",)], + [("tests.prefix__t",)], ), }, expected_indexes=[ @@ -375,7 +375,7 @@ def case_doubly_nested_object() -> QueryTC: def case_nested_object_underexpansion() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=1, returns={ "purchaseOrders": [ @@ -405,7 +405,7 @@ def case_nested_object_underexpansion() -> QueryTC: ), "prefix__tcatalog": ( ["table_name"], - [("prefix__t",)], + [("tests.prefix__t",)], ), }, ) @@ -414,7 +414,7 @@ def case_nested_object_underexpansion() -> QueryTC: def case_tables_and_lists() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=3, returns={ "purchaseOrders": [ @@ -502,9 +502,9 @@ def case_tables_and_lists() -> QueryTC: "prefix__tcatalog": ( ["table_name"], [ - ("prefix__t",), - ("prefix__t__sub_object__sub_objects",), - ("prefix__t__sub_objects",), + ("tests.prefix__t",), + ("tests.prefix__t__sub_object__sub_objects",), + ("tests.prefix__t__sub_objects",), ], ), }, @@ -518,7 +518,7 @@ def case_tables_and_lists() -> QueryTC: def case_id_generation() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=4, returns={ "purchaseOrders": [ @@ -591,7 +591,7 @@ def case_id_generation() -> QueryTC: def case_indexing_id_like() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=4, returns={ "purchaseOrders": [ @@ -623,7 +623,7 @@ def case_indexing_id_like() -> QueryTC: def case_drop_raw(json_depth: int) -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=json_depth, keep_raw=False, returns={ @@ -641,7 +641,7 @@ def case_drop_raw(json_depth: int) -> QueryTC: ["id", "value"], [("b096504a-3d54-4664-9bf5-1b872466fd66", "value")], ), - "prefix__tcatalog": (["table_name"], [("prefix__t",)]), + "prefix__tcatalog": (["table_name"], [("tests.prefix__t",)]), }, expected_indexes=[ ("prefix__t", "id"), @@ -654,7 +654,7 @@ def case_drop_raw(json_depth: int) -> QueryTC: def case_null_records() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=1, returns={ "purchaseOrders": [ @@ -677,7 +677,7 @@ def case_null_records() -> QueryTC: def case_erm_keys() -> QueryTC: return QueryTC( Call( - "prefix", + "tests.prefix", json_depth=3, returns={ "pageSize": 30, @@ -699,7 +699,7 @@ def case_erm_keys() -> QueryTC: ["id", "value"], [("b096504a-3d54-4664-9bf5-1b872466fd66", "value")], ), - "prefix__tcatalog": (["table_name"], [("prefix__t",)]), + "prefix__tcatalog": (["table_name"], [("tests.prefix__t",)]), }, expected_indexes=[ ("prefix__t", "id"), @@ -732,7 +732,6 @@ def _act(uut: "ldlite.LDLite", tc: QueryTC) -> None: def _assert( conn: "dbapi.DBAPIConnection", - res_schema: str, # TODO: have schema be part of tc tc: QueryTC, ) -> None: with closing(conn.cursor()) as cur: @@ -740,9 +739,8 @@ def _assert( """ SELECT table_name FROM information_schema.tables - WHERE table_schema=$1 + WHERE table_schema='tests' """, - (res_schema,), ) assert sorted([r[0] for r in cur.fetchall()]) == sorted(tc.expected_tables) @@ -758,7 +756,7 @@ def _assert( for c in cols ], ), - table=sql.Identifier(table), + table=sql.Identifier("tests", table), ) .as_string(), ) @@ -820,7 +818,7 @@ def test_duckdb( _act(uut, tc) with duckdb.connect(dsn) as conn: - _assert(cast("dbapi.DBAPIConnection", conn), "main", tc) + _assert(cast("dbapi.DBAPIConnection", conn), tc) @mock.patch("httpx_folio.auth.httpx.post") @@ -842,4 +840,4 @@ def test_postgres( _act(uut, tc) with psycopg.connect(dsn, cursor_factory=psycopg.RawCursor) as conn: - _assert(cast("dbapi.DBAPIConnection", conn), "public", tc) + _assert(cast("dbapi.DBAPIConnection", conn), tc) From 9936f850aa234573a71341d293d0215a72e7f1d8 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Tue, 10 Mar 2026 14:03:43 +0000 Subject: [PATCH 17/18] Dont keep tqdm progress bars around --- src/ldlite/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py index 1675fc7..3d9d3ef 100644 --- a/src/ldlite/__init__.py +++ b/src/ldlite/__init__.py @@ -336,6 +336,7 @@ def query( # noqa: C901, PLR0912, PLR0913, PLR0915 tqdm( records, desc="downloading", + leave=False, total=total_records, mininterval=5, disable=self._quiet, @@ -356,11 +357,13 @@ def query( # noqa: C901, PLR0912, PLR0913, PLR0915 with ( tqdm( desc="scanning", + leave=False, disable=self._quiet, bar_format=no_iters_format, ) as scan_progress, tqdm( desc="transforming", + leave=False, disable=self._quiet, bar_format=no_iters_format, ) as transform_progress, @@ -378,6 +381,7 @@ def query( # noqa: C901, PLR0912, PLR0913, PLR0915 with tqdm( desc="indexing", + leave=False, disable=self._quiet, bar_format=no_iters_format, ) as progress: From 3fa7cc9cd9016c918c18368c5c6cc7f20310fc78 Mon Sep 17 00:00:00 2001 From: Katherine Bargar Date: Tue, 10 Mar 2026 16:40:36 +0000 Subject: [PATCH 18/18] Be more specific about when to include the object column name --- src/ldlite/database/_expansion/nodes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py index afb2ca9..75ea842 100644 --- a/src/ldlite/database/_expansion/nodes.py +++ b/src/ldlite/database/_expansion/nodes.py @@ -288,7 +288,9 @@ def unnest( def _carryover(self) -> Iterator[str]: for n in self.root.descendents: - if (isinstance(n, ObjectNode) and not n.unnested) or n.name != "jsonb": + if isinstance(n, ObjectNode) and not n.unnested and n.name != "jsonb": + yield n.name + if isinstance(n, ArrayNode): yield n.name yield from n.values