From 1d7d684f541ffa6738d7ab6625e6f3744a886cd4 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Thu, 5 Mar 2026 17:52:53 +0000
Subject: [PATCH 01/18] Refactor metadata collection to be per property

---
 src/ldlite/database/_expansion/nodes.py | 79 ++++++++++++++-----------
 1 file changed, 44 insertions(+), 35 deletions(-)

diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 7883f51..e9f41d3 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -131,74 +131,83 @@ def unnest(
             cur.execute(
                 sql.SQL(
                     """
+SELECT ldlite_system.jobject_keys(j) FROM (
+    SELECT {json_col} AS j
+    FROM {table}
+    WHERE NOT ldlite_system.jis_null({json_col})
+    LIMIT 1
+)
+""",
+                )
+                .format(table=source_table, json_col=self.identifier)
+                .as_string(),
+            )
+            props = [prop[0] for prop in cur.fetchall()]
+
+        for prop in props:
+            with conn.cursor() as cur:
+                cur.execute(
+                    sql.SQL(
+                        """
 WITH
-    one_object AS (
-        SELECT {json_col} AS json
-        FROM {table}
-        WHERE NOT ldlite_system.jis_null({json_col})
-        LIMIT 1
-    ),
-    props AS (SELECT ldlite_system.jobject_keys(json) AS prop FROM one_object),
     values AS (
-        SELECT
-            prop
-            ,ldlite_system.jextract({json_col}, prop) as ld_value
-        FROM {table}, props
+        SELECT ldlite_system.jextract({json_col}, $1) as ld_value
+        FROM {table}
     ),
     value_and_types AS (
         SELECT
-            prop
-            ,ldlite_system.jtype_of(ld_value) AS json_type
+            ldlite_system.jtype_of(ld_value) AS json_type
             ,ld_value
         FROM values
         WHERE NOT ldlite_system.jis_null(ld_value)
     ),
     array_values AS (
         SELECT
-            v.prop
-            ,ldlite_system.jtype_of(a.ld_value) AS json_type
-            ,v.ld_value
-        FROM value_and_types v, ldlite_system.jexplode(v.ld_value) a
+            ldlite_system.jtype_of(a.ld_value) AS json_type
+            ,a.ld_value
+        FROM value_and_types v
+        CROSS JOIN LATERAL (
+            SELECT s.ld_value
+            FROM ldlite_system.jexplode(v.ld_value) AS s
+            WHERE NOT ldlite_system.jis_null(s.ld_value)
+            LIMIT 3
+        ) a
         WHERE v.json_type = 'array'
     ),
     all_values AS (
         SELECT
-            prop
-            ,json_type
+            json_type
             ,ld_value
             ,FALSE AS is_array
         FROM value_and_types
         WHERE json_type <> 'array'
         UNION
         SELECT
-            prop
-            ,json_type
+            json_type
             ,ld_value
             ,TRUE AS is_array
         FROM array_values
         WHERE NOT ldlite_system.jis_null(ld_value)
     )
 SELECT
-    prop
-    ,STRING_AGG(DISTINCT json_type, '|') AS json_type
+    STRING_AGG(DISTINCT json_type, '|') AS json_type
     ,bool_and(is_array) AS is_array
     ,bool_and(ldlite_system.jis_uuid(ld_value)) AS is_uuid
     ,bool_and(ldlite_system.jis_datetime(ld_value)) AS is_datetime
     ,bool_and(ldlite_system.jis_float(ld_value)) AS is_float
 FROM all_values
-GROUP BY prop
+HAVING COUNT(*) > 0
 """,
+                    )
+                    .format(table=source_table, json_col=self.identifier)
+                    .as_string(),
+                    (prop,),
                 )
-                .format(table=source_table, json_col=self.identifier)
-                .as_string(),
-            )
-
-            create_columns.extend(
-                [
-                    row.select_column(self.identifier, self.add(row))
-                    for row in [Metadata(*r) for r in cur.fetchall()]
-                ],
-            )
+                if (row := cur.fetchone()) is not None:
+                    meta = Metadata(prop, *row)
+                    create_columns.append(
+                        meta.select_column(self.identifier, self.add(meta)),
+                    )
 
         with conn.cursor() as cur:
             cur.execute(
@@ -264,7 +273,7 @@ def explode(
             o_col = self.name + "__o"
             create_columns: list[sql.Composable] = [
                 sql.SQL(
-                    "(ROW_NUMBER() OVER (ORDER BY (SELECT NULL)))::integer AS __id"
+                    "(ROW_NUMBER() OVER (ORDER BY (SELECT NULL)))::integer AS __id",
                 ),
                 *[sql.Identifier(v) for v in self.carryover],
                 sql.SQL(

From 1fee373a23539cbdf8fa272022d619b8bb9033ef Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Thu, 5 Mar 2026 18:27:17 +0000
Subject: [PATCH 02/18] Monitor progress of transformation

---
 src/ldlite/__init__.py                     | 16 +++++++++++++++-
 src/ldlite/database/__init__.py            |  8 +++++++-
 src/ldlite/database/_expansion/__init__.py |  8 +++++++-
 src/ldlite/database/_expansion/nodes.py    | 17 +++++++++++++++++
 src/ldlite/database/_typed_database.py     | 10 +++++++++-
 5 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py
index 1c118ae..e3b35be 100644
--- a/src/ldlite/__init__.py
+++ b/src/ldlite/__init__.py
@@ -350,7 +350,21 @@ def query(  # noqa: C901, PLR0912, PLR0913, PLR0915
 
         transform_started = datetime.now(timezone.utc)
         if not use_legacy_transform:
-            newtables = self._database.expand_prefix(table, json_depth, keep_raw)
+            with tqdm(
+                total=0,
+                desc="transforming",
+                leave=False,
+                mininterval=5,
+                disable=self._quiet,
+                unit="ops",
+                delay=5,
+            ) as progress:
+                newtables = self._database.expand_prefix(
+                    table,
+                    json_depth,
+                    keep_raw,
+                    progress,
+                )
             if keep_raw:
                 newtables = [table, *newtables]
             transform_elapsed = datetime.now(timezone.utc) - transform_started
diff --git a/src/ldlite/database/__init__.py b/src/ldlite/database/__init__.py
index 9acd7b8..8ed5875 100644
--- a/src/ldlite/database/__init__.py
+++ b/src/ldlite/database/__init__.py
@@ -54,7 +54,13 @@ def ingest_records(self, prefix: str, records: Iterator[bytes]) -> int:
         """Ingests a stream of records dowloaded from FOLIO to the raw table."""
 
     @abstractmethod
-    def expand_prefix(self, prefix: str, json_depth: int, keep_raw: bool) -> list[str]:
+    def expand_prefix(
+        self,
+        prefix: str,
+        json_depth: int,
+        keep_raw: bool,
+        progress: tqdm[NoReturn] | None = None,
+    ) -> list[str]:
         """Unnests and explodes the raw data at the given prefix."""
 
     @abstractmethod
diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py
index d6f8bd5..4b22070 100644
--- a/src/ldlite/database/_expansion/__init__.py
+++ b/src/ldlite/database/_expansion/__init__.py
@@ -2,7 +2,7 @@
 
 from collections import deque
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, NoReturn
 
 from psycopg import sql
 
@@ -13,6 +13,7 @@
 
     import duckdb
     import psycopg
+    from tqdm import tqdm
 
 
 @dataclass
@@ -34,6 +35,7 @@ class ExpandContext:
     ]
     # source_cte will go away when DuckDB implements CTAS RETURNING
     source_cte: Callable[[bool], str]
+    progress: tqdm[NoReturn] | None
 
     def array_context(
         self,
@@ -48,6 +50,7 @@ def array_context(
             self.get_output_table,
             self.preprocess,
             self.source_cte,
+            self.progress,
         )
 
 
@@ -76,6 +79,7 @@ def _expand_nonmarc(
         ctx.source_table,
         ctx.get_transform_table(count),
         ctx.source_cte(False),
+        ctx.progress,
     )
 
     expand_children_of = deque([root])
@@ -92,6 +96,7 @@ def _expand_nonmarc(
                 ctx.get_transform_table(count),
                 ctx.get_transform_table(count + 1),
                 ctx.source_cte(False),
+                ctx.progress,
             )
             expand_children_of.append(c)
             count += 1
@@ -109,6 +114,7 @@ def _expand_nonmarc(
             new_source_table,
             ctx.get_transform_table(count + 1),
             ctx.source_cte(True),
+            ctx.progress,
         )
         count += 1
 
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index e9f41d3..7f62729 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -7,9 +7,11 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import NoReturn
 
     import duckdb
     import psycopg
+    from tqdm import tqdm
 
 from .metadata import Metadata
 
@@ -121,6 +123,7 @@ def unnest(
         source_table: sql.Identifier,
         dest_table: sql.Identifier,
         source_cte: str,
+        progress: tqdm[NoReturn] | None,
     ) -> None:
         self.unnested = True
         create_columns: list[sql.Composable] = [
@@ -144,6 +147,10 @@ def unnest(
             )
             props = [prop[0] for prop in cur.fetchall()]
 
+        prop_count = len(props)
+        if progress is not None:
+            progress.total += prop_count * 2
+
         for prop in props:
             with conn.cursor() as cur:
                 cur.execute(
@@ -208,6 +215,11 @@ def unnest(
                     create_columns.append(
                         meta.select_column(self.identifier, self.add(meta)),
                     )
+                    if progress is not None and meta.is_array:
+                        progress.total += 1
+
+                if progress is not None:
+                    progress.update(1)
 
         with conn.cursor() as cur:
             cur.execute(
@@ -229,6 +241,8 @@ def unnest(
                 )
                 .as_string(),
             )
+            if progress is not None:
+                progress.update(prop_count)
 
     def _carryover(self) -> Iterator[str]:
         for n in self.root.descendents:
@@ -268,6 +282,7 @@ def explode(
         source_table: sql.Identifier,
         dest_table: sql.Identifier,
         source_cte: str,
+        progress: tqdm[NoReturn] | None,
     ) -> list[str]:
         with conn.cursor() as cur:
             o_col = self.name + "__o"
@@ -310,6 +325,8 @@ def explode(
                 )
                 .as_string(),
             )
+            if progress is not None:
+                progress.update(1)
 
         return ["__id", *self.carryover, o_col]
 
diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py
index eb4a1f3..a9197ad 100644
--- a/src/ldlite/database/_typed_database.py
+++ b/src/ldlite/database/_typed_database.py
@@ -16,6 +16,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Sequence
+    from typing import NoReturn
 
     import duckdb
     from tqdm import tqdm
@@ -184,7 +185,13 @@ def preprocess_source_table(
     @abstractmethod
     def source_table_cte_stmt(self, keep_source: bool) -> str: ...
 
-    def expand_prefix(self, prefix: str, json_depth: int, keep_raw: bool) -> list[str]:
+    def expand_prefix(
+        self,
+        prefix: str,
+        json_depth: int,
+        keep_raw: bool,
+        progress: tqdm[NoReturn] | None = None,
+    ) -> list[str]:
         pfx = Prefix(prefix)
         with closing(self._conn_factory()) as conn:
             self._drop_extracted_tables(conn, pfx)
@@ -224,6 +231,7 @@ def expand_prefix(self, prefix: str, json_depth: int, keep_raw: bool) -> list[st
                     pfx.output_table,
                     self.preprocess_source_table,  # type: ignore [arg-type]
                     self.source_table_cte_stmt,
+                    progress,
                 ),
             )
 

From b37dd1f0645c8cbc1cc51fea5f5e5316ab2ab587 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Thu, 5 Mar 2026 21:07:53 +0000
Subject: [PATCH 03/18] Optimize the definitions of postgres json operators

---
 src/ldlite/database/_postgres.py | 192 +++++++++++++++++++++++++------
 1 file changed, 156 insertions(+), 36 deletions(-)

diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py
index a9b4b81..e4de4ff 100644
--- a/src/ldlite/database/_postgres.py
+++ b/src/ldlite/database/_postgres.py
@@ -42,49 +42,53 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 PARALLEL SAFE;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jextract(j JSONB, p TEXT) RETURNS JSONB AS $$
-WITH jp AS (
-    -- This is somewhat of a hack.
-    -- There isn't a really good way to get the element unchanged
-    -- which works for duckdb and postgres AND CRUCIALLY
-    -- has a similar syntax to everything else so that we don't
-    -- have to have special cases for exploding the array and it
-    -- can share all the same type checking / statement generation code.
-    -- We're pretending that postgres supports -> '$' style syntax like duckdb.
-    SELECT
-    CASE
-        WHEN p = '$' THEN j #> '{}'
-        ELSE j->p
-    END AS val
-    ,CASE
-        WHEN p = '$' THEN j #>> '{}'
-        ELSE j->>p
-    END AS str
-)
 SELECT
-    CASE
-        WHEN ldlite_system.jtype_of(jp.val) = 'string' THEN
+    CASE jsonb_typeof(val)
+        WHEN 'string' THEN
             CASE
-                WHEN lower(jp.str) = 'null' THEN 'null'::JSONB
-                WHEN length(jp.str) = 0 THEN 'null'::JSONB
-                ELSE jp.val
+                WHEN lower(val #>> '{}') IN ('null', '') THEN 'null'::jsonb
+                ELSE val
             END
-        WHEN ldlite_system.jtype_of(jp.val) = 'array' THEN
+        WHEN 'array' THEN
             CASE
-                WHEN jsonb_array_length(jsonb_path_query_array(jp.val, '$[*] ? (@ != null)')) = 0 THEN 'null'::JSONB
-                ELSE jsonb_path_query_array(jp.val, '$[*] ? (@ != null)')
+                WHEN jsonb_array_length(val) = 0 THEN 'null'::jsonb
+                WHEN NOT EXISTS (
+                    SELECT 1
+                    FROM jsonb_array_elements(val) AS e(elem)
+                    WHERE elem = 'null'::jsonb
+                    LIMIT 1
+                ) THEN val
+                ELSE COALESCE(
+                    (
+                        SELECT jsonb_agg(e)
+                        FROM jsonb_array_elements(val) AS a(e)
+                        WHERE e <> 'null'::jsonb
+                    ),
+                    'null'::jsonb
+                )
             END
-        WHEN ldlite_system.jtype_of(jp.val) = 'object' THEN
+        WHEN 'object' THEN
             CASE
-                WHEN jp.str = '{}' THEN 'null'::JSONB
-                ELSE jp.val
+                WHEN val = '{}'::jsonb THEN 'null'::jsonb
+                ELSE val
             END
-        ELSE jp.val
+        ELSE val
     END
-FROM jp;
+FROM (
+    -- This is somewhat of a hack.
+    -- There isn't a really good way to get the element unchanged
+    -- which works for duckdb and postgres AND CRUCIALLY
+    -- has a similar syntax to everything else so that we don't
+    -- have to have special cases for exploding the array and it
+    -- can share all the same type checking / statement generation code.
+    -- We're pretending that postgres supports -> '$' style syntax like duckdb.
+    SELECT CASE WHEN p = '$' THEN j ELSE j->p END AS val
+) s;
 $$
 LANGUAGE sql
 IMMUTABLE
-PARALLEL SAFE;
+PARALLEL SAFE
+STRICT;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jextract_string(j JSONB, p TEXT) RETURNS TEXT AS $$
 SELECT ldlite_system.jextract(j, p) #>> '{}'
@@ -103,7 +107,67 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j JSONB) RETURNS BOOLEAN AS $$
 SELECT
     CASE
-        WHEN ldlite_system.jtype_of(j) = 'string' THEN j->>0 ~ '^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[1-5][a-fA-F0-9]{3}-[89abAB][a-fA-F0-9]{3}-[a-fA-F0-9]{12}$'
+        WHEN jsonb_typeof(j) = 'string' THEN
+        (
+            WITH v AS (SELECT $1 #>> '{}' AS s)
+            SELECT
+                LENGTH(s) = 36
+                -- Hyphens at canonical positions: 9,14,19,24
+                AND substr(s,  9,1) = '-'
+                AND substr(s, 14,1) = '-'
+                AND substr(s, 19,1) = '-'
+                AND substr(s, 24,1) = '-'
+
+                -- Version M at pos 15 must be 1..5
+                AND substr(s, 15,1) BETWEEN '1' AND '5'
+
+                -- Variant N at pos 20 must be 8,9,a,b,A,B
+                AND substr(s, 20,1) IN ('8','9','a','b','A','B')
+
+                -- All other non-hyphen characters must be hex [0-9a-fA-F]
+                AND (
+                -- positions 1..8
+                    (substr(s,  1,1) BETWEEN '0' AND '9' OR substr(s,  1,1) BETWEEN 'a' AND 'f' OR substr(s,  1,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s,  2,1) BETWEEN '0' AND '9' OR substr(s,  2,1) BETWEEN 'a' AND 'f' OR substr(s,  2,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s,  3,1) BETWEEN '0' AND '9' OR substr(s,  3,1) BETWEEN 'a' AND 'f' OR substr(s,  3,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s,  4,1) BETWEEN '0' AND '9' OR substr(s,  4,1) BETWEEN 'a' AND 'f' OR substr(s,  4,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s,  5,1) BETWEEN '0' AND '9' OR substr(s,  5,1) BETWEEN 'a' AND 'f' OR substr(s,  5,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s,  6,1) BETWEEN '0' AND '9' OR substr(s,  6,1) BETWEEN 'a' AND 'f' OR substr(s,  6,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s,  7,1) BETWEEN '0' AND '9' OR substr(s,  7,1) BETWEEN 'a' AND 'f' OR substr(s,  7,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s,  8,1) BETWEEN '0' AND '9' OR substr(s,  8,1) BETWEEN 'a' AND 'f' OR substr(s,  8,1) BETWEEN 'A' AND 'F') AND
+
+                    -- positions 10..13
+                    (substr(s, 10,1) BETWEEN '0' AND '9' OR substr(s, 10,1) BETWEEN 'a' AND 'f' OR substr(s, 10,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 11,1) BETWEEN '0' AND '9' OR substr(s, 11,1) BETWEEN 'a' AND 'f' OR substr(s, 11,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 12,1) BETWEEN '0' AND '9' OR substr(s, 12,1) BETWEEN 'a' AND 'f' OR substr(s, 12,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 13,1) BETWEEN '0' AND '9' OR substr(s, 13,1) BETWEEN 'a' AND 'f' OR substr(s, 13,1) BETWEEN 'A' AND 'F') AND
+
+                    -- positions 16..18 (pos 15 is version, already checked)
+                    (substr(s, 16,1) BETWEEN '0' AND '9' OR substr(s, 16,1) BETWEEN 'a' AND 'f' OR substr(s, 16,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 17,1) BETWEEN '0' AND '9' OR substr(s, 17,1) BETWEEN 'a' AND 'f' OR substr(s, 17,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 18,1) BETWEEN '0' AND '9' OR substr(s, 18,1) BETWEEN 'a' AND 'f' OR substr(s, 18,1) BETWEEN 'A' AND 'F') AND
+
+                    -- positions 21..23 (pos 20 is variant, already checked)
+                    (substr(s, 21,1) BETWEEN '0' AND '9' OR substr(s, 21,1) BETWEEN 'a' AND 'f' OR substr(s, 21,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 22,1) BETWEEN '0' AND '9' OR substr(s, 22,1) BETWEEN 'a' AND 'f' OR substr(s, 22,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 23,1) BETWEEN '0' AND '9' OR substr(s, 23,1) BETWEEN 'a' AND 'f' OR substr(s, 23,1) BETWEEN 'A' AND 'F') AND
+
+                    -- positions 25..36
+                    (substr(s, 25,1) BETWEEN '0' AND '9' OR substr(s, 25,1) BETWEEN 'a' AND 'f' OR substr(s, 25,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 26,1) BETWEEN '0' AND '9' OR substr(s, 26,1) BETWEEN 'a' AND 'f' OR substr(s, 26,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 27,1) BETWEEN '0' AND '9' OR substr(s, 27,1) BETWEEN 'a' AND 'f' OR substr(s, 27,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 28,1) BETWEEN '0' AND '9' OR substr(s, 28,1) BETWEEN 'a' AND 'f' OR substr(s, 28,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 29,1) BETWEEN '0' AND '9' OR substr(s, 29,1) BETWEEN 'a' AND 'f' OR substr(s, 29,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 30,1) BETWEEN '0' AND '9' OR substr(s, 30,1) BETWEEN 'a' AND 'f' OR substr(s, 30,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 31,1) BETWEEN '0' AND '9' OR substr(s, 31,1) BETWEEN 'a' AND 'f' OR substr(s, 31,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 32,1) BETWEEN '0' AND '9' OR substr(s, 32,1) BETWEEN 'a' AND 'f' OR substr(s, 32,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 33,1) BETWEEN '0' AND '9' OR substr(s, 33,1) BETWEEN 'a' AND 'f' OR substr(s, 33,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 34,1) BETWEEN '0' AND '9' OR substr(s, 34,1) BETWEEN 'a' AND 'f' OR substr(s, 34,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 35,1) BETWEEN '0' AND '9' OR substr(s, 35,1) BETWEEN 'a' AND 'f' OR substr(s, 35,1) BETWEEN 'A' AND 'F') AND
+                    (substr(s, 36,1) BETWEEN '0' AND '9' OR substr(s, 36,1) BETWEEN 'a' AND 'f' OR substr(s, 36,1) BETWEEN 'A' AND 'F')
+                )
+            FROM v
+        )
         ELSE FALSE
     END;
 $$
@@ -114,7 +178,63 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 CREATE OR REPLACE FUNCTION ldlite_system.jis_datetime(j JSONB) RETURNS BOOLEAN AS $$
 SELECT
     CASE
-        WHEN ldlite_system.jtype_of(j) = 'string' THEN j->>0 ~ '^\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?$'
+        WHEN jsonb_typeof(j) = 'string' THEN
+        (
+            WITH v AS (SELECT $1 #>> '{}' AS s)
+            SELECT
+                -- Length must be exactly 29 characters
+                length(s) = 29
+
+                -- Fixed punctuation positions
+                AND substr(s, 5, 1)  = '-'
+                AND substr(s, 8, 1)  = '-'
+                AND substr(s, 11, 1) = 'T'
+                AND substr(s, 14, 1) = ':'
+                AND substr(s, 17, 1) = ':'
+                AND substr(s, 20, 1) = '.'
+                AND substr(s, 24, 1) = '+'
+                AND substr(s, 27, 1) = ':'
+
+                -- YYYY
+                AND substr(s, 1, 1) BETWEEN '0' AND '9'
+                AND substr(s, 2, 1) BETWEEN '0' AND '9'
+                AND substr(s, 3, 1) BETWEEN '0' AND '9'
+                AND substr(s, 4, 1) BETWEEN '0' AND '9'
+
+                -- MM
+                AND substr(s, 6, 1) BETWEEN '0' AND '9'
+                AND substr(s, 7, 1) BETWEEN '0' AND '9'
+
+                -- DD
+                AND substr(s, 9, 1) BETWEEN '0' AND '9'
+                AND substr(s, 10, 1) BETWEEN '0' AND '9'
+
+                -- HH
+                AND substr(s, 12, 1) BETWEEN '0' AND '9'
+                AND substr(s, 13, 1) BETWEEN '0' AND '9'
+
+                -- mm
+                AND substr(s, 15, 1) BETWEEN '0' AND '9'
+                AND substr(s, 16, 1) BETWEEN '0' AND '9'
+
+                -- SS
+                AND substr(s, 18, 1) BETWEEN '0' AND '9'
+                AND substr(s, 19, 1) BETWEEN '0' AND '9'
+
+                -- mmm
+                AND substr(s, 21, 1) BETWEEN '0' AND '9'
+                AND substr(s, 22, 1) BETWEEN '0' AND '9'
+                AND substr(s, 23, 1) BETWEEN '0' AND '9'
+
+                -- Timezone HH
+                AND substr(s, 25, 1) BETWEEN '0' AND '9'
+                AND substr(s, 26, 1) BETWEEN '0' AND '9'
+
+                -- Timezone MM
+                AND substr(s, 28, 1) BETWEEN '0' AND '9'
+                AND substr(s, 29, 1) BETWEEN '0' AND '9'
+            FROM v
+        )
         ELSE FALSE
     END;
 $$
@@ -125,7 +245,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 CREATE OR REPLACE FUNCTION ldlite_system.jis_float(j JSONB) RETURNS BOOLEAN AS $$
 SELECT
     CASE
-        WHEN ldlite_system.jtype_of(j) = 'number' THEN j->>0 LIKE '%.%'
+        WHEN jsonb_typeof(j) = 'number' THEN scale((j)::numeric) > 0
         ELSE FALSE
     END;
 $$
@@ -134,7 +254,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 PARALLEL SAFE;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j JSONB) RETURNS BOOLEAN AS $$
-SELECT j IS NULL OR j = 'null'::JSONB;
+SELECT COALESCE(j = 'null'::jsonb, TRUE);
 $$
 LANGUAGE sql
 IMMUTABLE

From 0fe59864a768f26e30015ec7cd62c1fa515e04d5 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 15:47:48 +0000
Subject: [PATCH 04/18] Simplify checks for specific data types

---
 src/ldlite/database/_expansion/nodes.py |   8 +-
 src/ldlite/database/_postgres.py        | 143 ++----------------------
 tests/test_json_operators.py            |  11 --
 3 files changed, 13 insertions(+), 149 deletions(-)

diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 7f62729..6659784 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -199,12 +199,12 @@ def unnest(
 SELECT
     STRING_AGG(DISTINCT json_type, '|') AS json_type
     ,bool_and(is_array) AS is_array
-    ,bool_and(ldlite_system.jis_uuid(ld_value)) AS is_uuid
-    ,bool_and(ldlite_system.jis_datetime(ld_value)) AS is_datetime
-    ,bool_and(ldlite_system.jis_float(ld_value)) AS is_float
+    ,bool_and(json_type = 'string' AND ldlite_system.jis_uuid(ld_value)) AS is_uuid
+    ,bool_and(json_type = 'string' AND ldlite_system.jis_datetime(ld_value)) AS is_datetime
+    ,bool_and(json_type = 'number' AND ldlite_system.jis_float(ld_value)) AS is_float
 FROM all_values
 HAVING COUNT(*) > 0
-""",
+""",  # noqa: E501
                     )
                     .format(table=source_table, json_col=self.identifier)
                     .as_string(),
diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py
index e4de4ff..0bebfa3 100644
--- a/src/ldlite/database/_postgres.py
+++ b/src/ldlite/database/_postgres.py
@@ -105,153 +105,28 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 PARALLEL SAFE;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j JSONB) RETURNS BOOLEAN AS $$
-SELECT
-    CASE
-        WHEN jsonb_typeof(j) = 'string' THEN
-        (
-            WITH v AS (SELECT $1 #>> '{}' AS s)
-            SELECT
-                LENGTH(s) = 36
-                -- Hyphens at canonical positions: 9,14,19,24
-                AND substr(s,  9,1) = '-'
-                AND substr(s, 14,1) = '-'
-                AND substr(s, 19,1) = '-'
-                AND substr(s, 24,1) = '-'
-
-                -- Version M at pos 15 must be 1..5
-                AND substr(s, 15,1) BETWEEN '1' AND '5'
-
-                -- Variant N at pos 20 must be 8,9,a,b,A,B
-                AND substr(s, 20,1) IN ('8','9','a','b','A','B')
-
-                -- All other non-hyphen characters must be hex [0-9a-fA-F]
-                AND (
-                -- positions 1..8
-                    (substr(s,  1,1) BETWEEN '0' AND '9' OR substr(s,  1,1) BETWEEN 'a' AND 'f' OR substr(s,  1,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s,  2,1) BETWEEN '0' AND '9' OR substr(s,  2,1) BETWEEN 'a' AND 'f' OR substr(s,  2,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s,  3,1) BETWEEN '0' AND '9' OR substr(s,  3,1) BETWEEN 'a' AND 'f' OR substr(s,  3,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s,  4,1) BETWEEN '0' AND '9' OR substr(s,  4,1) BETWEEN 'a' AND 'f' OR substr(s,  4,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s,  5,1) BETWEEN '0' AND '9' OR substr(s,  5,1) BETWEEN 'a' AND 'f' OR substr(s,  5,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s,  6,1) BETWEEN '0' AND '9' OR substr(s,  6,1) BETWEEN 'a' AND 'f' OR substr(s,  6,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s,  7,1) BETWEEN '0' AND '9' OR substr(s,  7,1) BETWEEN 'a' AND 'f' OR substr(s,  7,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s,  8,1) BETWEEN '0' AND '9' OR substr(s,  8,1) BETWEEN 'a' AND 'f' OR substr(s,  8,1) BETWEEN 'A' AND 'F') AND
-
-                    -- positions 10..13
-                    (substr(s, 10,1) BETWEEN '0' AND '9' OR substr(s, 10,1) BETWEEN 'a' AND 'f' OR substr(s, 10,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 11,1) BETWEEN '0' AND '9' OR substr(s, 11,1) BETWEEN 'a' AND 'f' OR substr(s, 11,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 12,1) BETWEEN '0' AND '9' OR substr(s, 12,1) BETWEEN 'a' AND 'f' OR substr(s, 12,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 13,1) BETWEEN '0' AND '9' OR substr(s, 13,1) BETWEEN 'a' AND 'f' OR substr(s, 13,1) BETWEEN 'A' AND 'F') AND
-
-                    -- positions 16..18 (pos 15 is version, already checked)
-                    (substr(s, 16,1) BETWEEN '0' AND '9' OR substr(s, 16,1) BETWEEN 'a' AND 'f' OR substr(s, 16,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 17,1) BETWEEN '0' AND '9' OR substr(s, 17,1) BETWEEN 'a' AND 'f' OR substr(s, 17,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 18,1) BETWEEN '0' AND '9' OR substr(s, 18,1) BETWEEN 'a' AND 'f' OR substr(s, 18,1) BETWEEN 'A' AND 'F') AND
-
-                    -- positions 21..23 (pos 20 is variant, already checked)
-                    (substr(s, 21,1) BETWEEN '0' AND '9' OR substr(s, 21,1) BETWEEN 'a' AND 'f' OR substr(s, 21,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 22,1) BETWEEN '0' AND '9' OR substr(s, 22,1) BETWEEN 'a' AND 'f' OR substr(s, 22,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 23,1) BETWEEN '0' AND '9' OR substr(s, 23,1) BETWEEN 'a' AND 'f' OR substr(s, 23,1) BETWEEN 'A' AND 'F') AND
-
-                    -- positions 25..36
-                    (substr(s, 25,1) BETWEEN '0' AND '9' OR substr(s, 25,1) BETWEEN 'a' AND 'f' OR substr(s, 25,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 26,1) BETWEEN '0' AND '9' OR substr(s, 26,1) BETWEEN 'a' AND 'f' OR substr(s, 26,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 27,1) BETWEEN '0' AND '9' OR substr(s, 27,1) BETWEEN 'a' AND 'f' OR substr(s, 27,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 28,1) BETWEEN '0' AND '9' OR substr(s, 28,1) BETWEEN 'a' AND 'f' OR substr(s, 28,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 29,1) BETWEEN '0' AND '9' OR substr(s, 29,1) BETWEEN 'a' AND 'f' OR substr(s, 29,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 30,1) BETWEEN '0' AND '9' OR substr(s, 30,1) BETWEEN 'a' AND 'f' OR substr(s, 30,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 31,1) BETWEEN '0' AND '9' OR substr(s, 31,1) BETWEEN 'a' AND 'f' OR substr(s, 31,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 32,1) BETWEEN '0' AND '9' OR substr(s, 32,1) BETWEEN 'a' AND 'f' OR substr(s, 32,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 33,1) BETWEEN '0' AND '9' OR substr(s, 33,1) BETWEEN 'a' AND 'f' OR substr(s, 33,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 34,1) BETWEEN '0' AND '9' OR substr(s, 34,1) BETWEEN 'a' AND 'f' OR substr(s, 34,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 35,1) BETWEEN '0' AND '9' OR substr(s, 35,1) BETWEEN 'a' AND 'f' OR substr(s, 35,1) BETWEEN 'A' AND 'F') AND
-                    (substr(s, 36,1) BETWEEN '0' AND '9' OR substr(s, 36,1) BETWEEN 'a' AND 'f' OR substr(s, 36,1) BETWEEN 'A' AND 'F')
-                )
-            FROM v
-        )
-        ELSE FALSE
-    END;
+SELECT j::text ~* '^"[a-f0-9]{8}-[a-f0-9]{4}-[1-5][a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}"$';
 $$
 LANGUAGE sql
 IMMUTABLE
-PARALLEL SAFE;
+PARALLEL SAFE
+STRICT;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_datetime(j JSONB) RETURNS BOOLEAN AS $$
-SELECT
-    CASE
-        WHEN jsonb_typeof(j) = 'string' THEN
-        (
-            WITH v AS (SELECT $1 #>> '{}' AS s)
-            SELECT
-                -- Length must be exactly 29 characters
-                length(s) = 29
-
-                -- Fixed punctuation positions
-                AND substr(s, 5, 1)  = '-'
-                AND substr(s, 8, 1)  = '-'
-                AND substr(s, 11, 1) = 'T'
-                AND substr(s, 14, 1) = ':'
-                AND substr(s, 17, 1) = ':'
-                AND substr(s, 20, 1) = '.'
-                AND substr(s, 24, 1) = '+'
-                AND substr(s, 27, 1) = ':'
-
-                -- YYYY
-                AND substr(s, 1, 1) BETWEEN '0' AND '9'
-                AND substr(s, 2, 1) BETWEEN '0' AND '9'
-                AND substr(s, 3, 1) BETWEEN '0' AND '9'
-                AND substr(s, 4, 1) BETWEEN '0' AND '9'
-
-                -- MM
-                AND substr(s, 6, 1) BETWEEN '0' AND '9'
-                AND substr(s, 7, 1) BETWEEN '0' AND '9'
-
-                -- DD
-                AND substr(s, 9, 1) BETWEEN '0' AND '9'
-                AND substr(s, 10, 1) BETWEEN '0' AND '9'
-
-                -- HH
-                AND substr(s, 12, 1) BETWEEN '0' AND '9'
-                AND substr(s, 13, 1) BETWEEN '0' AND '9'
-
-                -- mm
-                AND substr(s, 15, 1) BETWEEN '0' AND '9'
-                AND substr(s, 16, 1) BETWEEN '0' AND '9'
-
-                -- SS
-                AND substr(s, 18, 1) BETWEEN '0' AND '9'
-                AND substr(s, 19, 1) BETWEEN '0' AND '9'
-
-                -- mmm
-                AND substr(s, 21, 1) BETWEEN '0' AND '9'
-                AND substr(s, 22, 1) BETWEEN '0' AND '9'
-                AND substr(s, 23, 1) BETWEEN '0' AND '9'
-
-                -- Timezone HH
-                AND substr(s, 25, 1) BETWEEN '0' AND '9'
-                AND substr(s, 26, 1) BETWEEN '0' AND '9'
-
-                -- Timezone MM
-                AND substr(s, 28, 1) BETWEEN '0' AND '9'
-                AND substr(s, 29, 1) BETWEEN '0' AND '9'
-            FROM v
-        )
-        ELSE FALSE
-    END;
+SELECT j::text ~ '^"\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?"$'
 $$
 LANGUAGE sql
 IMMUTABLE
-PARALLEL SAFE;
+PARALLEL SAFE
+STRICT;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_float(j JSONB) RETURNS BOOLEAN AS $$
-SELECT
-    CASE
-        WHEN jsonb_typeof(j) = 'number' THEN scale((j)::numeric) > 0
-        ELSE FALSE
-    END;
+SELECT SCALE((j)::numeric) > 0
 $$
 LANGUAGE sql
 IMMUTABLE
-PARALLEL SAFE;
+PARALLEL SAFE
+STRICT;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j JSONB) RETURNS BOOLEAN AS $$
 SELECT COALESCE(j = 'null'::jsonb, TRUE);
diff --git a/tests/test_json_operators.py b/tests/test_json_operators.py
index d89cf44..2ddfe13 100644
--- a/tests/test_json_operators.py
+++ b/tests/test_json_operators.py
@@ -177,7 +177,6 @@ def case_jexplode(p: tuple[Any, ...]) -> JsonTC:
         ("str", False),
         ("str_empty", False),
         ("num", False),
-        ("na", False),
         ("na_str1", False),
         ("na_str2", False),
         ("uuid_nof", False),
@@ -218,7 +217,6 @@ def case_jis_null(p: tuple[Any, ...]) -> JsonTC:
         ("str", False),
         ("str_empty", False),
         ("num", False),
-        ("na", False),
         ("na_str1", False),
         ("na_str2", False),
         ("uuid_nof", False),
@@ -239,15 +237,6 @@ def case_jis_datetime(p: tuple[Any, ...]) -> JsonTC:
 
 @parametrize(
     p=[
-        ("str", False),
-        ("str_empty", False),
-        ("num", False),
-        ("na", False),
-        ("na_str1", False),
-        ("na_str2", False),
-        ("uuid_nof", False),
-        ("uuid", False),
-        ("dt", False),
         ("num", False),
         ("float", True),
     ],

From f6a59783bedc6199ff9ed39ea130d3f06b8fc582 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 16:31:01 +0000
Subject: [PATCH 05/18] Allow postgres to short circuit data type checks

---
 src/ldlite/database/_expansion/nodes.py | 30 +++++++++++++++++--------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 6659784..7c82077 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -197,20 +197,32 @@ def unnest(
         WHERE NOT ldlite_system.jis_null(ld_value)
     )
 SELECT
-    STRING_AGG(DISTINCT json_type, '|') AS json_type
-    ,bool_and(is_array) AS is_array
-    ,bool_and(json_type = 'string' AND ldlite_system.jis_uuid(ld_value)) AS is_uuid
-    ,bool_and(json_type = 'string' AND ldlite_system.jis_datetime(ld_value)) AS is_datetime
-    ,bool_and(json_type = 'number' AND ldlite_system.jis_float(ld_value)) AS is_float
-FROM all_values
-HAVING COUNT(*) > 0
-""",  # noqa: E501
+    (SELECT STRING_AGG(DISTINCT json_type, '|') FROM all_values) AS json_type
+    ,(SELECT BOOL_AND(is_array) FROM all_values) AS is_array
+    ,NOT EXISTS
+    (
+        SELECT 1 FROM all_values
+        WHERE json_type = 'string' AND NOT ldlite_system.jis_uuid(ld_value)
+    ) AS is_uuid
+    ,NOT EXISTS
+    (
+        SELECT 1 FROM all_values
+        WHERE json_type = 'string' AND NOT ldlite_system.jis_datetime(ld_value)
+    ) AS is_datetime
+    ,NOT EXISTS
+    (
+        SELECT 1 FROM all_values
+        WHERE json_type = 'number' AND NOT ldlite_system.jis_float(ld_value)
+    ) AS is_float
+""",
                     )
                     .format(table=source_table, json_col=self.identifier)
                     .as_string(),
                     (prop,),
                 )
-                if (row := cur.fetchone()) is not None:
+                if (row := cur.fetchone()) is not None and all(
+                    c is not None for c in row
+                ):
                     meta = Metadata(prop, *row)
                     create_columns.append(
                         meta.select_column(self.identifier, self.add(meta)),

From 4706f98adabdfea75341f741ac8960ee93a228bd Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 16:50:41 +0000
Subject: [PATCH 06/18] Sample large tables for column metadata

---
 src/ldlite/database/_duckdb.py             |  4 ++
 src/ldlite/database/_expansion/__init__.py | 56 ++--------------------
 src/ldlite/database/_expansion/context.py  | 52 ++++++++++++++++++++
 src/ldlite/database/_expansion/nodes.py    | 49 ++++++++++++-------
 src/ldlite/database/_postgres.py           |  4 ++
 src/ldlite/database/_typed_database.py     |  8 +++-
 6 files changed, 104 insertions(+), 69 deletions(-)
 create mode 100644 src/ldlite/database/_expansion/context.py

diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py
index 98ac5bb..f4f6035 100644
--- a/src/ldlite/database/_duckdb.py
+++ b/src/ldlite/database/_duckdb.py
@@ -134,6 +134,10 @@ def ingest_records(
     def source_table_cte_stmt(self, keep_source: bool) -> str:  # noqa: ARG002
         return "WITH ld_source AS (SELECT * FROM {source_table})"
 
+    @property
+    def tablesample(self) -> str:
+        return "USING SAMPLE {sample} PERCENT (bernoulli)"
+
 
 # DuckDB has some strong opinions about cursors that are different than postgres
 # https://github.com/duckdb/duckdb/issues/11018
diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py
index 4b22070..a62b004 100644
--- a/src/ldlite/database/_expansion/__init__.py
+++ b/src/ldlite/database/_expansion/__init__.py
@@ -1,57 +1,14 @@
 from __future__ import annotations
 
 from collections import deque
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, NoReturn
+from typing import TYPE_CHECKING
 
 from psycopg import sql
 
 from .nodes import ArrayNode, ObjectNode
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
-
-    import duckdb
-    import psycopg
-    from tqdm import tqdm
-
-
-@dataclass
-class ExpandContext:
-    conn: duckdb.DuckDBPyConnection | psycopg.Connection
-    source_table: sql.Identifier
-    json_depth: int
-    get_transform_table: Callable[[int], sql.Identifier]
-    get_output_table: Callable[[str], tuple[str, sql.Identifier]]
-    # This is necessary for Analyzing the table in pg before querying it
-    # I don't love how this is implemented
-    preprocess: Callable[
-        [
-            duckdb.DuckDBPyConnection | psycopg.Connection,
-            sql.Identifier,
-            list[sql.Identifier],
-        ],
-        None,
-    ]
-    # source_cte will go away when DuckDB implements CTAS RETURNING
-    source_cte: Callable[[bool], str]
-    progress: tqdm[NoReturn] | None
-
-    def array_context(
-        self,
-        new_source_table: sql.Identifier,
-        new_json_depth: int,
-    ) -> ExpandContext:
-        return ExpandContext(
-            self.conn,
-            new_source_table,
-            new_json_depth,
-            self.get_transform_table,
-            self.get_output_table,
-            self.preprocess,
-            self.source_cte,
-            self.progress,
-        )
+    from .context import ExpandContext
 
 
 def expand_nonmarc(
@@ -75,11 +32,7 @@ def _expand_nonmarc(
     initial_count = count
     ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier])
     root.unnest(
-        ctx.conn,
-        ctx.source_table,
-        ctx.get_transform_table(count),
-        ctx.source_cte(False),
-        ctx.progress,
+        ctx, ctx.source_table, ctx.get_transform_table(count), ctx.source_cte(False)
     )
 
     expand_children_of = deque([root])
@@ -92,11 +45,10 @@ def _expand_nonmarc(
                 continue
             ctx.preprocess(ctx.conn, ctx.get_transform_table(count), [c.identifier])
             c.unnest(
-                ctx.conn,
+                ctx,
                 ctx.get_transform_table(count),
                 ctx.get_transform_table(count + 1),
                 ctx.source_cte(False),
-                ctx.progress,
             )
             expand_children_of.append(c)
             count += 1
diff --git a/src/ldlite/database/_expansion/context.py b/src/ldlite/database/_expansion/context.py
new file mode 100644
index 0000000..0e6cec0
--- /dev/null
+++ b/src/ldlite/database/_expansion/context.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, NoReturn
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    import duckdb
+    import psycopg
+    from psycopg import sql
+    from tqdm import tqdm
+
+
+@dataclass
+class ExpandContext:
+    conn: duckdb.DuckDBPyConnection | psycopg.Connection
+    source_table: sql.Identifier
+    json_depth: int
+    get_transform_table: Callable[[int], sql.Identifier]
+    get_output_table: Callable[[str], tuple[str, sql.Identifier]]
+    # This is necessary for Analyzing the table in pg before querying it
+    # I don't love how this is implemented
+    preprocess: Callable[
+        [
+            duckdb.DuckDBPyConnection | psycopg.Connection,
+            sql.Identifier,
+            list[sql.Identifier],
+        ],
+        None,
+    ]
+    # source_cte will go away when DuckDB implements CTAS RETURNING
+    source_cte: Callable[[bool], str]
+    tablesample: str
+    progress: tqdm[NoReturn] | None
+
+    def array_context(
+        self,
+        new_source_table: sql.Identifier,
+        new_json_depth: int,
+    ) -> ExpandContext:
+        return ExpandContext(
+            self.conn,
+            new_source_table,
+            new_json_depth,
+            self.get_transform_table,
+            self.get_output_table,
+            self.preprocess,
+            self.source_cte,
+            self.tablesample,
+            self.progress,
+        )
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 7c82077..a70383a 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from collections import deque
-from typing import TYPE_CHECKING, TypeVar
+from typing import TYPE_CHECKING, TypeVar, cast
 
 from psycopg import sql
 
@@ -13,6 +13,8 @@
     import psycopg
     from tqdm import tqdm
 
+    from .context import ExpandContext
+
 from .metadata import Metadata
 
 TNode = TypeVar("TNode", bound="ExpansionNode")
@@ -119,18 +121,27 @@ def object_children(self) -> list[ObjectNode]:
 
     def unnest(
         self,
-        conn: duckdb.DuckDBPyConnection | psycopg.Connection,
+        ctx: ExpandContext,
         source_table: sql.Identifier,
         dest_table: sql.Identifier,
         source_cte: str,
-        progress: tqdm[NoReturn] | None,
     ) -> None:
         self.unnested = True
         create_columns: list[sql.Composable] = [
             sql.Identifier(v) for v in self.carryover
         ]
 
-        with conn.cursor() as cur:
+        with ctx.conn.cursor() as cur:
+            cur.execute(
+                sql.SQL("SELECT COUNT(*) FROM {table}")
+                .format(table=source_table)
+                .as_string(),
+            )
+            total = cast("tuple[int]", cur.fetchone())[0]
+            if total == 0:
+                return
+
+        with ctx.conn.cursor() as cur:
             cur.execute(
                 sql.SQL(
                     """
@@ -148,18 +159,20 @@ def unnest(
             props = [prop[0] for prop in cur.fetchall()]
 
         prop_count = len(props)
-        if progress is not None:
-            progress.total += prop_count * 2
+        if ctx.progress is not None:
+            ctx.progress.total += prop_count * 2
 
         for prop in props:
-            with conn.cursor() as cur:
+            with ctx.conn.cursor() as cur:
                 cur.execute(
                     sql.SQL(
                         """
 WITH
     values AS (
         SELECT ldlite_system.jextract({json_col}, $1) as ld_value
-        FROM {table}
+        FROM {table} """  # noqa: S608
+                        + ctx.tablesample
+                        + """
     ),
     value_and_types AS (
         SELECT
@@ -216,7 +229,11 @@ def unnest(
     ) AS is_float
 """,
                     )
-                    .format(table=source_table, json_col=self.identifier)
+                    .format(
+                        table=source_table,
+                        json_col=self.identifier,
+                        sample=sql.Literal(min(100, 100000 // total * 100)),
+                    )
                     .as_string(),
                     (prop,),
                 )
@@ -227,13 +244,13 @@ def unnest(
                     create_columns.append(
                         meta.select_column(self.identifier, self.add(meta)),
                     )
-                    if progress is not None and meta.is_array:
-                        progress.total += 1
+                    if ctx.progress is not None and meta.is_array:
+                        ctx.progress.total += 1
 
-                if progress is not None:
-                    progress.update(1)
+                if ctx.progress is not None:
+                    ctx.progress.update(1)
 
-        with conn.cursor() as cur:
+        with ctx.conn.cursor() as cur:
             cur.execute(
                 sql.SQL(
                     """
@@ -253,8 +270,8 @@ def unnest(
                 )
                 .as_string(),
             )
-            if progress is not None:
-                progress.update(prop_count)
+            if ctx.progress is not None:
+                ctx.progress.update(prop_count)
 
     def _carryover(self) -> Iterator[str]:
         for n in self.root.descendents:
diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py
index 0bebfa3..43e5b49 100644
--- a/src/ldlite/database/_postgres.py
+++ b/src/ldlite/database/_postgres.py
@@ -205,3 +205,7 @@ def source_table_cte_stmt(self, keep_source: bool) -> str:
         if keep_source:
             return "WITH ld_source AS (SELECT * FROM {source_table})"
         return "WITH ld_source AS (DELETE FROM {source_table} RETURNING *)"
+
+    @property
+    def tablesample(self) -> str:
+        return "TABLESAMPLE BERNOULLI ({sample})"
diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py
index a9197ad..3c3d84b 100644
--- a/src/ldlite/database/_typed_database.py
+++ b/src/ldlite/database/_typed_database.py
@@ -11,7 +11,8 @@
 from psycopg import sql
 
 from . import Database, LoadHistory
-from ._expansion import ExpandContext, expand_nonmarc
+from ._expansion import expand_nonmarc
+from ._expansion.context import ExpandContext
 from ._prefix import Prefix
 
 if TYPE_CHECKING:
@@ -185,6 +186,10 @@ def preprocess_source_table(
     @abstractmethod
     def source_table_cte_stmt(self, keep_source: bool) -> str: ...
 
+    @property
+    @abstractmethod
+    def tablesample(self) -> str: ...
+
     def expand_prefix(
         self,
         prefix: str,
@@ -231,6 +236,7 @@ def expand_prefix(
                     pfx.output_table,
                     self.preprocess_source_table,  # type: ignore [arg-type]
                     self.source_table_cte_stmt,
+                    self.tablesample,
                     progress,
                 ),
             )

From b07ec9f4207e7e8ff0079053f548ed9d18dbeaa4 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 17:38:53 +0000
Subject: [PATCH 07/18] Split progress bars between scanning and transforming

---
 src/ldlite/__init__.py                     | 33 +++++++++++++++-------
 src/ldlite/database/__init__.py            |  3 +-
 src/ldlite/database/_expansion/__init__.py | 17 +++++++++--
 src/ldlite/database/_expansion/context.py  |  6 ++--
 src/ldlite/database/_expansion/nodes.py    | 21 +++++---------
 src/ldlite/database/_typed_database.py     | 10 +++++--
 6 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py
index e3b35be..9e8e757 100644
--- a/src/ldlite/__init__.py
+++ b/src/ldlite/__init__.py
@@ -350,20 +350,33 @@ def query(  # noqa: C901, PLR0912, PLR0913, PLR0915
 
         transform_started = datetime.now(timezone.utc)
         if not use_legacy_transform:
-            with tqdm(
-                total=0,
-                desc="transforming",
-                leave=False,
-                mininterval=5,
-                disable=self._quiet,
-                unit="ops",
-                delay=5,
-            ) as progress:
+            no_iters_format = (
+                "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
+            )
+            with (
+                tqdm(
+                    desc="scanning",
+                    leave=False,
+                    mininterval=5,
+                    disable=self._quiet,
+                    delay=5,
+                    bar_format=no_iters_format,
+                ) as scan_progress,
+                tqdm(
+                    desc="transforming",
+                    leave=False,
+                    mininterval=5,
+                    disable=self._quiet,
+                    delay=5,
+                    bar_format=no_iters_format,
+                ) as transform_progress,
+            ):
                 newtables = self._database.expand_prefix(
                     table,
                     json_depth,
                     keep_raw,
-                    progress,
+                    scan_progress,
+                    transform_progress,
                 )
             if keep_raw:
                 newtables = [table, *newtables]
diff --git a/src/ldlite/database/__init__.py b/src/ldlite/database/__init__.py
index 8ed5875..e1df517 100644
--- a/src/ldlite/database/__init__.py
+++ b/src/ldlite/database/__init__.py
@@ -59,7 +59,8 @@ def expand_prefix(
         prefix: str,
         json_depth: int,
         keep_raw: bool,
-        progress: tqdm[NoReturn] | None = None,
+        scan_progress: tqdm[NoReturn] | None = None,
+        transform_progress: tqdm[NoReturn] | None = None,
     ) -> list[str]:
         """Unnests and explodes the raw data at the given prefix."""
 
diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py
index a62b004..585f956 100644
--- a/src/ldlite/database/_expansion/__init__.py
+++ b/src/ldlite/database/_expansion/__init__.py
@@ -29,15 +29,25 @@ def _expand_nonmarc(
     count: int,
     ctx: ExpandContext,
 ) -> tuple[int, list[str]]:
+    ctx.scan_progress.total = (ctx.scan_progress.total or 0) + 1
+    ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1
+    ctx.transform_progress.refresh()
     initial_count = count
     ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier])
     root.unnest(
-        ctx, ctx.source_table, ctx.get_transform_table(count), ctx.source_cte(False)
+        ctx,
+        ctx.source_table,
+        ctx.get_transform_table(count),
+        ctx.source_cte(False),
     )
+    ctx.transform_progress.update(1)
 
     expand_children_of = deque([root])
     while expand_children_of:
         on = expand_children_of.popleft()
+        if ctx.transform_progress:
+            ctx.transform_progress.total += len(on.object_children)
+            ctx.transform_progress.refresh()
         for c in on.object_children:
             if len(c.parents) >= ctx.json_depth:
                 if c.parent is not None:
@@ -52,11 +62,14 @@ def _expand_nonmarc(
             )
             expand_children_of.append(c)
             count += 1
+            ctx.transform_progress.update(1)
 
     created_tables = []
 
     new_source_table = ctx.get_transform_table(count)
     arrays = root.descendents_oftype(ArrayNode)
+    ctx.transform_progress.total += len(arrays)
+    ctx.transform_progress.refresh()
     ctx.preprocess(ctx.conn, new_source_table, [a.identifier for a in arrays])
     for an in arrays:
         if len(an.parents) >= ctx.json_depth:
@@ -66,9 +79,9 @@ def _expand_nonmarc(
             new_source_table,
             ctx.get_transform_table(count + 1),
             ctx.source_cte(True),
-            ctx.progress,
         )
         count += 1
+        ctx.transform_progress.update(1)
 
         if an.meta.is_object:
             (sub_index, array_tables) = _expand_nonmarc(
diff --git a/src/ldlite/database/_expansion/context.py b/src/ldlite/database/_expansion/context.py
index 0e6cec0..257ab87 100644
--- a/src/ldlite/database/_expansion/context.py
+++ b/src/ldlite/database/_expansion/context.py
@@ -32,7 +32,8 @@ class ExpandContext:
     # source_cte will go away when DuckDB implements CTAS RETURNING
     source_cte: Callable[[bool], str]
     tablesample: str
-    progress: tqdm[NoReturn] | None
+    scan_progress: tqdm[NoReturn]
+    transform_progress: tqdm[NoReturn]
 
     def array_context(
         self,
@@ -48,5 +49,6 @@ def array_context(
             self.preprocess,
             self.source_cte,
             self.tablesample,
-            self.progress,
+            self.scan_progress,
+            self.transform_progress,
         )
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index a70383a..ab6dfec 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -1,17 +1,16 @@
 from __future__ import annotations
 
 from collections import deque
+from math import floor
 from typing import TYPE_CHECKING, TypeVar, cast
 
 from psycopg import sql
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from typing import NoReturn
 
     import duckdb
     import psycopg
-    from tqdm import tqdm
 
     from .context import ExpandContext
 
@@ -159,8 +158,8 @@ def unnest(
             props = [prop[0] for prop in cur.fetchall()]
 
         prop_count = len(props)
-        if ctx.progress is not None:
-            ctx.progress.total += prop_count * 2
+        ctx.scan_progress.total += prop_count
+        ctx.scan_progress.update(1)
 
         for prop in props:
             with ctx.conn.cursor() as cur:
@@ -232,7 +231,7 @@ def unnest(
                     .format(
                         table=source_table,
                         json_col=self.identifier,
-                        sample=sql.Literal(min(100, 100000 // total * 100)),
+                        sample=sql.Literal(min(100, floor((100000 / total) * 100))),
                     )
                     .as_string(),
                     (prop,),
@@ -244,11 +243,10 @@ def unnest(
                     create_columns.append(
                         meta.select_column(self.identifier, self.add(meta)),
                     )
-                    if ctx.progress is not None and meta.is_array:
-                        ctx.progress.total += 1
+                    if meta.is_object:
+                        ctx.scan_progress.total += 1
 
-                if ctx.progress is not None:
-                    ctx.progress.update(1)
+                ctx.scan_progress.update(1)
 
         with ctx.conn.cursor() as cur:
             cur.execute(
@@ -270,8 +268,6 @@ def unnest(
                 )
                 .as_string(),
             )
-            if ctx.progress is not None:
-                ctx.progress.update(prop_count)
 
     def _carryover(self) -> Iterator[str]:
         for n in self.root.descendents:
@@ -311,7 +307,6 @@ def explode(
         source_table: sql.Identifier,
         dest_table: sql.Identifier,
         source_cte: str,
-        progress: tqdm[NoReturn] | None,
     ) -> list[str]:
         with conn.cursor() as cur:
             o_col = self.name + "__o"
@@ -354,8 +349,6 @@ def explode(
                 )
                 .as_string(),
             )
-            if progress is not None:
-                progress.update(1)
 
         return ["__id", *self.carryover, o_col]
 
diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py
index 3c3d84b..f66b464 100644
--- a/src/ldlite/database/_typed_database.py
+++ b/src/ldlite/database/_typed_database.py
@@ -9,6 +9,7 @@
 
 import psycopg
 from psycopg import sql
+from tqdm import tqdm
 
 from . import Database, LoadHistory
 from ._expansion import expand_nonmarc
@@ -20,7 +21,6 @@
     from typing import NoReturn
 
     import duckdb
-    from tqdm import tqdm
 
 
 DB = TypeVar("DB", bound="duckdb.DuckDBPyConnection | psycopg.Connection")
@@ -195,7 +195,8 @@ def expand_prefix(
         prefix: str,
         json_depth: int,
         keep_raw: bool,
-        progress: tqdm[NoReturn] | None = None,
+        scan_progress: tqdm[NoReturn] | None = None,
+        transform_progress: tqdm[NoReturn] | None = None,
     ) -> list[str]:
         pfx = Prefix(prefix)
         with closing(self._conn_factory()) as conn:
@@ -237,7 +238,10 @@ def expand_prefix(
                     self.preprocess_source_table,  # type: ignore [arg-type]
                     self.source_table_cte_stmt,
                     self.tablesample,
-                    progress,
+                    scan_progress if scan_progress is not None else tqdm(disable=True),
+                    transform_progress
+                    if transform_progress is not None
+                    else tqdm(disable=True),
                 ),
             )
 

From c6081bf5c8709f60eefce2318481b31d21259eda Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 19:49:29 +0000
Subject: [PATCH 08/18] Remove slow jextract function in favor of more native
 functionality

---
 src/ldlite/database/_duckdb.py             | 43 ++---------
 src/ldlite/database/_expansion/metadata.py | 30 ++++++--
 src/ldlite/database/_expansion/nodes.py    | 10 ++-
 src/ldlite/database/_postgres.py           | 66 +++-------------
 tests/test_json_operators.py               | 90 +---------------------
 5 files changed, 50 insertions(+), 189 deletions(-)

diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py
index f4f6035..db48527 100644
--- a/src/ldlite/database/_duckdb.py
+++ b/src/ldlite/database/_duckdb.py
@@ -41,62 +41,33 @@ def _setup_jfuncs(conn: duckdb.DuckDBPyConnection) -> None:
     END
 ;
 
-CREATE OR REPLACE FUNCTION ldlite_system.jextract(j, p) AS
-    CASE ldlite_system.jtype_of(main.json_extract(j, p))
-        WHEN 'string' THEN
-            CASE
-                WHEN lower(main.json_extract_string(j, p)) = 'null' THEN 'null'::JSON
-                WHEN length(main.json_extract_string(j, p)) = 0 THEN 'null'::JSON
-                ELSE main.json_extract(j, p)
-            END
-        WHEN 'object' THEN
-            CASE
-                WHEN main.json_extract_string(j, p) = '{}' THEN 'null'::JSON
-                ELSE main.json_extract(j, p)
-            END
-        WHEN 'array' THEN
-            CASE
-                WHEN length(list_filter((main.json_extract(j, p))::JSON[], lambda x : x != 'null'::JSON)) = 0 THEN 'null'::JSON
-                ELSE list_filter((main.json_extract(j, p))::JSON[], lambda x : x != 'null'::JSON)
-            END
-        ELSE coalesce(main.json_extract(j, p), 'null'::JSON)
-    END
-;
-
-CREATE OR REPLACE FUNCTION ldlite_system.jextract_string(j, p) AS
-    main.json_extract_string(ldlite_system.jextract(j, p), '$')
-;
-
 CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j) AS
     unnest(main.json_keys(j))
 ;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j) AS
-    CASE ldlite_system.jtype_of(j)
-        WHEN 'string' THEN regexp_full_match(main.json_extract_string(j, '$'), '^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[1-5][a-fA-F0-9]{3}-[89abAB][a-fA-F0-9]{3}-[a-fA-F0-9]{12}$')
-        ELSE FALSE
-    END
+    regexp_full_match(main.json_extract_string(j, '$'), '(?i)^[a-f0-9]{8}-[a-f0-9]{4}-[1-5][a-f0-9]{3}-[89abAB][a-f0-9]{3}-[a-f0-9]{12}$')
 ;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_datetime(j) AS
-    CASE ldlite_system.jtype_of(j)
-        WHEN 'string' THEN regexp_full_match(main.json_extract_string(j, '$'), '^\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?$')
-        ELSE FALSE
-    END
+    regexp_full_match(main.json_extract_string(j, '$'), '^\d{4}-[01]\d-[0123]\dT[012]\d:[012345]\d:[012345]\d\.\d{3}(\+\d{2}:\d{2})?$')
 ;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_float(j) AS
-    coalesce(main.json_type(j), 'NULL') == 'DOUBLE'
+    main.json_type(j) == 'DOUBLE'
 ;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j) AS
-    j is NULL or j == 'null'::JSON
+    j IS NULL OR j == 'null'::JSON OR main.json_extract_string(j, '$') IN ('NULL', 'null', '', '{}', '[]')
 ;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jexplode(j) AS TABLE (
     SELECT value as ld_value FROM main.json_each(j)
 );
 
+CREATE OR REPLACE FUNCTION ldlite_system.jself_string(j) AS
+    main.json_extract_string(j, '$')
+;
 """,  # noqa: E501
             )
 
diff --git a/src/ldlite/database/_expansion/metadata.py b/src/ldlite/database/_expansion/metadata.py
index 15f75f7..e18b854 100644
--- a/src/ldlite/database/_expansion/metadata.py
+++ b/src/ldlite/database/_expansion/metadata.py
@@ -40,26 +40,44 @@ def select_column(
         json_col: sql.Identifier,
         alias: str,
     ) -> sql.Composed:
+        # '$' is a special character that means the root of the json
+        # I couldn't figure out how to make the array expansion work
+        # without it
         if self.is_array or self.is_object:
             stmt = sql.SQL(
-                "(ldlite_system.jextract({json_col}, {prop})) AS {alias}",
+                "{json_col}->{prop} AS {alias}"
+                if self.prop != "$"
+                else "{json_col} AS {alias}",
             )
         elif self.json_type == "number":
             stmt = sql.SQL(
-                "(ldlite_system.jextract_string({json_col}, {prop}))"
-                "::numeric AS {alias}",
+                "({json_col}->>{prop})::numeric AS {alias}"
+                if self.prop != "$"
+                else "ldlite_system.jself_string({json_col})::numeric AS {alias}",
             )
         elif self.json_type == "boolean":
             stmt = sql.SQL(
-                "(ldlite_system.jextract_string({json_col}, {prop}))::bool AS {alias}",
+                "NULLIF(NULLIF({json_col}->>{prop}, ''), 'null')::bool AS {alias}"
+                if self.prop != "$"
+                else "NULLIF(NULLIF("
+                "ldlite_system.jself_string({json_col})"
+                ", ''), 'null')::bool AS {alias}",
             )
         elif self.json_type == "string" and self.is_uuid:
             stmt = sql.SQL(
-                "(ldlite_system.jextract_string({json_col}, {prop}))::uuid AS {alias}",
+                "NULLIF(NULLIF({json_col}->>{prop}, ''), 'null')::uuid AS {alias}"
+                if self.prop != "$"
+                else "NULLIF(NULLIF("
+                "ldlite_system.jself_string({json_col})"
+                ", ''), 'null')::uuid AS {alias}",
             )
         else:
             stmt = sql.SQL(
-                "(ldlite_system.jextract_string({json_col}, {prop})) AS {alias}",
+                "NULLIF(NULLIF({json_col}->>{prop}, ''), 'null') AS {alias}"
+                if self.prop != "$"
+                else "NULLIF(NULLIF("
+                "ldlite_system.jself_string({json_col})"
+                ", ''), 'null') AS {alias}",
             )
 
         return stmt.format(
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index ab6dfec..4b9c466 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -168,7 +168,7 @@ def unnest(
                         """
 WITH
     values AS (
-        SELECT ldlite_system.jextract({json_col}, $1) as ld_value
+        SELECT {json_col}->$1 as ld_value
         FROM {table} """  # noqa: S608
                         + ctx.tablesample
                         + """
@@ -258,12 +258,14 @@ def unnest(
                     + """
 SELECT
     {cols}
-FROM ld_source;
+FROM ld_source
+WHERE NOT ldlite_system.jis_null({json_col})
 """,
                 )
                 .format(
                     source_table=source_table,
                     dest_table=dest_table,
+                    json_col=self.identifier,
                     cols=sql.SQL("\n    ,").join(create_columns),
                 )
                 .as_string(),
@@ -275,7 +277,7 @@ def _carryover(self) -> Iterator[str]:
                 yield n.name
             else:
                 yield n.name
-            yield from n.values
+            yield from [v for v in n.values if v != "jsonb"]
 
     @property
     def carryover(self) -> list[str]:
@@ -354,7 +356,7 @@ def explode(
 
     def _carryover(self) -> Iterator[str]:
         for n in reversed(self.parents):
-            yield from [v for v in n.values if v != "__id"]
+            yield from [v for v in n.values if v not in ("__id", "jsonb")]
 
     @property
     def carryover(self) -> list[str]:
diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py
index 43e5b49..ae23631 100644
--- a/src/ldlite/database/_postgres.py
+++ b/src/ldlite/database/_postgres.py
@@ -41,62 +41,6 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 IMMUTABLE
 PARALLEL SAFE;
 
-CREATE OR REPLACE FUNCTION ldlite_system.jextract(j JSONB, p TEXT) RETURNS JSONB AS $$
-SELECT
-    CASE jsonb_typeof(val)
-        WHEN 'string' THEN
-            CASE
-                WHEN lower(val #>> '{}') IN ('null', '') THEN 'null'::jsonb
-                ELSE val
-            END
-        WHEN 'array' THEN
-            CASE
-                WHEN jsonb_array_length(val) = 0 THEN 'null'::jsonb
-                WHEN NOT EXISTS (
-                    SELECT 1
-                    FROM jsonb_array_elements(val) AS e(elem)
-                    WHERE elem = 'null'::jsonb
-                    LIMIT 1
-                ) THEN val
-                ELSE COALESCE(
-                    (
-                        SELECT jsonb_agg(e)
-                        FROM jsonb_array_elements(val) AS a(e)
-                        WHERE e <> 'null'::jsonb
-                    ),
-                    'null'::jsonb
-                )
-            END
-        WHEN 'object' THEN
-            CASE
-                WHEN val = '{}'::jsonb THEN 'null'::jsonb
-                ELSE val
-            END
-        ELSE val
-    END
-FROM (
-    -- This is somewhat of a hack.
-    -- There isn't a really good way to get the element unchanged
-    -- which works for duckdb and postgres AND CRUCIALLY
-    -- has a similar syntax to everything else so that we don't
-    -- have to have special cases for exploding the array and it
-    -- can share all the same type checking / statement generation code.
-    -- We're pretending that postgres supports -> '$' style syntax like duckdb.
-    SELECT CASE WHEN p = '$' THEN j ELSE j->p END AS val
-) s;
-$$
-LANGUAGE sql
-IMMUTABLE
-PARALLEL SAFE
-STRICT;
-
-CREATE OR REPLACE FUNCTION ldlite_system.jextract_string(j JSONB, p TEXT) RETURNS TEXT AS $$
-SELECT ldlite_system.jextract(j, p) #>> '{}'
-$$
-LANGUAGE sql
-IMMUTABLE
-PARALLEL SAFE;
-
 CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j JSONB) RETURNS SETOF TEXT AS $$
 SELECT jsonb_object_keys(j);
 $$
@@ -129,7 +73,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 STRICT;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_null(j JSONB) RETURNS BOOLEAN AS $$
-SELECT COALESCE(j = 'null'::jsonb, TRUE);
+SELECT j IS NULL OR j = 'null'::jsonb OR j #>> '{}' IN ('NULL', 'null', '', '{}', '[]')
 $$
 LANGUAGE sql
 IMMUTABLE
@@ -141,6 +85,14 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 LANGUAGE sql
 IMMUTABLE
 PARALLEL SAFE;
+
+
+CREATE OR REPLACE FUNCTION ldlite_system.jself_string(j JSONB) RETURNS TEXT AS $$
+SELECT j #>> '{}'
+$$
+LANGUAGE sql
+IMMUTABLE
+PARALLEL SAFE;
 """,  # noqa: E501
             )
 
diff --git a/tests/test_json_operators.py b/tests/test_json_operators.py
index 2ddfe13..987a8e0 100644
--- a/tests/test_json_operators.py
+++ b/tests/test_json_operators.py
@@ -27,88 +27,6 @@ class JsonTC:
     assertion_params: tuple[Any, ...]
 
 
-@parametrize(
-    p=[
-        ("str", '"str_val"'),
-        ("str_empty", "null"),
-        ("num", "12"),
-        ("float", "16.3"),
-        ("bool", "true"),
-        ("obj", '{"k1":"v1","k2":"v2"}'),
-        ("obj_some", '{"k1":"v1","k2":null}'),
-        ("obj_empty", "null"),
-        ("arr_zero", "null"),
-        ("na", "null"),
-        ("na_str1", "null"),
-        ("na_str2", "null"),
-    ],
-)
-def case_jextract(p: tuple[Any, ...]) -> JsonTC:
-    return JsonTC(
-        """SELECT ldlite_system.jextract(jc, $1){assertion} FROM j;""",
-        p[:1],
-        """= $2::{jtype}""",
-        p[1:],
-    )
-
-
-# Duckdb through 1.3 and 1.4 have different json comparison behavior here
-# Whitespace matters in 1.4 and not 1.3
-# This makes the arrays text and compares the values as a workaround
-@parametrize(
-    p=[
-        ("arr_str", '["s1", "s2", "s3"]'),
-        ("arr_str_some", '["s1", "s2"]'),
-        ("arr_obj_some", '[{"k1":"v1"}]'),
-    ],
-)
-def case_jextract_duckdb(p: tuple[Any, ...]) -> JsonTC:
-    return JsonTC(
-        """SELECT ldlite_system.jextract(jc, $1){assertion} FROM j;""",
-        p[:1],
-        """::text[] = $2::JSON::text[]""",
-        p[1:],
-    )
-
-
-# The differences betweeen postgres/duckdb here only matters for tests
-# This can all be rectified when duckdb 1.4 is the minimum version
-@parametrize(
-    p=[
-        ("arr_str", '["s1", "s2", "s3"]'),
-        ("arr_str_some", '["s1", "s2"]'),
-        ("arr_obj_some", '[{"k1":"v1"}]'),
-    ],
-)
-def case_jextract_postgres(p: tuple[Any, ...]) -> JsonTC:
-    return JsonTC(
-        """SELECT ldlite_system.jextract(jc, $1){assertion} FROM j;""",
-        p[:1],
-        """ = $2::JSONB""",
-        p[1:],
-    )
-
-
-@parametrize(
-    p=[
-        ("str", "str_val"),
-        ("num", "12"),
-        ("float", "16.3"),
-        ("bool", "true"),
-        ("na",),
-        ("na_str1",),
-        ("na_str2",),
-    ],
-)
-def case_jextract_string(p: tuple[Any, ...]) -> JsonTC:
-    return JsonTC(
-        """SELECT ldlite_system.jextract_string(jc, $1){assertion} FROM j;""",
-        p[:1],
-        """ = $2""" if len(p) == 2 else """ IS NULL""",
-        p[1:],
-    )
-
-
 def case_jobject_keys() -> JsonTC:
     return JsonTC(
         """
@@ -197,10 +115,10 @@ def case_jis_uuid(p: tuple[Any, ...]) -> JsonTC:
 @parametrize(
     p=[
         ("na", True),
-        ("obj_empty", False),
-        ("arr_zero", False),
-        ("na_str1", False),
-        ("na_str2", False),
+        ("obj_empty", True),
+        ("arr_zero", True),
+        ("na_str1", True),
+        ("na_str2", True),
     ],
 )
 def case_jis_null(p: tuple[Any, ...]) -> JsonTC:

From 8d9a61cf11aace6ba7a9f07925949593a950d78a Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 20:32:47 +0000
Subject: [PATCH 09/18] Find all possible keys in json

---
 src/ldlite/database/_expansion/nodes.py | 33 +++++++++++++++++++------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 4b9c466..916fae2 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -144,15 +144,34 @@ def unnest(
             cur.execute(
                 sql.SQL(
                     """
-SELECT ldlite_system.jobject_keys(j) FROM (
-    SELECT {json_col} AS j
-    FROM {table}
-    WHERE NOT ldlite_system.jis_null({json_col})
-    LIMIT 1
-)
+WITH
+    all_objects AS (
+        SELECT {json_col} AS j
+        FROM {source_table}
+        WHERE NOT ldlite_system.jis_null({json_col})
+    ),
+    first_object AS (
+        SELECT * FROM all_objects
+        LIMIT 1
+    ),
+    first_keys AS (
+        SELECT ldlite_system.jobject_keys(j) AS k
+        FROM first_object
+    ),
+    ordered_first_keys AS (
+        SELECT (ROW_NUMBER() OVER (ORDER BY (SELECT NULL))) AS idx, k
+        FROM first_keys
+    ), all_keys AS (
+        SELECT DISTINCT ldlite_system.jobject_keys(j) k
+        FROM all_objects
+    )
+SELECT ak.k
+FROM all_keys ak
+LEFT JOIN ordered_first_keys ofk ON ak.k = ofk.k
+ORDER BY ofk.idx NULLS LAST
 """,
                 )
-                .format(table=source_table, json_col=self.identifier)
+                .format(source_table=source_table, json_col=self.identifier)
                 .as_string(),
             )
             props = [prop[0] for prop in cur.fetchall()]

From b63f7e26221a705bd1cfbbab73e8b079faf8bc2c Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 20:37:36 +0000
Subject: [PATCH 10/18] Remove sampling

---
 src/ldlite/database/_duckdb.py            | 4 ----
 src/ldlite/database/_expansion/context.py | 2 --
 src/ldlite/database/_expansion/nodes.py   | 4 +---
 src/ldlite/database/_postgres.py          | 4 ----
 src/ldlite/database/_typed_database.py    | 5 -----
 5 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py
index db48527..4754e52 100644
--- a/src/ldlite/database/_duckdb.py
+++ b/src/ldlite/database/_duckdb.py
@@ -105,10 +105,6 @@ def ingest_records(
     def source_table_cte_stmt(self, keep_source: bool) -> str:  # noqa: ARG002
         return "WITH ld_source AS (SELECT * FROM {source_table})"
 
-    @property
-    def tablesample(self) -> str:
-        return "USING SAMPLE {sample} PERCENT (bernoulli)"
-
 
 # DuckDB has some strong opinions about cursors that are different than postgres
 # https://github.com/duckdb/duckdb/issues/11018
diff --git a/src/ldlite/database/_expansion/context.py b/src/ldlite/database/_expansion/context.py
index 257ab87..7b9e29a 100644
--- a/src/ldlite/database/_expansion/context.py
+++ b/src/ldlite/database/_expansion/context.py
@@ -31,7 +31,6 @@ class ExpandContext:
     ]
     # source_cte will go away when DuckDB implements CTAS RETURNING
     source_cte: Callable[[bool], str]
-    tablesample: str
     scan_progress: tqdm[NoReturn]
     transform_progress: tqdm[NoReturn]
 
@@ -48,7 +47,6 @@ def array_context(
             self.get_output_table,
             self.preprocess,
             self.source_cte,
-            self.tablesample,
             self.scan_progress,
             self.transform_progress,
         )
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 916fae2..dc83a2e 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -188,9 +188,7 @@ def unnest(
 WITH
     values AS (
         SELECT {json_col}->$1 as ld_value
-        FROM {table} """  # noqa: S608
-                        + ctx.tablesample
-                        + """
+        FROM {table}
     ),
     value_and_types AS (
         SELECT
diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py
index ae23631..4d2f19a 100644
--- a/src/ldlite/database/_postgres.py
+++ b/src/ldlite/database/_postgres.py
@@ -157,7 +157,3 @@ def source_table_cte_stmt(self, keep_source: bool) -> str:
         if keep_source:
             return "WITH ld_source AS (SELECT * FROM {source_table})"
         return "WITH ld_source AS (DELETE FROM {source_table} RETURNING *)"
-
-    @property
-    def tablesample(self) -> str:
-        return "TABLESAMPLE BERNOULLI ({sample})"
diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py
index f66b464..a3f4eb4 100644
--- a/src/ldlite/database/_typed_database.py
+++ b/src/ldlite/database/_typed_database.py
@@ -186,10 +186,6 @@ def preprocess_source_table(
     @abstractmethod
     def source_table_cte_stmt(self, keep_source: bool) -> str: ...
 
-    @property
-    @abstractmethod
-    def tablesample(self) -> str: ...
-
     def expand_prefix(
         self,
         prefix: str,
@@ -237,7 +233,6 @@ def expand_prefix(
                     pfx.output_table,
                     self.preprocess_source_table,  # type: ignore [arg-type]
                     self.source_table_cte_stmt,
-                    self.tablesample,
                     scan_progress if scan_progress is not None else tqdm(disable=True),
                     transform_progress
                     if transform_progress is not None

From c23c6af974aec775dde7757556258413e0fbdac3 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 20:48:48 +0000
Subject: [PATCH 11/18] Actually drop the tables in the prefix

---
 src/ldlite/database/_typed_database.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py
index a3f4eb4..2ddd2b3 100644
--- a/src/ldlite/database/_typed_database.py
+++ b/src/ldlite/database/_typed_database.py
@@ -136,7 +136,7 @@ def _drop_extracted_tables(
             for (et,) in tables:
                 cur.execute(
                     sql.SQL("DROP TABLE IF EXISTS {table};")
-                    .format(table=sql.Identifier(cast("str", et)))
+                    .format(table=sql.Identifier(*cast("str", et).split(".")))
                     .as_string(),
                 )
             cur.execute(

From 7d04ee30c0f9e238034483d6bb08c4fee73652f0 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Fri, 6 Mar 2026 20:48:54 +0000
Subject: [PATCH 12/18] Dont carry over jsonb column

---
 src/ldlite/database/_expansion/__init__.py | 1 -
 src/ldlite/database/_expansion/nodes.py    | 6 ++----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py
index 585f956..5ab4e47 100644
--- a/src/ldlite/database/_expansion/__init__.py
+++ b/src/ldlite/database/_expansion/__init__.py
@@ -31,7 +31,6 @@ def _expand_nonmarc(
 ) -> tuple[int, list[str]]:
     ctx.scan_progress.total = (ctx.scan_progress.total or 0) + 1
     ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1
-    ctx.transform_progress.refresh()
     initial_count = count
     ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier])
     root.unnest(
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index dc83a2e..695649d 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -290,11 +290,9 @@ def unnest(
 
     def _carryover(self) -> Iterator[str]:
         for n in self.root.descendents:
-            if isinstance(n, ObjectNode) and not n.unnested:
+            if (isinstance(n, ObjectNode) and not n.unnested) or n.name != "jsonb":
                 yield n.name
-            else:
-                yield n.name
-            yield from [v for v in n.values if v != "jsonb"]
+            yield from n.values
 
     @property
     def carryover(self) -> list[str]:

From f619eb62e58711306c1fd2b4f7f126c8e76df37c Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Mon, 9 Mar 2026 16:34:35 +0000
Subject: [PATCH 13/18] Simplify the query to get property names

---
 src/ldlite/database/_duckdb.py          |  4 +--
 src/ldlite/database/_expansion/nodes.py | 38 +++++++++++--------------
 src/ldlite/database/_postgres.py        |  2 +-
 tests/test_json_operators.py            |  5 +++-
 4 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/ldlite/database/_duckdb.py b/src/ldlite/database/_duckdb.py
index 4754e52..1ca8508 100644
--- a/src/ldlite/database/_duckdb.py
+++ b/src/ldlite/database/_duckdb.py
@@ -41,8 +41,8 @@ def _setup_jfuncs(conn: duckdb.DuckDBPyConnection) -> None:
     END
 ;
 
-CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j) AS
-    unnest(main.json_keys(j))
+CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j) AS TABLE
+    SELECT je.key as ld_key FROM json_each(j) je ORDER BY je.id
 ;
 
 CREATE OR REPLACE FUNCTION ldlite_system.jis_uuid(j) AS
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 695649d..1a4c631 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -145,30 +145,24 @@ def unnest(
                 sql.SQL(
                     """
 WITH
-    all_objects AS (
-        SELECT {json_col} AS j
-        FROM {source_table}
-        WHERE NOT ldlite_system.jis_null({json_col})
-    ),
-    first_object AS (
-        SELECT * FROM all_objects
-        LIMIT 1
-    ),
-    first_keys AS (
-        SELECT ldlite_system.jobject_keys(j) AS k
-        FROM first_object
+    keys AS (
+        SELECT
+            keys.ld_key AS k
+            ,ROW_NUMBER() OVER (PARTITION BY t.__id) AS idx
+        FROM {source_table} t, ldlite_system.jobject_keys(t.{json_col}) keys
+        WHERE {json_col} IS NOT NULL AND ldlite_system.jtype_of(t.{json_col}) = 'object'
     ),
-    ordered_first_keys AS (
-        SELECT (ROW_NUMBER() OVER (ORDER BY (SELECT NULL))) AS idx, k
-        FROM first_keys
-    ), all_keys AS (
-        SELECT DISTINCT ldlite_system.jobject_keys(j) k
-        FROM all_objects
+    ordered_keys AS (
+        SELECT
+            k
+            ,MAX(idx) idx
+            ,COUNT(idx) freq
+        FROM keys
+        GROUP BY k
     )
-SELECT ak.k
-FROM all_keys ak
-LEFT JOIN ordered_first_keys ofk ON ak.k = ofk.k
-ORDER BY ofk.idx NULLS LAST
+SELECT k
+FROM ordered_keys
+ORDER BY idx, freq DESC
 """,
                 )
                 .format(source_table=source_table, json_col=self.identifier)
diff --git a/src/ldlite/database/_postgres.py b/src/ldlite/database/_postgres.py
index 4d2f19a..2b51f9c 100644
--- a/src/ldlite/database/_postgres.py
+++ b/src/ldlite/database/_postgres.py
@@ -41,7 +41,7 @@ def _setup_jfuncs(conn: psycopg.Connection) -> None:
 IMMUTABLE
 PARALLEL SAFE;
 
-CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j JSONB) RETURNS SETOF TEXT AS $$
+CREATE OR REPLACE FUNCTION ldlite_system.jobject_keys(j JSONB) RETURNS TABLE (ld_key TEXT) AS $$
 SELECT jsonb_object_keys(j);
 $$
 LANGUAGE sql
diff --git a/tests/test_json_operators.py b/tests/test_json_operators.py
index 987a8e0..8b1f8c8 100644
--- a/tests/test_json_operators.py
+++ b/tests/test_json_operators.py
@@ -33,7 +33,10 @@ def case_jobject_keys() -> JsonTC:
 {assertion}
 (SELECT e.jkey, a.jkey
 FROM (SELECT 'k1' jkey UNION SELECT 'k2' jkey) as e
-FULL OUTER JOIN (SELECT ldlite_system.jobject_keys(jc->'obj') jkey FROM j) as a
+FULL OUTER JOIN (
+    SELECT k.ld_key as jkey
+    FROM j, ldlite_system.jobject_keys(j.jc->'obj') k
+) as a
     USING (jkey)
 WHERE e.jkey IS NULL or a.jkey IS NULL) as q;""",
         (),

From 00775d17b286297cd72f2bc25527256000496474 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Mon, 9 Mar 2026 20:06:31 +0000
Subject: [PATCH 14/18] Ignore empty source tables

---
 src/ldlite/database/_expansion/__init__.py |  4 +++-
 src/ldlite/database/_expansion/nodes.py    |  6 ++++--
 src/ldlite/database/_typed_database.py     | 15 ++++++++-------
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py
index 5ab4e47..6388a3d 100644
--- a/src/ldlite/database/_expansion/__init__.py
+++ b/src/ldlite/database/_expansion/__init__.py
@@ -33,13 +33,15 @@ def _expand_nonmarc(
     ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1
     initial_count = count
     ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier])
-    root.unnest(
+    has_rows = root.unnest(
         ctx,
         ctx.source_table,
         ctx.get_transform_table(count),
         ctx.source_cte(False),
     )
     ctx.transform_progress.update(1)
+    if not has_rows:
+        return (0, [])
 
     expand_children_of = deque([root])
     while expand_children_of:
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index 1a4c631..dc3dfb6 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -124,7 +124,7 @@ def unnest(
         source_table: sql.Identifier,
         dest_table: sql.Identifier,
         source_cte: str,
-    ) -> None:
+    ) -> bool:
         self.unnested = True
         create_columns: list[sql.Composable] = [
             sql.Identifier(v) for v in self.carryover
@@ -138,7 +138,7 @@ def unnest(
             )
             total = cast("tuple[int]", cur.fetchone())[0]
             if total == 0:
-                return
+                return False
 
         with ctx.conn.cursor() as cur:
             cur.execute(
@@ -282,6 +282,8 @@ def unnest(
                 .as_string(),
             )
 
+        return True
+
     def _carryover(self) -> Iterator[str]:
         for n in self.root.descendents:
             if (isinstance(n, ObjectNode) and not n.unnested) or n.name != "jsonb":
diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py
index 2ddd2b3..efaa57c 100644
--- a/src/ldlite/database/_typed_database.py
+++ b/src/ldlite/database/_typed_database.py
@@ -252,14 +252,15 @@ def expand_prefix(
                     .format(catalog_table=pfx.catalog_table.id)
                     .as_string(),
                 )
-                cur.executemany(
-                    sql.SQL("INSERT INTO {catalog_table} VALUES ($1)")
-                    .format(
-                        catalog_table=pfx.catalog_table.id,
+                if len(created_tables) > 0:
+                    cur.executemany(
+                        sql.SQL("INSERT INTO {catalog_table} VALUES ($1)")
+                        .format(
+                            catalog_table=pfx.catalog_table.id,
+                        )
+                        .as_string(),
+                        [(pfx.catalog_table_row(t),) for t in created_tables],
                     )
-                    .as_string(),
-                    [(pfx.catalog_table_row(t),) for t in created_tables],
-                )
 
             conn.commit()
 

From 523bd2c01db1cc2fd46fecfcfa658cb441d5e6fd Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Mon, 9 Mar 2026 20:33:23 +0000
Subject: [PATCH 15/18] Better display of iterators during transform

---
 src/ldlite/__init__.py                     | 12 +++++-------
 src/ldlite/database/_expansion/__init__.py |  2 ++
 src/ldlite/database/_expansion/nodes.py    |  2 ++
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py
index 9e8e757..1675fc7 100644
--- a/src/ldlite/__init__.py
+++ b/src/ldlite/__init__.py
@@ -356,18 +356,12 @@ def query(  # noqa: C901, PLR0912, PLR0913, PLR0915
             with (
                 tqdm(
                     desc="scanning",
-                    leave=False,
-                    mininterval=5,
                     disable=self._quiet,
-                    delay=5,
                     bar_format=no_iters_format,
                 ) as scan_progress,
                 tqdm(
                     desc="transforming",
-                    leave=False,
-                    mininterval=5,
                     disable=self._quiet,
-                    delay=5,
                     bar_format=no_iters_format,
                 ) as transform_progress,
             ):
@@ -382,7 +376,11 @@ def query(  # noqa: C901, PLR0912, PLR0913, PLR0915
                 newtables = [table, *newtables]
             transform_elapsed = datetime.now(timezone.utc) - transform_started
 
-            with tqdm(desc="indexing", disable=self._quiet) as progress:
+            with tqdm(
+                desc="indexing",
+                disable=self._quiet,
+                bar_format=no_iters_format,
+            ) as progress:
                 index_started = datetime.now(timezone.utc)
                 self._database.index_prefix(table, progress)
 
diff --git a/src/ldlite/database/_expansion/__init__.py b/src/ldlite/database/_expansion/__init__.py
index 6388a3d..ae78c48 100644
--- a/src/ldlite/database/_expansion/__init__.py
+++ b/src/ldlite/database/_expansion/__init__.py
@@ -30,7 +30,9 @@ def _expand_nonmarc(
     ctx: ExpandContext,
 ) -> tuple[int, list[str]]:
     ctx.scan_progress.total = (ctx.scan_progress.total or 0) + 1
+    ctx.scan_progress.refresh()
     ctx.transform_progress.total = (ctx.transform_progress.total or 0) + 1
+    ctx.transform_progress.refresh()
     initial_count = count
     ctx.preprocess(ctx.conn, ctx.source_table, [root.identifier])
     has_rows = root.unnest(
diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index dc3dfb6..afb2ca9 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -172,6 +172,7 @@ def unnest(
 
         prop_count = len(props)
         ctx.scan_progress.total += prop_count
+        ctx.scan_progress.refresh()
         ctx.scan_progress.update(1)
 
         for prop in props:
@@ -256,6 +257,7 @@ def unnest(
                     )
                     if meta.is_object:
                         ctx.scan_progress.total += 1
+                        ctx.scan_progress.refresh()
 
                 ctx.scan_progress.update(1)
 

From 76d92b206c5c515e17fb1331ae78fdc624e09d78 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Tue, 10 Mar 2026 13:05:29 +0000
Subject: [PATCH 16/18] Actually fix the indexing

---
 environment.yaml                       |  2 +-
 src/ldlite/database/_typed_database.py |  8 ++--
 tests/test_query.py                    | 66 +++++++++++++-------------
 3 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/environment.yaml b/environment.yaml
index 22c24ca..ea467a4 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -7,4 +7,4 @@ dependencies:
   - python>=3.10,<3.11
   - pdm==2.26.6
   - precious==0.10.2
-  - libpq>=13.0
+  - libpq>=14.0
diff --git a/src/ldlite/database/_typed_database.py b/src/ldlite/database/_typed_database.py
index efaa57c..1303931 100644
--- a/src/ldlite/database/_typed_database.py
+++ b/src/ldlite/database/_typed_database.py
@@ -286,10 +286,10 @@ def index_prefix(self, prefix: str, progress: tqdm[NoReturn] | None = None) -> N
                 cur.execute(
                     sql.SQL(
                         r"""
-SELECT TABLE_NAME, COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
+SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
 WHERE
     TABLE_SCHEMA = $1 AND
-    TABLE_NAME IN (SELECT TABLE_NAME FROM {catalog}) AND
+    TABLE_NAME IN (SELECT SPLIT_PART(TABLE_NAME, '.', -1) FROM {catalog}) AND
     (
         DATA_TYPE IN ('UUID', 'uuid') OR
         COLUMN_NAME = 'id' OR
@@ -313,8 +313,8 @@ def index_prefix(self, prefix: str, progress: tqdm[NoReturn] | None = None) -> N
                         sql.SQL("CREATE INDEX {name} ON {table} ({column});")
                         .format(
                             name=sql.Identifier(str(uuid4()).split("-")[0]),
-                            table=sql.Identifier(*index[0].split(".")),
-                            column=sql.Identifier(index[1]),
+                            table=sql.Identifier(index[0], index[1]),
+                            column=sql.Identifier(index[2]),
                         )
                         .as_string(),
                     )
diff --git a/tests/test_query.py b/tests/test_query.py
index 41ad44e..8993e7d 100644
--- a/tests/test_query.py
+++ b/tests/test_query.py
@@ -31,7 +31,7 @@ class QueryTC(MockedResponseTestCase):
 def case_one_table(json_depth: int) -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=json_depth,
             returns={
                 "purchaseOrders": [
@@ -48,7 +48,7 @@ def case_one_table(json_depth: int) -> QueryTC:
                 ["id", "value"],
                 [("b096504a-3d54-4664-9bf5-1b872466fd66", "value")],
             ),
-            "prefix__tcatalog": (["table_name"], [("prefix__t",)]),
+            "prefix__tcatalog": (["table_name"], [("tests.prefix__t",)]),
         },
         expected_indexes=[
             ("prefix__t", "id"),
@@ -60,7 +60,7 @@ def case_one_table(json_depth: int) -> QueryTC:
 def case_two_tables(json_depth: int) -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=json_depth,
             returns={
                 "purchaseOrders": [
@@ -109,7 +109,7 @@ def case_two_tables(json_depth: int) -> QueryTC:
             ),
             "prefix__tcatalog": (
                 ["table_name"],
-                [("prefix__t",), ("prefix__t__sub_objects",)],
+                [("tests.prefix__t",), ("tests.prefix__t__sub_objects",)],
             ),
         },
         expected_indexes=[
@@ -124,7 +124,7 @@ def case_two_tables(json_depth: int) -> QueryTC:
 def case_table_no_expansion(json_depth: int) -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=json_depth,
             returns={
                 "purchaseOrders": [
@@ -149,7 +149,7 @@ def case_table_no_expansion(json_depth: int) -> QueryTC:
 def case_table_underexpansion() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=2,
             returns={
                 "purchaseOrders": [
@@ -194,7 +194,7 @@ def case_table_underexpansion() -> QueryTC:
             ),
             "prefix__tcatalog": (
                 ["table_name"],
-                [("prefix__t",), ("prefix__t__sub_objects",)],
+                [("tests.prefix__t",), ("tests.prefix__t__sub_objects",)],
             ),
         },
     )
@@ -204,7 +204,7 @@ def case_table_underexpansion() -> QueryTC:
 def case_three_tables(json_depth: int) -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=json_depth,
             returns={
                 "purchaseOrders": [
@@ -254,9 +254,9 @@ def case_three_tables(json_depth: int) -> QueryTC:
             "prefix__tcatalog": (
                 ["table_name"],
                 [
-                    ("prefix__t",),
-                    ("prefix__t__sub_objects",),
-                    ("prefix__t__sub_objects__sub_sub_objects",),
+                    ("tests.prefix__t",),
+                    ("tests.prefix__t__sub_objects",),
+                    ("tests.prefix__t__sub_objects__sub_sub_objects",),
                 ],
             ),
         },
@@ -277,7 +277,7 @@ def case_three_tables(json_depth: int) -> QueryTC:
 def case_nested_object() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=2,
             returns={
                 "purchaseOrders": [
@@ -307,7 +307,7 @@ def case_nested_object() -> QueryTC:
             ),
             "prefix__tcatalog": (
                 ["table_name"],
-                [("prefix__t",)],
+                [("tests.prefix__t",)],
             ),
         },
         expected_indexes=[
@@ -320,7 +320,7 @@ def case_nested_object() -> QueryTC:
 def case_doubly_nested_object() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=3,
             returns={
                 "purchaseOrders": [
@@ -361,7 +361,7 @@ def case_doubly_nested_object() -> QueryTC:
             ),
             "prefix__tcatalog": (
                 ["table_name"],
-                [("prefix__t",)],
+                [("tests.prefix__t",)],
             ),
         },
         expected_indexes=[
@@ -375,7 +375,7 @@ def case_doubly_nested_object() -> QueryTC:
 def case_nested_object_underexpansion() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=1,
             returns={
                 "purchaseOrders": [
@@ -405,7 +405,7 @@ def case_nested_object_underexpansion() -> QueryTC:
             ),
             "prefix__tcatalog": (
                 ["table_name"],
-                [("prefix__t",)],
+                [("tests.prefix__t",)],
             ),
         },
     )
@@ -414,7 +414,7 @@ def case_nested_object_underexpansion() -> QueryTC:
 def case_tables_and_lists() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=3,
             returns={
                 "purchaseOrders": [
@@ -502,9 +502,9 @@ def case_tables_and_lists() -> QueryTC:
             "prefix__tcatalog": (
                 ["table_name"],
                 [
-                    ("prefix__t",),
-                    ("prefix__t__sub_object__sub_objects",),
-                    ("prefix__t__sub_objects",),
+                    ("tests.prefix__t",),
+                    ("tests.prefix__t__sub_object__sub_objects",),
+                    ("tests.prefix__t__sub_objects",),
                 ],
             ),
         },
@@ -518,7 +518,7 @@ def case_tables_and_lists() -> QueryTC:
 def case_id_generation() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=4,
             returns={
                 "purchaseOrders": [
@@ -591,7 +591,7 @@ def case_id_generation() -> QueryTC:
 def case_indexing_id_like() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=4,
             returns={
                 "purchaseOrders": [
@@ -623,7 +623,7 @@ def case_indexing_id_like() -> QueryTC:
 def case_drop_raw(json_depth: int) -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=json_depth,
             keep_raw=False,
             returns={
@@ -641,7 +641,7 @@ def case_drop_raw(json_depth: int) -> QueryTC:
                 ["id", "value"],
                 [("b096504a-3d54-4664-9bf5-1b872466fd66", "value")],
             ),
-            "prefix__tcatalog": (["table_name"], [("prefix__t",)]),
+            "prefix__tcatalog": (["table_name"], [("tests.prefix__t",)]),
         },
         expected_indexes=[
             ("prefix__t", "id"),
@@ -654,7 +654,7 @@ def case_drop_raw(json_depth: int) -> QueryTC:
 def case_null_records() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=1,
             returns={
                 "purchaseOrders": [
@@ -677,7 +677,7 @@ def case_null_records() -> QueryTC:
 def case_erm_keys() -> QueryTC:
     return QueryTC(
         Call(
-            "prefix",
+            "tests.prefix",
             json_depth=3,
             returns={
                 "pageSize": 30,
@@ -699,7 +699,7 @@ def case_erm_keys() -> QueryTC:
                 ["id", "value"],
                 [("b096504a-3d54-4664-9bf5-1b872466fd66", "value")],
             ),
-            "prefix__tcatalog": (["table_name"], [("prefix__t",)]),
+            "prefix__tcatalog": (["table_name"], [("tests.prefix__t",)]),
         },
         expected_indexes=[
             ("prefix__t", "id"),
@@ -732,7 +732,6 @@ def _act(uut: "ldlite.LDLite", tc: QueryTC) -> None:
 
 def _assert(
     conn: "dbapi.DBAPIConnection",
-    res_schema: str,  # TODO: have schema be part of tc
     tc: QueryTC,
 ) -> None:
     with closing(conn.cursor()) as cur:
@@ -740,9 +739,8 @@ def _assert(
             """
                 SELECT table_name
                 FROM information_schema.tables
-                WHERE table_schema=$1
+                WHERE table_schema='tests'
                 """,
-            (res_schema,),
         )
         assert sorted([r[0] for r in cur.fetchall()]) == sorted(tc.expected_tables)
 
@@ -758,7 +756,7 @@ def _assert(
                             for c in cols
                         ],
                     ),
-                    table=sql.Identifier(table),
+                    table=sql.Identifier("tests", table),
                 )
                 .as_string(),
             )
@@ -820,7 +818,7 @@ def test_duckdb(
     _act(uut, tc)
 
     with duckdb.connect(dsn) as conn:
-        _assert(cast("dbapi.DBAPIConnection", conn), "main", tc)
+        _assert(cast("dbapi.DBAPIConnection", conn), tc)
 
 
 @mock.patch("httpx_folio.auth.httpx.post")
@@ -842,4 +840,4 @@ def test_postgres(
     _act(uut, tc)
 
     with psycopg.connect(dsn, cursor_factory=psycopg.RawCursor) as conn:
-        _assert(cast("dbapi.DBAPIConnection", conn), "public", tc)
+        _assert(cast("dbapi.DBAPIConnection", conn), tc)

From 9936f850aa234573a71341d293d0215a72e7f1d8 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Tue, 10 Mar 2026 14:03:43 +0000
Subject: [PATCH 17/18] Dont keep tqdm progress bars around

---
 src/ldlite/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ldlite/__init__.py b/src/ldlite/__init__.py
index 1675fc7..3d9d3ef 100644
--- a/src/ldlite/__init__.py
+++ b/src/ldlite/__init__.py
@@ -336,6 +336,7 @@ def query(  # noqa: C901, PLR0912, PLR0913, PLR0915
                 tqdm(
                     records,
                     desc="downloading",
+                    leave=False,
                     total=total_records,
                     mininterval=5,
                     disable=self._quiet,
@@ -356,11 +357,13 @@ def query(  # noqa: C901, PLR0912, PLR0913, PLR0915
             with (
                 tqdm(
                     desc="scanning",
+                    leave=False,
                     disable=self._quiet,
                     bar_format=no_iters_format,
                 ) as scan_progress,
                 tqdm(
                     desc="transforming",
+                    leave=False,
                     disable=self._quiet,
                     bar_format=no_iters_format,
                 ) as transform_progress,
@@ -378,6 +381,7 @@ def query(  # noqa: C901, PLR0912, PLR0913, PLR0915
 
             with tqdm(
                 desc="indexing",
+                leave=False,
                 disable=self._quiet,
                 bar_format=no_iters_format,
             ) as progress:

From 3fa7cc9cd9016c918c18368c5c6cc7f20310fc78 Mon Sep 17 00:00:00 2001
From: Katherine Bargar <kbargar@fivecolleges.edu>
Date: Tue, 10 Mar 2026 16:40:36 +0000
Subject: [PATCH 18/18] Be more specific about when to include the object
 column name

---
 src/ldlite/database/_expansion/nodes.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ldlite/database/_expansion/nodes.py b/src/ldlite/database/_expansion/nodes.py
index afb2ca9..75ea842 100644
--- a/src/ldlite/database/_expansion/nodes.py
+++ b/src/ldlite/database/_expansion/nodes.py
@@ -288,7 +288,9 @@ def unnest(
 
     def _carryover(self) -> Iterator[str]:
         for n in self.root.descendents:
-            if (isinstance(n, ObjectNode) and not n.unnested) or n.name != "jsonb":
+            if isinstance(n, ObjectNode) and not n.unnested and n.name != "jsonb":
+                yield n.name
+            if isinstance(n, ArrayNode):
                 yield n.name
             yield from n.values