-
Notifications
You must be signed in to change notification settings - Fork 5
Add cross-table lookup support for join-based transformations #136
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
00c752e
Add cross-table lookup support for join-based transformations
amc-corey-cox f7223a5
Fix Python 3.9 compatibility: use Optional[] instead of X | None
amc-corey-cox 48fb889
Align transformer_model.py with gen-pydantic output
amc-corey-cox 18a3406
Fix Python 3.9 compat: remove zip(strict=True)
amc-corey-cox d6f50c5
Address PR review feedback
amc-corey-cox 45fc917
Merge branch 'main' into cross-table-lookup
amc-corey-cox 685b188
Add edge-case tests for cross-table lookup
turbomam 751bbdb
Merge branch 'main' into cross-table-lookup
amc-corey-cox File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| """Spec-driven processing engine with cross-table lookup support.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
|
|
||
| from linkml_map.utils.lookup_index import LookupIndex | ||
|
|
||
| if TYPE_CHECKING: | ||
| from collections.abc import Iterator | ||
|
|
||
| from linkml_map.loaders.data_loaders import DataLoader | ||
| from linkml_map.transformer.object_transformer import ObjectTransformer | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def transform_spec(
    transformer: ObjectTransformer,
    data_loader: DataLoader,
    source_type: str | None = None,
) -> Iterator[dict[str, Any]]:
    """
    Walk every class_derivation in the spec and lazily emit transformed rows.

    A derivation is processed only when its primary table (``populated_from``,
    falling back to the derivation name) is present in *data_loader*; otherwise
    it is skipped with a debug log message. For each processed derivation:

    1. Every entry in ``joins`` is registered as a secondary table on the
       transformer's :class:`LookupIndex`, keyed on ``lookup_key`` (or
       ``join_on`` when ``lookup_key`` is unset).
    2. Each primary-table row is passed through
       :meth:`ObjectTransformer.map_object` and yielded.
    3. The secondary tables registered in step 1 are dropped again, including
       when an error is raised part-way through the derivation.

    :param transformer: A configured :class:`ObjectTransformer`. If it has no
        ``lookup_index`` yet, a fresh :class:`LookupIndex` is attached to it.
    :param data_loader: Loader that supports ``in``, item access by table name,
        and ``get_path(name)``.
    :param source_type: Optional explicit source type override; when omitted,
        the primary table name is used.
    :returns: Iterator of transformed row dicts (empty when the transformer
        has no derived specification).
    :raises ValueError: If a join defines neither ``join_on`` nor both
        ``source_key`` and ``lookup_key``.
    """
    specification = transformer.derived_specification
    if specification is None:
        return

    # Create the shared index on first use so callers need not set one up.
    if transformer.lookup_index is None:
        transformer.lookup_index = LookupIndex()

    for derivation in specification.class_derivations:
        primary_table = derivation.populated_from or derivation.name
        if primary_table not in data_loader:
            logger.debug("Skipping class_derivation %s: no data found", derivation.name)
            continue

        # Only tables registered by *this* derivation are tracked, so the
        # cleanup below never drops a table it did not create.
        registered: list[str] = []
        try:
            # Register secondary (joined) tables before any row is mapped.
            for join_name, join_spec in (derivation.joins or {}).items():
                lookup_key = join_spec.lookup_key or join_spec.join_on
                source_key = join_spec.source_key or join_spec.join_on
                if not (lookup_key and source_key):
                    raise ValueError(
                        f"Join {join_name!r} must specify 'join_on' or both "
                        f"'source_key' and 'lookup_key'"
                    )
                secondary_path = data_loader.get_path(join_name)
                transformer.lookup_index.register_table(join_name, secondary_path, lookup_key)
                registered.append(join_name)

            # Stream primary-table rows through the transformer.
            effective_source_type = source_type or primary_table
            for record in data_loader[primary_table]:
                yield transformer.map_object(
                    record,
                    source_type=effective_source_type,
                    class_derivation=derivation,
                )
        finally:
            for secondary in registered:
                transformer.lookup_index.drop(secondary)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| """DuckDB-backed cross-table lookup index for join resolution.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| from pathlib import Path | ||
| from typing import Any | ||
|
|
||
| import duckdb | ||
|
|
||
| _IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") | ||
|
|
||
|
|
||
| def _validate_identifier(name: str) -> None: | ||
| """Validate that *name* is a safe SQL identifier.""" | ||
| if not _IDENTIFIER_RE.match(name): | ||
| msg = f"Invalid identifier: {name!r}" | ||
| raise ValueError(msg) | ||
|
|
||
|
|
||
class LookupIndex:
    """
    In-memory DuckDB index for cross-table lookups.

    Each registered table is loaded from a CSV/TSV file via ``read_csv_auto``
    with every column read as VARCHAR, and indexed on a key column for fast
    single-row lookups.
    """

    def __init__(self) -> None:
        """Initialize an empty lookup index with an in-memory DuckDB connection."""
        self._conn = duckdb.connect(":memory:")
        self._tables: dict[str, str] = {}  # table_name -> key_column

    def register_table(self, name: str, file_path: Path | str, key_column: str) -> None:
        """
        Load a CSV/TSV file into DuckDB and create an index on *key_column*.

        Registering a name that already exists replaces the previous table.
        If the index cannot be created (for example because *key_column* is
        not a column of the file), the freshly loaded table is dropped again
        so no untracked table is left behind, and the error is re-raised.

        :param name: Logical table name (must be a valid identifier).
        :param file_path: Path to a CSV or TSV file.
        :param key_column: Column to index for lookups (must be a valid identifier).
        :raises ValueError: If *name* or *key_column* is not a valid identifier.
        """
        _validate_identifier(name)
        _validate_identifier(key_column)
        file_path = Path(file_path)
        # Identifiers cannot be bound as parameters, hence the validation
        # above; the file path is passed as a bound parameter.
        self._conn.execute(
            f"CREATE OR REPLACE TABLE {name} AS "  # noqa: S608
            "SELECT * FROM read_csv_auto(?, all_varchar=true)",
            [str(file_path)],
        )
        try:
            self._conn.execute(
                f"CREATE INDEX IF NOT EXISTS idx_{name}_{key_column} ON {name} ({key_column})"  # noqa: S608
            )
        except Exception:
            # The table exists but was never recorded in ``_tables``; remove
            # it so a failed registration leaves no data behind.
            self._conn.execute(f"DROP TABLE IF EXISTS {name}")  # noqa: S608
            self._tables.pop(name, None)
            raise
        self._tables[name] = key_column

    def lookup_row(
        self, table: str, key_col: str, key_val: Any  # noqa: ANN401
    ) -> dict[str, Any] | None:
        """
        Return the first row matching *key_val* on *key_col*, or ``None``.

        Tables are loaded with all columns as VARCHAR, so *key_val* is
        converted with ``str()`` before comparison. A ``None`` key never
        matches: without this guard it would be compared as the text
        ``'None'`` and could match a row whose key is literally "None".

        :param table: Previously registered table name.
        :param key_col: Column to match on.
        :param key_val: Value to look up.
        :returns: Row as a dict, or None if not found or *key_val* is None.
        :raises ValueError: If *table* or *key_col* is not a valid identifier.
        """
        _validate_identifier(table)
        _validate_identifier(key_col)
        if key_val is None:
            return None
        result = self._conn.execute(
            f"SELECT * FROM {table} WHERE {key_col} = $1 LIMIT 1",  # noqa: S608
            [str(key_val)],
        ).fetchone()
        if result is None:
            return None
        # Column names come from the description of the query just executed.
        columns = [desc[0] for desc in self._conn.description]
        return dict(zip(columns, result))

    def drop(self, table: str) -> None:
        """
        Drop a registered table, releasing memory.

        Dropping a table that was never registered is a no-op.

        :param table: Table name to drop.
        :raises ValueError: If *table* is not a valid identifier.
        """
        _validate_identifier(table)
        self._conn.execute(f"DROP TABLE IF EXISTS {table}")  # noqa: S608
        self._tables.pop(table, None)

    def is_registered(self, table: str) -> bool:
        """Check whether *table* has been registered."""
        return table in self._tables

    def close(self) -> None:
        """Close the DuckDB connection and forget all registered tables."""
        self._conn.close()
        self._tables.clear()
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.