From 56e979c376be4e7ef64f0f5bee1d504111896f84 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 15 May 2026 14:37:08 -0500 Subject: [PATCH 1/3] Add @safe_function extension surface and slugify+case-converter built-ins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A `@safe_function` decorator + file-path loader lets downstream trans-spec authors register custom safe functions into the expression eval namespace from user-supplied Python modules, loaded via `-F`/`--functions` (repeatable). Author declares safety on the function (pure, bounded-time, no I/O); linkml-map enforces namespace boundaries, not function purity. - Collision between extensions → error. Collision with a built-in → error unless declared `override=True`. `override=True` with no matching built-in → warning (typo catcher). - Threaded through `ObjectTransformer.extension_functions` so Python callers can also set it directly. - Replaces simpleeval's `FunctionNotDefined` message with one that includes Levenshtein suggestions and a pointer to `--functions`. - Adds `python-slugify` and `inflection` as deps; built-in `slugify`, `to_snake`, `to_camel`, `to_pascal` join `_SCALAR_FUNCTIONS`. - Docs: `docs/api/extensions.md` covers the contract, the when-not-to-use anti-pattern, override semantics, and the required-extensions header convention. - Example extension in `tests/input/extensions/slugify_ext.py` dogfoods both `@safe_function` and `override=True`. Closes #242, #248. --- docs/api/extensions.md | 146 ++++++ docs/api/functions.md | 7 + mkdocs.yml | 1 + pyproject.toml | 2 + src/linkml_map/cli/cli.py | 18 + .../transformer/object_transformer.py | 14 +- src/linkml_map/utils/eval_utils.py | 119 ++++- src/linkml_map/utils/extensions.py | 178 +++++++ tests/input/extensions/slugify_ext.py | 30 ++ tests/test_utils/test_eval_utils.py | 89 +++- tests/test_utils/test_extensions.py | 445 ++++++++++++++++++ uv.lock | 34 ++ 12 files changed, 1075 insertions(+), 8 deletions(-) create mode 100644 docs/api/extensions.md create mode 100644 src/linkml_map/utils/extensions.py create mode 100644 tests/input/extensions/slugify_ext.py create mode 100644 tests/test_utils/test_extensions.py diff --git a/docs/api/extensions.md b/docs/api/extensions.md new file mode 100644 index 00000000..351cec49 --- /dev/null +++ b/docs/api/extensions.md @@ -0,0 +1,146 @@ +# Extension Functions + +linkml-map ships a curated set of safe built-in functions for use in +[expressions](expressions.md). When you need a function that isn't built in, +you can register your own — without forking linkml-map or wrapping it in a +custom Python harness — by tagging plain Python functions with +`@safe_function` and pointing the CLI at the file. + +## Quick example + +A user-supplied `my_helpers.py`: + +```python +from linkml_map.utils.extensions import safe_function + + +@safe_function +def normalize_taxon_id(s: str) -> str | None: + """Strip the 'NCBI:' prefix and pad to 8 digits.""" + if not s: + return None + raw = s.removeprefix("NCBI:").strip() + return f"NCBI:{int(raw):08d}" +``` + +Then in a trans-spec: + +```yaml +# required_extensions: my_helpers.py (convention; see below) +class_derivations: + Organism: + populated_from: SourceOrganism + slot_derivations: + tax_id: + expr: "normalize_taxon_id(taxon)" +``` + +And at the command line: + +```bash +linkml-map map-data -s schema.yaml -T transform.yaml \ + --functions ./my_helpers.py \ + data.tsv -o out.jsonl +``` + +The flag is repeatable: pass `--functions` (or the short form `-F`) once per +extension file. + +## The `@safe_function` contract + +Applying `@safe_function` is a **declaration by the author** that the function +is: + +- **Pure** — no I/O, no network calls, no global state mutation +- **Bounded-time** — deterministic and fast; runs once per row in a transform +- **Deterministic** — same inputs produce same outputs + +linkml-map **does not verify** these properties. The name "safe" reflects what +*you* are declaring about the function, not what linkml-map enforces. This is +the same posture as `typing.final` or `@SafeVarargs` in other ecosystems. + +The trust model is identical to `pip install`: anything in a module you import +will run. If you're importing a third-party extension, treat it like any other +dependency. + +## When NOT to use extensions + +Extensions are not an escape hatch for putting transformation logic in Python. +They exist for **named atomic operations** that read cleaner as a name than as +an expression chain — `slugify(name)` instead of +`replace(replace(lower(strip(name)), ' ', '_'), ',', '')`. + +If the function you're tempted to write is more than a few lines of pure +data manipulation, ask first whether it belongs in the trans-spec or in the +source/target schema. The declarative spec is the documentation of what the +transformation does; pulling logic out into Python hides it from review. + +## Override semantics + +A `@safe_function` may shadow a built-in if you explicitly say so: + +```python +@safe_function(override=True) +def lower(s: str) -> str: + return s.casefold() # locale-aware, replaces the built-in str.lower +``` + +- **Without `override=True`**: collision with a built-in raises `ExtensionError` + at load time — protects against accidental shadowing from a typo. +- **With `override=True` but no matching built-in**: logged as a warning (still + loaded) — useful as a typo catcher for the override case. +- **Collision between two extensions**: always an error. Pick one. + +There is no CLI flag to enable overrides. The decision lives on the function +declaration, where the author is responsible for it. + +## List-style functions + +By default, scalar functions distribute over lists and propagate `None` +(`slugify([a, b, None])` → `[slugify(a), slugify(b), None]`). For functions +that legitimately accept a list as their first argument (aggregators, etc.), +opt out: + +```python +@safe_function(distributes=False) +def median(items: list[float]) -> float: + sorted_items = sorted(items) + return sorted_items[len(sorted_items) // 2] +``` + +## Required-extension convention + +A trans-spec that references an extension function won't run without +`--functions`. The runtime error is clear (`Unknown function 'foo'. (If this +is a custom function, pass it via --functions .)`), but it's still +runtime. Until linkml-map gains a declarative `required_extensions:` key, the +convention is to note the dependency in a header comment on the spec: + +```yaml +# required_extensions: +# - my_helpers.py +# +id: https://example.org/my-transform +class_derivations: + ... +``` + +## Programmatic use + +Python callers can skip the CLI and set extensions directly on the transformer: + +```python +from linkml_map.transformer.object_transformer import ObjectTransformer +from linkml_map.utils.extensions import load_extensions + +ext = load_extensions(["./my_helpers.py"]) +tr = ObjectTransformer(extension_functions=ext) +``` + +`extension_functions` accepts any `dict[str, Callable]`, so you can also bypass +the loader entirely and hand-build the dict if you prefer (skipping the +decorator-tagging step). + +## API reference + +::: linkml_map.utils.extensions diff --git a/docs/api/functions.md b/docs/api/functions.md index a5c87593..af931a0a 100644 --- a/docs/api/functions.md +++ b/docs/api/functions.md @@ -20,6 +20,13 @@ and null propagation. | `len(items)` | Length of a list | | `case(pairs...)` | Conditional — first matching `(condition, value)` pair | | `uuid5(namespace, name)` | Deterministic UUID v5 generation | +| `slugify(s, separator="_")` | ASCII-fold + lowercase + collapse non-alphanumerics; `None` on no extractable content | +| `to_snake(s)` | Convert to `snake_case` | +| `to_camel(s)` | Convert to `camelCase` | +| `to_pascal(s)` | Convert to `PascalCase` | + +For functions not in this list, see [Extension Functions](extensions.md) to +register your own. ## Unit Conversion diff --git a/mkdocs.yml b/mkdocs.yml index 21817ebf..2ca134ba 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -39,6 +39,7 @@ nav: - Inference: api/inference.md - Expressions: api/expressions.md - Functions: api/functions.md + - Extensions: api/extensions.md # - Subsetter: api/subsetter.md - FAQ: faq.md site_url: https://linkml.github.io/linkml-map/ diff --git a/pyproject.toml b/pyproject.toml index cc93684b..3cdbf62a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "duckdb>=1.1.0", "flatten-dict>=0.4.2", "graphviz>=0.20", + "inflection>=0.5.1", "jinja2>=3", "lark>=1", "linkml>=1.11.0", @@ -23,6 +24,7 @@ dependencies = [ "more-itertools>=10.0.0", "pint>=0.20", "pydantic>=2,<3", + "python-slugify>=8.0.4", "pyyaml", "simpleeval>=1.0.3", "ucumvert>=0.2", diff --git a/src/linkml_map/cli/cli.py b/src/linkml_map/cli/cli.py index b118c99a..b1473cf5 100644 --- a/src/linkml_map/cli/cli.py +++ b/src/linkml_map/cli/cli.py @@ -21,6 +21,7 @@ from linkml_map.transformer.engine import transform_spec from linkml_map.transformer.errors import TransformationError from linkml_map.transformer.object_transformer import ObjectTransformer +from linkml_map.utils.extensions import ExtensionError, load_extensions from linkml_map.writers import ( EXTENSION_FORMAT_MAP, MultiStreamWriter, @@ -128,6 +129,16 @@ def main(verbose: int, quiet: bool) -> None: default=None, help="Write the resolved (merged + filtered) spec to this file path as a side-effect.", ) +@click.option( + "-F", + "--functions", + multiple=True, + type=click.Path(exists=True, dir_okay=False), + help=( + "Python file containing functions tagged with ``@safe_function``. " + "Their names are merged into the expression eval namespace. Repeatable." + ), +) @click.argument("input_data") def map_data( input_data: str, @@ -174,6 +185,13 @@ def map_data( """ logger.info(f"Transforming {input_data} conforming to {schema} using {transformer_specification}") + function_paths = kwargs.pop("functions", ()) + if function_paths: + try: + kwargs["extension_functions"] = load_extensions(function_paths) + except ExtensionError as err: + raise click.ClickException(str(err)) from err + input_path = Path(input_data) # Determine output format diff --git a/src/linkml_map/transformer/object_transformer.py b/src/linkml_map/transformer/object_transformer.py index 665ecc6a..b4479e7c 100644 --- a/src/linkml_map/transformer/object_transformer.py +++ b/src/linkml_map/transformer/object_transformer.py @@ -4,7 +4,7 @@ import json import logging -from collections.abc import Iterator, Mapping +from collections.abc import Callable, Iterator, Mapping from contextlib import contextmanager from dataclasses import dataclass, field from typing import Any @@ -271,6 +271,16 @@ class ObjectTransformer(Transformer): object_index: ObjectIndex = None lookup_index: Any = None # Optional[LookupIndex] — lazy import to avoid hard duckdb dep + extension_functions: dict[str, Callable] = field(default_factory=dict) + """Custom safe functions to merge into the expression eval namespace. + + Set by the CLI loader (from ``--functions`` files) or directly by Python + callers. Names here are available inside ``expr:`` expressions alongside + the built-ins. Per-call context functions (currently ``slot()``) take + precedence; built-in functions are precedence-checked at load time via + :func:`~linkml_map.utils.extensions.load_extensions`. + """ + _warned_unbound_names: set[str] = field(default_factory=set, repr=False) """Names already warned about in non-strict mode. @@ -396,7 +406,7 @@ def map_object( ) tgt_attrs = {} bindings = Bindings.from_context(self, context) - expr_functions = {"slot": lambda name: tgt_attrs.get(name)} + expr_functions = {**self.extension_functions, "slot": lambda name: tgt_attrs.get(name)} for slot_deriv in class_deriv.slot_derivations.values(): with self._slot_error_context(slot_deriv, context): tgt_attrs[str(slot_deriv.name)] = self._derive_slot( diff --git a/src/linkml_map/utils/eval_utils.py b/src/linkml_map/utils/eval_utils.py index 52c60139..e3062e49 100644 --- a/src/linkml_map/utils/eval_utils.py +++ b/src/linkml_map/utils/eval_utils.py @@ -18,13 +18,16 @@ """ import ast +import difflib import logging import math import uuid -from collections.abc import Mapping +from collections.abc import Iterable, Mapping from typing import Any -from simpleeval import EvalWithCompoundTypes, NameNotDefined +import inflection +from simpleeval import EvalWithCompoundTypes, InvalidExpression, NameNotDefined +from slugify import slugify as _slugify_lib logger = logging.getLogger(__name__) @@ -219,6 +222,71 @@ def _substr(s: str, start: int, end: int | None = None) -> str: return s[start:end] +def _slugify(s: str, separator: str = "_") -> str | None: + """Slugify a string to an identifier-shaped form. + + ASCII-folds, lowercases, and collapses non-alphanumeric runs to + ``separator``. Returns ``None`` if no identifier-shaped content remains — + matching the SQL-style null convention so ``slugify`` composes with + ``or``-chain fallbacks:: + + expr: "slugify(name) or slugify(label) or 'anonymous'" + + Backed by `python-slugify `_. + + >>> _slugify("Hello, World!") + 'hello_world' + >>> _slugify("Schöne Grüße") + 'schone_grusse' + >>> _slugify("café-au-lait", separator="-") + 'cafe-au-lait' + >>> _slugify("!!!") is None + True + >>> _slugify("") is None + True + """ + return _slugify_lib(s, separator=separator, lowercase=True) or None + + +def _to_snake(s: str) -> str: + """Convert a string to ``snake_case``. + + Backed by `inflection.underscore `_. + + >>> _to_snake("CamelCase") + 'camel_case' + >>> _to_snake("HTTPResponse") + 'http_response' + """ + return inflection.underscore(s) + + +def _to_pascal(s: str) -> str: + """Convert a string to ``PascalCase``. + + Backed by `inflection.camelize`. + + >>> _to_pascal("snake_case") + 'SnakeCase' + >>> _to_pascal("http_response") + 'HttpResponse' + """ + return inflection.camelize(s) + + +def _to_camel(s: str) -> str: + """Convert a string to ``camelCase``. + + Backed by `inflection.camelize` with ``uppercase_first_letter=False``. + + >>> _to_camel("snake_case") + 'snakeCase' + >>> _to_camel("http_response") + 'httpResponse' + """ + return inflection.camelize(s, uppercase_first_letter=False) + + #: Functions that operate on scalars and should distribute over lists. _SCALAR_FUNCTIONS: dict[str, Any] = { "str": str, @@ -249,6 +317,11 @@ def _substr(s: str, start: int, end: int | None = None) -> str: # String splitting/joining "split": str.split, "join": str.join, + # Identifier-shaped normalization + "slugify": _slugify, + "to_snake": _to_snake, + "to_camel": _to_camel, + "to_pascal": _to_pascal, } @@ -398,6 +471,36 @@ def wrapper(left: Any, right: Any) -> Any: # noqa: ANN401 return wrapper +class UnknownFunctionError(InvalidExpression): + """Raised when an expression calls a function not in the eval namespace. + + Subclasses ``InvalidExpression`` so existing ``except (InvalidExpression, ...)`` + blocks in the transformer continue to catch it (including the + ``unrestricted_eval`` fallback path). Carries a richer message than + simpleeval's default ``FunctionNotDefined``. + """ + + def __init__(self, func_name: str, message: str) -> None: + super().__init__(message) + self.func_name = func_name + + +def suggest_for_unknown_name(name: str, *, known_names: Iterable[str]) -> str: + """Build a user-facing error message for an unknown function name. + + Includes Levenshtein-style suggestions from the known-names pool plus a + parenthetical pointing at ``--functions`` so the single message covers both + the typo case and the missing-extension case. + + :param name: The unknown name as referenced in the expression. + :param known_names: Names currently in the eval namespace (built-ins plus + any loaded extensions) — the pool for did-you-mean suggestions. + """ + suggestions = difflib.get_close_matches(name, set(known_names), n=3, cutoff=0.6) + suggestion = f" Did you mean: {', '.join(repr(s) for s in suggestions)}?" if suggestions else "" + return f"Unknown function {name!r}.{suggestion} (If this is a custom function, pass it via --functions .)" + + class LinkMLEvaluator(EvalWithCompoundTypes): """ Expression evaluator with LinkML-specific extensions. @@ -445,6 +548,18 @@ def __init__( for op_type in (ast.USub, ast.UAdd, ast.Invert): self.operators[op_type] = _null_propagating_unary(self.operators[op_type]) + def _eval_call(self, node: ast.Call) -> Any: # noqa: ANN401 + """Pre-check function names for unknown-function errors with did-you-mean. + + Replaces simpleeval's default ``FunctionNotDefined`` message ("Function + 'X' not defined") with one that includes Levenshtein suggestions and a + pointer to ``--functions`` for the missing-extension case. + """ + if isinstance(node.func, ast.Name) and node.func.id not in self.functions: + msg = suggest_for_unknown_name(node.func.id, known_names=self.functions.keys()) + raise UnknownFunctionError(node.func.id, msg) from None + return super()._eval_call(node) + def _eval_name(self, node: ast.Name) -> Any: # noqa: ANN401 """ Override name resolution to surface unbound variables in strict mode. diff --git a/src/linkml_map/utils/extensions.py b/src/linkml_map/utils/extensions.py new file mode 100644 index 00000000..93e815cd --- /dev/null +++ b/src/linkml_map/utils/extensions.py @@ -0,0 +1,178 @@ +"""Extension surface for downstream-supplied safe functions. + +Trans-spec authors register custom functions into the eval namespace by writing +a Python module and tagging functions with :func:`safe_function`. linkml-map +loads these via the ``-F``/``--functions`` CLI flag (repeatable) or the +``extension_functions`` kwarg on :class:`~linkml_map.transformer.object_transformer.ObjectTransformer`. + +The decorator is a **declaration by the author** that the function is pure, +bounded-time, and free of I/O. linkml-map does not verify this — the safety +boundary is the named namespace, not sandboxed execution. Same posture as +:func:`typing.final`. + +Example +------- +A user-supplied ``my_helpers.py``:: + + from linkml_map.utils.extensions import safe_function + + @safe_function + def slugify(s, separator="_"): + ... + + @safe_function(override=True) # explicit shadowing of a built-in + def lower(s): + ... + + @safe_function(distributes=False) # list-style; opts out of scalar distribution + def my_aggregator(items): + ... + +Then:: + + linkml-map map-data ... --functions ./my_helpers.py + +Semantics +--------- +- Collision between two extensions → :class:`ExtensionError`. +- Collision with a built-in without ``override=True`` → :class:`ExtensionError`. +- ``override=True`` declared but no matching built-in → ``logging.warning``. +- Missing extension file → :class:`ExtensionError`. + +For the contract authors are declaring, see ``docs/expressions/extensions.md``. +""" + +from __future__ import annotations + +import importlib.util +import logging +import sys +from collections.abc import Callable, Iterable +from pathlib import Path +from typing import Any + +from linkml_map.utils.eval_utils import FUNCTIONS, _distributing + +_SAFE_FUNCTION_ATTR = "_linkml_safe_function" + +logger = logging.getLogger(__name__) + + +class ExtensionError(Exception): + """Raised when loading an extension function module fails.""" + + +def safe_function( + func: Callable | None = None, + *, + override: bool = False, + distributes: bool = True, +) -> Callable: + """Tag a function for inclusion in the safe-function namespace. + + Applying this decorator is a **declaration by the author** that the + function is pure, bounded-time, and free of I/O. linkml-map does not + verify these properties. + + Usable bare or with kwargs:: + + @safe_function + def slugify(s): ... + + @safe_function(override=True) + def lower(s): ... + + :param override: Allow shadowing a built-in of the same name. Without this, + a collision with a built-in raises :class:`ExtensionError` at load time. + :param distributes: Apply the scalar-distributing wrapper (broadcasts over + lists, propagates ``None``). Default ``True``; set ``False`` for + functions that accept a list as their first argument. + """ + + def _tag(fn: Callable) -> Callable: + fn._linkml_safe_function = {"override": override, "distributes": distributes} # type: ignore[attr-defined] + return fn + + if func is not None: + # Bare ``@safe_function`` with no parentheses. + return _tag(func) + return _tag + + +def _load_module_from_path(path: Path) -> Any: # noqa: ANN401 + """Import a Python file as an anonymous module. + + :raises ExtensionError: If the file does not exist. + """ + if not path.exists(): + msg = f"Extension file not found: {path}" + raise ExtensionError(msg) + spec = importlib.util.spec_from_file_location(f"_linkml_ext_{path.stem}", path) + if spec is None or spec.loader is None: + msg = f"Could not create import spec for: {path}" + raise ExtensionError(msg) + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def _collect_tagged_functions(module: Any) -> dict[str, dict[str, Any]]: # noqa: ANN401 + """Walk module attributes and collect functions tagged with :func:`safe_function`.""" + found: dict[str, dict[str, Any]] = {} + for name in dir(module): + if name.startswith("_"): + continue + obj = getattr(module, name) + meta = getattr(obj, _SAFE_FUNCTION_ATTR, None) + if isinstance(meta, dict): + found[name] = {"func": obj, **meta} + return found + + +def load_extensions(paths: Iterable[str | Path]) -> dict[str, Callable]: + """Load tagged functions from a list of file paths into one merged dict. + + Applies the scalar-distributing wrapper to functions declared with + ``distributes=True`` (the default), so they broadcast over lists and + propagate ``None`` consistently with the built-ins. + + :param paths: Iterable of file paths to ``.py`` modules with tagged functions. + :returns: Mapping of ``name → callable`` ready to merge into + ``ObjectTransformer.extension_functions``. + :raises ExtensionError: On missing file, name collision between extensions, + or attempt to shadow a built-in without ``override=True``. + """ + merged: dict[str, Callable] = {} + sources: dict[str, Path] = {} + + for raw_path in paths: + path = Path(raw_path).resolve() + module = _load_module_from_path(path) + tagged = _collect_tagged_functions(module) + + for name, meta in tagged.items(): + if name in merged: + msg = f"Extension name collision: {name!r} defined in both {sources[name]} and {path}" + raise ExtensionError(msg) + if name in FUNCTIONS and not meta["override"]: + msg = ( + f"Extension {name!r} from {path} shadows a built-in of the " + f"same name. Declare ``@safe_function(override=True)`` if intentional." + ) + raise ExtensionError(msg) + if meta["override"] and name not in FUNCTIONS: + logger.warning( + "Extension %r from %s declared override=True but no built-in %r exists", + name, + path, + name, + ) + + fn = meta["func"] + if meta["distributes"]: + fn = _distributing(fn) + merged[name] = fn + sources[name] = path + + return merged diff --git a/tests/input/extensions/slugify_ext.py b/tests/input/extensions/slugify_ext.py new file mode 100644 index 00000000..e5cffc99 --- /dev/null +++ b/tests/input/extensions/slugify_ext.py @@ -0,0 +1,30 @@ +"""Example extension module: a minimal pure-stdlib ``slugify``. + +Dogfood reference for the ``@safe_function`` extension mechanism. The built-in +``slugify`` (backed by ``python-slugify``) is the production version; this +example exists to demonstrate both how to write an extension and how +``override=True`` shadows a built-in. +""" + +import re +import unicodedata + +from linkml_map.utils.extensions import safe_function + + +@safe_function(override=True) +def slugify(s: str, separator: str = "_") -> str | None: + """ASCII-fold, lowercase, collapse non-alphanumeric runs to ``separator``. + + Returns ``None`` if no identifier-shaped content remains, matching the + SQL-style null convention so it composes with ``or``-chains in expressions:: + + expr: "slugify(name) or 'anonymous'" + """ + folded = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii").lower() + cleaned = re.sub(r"[^a-z0-9]+", separator, folded).strip(separator) + if not cleaned: + return None + if cleaned[0].isdigit(): + cleaned = separator + cleaned + return cleaned diff --git a/tests/test_utils/test_eval_utils.py b/tests/test_utils/test_eval_utils.py index 7ae53dcb..c38d992a 100644 --- a/tests/test_utils/test_eval_utils.py +++ b/tests/test_utils/test_eval_utils.py @@ -407,8 +407,8 @@ def test_function_distributes_over_lists() -> None: def test_unknown_function_raises() -> None: - """Calling an unknown function raises an error.""" - with pytest.raises(Exception, match="[Nn]ot defined"): # codespell:ignore + """Calling an unknown function raises an error with a helpful message.""" + with pytest.raises(Exception, match="Unknown function 'my_func'"): eval_expr("my_func(1)") @@ -551,9 +551,9 @@ def test_constants(expr: str, expected: Any) -> None: def test_import_blocked() -> None: """Import expressions are not supported.""" - from simpleeval import FunctionNotDefined + from simpleeval import InvalidExpression - with pytest.raises(FunctionNotDefined): + with pytest.raises(InvalidExpression, match="__import__"): eval_expr("__import__('os').listdir()") @@ -1198,3 +1198,84 @@ def test_substr_null_in_list() -> None: None, "2023", ] + + +# ---- slugify ---- + + +@pytest.mark.parametrize( + ("expr", "kwargs", "expected"), + [ + ('slugify("Hello, World!")', {}, "hello_world"), + ('slugify("Schöne Grüße")', {}, "schone_grusse"), + ('slugify("Smith, J.R. (Jr.)")', {}, "smith_j_r_jr"), + ('slugify("123 hits")', {}, "123_hits"), + ('slugify("café-au-lait", "-")', {}, "cafe-au-lait"), + ("slugify(x)", {"x": "Hello World"}, "hello_world"), + ], + ids=["basic", "unicode", "punctuation", "leading_digit", "dash_sep", "with_var"], +) +def test_slugify(expr: str, kwargs: dict, expected: str) -> None: + """slugify converts arbitrary strings to identifier-shaped slugs.""" + assert eval_expr(expr, **kwargs) == expected + + +@pytest.mark.parametrize( + "expr", + ['slugify("")', 'slugify("!!!")', 'slugify(" ")'], + ids=["empty", "all_punct", "all_whitespace"], +) +def test_slugify_no_content_returns_none(expr: str) -> None: + """slugify returns None when no identifier-shaped content remains.""" + assert eval_expr(expr) is None + + +def test_slugify_null_propagation() -> None: + """slugify propagates None.""" + assert eval_expr("slugify(x)", x=None) is None + + +def test_slugify_distributes_over_list() -> None: + """slugify distributes over lists.""" + assert eval_expr("slugify(names)", names=["Foo Bar", "Baz Qux"]) == ["foo_bar", "baz_qux"] + + +def test_slugify_composes_with_or_fallback() -> None: + """slugify returning None composes with ``or`` for fallback chains.""" + assert eval_expr('slugify(x) or "anonymous"', x="!!!") == "anonymous" + + +# ---- to_snake / to_camel / to_pascal ---- + + +@pytest.mark.parametrize( + ("expr", "expected"), + [ + ('to_snake("CamelCase")', "camel_case"), + ('to_snake("HTTPResponse")', "http_response"), + ('to_pascal("snake_case")', "SnakeCase"), + ('to_pascal("http_response")', "HttpResponse"), + ('to_camel("snake_case")', "snakeCase"), + ('to_camel("http_response")', "httpResponse"), + ], + ids=[ + "snake_from_camel", + "snake_from_acronym", + "pascal_from_snake", + "pascal_two_word", + "camel_from_snake", + "camel_two_word", + ], +) +def test_case_converters(expr: str, expected: str) -> None: + """Case converters wrap inflection.underscore / inflection.camelize.""" + assert eval_expr(expr) == expected + + +def test_case_converters_distribute_and_propagate_none() -> None: + """Case converters share the standard distributing + null-propagating wrapper.""" + assert eval_expr("to_snake(x)", x=None) is None + assert eval_expr("to_pascal(x)", x=["snake_case", "http_response"]) == [ + "SnakeCase", + "HttpResponse", + ] diff --git a/tests/test_utils/test_extensions.py b/tests/test_utils/test_extensions.py new file mode 100644 index 00000000..976a7c9e --- /dev/null +++ b/tests/test_utils/test_extensions.py @@ -0,0 +1,445 @@ +"""Tests for the safe-function extension mechanism.""" + +# ruff: noqa: ANN401, PLR2004 + +import logging +from pathlib import Path + +import pytest + +from linkml_map.utils.eval_utils import ( + FUNCTIONS, + UnknownFunctionError, + eval_expr, + eval_expr_with_mapping, + suggest_for_unknown_name, +) +from linkml_map.utils.extensions import ( + _SAFE_FUNCTION_ATTR, + ExtensionError, + load_extensions, + safe_function, +) + + +def _write_ext(tmp_path: Path, name: str, body: str) -> Path: + """Write an extension module file and return its path.""" + path = tmp_path / name + path.write_text(body) + return path + + +# ---- Decorator ---- + + +def test_decorator_bare() -> None: + """``@safe_function`` without parentheses tags the function.""" + + @safe_function + def f(x: str) -> str: + return x.upper() + + meta = getattr(f, _SAFE_FUNCTION_ATTR) + assert meta == {"override": False, "distributes": True} + assert f("hi") == "HI" + + +def test_decorator_with_kwargs() -> None: + """``@safe_function(override=True, distributes=False)`` carries kwargs.""" + + @safe_function(override=True, distributes=False) + def f(items: list) -> int: + return len(items) + + meta = getattr(f, _SAFE_FUNCTION_ATTR) + assert meta == {"override": True, "distributes": False} + + +# ---- Loader ---- + + +def test_load_single_extension(tmp_path: Path) -> None: + """Loading a single file picks up all tagged functions.""" + path = _write_ext( + tmp_path, + "ext1.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def reverse(s): + return s[::-1] + +@safe_function +def double(s): + return s + s + +def untagged(s): + return s # should be ignored +""", + ) + + loaded = load_extensions([path]) + + assert set(loaded.keys()) == {"reverse", "double"} + assert loaded["reverse"]("abc") == "cba" + + +def test_load_multiple_extensions(tmp_path: Path) -> None: + """Loading multiple files merges their tagged functions.""" + a = _write_ext( + tmp_path, + "a.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def alpha(s): + return "A:" + s +""", + ) + b = _write_ext( + tmp_path, + "b.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def beta(s): + return "B:" + s +""", + ) + + loaded = load_extensions([a, b]) + + assert set(loaded.keys()) == {"alpha", "beta"} + assert loaded["alpha"]("x") == "A:x" + assert loaded["beta"]("y") == "B:y" + + +def test_collision_between_extensions(tmp_path: Path) -> None: + """Two extensions defining the same name → ExtensionError.""" + a = _write_ext( + tmp_path, + "a.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def shared(s): + return "A:" + s +""", + ) + b = _write_ext( + tmp_path, + "b.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def shared(s): + return "B:" + s +""", + ) + + with pytest.raises(ExtensionError, match="collision"): + load_extensions([a, b]) + + +def test_collision_with_builtin_without_override(tmp_path: Path) -> None: + """Shadowing a built-in without override=True → ExtensionError.""" + path = _write_ext( + tmp_path, + "ext.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def lower(s): + return s +""", + ) + + with pytest.raises(ExtensionError, match="shadows a built-in"): + load_extensions([path]) + + +def test_collision_with_builtin_with_override(tmp_path: Path) -> None: + """Shadowing a built-in with override=True succeeds and replaces it.""" + path = _write_ext( + tmp_path, + "ext.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function(override=True) +def lower(s): + return "OVERRIDDEN:" + s +""", + ) + + loaded = load_extensions([path]) + + assert loaded["lower"]("hi") == "OVERRIDDEN:hi" + + +def test_override_without_existing_builtin_warns(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: + """override=True declared but no matching built-in → warning, no error.""" + path = _write_ext( + tmp_path, + "ext.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function(override=True) +def no_such_builtin(s): + return s +""", + ) + + with caplog.at_level(logging.WARNING, logger="linkml_map.utils.extensions"): + loaded = load_extensions([path]) + + assert "no_such_builtin" in loaded + assert any("override=True" in r.message for r in caplog.records) + + +def test_missing_file_errors(tmp_path: Path) -> None: + """A missing extension file → ExtensionError.""" + missing = tmp_path / "nope.py" + with pytest.raises(ExtensionError, match="not found"): + load_extensions([missing]) + + +# ---- Wrapper behavior: distribution and None propagation ---- + + +def test_distributing_applied_by_default(tmp_path: Path) -> None: + """Scalar extensions distribute over lists and propagate None by default.""" + path = _write_ext( + tmp_path, + "ext.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def shout(s): + return s.upper() + "!" +""", + ) + + loaded = load_extensions([path]) + + assert loaded["shout"]("hi") == "HI!" + assert loaded["shout"](["a", "b"]) == ["A!", "B!"] + assert loaded["shout"](None) is None + + +def test_distributes_false_keeps_list_intact(tmp_path: Path) -> None: + """``distributes=False`` does not apply the distributing wrapper.""" + path = _write_ext( + tmp_path, + "ext.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function(distributes=False) +def join_them(items): + return ",".join(items) +""", + ) + + loaded = load_extensions([path]) + + # Receives the list as-is, doesn't distribute. + assert loaded["join_them"](["a", "b", "c"]) == "a,b,c" + + +# ---- Integration with the evaluator ---- + + +def test_extension_callable_via_eval_expr_with_mapping(tmp_path: Path) -> None: + """Extension functions plug into the evaluator via the ``functions`` kwarg.""" + path = _write_ext( + tmp_path, + "ext.py", + """ +from linkml_map.utils.extensions import safe_function + +@safe_function +def excited(s): + return s + "!" +""", + ) + loaded = load_extensions([path]) + + result = eval_expr_with_mapping("excited(name)", {"name": "hi"}, functions=loaded) + + assert result == "hi!" + + +# ---- Unknown function errors ---- + + +def test_unknown_function_raises_enriched_error() -> None: + """An unknown function name raises UnknownFunctionError with a helpful message.""" + with pytest.raises(UnknownFunctionError) as exc_info: + eval_expr("no_such_helper('hello world')") + + err = exc_info.value + assert err.func_name == "no_such_helper" + assert "Unknown function 'no_such_helper'" in str(err) + assert "--functions" in str(err) + + +def test_unknown_function_includes_did_you_mean() -> None: + """Close matches in the known pool show up as did-you-mean suggestions.""" + with pytest.raises(UnknownFunctionError) as exc_info: + # ``lwer`` is close to ``lower`` + eval_expr("lwer('HELLO')") + + assert "'lower'" in str(exc_info.value) + + +def test_suggest_for_unknown_name_no_close_match() -> None: + """Without a close match, no did-you-mean phrase is included.""" + msg = suggest_for_unknown_name("zzqqxx", known_names=FUNCTIONS.keys()) + assert "Did you mean" not in msg + assert "Unknown function 'zzqqxx'" in msg + assert "--functions" in msg + + +# ---- Integration with ObjectTransformer ---- + +EXTENSIONS_DIR = Path(__file__).parent.parent / "input" / "extensions" + + +def test_object_transformer_uses_extension_functions() -> None: + """End-to-end: extension function is callable from a slot derivation expr.""" + from linkml_runtime import SchemaView + + from linkml_map.transformer.object_transformer import ObjectTransformer + + source_schema = """ +id: https://example.org/src +name: src +prefixes: + linkml: https://w3id.org/linkml/ +default_prefix: ex +imports: [linkml:types] +classes: + Person: + attributes: + full_name: {range: string} +""" + target_schema = """ +id: https://example.org/tgt +name: tgt +prefixes: + linkml: https://w3id.org/linkml/ +default_prefix: ex +imports: [linkml:types] +classes: + Agent: + attributes: + identifier: {range: string} +""" + transform_spec = { + "id": "https://example.org/tr", + "class_derivations": { + "Agent": { + "populated_from": "Person", + "slot_derivations": { + "identifier": {"expr": "slugify(full_name)"}, + }, + } + }, + } + + extensions = load_extensions([EXTENSIONS_DIR / "slugify_ext.py"]) + tr = ObjectTransformer(extension_functions=extensions) + tr.source_schemaview = SchemaView(source_schema) + tr.target_schemaview = SchemaView(target_schema) + tr.create_transformer_specification(transform_spec) + + result = tr.map_object({"full_name": "Smith, J.R. (Jr.)"}, source_type="Person") + + assert result == {"identifier": "smith_j_r_jr"} + + +def test_cli_loads_extension_module(tmp_path: Path) -> None: + """CLI ``--functions`` loads the extension and makes its names available.""" + import yaml + from click.testing import CliRunner + + from linkml_map.cli.cli import main + + source_schema = tmp_path / "source.yaml" + source_schema.write_text( + """ +id: https://example.org/src +name: src +prefixes: + linkml: https://w3id.org/linkml/ +default_prefix: ex +imports: [linkml:types] +classes: + Person: + attributes: + full_name: {range: string} +""" + ) + target_schema = tmp_path / "target.yaml" + target_schema.write_text( + """ +id: https://example.org/tgt +name: tgt +prefixes: + linkml: https://w3id.org/linkml/ +default_prefix: ex +imports: [linkml:types] +classes: + Agent: + attributes: + identifier: {range: string} +""" + ) + spec = tmp_path / "spec.yaml" + spec.write_text( + """ +id: https://example.org/tr +class_derivations: + Agent: + populated_from: Person + slot_derivations: + identifier: + expr: "slugify(full_name)" +""" + ) + data = tmp_path / "person.yaml" + data.write_text("full_name: 'O''Brien, P.'\n") + out = tmp_path / "out.yaml" + + runner = CliRunner() + result = runner.invoke( + main, + [ + "map-data", + "-s", + str(source_schema), + "--target-schema", + str(target_schema), + "-T", + str(spec), + "--functions", + str(EXTENSIONS_DIR / "slugify_ext.py"), + "--source-type", + "Person", + "-o", + str(out), + str(data), + ], + ) + + assert result.exit_code == 0, result.output + assert yaml.safe_load(out.read_text()) == {"identifier": "o_brien_p"} diff --git a/uv.lock b/uv.lock index 9b722ddb..a32ff08c 100644 --- a/uv.lock +++ b/uv.lock @@ -843,6 +843,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5f/53/fb7122b71361a0d121b669dcf3d31244ef75badbbb724af388948de543e2/imagesize-2.0.0-py2.py3-none-any.whl", hash = "sha256:5667c5bbb57ab3f1fa4bc366f4fbc971db3d5ed011fd2715fd8001f782718d96", size = 9441, upload-time = "2026-03-03T14:18:27.892Z" }, ] +[[package]] +name = "inflection" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/7e/691d061b7329bc8d54edbf0ec22fbfb2afe61facb681f9aaa9bff7a27d04/inflection-0.5.1.tar.gz", hash = "sha256:1a29730d366e996aaacffb2f1f1cb9593dc38e2ddd30c91250c6dde09ea9b417", size = 15091, upload-time = "2020-08-22T08:16:29.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/59/91/aa6bde563e0085a02a435aa99b49ef75b0a4b062635e606dab23ce18d720/inflection-0.5.1-py2.py3-none-any.whl", hash = "sha256:f38b2b640938a4f35ade69ac3d053042959b62a0f1076a5bbaa1b9526605a8a2", size = 9454, upload-time = "2020-08-22T08:16:27.816Z" }, +] + [[package]] name = "iniconfig" version = "2.1.0" @@ -1373,6 +1382,7 @@ dependencies = [ { name = "duckdb" }, { name = "flatten-dict" }, { name = "graphviz" }, + { name = "inflection" }, { name = "jinja2" }, { name = "lark" }, { name = "linkml" }, @@ -1380,6 +1390,7 @@ dependencies = [ { name = "more-itertools" }, { name = "pint" }, { name = "pydantic" }, + { name = "python-slugify" }, { name = "pyyaml" }, { name = "simpleeval" }, { name = "ucumvert" }, @@ -1409,6 +1420,7 @@ requires-dist = [ { name = "duckdb", specifier = ">=1.1.0" }, { name = "flatten-dict", specifier = ">=0.4.2" }, { name = "graphviz", specifier = ">=0.20" }, + { name = "inflection", specifier = ">=0.5.1" }, { name = "jinja2", specifier = ">=3" }, { name = "lark", specifier = ">=1" }, { name = "linkml", specifier = ">=1.11.0" }, @@ -1416,6 +1428,7 @@ requires-dist = [ { name = "more-itertools", specifier = ">=10.0.0" }, { name = "pint", specifier = ">=0.20" }, { name = "pydantic", specifier = ">=2,<3" }, + { name = "python-slugify", specifier = ">=8.0.4" }, { name = "pyyaml" }, { name = "simpleeval", specifier = ">=1.0.3" }, { name = "ucumvert", specifier = ">=0.2" }, @@ -2344,6 +2357,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/08/20/0f2523b9e50a8052bc6a8b732dfc8568abbdc42010aef03a2d750bdab3b2/python_json_logger-3.3.0-py3-none-any.whl", hash = "sha256:dd980fae8cffb24c13caf6e158d3d61c0d6d22342f932cb6e9deedab3d35eec7", size = 15163, upload-time = "2025-03-07T07:08:25.627Z" }, ] +[[package]] +name = "python-slugify" +version = "8.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "text-unidecode" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/87/c7/5e1547c44e31da50a460df93af11a535ace568ef89d7a811069ead340c4a/python-slugify-8.0.4.tar.gz", hash = "sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856", size = 10921, upload-time = "2024-02-08T18:32:45.488Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8", size = 10051, upload-time = "2024-02-08T18:32:43.911Z" }, +] + [[package]] name = "pytrie" version = "0.4.0" @@ -3094,6 +3119,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0", size = 14154, upload-time = "2024-03-12T14:34:36.569Z" }, ] +[[package]] +name = "text-unidecode" +version = "1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ab/e2/e9a00f0ccb71718418230718b3d900e71a5d16e701a3dae079a21e9cd8f8/text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93", size = 76885, upload-time = "2019-08-30T21:36:45.405Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", size = 78154, upload-time = "2019-08-30T21:37:03.543Z" }, +] + [[package]] name = "tinycss2" version = "1.4.0" From 1191c7254a36063adabbb8becd22f91ee8258050 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 15 May 2026 16:08:15 -0500 Subject: [PATCH 2/3] Address PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix docstring path: docs/expressions/extensions.md → docs/api/extensions.md - Sort known-names pool before difflib.get_close_matches for deterministic suggestion ordering across runs/Python hash seeds --- src/linkml_map/utils/eval_utils.py | 2 +- src/linkml_map/utils/extensions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/linkml_map/utils/eval_utils.py b/src/linkml_map/utils/eval_utils.py index e3062e49..cbcdb493 100644 --- a/src/linkml_map/utils/eval_utils.py +++ b/src/linkml_map/utils/eval_utils.py @@ -496,7 +496,7 @@ def suggest_for_unknown_name(name: str, *, known_names: Iterable[str]) -> str: :param known_names: Names currently in the eval namespace (built-ins plus any loaded extensions) — the pool for did-you-mean suggestions. """ - suggestions = difflib.get_close_matches(name, set(known_names), n=3, cutoff=0.6) + suggestions = difflib.get_close_matches(name, sorted(set(known_names)), n=3, cutoff=0.6) suggestion = f" Did you mean: {', '.join(repr(s) for s in suggestions)}?" if suggestions else "" return f"Unknown function {name!r}.{suggestion} (If this is a custom function, pass it via --functions .)" diff --git a/src/linkml_map/utils/extensions.py b/src/linkml_map/utils/extensions.py index 93e815cd..42e1f501 100644 --- a/src/linkml_map/utils/extensions.py +++ b/src/linkml_map/utils/extensions.py @@ -39,7 +39,7 @@ def my_aggregator(items): - ``override=True`` declared but no matching built-in → ``logging.warning``. - Missing extension file → :class:`ExtensionError`. -For the contract authors are declaring, see ``docs/expressions/extensions.md``. +For the contract authors are declaring, see ``docs/api/extensions.md``. """ from __future__ import annotations From 4abdecd7fe1ad0b97fd148e63455b43367c44ece Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 15 May 2026 17:13:26 -0500 Subject: [PATCH 3/3] Harden extension loader and broaden override test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Hash-suffix sys.modules names so same-basename files don't clobber - Wrap exec_module so import-time errors surface as ExtensionError with path context, and sys.modules is cleaned up on failure - Use setattr(_SAFE_FUNCTION_ATTR, ...) to keep tagging/discovery in sync - Reserve 'slot' (and any future per-call names) — refuse at load time rather than silently shadow - New tests: import-time error, same-basename non-collision, reserved name, override end-to-end through ObjectTransformer, CLI clean-error on broken extension - New fixture tests/input/extensions/override_demo_ext.py with distinct output so the integration test can prove override fires through the full stack - Docs: extensions.md notes the reserved-name guard --- docs/api/extensions.md | 9 ++ src/linkml_map/utils/extensions.py | 41 ++++-- tests/input/extensions/override_demo_ext.py | 12 ++ tests/test_utils/test_extensions.py | 131 ++++++++++++++++++++ 4 files changed, 186 insertions(+), 7 deletions(-) create mode 100644 tests/input/extensions/override_demo_ext.py diff --git a/docs/api/extensions.md b/docs/api/extensions.md index 351cec49..08c21429 100644 --- a/docs/api/extensions.md +++ b/docs/api/extensions.md @@ -75,6 +75,15 @@ data manipulation, ask first whether it belongs in the trans-spec or in the source/target schema. The declarative spec is the documentation of what the transformation does; pulling logic out into Python hides it from review. +## Reserved names + +A handful of names are injected per-call by the transformer (currently `slot`, +used inside expressions to reference a previously derived target slot). An +extension cannot define a function with one of these names — it would be +silently shadowed at evaluation time. `load_extensions` raises +`ExtensionError` on the attempt so the conflict shows up at load time +rather than as silent wrong behavior. + ## Override semantics A `@safe_function` may shadow a built-in if you explicitly say so: diff --git a/src/linkml_map/utils/extensions.py b/src/linkml_map/utils/extensions.py index 42e1f501..c414487b 100644 --- a/src/linkml_map/utils/extensions.py +++ b/src/linkml_map/utils/extensions.py @@ -44,17 +44,23 @@ def my_aggregator(items): from __future__ import annotations +import hashlib import importlib.util import logging import sys from collections.abc import Callable, Iterable from pathlib import Path +from types import ModuleType from typing import Any from linkml_map.utils.eval_utils import FUNCTIONS, _distributing _SAFE_FUNCTION_ATTR = "_linkml_safe_function" +#: Names injected per-call by the transformer (e.g. ``slot``). Extensions cannot +#: use these — they would be silently shadowed at expression-evaluation time. +_RESERVED_NAMES: frozenset[str] = frozenset({"slot"}) + logger = logging.getLogger(__name__) @@ -90,7 +96,7 @@ def lower(s): ... """ def _tag(fn: Callable) -> Callable: - fn._linkml_safe_function = {"override": override, "distributes": distributes} # type: ignore[attr-defined] + setattr(fn, _SAFE_FUNCTION_ATTR, {"override": override, "distributes": distributes}) return fn if func is not None: @@ -99,25 +105,39 @@ def _tag(fn: Callable) -> Callable: return _tag -def _load_module_from_path(path: Path) -> Any: # noqa: ANN401 +def _load_module_from_path(path: Path) -> ModuleType: """Import a Python file as an anonymous module. - :raises ExtensionError: If the file does not exist. + The ``sys.modules`` registration key includes a short hash of the resolved + path so two extension files sharing a basename (e.g. ``a/helpers.py`` and + ``b/helpers.py``) don't clobber each other. On import-time failure, the + half-initialized entry is removed and the underlying exception is wrapped + as :class:`ExtensionError` with path context. + + :raises ExtensionError: If the file does not exist, the spec cannot be + constructed, or the module raises during import. """ if not path.exists(): msg = f"Extension file not found: {path}" raise ExtensionError(msg) - spec = importlib.util.spec_from_file_location(f"_linkml_ext_{path.stem}", path) + path_hash = hashlib.sha1(str(path.resolve()).encode()).hexdigest()[:8] # noqa: S324 + spec_name = f"_linkml_ext_{path.stem}_{path_hash}" + spec = importlib.util.spec_from_file_location(spec_name, path) if spec is None or spec.loader is None: msg = f"Could not create import spec for: {path}" raise ExtensionError(msg) module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) + sys.modules[spec_name] = module + try: + spec.loader.exec_module(module) + except Exception as exc: + sys.modules.pop(spec_name, None) + msg = f"Failed to import extension {path}: {type(exc).__name__}: {exc}" + raise ExtensionError(msg) from exc return module -def _collect_tagged_functions(module: Any) -> dict[str, dict[str, Any]]: # noqa: ANN401 +def _collect_tagged_functions(module: ModuleType) -> dict[str, dict[str, Any]]: """Walk module attributes and collect functions tagged with :func:`safe_function`.""" found: dict[str, dict[str, Any]] = {} for name in dir(module): @@ -152,6 +172,13 @@ def load_extensions(paths: Iterable[str | Path]) -> dict[str, Callable]: tagged = _collect_tagged_functions(module) for name, meta in tagged.items(): + if name in _RESERVED_NAMES: + msg = ( + f"Extension name {name!r} from {path} is reserved — the " + f"transformer injects it per-call, so it would silently " + f"shadow your extension. Pick a different name." + ) + raise ExtensionError(msg) if name in merged: msg = f"Extension name collision: {name!r} defined in both {sources[name]} and {path}" raise ExtensionError(msg) diff --git a/tests/input/extensions/override_demo_ext.py b/tests/input/extensions/override_demo_ext.py new file mode 100644 index 00000000..45d6c557 --- /dev/null +++ b/tests/input/extensions/override_demo_ext.py @@ -0,0 +1,12 @@ +"""Test fixture: ``slugify`` extension with distinct output to verify that +``override=True`` actually shadows the built-in through the full transformer +stack, not just at the loader layer. +""" + +from linkml_map.utils.extensions import safe_function + + +@safe_function(override=True) +def slugify(s: str, separator: str = "_") -> str: + """Deliberately distinct slugify so test assertions can prove the override fired.""" + return f"EXT:{s}" diff --git a/tests/test_utils/test_extensions.py b/tests/test_utils/test_extensions.py index 976a7c9e..836385af 100644 --- a/tests/test_utils/test_extensions.py +++ b/tests/test_utils/test_extensions.py @@ -3,6 +3,7 @@ # ruff: noqa: ANN401, PLR2004 import logging +import sys from pathlib import Path import pytest @@ -443,3 +444,133 @@ def test_cli_loads_extension_module(tmp_path: Path) -> None: assert result.exit_code == 0, result.output assert yaml.safe_load(out.read_text()) == {"identifier": "o_brien_p"} + + +# ---- Loader robustness: import-time errors and module-name collisions ---- + + +def test_import_time_error_raises_extension_error(tmp_path: Path) -> None: + """SyntaxError in an extension surfaces as ExtensionError, not a traceback.""" + bad = _write_ext(tmp_path, "broken.py", "def f(:\n") + with pytest.raises(ExtensionError, match="broken.py"): + load_extensions([bad]) + # sys.modules cleaned up on failure — no half-initialized entry left behind. + assert not any(k.startswith("_linkml_ext_broken_") for k in sys.modules) + + +def test_two_files_with_same_basename_do_not_collide(tmp_path: Path) -> None: + """Two extension files named the same don't clobber each other in sys.modules.""" + a_dir = tmp_path / "a" + b_dir = tmp_path / "b" + a_dir.mkdir() + b_dir.mkdir() + (a_dir / "helpers.py").write_text( + "from linkml_map.utils.extensions import safe_function\n@safe_function\ndef alpha(s):\n return 'A:' + s\n" + ) + (b_dir / "helpers.py").write_text( + "from linkml_map.utils.extensions import safe_function\n@safe_function\ndef beta(s):\n return 'B:' + s\n" + ) + + loaded = load_extensions([a_dir / "helpers.py", b_dir / "helpers.py"]) + + # Both functions present — second load didn't clobber the first's module. + assert loaded["alpha"]("x") == "A:x" + assert loaded["beta"]("x") == "B:x" + + +# ---- Reserved-name guard ---- + + +def test_reserved_name_slot_rejected(tmp_path: Path) -> None: + """Naming an extension 'slot' raises ExtensionError (reserved by the transformer).""" + path = _write_ext( + tmp_path, + "ext.py", + "from linkml_map.utils.extensions import safe_function\n@safe_function\ndef slot(name):\n return name\n", + ) + with pytest.raises(ExtensionError, match="reserved"): + load_extensions([path]) + + +# ---- Override end-to-end through the transformer ---- + + +def test_override_shadows_builtin_through_transformer() -> None: + """``override=True`` replaces the built-in inside ObjectTransformer eval, + not just in the loader dict. + + Uses ``override_demo_ext.py`` which returns ``"EXT:"`` — distinct + from the built-in ``slugify`` — so the assertion can only pass if the + extension's function actually fired during evaluation. + """ + from linkml_runtime import SchemaView + + from linkml_map.transformer.object_transformer import ObjectTransformer + + source_schema = """ +id: https://example.org/src +name: src +prefixes: + linkml: https://w3id.org/linkml/ +default_prefix: ex +imports: [linkml:types] +classes: + Person: + attributes: + full_name: {range: string} +""" + target_schema = """ +id: https://example.org/tgt +name: tgt +prefixes: + linkml: https://w3id.org/linkml/ +default_prefix: ex +imports: [linkml:types] +classes: + Agent: + attributes: + identifier: {range: string} +""" + transform_spec = { + "id": "https://example.org/tr", + "class_derivations": { + "Agent": { + "populated_from": "Person", + "slot_derivations": {"identifier": {"expr": "slugify(full_name)"}}, + } + }, + } + + extensions = load_extensions([EXTENSIONS_DIR / "override_demo_ext.py"]) + tr = ObjectTransformer(extension_functions=extensions) + tr.source_schemaview = SchemaView(source_schema) + tr.target_schemaview = SchemaView(target_schema) + tr.create_transformer_specification(transform_spec) + + result = tr.map_object({"full_name": "Smith"}, source_type="Person") + + assert result == {"identifier": "EXT:Smith"} + + +# ---- CLI negative path ---- + + +def test_cli_bad_extension_yields_clean_error(tmp_path: Path) -> None: + """A failing extension load surfaces as a ClickException, not a traceback.""" + from click.testing import CliRunner + + from linkml_map.cli.cli import main + + bad = tmp_path / "broken.py" + bad.write_text("def f(:\n") # SyntaxError at import time + + # Other args don't matter — load_extensions fires before file processing. + runner = CliRunner() + result = runner.invoke( + main, + ["map-data", "-s", "x", "-T", "y", "--functions", str(bad), "in"], + ) + + assert result.exit_code != 0 + assert "broken.py" in result.output + assert "Traceback" not in result.output