From 3c8d8dc6e029b6e1a2412232a7f7959701b31e52 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Fri, 22 May 2026 09:57:41 -0700 Subject: [PATCH 01/11] Add compatibility layer between legacy units and new units --- .../datajunction_server/database/node.py | 38 ++- .../internal/deployment/orchestrator.py | 120 ++++++++- .../internal/namespaces.py | 18 +- .../datajunction_server/models/deployment.py | 66 ++++- .../datajunction_server/models/metric.py | 6 + .../datajunction_server/models/unit.py | 99 ++++++- .../internal/deployment/orchestration_test.py | 250 +++++++++++++++++- .../tests/internal/namespaces_test.py | 134 ++++++++++ .../tests/models/deployment_test.py | 73 +++++ datajunction-server/tests/models/unit_test.py | 119 +++++++++ 10 files changed, 898 insertions(+), 25 deletions(-) diff --git a/datajunction-server/datajunction_server/database/node.py b/datajunction-server/datajunction_server/database/node.py index ac83f4dab..df30590cc 100644 --- a/datajunction-server/datajunction_server/database/node.py +++ b/datajunction-server/datajunction_server/database/node.py @@ -545,6 +545,36 @@ async def to_spec(self, session: AsyncSession) -> NodeSpec: # Metric-specific if self.type == NodeType.METRIC: + # Prefer the structured column unit on export when it can't be + # expressed as a legacy enum (non-USD currency, compound, count + # with code, data_size, etc.). Otherwise fall back to the + # legacy metric_metadata.unit so round-trip preserves the + # author's original `unit: dollar` shape. + from datajunction_server.models.unit import ( + structured_to_legacy_unit_name, + ) + + col_unit = ( + self.current.columns[0].unit + if self.current.columns and self.current.columns[0].unit + else None + ) + legacy_from_md = ( + self.current.metric_metadata.unit + if self.current.metric_metadata and self.current.metric_metadata.unit + else None + ) + structured_spec: dict | None = None + legacy_spec = legacy_from_md + if col_unit is not None: + if structured_to_legacy_unit_name(col_unit) is None: + # Not legacy-expressible — use structured form, drop + # the legacy field to avoid double-emit. + structured_spec = col_unit + legacy_spec = None + # If legacy-expressible, keep legacy_from_md (preserves + # original `unit: dollar` shape on round-trip). + extra_kwargs.update( required_dimensions=sorted( col.name for col in self.current.required_dimensions @@ -552,12 +582,8 @@ async def to_spec(self, session: AsyncSession) -> NodeSpec: direction=self.current.metric_metadata.direction if self.current.metric_metadata else None, - unit_enum=( - self.current.metric_metadata.unit - if self.current.metric_metadata - and self.current.metric_metadata.unit - else None - ), + unit_enum=legacy_spec, + unit_structured=structured_spec, significant_digits=self.current.metric_metadata.significant_digits if self.current.metric_metadata else None, diff --git a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py index 453d8e692..688d82d67 100644 --- a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py +++ b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py @@ -81,10 +81,15 @@ from datajunction_server.models.node import ( DEFAULT_DRAFT_VERSION, DEFAULT_PUBLISHED_VERSION, + MetricUnit, NodeMode, NodeStatus, NodeType, ) +from datajunction_server.models.unit import ( + legacy_unit_to_structured, + structured_to_legacy_unit_name, +) from datajunction_server.utils import ( SEPARATOR, Version, @@ -3518,15 +3523,26 @@ async def _create_node_revision( metric_spec = cast(MetricSpec, result.spec) if new_revision.columns: # pragma: no branch new_revision.columns[0].display_name = new_revision.display_name + + # PR 2 back-compat: bridge legacy metric_metadata.unit and + # structured columns[0].unit so users on either input shape + # end up with both fields populated where expressible. + output_col = new_revision.columns[0] if new_revision.columns else None + self._reconcile_metric_unit(metric_spec, output_col) + + legacy_unit = self._derive_legacy_unit_for_storage( + metric_spec, + output_col, + ) if ( - metric_spec.unit_enum + legacy_unit or metric_spec.direction or metric_spec.significant_digits or metric_spec.max_decimal_exponent or metric_spec.min_decimal_exponent ): new_revision.metric_metadata = MetricMetadata( - unit=metric_spec.unit_enum, + unit=legacy_unit, direction=metric_spec.direction, significant_digits=metric_spec.significant_digits, max_decimal_exponent=metric_spec.max_decimal_exponent, @@ -3546,6 +3562,106 @@ async def _create_node_revision( new_revision.required_dimensions = required_dimensions return new_revision + def _reconcile_metric_unit( + self, + metric_spec: MetricSpec, + output_col: Column | None, + ) -> None: + """ + Resolve the canonical structured unit for a metric's output column + from the three possible input surfaces and write it to + `output_col.unit`. + + Input surfaces (highest priority first): + 1. `metric_spec.unit_structured` — top-level structured `unit:` + at the metric spec level. Authored shape for the new model. + 2. `output_col.unit` — structured value set via the explicit + `columns[].unit` form. Supported for uniformity with + non-metric nodes, but unusual on metrics (where the column + name is auto-derived). + 3. `metric_spec.unit_enum` — legacy flat-string `unit: dollar` + form. Translated via the legacy → structured table. + + Conflict handling: if (1) is set together with (2), a warning is + logged naming the node; the metric-level structured value wins. + Combining (1) or (2) with (3) is normal — the structured value wins + and the legacy field becomes redundant scaffolding. + """ + if output_col is None: + return # pragma: no cover + + legacy = metric_spec.unit_enum + spec_structured = ( + metric_spec.unit_structured.model_dump() + if metric_spec.unit_structured is not None + else None + ) + column_structured = output_col.unit + + # (1) Metric-level structured input wins absolutely. + if spec_structured is not None: + if column_structured is not None and column_structured != spec_structured: + logger.warning( + "Metric %s sets a structured unit at both the metric " + "spec level (%r) and on columns[].unit (%r); the " + "metric-level value wins. Remove one to silence this.", + metric_spec.rendered_name, + spec_structured, + column_structured, + ) + if legacy is not None and legacy != MetricUnit.UNKNOWN: + logger.warning( + "Metric %s sets both a structured unit (%r) and the " + "legacy unit_enum (%s); structured value wins.", + metric_spec.rendered_name, + spec_structured, + legacy.name, + ) + output_col.unit = spec_structured + return + + # (2) columns[].unit fallback. + if column_structured is not None: + if legacy is not None and legacy != MetricUnit.UNKNOWN: + logger.warning( + "Metric %s sets both metric_metadata.unit (%s) and " + "columns[].unit (%r); structured value wins.", + metric_spec.rendered_name, + legacy.name, + column_structured, + ) + return + + # (3) Legacy translation. + translated = legacy_unit_to_structured(legacy) + if translated is not None: + output_col.unit = translated + + def _derive_legacy_unit_for_storage( + self, + metric_spec: MetricSpec, + output_col: Column | None, + ) -> MetricUnit | None: + """ + Compute the value to write to `metricmetadata.unit` (legacy DB + column) given the canonical structured `output_col.unit`. This is + the dual-write that preserves rollback safety: if PR 4 is reverted, + the legacy column still holds the right value for everything the + legacy enum can represent. + + Returns None when the structured value has no legacy equivalent + (non-USD currencies, compound units, data sizes, count with code) — + the caller writes NULL to `metricmetadata.unit` in that case. + """ + structured = output_col.unit if output_col is not None else None + if structured is not None: + name = structured_to_legacy_unit_name(structured) + return MetricUnit[name] if name is not None else None + # No structured unit — fall back to whatever the legacy spec field + # had (typically None at this point because _reconcile_metric_unit + # would have copied it onto the column). + return metric_spec.unit_enum + def _create_column_from_spec( self, col: ColumnSpec, diff --git a/datajunction-server/datajunction_server/internal/namespaces.py b/datajunction-server/datajunction_server/internal/namespaces.py index 791968406..dd4c54452 100644 --- a/datajunction-server/datajunction_server/internal/namespaces.py +++ b/datajunction-server/datajunction_server/internal/namespaces.py @@ -1445,7 +1445,14 @@ def _has_column_customizations(col: dict) -> bool: "attributes", [], ) - return has_custom_display or has_attributes or has_description or has_partition + has_unit = bool(col.get("unit")) + return ( + has_custom_display + or has_attributes + or has_description + or has_partition + or has_unit + ) def _merge_columns_preserving_comments(existing_list, new_list, is_cube=False): @@ -1737,9 +1744,18 @@ def _node_spec_to_yaml_dict(node_spec, include_all_columns=False) -> dict: # Filter columns to only include meaningful customizations # Special case for cubes: ALWAYS only export columns with partitions + # Special case for metrics: the metric's output column has a generated + # name and its unit is emitted at the metric spec top level via the + # `unit:` field. Suppress per-column unit on metric exports to avoid + # double-emission with the metric-level `unit:`. # For other nodes: respect include_all_columns flag to preserve comments if "columns" in data and data["columns"] is not None: is_cube = data.get("node_type") == "cube" + is_metric = data.get("node_type") == "metric" + + if is_metric: + for col in data["columns"]: + col.pop("unit", None) # Cubes: always filter to only partitions (even when preserving comments) # Other nodes: only filter when include_all_columns=False diff --git a/datajunction-server/datajunction_server/models/deployment.py b/datajunction-server/datajunction_server/models/deployment.py index 990188744..6c510098c 100644 --- a/datajunction-server/datajunction_server/models/deployment.py +++ b/datajunction-server/datajunction_server/models/deployment.py @@ -425,7 +425,18 @@ def __eq__(self, other: Any) -> bool: class MetricSpec(NodeSpec): """ - Specification for a metric node + Specification for a metric node. + + The `unit` input field accepts either of two shapes: + - **Legacy flat string** (`unit: dollar`) — translated via the + `MetricUnit` enum. Bounded to legacy values. + - **Structured dict** (`unit: {kind: currency, code: USD}` or + `unit: {numerator: ..., denominator: ...}`) — the same shape as + `ColumnSpec.unit`, but authored at the metric level so users don't + need to know the metric's output column name. + + Internally these are stored separately (`unit_enum`, `unit_structured`) + and reconciled at deploy time onto the metric's single output column. """ node_type: Literal[NodeType.METRIC] = NodeType.METRIC @@ -436,6 +447,10 @@ class MetricSpec(NodeSpec): required_dimensions: list[str] | None = None # Field(default_factory=list) direction: MetricDirection | None = None unit_enum: MetricUnit | None = Field(default=None, exclude=True) + # Structured unit form at the metric level — peer of `unit_enum`. + # Only one of `unit_enum` / `unit_structured` is set per spec (the + # __init__ dispatches by input shape). + unit_structured: Unit | None = Field(default=None, exclude=True) significant_digits: int | None = None min_decimal_exponent: int | None = None @@ -443,22 +458,45 @@ class MetricSpec(NodeSpec): def __init__(self, **data: Any): unit = data.pop("unit", None) - if unit: - try: - if isinstance(unit, MetricUnit): - data["unit_enum"] = unit - else: - data["unit_enum"] = MetricUnit[ # pragma: no cover - unit.strip().upper() - ] - except KeyError: # pragma: no cover - raise DJInvalidInputException(f"Invalid metric unit: {unit}") + if unit is not None and unit != "": + if isinstance(unit, MetricUnit): + data["unit_enum"] = unit + elif isinstance(unit, str): + try: + data["unit_enum"] = MetricUnit[unit.strip().upper()] + except KeyError: + raise DJInvalidInputException(f"Invalid metric unit: {unit}") + elif isinstance(unit, dict): + # Structured form. Defer validation to the Unit discriminated + # union — the same code path ColumnSpec.unit uses. + data["unit_structured"] = unit + else: + raise DJInvalidInputException( + f"Metric unit must be a string or a structured dict; " + f"got {type(unit).__name__}", + ) super().__init__(**data) @property - def unit(self) -> str | None: - """Return lowercased unit name for JSON serialization.""" - if self.unit_enum is None: # pragma: no cover + def unit(self) -> str | dict | None: + """ + Return the canonical metric unit value for serialization. + + Returns: + - `None` if no unit is set. + - A structured dict if the metric was authored with a structured + unit at the spec level. + - The legacy lowercase enum name (e.g. `"dollar"`) otherwise. + + Output consumers that need a structured value regardless of input + shape should read `column.unit` on the metric's output column. + """ + if self.unit_structured is not None: + # mode="json" so UnitKind enum members render as plain strings. + # The dict is consumed by YAML and JSON serializers that don't + # know about the enum subclass. + return self.unit_structured.model_dump(mode="json") + if self.unit_enum is None: return None return self.unit_enum.value.name.lower() diff --git a/datajunction-server/datajunction_server/models/metric.py b/datajunction-server/datajunction_server/models/metric.py index 1fd640f49..cfe1e2e65 100644 --- a/datajunction-server/datajunction_server/models/metric.py +++ b/datajunction-server/datajunction_server/models/metric.py @@ -42,6 +42,11 @@ class Metric(BaseModel): dimensions: List[DimensionAttributeOutput] metric_metadata: Optional[MetricMetadataOutput] = None + # Structured metric-level unit, derived from the metric's output column. + # `metric_metadata.unit` remains the legacy flat-string field for + # back-compat; `unit` is the structured shape and is the source of truth + # going forward. `None` when no unit is set, regardless of input shape. + unit: Optional[Dict] = None required_dimensions: List[str] incompatible_druid_functions: List[str] @@ -88,6 +93,7 @@ async def parse_node( expression=str(query_ast.select.projection[0]), dimensions=dims, metric_metadata=node.current.metric_metadata, + unit=(node.current.columns[0].unit if node.current.columns else None), required_dimensions=[dim.name for dim in node.current.required_dimensions], incompatible_druid_functions=incompatible_druid_functions, measures=measures, diff --git a/datajunction-server/datajunction_server/models/unit.py b/datajunction-server/datajunction_server/models/unit.py index e625867b9..3b78cfd4a 100644 --- a/datajunction-server/datajunction_server/models/unit.py +++ b/datajunction-server/datajunction_server/models/unit.py @@ -10,10 +10,13 @@ import re from enum import Enum -from typing import Annotated, Any, Union +from typing import TYPE_CHECKING, Annotated, Any, Union from pydantic import BaseModel, Discriminator, Tag, model_validator +if TYPE_CHECKING: + from datajunction_server.models.node import MetricUnit + class UnitKind(str, Enum): """ @@ -202,3 +205,97 @@ def _unit_discriminator(value: Any) -> str: ], Discriminator(_unit_discriminator), ] + + +# ------------------------------------------------------------------------- +# Legacy <-> structured translation. +# +# The legacy `MetricUnit` enum (datajunction_server.models.node.MetricUnit) +# was a flat one-per-(kind, denomination) enumeration applied only to metric +# nodes. The new structured `Unit` lives on every column. These functions +# translate between the two so that: +# - existing YAML / API input using the legacy `unit: ` keeps +# working (PR 2 wires this on the input side). +# - the legacy `metricmetadata.unit` DB column can be dual-written from +# `column.unit` for rollback safety (PR 2 wires this on the storage side). +# - the legacy `metric_metadata.unit` API field can be derived from +# `column.unit` for downstream consumers (PR 4 wires this on the output +# side; the reverse function lands here so it lives next to its inverse). +# +# Translation is intentionally lossy in the reverse direction: structured +# values the legacy enum can't represent (non-USD currencies, compound +# units, data sizes, count-with-code) map to None. The legacy column simply +# isn't populated for those. +# ------------------------------------------------------------------------- + +# Keyed by MetricUnit.name (not the enum member itself) so this module can +# avoid importing node.py at module load. Callers translate to/from the enum +# at the call site. +_LEGACY_NAME_TO_STRUCTURED: dict[str, dict | None] = { + "UNKNOWN": None, + "UNITLESS": {"kind": "unitless"}, + "PERCENTAGE": {"kind": "percentage"}, + "PROPORTION": {"kind": "proportion"}, + "DOLLAR": {"kind": "currency", "code": "USD"}, + "MILLISECOND": {"kind": "time", "code": "ms"}, + "SECOND": {"kind": "time", "code": "s"}, + "MINUTE": {"kind": "time", "code": "min"}, + "HOUR": {"kind": "time", "code": "h"}, + "DAY": {"kind": "time", "code": "d"}, + "WEEK": {"kind": "time", "code": "wk"}, + "MONTH": {"kind": "time", "code": "mo"}, + "YEAR": {"kind": "time", "code": "yr"}, + # BIT, BYTE intentionally omitted — unused in production data. +} + + +def legacy_unit_to_structured( + legacy: "MetricUnit | None", +) -> dict | None: + """ + Translate a legacy `MetricUnit` enum value into a structured `Unit` dict. + + Returns None for `MetricUnit.UNKNOWN` and for `None`, since both mean + "no unit set." Returns `{kind: unitless}` for `MetricUnit.UNITLESS`, + preserving the distinction between "explicitly no unit" and "not set." + """ + if legacy is None: + return None + return _LEGACY_NAME_TO_STRUCTURED.get(legacy.name) + + +def structured_to_legacy_unit_name(unit: dict | None) -> str | None: + """ + Translate a structured `Unit` dict back to the legacy `MetricUnit.name` + when expressible. Returns None when the structured value has no legacy + equivalent (non-USD currencies, compound units, data sizes, count with + code, etc.) — callers should treat that as "don't populate the legacy + column." + + Returns the enum member name (e.g. "DOLLAR"), not a MetricUnit instance, + so this module stays import-free of node.py. Callers do + `MetricUnit[name]` at the call site. + """ + if unit is None: + return None + # Compound units have no legacy equivalent. + if "numerator" in unit: + return None + kind = unit.get("kind") + code = unit.get("code") + if kind == "unitless": + return "UNITLESS" + if kind == "percentage": + return "PERCENTAGE" + if kind == "proportion": + return "PROPORTION" + if kind == "currency": + return "DOLLAR" if code == "USD" else None + if kind == "time": + # Reverse of _LEGACY_NAME_TO_STRUCTURED for the time kind. + for legacy_name, structured in _LEGACY_NAME_TO_STRUCTURED.items(): + if structured == {"kind": "time", "code": code}: + return legacy_name + return None + # count, data_size — no legacy equivalent. + return None diff --git a/datajunction-server/tests/internal/deployment/orchestration_test.py b/datajunction-server/tests/internal/deployment/orchestration_test.py index 626a7f796..70d29d0ac 100644 --- a/datajunction-server/tests/internal/deployment/orchestration_test.py +++ b/datajunction-server/tests/internal/deployment/orchestration_test.py @@ -44,7 +44,7 @@ DimensionJoinLinkSpec, DimensionReferenceLinkSpec, ) -from datajunction_server.models.node import NodeStatus +from datajunction_server.models.node import MetricUnit, NodeStatus from datajunction_server.database.namespace import NodeNamespace from datajunction_server.database.user import OAuthProvider, User from datajunction_server.database.tag import Tag @@ -2673,3 +2673,251 @@ async def test_auto_register_sources_uses_sqlalchemy_for_system_catalog( assert {"id", "name"} <= col_names # The query service must NOT have been called. mock_query_client.get_columns_for_tables_batch.assert_not_called() + + +# ------------------------------------------------------------------------- +# PR 2 — legacy/structured unit bridging in the orchestrator +# ------------------------------------------------------------------------- + + +class TestReconcileMetricUnit: + """ + Tests for `_reconcile_metric_unit` and `_derive_legacy_unit_for_storage`. + These bridge the legacy `MetricSpec.unit_enum` (from YAML `unit: dollar` + or API `metric_metadata.unit`) and the structured `column.unit` so users + on either input shape end up with consistent storage. + + Called as unbound methods with a MagicMock for self — the helpers don't + touch self state, they just operate on the spec and column. + """ + + @staticmethod + def _make_col(unit: dict | None = None) -> Column: + return Column(name="value", type=None, order=0, unit=unit) + + @staticmethod + def _make_spec(unit: MetricUnit | None = None) -> MetricSpec: + kwargs: dict = { + "name": "m", + "namespace": "default", + "query": "SELECT 1", + } + if unit is not None: + kwargs["unit"] = unit + return MetricSpec(**kwargs) + + def test_legacy_only_writes_through_to_column(self): + spec = self._make_spec(unit=MetricUnit.DOLLAR) + col = self._make_col() + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit == {"kind": "currency", "code": "USD"} + + def test_structured_only_left_untouched(self): + spec = self._make_spec() # no legacy unit + col = self._make_col(unit={"kind": "percentage"}) + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit == {"kind": "percentage"} + + def test_both_set_structured_wins_and_warns(self, caplog): + import logging + + spec = self._make_spec(unit=MetricUnit.DOLLAR) + col = self._make_col(unit={"kind": "currency", "code": "EUR"}) + with caplog.at_level( + logging.WARNING, + logger="datajunction_server.internal.deployment.orchestrator", + ): + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit == {"kind": "currency", "code": "EUR"} + assert any( + "sets both metric_metadata.unit" in rec.message for rec in caplog.records + ) + + def test_neither_set_leaves_both_unset(self): + spec = self._make_spec() + col = self._make_col() + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit is None + + def test_unknown_legacy_does_not_overwrite(self): + # MetricUnit.UNKNOWN translates to None — should not set column.unit. + spec = self._make_spec(unit=MetricUnit.UNKNOWN) + col = self._make_col() + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit is None + + def test_unitless_legacy_preserves_distinction(self): + # UNITLESS → {kind: unitless} (explicitly no unit), not None. + spec = self._make_spec(unit=MetricUnit.UNITLESS) + col = self._make_col() + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit == {"kind": "unitless"} + + def test_metric_level_structured_overrides_legacy(self): + spec = self._make_spec() + spec.unit_structured = MagicMock() + spec.unit_structured.model_dump = lambda: { + "kind": "currency", + "code": "EUR", + } + spec.unit_enum = MetricUnit.DOLLAR # ignored when structured is set + col = self._make_col() + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit == {"kind": "currency", "code": "EUR"} + + def test_metric_level_structured_overrides_column_unit_with_warning(self, caplog): + import logging + + spec = self._make_spec() + spec.unit_structured = MagicMock() + spec.unit_structured.model_dump = lambda: { + "kind": "currency", + "code": "EUR", + } + col = self._make_col(unit={"kind": "percentage"}) + with caplog.at_level( + logging.WARNING, + logger="datajunction_server.internal.deployment.orchestrator", + ): + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + # Metric-level wins. + assert col.unit == {"kind": "currency", "code": "EUR"} + assert any("metric-level value wins" in rec.message for rec in caplog.records) + + def test_metric_level_structured_matching_column_unit_no_warning(self, caplog): + # When metric-level structured equals columns[].unit, no warning fires. + import logging + + spec = self._make_spec() + spec.unit_structured = MagicMock() + same = {"kind": "currency", "code": "USD"} + spec.unit_structured.model_dump = lambda: dict(same) + col = self._make_col(unit=dict(same)) + with caplog.at_level( + logging.WARNING, + logger="datajunction_server.internal.deployment.orchestrator", + ): + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit == same + assert not any( + "metric-level value wins" in rec.message for rec in caplog.records + ) + + def test_metric_level_structured_warns_when_combined_with_legacy(self, caplog): + import logging + + spec = self._make_spec(unit=MetricUnit.DOLLAR) + spec.unit_structured = MagicMock() + spec.unit_structured.model_dump = lambda: { + "kind": "currency", + "code": "EUR", + } + col = self._make_col() + with caplog.at_level( + logging.WARNING, + logger="datajunction_server.internal.deployment.orchestrator", + ): + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + assert col.unit == {"kind": "currency", "code": "EUR"} + assert any( + "structured value wins" in rec.message and "legacy" in rec.message + for rec in caplog.records + ) + + +class TestDeriveLegacyUnitForStorage: + """ + Tests for `_derive_legacy_unit_for_storage`, the reverse half of the + bridge: reads the canonical `column.unit` and returns the value to + dual-write to `metricmetadata.unit`. Returns None when the structured + value has no legacy equivalent. + """ + + @staticmethod + def _spec(unit: MetricUnit | None = None) -> MetricSpec: + kwargs: dict = {"name": "m", "namespace": "default", "query": "SELECT 1"} + if unit is not None: + kwargs["unit"] = unit + return MetricSpec(**kwargs) + + @staticmethod + def _col(unit: dict | None) -> Column: + return Column(name="value", type=None, order=0, unit=unit) + + def test_structured_with_legacy_equivalent_returns_enum(self): + col = self._col({"kind": "currency", "code": "USD"}) + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(), + col, + ) + assert result == MetricUnit.DOLLAR + + def test_non_usd_currency_returns_none(self): + col = self._col({"kind": "currency", "code": "EUR"}) + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(), + col, + ) + assert result is None + + def test_compound_returns_none(self): + col = self._col( + { + "numerator": {"kind": "count"}, + "denominator": {"kind": "time", "code": "s"}, + }, + ) + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(), + col, + ) + assert result is None + + def test_data_size_returns_none(self): + col = self._col({"kind": "data_size", "code": "MB"}) + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(), + col, + ) + assert result is None + + def test_count_with_code_returns_none(self): + col = self._col({"kind": "count", "code": "clicks"}) + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(), + col, + ) + assert result is None + + def test_no_structured_falls_back_to_spec_legacy(self): + # When column.unit is None, fall back to whatever the legacy spec + # field held (covers the rare case where _reconcile didn't run). + col = self._col(None) + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(unit=MetricUnit.HOUR), + col, + ) + assert result == MetricUnit.HOUR + + def test_no_structured_no_legacy_returns_none(self): + col = self._col(None) + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(), + col, + ) + assert result is None + + def test_no_column_returns_spec_legacy(self): + result = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + self._spec(unit=MetricUnit.SECOND), + None, + ) + assert result == MetricUnit.SECOND diff --git a/datajunction-server/tests/internal/namespaces_test.py b/datajunction-server/tests/internal/namespaces_test.py index 97322fe6b..92f7a0d82 100644 --- a/datajunction-server/tests/internal/namespaces_test.py +++ b/datajunction-server/tests/internal/namespaces_test.py @@ -389,6 +389,140 @@ def test_tags_preserve_order_on_merge(self): tag_lines = [line for line in result.splitlines() if " - " in line] assert tag_lines == [" - ratio_metric", " - core"] + def test_metric_with_legacy_string_unit_round_trips(self): + """Metric authored with legacy `unit: dollar` emits `unit: dollar` on export.""" + spec = MetricSpec( + name="ns.metrics.revenue", + node_type=NodeType.METRIC, + unit="dollar", + query="SELECT SUM(amount) FROM ns.transforms.t", + ) + output = node_spec_to_yaml(spec) + assert "unit: dollar" in output + # Not a structured dict in the legacy case. + assert "kind:" not in output + + def test_metric_with_structured_unit_emits_at_spec_level(self): + """Structured `unit:` at the metric spec level emits as a nested dict.""" + spec = MetricSpec( + name="ns.metrics.revenue_eur", + node_type=NodeType.METRIC, + unit={"kind": "currency", "code": "EUR"}, + query="SELECT SUM(amount_eur) FROM ns.transforms.t", + ) + output = node_spec_to_yaml(spec) + # Emitted at spec level, not on a columns block. + assert "unit:" in output + assert "kind: currency" in output + assert "code: EUR" in output + + def test_metric_with_compound_unit_emits_nested(self): + spec = MetricSpec( + name="ns.metrics.qps", + node_type=NodeType.METRIC, + unit={ + "numerator": {"kind": "count"}, + "denominator": {"kind": "time", "code": "s"}, + }, + query="SELECT 1", + ) + output = node_spec_to_yaml(spec) + assert "numerator:" in output + assert "denominator:" in output + assert "kind: time" in output + assert "code: s" in output + + def test_metric_export_suppresses_per_column_unit(self): + """A metric's columns[].unit must NOT appear in YAML export — the metric + emits its unit at the spec top level instead.""" + spec = MetricSpec( + name="ns.metrics.revenue", + node_type=NodeType.METRIC, + unit={"kind": "currency", "code": "USD"}, + query="SELECT SUM(amount) FROM ns.transforms.t", + columns=[ + { + "name": "revenue", + "unit": {"kind": "currency", "code": "USD"}, + }, + ], + ) + output = node_spec_to_yaml(spec) + # Spec-level unit is present... + assert "kind: currency" in output + # ...but columns block (if present) does not contain a duplicate unit + if "columns:" in output: + # Find columns block and inspect its body + columns_idx = output.find("columns:") + columns_block = output[columns_idx:] + # The metric-level `unit:` line lives before `columns:` — so any + # unit-related text after `columns:` would be a per-column emit. + assert "unit:" not in columns_block + + def test_column_with_unit_is_exported(self): + """A column whose only customization is a unit must still be exported. + + Before PR 2's fix to `_has_column_customizations`, a unit-only column + was treated as "unmodified" and silently dropped from the YAML. + """ + spec = TransformSpec( + name="ns.transforms.t", + node_type=NodeType.TRANSFORM, + query="SELECT revenue FROM ns.source.s", + columns=[ + { + "name": "revenue", + "unit": {"kind": "currency", "code": "USD"}, + }, + ], + ) + lines = node_spec_to_yaml(spec).splitlines() + assert "columns:" in lines + assert " - name: revenue" in lines + # Unit appears as a nested mapping; check key + values are present + unit_idx = next(i for i, line in enumerate(lines) if line.strip() == "unit:") + # The following lines should be the kind/code mapping + assert "kind: currency" in lines[unit_idx + 1] + assert "code: USD" in lines[unit_idx + 2] + + def test_column_without_unit_is_not_exported_with_noise(self): + """A column with no customizations (including no unit) is omitted from YAML. + + The new `unit` field must not introduce `unit: null` lines on every + column when no unit is set. + """ + spec = TransformSpec( + name="ns.transforms.t", + node_type=NodeType.TRANSFORM, + query="SELECT id FROM ns.source.s", + columns=[{"name": "id"}], # no unit, no customizations + ) + output = node_spec_to_yaml(spec) + assert "unit:" not in output + assert "columns:" not in output # column itself filtered out + + def test_compound_unit_round_trips(self): + """Compound units serialize cleanly and the structure is preserved.""" + spec = TransformSpec( + name="ns.transforms.t", + node_type=NodeType.TRANSFORM, + query="SELECT qps FROM ns.source.s", + columns=[ + { + "name": "qps", + "unit": { + "numerator": {"kind": "count"}, + "denominator": {"kind": "time", "code": "s"}, + }, + }, + ], + ) + output = node_spec_to_yaml(spec) + assert "numerator:" in output + assert "denominator:" in output + assert "kind: time" in output + assert "code: s" in output + def test_column_attributes_are_sorted(self): """column attributes are sorted alphabetically regardless of input order""" spec = TransformSpec( diff --git a/datajunction-server/tests/models/deployment_test.py b/datajunction-server/tests/models/deployment_test.py index 7c20c8260..d593c3900 100644 --- a/datajunction-server/tests/models/deployment_test.py +++ b/datajunction-server/tests/models/deployment_test.py @@ -73,6 +73,79 @@ def test_metric_spec(): assert not metric_spec.__eq__(other_metric_spec) +def test_metric_spec_accepts_legacy_string_unit(): + """Legacy `unit: dollar` (string) parses into unit_enum, leaves unit_structured None.""" + spec = MetricSpec(name="m", query="SELECT 1", unit="dollar") + assert spec.unit_enum == MetricUnit.DOLLAR + assert spec.unit_structured is None + assert spec.unit == "dollar" + + +def test_metric_spec_accepts_metric_unit_enum_directly(): + spec = MetricSpec(name="m", query="SELECT 1", unit=MetricUnit.SECOND) + assert spec.unit_enum == MetricUnit.SECOND + assert spec.unit_structured is None + assert spec.unit == "second" + + +def test_metric_spec_accepts_structured_unit_dict(): + """Structured `unit: {kind: ..., code: ...}` parses into unit_structured.""" + from datajunction_server.models.unit import AtomicUnit, UnitKind + + spec = MetricSpec( + name="m", + query="SELECT 1", + unit={"kind": "currency", "code": "USD"}, + ) + assert spec.unit_enum is None + assert isinstance(spec.unit_structured, AtomicUnit) + assert spec.unit_structured.kind == UnitKind.CURRENCY + assert spec.unit_structured.code == "USD" + assert spec.unit == {"kind": "currency", "code": "USD"} + + +def test_metric_spec_accepts_compound_unit_dict(): + """Compound units round-trip through MetricSpec.""" + from datajunction_server.models.unit import CompoundUnit + + spec = MetricSpec( + name="m", + query="SELECT 1", + unit={ + "numerator": {"kind": "count", "code": "clicks"}, + "denominator": {"kind": "count", "code": "impressions"}, + }, + ) + assert isinstance(spec.unit_structured, CompoundUnit) + assert spec.unit_structured.numerator.code == "clicks" + assert spec.unit_structured.denominator.code == "impressions" + assert spec.unit == { + "numerator": {"kind": "count", "code": "clicks"}, + "denominator": {"kind": "count", "code": "impressions"}, + } + + +def test_metric_spec_rejects_unknown_legacy_string(): + from datajunction_server.errors import DJInvalidInputException + + with pytest.raises(DJInvalidInputException, match="Invalid metric unit"): + MetricSpec(name="m", query="SELECT 1", unit="dollars") + + +def test_metric_spec_rejects_non_string_non_dict_unit(): + from datajunction_server.errors import DJInvalidInputException + + with pytest.raises(DJInvalidInputException, match="must be a string or"): + MetricSpec(name="m", query="SELECT 1", unit=42) + + +def test_metric_spec_no_unit_set(): + spec = MetricSpec(name="m", query="SELECT 1") + assert spec.unit_enum is None + assert spec.unit_structured is None + assert spec.unit is None + + def test_reference_link_spec(): link_spec = DimensionReferenceLinkSpec( role="test_role", diff --git a/datajunction-server/tests/models/unit_test.py b/datajunction-server/tests/models/unit_test.py index 14b73010f..d3b364bad 100644 --- a/datajunction-server/tests/models/unit_test.py +++ b/datajunction-server/tests/models/unit_test.py @@ -6,11 +6,14 @@ from pydantic import TypeAdapter, ValidationError from datajunction_server.models.deployment import ColumnSpec +from datajunction_server.models.node import MetricUnit from datajunction_server.models.unit import ( AtomicUnit, CompoundUnit, Unit, UnitKind, + legacy_unit_to_structured, + structured_to_legacy_unit_name, ) _unit_adapter: TypeAdapter[Unit] = TypeAdapter(Unit) @@ -206,3 +209,119 @@ def test_column_spec_round_trip_via_dump(self) -> None: assert dumped["unit"] == {"kind": "currency", "code": "USD"} roundtripped = ColumnSpec.model_validate(dumped) assert roundtripped == spec + + +class TestLegacyUnitTranslation: + """ + Coverage for `legacy_unit_to_structured` and `structured_to_legacy_unit_name`. + """ + + @pytest.mark.parametrize( + ("legacy", "structured"), + [ + (MetricUnit.UNKNOWN, None), + (MetricUnit.UNITLESS, {"kind": "unitless"}), + (MetricUnit.PERCENTAGE, {"kind": "percentage"}), + (MetricUnit.PROPORTION, {"kind": "proportion"}), + (MetricUnit.DOLLAR, {"kind": "currency", "code": "USD"}), + (MetricUnit.MILLISECOND, {"kind": "time", "code": "ms"}), + (MetricUnit.SECOND, {"kind": "time", "code": "s"}), + (MetricUnit.MINUTE, {"kind": "time", "code": "min"}), + (MetricUnit.HOUR, {"kind": "time", "code": "h"}), + (MetricUnit.DAY, {"kind": "time", "code": "d"}), + (MetricUnit.WEEK, {"kind": "time", "code": "wk"}), + (MetricUnit.MONTH, {"kind": "time", "code": "mo"}), + (MetricUnit.YEAR, {"kind": "time", "code": "yr"}), + ], + ) + def test_forward_translation_matches_table( + self, + legacy: MetricUnit, + structured: dict | None, + ) -> None: + assert legacy_unit_to_structured(legacy) == structured + + def test_forward_translation_handles_none(self) -> None: + assert legacy_unit_to_structured(None) is None + + def test_forward_translation_bit_byte_unused(self) -> None: + # BIT / BYTE have zero rows in production and are intentionally + # absent from the translation table. They map to None — callers + # treat that the same as UNKNOWN. + assert legacy_unit_to_structured(MetricUnit.BIT) is None + assert legacy_unit_to_structured(MetricUnit.BYTE) is None + + @pytest.mark.parametrize( + ("structured", "legacy_name"), + [ + ({"kind": "unitless"}, "UNITLESS"), + ({"kind": "percentage"}, "PERCENTAGE"), + ({"kind": "proportion"}, "PROPORTION"), + ({"kind": "currency", "code": "USD"}, "DOLLAR"), + ({"kind": "time", "code": "ms"}, "MILLISECOND"), + ({"kind": "time", "code": "s"}, "SECOND"), + ({"kind": "time", "code": "min"}, "MINUTE"), + ({"kind": "time", "code": "h"}, "HOUR"), + ({"kind": "time", "code": "d"}, "DAY"), + ({"kind": "time", "code": "wk"}, "WEEK"), + ({"kind": "time", "code": "mo"}, "MONTH"), + ({"kind": "time", "code": "yr"}, "YEAR"), + ], + ) + def test_reverse_translation_matches_table( + self, + structured: dict, + legacy_name: str, + ) -> None: + assert structured_to_legacy_unit_name(structured) == legacy_name + + @pytest.mark.parametrize( + "structured", + [ + None, + {"kind": "currency", "code": "EUR"}, # non-USD + {"kind": "currency", "code": None}, # currency with no code + {"kind": "data_size", "code": "MB"}, # no legacy equivalent + {"kind": "count", "code": "clicks"}, # no legacy equivalent + {"kind": "count"}, # no legacy equivalent + { + "numerator": {"kind": "count"}, + "denominator": {"kind": "time", "code": "s"}, + }, # compound + # time with a code that exists in the new vocabulary but not + # in any legacy enum member (today there is no such code, but + # this guards against future additions to TIME_CODES that + # aren't reflected in _LEGACY_NAME_TO_STRUCTURED). + {"kind": "time", "code": "fortnight"}, + ], + ) + def test_reverse_translation_returns_none_for_inexpressible( + self, + structured: dict | None, + ) -> None: + assert structured_to_legacy_unit_name(structured) is None + + @pytest.mark.parametrize( + "legacy", + [ + MetricUnit.UNITLESS, + MetricUnit.PERCENTAGE, + MetricUnit.PROPORTION, + MetricUnit.DOLLAR, + MetricUnit.MILLISECOND, + MetricUnit.SECOND, + MetricUnit.MINUTE, + MetricUnit.HOUR, + MetricUnit.DAY, + MetricUnit.WEEK, + MetricUnit.MONTH, + MetricUnit.YEAR, + ], + ) + def test_round_trip_legacy_to_structured_to_legacy( + self, + legacy: MetricUnit, + ) -> None: + structured = legacy_unit_to_structured(legacy) + assert structured is not None + assert structured_to_legacy_unit_name(structured) == legacy.name From 3b239e0316aa186a118a467e1573233f61e4eb70 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Fri, 22 May 2026 10:29:26 -0700 Subject: [PATCH 02/11] Fix gap for cubes --- datajunction-server/datajunction_server/internal/nodes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datajunction-server/datajunction_server/internal/nodes.py b/datajunction-server/datajunction_server/internal/nodes.py index c8fd3e2ff..2e2919912 100644 --- a/datajunction-server/datajunction_server/internal/nodes.py +++ b/datajunction-server/datajunction_server/internal/nodes.py @@ -634,6 +634,7 @@ async def create_cube_node_revision( if referenced_node.type == NodeType.METRIC else col.display_name, type=col.type, + unit=col.unit, attributes=[ ColumnAttribute(attribute_type_id=attr.attribute_type_id) for attr in col.attributes From 4cec2066a784e71138e93c3f7c2213136ecb244b Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Fri, 22 May 2026 19:02:54 -0700 Subject: [PATCH 03/11] Fix coverage --- .../datajunction_server/database/node.py | 53 ++-- .../internal/namespaces.py | 13 +- .../datajunction_server/internal/nodes.py | 19 ++ .../tests/internal/legacy_unit_compat_test.py | 248 ++++++++++++++++++ 4 files changed, 305 insertions(+), 28 deletions(-) create mode 100644 datajunction-server/tests/internal/legacy_unit_compat_test.py diff --git a/datajunction-server/datajunction_server/database/node.py b/datajunction-server/datajunction_server/database/node.py index df30590cc..ba1704732 100644 --- a/datajunction-server/datajunction_server/database/node.py +++ b/datajunction-server/datajunction_server/database/node.py @@ -174,6 +174,36 @@ def _build_search_score( return relevance * branch_boost * popularity +def _resolve_metric_unit_for_spec( + col_unit: dict | None, + legacy_from_md: "Any | None", +) -> "Tuple[Any | None, dict | None]": + """ + Decide which of (legacy enum, structured dict) to populate on a MetricSpec + when round-tripping a metric back from the DB. + + Rules (preserves authoring intent on round-trip): + - No structured `column.unit` → emit only the legacy field (whatever was + in metric_metadata.unit), structured stays None. + - Structured `column.unit` is legacy-expressible (USD, percentage, time + codes, etc.) → keep the legacy field as authoritative so `unit: dollar` + round-trips as `unit: dollar`, not `unit: {kind: currency, code: USD}`. + Structured stays None. + - Structured `column.unit` is NOT legacy-expressible (EUR, compound, + count-with-code, data_size) → populate structured, null the legacy so + nothing tries to dual-emit. + + Returns (legacy_for_spec, structured_for_spec). + """ + from datajunction_server.models.unit import structured_to_legacy_unit_name + + if col_unit is None: + return legacy_from_md, None + if structured_to_legacy_unit_name(col_unit) is None: + return None, col_unit + return legacy_from_md, None + + class NodeRelationship(Base): """ Join table for self-referential many-to-many relationships between nodes. @@ -545,15 +575,6 @@ async def to_spec(self, session: AsyncSession) -> NodeSpec: # Metric-specific if self.type == NodeType.METRIC: - # Prefer the structured column unit on export when it can't be - # expressed as a legacy enum (non-USD currency, compound, count - # with code, data_size, etc.). Otherwise fall back to the - # legacy metric_metadata.unit so round-trip preserves the - # author's original `unit: dollar` shape. - from datajunction_server.models.unit import ( - structured_to_legacy_unit_name, - ) - col_unit = ( self.current.columns[0].unit if self.current.columns and self.current.columns[0].unit @@ -564,16 +585,10 @@ async def to_spec(self, session: AsyncSession) -> NodeSpec: if self.current.metric_metadata and self.current.metric_metadata.unit else None ) - structured_spec: dict | None = None - legacy_spec = legacy_from_md - if col_unit is not None: - if structured_to_legacy_unit_name(col_unit) is None: - # Not legacy-expressible — use structured form, drop - # the legacy field to avoid double-emit. - structured_spec = col_unit - legacy_spec = None - # If legacy-expressible, keep legacy_from_md (preserves - # original `unit: dollar` shape on round-trip). + legacy_spec, structured_spec = _resolve_metric_unit_for_spec( + col_unit, + legacy_from_md, + ) extra_kwargs.update( required_dimensions=sorted( diff --git a/datajunction-server/datajunction_server/internal/namespaces.py b/datajunction-server/datajunction_server/internal/namespaces.py index dd4c54452..2e59eeadd 100644 --- a/datajunction-server/datajunction_server/internal/namespaces.py +++ b/datajunction-server/datajunction_server/internal/namespaces.py @@ -1744,18 +1744,13 @@ def _node_spec_to_yaml_dict(node_spec, include_all_columns=False) -> dict: # Filter columns to only include meaningful customizations # Special case for cubes: ALWAYS only export columns with partitions - # Special case for metrics: the metric's output column has a generated - # name and its unit is emitted at the metric spec top level via the - # `unit:` field. Suppress per-column unit on metric exports to avoid - # double-emission with the metric-level `unit:`. + # Metric specs exclude `columns` from model_dump (`columns: ... exclude=True`), + # so metric YAML never carries per-column entries — the metric's unit is + # emitted at the spec top level via the `unit:` field. No suppression + # needed here for metrics. # For other nodes: respect include_all_columns flag to preserve comments if "columns" in data and data["columns"] is not None: is_cube = data.get("node_type") == "cube" - is_metric = data.get("node_type") == "metric" - - if is_metric: - for col in data["columns"]: - col.pop("unit", None) # Cubes: always filter to only partitions (even when preserving comments) # Other nodes: only filter when include_all_columns=False diff --git a/datajunction-server/datajunction_server/internal/nodes.py b/datajunction-server/datajunction_server/internal/nodes.py index 2e2919912..5b16d3823 100644 --- a/datajunction-server/datajunction_server/internal/nodes.py +++ b/datajunction-server/datajunction_server/internal/nodes.py @@ -583,6 +583,25 @@ async def create_node_revision( if node_revision.type == NodeType.METRIC: if node_revision.columns: node_revision.columns[0].display_name = node_revision.display_name + # Bridge legacy metric_metadata.unit onto columns[0].unit so the + # direct create-metric endpoint produces the same DB state as the + # deployment orchestrator path (PR 2). Without this, metrics + # created via /nodes/metric/ have no column.unit, while the same + # node copied via branch fast-path does — causing spec drift. + if ( + node_revision.metric_metadata + and node_revision.metric_metadata.unit + and node_revision.columns[0].unit is None + ): + from datajunction_server.models.unit import ( + legacy_unit_to_structured, + ) + + structured = legacy_unit_to_structured( + node_revision.metric_metadata.unit, + ) + if structured is not None: + node_revision.columns[0].unit = structured node_revision.catalog_id = catalog_id return node_revision diff --git a/datajunction-server/tests/internal/legacy_unit_compat_test.py b/datajunction-server/tests/internal/legacy_unit_compat_test.py new file mode 100644 index 000000000..db76bdc74 --- /dev/null +++ b/datajunction-server/tests/internal/legacy_unit_compat_test.py @@ -0,0 +1,248 @@ +""" +Legacy `unit:` YAML compatibility test. + +Verifies that a metric authored with the legacy flat-string form +(`unit: dollar`) — the shape every existing pre-PR-2 YAML uses — continues +to work end-to-end after PR 1, PR 2, and PR 2.5: + + 1. The YAML parses through `MetricSpec` without errors. + 2. The orchestrator's reconcile + derive helpers populate both the + canonical `column.unit` (structured) and the legacy `metricmetadata.unit` + (enum) consistently. + 3. The reverse direction — reading from DB state back into a spec — + reproduces the original legacy `unit: dollar` shape (not the structured + dict), preserving the user's authoring intent for round-trip stability. + 4. `node_spec_to_yaml` emits `unit: dollar` (legacy string), not a + structured dict, when the value is legacy-expressible. + +This is the "did legacy keep working?" smoke test for the unit lift. +""" + +from unittest.mock import MagicMock + +import pytest +import yaml + +from datajunction_server.database.column import Column +from datajunction_server.internal.deployment.orchestrator import ( + DeploymentOrchestrator, +) +from datajunction_server.internal.namespaces import node_spec_to_yaml +from datajunction_server.models.deployment import MetricSpec +from datajunction_server.models.node import MetricUnit +from datajunction_server.models.unit import ( + legacy_unit_to_structured, + structured_to_legacy_unit_name, +) + + +# A representative legacy metric YAML — the exact shape every pre-PR-2 +# user has on disk today. No `columns:` block. No structured `unit:`. Just +# the flat-string field at the metric spec top level. +LEGACY_METRIC_YAML = """\ +name: legacy_compat.total_revenue +node_type: metric +display_name: Total Revenue (Legacy) +description: Sum of all transaction amounts (authored with legacy unit field). +direction: higher_is_better +unit: dollar +significant_digits: 2 +query: SELECT SUM(amount) FROM legacy_compat.transactions +""" + + +def _make_col(unit: dict | None = None) -> Column: + """Minimal Column constructed without a session (matches existing helper tests).""" + return Column(name="value", type=None, order=0, unit=unit) + + +class TestLegacyYamlEndToEnd: + """ + End-to-end: legacy YAML → MetricSpec → orchestrator helpers → DB state + → MetricSpec (round-trip via to_spec equivalent) → YAML export. + """ + + @pytest.fixture + def spec(self) -> MetricSpec: + return MetricSpec(**yaml.safe_load(LEGACY_METRIC_YAML)) + + def test_parses_into_unit_enum_not_structured(self, spec: MetricSpec): + """Legacy YAML routes to unit_enum; structured field remains None.""" + assert spec.unit_enum == MetricUnit.DOLLAR + assert spec.unit_structured is None + # `spec.unit` returns the legacy string form for back-compat output. + assert spec.unit == "dollar" + + def test_other_metric_fields_preserved(self, spec: MetricSpec): + """Non-unit metric fields still parse correctly alongside the legacy unit.""" + assert spec.rendered_name == "legacy_compat.total_revenue" + assert spec.direction is not None + assert spec.direction.value == "higher_is_better" + assert spec.significant_digits == 2 + + def test_orchestrator_reconciles_legacy_onto_column(self, spec: MetricSpec): + """The reconcile step writes the structured value to columns[0].unit.""" + col = _make_col() + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + # The legacy DOLLAR gets translated into the structured form on the column. + assert col.unit == {"kind": "currency", "code": "USD"} + + def test_orchestrator_derives_legacy_for_storage(self, spec: MetricSpec): + """After reconcile, the derive step returns MetricUnit.DOLLAR so the legacy + DB column is dual-written. This is what protects API consumers reading + `metric_metadata.unit`.""" + col = _make_col() + DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + legacy_enum = DeploymentOrchestrator._derive_legacy_unit_for_storage( + MagicMock(), + spec, + col, + ) + assert legacy_enum == MetricUnit.DOLLAR + + def test_translation_round_trips(self, spec: MetricSpec): + """Legacy → structured → legacy is the identity for every value in the + migration table — guarantees both halves of the bridge agree.""" + structured = legacy_unit_to_structured(spec.unit_enum) + assert structured == {"kind": "currency", "code": "USD"} + legacy_name = structured_to_legacy_unit_name(structured) + assert legacy_name == "DOLLAR" + assert MetricUnit[legacy_name] == spec.unit_enum + + def test_yaml_export_emits_legacy_string(self, spec: MetricSpec): + """node_spec_to_yaml emits `unit: dollar` (string), not a structured + dict — preserves the author's original YAML shape on re-export when + the value is legacy-expressible.""" + output = node_spec_to_yaml(spec) + assert "unit: dollar" in output + # No structured form leaks into the export. + assert "kind:" not in output + assert "code:" not in output + + def test_yaml_export_does_not_introduce_columns_block(self, spec: MetricSpec): + """A legacy metric YAML has no `columns:` block. After all the unit + plumbing, re-export must NOT introduce one — that would be a churn + diff on every legacy metric file.""" + output = node_spec_to_yaml(spec) + assert "columns:" not in output + + +class TestAllLegacyUnitValuesRoundTrip: + """ + Every legacy `MetricUnit` value that exists in production data + (per the per-revision count we measured) must successfully: + (a) parse from the legacy YAML form, + (b) translate forward into a structured dict, and + (c) translate back via the reverse table. + + BIT and BYTE are intentionally absent from the translation table (0 rows + in production) — they map to None on forward translation, which is + documented behavior. + """ + + @pytest.mark.parametrize( + "legacy_str", + [ + "unitless", + "percentage", + "proportion", + "dollar", + "millisecond", + "second", + "minute", + "hour", + "day", + "week", + "month", + "year", + ], + ) + def test_legacy_value_round_trips(self, legacy_str: str): + spec = MetricSpec( + name="legacy_compat.m", + query="SELECT 1", + unit=legacy_str, + ) + assert spec.unit_enum == MetricUnit[legacy_str.upper()] + assert spec.unit_structured is None + # Forward translation produces a structured value. + structured = legacy_unit_to_structured(spec.unit_enum) + assert structured is not None + # Reverse translation returns the same enum member. + assert structured_to_legacy_unit_name(structured) == legacy_str.upper() + + def test_unknown_legacy_value_rejected(self): + """Typos in legacy YAML still raise a clear error, not a silent pass.""" + from datajunction_server.errors import DJInvalidInputException + + with pytest.raises(DJInvalidInputException, match="Invalid metric unit"): + MetricSpec( + name="legacy_compat.m", + query="SELECT 1", + unit="dollars", # plural typo + ) + + def test_resolve_metric_unit_for_spec_no_column_unit(self): + """Helper: no structured column unit → emit only legacy enum.""" + from datajunction_server.database.node import _resolve_metric_unit_for_spec + + legacy, structured = _resolve_metric_unit_for_spec( + col_unit=None, + legacy_from_md=MetricUnit.DOLLAR, + ) + assert legacy == MetricUnit.DOLLAR + assert structured is None + + def test_resolve_metric_unit_for_spec_legacy_expressible_keeps_legacy(self): + """Helper: structured is legacy-expressible → keep legacy field, structured stays None. + + Preserves round-trip stability: an author who wrote `unit: dollar` sees + `unit: dollar` back, not `unit: {kind: currency, code: USD}`. + """ + from datajunction_server.database.node import _resolve_metric_unit_for_spec + + legacy, structured = _resolve_metric_unit_for_spec( + col_unit={"kind": "currency", "code": "USD"}, + legacy_from_md=MetricUnit.DOLLAR, + ) + assert legacy == MetricUnit.DOLLAR + assert structured is None + + @pytest.mark.parametrize( + "col_unit", + [ + {"kind": "currency", "code": "EUR"}, + {"kind": "data_size", "code": "MB"}, + {"kind": "count", "code": "clicks"}, + { + "numerator": {"kind": "count"}, + "denominator": {"kind": "time", "code": "s"}, + }, + ], + ) + def test_resolve_metric_unit_for_spec_structured_only(self, col_unit): + """Helper: structured NOT legacy-expressible → populate structured, null legacy. + + Covers the round-trip path for EUR / compound / data_size / + count-with-code, which have no MetricUnit enum equivalent. Without + this branch, a metric authored with `unit: {kind: currency, code: EUR}` + would lose its unit on re-export (since the legacy field would be + null and the structured wouldn't be populated). + """ + from datajunction_server.database.node import _resolve_metric_unit_for_spec + + legacy, structured = _resolve_metric_unit_for_spec( + col_unit=col_unit, + legacy_from_md=None, + ) + assert legacy is None + assert structured == col_unit + + def test_uppercase_legacy_input_accepted(self): + """Legacy parser accepts case-insensitive input (existing behavior).""" + spec = MetricSpec( + name="legacy_compat.m", + query="SELECT 1", + unit="DOLLAR", + ) + assert spec.unit_enum == MetricUnit.DOLLAR From abcdefaecafd9021c7ffd98d1398f418d431523a Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Fri, 22 May 2026 19:08:58 -0700 Subject: [PATCH 04/11] Clean up --- .../internal/deployment/orchestrator.py | 18 +++++++------- .../datajunction_server/internal/nodes.py | 11 +++++---- .../datajunction_server/models/unit.py | 24 ++++++++++++------- .../internal/deployment/orchestration_test.py | 2 +- .../tests/internal/legacy_unit_compat_test.py | 16 +++++-------- .../tests/internal/namespaces_test.py | 5 ++-- datajunction-server/tests/models/unit_test.py | 19 ++++++++++----- 7 files changed, 54 insertions(+), 41 deletions(-) diff --git a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py index 688d82d67..fe1c2aa08 100644 --- a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py +++ b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py @@ -3524,9 +3524,10 @@ async def _create_node_revision( if new_revision.columns: # pragma: no branch new_revision.columns[0].display_name = new_revision.display_name - # PR 2 back-compat: bridge legacy metric_metadata.unit and - # structured columns[0].unit so users on either input shape - # end up with both fields populated where expressible. + # Bridge legacy metric_metadata.unit and structured + # columns[0].unit so users on either input shape end up with + # both fields populated where expressible. See + # `_reconcile_metric_unit` for the full precedence rules. output_col = new_revision.columns[0] if new_revision.columns else None self._reconcile_metric_unit(metric_spec, output_col) @@ -3643,11 +3644,12 @@ def _derive_legacy_unit_for_storage( output_col: Column | None, ) -> MetricUnit | None: """ - Compute the value to write to `metricmetadata.unit` (legacy DB - column) given the canonical structured `output_col.unit`. This is - the dual-write that preserves rollback safety: if PR 4 is reverted, - the legacy column still holds the right value for everything the - legacy enum can represent. + Compute the value to write to the legacy `metricmetadata.unit` DB + column given the canonical structured `output_col.unit`. Dual-writing + the legacy column keeps it in sync with `column.unit` so: + - API consumers reading `metric_metadata.unit` keep seeing values. + - A code rollback to a release that reads only the legacy column + still finds the right data for everything the enum can express. Returns None when the structured value has no legacy equivalent (non-USD currencies, compound units, data sizes, count with code) — diff --git a/datajunction-server/datajunction_server/internal/nodes.py b/datajunction-server/datajunction_server/internal/nodes.py index 5b16d3823..d7361eac3 100644 --- a/datajunction-server/datajunction_server/internal/nodes.py +++ b/datajunction-server/datajunction_server/internal/nodes.py @@ -583,11 +583,12 @@ async def create_node_revision( if node_revision.type == NodeType.METRIC: if node_revision.columns: node_revision.columns[0].display_name = node_revision.display_name - # Bridge legacy metric_metadata.unit onto columns[0].unit so the - # direct create-metric endpoint produces the same DB state as the - # deployment orchestrator path (PR 2). Without this, metrics - # created via /nodes/metric/ have no column.unit, while the same - # node copied via branch fast-path does — causing spec drift. + # Bridge legacy metric_metadata.unit onto columns[0].unit so this + # endpoint produces the same DB state as the deployment + # orchestrator. Without this, metrics created here have no + # column.unit while the same node copied via branch fast-path + # (which routes through the orchestrator) does — causing spec + # drift that surfaces as cube column inheritance mismatches. if ( node_revision.metric_metadata and node_revision.metric_metadata.unit diff --git a/datajunction-server/datajunction_server/models/unit.py b/datajunction-server/datajunction_server/models/unit.py index 3b78cfd4a..ce24120c4 100644 --- a/datajunction-server/datajunction_server/models/unit.py +++ b/datajunction-server/datajunction_server/models/unit.py @@ -215,12 +215,12 @@ def _unit_discriminator(value: Any) -> str: # nodes. The new structured `Unit` lives on every column. These functions # translate between the two so that: # - existing YAML / API input using the legacy `unit: ` keeps -# working (PR 2 wires this on the input side). +# parsing into the canonical `column.unit` storage. # - the legacy `metricmetadata.unit` DB column can be dual-written from -# `column.unit` for rollback safety (PR 2 wires this on the storage side). +# `column.unit` to keep API consumers reading the legacy field happy. # - the legacy `metric_metadata.unit` API field can be derived from -# `column.unit` for downstream consumers (PR 4 wires this on the output -# side; the reverse function lands here so it lives next to its inverse). +# `column.unit` for downstream consumers (the reverse function lands here +# so it lives next to its inverse). # # Translation is intentionally lossy in the reverse direction: structured # values the legacy enum can't represent (non-USD currencies, compound @@ -245,7 +245,11 @@ def _unit_discriminator(value: Any) -> str: "WEEK": {"kind": "time", "code": "wk"}, "MONTH": {"kind": "time", "code": "mo"}, "YEAR": {"kind": "time", "code": "yr"}, - # BIT, BYTE intentionally omitted — unused in production data. + "BYTE": {"kind": "data_size", "code": "B"}, + # BIT has no entry in DATA_SIZE_CODES (which uses byte-based units like + # B, KB, MB, ... and their binary cousins KiB, MiB). Bits are atypical + # in BI / data-platform metrics; if a user appears, add "b" to + # DATA_SIZE_CODES and {"BIT": {"kind": "data_size", "code": "b"}} here. } @@ -291,11 +295,13 @@ def structured_to_legacy_unit_name(unit: dict | None) -> str | None: return "PROPORTION" if kind == "currency": return "DOLLAR" if code == "USD" else None - if kind == "time": - # Reverse of _LEGACY_NAME_TO_STRUCTURED for the time kind. + if kind in ("time", "data_size"): + # Reverse of _LEGACY_NAME_TO_STRUCTURED for kinds where multiple + # legacy enum members map by code. + target = {"kind": kind, "code": code} for legacy_name, structured in _LEGACY_NAME_TO_STRUCTURED.items(): - if structured == {"kind": "time", "code": code}: + if structured == target: return legacy_name return None - # count, data_size — no legacy equivalent. + # count — free-form code, no legacy equivalent. return None diff --git a/datajunction-server/tests/internal/deployment/orchestration_test.py b/datajunction-server/tests/internal/deployment/orchestration_test.py index 70d29d0ac..f7bf7b320 100644 --- a/datajunction-server/tests/internal/deployment/orchestration_test.py +++ b/datajunction-server/tests/internal/deployment/orchestration_test.py @@ -2676,7 +2676,7 @@ async def test_auto_register_sources_uses_sqlalchemy_for_system_catalog( # ------------------------------------------------------------------------- -# PR 2 — legacy/structured unit bridging in the orchestrator +# Legacy/structured unit bridging in the orchestrator metric-deploy path. # ------------------------------------------------------------------------- diff --git a/datajunction-server/tests/internal/legacy_unit_compat_test.py b/datajunction-server/tests/internal/legacy_unit_compat_test.py index db76bdc74..f64fac10f 100644 --- a/datajunction-server/tests/internal/legacy_unit_compat_test.py +++ b/datajunction-server/tests/internal/legacy_unit_compat_test.py @@ -1,9 +1,7 @@ """ -Legacy `unit:` YAML compatibility test. - -Verifies that a metric authored with the legacy flat-string form -(`unit: dollar`) — the shape every existing pre-PR-2 YAML uses — continues -to work end-to-end after PR 1, PR 2, and PR 2.5: +End-to-end compatibility test for the legacy flat-string `unit:` form +(e.g. `unit: dollar`) — the shape every metric YAML used before the unit +lift to `column.unit`. Verifies: 1. The YAML parses through `MetricSpec` without errors. 2. The orchestrator's reconcile + derive helpers populate both the @@ -14,8 +12,6 @@ dict), preserving the user's authoring intent for round-trip stability. 4. `node_spec_to_yaml` emits `unit: dollar` (legacy string), not a structured dict, when the value is legacy-expressible. - -This is the "did legacy keep working?" smoke test for the unit lift. """ from unittest.mock import MagicMock @@ -36,9 +32,9 @@ ) -# A representative legacy metric YAML — the exact shape every pre-PR-2 -# user has on disk today. No `columns:` block. No structured `unit:`. Just -# the flat-string field at the metric spec top level. +# A representative legacy metric YAML — flat-string `unit:` at the metric +# spec top level, no `columns:` block, no structured unit. This is the +# shape every existing metric YAML uses. LEGACY_METRIC_YAML = """\ name: legacy_compat.total_revenue node_type: metric diff --git a/datajunction-server/tests/internal/namespaces_test.py b/datajunction-server/tests/internal/namespaces_test.py index 92f7a0d82..caa319de7 100644 --- a/datajunction-server/tests/internal/namespaces_test.py +++ b/datajunction-server/tests/internal/namespaces_test.py @@ -462,8 +462,9 @@ def test_metric_export_suppresses_per_column_unit(self): def test_column_with_unit_is_exported(self): """A column whose only customization is a unit must still be exported. - Before PR 2's fix to `_has_column_customizations`, a unit-only column - was treated as "unmodified" and silently dropped from the YAML. + `_has_column_customizations` recognizes `unit` alongside display_name, + description, attributes, and partition; without that, a unit-only + column would be treated as "unmodified" and silently dropped. """ spec = TransformSpec( name="ns.transforms.t", diff --git a/datajunction-server/tests/models/unit_test.py b/datajunction-server/tests/models/unit_test.py index d3b364bad..0a9e582b3 100644 --- a/datajunction-server/tests/models/unit_test.py +++ b/datajunction-server/tests/models/unit_test.py @@ -244,12 +244,17 @@ def test_forward_translation_matches_table( def test_forward_translation_handles_none(self) -> None: assert legacy_unit_to_structured(None) is None - def test_forward_translation_bit_byte_unused(self) -> None: - # BIT / BYTE have zero rows in production and are intentionally - # absent from the translation table. They map to None — callers - # treat that the same as UNKNOWN. + def test_forward_translation_byte_maps_to_data_size(self) -> None: + # BYTE has a clean structured equivalent under DATA_SIZE. + assert legacy_unit_to_structured(MetricUnit.BYTE) == { + "kind": "data_size", + "code": "B", + } + + def test_forward_translation_bit_unmapped(self) -> None: + # BIT has no entry in DATA_SIZE_CODES (bytes-only) so it maps to + # None until/unless someone needs it. assert legacy_unit_to_structured(MetricUnit.BIT) is None - assert legacy_unit_to_structured(MetricUnit.BYTE) is None @pytest.mark.parametrize( ("structured", "legacy_name"), @@ -266,6 +271,7 @@ def test_forward_translation_bit_byte_unused(self) -> None: ({"kind": "time", "code": "wk"}, "WEEK"), ({"kind": "time", "code": "mo"}, "MONTH"), ({"kind": "time", "code": "yr"}, "YEAR"), + ({"kind": "data_size", "code": "B"}, "BYTE"), ], ) def test_reverse_translation_matches_table( @@ -281,7 +287,7 @@ def test_reverse_translation_matches_table( None, {"kind": "currency", "code": "EUR"}, # non-USD {"kind": "currency", "code": None}, # currency with no code - {"kind": "data_size", "code": "MB"}, # no legacy equivalent + {"kind": "data_size", "code": "MB"}, # only BYTE has legacy form {"kind": "count", "code": "clicks"}, # no legacy equivalent {"kind": "count"}, # no legacy equivalent { @@ -316,6 +322,7 @@ def test_reverse_translation_returns_none_for_inexpressible( MetricUnit.WEEK, MetricUnit.MONTH, MetricUnit.YEAR, + MetricUnit.BYTE, ], ) def test_round_trip_legacy_to_structured_to_legacy( From 44d21691452b5c41af8d0dd1aae425d71a057d1d Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Fri, 22 May 2026 22:01:59 -0700 Subject: [PATCH 05/11] Fix tests --- .../tests/api/namespaces_test.py | 1 + datajunction-server/tests/api/nodes_test.py | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/datajunction-server/tests/api/namespaces_test.py b/datajunction-server/tests/api/namespaces_test.py index 2c311eb05..2022692cc 100644 --- a/datajunction-server/tests/api/namespaces_test.py +++ b/datajunction-server/tests/api/namespaces_test.py @@ -854,6 +854,7 @@ async def test_export_namespaces_deployment(client_with_roads: AsyncClient): "display_name": "Num Repair Orders", "name": "${prefix}num_repair_orders", "type": "bigint", + "unit": {"kind": "currency", "code": "USD"}, }, { "attributes": [], diff --git a/datajunction-server/tests/api/nodes_test.py b/datajunction-server/tests/api/nodes_test.py index 24975bd3d..8f912b30a 100644 --- a/datajunction-server/tests/api/nodes_test.py +++ b/datajunction-server/tests/api/nodes_test.py @@ -2959,6 +2959,37 @@ async def test_update_metric_node(self, client_with_roads: AsyncClient): data = response.json() assert data["required_dimensions"] == ["repair_order_id"] + @pytest.mark.asyncio + async def test_metric_unknown_unit_does_not_set_column_unit( + self, + client_with_roads: AsyncClient, + ): + """ + MetricUnit.UNKNOWN maps to None in legacy_unit_to_structured, so setting + unit: "unknown" via metric_metadata must not write anything to column.unit. + Covers the structured-is-None branch in internal/nodes.py (line 604). + """ + response = await client_with_roads.post( + "/nodes/metric/", + json={ + "name": "default.test_unknown_unit_metric", + "description": "Test metric with unknown unit", + "query": "SELECT count(repair_order_id) FROM default.repair_orders", + "mode": "published", + "metric_metadata": { + "unit": "unknown", + }, + }, + ) + assert response.status_code in (200, 201) + node = response.json() + col = next( + c + for c in node["columns"] + if c["name"] == "default_DOT_test_unknown_unit_metric" + ) + assert col.get("unit") is None + @pytest.mark.asyncio async def test_create_dimension_node_fails( self, From 9dd02d0eb434e1c83446314369a4bdecdff4ce8c Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Sun, 24 May 2026 15:59:49 -0700 Subject: [PATCH 06/11] Canonicalize inputs for units at boundaries so that they are parsed into one canonical form --- .../datajunction_server/database/node.py | 8 +- .../internal/deployment/orchestrator.py | 14 +- .../datajunction_server/models/deployment.py | 83 ++++++--- .../datajunction_server/models/unit.py | 33 ++++ .../internal/deployment/orchestration_test.py | 29 +-- .../tests/internal/legacy_unit_compat_test.py | 175 +++++++++++++++++- 6 files changed, 289 insertions(+), 53 deletions(-) diff --git a/datajunction-server/datajunction_server/database/node.py b/datajunction-server/datajunction_server/database/node.py index ba1704732..d0d43cdf1 100644 --- a/datajunction-server/datajunction_server/database/node.py +++ b/datajunction-server/datajunction_server/database/node.py @@ -67,6 +67,7 @@ from datajunction_server.models.node import ( DEFAULT_DRAFT_VERSION, BuildCriteria, + MetricUnit, NodeCursor, NodeMode, NodeStatus, @@ -580,9 +581,14 @@ async def to_spec(self, session: AsyncSession) -> NodeSpec: if self.current.columns and self.current.columns[0].unit else None ) + # `MetricUnit.UNKNOWN` is the DB-column default for any metric + # that never had a unit authored. Treat it as "no unit" so it + # doesn't leak as `unit: unknown` on YAML re-export. legacy_from_md = ( self.current.metric_metadata.unit - if self.current.metric_metadata and self.current.metric_metadata.unit + if self.current.metric_metadata + and self.current.metric_metadata.unit + and self.current.metric_metadata.unit != MetricUnit.UNKNOWN else None ) legacy_spec, structured_spec = _resolve_metric_unit_for_spec( diff --git a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py index fe1c2aa08..50f8a5aa6 100644 --- a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py +++ b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py @@ -89,6 +89,7 @@ from datajunction_server.models.unit import ( legacy_unit_to_structured, structured_to_legacy_unit_name, + unit_to_dict, ) from datajunction_server.utils import ( SEPARATOR, @@ -3592,12 +3593,11 @@ def _reconcile_metric_unit( return # pragma: no cover legacy = metric_spec.unit_enum - spec_structured = ( - metric_spec.unit_structured.model_dump() - if metric_spec.unit_structured is not None - else None - ) - column_structured = output_col.unit + spec_structured = unit_to_dict(metric_spec.unit_structured) + # column.unit comes from JSONB as a plain dict; canonicalize so a + # hand-rolled or legacy-translated value compares equal to the + # spec_structured shape regardless of code: None presence. + column_structured = unit_to_dict(output_col.unit) # (1) Metric-level structured input wins absolutely. if spec_structured is not None: @@ -3676,7 +3676,7 @@ def _create_column_from_spec( display_name=col.display_name, description=col.description, order=order, - unit=col.unit.model_dump() if col.unit is not None else None, + unit=unit_to_dict(col.unit), attributes=[ ColumnAttribute( attribute_type=self.registry.attributes.get(attr), diff --git a/datajunction-server/datajunction_server/models/deployment.py b/datajunction-server/datajunction_server/models/deployment.py index 6c510098c..6730d6dc9 100644 --- a/datajunction-server/datajunction_server/models/deployment.py +++ b/datajunction-server/datajunction_server/models/deployment.py @@ -4,10 +4,11 @@ Field, PrivateAttr, ConfigDict, + TypeAdapter, model_validator, ) -from typing import Annotated, Any, Literal, Optional, Union +from typing import Annotated, Any, ClassVar, Literal, Optional, Union from datajunction_server.models.partition import Granularity, PartitionType from datajunction_server.errors import ( DJInvalidDeploymentConfig, @@ -27,7 +28,11 @@ NodeStatus, NodeType, ) -from datajunction_server.models.unit import Unit +from datajunction_server.models.unit import ( + Unit, + legacy_unit_to_structured, + unit_to_dict, +) from datajunction_server.utils import SEPARATOR @@ -456,25 +461,41 @@ class MetricSpec(NodeSpec): min_decimal_exponent: int | None = None max_decimal_exponent: int | None = None + # Class-level adapter used by __init__ to eagerly validate structured + # unit input. `ClassVar` keeps Pydantic from treating it as a field. + _unit_adapter: ClassVar[TypeAdapter] = TypeAdapter(Unit) + def __init__(self, **data: Any): unit = data.pop("unit", None) - if unit is not None and unit != "": - if isinstance(unit, MetricUnit): - data["unit_enum"] = unit - elif isinstance(unit, str): - try: - data["unit_enum"] = MetricUnit[unit.strip().upper()] - except KeyError: - raise DJInvalidInputException(f"Invalid metric unit: {unit}") - elif isinstance(unit, dict): - # Structured form. Defer validation to the Unit discriminated - # union — the same code path ColumnSpec.unit uses. - data["unit_structured"] = unit - else: + # Empty string and empty dict both mean "no unit authored" — match + # the historical `if unit:` permissive gate so existing YAML files + # with `unit: ""` or `unit: {}` (e.g. template residue) keep parsing. + if unit is None or unit == "" or unit == {}: + pass + elif isinstance(unit, MetricUnit): + data["unit_enum"] = unit + elif isinstance(unit, str): + try: + data["unit_enum"] = MetricUnit[unit.strip().upper()] + except KeyError: + raise DJInvalidInputException(f"Invalid metric unit: {unit}") + elif isinstance(unit, dict): + # Validate eagerly so users get a clean DJInvalidInputException + # (with the offending dict echoed) instead of a noisy Pydantic + # ValidationError that leaks internal field names like + # `unit_structured.atomic.kind`. + try: + MetricSpec._unit_adapter.validate_python(unit) + except Exception as exc: raise DJInvalidInputException( - f"Metric unit must be a string or a structured dict; " - f"got {type(unit).__name__}", - ) + f"Invalid metric unit {unit!r}: {exc}", + ) from exc + data["unit_structured"] = unit + else: + raise DJInvalidInputException( + f"Metric unit must be a string or a structured dict; " + f"got {type(unit).__name__}", + ) super().__init__(**data) @property @@ -492,11 +513,9 @@ def unit(self) -> str | dict | None: shape should read `column.unit` on the metric's output column. """ if self.unit_structured is not None: - # mode="json" so UnitKind enum members render as plain strings. - # The dict is consumed by YAML and JSON serializers that don't - # know about the enum subclass. - return self.unit_structured.model_dump(mode="json") - if self.unit_enum is None: + # Canonical dict shape (JSON-friendly, no None values). + return unit_to_dict(self.unit_structured) + if self.unit_enum is None or self.unit_enum == MetricUnit.UNKNOWN: return None return self.unit_enum.value.name.lower() @@ -505,6 +524,22 @@ def model_dump(self, **kwargs): # pragma: no cover base["unit"] = self.unit return base + def _canonical_unit(self) -> dict | None: + """ + Reduce both legacy and structured inputs to the same canonical dict + form for equality comparisons. Returns None when the metric has no + unit (or only the UNKNOWN sentinel), the structured dict otherwise. + Two specs that author the same conceptual unit via different input + shapes (`unit: dollar` vs `unit: {kind: currency, code: USD}`) will + produce identical canonical forms — so __eq__ doesn't falsely report + drift between YAML and DB-roundtripped specs. + """ + if self.unit_structured is not None: + return unit_to_dict(self.unit_structured) + if self.unit_enum is None or self.unit_enum == MetricUnit.UNKNOWN: + return None + return legacy_unit_to_structured(self.unit_enum) + def __eq__(self, other: Any) -> bool: if not isinstance(other, MetricSpec): return False @@ -513,7 +548,7 @@ def __eq__(self, other: Any) -> bool: and self.query_ast.compare(other.query_ast) and (self.required_dimensions or []) == (other.required_dimensions or []) and eq_or_fallback(self.direction, other.direction, MetricDirection.NEUTRAL) - and eq_or_fallback(self.unit, other.unit, MetricUnit.UNKNOWN.value.name) + and self._canonical_unit() == other._canonical_unit() and self.significant_digits == other.significant_digits and self.min_decimal_exponent == other.min_decimal_exponent and self.max_decimal_exponent == other.max_decimal_exponent diff --git a/datajunction-server/datajunction_server/models/unit.py b/datajunction-server/datajunction_server/models/unit.py index ce24120c4..41e12a08c 100644 --- a/datajunction-server/datajunction_server/models/unit.py +++ b/datajunction-server/datajunction_server/models/unit.py @@ -207,6 +207,39 @@ def _unit_discriminator(value: Any) -> str: ] +def unit_to_dict(unit: "AtomicUnit | CompoundUnit | dict | None") -> dict | None: + """ + Canonical JSON-friendly dict form of a Unit. Used by every storage and + comparison path so the on-disk shape is stable regardless of input: + + - `mode="json"` so UnitKind enum members render as plain strings. + - `exclude_none=True` so `code: None` doesn't appear on dimensionless + kinds (unitless / percentage / proportion / count-without-code), + keeping shapes consistent with the legacy translation table. + + Accepts either a Pydantic model or an already-dict input — the dict + path is a no-op pass-through after a defensive copy. + """ + if unit is None: + return None + if isinstance(unit, BaseModel): + return unit.model_dump(mode="json", exclude_none=True) + # Already a dict; normalize by stripping any None values one level deep + # so a hand-rolled dict matches the canonical shape. + if isinstance(unit, dict): + return _strip_none(unit) + raise TypeError( # pragma: no cover + f"unit_to_dict expected Unit | dict | None, got {type(unit).__name__}", + ) + + +def _strip_none(value: Any) -> Any: + """Drop None-valued keys recursively from a dict; pass through otherwise.""" + if isinstance(value, dict): + return {k: _strip_none(v) for k, v in value.items() if v is not None} + return value + + # ------------------------------------------------------------------------- # Legacy <-> structured translation. # diff --git a/datajunction-server/tests/internal/deployment/orchestration_test.py b/datajunction-server/tests/internal/deployment/orchestration_test.py index f7bf7b320..27431cad6 100644 --- a/datajunction-server/tests/internal/deployment/orchestration_test.py +++ b/datajunction-server/tests/internal/deployment/orchestration_test.py @@ -2754,12 +2754,7 @@ def test_unitless_legacy_preserves_distinction(self): assert col.unit == {"kind": "unitless"} def test_metric_level_structured_overrides_legacy(self): - spec = self._make_spec() - spec.unit_structured = MagicMock() - spec.unit_structured.model_dump = lambda: { - "kind": "currency", - "code": "EUR", - } + spec = self._make_spec(unit={"kind": "currency", "code": "EUR"}) spec.unit_enum = MetricUnit.DOLLAR # ignored when structured is set col = self._make_col() DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) @@ -2768,12 +2763,7 @@ def test_metric_level_structured_overrides_legacy(self): def test_metric_level_structured_overrides_column_unit_with_warning(self, caplog): import logging - spec = self._make_spec() - spec.unit_structured = MagicMock() - spec.unit_structured.model_dump = lambda: { - "kind": "currency", - "code": "EUR", - } + spec = self._make_spec(unit={"kind": "currency", "code": "EUR"}) col = self._make_col(unit={"kind": "percentage"}) with caplog.at_level( logging.WARNING, @@ -2788,10 +2778,8 @@ def test_metric_level_structured_matching_column_unit_no_warning(self, caplog): # When metric-level structured equals columns[].unit, no warning fires. import logging - spec = self._make_spec() - spec.unit_structured = MagicMock() same = {"kind": "currency", "code": "USD"} - spec.unit_structured.model_dump = lambda: dict(same) + spec = self._make_spec(unit=dict(same)) col = self._make_col(unit=dict(same)) with caplog.at_level( logging.WARNING, @@ -2807,11 +2795,12 @@ def test_metric_level_structured_warns_when_combined_with_legacy(self, caplog): import logging spec = self._make_spec(unit=MetricUnit.DOLLAR) - spec.unit_structured = MagicMock() - spec.unit_structured.model_dump = lambda: { - "kind": "currency", - "code": "EUR", - } + # Override the unit_structured field directly to simulate "both + # fields set" — production code wouldn't normally set both at once, + # but the reconcile path must handle it gracefully. + from datajunction_server.models.unit import AtomicUnit, UnitKind + + spec.unit_structured = AtomicUnit(kind=UnitKind.CURRENCY, code="EUR") col = self._make_col() with caplog.at_level( logging.WARNING, diff --git a/datajunction-server/tests/internal/legacy_unit_compat_test.py b/datajunction-server/tests/internal/legacy_unit_compat_test.py index f64fac10f..c76f4987f 100644 --- a/datajunction-server/tests/internal/legacy_unit_compat_test.py +++ b/datajunction-server/tests/internal/legacy_unit_compat_test.py @@ -234,7 +234,7 @@ def test_resolve_metric_unit_for_spec_structured_only(self, col_unit): assert legacy is None assert structured == col_unit - def test_uppercase_legacy_input_accepted(self): + def test_uppercase_legacy_input_accepted(self): # noqa: ANN201 """Legacy parser accepts case-insensitive input (existing behavior).""" spec = MetricSpec( name="legacy_compat.m", @@ -242,3 +242,176 @@ def test_uppercase_legacy_input_accepted(self): unit="DOLLAR", ) assert spec.unit_enum == MetricUnit.DOLLAR + + +class TestUnknownDoesNotLeak: + """ + Code-review fix #1: `MetricUnit.UNKNOWN` is the DB-column default for + any metric that never had a unit authored. It must not leak as + `unit: unknown` in YAML re-exports or any output surface. + """ + + def test_metric_spec_unit_property_treats_unknown_as_none(self): + spec = MetricSpec( + name="m", + query="SELECT 1", + unit=MetricUnit.UNKNOWN, + ) + # unit_enum is UNKNOWN internally, but the public `unit` view + # returns None — UNKNOWN is the legacy "no unit" sentinel. + assert spec.unit_enum == MetricUnit.UNKNOWN + assert spec.unit is None + + def test_yaml_export_does_not_emit_unknown(self): + """Round-trip: a metric with the UNKNOWN sentinel must not produce + `unit: unknown` in YAML.""" + from datajunction_server.internal.namespaces import node_spec_to_yaml + + spec = MetricSpec( + name="m", + query="SELECT 1", + unit=MetricUnit.UNKNOWN, + ) + output = node_spec_to_yaml(spec) + assert "unit:" not in output + assert "unknown" not in output + + +class TestEqIgnoresInputShape: + """ + Code-review fix #2: __eq__ must compare units by canonical form so + `unit: dollar` (legacy) and `unit: {kind: currency, code: USD}` + (structured) are equal for the same conceptual unit — preventing + spurious drift detection on redeploy / git-sync. + """ + + def test_legacy_and_structured_usd_compare_equal(self): + a = MetricSpec(name="m", query="SELECT 1", unit="dollar") + b = MetricSpec( + name="m", + query="SELECT 1", + unit={"kind": "currency", "code": "USD"}, + ) + assert a == b + + def test_legacy_percentage_equals_structured_percentage(self): + a = MetricSpec(name="m", query="SELECT 1", unit="percentage") + b = MetricSpec(name="m", query="SELECT 1", unit={"kind": "percentage"}) + assert a == b + + def test_different_currencies_not_equal(self): + a = MetricSpec(name="m", query="SELECT 1", unit="dollar") + b = MetricSpec( + name="m", + query="SELECT 1", + unit={"kind": "currency", "code": "EUR"}, + ) + assert a != b + + def test_no_unit_and_unknown_unit_compare_equal(self): + """UNKNOWN is the legacy "no unit" sentinel; treat it as equivalent + to no unit set on the other side.""" + a = MetricSpec(name="m", query="SELECT 1") + b = MetricSpec(name="m", query="SELECT 1", unit=MetricUnit.UNKNOWN) + assert a == b + + +class TestUnitToDictCanonicalShape: + """ + Code-review fix #3: unit_to_dict produces a stable JSON-friendly dict + that drops None-valued keys, so storage is byte-identical regardless + of whether the input came via the structured (model_dump) path or the + legacy translation table. + """ + + def test_unitless_atomic_drops_code_key(self): + from datajunction_server.models.unit import ( + AtomicUnit, + UnitKind, + unit_to_dict, + ) + + u = AtomicUnit(kind=UnitKind.UNITLESS) + # model_dump() default would emit {'kind': UnitKind.UNITLESS, 'code': None}. + # unit_to_dict yields a JSON-friendly, none-stripped form matching + # what the legacy translation table stores. + assert unit_to_dict(u) == {"kind": "unitless"} + + def test_currency_with_code(self): + from datajunction_server.models.unit import ( + AtomicUnit, + UnitKind, + unit_to_dict, + ) + + assert unit_to_dict( + AtomicUnit(kind=UnitKind.CURRENCY, code="USD"), + ) == {"kind": "currency", "code": "USD"} + + def test_dict_input_passes_through_with_none_stripped(self): + from datajunction_server.models.unit import unit_to_dict + + assert unit_to_dict({"kind": "unitless", "code": None}) == {"kind": "unitless"} + assert unit_to_dict( + {"kind": "currency", "code": "USD"}, + ) == {"kind": "currency", "code": "USD"} + + def test_none_returns_none(self): + from datajunction_server.models.unit import unit_to_dict + + assert unit_to_dict(None) is None + + def test_storage_paths_produce_identical_shape(self): + """The legacy-translation path and the structured-input path must + write the same JSONB shape for a conceptually identical unit.""" + from datajunction_server.models.unit import ( + AtomicUnit, + UnitKind, + legacy_unit_to_structured, + unit_to_dict, + ) + + legacy_shape = legacy_unit_to_structured(MetricUnit.UNITLESS) + structured_shape = unit_to_dict(AtomicUnit(kind=UnitKind.UNITLESS)) + assert legacy_shape == structured_shape + + +class TestBadDictErrorUx: + """ + Code-review fix #4: a malformed structured `unit` dict must surface as + a clean `DJInvalidInputException` echoing the offending input, not a + raw Pydantic ValidationError that leaks internal field names. + """ + + @pytest.mark.parametrize( + "bad_unit", + [ + {"kind": "bogus"}, # bad kind + {"kind": "time", "code": "seconds"}, # bad time code + {"kind": "currency", "code": "usd"}, # lowercase currency + {"foo": "bar"}, # no kind / numerator + ], + ) + def test_bad_dict_raises_djinvalid_input(self, bad_unit): + from datajunction_server.errors import DJInvalidInputException + + with pytest.raises(DJInvalidInputException) as exc_info: + MetricSpec(name="m", query="SELECT 1", unit=bad_unit) + # Message echoes the offending dict so users know what they sent. + assert "Invalid metric unit" in str(exc_info.value) + # Internal field name must not leak. + assert "unit_structured" not in str(exc_info.value) + + +class TestFalsyInputGate: + """ + Code-review fix #5: empty string / empty dict must be tolerated as + "no unit authored", matching the historical `if unit:` permissive gate. + """ + + @pytest.mark.parametrize("falsy", [None, "", {}]) + def test_falsy_unit_treated_as_unset(self, falsy): + spec = MetricSpec(name="m", query="SELECT 1", unit=falsy) + assert spec.unit_enum is None + assert spec.unit_structured is None + assert spec.unit is None From a02b1b30791720b0b3ab5e8f77080b5aa4293b54 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Sun, 24 May 2026 22:14:34 -0700 Subject: [PATCH 07/11] Reduce func side effect --- .../internal/deployment/orchestrator.py | 49 ++++++------- .../internal/deployment/orchestration_test.py | 73 ++++++++----------- .../tests/internal/legacy_unit_compat_test.py | 28 ++++--- 3 files changed, 71 insertions(+), 79 deletions(-) diff --git a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py index 50f8a5aa6..1f48d23bc 100644 --- a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py +++ b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py @@ -3528,9 +3528,13 @@ async def _create_node_revision( # Bridge legacy metric_metadata.unit and structured # columns[0].unit so users on either input shape end up with # both fields populated where expressible. See - # `_reconcile_metric_unit` for the full precedence rules. + # `_resolve_metric_unit` for the full precedence rules. output_col = new_revision.columns[0] if new_revision.columns else None - self._reconcile_metric_unit(metric_spec, output_col) + if output_col is not None: + output_col.unit = self._resolve_metric_unit( + metric_spec, + output_col.unit, + ) legacy_unit = self._derive_legacy_unit_for_storage( metric_spec, @@ -3564,40 +3568,38 @@ async def _create_node_revision( new_revision.required_dimensions = required_dimensions return new_revision - def _reconcile_metric_unit( + def _resolve_metric_unit( self, metric_spec: MetricSpec, - output_col: Column | None, - ) -> None: + column_unit: dict | None, + ) -> dict | None: """ - Resolve the canonical structured unit for a metric's output column - from the three possible input surfaces and write it to - `output_col.unit`. + Compute the canonical structured unit dict for a metric's output + column, reconciling the three possible input surfaces. Pure: + returns the value the caller should assign; no mutation. Input surfaces (highest priority first): 1. `metric_spec.unit_structured` — top-level structured `unit:` at the metric spec level. Authored shape for the new model. - 2. `output_col.unit` — structured value set via the explicit + 2. `column_unit` — structured value set via the explicit `columns[].unit` form. Supported for uniformity with non-metric nodes, but unusual on metrics (where the column name is auto-derived). 3. `metric_spec.unit_enum` — legacy flat-string `unit: dollar` form. Translated via the legacy → structured table. - Conflict handling: if (1) is set together with (2), a warning is - logged naming the node; the metric-level structured value wins. - Combining (1) or (2) with (3) is normal — the structured value wins - and the legacy field becomes redundant scaffolding. - """ - if output_col is None: - return # pragma: no cover + Conflict handling: if (1) is set together with (2) or (3) and the + values disagree, a warning is logged naming the node. + Returns the canonical dict shape (None if no unit is set on any + surface). + """ legacy = metric_spec.unit_enum spec_structured = unit_to_dict(metric_spec.unit_structured) # column.unit comes from JSONB as a plain dict; canonicalize so a # hand-rolled or legacy-translated value compares equal to the # spec_structured shape regardless of code: None presence. - column_structured = unit_to_dict(output_col.unit) + column_structured = unit_to_dict(column_unit) # (1) Metric-level structured input wins absolutely. if spec_structured is not None: @@ -3618,8 +3620,7 @@ def _reconcile_metric_unit( spec_structured, legacy.name, ) - output_col.unit = spec_structured - return + return spec_structured # (2) columns[].unit fallback. if column_structured is not None: @@ -3631,12 +3632,10 @@ def _reconcile_metric_unit( legacy.name, column_structured, ) - return + return column_structured # (3) Legacy translation. - translated = legacy_unit_to_structured(legacy) - if translated is not None: - output_col.unit = translated + return legacy_unit_to_structured(legacy) def _derive_legacy_unit_for_storage( self, @@ -3660,8 +3659,8 @@ def _derive_legacy_unit_for_storage( name = structured_to_legacy_unit_name(structured) return MetricUnit[name] if name is not None else None # No structured unit — fall back to whatever the legacy spec field - # had (typically None at this point because _reconcile_metric_unit - # would have copied it onto the column). + # had (typically None at this point because the caller will have + # already assigned the resolved column.unit from _resolve_metric_unit). return metric_spec.unit_enum def _create_column_from_spec( diff --git a/datajunction-server/tests/internal/deployment/orchestration_test.py b/datajunction-server/tests/internal/deployment/orchestration_test.py index 27431cad6..dac80347b 100644 --- a/datajunction-server/tests/internal/deployment/orchestration_test.py +++ b/datajunction-server/tests/internal/deployment/orchestration_test.py @@ -2682,7 +2682,7 @@ async def test_auto_register_sources_uses_sqlalchemy_for_system_catalog( class TestReconcileMetricUnit: """ - Tests for `_reconcile_metric_unit` and `_derive_legacy_unit_for_storage`. + Tests for `_resolve_metric_unit` and `_derive_legacy_unit_for_storage`. These bridge the legacy `MetricSpec.unit_enum` (from YAML `unit: dollar` or API `metric_metadata.unit`) and the structured `column.unit` so users on either input shape end up with consistent storage. @@ -2692,11 +2692,7 @@ class TestReconcileMetricUnit: """ @staticmethod - def _make_col(unit: dict | None = None) -> Column: - return Column(name="value", type=None, order=0, unit=unit) - - @staticmethod - def _make_spec(unit: MetricUnit | None = None) -> MetricSpec: + def _make_spec(unit: MetricUnit | dict | None = None) -> MetricSpec: kwargs: dict = { "name": "m", "namespace": "default", @@ -2706,72 +2702,65 @@ def _make_spec(unit: MetricUnit | None = None) -> MetricSpec: kwargs["unit"] = unit return MetricSpec(**kwargs) - def test_legacy_only_writes_through_to_column(self): + @staticmethod + def _resolve(spec: MetricSpec, column_unit: dict | None) -> dict | None: + return DeploymentOrchestrator._resolve_metric_unit( + MagicMock(), + spec, + column_unit, + ) + + def test_legacy_only_resolves_to_translated_dict(self): spec = self._make_spec(unit=MetricUnit.DOLLAR) - col = self._make_col() - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit == {"kind": "currency", "code": "USD"} + assert self._resolve(spec, None) == {"kind": "currency", "code": "USD"} - def test_structured_only_left_untouched(self): + def test_structured_only_resolves_to_column_unit(self): spec = self._make_spec() # no legacy unit - col = self._make_col(unit={"kind": "percentage"}) - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit == {"kind": "percentage"} + assert self._resolve(spec, {"kind": "percentage"}) == {"kind": "percentage"} def test_both_set_structured_wins_and_warns(self, caplog): import logging spec = self._make_spec(unit=MetricUnit.DOLLAR) - col = self._make_col(unit={"kind": "currency", "code": "EUR"}) with caplog.at_level( logging.WARNING, logger="datajunction_server.internal.deployment.orchestrator", ): - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit == {"kind": "currency", "code": "EUR"} + resolved = self._resolve(spec, {"kind": "currency", "code": "EUR"}) + assert resolved == {"kind": "currency", "code": "EUR"} assert any( "sets both metric_metadata.unit" in rec.message for rec in caplog.records ) - def test_neither_set_leaves_both_unset(self): + def test_neither_set_resolves_to_none(self): spec = self._make_spec() - col = self._make_col() - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit is None + assert self._resolve(spec, None) is None - def test_unknown_legacy_does_not_overwrite(self): - # MetricUnit.UNKNOWN translates to None — should not set column.unit. + def test_unknown_legacy_resolves_to_none(self): + # MetricUnit.UNKNOWN translates to None — should not produce any unit. spec = self._make_spec(unit=MetricUnit.UNKNOWN) - col = self._make_col() - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit is None + assert self._resolve(spec, None) is None def test_unitless_legacy_preserves_distinction(self): # UNITLESS → {kind: unitless} (explicitly no unit), not None. spec = self._make_spec(unit=MetricUnit.UNITLESS) - col = self._make_col() - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit == {"kind": "unitless"} + assert self._resolve(spec, None) == {"kind": "unitless"} def test_metric_level_structured_overrides_legacy(self): spec = self._make_spec(unit={"kind": "currency", "code": "EUR"}) spec.unit_enum = MetricUnit.DOLLAR # ignored when structured is set - col = self._make_col() - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit == {"kind": "currency", "code": "EUR"} + assert self._resolve(spec, None) == {"kind": "currency", "code": "EUR"} def test_metric_level_structured_overrides_column_unit_with_warning(self, caplog): import logging spec = self._make_spec(unit={"kind": "currency", "code": "EUR"}) - col = self._make_col(unit={"kind": "percentage"}) with caplog.at_level( logging.WARNING, logger="datajunction_server.internal.deployment.orchestrator", ): - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - # Metric-level wins. - assert col.unit == {"kind": "currency", "code": "EUR"} + resolved = self._resolve(spec, {"kind": "percentage"}) + assert resolved == {"kind": "currency", "code": "EUR"} assert any("metric-level value wins" in rec.message for rec in caplog.records) def test_metric_level_structured_matching_column_unit_no_warning(self, caplog): @@ -2780,13 +2769,12 @@ def test_metric_level_structured_matching_column_unit_no_warning(self, caplog): same = {"kind": "currency", "code": "USD"} spec = self._make_spec(unit=dict(same)) - col = self._make_col(unit=dict(same)) with caplog.at_level( logging.WARNING, logger="datajunction_server.internal.deployment.orchestrator", ): - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit == same + resolved = self._resolve(spec, dict(same)) + assert resolved == same assert not any( "metric-level value wins" in rec.message for rec in caplog.records ) @@ -2797,17 +2785,16 @@ def test_metric_level_structured_warns_when_combined_with_legacy(self, caplog): spec = self._make_spec(unit=MetricUnit.DOLLAR) # Override the unit_structured field directly to simulate "both # fields set" — production code wouldn't normally set both at once, - # but the reconcile path must handle it gracefully. + # but the resolver must handle it gracefully. from datajunction_server.models.unit import AtomicUnit, UnitKind spec.unit_structured = AtomicUnit(kind=UnitKind.CURRENCY, code="EUR") - col = self._make_col() with caplog.at_level( logging.WARNING, logger="datajunction_server.internal.deployment.orchestrator", ): - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - assert col.unit == {"kind": "currency", "code": "EUR"} + resolved = self._resolve(spec, None) + assert resolved == {"kind": "currency", "code": "EUR"} assert any( "structured value wins" in rec.message and "legacy" in rec.message for rec in caplog.records diff --git a/datajunction-server/tests/internal/legacy_unit_compat_test.py b/datajunction-server/tests/internal/legacy_unit_compat_test.py index c76f4987f..f63d2b8d6 100644 --- a/datajunction-server/tests/internal/legacy_unit_compat_test.py +++ b/datajunction-server/tests/internal/legacy_unit_compat_test.py @@ -76,19 +76,25 @@ def test_other_metric_fields_preserved(self, spec: MetricSpec): assert spec.direction.value == "higher_is_better" assert spec.significant_digits == 2 - def test_orchestrator_reconciles_legacy_onto_column(self, spec: MetricSpec): - """The reconcile step writes the structured value to columns[0].unit.""" - col = _make_col() - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) - # The legacy DOLLAR gets translated into the structured form on the column. - assert col.unit == {"kind": "currency", "code": "USD"} + def test_orchestrator_resolves_legacy_to_structured(self, spec: MetricSpec): + """The resolver translates the legacy DOLLAR into the structured form.""" + resolved = DeploymentOrchestrator._resolve_metric_unit( + MagicMock(), + spec, + None, + ) + assert resolved == {"kind": "currency", "code": "USD"} def test_orchestrator_derives_legacy_for_storage(self, spec: MetricSpec): - """After reconcile, the derive step returns MetricUnit.DOLLAR so the legacy - DB column is dual-written. This is what protects API consumers reading - `metric_metadata.unit`.""" - col = _make_col() - DeploymentOrchestrator._reconcile_metric_unit(MagicMock(), spec, col) + """After resolution, the derive step returns MetricUnit.DOLLAR so the + legacy DB column is dual-written. This is what protects API consumers + reading `metric_metadata.unit`.""" + resolved = DeploymentOrchestrator._resolve_metric_unit( + MagicMock(), + spec, + None, + ) + col = _make_col(unit=resolved) legacy_enum = DeploymentOrchestrator._derive_legacy_unit_for_storage( MagicMock(), spec, From 4afb35254fc233355839c1fc451932d5a385149c Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Sun, 24 May 2026 22:53:52 -0700 Subject: [PATCH 08/11] Fix --- .../datajunction_server/database/column.py | 12 ++- .../datajunction_server/database/node.py | 20 ++-- .../datajunction_server/models/metric.py | 5 +- .../datajunction_server/models/node.py | 8 ++ .../datajunction_server/models/unit.py | 64 +++++++++++-- datajunction-server/tests/models/unit_test.py | 92 +++++++++++++++++++ 6 files changed, 184 insertions(+), 17 deletions(-) diff --git a/datajunction-server/datajunction_server/database/column.py b/datajunction-server/datajunction_server/database/column.py index 56887c9b7..f34ade845 100644 --- a/datajunction-server/datajunction_server/database/column.py +++ b/datajunction-server/datajunction_server/database/column.py @@ -3,7 +3,6 @@ from typing import TYPE_CHECKING, List, Optional, Tuple from sqlalchemy import BigInteger, ForeignKey, Index, Integer, String -from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import Mapped, mapped_column, relationship from datajunction_server.database.attributetype import ColumnAttribute @@ -11,6 +10,7 @@ from datajunction_server.models.attribute import ColumnAttributes from datajunction_server.models.base import labelize from datajunction_server.models.column import ColumnTypeDecorator +from datajunction_server.models.unit import Unit, UnitTypeDecorator from datajunction_server.sql.parsing.types import ColumnType if TYPE_CHECKING: @@ -46,9 +46,13 @@ class Column(Base): # type: ignore type: Mapped[Optional[ColumnType]] = mapped_column(ColumnTypeDecorator) description: Mapped[Optional[str]] = mapped_column() - # Structured unit metadata. JSONB to accommodate atomic and compound shapes. - # Validated at the model layer via datajunction_server.models.unit.Unit. - unit: Mapped[Optional[dict]] = mapped_column(JSONB, nullable=True) + # Structured unit metadata. The TypeDecorator validates JSONB into a + # typed `Unit` (AtomicUnit | CompoundUnit) on read and canonicalizes + # Unit-or-dict input back to JSONB on write. + unit: Mapped[Optional[Unit]] = mapped_column( + UnitTypeDecorator, + nullable=True, + ) dimension_id: Mapped[Optional[int]] = mapped_column( ForeignKey("node.id", ondelete="SET NULL", name="fk_column_dimension_id_node"), diff --git a/datajunction-server/datajunction_server/database/node.py b/datajunction-server/datajunction_server/database/node.py index d0d43cdf1..ca2ff9886 100644 --- a/datajunction-server/datajunction_server/database/node.py +++ b/datajunction-server/datajunction_server/database/node.py @@ -176,13 +176,17 @@ def _build_search_score( def _resolve_metric_unit_for_spec( - col_unit: dict | None, + col_unit: "Any | None", legacy_from_md: "Any | None", ) -> "Tuple[Any | None, dict | None]": """ Decide which of (legacy enum, structured dict) to populate on a MetricSpec when round-tripping a metric back from the DB. + Accepts `col_unit` as either a `Unit` Pydantic instance (the typed shape + returned by `UnitTypeDecorator`) or a plain dict (in-memory test + fixtures, or untyped paths). Internally normalizes via `unit_to_dict`. + Rules (preserves authoring intent on round-trip): - No structured `column.unit` → emit only the legacy field (whatever was in metric_metadata.unit), structured stays None. @@ -194,14 +198,18 @@ def _resolve_metric_unit_for_spec( count-with-code, data_size) → populate structured, null the legacy so nothing tries to dual-emit. - Returns (legacy_for_spec, structured_for_spec). + Returns (legacy_for_spec, structured_for_spec_dict). """ - from datajunction_server.models.unit import structured_to_legacy_unit_name + from datajunction_server.models.unit import ( + structured_to_legacy_unit_name, + unit_to_dict, + ) - if col_unit is None: + col_unit_dict = unit_to_dict(col_unit) + if col_unit_dict is None: return legacy_from_md, None - if structured_to_legacy_unit_name(col_unit) is None: - return None, col_unit + if structured_to_legacy_unit_name(col_unit_dict) is None: + return None, col_unit_dict return legacy_from_md, None diff --git a/datajunction-server/datajunction_server/models/metric.py b/datajunction-server/datajunction_server/models/metric.py index cfe1e2e65..ec690966a 100644 --- a/datajunction-server/datajunction_server/models/metric.py +++ b/datajunction-server/datajunction_server/models/metric.py @@ -16,6 +16,7 @@ ) from datajunction_server.models.query import ColumnMetadata, V3ColumnMetadata from datajunction_server.models.sql import ScanEstimate, TranspiledSQL +from datajunction_server.models.unit import unit_to_dict from datajunction_server.sql.decompose import MetricComponentExtractor from datajunction_server.sql.parsing.backends.antlr4 import ast, parse from datajunction_server.transpilation import transpile_sql @@ -93,7 +94,9 @@ async def parse_node( expression=str(query_ast.select.projection[0]), dimensions=dims, metric_metadata=node.current.metric_metadata, - unit=(node.current.columns[0].unit if node.current.columns else None), + unit=unit_to_dict( + node.current.columns[0].unit if node.current.columns else None, + ), required_dimensions=[dim.name for dim in node.current.required_dimensions], incompatible_druid_functions=incompatible_druid_functions, measures=measures, diff --git a/datajunction-server/datajunction_server/models/node.py b/datajunction-server/datajunction_server/models/node.py index f59ee4aea..2359721d9 100644 --- a/datajunction-server/datajunction_server/models/node.py +++ b/datajunction-server/datajunction_server/models/node.py @@ -793,6 +793,14 @@ class ColumnOutput(BaseModel): def extract_type(cls, raw): return str(raw) + @field_validator("unit", mode="before") + def _canonicalize_unit(cls, raw): + """Accept either a typed Unit (from the SQLAlchemy decorator) or a + raw dict; emit the canonical JSON-friendly dict shape on output.""" + from datajunction_server.models.unit import unit_to_dict + + return unit_to_dict(raw) + class SourceColumnOutput(BaseModel): """ diff --git a/datajunction-server/datajunction_server/models/unit.py b/datajunction-server/datajunction_server/models/unit.py index 41e12a08c..137154fa2 100644 --- a/datajunction-server/datajunction_server/models/unit.py +++ b/datajunction-server/datajunction_server/models/unit.py @@ -13,8 +13,12 @@ from typing import TYPE_CHECKING, Annotated, Any, Union from pydantic import BaseModel, Discriminator, Tag, model_validator +from sqlalchemy import TypeDecorator +from sqlalchemy.dialects.postgresql import JSONB if TYPE_CHECKING: + from pydantic import TypeAdapter + from datajunction_server.models.node import MetricUnit @@ -240,6 +244,51 @@ def _strip_none(value: Any) -> Any: return value +# Type adapter used by the SQLAlchemy TypeDecorator to validate JSONB rows +# into Unit instances on read. Built once at module load. +_UNIT_ADAPTER: "TypeAdapter[Unit]" = None # type: ignore[assignment] + + +def _get_unit_adapter(): + """Lazy accessor — defer TypeAdapter construction until first use so + importing `unit.py` stays cheap and avoids any circular-import risk.""" + global _UNIT_ADAPTER + if _UNIT_ADAPTER is None: + from pydantic import TypeAdapter + + _UNIT_ADAPTER = TypeAdapter(Unit) + return _UNIT_ADAPTER + + +class UnitTypeDecorator(TypeDecorator): + """ + SQLAlchemy TypeDecorator that bridges the structured ``Unit`` pydantic + model and the underlying JSONB storage for ``column.unit``. + + On write: accepts a ``Unit`` instance or a raw dict and normalizes to + the canonical JSONB shape via ``unit_to_dict``. + + On read: validates the JSONB dict into a ``Unit`` (``AtomicUnit`` or + ``CompoundUnit``) so callers get typed access — ``.kind``, ``.code``, + ``.label()`` — without stringly-typed dict lookups. + + Mirrors the existing ``ColumnTypeDecorator`` pattern. Cheaper than the + SQL-type variant because the decoded value is small and Pydantic 2's + Rust-backed validator handles it in microseconds. + """ + + impl = JSONB + cache_ok = True + + def process_bind_param(self, value, dialect): + return unit_to_dict(value) + + def process_result_value(self, value, dialect): + if value is None: + return None + return _get_unit_adapter().validate_python(value) + + # ------------------------------------------------------------------------- # Legacy <-> structured translation. # @@ -301,18 +350,21 @@ def legacy_unit_to_structured( return _LEGACY_NAME_TO_STRUCTURED.get(legacy.name) -def structured_to_legacy_unit_name(unit: dict | None) -> str | None: +def structured_to_legacy_unit_name( + unit: "AtomicUnit | CompoundUnit | dict | None", +) -> str | None: """ - Translate a structured `Unit` dict back to the legacy `MetricUnit.name` - when expressible. Returns None when the structured value has no legacy - equivalent (non-USD currencies, compound units, data sizes, count with - code, etc.) — callers should treat that as "don't populate the legacy - column." + Translate a structured `Unit` (Pydantic model or already-dict) back to + the legacy `MetricUnit.name` when expressible. Returns None when the + structured value has no legacy equivalent (non-USD currencies, compound + units, data sizes, count with code, etc.) — callers should treat that + as "don't populate the legacy column." Returns the enum member name (e.g. "DOLLAR"), not a MetricUnit instance, so this module stays import-free of node.py. Callers do `MetricUnit[name]` at the call site. """ + unit = unit_to_dict(unit) if unit is None: return None # Compound units have no legacy equivalent. diff --git a/datajunction-server/tests/models/unit_test.py b/datajunction-server/tests/models/unit_test.py index 0a9e582b3..c55f281ba 100644 --- a/datajunction-server/tests/models/unit_test.py +++ b/datajunction-server/tests/models/unit_test.py @@ -211,6 +211,98 @@ def test_column_spec_round_trip_via_dump(self) -> None: assert roundtripped == spec +class TestUnitTypeDecorator: + """ + The SQLAlchemy TypeDecorator converts between JSONB dicts and Unit + pydantic instances at the ORM boundary. Read paths return typed + `Unit`; write paths accept either `Unit` or `dict` and canonicalize. + """ + + def test_process_bind_param_accepts_unit_model(self) -> None: + from datajunction_server.models.unit import ( + AtomicUnit, + UnitKind, + UnitTypeDecorator, + ) + + dec = UnitTypeDecorator() + u = AtomicUnit(kind=UnitKind.CURRENCY, code="USD") + assert dec.process_bind_param(u, None) == { + "kind": "currency", + "code": "USD", + } + + def test_process_bind_param_accepts_raw_dict(self) -> None: + from datajunction_server.models.unit import UnitTypeDecorator + + dec = UnitTypeDecorator() + assert dec.process_bind_param( + {"kind": "unitless", "code": None}, + None, + ) == {"kind": "unitless"} + + def test_process_bind_param_none_passthrough(self) -> None: + from datajunction_server.models.unit import UnitTypeDecorator + + dec = UnitTypeDecorator() + assert dec.process_bind_param(None, None) is None + + def test_process_result_value_atomic(self) -> None: + from datajunction_server.models.unit import ( + AtomicUnit, + UnitKind, + UnitTypeDecorator, + ) + + dec = UnitTypeDecorator() + result = dec.process_result_value( + {"kind": "currency", "code": "USD"}, + None, + ) + assert isinstance(result, AtomicUnit) + assert result.kind == UnitKind.CURRENCY + assert result.code == "USD" + + def test_process_result_value_compound(self) -> None: + from datajunction_server.models.unit import ( + CompoundUnit, + UnitTypeDecorator, + ) + + dec = UnitTypeDecorator() + result = dec.process_result_value( + { + "numerator": {"kind": "count"}, + "denominator": {"kind": "time", "code": "s"}, + }, + None, + ) + assert isinstance(result, CompoundUnit) + assert result.numerator.code is None + assert result.denominator.code == "s" + + def test_process_result_value_none_passthrough(self) -> None: + from datajunction_server.models.unit import UnitTypeDecorator + + dec = UnitTypeDecorator() + assert dec.process_result_value(None, None) is None + + def test_round_trip_idempotent(self) -> None: + """write(read(write(x))) == write(x) — canonical shape is stable.""" + from datajunction_server.models.unit import ( + AtomicUnit, + UnitKind, + UnitTypeDecorator, + ) + + dec = UnitTypeDecorator() + original = AtomicUnit(kind=UnitKind.UNITLESS) + wire1 = dec.process_bind_param(original, None) + unit_back = dec.process_result_value(wire1, None) + wire2 = dec.process_bind_param(unit_back, None) + assert wire1 == wire2 == {"kind": "unitless"} + + class TestLegacyUnitTranslation: """ Coverage for `legacy_unit_to_structured` and `structured_to_legacy_unit_name`. From 45224b9254ae3af70670c2fbbf0d5a5cd15455d1 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 25 May 2026 19:57:31 -0700 Subject: [PATCH 09/11] Clean up --- .../datajunction_server/database/node.py | 44 +++---- .../internal/deployment/orchestrator.py | 53 ++++---- .../datajunction_server/internal/nodes.py | 1 + .../datajunction_server/models/deployment.py | 18 +-- .../datajunction_server/models/node.py | 12 +- .../datajunction_server/models/unit.py | 117 ++++++++++-------- .../internal/deployment/orchestration_test.py | 15 ++- .../tests/internal/legacy_unit_compat_test.py | 28 +++-- datajunction-server/tests/models/unit_test.py | 16 +-- 9 files changed, 174 insertions(+), 130 deletions(-) diff --git a/datajunction-server/datajunction_server/database/node.py b/datajunction-server/datajunction_server/database/node.py index ca2ff9886..f55cf3a85 100644 --- a/datajunction-server/datajunction_server/database/node.py +++ b/datajunction-server/datajunction_server/database/node.py @@ -74,6 +74,10 @@ ) from datajunction_server.models.node_type import NodeType from datajunction_server.models.partition import PartitionType +from datajunction_server.models.unit import ( + _get_unit_adapter, + structured_to_legacy_unit, +) from datajunction_server.naming import amenable_name from datajunction_server.typing import UTCDatetime from datajunction_server.utils import SEPARATOR, execute_with_retry @@ -178,38 +182,34 @@ def _build_search_score( def _resolve_metric_unit_for_spec( col_unit: "Any | None", legacy_from_md: "Any | None", -) -> "Tuple[Any | None, dict | None]": +) -> "Tuple[Any | None, Any | None]": """ - Decide which of (legacy enum, structured dict) to populate on a MetricSpec + Decide which of (legacy enum, structured Unit) to populate on a MetricSpec when round-tripping a metric back from the DB. - Accepts `col_unit` as either a `Unit` Pydantic instance (the typed shape - returned by `UnitTypeDecorator`) or a plain dict (in-memory test - fixtures, or untyped paths). Internally normalizes via `unit_to_dict`. - Rules (preserves authoring intent on round-trip): - No structured `column.unit` → emit only the legacy field (whatever was - in metric_metadata.unit), structured stays None. + in metric_metadata.unit); structured stays None. - Structured `column.unit` is legacy-expressible (USD, percentage, time - codes, etc.) → keep the legacy field as authoritative so `unit: dollar` - round-trips as `unit: dollar`, not `unit: {kind: currency, code: USD}`. - Structured stays None. + codes, BYTE) → keep the legacy field as authoritative so `unit: dollar` + round-trips as `unit: dollar`, not the structured shape. - Structured `column.unit` is NOT legacy-expressible (EUR, compound, - count-with-code, data_size) → populate structured, null the legacy so - nothing tries to dual-emit. + count-with-code, other data sizes) → populate structured, null the + legacy so nothing tries to dual-emit. - Returns (legacy_for_spec, structured_for_spec_dict). - """ - from datajunction_server.models.unit import ( - structured_to_legacy_unit_name, - unit_to_dict, - ) + Accepts either a `Unit` Pydantic instance (the typed shape returned by + `UnitTypeDecorator`) or a plain dict (in-memory test paths) and + normalizes via the same TypeAdapter the decorator uses on read. - col_unit_dict = unit_to_dict(col_unit) - if col_unit_dict is None: + Returns (legacy_for_spec, structured_for_spec). The structured value is + a Unit instance; `MetricSpec.unit_structured` accepts it directly. + """ + if col_unit is None: return legacy_from_md, None - if structured_to_legacy_unit_name(col_unit_dict) is None: - return None, col_unit_dict + if isinstance(col_unit, dict): + col_unit = _get_unit_adapter().validate_python(col_unit) + if structured_to_legacy_unit(col_unit) is None: + return None, col_unit return legacy_from_md, None diff --git a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py index 1f48d23bc..c3c1ee0d1 100644 --- a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py +++ b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py @@ -87,9 +87,11 @@ NodeType, ) from datajunction_server.models.unit import ( + AtomicUnit, + CompoundUnit, + _get_unit_adapter, legacy_unit_to_structured, - structured_to_legacy_unit_name, - unit_to_dict, + structured_to_legacy_unit, ) from datajunction_server.utils import ( SEPARATOR, @@ -3522,15 +3524,14 @@ async def _create_node_revision( if result.spec.node_type == NodeType.METRIC: metric_spec = cast(MetricSpec, result.spec) + output_col: Column | None = None if new_revision.columns: # pragma: no branch - new_revision.columns[0].display_name = new_revision.display_name - - # Bridge legacy metric_metadata.unit and structured - # columns[0].unit so users on either input shape end up with - # both fields populated where expressible. See - # `_resolve_metric_unit` for the full precedence rules. - output_col = new_revision.columns[0] if new_revision.columns else None - if output_col is not None: + output_col = new_revision.columns[0] + output_col.display_name = new_revision.display_name + # Bridge legacy metric_metadata.unit and structured + # columns[0].unit so users on either input shape end up + # with both fields populated where expressible. See + # `_resolve_metric_unit` for the precedence rules. output_col.unit = self._resolve_metric_unit( metric_spec, output_col.unit, @@ -3571,12 +3572,12 @@ async def _create_node_revision( def _resolve_metric_unit( self, metric_spec: MetricSpec, - column_unit: dict | None, - ) -> dict | None: + column_unit: "AtomicUnit | CompoundUnit | dict | None", + ) -> "AtomicUnit | CompoundUnit | None": """ - Compute the canonical structured unit dict for a metric's output - column, reconciling the three possible input surfaces. Pure: - returns the value the caller should assign; no mutation. + Compute the canonical structured unit for a metric's output column, + reconciling the three possible input surfaces. Pure: returns the + value the caller should assign to `column.unit`; no mutation. Input surfaces (highest priority first): 1. `metric_spec.unit_structured` — top-level structured `unit:` @@ -3591,15 +3592,18 @@ def _resolve_metric_unit( Conflict handling: if (1) is set together with (2) or (3) and the values disagree, a warning is logged naming the node. - Returns the canonical dict shape (None if no unit is set on any - surface). + Accepts dict for `column_unit` (in-memory test paths assign dicts + directly) and normalizes to Unit internally; returns Unit or None. """ legacy = metric_spec.unit_enum - spec_structured = unit_to_dict(metric_spec.unit_structured) - # column.unit comes from JSONB as a plain dict; canonicalize so a - # hand-rolled or legacy-translated value compares equal to the - # spec_structured shape regardless of code: None presence. - column_structured = unit_to_dict(column_unit) + spec_structured: AtomicUnit | CompoundUnit | None = metric_spec.unit_structured + # column.unit may arrive as Unit (typed via the SQLAlchemy decorator) + # or as a raw dict (test fixtures, fresh in-memory writes); normalize + # to a Unit for direct equality comparisons. + if isinstance(column_unit, dict): + column_structured = _get_unit_adapter().validate_python(column_unit) + else: + column_structured = column_unit # (1) Metric-level structured input wins absolutely. if spec_structured is not None: @@ -3656,8 +3660,7 @@ def _derive_legacy_unit_for_storage( """ structured = output_col.unit if output_col is not None else None if structured is not None: - name = structured_to_legacy_unit_name(structured) - return MetricUnit[name] if name is not None else None + return structured_to_legacy_unit(structured) # No structured unit — fall back to whatever the legacy spec field # had (typically None at this point because the caller will have # already assigned the resolved column.unit from _resolve_metric_unit). @@ -3675,7 +3678,7 @@ def _create_column_from_spec( display_name=col.display_name, description=col.description, order=order, - unit=unit_to_dict(col.unit), + unit=col.unit, attributes=[ ColumnAttribute( attribute_type=self.registry.attributes.get(attr), diff --git a/datajunction-server/datajunction_server/internal/nodes.py b/datajunction-server/datajunction_server/internal/nodes.py index d7361eac3..82fe3d16d 100644 --- a/datajunction-server/datajunction_server/internal/nodes.py +++ b/datajunction-server/datajunction_server/internal/nodes.py @@ -100,6 +100,7 @@ ) from datajunction_server.models.node_type import NodeType from datajunction_server.models.query import QueryCreate +from datajunction_server.models.unit import legacy_unit_to_structured from datajunction_server.service_clients import QueryServiceClient from datajunction_server.sql.dag import ( get_downstream_nodes, diff --git a/datajunction-server/datajunction_server/models/deployment.py b/datajunction-server/datajunction_server/models/deployment.py index 6730d6dc9..3a40a9eb4 100644 --- a/datajunction-server/datajunction_server/models/deployment.py +++ b/datajunction-server/datajunction_server/models/deployment.py @@ -524,18 +524,18 @@ def model_dump(self, **kwargs): # pragma: no cover base["unit"] = self.unit return base - def _canonical_unit(self) -> dict | None: + def _canonical_unit(self) -> "Unit | None": """ - Reduce both legacy and structured inputs to the same canonical dict - form for equality comparisons. Returns None when the metric has no - unit (or only the UNKNOWN sentinel), the structured dict otherwise. - Two specs that author the same conceptual unit via different input - shapes (`unit: dollar` vs `unit: {kind: currency, code: USD}`) will - produce identical canonical forms — so __eq__ doesn't falsely report - drift between YAML and DB-roundtripped specs. + Reduce both legacy and structured inputs to the same canonical Unit + instance for equality comparisons. Returns None when the metric has + no unit (or only the UNKNOWN sentinel). Two specs that author the + same conceptual unit via different input shapes (`unit: dollar` vs + `unit: {kind: currency, code: USD}`) produce equal frozen Unit + instances — so __eq__ doesn't falsely report drift between YAML and + DB-roundtripped specs. """ if self.unit_structured is not None: - return unit_to_dict(self.unit_structured) + return self.unit_structured if self.unit_enum is None or self.unit_enum == MetricUnit.UNKNOWN: return None return legacy_unit_to_structured(self.unit_enum) diff --git a/datajunction-server/datajunction_server/models/node.py b/datajunction-server/datajunction_server/models/node.py index 2359721d9..a7c288c9d 100644 --- a/datajunction-server/datajunction_server/models/node.py +++ b/datajunction-server/datajunction_server/models/node.py @@ -32,6 +32,11 @@ from datajunction_server.models.materialization import MaterializationConfigOutput from datajunction_server.models.node_type import NodeNameOutput, NodeType from datajunction_server.models.partition import PartitionOutput +from datajunction_server.models.unit import ( + AtomicUnit, + CompoundUnit, + unit_to_dict, +) from datajunction_server.models.tag import TagMinimum, TagOutput from datajunction_server.models.user import UserNameOnly from datajunction_server.sql.parsing.types import ColumnType @@ -794,11 +799,12 @@ def extract_type(cls, raw): return str(raw) @field_validator("unit", mode="before") - def _canonicalize_unit(cls, raw): + def _canonicalize_unit( + cls, + raw: "AtomicUnit | CompoundUnit | dict | None", + ) -> dict | None: """Accept either a typed Unit (from the SQLAlchemy decorator) or a raw dict; emit the canonical JSON-friendly dict shape on output.""" - from datajunction_server.models.unit import unit_to_dict - return unit_to_dict(raw) diff --git a/datajunction-server/datajunction_server/models/unit.py b/datajunction-server/datajunction_server/models/unit.py index 137154fa2..b4b259cbe 100644 --- a/datajunction-server/datajunction_server/models/unit.py +++ b/datajunction-server/datajunction_server/models/unit.py @@ -12,7 +12,7 @@ from enum import Enum from typing import TYPE_CHECKING, Annotated, Any, Union -from pydantic import BaseModel, Discriminator, Tag, model_validator +from pydantic import BaseModel, ConfigDict, Discriminator, Tag, model_validator from sqlalchemy import TypeDecorator from sqlalchemy.dialects.postgresql import JSONB @@ -101,6 +101,10 @@ class AtomicUnit(BaseModel): instance (currency code, time scale, count label, etc.). """ + # Frozen so shared singletons (e.g., the legacy translation table) are + # safe to hand out without defensive copies. + model_config = ConfigDict(frozen=True) + kind: UnitKind code: str | None = None @@ -181,6 +185,8 @@ class CompoundUnit(BaseModel): (CTR = clicks/impressions, QPS = queries/second, throughput = bytes/second). """ + model_config = ConfigDict(frozen=True) + numerator: AtomicUnit denominator: AtomicUnit @@ -312,81 +318,92 @@ def process_result_value(self, value, dialect): # Keyed by MetricUnit.name (not the enum member itself) so this module can # avoid importing node.py at module load. Callers translate to/from the enum -# at the call site. -_LEGACY_NAME_TO_STRUCTURED: dict[str, dict | None] = { +# at the call site. Values are frozen Unit instances — safe to hand out +# without defensive copies. +_LEGACY_NAME_TO_STRUCTURED: "dict[str, AtomicUnit | None]" = { "UNKNOWN": None, - "UNITLESS": {"kind": "unitless"}, - "PERCENTAGE": {"kind": "percentage"}, - "PROPORTION": {"kind": "proportion"}, - "DOLLAR": {"kind": "currency", "code": "USD"}, - "MILLISECOND": {"kind": "time", "code": "ms"}, - "SECOND": {"kind": "time", "code": "s"}, - "MINUTE": {"kind": "time", "code": "min"}, - "HOUR": {"kind": "time", "code": "h"}, - "DAY": {"kind": "time", "code": "d"}, - "WEEK": {"kind": "time", "code": "wk"}, - "MONTH": {"kind": "time", "code": "mo"}, - "YEAR": {"kind": "time", "code": "yr"}, - "BYTE": {"kind": "data_size", "code": "B"}, + "UNITLESS": AtomicUnit(kind=UnitKind.UNITLESS), + "PERCENTAGE": AtomicUnit(kind=UnitKind.PERCENTAGE), + "PROPORTION": AtomicUnit(kind=UnitKind.PROPORTION), + "DOLLAR": AtomicUnit(kind=UnitKind.CURRENCY, code="USD"), + "MILLISECOND": AtomicUnit(kind=UnitKind.TIME, code="ms"), + "SECOND": AtomicUnit(kind=UnitKind.TIME, code="s"), + "MINUTE": AtomicUnit(kind=UnitKind.TIME, code="min"), + "HOUR": AtomicUnit(kind=UnitKind.TIME, code="h"), + "DAY": AtomicUnit(kind=UnitKind.TIME, code="d"), + "WEEK": AtomicUnit(kind=UnitKind.TIME, code="wk"), + "MONTH": AtomicUnit(kind=UnitKind.TIME, code="mo"), + "YEAR": AtomicUnit(kind=UnitKind.TIME, code="yr"), + "BYTE": AtomicUnit(kind=UnitKind.DATA_SIZE, code="B"), # BIT has no entry in DATA_SIZE_CODES (which uses byte-based units like # B, KB, MB, ... and their binary cousins KiB, MiB). Bits are atypical # in BI / data-platform metrics; if a user appears, add "b" to - # DATA_SIZE_CODES and {"BIT": {"kind": "data_size", "code": "b"}} here. + # DATA_SIZE_CODES and {"BIT": AtomicUnit(kind=DATA_SIZE, code="b")} here. } def legacy_unit_to_structured( legacy: "MetricUnit | None", -) -> dict | None: +) -> "AtomicUnit | None": """ - Translate a legacy `MetricUnit` enum value into a structured `Unit` dict. + Translate a legacy `MetricUnit` enum value into a structured `Unit`. Returns None for `MetricUnit.UNKNOWN` and for `None`, since both mean - "no unit set." Returns `{kind: unitless}` for `MetricUnit.UNITLESS`, - preserving the distinction between "explicitly no unit" and "not set." + "no unit set." Returns `AtomicUnit(kind=unitless)` for + `MetricUnit.UNITLESS`, preserving the distinction between "explicitly + no unit" and "not set." """ if legacy is None: return None return _LEGACY_NAME_TO_STRUCTURED.get(legacy.name) -def structured_to_legacy_unit_name( +def structured_to_legacy_unit( unit: "AtomicUnit | CompoundUnit | dict | None", -) -> str | None: +) -> "MetricUnit | None": """ Translate a structured `Unit` (Pydantic model or already-dict) back to - the legacy `MetricUnit.name` when expressible. Returns None when the - structured value has no legacy equivalent (non-USD currencies, compound - units, data sizes, count with code, etc.) — callers should treat that - as "don't populate the legacy column." - - Returns the enum member name (e.g. "DOLLAR"), not a MetricUnit instance, - so this module stays import-free of node.py. Callers do - `MetricUnit[name]` at the call site. + the legacy `MetricUnit` enum value when expressible. Returns None when + the structured value has no legacy equivalent (non-USD currencies, + compound units, data sizes other than BYTE, count with code, etc.) — + callers should treat that as "don't populate the legacy column." + + `MetricUnit` is imported locally to keep `models/unit.py` free of a + module-top dependency on `models/node.py`, where `MetricUnit` lives. """ - unit = unit_to_dict(unit) + # Local import: `models/node.py` imports `unit_to_dict` from this module + # at module-load, so a top-level reverse import would cycle. + from datajunction_server.models.node import MetricUnit + if unit is None: return None - # Compound units have no legacy equivalent. - if "numerator" in unit: + # Normalize dict input to a Unit instance for typed attribute access. + # A malformed dict (e.g., an unknown time code that slipped through a + # direct DB write) has no legacy equivalent — treat it as unmapped. + if isinstance(unit, dict): + try: + unit = _get_unit_adapter().validate_python(unit) + except Exception: + return None + if isinstance(unit, CompoundUnit): return None - kind = unit.get("kind") - code = unit.get("code") - if kind == "unitless": - return "UNITLESS" - if kind == "percentage": - return "PERCENTAGE" - if kind == "proportion": - return "PROPORTION" - if kind == "currency": - return "DOLLAR" if code == "USD" else None - if kind in ("time", "data_size"): - # Reverse of _LEGACY_NAME_TO_STRUCTURED for kinds where multiple - # legacy enum members map by code. - target = {"kind": kind, "code": code} + assert isinstance(unit, AtomicUnit) # narrowed by isinstance above + adapter + kind = unit.kind + code = unit.code + if kind == UnitKind.UNITLESS: + return MetricUnit.UNITLESS + if kind == UnitKind.PERCENTAGE: + return MetricUnit.PERCENTAGE + if kind == UnitKind.PROPORTION: + return MetricUnit.PROPORTION + if kind == UnitKind.CURRENCY: + return MetricUnit.DOLLAR if code == "USD" else None + if kind in (UnitKind.TIME, UnitKind.DATA_SIZE): + # Reverse of _LEGACY_NAME_TO_STRUCTURED — direct equality on + # frozen Unit instances. for legacy_name, structured in _LEGACY_NAME_TO_STRUCTURED.items(): - if structured == target: - return legacy_name + if structured == unit: + return MetricUnit[legacy_name] return None # count — free-form code, no legacy equivalent. return None diff --git a/datajunction-server/tests/internal/deployment/orchestration_test.py b/datajunction-server/tests/internal/deployment/orchestration_test.py index dac80347b..b362182af 100644 --- a/datajunction-server/tests/internal/deployment/orchestration_test.py +++ b/datajunction-server/tests/internal/deployment/orchestration_test.py @@ -2704,10 +2704,17 @@ def _make_spec(unit: MetricUnit | dict | None = None) -> MetricSpec: @staticmethod def _resolve(spec: MetricSpec, column_unit: dict | None) -> dict | None: - return DeploymentOrchestrator._resolve_metric_unit( - MagicMock(), - spec, - column_unit, + """Resolve and convert to canonical dict so tests can assert against + plain dict literals. The resolver itself returns Unit; we coerce to + dict here for assertion readability.""" + from datajunction_server.models.unit import unit_to_dict + + return unit_to_dict( + DeploymentOrchestrator._resolve_metric_unit( + MagicMock(), + spec, + column_unit, + ), ) def test_legacy_only_resolves_to_translated_dict(self): diff --git a/datajunction-server/tests/internal/legacy_unit_compat_test.py b/datajunction-server/tests/internal/legacy_unit_compat_test.py index f63d2b8d6..545a212d7 100644 --- a/datajunction-server/tests/internal/legacy_unit_compat_test.py +++ b/datajunction-server/tests/internal/legacy_unit_compat_test.py @@ -27,8 +27,11 @@ from datajunction_server.models.deployment import MetricSpec from datajunction_server.models.node import MetricUnit from datajunction_server.models.unit import ( + AtomicUnit, + UnitKind, legacy_unit_to_structured, - structured_to_legacy_unit_name, + structured_to_legacy_unit, + unit_to_dict, ) @@ -83,7 +86,7 @@ def test_orchestrator_resolves_legacy_to_structured(self, spec: MetricSpec): spec, None, ) - assert resolved == {"kind": "currency", "code": "USD"} + assert resolved == AtomicUnit(kind=UnitKind.CURRENCY, code="USD") def test_orchestrator_derives_legacy_for_storage(self, spec: MetricSpec): """After resolution, the derive step returns MetricUnit.DOLLAR so the @@ -106,8 +109,8 @@ def test_translation_round_trips(self, spec: MetricSpec): """Legacy → structured → legacy is the identity for every value in the migration table — guarantees both halves of the bridge agree.""" structured = legacy_unit_to_structured(spec.unit_enum) - assert structured == {"kind": "currency", "code": "USD"} - legacy_name = structured_to_legacy_unit_name(structured) + assert structured == AtomicUnit(kind=UnitKind.CURRENCY, code="USD") + legacy_name = structured_to_legacy_unit(structured) assert legacy_name == "DOLLAR" assert MetricUnit[legacy_name] == spec.unit_enum @@ -171,7 +174,7 @@ def test_legacy_value_round_trips(self, legacy_str: str): structured = legacy_unit_to_structured(spec.unit_enum) assert structured is not None # Reverse translation returns the same enum member. - assert structured_to_legacy_unit_name(structured) == legacy_str.upper() + assert structured_to_legacy_unit(structured) == legacy_str.upper() def test_unknown_legacy_value_rejected(self): """Typos in legacy YAML still raise a clear error, not a silent pass.""" @@ -238,7 +241,8 @@ def test_resolve_metric_unit_for_spec_structured_only(self, col_unit): legacy_from_md=None, ) assert legacy is None - assert structured == col_unit + # Helper returns a Unit instance; compare via canonical dict. + assert unit_to_dict(structured) == col_unit def test_uppercase_legacy_input_accepted(self): # noqa: ANN201 """Legacy parser accepts case-insensitive input (existing behavior).""" @@ -369,7 +373,8 @@ def test_none_returns_none(self): def test_storage_paths_produce_identical_shape(self): """The legacy-translation path and the structured-input path must - write the same JSONB shape for a conceptually identical unit.""" + produce the same canonical value for a conceptually identical unit. + Comparing via unit_to_dict normalizes both Unit and dict inputs.""" from datajunction_server.models.unit import ( AtomicUnit, UnitKind, @@ -377,9 +382,12 @@ def test_storage_paths_produce_identical_shape(self): unit_to_dict, ) - legacy_shape = legacy_unit_to_structured(MetricUnit.UNITLESS) - structured_shape = unit_to_dict(AtomicUnit(kind=UnitKind.UNITLESS)) - assert legacy_shape == structured_shape + legacy = legacy_unit_to_structured(MetricUnit.UNITLESS) + structured = AtomicUnit(kind=UnitKind.UNITLESS) + # Frozen Unit equality directly: + assert legacy == structured + # And the JSONB-canonical dict shape matches too: + assert unit_to_dict(legacy) == unit_to_dict(structured) class TestBadDictErrorUx: diff --git a/datajunction-server/tests/models/unit_test.py b/datajunction-server/tests/models/unit_test.py index c55f281ba..58e4f1ee3 100644 --- a/datajunction-server/tests/models/unit_test.py +++ b/datajunction-server/tests/models/unit_test.py @@ -13,7 +13,8 @@ Unit, UnitKind, legacy_unit_to_structured, - structured_to_legacy_unit_name, + structured_to_legacy_unit, + unit_to_dict, ) _unit_adapter: TypeAdapter[Unit] = TypeAdapter(Unit) @@ -305,7 +306,7 @@ def test_round_trip_idempotent(self) -> None: class TestLegacyUnitTranslation: """ - Coverage for `legacy_unit_to_structured` and `structured_to_legacy_unit_name`. + Coverage for `legacy_unit_to_structured` and `structured_to_legacy_unit`. """ @pytest.mark.parametrize( @@ -331,14 +332,15 @@ def test_forward_translation_matches_table( legacy: MetricUnit, structured: dict | None, ) -> None: - assert legacy_unit_to_structured(legacy) == structured + # Returns a Unit instance (or None); compare via canonical dict. + assert unit_to_dict(legacy_unit_to_structured(legacy)) == structured def test_forward_translation_handles_none(self) -> None: assert legacy_unit_to_structured(None) is None def test_forward_translation_byte_maps_to_data_size(self) -> None: # BYTE has a clean structured equivalent under DATA_SIZE. - assert legacy_unit_to_structured(MetricUnit.BYTE) == { + assert unit_to_dict(legacy_unit_to_structured(MetricUnit.BYTE)) == { "kind": "data_size", "code": "B", } @@ -371,7 +373,7 @@ def test_reverse_translation_matches_table( structured: dict, legacy_name: str, ) -> None: - assert structured_to_legacy_unit_name(structured) == legacy_name + assert structured_to_legacy_unit(structured) == legacy_name @pytest.mark.parametrize( "structured", @@ -397,7 +399,7 @@ def test_reverse_translation_returns_none_for_inexpressible( self, structured: dict | None, ) -> None: - assert structured_to_legacy_unit_name(structured) is None + assert structured_to_legacy_unit(structured) is None @pytest.mark.parametrize( "legacy", @@ -423,4 +425,4 @@ def test_round_trip_legacy_to_structured_to_legacy( ) -> None: structured = legacy_unit_to_structured(legacy) assert structured is not None - assert structured_to_legacy_unit_name(structured) == legacy.name + assert structured_to_legacy_unit(structured) == legacy.name From a0d5ab8b94183799c834d8cfa45c3b11238bc0f9 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 25 May 2026 20:00:54 -0700 Subject: [PATCH 10/11] Fix --- .../datajunction_server/database/node.py | 20 ++++++----- .../internal/deployment/orchestrator.py | 4 ++- .../datajunction_server/internal/nodes.py | 1 - .../tests/internal/legacy_unit_compat_test.py | 8 ++--- datajunction-server/tests/models/unit_test.py | 34 +++++++++---------- 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/datajunction-server/datajunction_server/database/node.py b/datajunction-server/datajunction_server/database/node.py index f55cf3a85..2705d4f34 100644 --- a/datajunction-server/datajunction_server/database/node.py +++ b/datajunction-server/datajunction_server/database/node.py @@ -75,6 +75,8 @@ from datajunction_server.models.node_type import NodeType from datajunction_server.models.partition import PartitionType from datajunction_server.models.unit import ( + AtomicUnit, + CompoundUnit, _get_unit_adapter, structured_to_legacy_unit, ) @@ -180,9 +182,9 @@ def _build_search_score( def _resolve_metric_unit_for_spec( - col_unit: "Any | None", - legacy_from_md: "Any | None", -) -> "Tuple[Any | None, Any | None]": + col_unit: "AtomicUnit | CompoundUnit | dict | None", + legacy_from_md: "MetricUnit | None", +) -> "Tuple[MetricUnit | None, AtomicUnit | CompoundUnit | None]": """ Decide which of (legacy enum, structured Unit) to populate on a MetricSpec when round-tripping a metric back from the DB. @@ -197,12 +199,14 @@ def _resolve_metric_unit_for_spec( count-with-code, other data sizes) → populate structured, null the legacy so nothing tries to dual-emit. - Accepts either a `Unit` Pydantic instance (the typed shape returned by - `UnitTypeDecorator`) or a plain dict (in-memory test paths) and - normalizes via the same TypeAdapter the decorator uses on read. + Accepts `col_unit` as either a Unit Pydantic instance (the typed shape + returned by `UnitTypeDecorator`) or a plain dict (in-memory test paths + or untyped writes); normalizes via the adapter so the rest of the + function works on the Unit instance. - Returns (legacy_for_spec, structured_for_spec). The structured value is - a Unit instance; `MetricSpec.unit_structured` accepts it directly. + Returns `(legacy_for_spec, structured_for_spec)`: + - `legacy_for_spec` populates `MetricSpec.unit_enum`. + - `structured_for_spec` populates `MetricSpec.unit_structured`. """ if col_unit is None: return legacy_from_md, None diff --git a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py index c3c1ee0d1..26b150521 100644 --- a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py +++ b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py @@ -3524,7 +3524,7 @@ async def _create_node_revision( if result.spec.node_type == NodeType.METRIC: metric_spec = cast(MetricSpec, result.spec) - output_col: Column | None = None + output_col: Column | None if new_revision.columns: # pragma: no branch output_col = new_revision.columns[0] output_col.display_name = new_revision.display_name @@ -3536,6 +3536,8 @@ async def _create_node_revision( metric_spec, output_col.unit, ) + else: + output_col = None legacy_unit = self._derive_legacy_unit_for_storage( metric_spec, diff --git a/datajunction-server/datajunction_server/internal/nodes.py b/datajunction-server/datajunction_server/internal/nodes.py index 82fe3d16d..d7361eac3 100644 --- a/datajunction-server/datajunction_server/internal/nodes.py +++ b/datajunction-server/datajunction_server/internal/nodes.py @@ -100,7 +100,6 @@ ) from datajunction_server.models.node_type import NodeType from datajunction_server.models.query import QueryCreate -from datajunction_server.models.unit import legacy_unit_to_structured from datajunction_server.service_clients import QueryServiceClient from datajunction_server.sql.dag import ( get_downstream_nodes, diff --git a/datajunction-server/tests/internal/legacy_unit_compat_test.py b/datajunction-server/tests/internal/legacy_unit_compat_test.py index 545a212d7..1b488e0cd 100644 --- a/datajunction-server/tests/internal/legacy_unit_compat_test.py +++ b/datajunction-server/tests/internal/legacy_unit_compat_test.py @@ -110,9 +110,9 @@ def test_translation_round_trips(self, spec: MetricSpec): migration table — guarantees both halves of the bridge agree.""" structured = legacy_unit_to_structured(spec.unit_enum) assert structured == AtomicUnit(kind=UnitKind.CURRENCY, code="USD") - legacy_name = structured_to_legacy_unit(structured) - assert legacy_name == "DOLLAR" - assert MetricUnit[legacy_name] == spec.unit_enum + legacy = structured_to_legacy_unit(structured) + assert legacy == MetricUnit.DOLLAR + assert legacy == spec.unit_enum def test_yaml_export_emits_legacy_string(self, spec: MetricSpec): """node_spec_to_yaml emits `unit: dollar` (string), not a structured @@ -174,7 +174,7 @@ def test_legacy_value_round_trips(self, legacy_str: str): structured = legacy_unit_to_structured(spec.unit_enum) assert structured is not None # Reverse translation returns the same enum member. - assert structured_to_legacy_unit(structured) == legacy_str.upper() + assert structured_to_legacy_unit(structured) == MetricUnit[legacy_str.upper()] def test_unknown_legacy_value_rejected(self): """Typos in legacy YAML still raise a clear error, not a silent pass.""" diff --git a/datajunction-server/tests/models/unit_test.py b/datajunction-server/tests/models/unit_test.py index 58e4f1ee3..223dfaef2 100644 --- a/datajunction-server/tests/models/unit_test.py +++ b/datajunction-server/tests/models/unit_test.py @@ -351,29 +351,29 @@ def test_forward_translation_bit_unmapped(self) -> None: assert legacy_unit_to_structured(MetricUnit.BIT) is None @pytest.mark.parametrize( - ("structured", "legacy_name"), + ("structured", "expected"), [ - ({"kind": "unitless"}, "UNITLESS"), - ({"kind": "percentage"}, "PERCENTAGE"), - ({"kind": "proportion"}, "PROPORTION"), - ({"kind": "currency", "code": "USD"}, "DOLLAR"), - ({"kind": "time", "code": "ms"}, "MILLISECOND"), - ({"kind": "time", "code": "s"}, "SECOND"), - ({"kind": "time", "code": "min"}, "MINUTE"), - ({"kind": "time", "code": "h"}, "HOUR"), - ({"kind": "time", "code": "d"}, "DAY"), - ({"kind": "time", "code": "wk"}, "WEEK"), - ({"kind": "time", "code": "mo"}, "MONTH"), - ({"kind": "time", "code": "yr"}, "YEAR"), - ({"kind": "data_size", "code": "B"}, "BYTE"), + ({"kind": "unitless"}, MetricUnit.UNITLESS), + ({"kind": "percentage"}, MetricUnit.PERCENTAGE), + ({"kind": "proportion"}, MetricUnit.PROPORTION), + ({"kind": "currency", "code": "USD"}, MetricUnit.DOLLAR), + ({"kind": "time", "code": "ms"}, MetricUnit.MILLISECOND), + ({"kind": "time", "code": "s"}, MetricUnit.SECOND), + ({"kind": "time", "code": "min"}, MetricUnit.MINUTE), + ({"kind": "time", "code": "h"}, MetricUnit.HOUR), + ({"kind": "time", "code": "d"}, MetricUnit.DAY), + ({"kind": "time", "code": "wk"}, MetricUnit.WEEK), + ({"kind": "time", "code": "mo"}, MetricUnit.MONTH), + ({"kind": "time", "code": "yr"}, MetricUnit.YEAR), + ({"kind": "data_size", "code": "B"}, MetricUnit.BYTE), ], ) def test_reverse_translation_matches_table( self, structured: dict, - legacy_name: str, + expected: MetricUnit, ) -> None: - assert structured_to_legacy_unit(structured) == legacy_name + assert structured_to_legacy_unit(structured) == expected @pytest.mark.parametrize( "structured", @@ -425,4 +425,4 @@ def test_round_trip_legacy_to_structured_to_legacy( ) -> None: structured = legacy_unit_to_structured(legacy) assert structured is not None - assert structured_to_legacy_unit(structured) == legacy.name + assert structured_to_legacy_unit(structured) == legacy From 3ee9e5688ae6c7c701c28acd52bd4c6145f2fd6c Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 25 May 2026 20:20:32 -0700 Subject: [PATCH 11/11] Fix --- .../datajunction_server/database/node.py | 9 ++++++--- .../internal/deployment/orchestrator.py | 14 +++++--------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/datajunction-server/datajunction_server/database/node.py b/datajunction-server/datajunction_server/database/node.py index 2705d4f34..635cc01f3 100644 --- a/datajunction-server/datajunction_server/database/node.py +++ b/datajunction-server/datajunction_server/database/node.py @@ -210,10 +210,13 @@ def _resolve_metric_unit_for_spec( """ if col_unit is None: return legacy_from_md, None + normalized: "AtomicUnit | CompoundUnit" if isinstance(col_unit, dict): - col_unit = _get_unit_adapter().validate_python(col_unit) - if structured_to_legacy_unit(col_unit) is None: - return None, col_unit + normalized = _get_unit_adapter().validate_python(col_unit) + else: + normalized = col_unit + if structured_to_legacy_unit(normalized) is None: + return None, normalized return legacy_from_md, None diff --git a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py index 26b150521..2b6e94759 100644 --- a/datajunction-server/datajunction_server/internal/deployment/orchestrator.py +++ b/datajunction-server/datajunction_server/internal/deployment/orchestrator.py @@ -3524,20 +3524,16 @@ async def _create_node_revision( if result.spec.node_type == NodeType.METRIC: metric_spec = cast(MetricSpec, result.spec) - output_col: Column | None + output_col: Column | None = None if new_revision.columns: # pragma: no branch - output_col = new_revision.columns[0] - output_col.display_name = new_revision.display_name + col: Column = new_revision.columns[0] + col.display_name = new_revision.display_name # Bridge legacy metric_metadata.unit and structured # columns[0].unit so users on either input shape end up # with both fields populated where expressible. See # `_resolve_metric_unit` for the precedence rules. - output_col.unit = self._resolve_metric_unit( - metric_spec, - output_col.unit, - ) - else: - output_col = None + col.unit = self._resolve_metric_unit(metric_spec, col.unit) + output_col = col legacy_unit = self._derive_legacy_unit_for_storage( metric_spec,