From 7024647679298ae15a6986f348abe32eec5c9dcb Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Wed, 18 Mar 2026 14:43:55 +1000 Subject: [PATCH 01/41] feat: Implement %rowIndex environment variable for ViewDefinition forEach/forEachOrNull Add support for the %rowIndex environment variable as defined in the SQL on FHIR ViewDefinition spec. Within forEach and forEachOrNull iterations, %rowIndex resolves to the 0-based index of the current element. At the top level (no iteration), it evaluates to 0. Each nesting level maintains independent %rowIndex values. The implementation uses Spark's indexed transform(array, (elem, idx) ->) to track element positions during unnesting, threading the index through ProjectionContext into the FHIRPath evaluation as a supplied variable. Closes #2560 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../fhirpath/column/ColumnRepresentation.java | 16 + .../evaluation/SingleResourceEvaluator.java | 17 ++ .../projection/ProjectionContext.java | 45 ++- .../projection/UnnestingSelection.java | 17 +- .../test/resources/viewTests/rowindex.json | 284 ++++++++++++++++++ .../.openspec.yaml | 2 + .../design.md | 74 +++++ .../proposal.md | 29 ++ .../specs/row-index-variable/spec.md | 80 +++++ .../tasks.md | 29 ++ openspec/specs/row-index-variable/spec.md | 80 +++++ 11 files changed, 668 insertions(+), 5 deletions(-) create mode 100644 fhirpath/src/test/resources/viewTests/rowindex.json create mode 100644 openspec/changes/archive/2026-03-18-row-index-env-variable/.openspec.yaml create mode 100644 openspec/changes/archive/2026-03-18-row-index-env-variable/design.md create mode 100644 openspec/changes/archive/2026-03-18-row-index-env-variable/proposal.md create mode 100644 openspec/changes/archive/2026-03-18-row-index-env-variable/specs/row-index-variable/spec.md create mode 100644 openspec/changes/archive/2026-03-18-row-index-env-variable/tasks.md create mode 100644 openspec/specs/row-index-variable/spec.md diff --git 
a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java index 266981adcc..203e53b397 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java @@ -33,6 +33,7 @@ import jakarta.annotation.Nonnull; import jakarta.annotation.Nullable; import java.util.Optional; +import java.util.function.BiFunction; import java.util.function.BinaryOperator; import java.util.function.UnaryOperator; import java.util.stream.Stream; @@ -375,6 +376,21 @@ public ColumnRepresentation transform(final UnaryOperator lambda) { c -> functions.transform(c, lambda::apply), c -> when(c.isNotNull(), lambda.apply(c))); } + /** + * Transforms the current {@link ColumnRepresentation} using a lambda that receives both the + * element and its 0-based index within the array. + * + * @param lambda the function to apply to each element and its index + * @return a new {@link ColumnRepresentation} that is transformed + */ + @Nonnull + public ColumnRepresentation transformWithIndex( + @Nonnull final BiFunction lambda) { + return vectorize( + c -> functions.transform(c, lambda::apply), + c -> when(c.isNotNull(), lambda.apply(c, lit(0)))); + } + /** * Aggregates the current {@link ColumnRepresentation} using a zero value and an aggregator * function. 
diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleResourceEvaluator.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleResourceEvaluator.java index 0092a6dd8c..4968fcce47 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleResourceEvaluator.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleResourceEvaluator.java @@ -26,6 +26,7 @@ import au.csiro.pathling.fhirpath.variable.EnvironmentVariableResolver; import au.csiro.pathling.fhirpath.variable.VariableResolverChain; import jakarta.annotation.Nonnull; +import java.util.HashMap; import java.util.Map; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -91,6 +92,22 @@ public static SingleResourceEvaluator of( /** The FHIRPath evaluation configuration. */ @Nonnull private final FhirpathConfiguration configuration; + /** + * Creates a new SingleResourceEvaluator with an additional variable added to the variable map. + * + * @param name the variable name + * @param value the variable value as a Collection + * @return a new SingleResourceEvaluator with the additional variable + */ + @Nonnull + public SingleResourceEvaluator withVariable( + @Nonnull final String name, @Nonnull final Collection value) { + final Map newVariables = new HashMap<>(variables); + newVariables.put(name, value); + return new SingleResourceEvaluator( + resourceResolver, functionRegistry, newVariables, configuration); + } + /** * Evaluates a FHIRPath expression with the default input context. 
* diff --git a/fhirpath/src/main/java/au/csiro/pathling/projection/ProjectionContext.java b/fhirpath/src/main/java/au/csiro/pathling/projection/ProjectionContext.java index 3c30c2775c..98106ad999 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/projection/ProjectionContext.java +++ b/fhirpath/src/main/java/au/csiro/pathling/projection/ProjectionContext.java @@ -17,9 +17,12 @@ package au.csiro.pathling.projection; +import static org.apache.spark.sql.functions.lit; + import au.csiro.pathling.fhirpath.FhirPath; import au.csiro.pathling.fhirpath.collection.Collection; import au.csiro.pathling.fhirpath.collection.EmptyCollection; +import au.csiro.pathling.fhirpath.collection.IntegerCollection; import au.csiro.pathling.fhirpath.column.DefaultRepresentation; import au.csiro.pathling.fhirpath.evaluation.SingleResourceEvaluator; import jakarta.annotation.Nonnull; @@ -30,14 +33,29 @@ * Dependencies and logic relating to the traversal of FHIRPath expressions. * *

This context holds an evaluator for FHIRPath expressions and the current input context for - * expression evaluation. + * expression evaluation. It also carries the current row index for use within forEach/forEachOrNull + * iterations. * * @param evaluator an evaluator for FHIRPath expressions (produces Column expressions) * @param inputContext the initial context for evaluation + * @param rowIndex the current 0-based element index within a forEach/forEachOrNull iteration * @author Piotr Szul */ public record ProjectionContext( - @Nonnull SingleResourceEvaluator evaluator, @Nonnull Collection inputContext) { + @Nonnull SingleResourceEvaluator evaluator, + @Nonnull Collection inputContext, + @Nonnull Column rowIndex) { + + /** + * Creates a new ProjectionContext with the default row index of 0. + * + * @param evaluator an evaluator for FHIRPath expressions + * @param inputContext the initial context for evaluation + */ + public ProjectionContext( + @Nonnull final SingleResourceEvaluator evaluator, @Nonnull final Collection inputContext) { + this(evaluator, inputContext, lit(0)); + } /** * Creates a new ProjectionContext with a different input context. @@ -47,7 +65,18 @@ public record ProjectionContext( */ @Nonnull public ProjectionContext withInputContext(@Nonnull final Collection inputContext) { - return new ProjectionContext(evaluator, inputContext); + return new ProjectionContext(evaluator, inputContext, rowIndex); + } + + /** + * Creates a new ProjectionContext with a different row index. + * + * @param rowIndex the new row index column + * @return a new ProjectionContext with the specified row index + */ + @Nonnull + public ProjectionContext withRowIndex(@Nonnull final Column rowIndex) { + return new ProjectionContext(evaluator, inputContext, rowIndex); } /** @@ -94,15 +123,23 @@ public ProjectionContext withEmptyInput() { return withInputContext(EmptyCollection.getInstance()); } + /** The name of the row index environment variable. 
*/ + public static final String ROW_INDEX_VARIABLE = "rowIndex"; + /** * Evaluates the given FHIRPath path and returns the result as a collection. * + *

The evaluation includes the current row index as the {@code %rowIndex} environment variable. + * * @param path the path to evaluate * @return the result as a collection */ @Nonnull public Collection evalExpression(@Nonnull final FhirPath path) { - return evaluator.evaluate(path, inputContext); + return evaluator + .withVariable( + ROW_INDEX_VARIABLE, IntegerCollection.build(new DefaultRepresentation(rowIndex))) + .evaluate(path, inputContext); } /** diff --git a/fhirpath/src/main/java/au/csiro/pathling/projection/UnnestingSelection.java b/fhirpath/src/main/java/au/csiro/pathling/projection/UnnestingSelection.java index f82d8a12b2..a403c39cfa 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/projection/UnnestingSelection.java +++ b/fhirpath/src/main/java/au/csiro/pathling/projection/UnnestingSelection.java @@ -19,6 +19,7 @@ import au.csiro.pathling.fhirpath.FhirPath; import au.csiro.pathling.fhirpath.collection.Collection; +import au.csiro.pathling.fhirpath.column.DefaultRepresentation; import jakarta.annotation.Nonnull; import org.apache.spark.sql.Column; @@ -30,6 +31,9 @@ * clause to each element of that collection. The results are flattened into a single array. When * multiple projections are needed, wrap them in a {@link GroupingSelection} first. * + *

The {@code %rowIndex} environment variable is set to the 0-based index of each element during + * iteration. Each nesting level maintains its own independent {@code %rowIndex} value. + * * @param path the FHIRPath expression that identifies the collection to unnest * @param component the projection clause to apply to each element (use GroupingSelection for * multiple) @@ -48,7 +52,18 @@ public ProjectionResult evaluate(@Nonnull final ProjectionContext context) { // Evaluate the path to get the collection that will serve as the basis for unnesting. final Collection unnestingCollection = context.evalExpression(path); final ProjectionContext unnestingContext = context.withInputContext(unnestingCollection); - final Column columnResult = component.evaluateElementWise(unnestingContext); + + // Use the indexed transform to track the element index as %rowIndex. + final Column columnResult = + new DefaultRepresentation(unnestingContext.inputContext().getColumnValue()) + .transformWithIndex( + (element, index) -> + component + .evaluate(unnestingContext.withInputColumn(element).withRowIndex(index)) + .getResultColumn()) + .flatten() + .getValue(); + return component .evaluate(unnestingContext.asStubContext()) .withResultColumn(columnResult) diff --git a/fhirpath/src/test/resources/viewTests/rowindex.json b/fhirpath/src/test/resources/viewTests/rowindex.json new file mode 100644 index 0000000000..025f958268 --- /dev/null +++ b/fhirpath/src/test/resources/viewTests/rowindex.json @@ -0,0 +1,284 @@ +{ + "title": "%rowIndex tests", + "resources": [ + { + "resourceType": "Patient", + "id": "pt1", + "name": [ + { + "family": "Smith", + "given": ["John", "James"] + }, + { + "family": "Jones", + "given": ["Jane"] + } + ], + "contact": [ + { + "telecom": [ + { "system": "phone", "value": "555-0001" }, + { "system": "email", "value": "a@b.com" } + ] + }, + { + "telecom": [{ "system": "phone", "value": "555-0002" }] + } + ] + }, + { + "resourceType": "Patient", + "id": "pt2", + "name": 
[ + { + "family": "Brown", + "given": ["Bob"] + } + ] + }, + { + "resourceType": "Patient", + "id": "pt3", + "gender": "male" + } + ], + "tests": [ + { + "title": "spec example - capturing element position", + "view": { + "resource": "Patient", + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "forEach": "name", + "column": [ + { "name": "name_index", "path": "%rowIndex" }, + { "name": "family", "path": "family" } + ] + } + ] + }, + "expect": [ + { "id": "pt1", "name_index": 0, "family": "Smith" }, + { "id": "pt1", "name_index": 1, "family": "Jones" }, + { "id": "pt2", "name_index": 0, "family": "Brown" } + ] + }, + { + "title": "spec example - nested iteration with independent indices", + "view": { + "resource": "Patient", + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "forEach": "contact", + "column": [{ "name": "contact_index", "path": "%rowIndex" }], + "select": [ + { + "forEach": "telecom", + "column": [ + { "name": "telecom_index", "path": "%rowIndex" }, + { "name": "system", "path": "system" } + ] + } + ] + } + ] + }, + "expect": [ + { + "id": "pt1", + "contact_index": 0, + "telecom_index": 0, + "system": "phone" + }, + { + "id": "pt1", + "contact_index": 0, + "telecom_index": 1, + "system": "email" + }, + { + "id": "pt1", + "contact_index": 1, + "telecom_index": 0, + "system": "phone" + } + ] + }, + { + "title": "spec example - row index with unionAll", + "view": { + "resource": "Patient", + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "forEach": "name", + "column": [{ "name": "name_index", "path": "%rowIndex" }], + "select": [ + { + "unionAll": [ + { + "forEach": "given", + "column": [ + { "name": "given_index", "path": "%rowIndex" }, + { "name": "value", "path": "$this" } + ] + }, + { + "column": [ + { "name": "given_index", "path": "%rowIndex" }, + { "name": "value", "path": "family" } + ] + } + ] + } + ] + } + ] + }, + "expect": [ + { "id": "pt1", "name_index": 0, 
"given_index": 0, "value": "John" }, + { "id": "pt1", "name_index": 0, "given_index": 1, "value": "James" }, + { "id": "pt1", "name_index": 0, "given_index": 0, "value": "Smith" }, + { "id": "pt1", "name_index": 1, "given_index": 0, "value": "Jane" }, + { "id": "pt1", "name_index": 1, "given_index": 1, "value": "Jones" }, + { "id": "pt2", "name_index": 0, "given_index": 0, "value": "Bob" }, + { "id": "pt2", "name_index": 0, "given_index": 0, "value": "Brown" } + ] + }, + { + "title": "top-level %rowIndex defaults to 0", + "view": { + "resource": "Patient", + "select": [ + { + "column": [ + { "name": "id", "path": "id" }, + { "name": "row_index", "path": "%rowIndex" } + ] + } + ] + }, + "expect": [ + { "id": "pt1", "row_index": 0 }, + { "id": "pt2", "row_index": 0 }, + { "id": "pt3", "row_index": 0 } + ] + }, + { + "title": "forEach with %rowIndex", + "view": { + "resource": "Patient", + "select": [ + { + "column": [{ "name": "id", "path": "id" }], + "select": [ + { + "forEach": "name", + "column": [ + { "name": "name_index", "path": "%rowIndex" }, + { "name": "family", "path": "family" } + ] + } + ] + } + ] + }, + "expect": [ + { "id": "pt1", "name_index": 0, "family": "Smith" }, + { "id": "pt1", "name_index": 1, "family": "Jones" }, + { "id": "pt2", "name_index": 0, "family": "Brown" } + ] + }, + { + "title": "nested forEach with independent %rowIndex values", + "view": { + "resource": "Patient", + "select": [ + { + "column": [{ "name": "id", "path": "id" }], + "select": [ + { + "forEach": "name", + "column": [{ "name": "name_index", "path": "%rowIndex" }], + "select": [ + { + "forEach": "given", + "column": [ + { "name": "given_index", "path": "%rowIndex" }, + { "name": "given", "path": "$this" } + ] + } + ] + } + ] + } + ] + }, + "expect": [ + { "id": "pt1", "name_index": 0, "given_index": 0, "given": "John" }, + { "id": "pt1", "name_index": 0, "given_index": 1, "given": "James" }, + { "id": "pt1", "name_index": 1, "given_index": 0, "given": "Jane" }, + { "id": 
"pt2", "name_index": 0, "given_index": 0, "given": "Bob" } + ] + }, + { + "title": "forEachOrNull with %rowIndex", + "view": { + "resource": "Patient", + "select": [ + { + "column": [{ "name": "id", "path": "id" }], + "select": [ + { + "forEachOrNull": "name", + "column": [ + { "name": "family", "path": "family" }, + { "name": "name_index", "path": "%rowIndex" } + ] + } + ] + } + ] + }, + "expect": [ + { "id": "pt1", "family": "Smith", "name_index": 0 }, + { "id": "pt1", "family": "Jones", "name_index": 1 }, + { "id": "pt2", "family": "Brown", "name_index": 0 }, + { "id": "pt3", "family": null, "name_index": null } + ] + }, + { + "title": "%rowIndex used in arithmetic expression", + "view": { + "resource": "Patient", + "select": [ + { + "column": [{ "name": "id", "path": "id" }], + "select": [ + { + "forEach": "name", + "column": [ + { "name": "family", "path": "family" }, + { "name": "one_based_index", "path": "%rowIndex + 1" } + ] + } + ] + } + ] + }, + "expect": [ + { "id": "pt1", "family": "Smith", "one_based_index": 1 }, + { "id": "pt1", "family": "Jones", "one_based_index": 2 }, + { "id": "pt2", "family": "Brown", "one_based_index": 1 } + ] + } + ] +} diff --git a/openspec/changes/archive/2026-03-18-row-index-env-variable/.openspec.yaml b/openspec/changes/archive/2026-03-18-row-index-env-variable/.openspec.yaml new file mode 100644 index 0000000000..3c861dd5b6 --- /dev/null +++ b/openspec/changes/archive/2026-03-18-row-index-env-variable/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-18 diff --git a/openspec/changes/archive/2026-03-18-row-index-env-variable/design.md b/openspec/changes/archive/2026-03-18-row-index-env-variable/design.md new file mode 100644 index 0000000000..af2b6b12b4 --- /dev/null +++ b/openspec/changes/archive/2026-03-18-row-index-env-variable/design.md @@ -0,0 +1,74 @@ +## Context + +Pathling's ViewDefinition processing uses `UnnestingSelection` to implement `forEach`/`forEachOrNull`. 
This evaluates a FHIRPath expression to get an array-valued collection, then applies a projection to each element using Spark's higher-order `transform(array, element -> ...)` function via `ColumnRepresentation.transform()` and `ProjectionClause.evaluateElementWise()`. + +FHIRPath environment variables are resolved through a `VariableResolverChain` — a chain-of-responsibility pattern where resolvers (`BuiltInConstantResolver`, `ContextVariableResolver`, `SuppliedVariableResolver`, etc.) are queried in sequence. Variables resolve to `Collection` objects containing Spark `Column` expressions. User-supplied variables (e.g. ViewDefinition constants) are passed as `Map` through the `SingleResourceEvaluator`. + +The `%rowIndex` variable is different from existing environment variables because its value changes per-element during iteration, rather than being constant across the entire evaluation. + +## Goals / Non-Goals + +**Goals:** + +- Provide `%rowIndex` as a 0-based integer environment variable within `forEach` and `forEachOrNull` iterations. +- Default to `0` at the top level when no iteration is active. +- Support independent `%rowIndex` values at each nesting level. +- Make `%rowIndex` available to all FHIRPath expressions within the iteration scope. + +**Non-Goals:** + +- Supporting `%rowIndex` within `repeat` iterations (separate future work). +- Changes to the FHIRPath parser or grammar (environment variables are already parsed via the `%name` syntax). + +## Decisions + +### Use Spark's indexed transform for per-element index tracking + +**Decision:** Use `functions.transform(array, (element, index) -> ...)` — Spark's two-argument lambda variant of the `transform` higher-order function — to propagate the element index during unnesting. + +**Rationale:** The current `evaluateElementWise` method uses `ColumnRepresentation.transform()` which calls `functions.transform(array, element -> ...)`. 
Spark provides a built-in overload that passes both the element and its 0-based index to the lambda. This aligns directly with the `%rowIndex` semantics and avoids generating indices externally. + +**Alternatives considered:** + +- _`posexplode` + rejoin_: Would explode arrays with position indices then rejoin. Rejected because it requires dataset-level operations (adding/removing rows), which conflicts with the current column-expression-based architecture that works within Spark's higher-order functions. +- _`zip_with_index` preprocessing_: Would pre-wrap each array element with its index before transformation. Rejected as unnecessary complexity when Spark's `transform` already provides index natively. + +### Inject %rowIndex via the existing supplied variables mechanism + +**Decision:** Pass `%rowIndex` as a supplied variable in the `Map` that flows through `SingleResourceEvaluator`. The `UnnestingSelection` will create a new evaluator (or updated variable map) for each unnesting level that includes the current index column as the `rowIndex` variable. + +**Rationale:** The existing `SuppliedVariableResolver` and `VariableResolverChain` infrastructure already supports arbitrary named variables passed as `Collection` objects. Using this mechanism avoids creating a new resolver type and keeps `%rowIndex` consistent with how ViewDefinition constants are handled. + +**Alternative considered:** + +- _Dedicated `RowIndexResolver`_: A new `EnvironmentVariableResolver` implementation specific to `%rowIndex`. Rejected as over-engineering — the supplied variables mechanism handles this cleanly and requires no changes to the resolver chain infrastructure. + +### Thread index through ProjectionContext + +**Decision:** Extend `ProjectionContext` to carry the current `%rowIndex` column, and update `UnnestingSelection` to pass the index from Spark's `transform` lambda into the projection context. 
The context will merge this index into the evaluator's variable map when creating the per-element evaluation context. + +**Rationale:** `ProjectionContext` is the natural place to carry per-iteration state since it already carries the `inputContext` and `evaluator`. Adding the row index here keeps the change localised to the projection layer and avoids threading index state through unrelated components. + +**Implementation approach:** + +1. Add a `rowIndex` field (type `Column`, defaulting to `lit(0)`) to `ProjectionContext`. +2. Modify `evaluateElementWise` in `UnnestingSelection` (or introduce a new method) to use the indexed `transform` variant, capturing the index column. +3. When building the per-element `ProjectionContext`, include the index column as a `rowIndex` supplied variable via the evaluator's variable map. +4. Nesting is handled naturally: each `UnnestingSelection` creates a new context with its own `%rowIndex`, shadowing the outer value. + +### Use IntegerCollection for the %rowIndex type + +**Decision:** Represent `%rowIndex` as an `IntegerCollection` wrapping a Spark integer column. + +**Rationale:** `IntegerCollection` is the standard FHIRPath integer representation. It supports arithmetic (`%rowIndex + 1`) and comparisons (`%rowIndex = 0`) out of the box. The index from Spark's `transform` is already an integer column, so no type conversion is needed. + +## Risks / Trade-offs + +**[Risk] Evaluator immutability** — `SingleResourceEvaluator` stores variables as a `Map` set at construction time. Injecting a per-element `%rowIndex` requires either creating a new evaluator per unnesting level or making the variable map mutable. +→ **Mitigation:** Create a new `SingleResourceEvaluator` (or a lightweight wrapper) per `UnnestingSelection` that includes `rowIndex` in its variable map. This preserves immutability and isolates each nesting level. 
+ +**[Risk] Performance impact of creating per-level evaluators** — Creating new evaluator instances per unnesting level could add overhead. +→ **Mitigation:** The evaluators are lightweight objects (no dataset or SparkSession state). The per-level cost is negligible compared to the Spark query execution itself. Additionally, this already happens implicitly via `ProjectionContext.withInputContext()`. + +**[Risk] forEachOrNull with empty collection should produce a null %rowIndex** — When `forEachOrNull` produces a null row for an empty collection, `%rowIndex` must be null like every other nested column, not the top-level default of `0`. +→ **Mitigation:** The `orNull` mechanism in `ProjectionResult` handles the empty-collection case. No transform iteration occurs for an empty collection, so no index is produced and the null row carries a null `%rowIndex`. diff --git a/openspec/changes/archive/2026-03-18-row-index-env-variable/proposal.md b/openspec/changes/archive/2026-03-18-row-index-env-variable/proposal.md new file mode 100644 index 0000000000..cc9263a52e --- /dev/null +++ b/openspec/changes/archive/2026-03-18-row-index-env-variable/proposal.md @@ -0,0 +1,29 @@ +## Why + +The [SQL on FHIR ViewDefinition spec](https://build.fhir.org/ig/FHIR/sql-on-fhir-v2/StructureDefinition-ViewDefinition.html#rowindex) defines a `%rowIndex` environment variable that provides the 0-based index of the current element within the collection being iterated by `forEach` or `forEachOrNull`. Pathling's ViewDefinition support does not yet implement this variable, preventing users from preserving element ordering, disambiguating repeating elements, and constructing surrogate keys in flattened output. + +## What Changes + +- Add a new `%rowIndex` environment variable to the FHIRPath evaluation context. +- `%rowIndex` resolves to the 0-based index of the current element within the collection being iterated by `forEach` or `forEachOrNull`. +- At the top level (no iteration), `%rowIndex` evaluates to `0`. 
+- Each nesting level of `forEach`/`forEachOrNull` maintains an independent `%rowIndex` value. +- Support for `%rowIndex` within `repeat` is out of scope for this change. +- The variable is available to all FHIRPath expressions evaluated within the iteration scope, including nested `select` clauses. + +## Capabilities + +### New Capabilities + +- `row-index-variable`: Support for the `%rowIndex` environment variable within ViewDefinition `forEach` and `forEachOrNull` iterations, providing a 0-based element index. + +### Modified Capabilities + +_(none)_ + +## Impact + +- **fhirpath module**: Environment variable resolution chain needs a new resolver or mechanism to supply `%rowIndex` values that change per-element during iteration. +- **projection module**: `UnnestingSelection` (forEach/forEachOrNull) needs to track the current element index and inject it into the evaluation context. +- **views module**: `FhirViewExecutor` may need minor changes to initialise `%rowIndex` at the top level (value `0`). +- **Public API**: No breaking changes. `%rowIndex` is a new environment variable that was previously unsupported; existing ViewDefinitions and FHIRPath expressions are unaffected. diff --git a/openspec/changes/archive/2026-03-18-row-index-env-variable/specs/row-index-variable/spec.md b/openspec/changes/archive/2026-03-18-row-index-env-variable/specs/row-index-variable/spec.md new file mode 100644 index 0000000000..02bf2f460e --- /dev/null +++ b/openspec/changes/archive/2026-03-18-row-index-env-variable/specs/row-index-variable/spec.md @@ -0,0 +1,80 @@ +## ADDED Requirements + +### Requirement: %rowIndex resolves to element index within forEach + +When a ViewDefinition `select` clause uses `forEach`, the `%rowIndex` environment variable SHALL resolve to the 0-based index of the current element within the collection produced by the `forEach` expression. 
The index reflects the element's position in the collection as evaluated by the FHIRPath expression, starting at 0 for the first element. + +#### Scenario: Single forEach with multiple elements + +- **WHEN** a ViewDefinition has `forEach: "Patient.name"` and the Patient has 3 names +- **THEN** `%rowIndex` SHALL be `0` for the first name, `1` for the second, and `2` for the third + +#### Scenario: forEach with single element + +- **WHEN** a ViewDefinition has `forEach: "Patient.name"` and the Patient has 1 name +- **THEN** `%rowIndex` SHALL be `0` for that name + +#### Scenario: forEach with empty collection + +- **WHEN** a ViewDefinition has `forEach: "Patient.name"` and the Patient has no names +- **THEN** no rows are produced (forEach produces no output for empty collections), so `%rowIndex` is not evaluated + +### Requirement: %rowIndex resolves to element index within forEachOrNull + +When a ViewDefinition `select` clause uses `forEachOrNull`, the `%rowIndex` environment variable SHALL resolve to the 0-based index of the current element within the collection produced by the `forEachOrNull` expression, following the same indexing rules as `forEach`. + +#### Scenario: forEachOrNull with multiple elements + +- **WHEN** a ViewDefinition has `forEachOrNull: "Patient.name"` and the Patient has 2 names +- **THEN** `%rowIndex` SHALL be `0` for the first name and `1` for the second + +#### Scenario: forEachOrNull with empty collection + +- **WHEN** a ViewDefinition has `forEachOrNull: "Patient.name"` and the Patient has no names +- **THEN** one row is produced with null values for all nested columns including `%rowIndex` + +### Requirement: %rowIndex defaults to 0 at top level + +When no `forEach` or `forEachOrNull` iteration is active (i.e. the expression is evaluated at the top level of a ViewDefinition select), `%rowIndex` SHALL evaluate to `0`. 
+ +#### Scenario: Top-level column referencing %rowIndex + +- **WHEN** a ViewDefinition `select` has a column with expression `%rowIndex` and no `forEach` or `forEachOrNull` is active +- **THEN** the column value SHALL be `0` for every resource row + +### Requirement: Nested iterations maintain independent %rowIndex values + +Each nesting level of `forEach`/`forEachOrNull` SHALL maintain its own independent `%rowIndex`. An inner `forEach` resets `%rowIndex` to count within its own collection, and the outer `%rowIndex` is restored when the inner iteration completes. + +#### Scenario: Nested forEach iterations + +- **WHEN** a ViewDefinition has an outer `forEach: "Patient.name"` (Patient has 2 names) and an inner `forEach: "HumanName.given"` (first name has 2 givens, second name has 1 given) +- **THEN** for the first name: outer `%rowIndex` is `0`, inner `%rowIndex` is `0` and `1` for each given; for the second name: outer `%rowIndex` is `1`, inner `%rowIndex` is `0` for its single given + +#### Scenario: Inner forEach does not affect outer %rowIndex + +- **WHEN** a column expression references `%rowIndex` at the outer forEach level after an inner forEach has completed +- **THEN** the value SHALL reflect the outer iteration index, unaffected by the inner iteration + +### Requirement: %rowIndex is available in nested select expressions + +The `%rowIndex` variable SHALL be accessible from any FHIRPath expression evaluated within the scope of the current iteration, including columns within nested `select` clauses that do not themselves introduce a new `forEach`/`forEachOrNull`. 
+ +#### Scenario: Column in nested select without its own forEach + +- **WHEN** a `forEach` iterates over `Patient.name` and a nested `select` (without its own `forEach`) contains a column with expression `%rowIndex` +- **THEN** the column SHALL resolve to the index from the enclosing `forEach` + +### Requirement: %rowIndex is an integer type + +The `%rowIndex` variable SHALL resolve to an integer value compatible with FHIRPath integer type, allowing arithmetic operations and comparisons. + +#### Scenario: Arithmetic with %rowIndex + +- **WHEN** a column expression is `%rowIndex + 1` +- **THEN** the result SHALL be the 1-based index of the current element + +#### Scenario: Comparison with %rowIndex + +- **WHEN** a `where` clause filters with `%rowIndex = 0` +- **THEN** only the first element of the iterated collection SHALL be included diff --git a/openspec/changes/archive/2026-03-18-row-index-env-variable/tasks.md b/openspec/changes/archive/2026-03-18-row-index-env-variable/tasks.md new file mode 100644 index 0000000000..09892ac72b --- /dev/null +++ b/openspec/changes/archive/2026-03-18-row-index-env-variable/tasks.md @@ -0,0 +1,29 @@ +## 1. Extend ProjectionContext with row index + +- [x] 1.1 Add a `rowIndex` field (type `Column`) to `ProjectionContext`, defaulting to `lit(0)` +- [x] 1.2 Add a `withRowIndex(Column)` method to create a new context with a different row index +- [x] 1.3 Update `ProjectionContext` to inject `rowIndex` as a `%rowIndex` supplied variable into the evaluator's variable map when evaluating expressions + +## 2. 
Add indexed transform support + +- [x] 2.1 Add an indexed `transform` method to `ColumnRepresentation` that uses Spark's `transform(array, (element, index) -> ...)` variant, returning the transformed column and passing each element's index to the caller-supplied lambda +- [x] 2.2 Add an `evaluateElementWiseWithIndex` method (or modify the existing flow) in `UnnestingSelection` that uses the indexed transform and passes the index column into the projection context via `withRowIndex` + +## 3. Wire up UnnestingSelection + +- [x] 3.1 Modify `UnnestingSelection.evaluate()` to use the indexed transform, creating a per-element `ProjectionContext` that carries the current index as `%rowIndex` +- [x] 3.2 Ensure nested `UnnestingSelection` levels shadow the outer `%rowIndex` with their own index value + +## 4. Handle forEachOrNull empty collection case + +- [x] 4.1 Verify that when `forEachOrNull` produces a null row for an empty collection, `%rowIndex` resolves to null like the other nested columns (the `lit(0)` default from `ProjectionContext` does not apply to the null row) + +## 5. 
Tests + +- [x] 5.1 Write a ViewDefinition integration test: `forEach` with `%rowIndex` column producing correct 0-based indices +- [x] 5.2 Write a ViewDefinition integration test: `forEachOrNull` with non-empty collection producing correct indices +- [x] 5.3 Write a ViewDefinition integration test: `forEachOrNull` with empty collection producing null `%rowIndex` +- [x] 5.4 Write a ViewDefinition integration test: top-level `%rowIndex` (no forEach) resolves to `0` +- [x] 5.5 Write a ViewDefinition integration test: nested `forEach` with independent `%rowIndex` values at each level +- [x] 5.6 Write a ViewDefinition integration test: `%rowIndex` used in arithmetic expression (`%rowIndex + 1`) +- [x] 5.7 Verify existing ViewDefinition tests still pass (no regressions) diff --git a/openspec/specs/row-index-variable/spec.md b/openspec/specs/row-index-variable/spec.md new file mode 100644 index 0000000000..02bf2f460e --- /dev/null +++ b/openspec/specs/row-index-variable/spec.md @@ -0,0 +1,80 @@ +## ADDED Requirements + +### Requirement: %rowIndex resolves to element index within forEach + +When a ViewDefinition `select` clause uses `forEach`, the `%rowIndex` environment variable SHALL resolve to the 0-based index of the current element within the collection produced by the `forEach` expression. The index reflects the element's position in the collection as evaluated by the FHIRPath expression, starting at 0 for the first element. 
+ +#### Scenario: Single forEach with multiple elements + +- **WHEN** a ViewDefinition has `forEach: "Patient.name"` and the Patient has 3 names +- **THEN** `%rowIndex` SHALL be `0` for the first name, `1` for the second, and `2` for the third + +#### Scenario: forEach with single element + +- **WHEN** a ViewDefinition has `forEach: "Patient.name"` and the Patient has 1 name +- **THEN** `%rowIndex` SHALL be `0` for that name + +#### Scenario: forEach with empty collection + +- **WHEN** a ViewDefinition has `forEach: "Patient.name"` and the Patient has no names +- **THEN** no rows are produced (forEach produces no output for empty collections), so `%rowIndex` is not evaluated + +### Requirement: %rowIndex resolves to element index within forEachOrNull + +When a ViewDefinition `select` clause uses `forEachOrNull`, the `%rowIndex` environment variable SHALL resolve to the 0-based index of the current element within the collection produced by the `forEachOrNull` expression, following the same indexing rules as `forEach`. + +#### Scenario: forEachOrNull with multiple elements + +- **WHEN** a ViewDefinition has `forEachOrNull: "Patient.name"` and the Patient has 2 names +- **THEN** `%rowIndex` SHALL be `0` for the first name and `1` for the second + +#### Scenario: forEachOrNull with empty collection + +- **WHEN** a ViewDefinition has `forEachOrNull: "Patient.name"` and the Patient has no names +- **THEN** one row is produced with null values for all nested columns including `%rowIndex` + +### Requirement: %rowIndex defaults to 0 at top level + +When no `forEach` or `forEachOrNull` iteration is active (i.e. the expression is evaluated at the top level of a ViewDefinition select), `%rowIndex` SHALL evaluate to `0`. 
+ +#### Scenario: Top-level column referencing %rowIndex + +- **WHEN** a ViewDefinition `select` has a column with expression `%rowIndex` and no `forEach` or `forEachOrNull` is active +- **THEN** the column value SHALL be `0` for every resource row + +### Requirement: Nested iterations maintain independent %rowIndex values + +Each nesting level of `forEach`/`forEachOrNull` SHALL maintain its own independent `%rowIndex`. An inner `forEach` resets `%rowIndex` to count within its own collection, and the outer `%rowIndex` is restored when the inner iteration completes. + +#### Scenario: Nested forEach iterations + +- **WHEN** a ViewDefinition has an outer `forEach: "Patient.name"` (Patient has 2 names) and an inner `forEach: "HumanName.given"` (first name has 2 givens, second name has 1 given) +- **THEN** for the first name: outer `%rowIndex` is `0`, inner `%rowIndex` is `0` and `1` for each given; for the second name: outer `%rowIndex` is `1`, inner `%rowIndex` is `0` for its single given + +#### Scenario: Inner forEach does not affect outer %rowIndex + +- **WHEN** a column expression references `%rowIndex` at the outer forEach level after an inner forEach has completed +- **THEN** the value SHALL reflect the outer iteration index, unaffected by the inner iteration + +### Requirement: %rowIndex is available in nested select expressions + +The `%rowIndex` variable SHALL be accessible from any FHIRPath expression evaluated within the scope of the current iteration, including columns within nested `select` clauses that do not themselves introduce a new `forEach`/`forEachOrNull`. 
+ +#### Scenario: Column in nested select without its own forEach + +- **WHEN** a `forEach` iterates over `Patient.name` and a nested `select` (without its own `forEach`) contains a column with expression `%rowIndex` +- **THEN** the column SHALL resolve to the index from the enclosing `forEach` + +### Requirement: %rowIndex is an integer type + +The `%rowIndex` variable SHALL resolve to an integer value compatible with FHIRPath integer type, allowing arithmetic operations and comparisons. + +#### Scenario: Arithmetic with %rowIndex + +- **WHEN** a column expression is `%rowIndex + 1` +- **THEN** the result SHALL be the 1-based index of the current element + +#### Scenario: Comparison with %rowIndex + +- **WHEN** a `where` clause filters with `%rowIndex = 0` +- **THEN** only the first element of the iterated collection SHALL be included From 9a4d45f3a4559103df5e4da9cf26cd08b124650d Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Wed, 1 Apr 2026 14:25:12 +1000 Subject: [PATCH 02/41] feat: Implement %rowIndex environment variable for ViewDefinition repeat clause Adds support for %rowIndex within the repeat directive, producing a global 0-based traversal-order index across all depth levels of the flattened recursive tree. Each repeat directive scopes its own counter independently from enclosing or nested forEach/forEachOrNull/repeat directives. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../pathling/encoders/RowIndexCounter.java | 63 ++++ .../pathling/encoders/ValueFunctions.java | 31 ++ .../csiro/pathling/encoders/Expressions.scala | 82 +++++ .../encoders/ExpressionsBothModesTest.java | 276 ++++++++++++++ .../pathling/projection/RepeatSelection.java | 15 +- .../au/csiro/pathling/views/FhirViewTest.java | 14 +- .../test/resources/viewTests/rowindex.json | 346 ++++++++++++++++++ .../.openspec.yaml | 2 + .../2026-04-01-repeat-row-index/design.md | 61 +++ .../2026-04-01-repeat-row-index/proposal.md | 27 ++ .../specs/row-index-variable/spec.md | 83 +++++ .../2026-04-01-repeat-row-index/tasks.md | 18 + openspec/specs/row-index-variable/spec.md | 68 +++- 13 files changed, 1081 insertions(+), 5 deletions(-) create mode 100644 encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java create mode 100644 openspec/changes/archive/2026-04-01-repeat-row-index/.openspec.yaml create mode 100644 openspec/changes/archive/2026-04-01-repeat-row-index/design.md create mode 100644 openspec/changes/archive/2026-04-01-repeat-row-index/proposal.md create mode 100644 openspec/changes/archive/2026-04-01-repeat-row-index/specs/row-index-variable/spec.md create mode 100644 openspec/changes/archive/2026-04-01-repeat-row-index/tasks.md diff --git a/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java new file mode 100644 index 0000000000..09d2f587da --- /dev/null +++ b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java @@ -0,0 +1,63 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package au.csiro.pathling.encoders; + +import java.io.Serializable; + +/** + * A thread-safe counter for tracking element positions within recursive tree traversals. Each + * thread gets its own independent counter via {@link ThreadLocal}, ensuring that Spark tasks + * running in parallel on different partitions do not interfere with each other. + * + *

This class is {@link Serializable} so that it survives Spark plan serialization to executors. + * The {@link ThreadLocal} state is transient and lazily re-initialized after deserialization. + * + * @author Piotr Szul + */ +public class RowIndexCounter implements Serializable { + + private static final long serialVersionUID = 1L; + + @SuppressWarnings("TransientFieldNotInitialized") + private transient ThreadLocal counter; + + private ThreadLocal getCounter() { + if (counter == null) { + counter = ThreadLocal.withInitial(() -> new int[] {0}); + } + return counter; + } + + /** + * Returns the current counter value and increments it. The first call after a {@link #reset()} + * returns 0. + * + * @return the current counter value before incrementing + */ + public int getAndIncrement() { + return getCounter().get()[0]++; + } + + /** + * Resets the counter to zero for the current thread. This should be called before evaluating each + * top-level row to ensure the index sequence starts fresh. + */ + public void reset() { + getCounter().get()[0] = 0; + } +} diff --git a/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java b/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java index 619fc75183..f98d63b18f 100644 --- a/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java +++ b/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java @@ -429,4 +429,35 @@ public static Column variantUnwrap( public static Column pruneAnnotations(@Nonnull final Column col) { return column(new PruneSyntheticFields(expression(col))); } + + /** + * Creates a new row counter backed by a shared {@link RowIndexCounter}. Each evaluation of the + * returned column increments the counter and returns its previous value, producing a 0-based + * sequence: 0, 1, 2, ... + * + *

The counter must be reset before each top-level evaluation (e.g. per resource row) using + * {@link #resetCounter(Column, RowIndexCounter)}. + * + * @param state the shared counter instance + * @return a Column that produces the next integer on each evaluation + */ + @Nonnull + public static Column rowCounter(@Nonnull final RowIndexCounter state) { + return column(new RowCounter(state)); + } + + /** + * Wraps a column expression so that the shared row counter is reset to zero before evaluating the + * expression. This should be applied at the outermost level of a repeat projection to ensure the + * counter starts fresh for each resource row. + * + * @param child the expression to evaluate after resetting + * @param state the shared counter instance to reset + * @return a Column that resets the counter and then evaluates the child + */ + @Nonnull + public static Column resetCounter( + @Nonnull final Column child, @Nonnull final RowIndexCounter state) { + return column(new ResetCounter(expression(child), state)); + } } diff --git a/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala b/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala index d38a9bc3d2..74611400f1 100644 --- a/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala +++ b/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala @@ -947,4 +947,86 @@ case class UnresolvedVariantUnwrap(inner: Expression, schemaRef: Expression, override def toString: String = s"VariantUnwrap($inner)" } +/** + * A stateful, non-deterministic expression that returns a monotonically increasing integer each + * time it is evaluated. The counter is shared via a [[RowIndexCounter]] instance which uses + * [[ThreadLocal]] storage to ensure thread safety across parallel Spark tasks. + * + * This is designed for use inside array-producing expressions (e.g. `transform`, `Concat`) where + * the evaluation order is deterministic and single-threaded within a row. 
The counter must be reset + * to zero before each top-level evaluation via [[ResetCounter]]. + * + * Modeled after Spark's `MonotonicallyIncreasingID`. + * + * @param state the shared thread-safe counter + */ +case class RowCounter(state: RowIndexCounter) + extends LeafExpression with Nondeterministic { + + override def stateful: Boolean = true + + override def nullable: Boolean = false + + override def dataType: DataType = IntegerType + + override protected def initializeInternal(partitionIndex: Int): Unit = { + // No-op: reset is handled by ResetCounter at the per-row level, not per-partition. + } + + override protected def evalInternal(input: InternalRow): Int = { + state.getAndIncrement() + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val counterRef = ctx.addReferenceObj("rowCounter", state, classOf[RowIndexCounter].getName) + ev.copy(code = code""" + final ${CodeGenerator.javaType(dataType)} ${ev.value} = $counterRef.getAndIncrement();""", + isNull = FalseLiteral) + } + + override def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = { + RowCounter(state) + } +} + +/** + * A unary expression that resets a [[RowCounter]]'s shared state to zero before evaluating its + * child expression. This ensures the counter starts fresh for each row when used inside + * per-row array transformations. + * + * @param child the expression to evaluate after resetting + * @param state the shared thread-safe counter to reset + */ +case class ResetCounter(child: Expression, state: RowIndexCounter) + extends UnaryExpression with NonSQLExpression { + + override def dataType: DataType = child.dataType + + override def nullable: Boolean = child.nullable + + override protected def nullSafeEval(input: Any): Any = { + // This should not be called — we override eval directly. 
+ throw new UnsupportedOperationException(ExpressionConstants.CODEGEN_ONLY_MSG) + } + + override def eval(input: InternalRow): Any = { + state.reset() + child.eval(input) + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val counterRef = ctx.addReferenceObj("rowCounter", state, classOf[RowIndexCounter].getName) + val childEval = child.genCode(ctx) + ev.copy(code = code""" + $counterRef.reset(); + ${childEval.code} + final boolean ${ev.isNull} = ${childEval.isNull}; + final ${CodeGenerator.javaType(dataType)} ${ev.value} = ${childEval.value};""") + } + + override protected def withNewChildInternal(newChild: Expression): Expression = { + ResetCounter(newChild, state) + } +} + // ColumnFunctions has been moved to a Java class to access package-private Spark methods diff --git a/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java b/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java index 5f011236d9..3594ec18aa 100644 --- a/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java +++ b/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java @@ -45,6 +45,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; import scala.collection.Seq; +import scala.jdk.javaapi.CollectionConverters; /** * Abstract base class for expression tests that need to run in both codegen and interpreted modes. @@ -292,4 +293,279 @@ void testStructProductInlineWithUnsafeRowData() { assertEquals(expected.get(i), actual.get(i), "Row " + i + " mismatch"); } } + + /** + * Tests that RowCounter produces sequential 0-based indices within a simple array transform, and + * that ResetCounter resets the sequence for each row. 
+ */ + @Test + void testRowCounterWithSimpleTransform() { + final RowIndexCounter counter = new RowIndexCounter(); + final Column counterCol = ValueFunctions.rowCounter(counter); + + // Create a dataset with two rows, each containing an array of different lengths. + final Dataset ds = + spark + .createDataFrame( + List.of( + RowFactory.create(1, List.of("a", "b", "c")), + RowFactory.create(2, List.of("d", "e"))), + DataTypes.createStructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "items", + DataTypes.createArrayType(DataTypes.StringType), + false, + Metadata.empty()) + })) + .repartition(1); + + // Use transform to stamp each element with the counter, then wrap with resetCounter. + final Column transformed = + functions.transform( + ds.col("items"), elem -> functions.struct(elem.alias("val"), counterCol.alias("idx"))); + final Column withReset = ValueFunctions.resetCounter(transformed, counter); + + final Dataset result = ds.withColumn("indexed", withReset); + final List rows = result.collectAsList(); + + assertEquals(2, rows.size()); + + // Row 1: 3 elements → indices 0, 1, 2. + final Seq row1Seq = rows.get(0).getAs("indexed"); + final List row1Items = CollectionConverters.asJava(row1Seq); + assertEquals(3, row1Items.size()); + assertEquals(0, (int) ((Row) row1Items.get(0)).getAs("idx")); + assertEquals(1, (int) ((Row) row1Items.get(1)).getAs("idx")); + assertEquals(2, (int) ((Row) row1Items.get(2)).getAs("idx")); + + // Row 2: 2 elements → indices reset to 0, 1. 
+ final Seq row2Seq = rows.get(1).getAs("indexed"); + final List row2Items = CollectionConverters.asJava(row2Seq); + assertEquals(2, row2Items.size()); + assertEquals(0, (int) ((Row) row2Items.get(0)).getAs("idx")); + assertEquals(1, (int) ((Row) row2Items.get(1)).getAs("idx")); + } + + /** + * Tests that RowCounter produces a continuous global sequence when used inside a transformTree + * with a single traversal, producing sequential indices across all depth levels. + */ + @Test + void testRowCounterWithTransformTree() { + final Metadata metadata = Metadata.empty(); + + // Build a 3-level nested structure: root has 2 items, first item has 1 child. + final StructType leafType = + DataTypes.createStructType( + new StructField[] {new StructField("linkId", DataTypes.StringType, true, metadata)}); + + final StructType midType = + DataTypes.createStructType( + new StructField[] { + new StructField("linkId", DataTypes.StringType, true, metadata), + new StructField("item", DataTypes.createArrayType(leafType), true, metadata) + }); + + final StructType rootItemType = + DataTypes.createStructType( + new StructField[] { + new StructField("linkId", DataTypes.StringType, true, metadata), + new StructField("item", DataTypes.createArrayType(midType), true, metadata) + }); + + final StructType rootSchema = + DataTypes.createStructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, metadata), + new StructField("items", DataTypes.createArrayType(rootItemType), true, metadata) + }); + + // Tree structure: + // items[0] (linkId: "1") + // └── item[0] (linkId: "1.1") + // └── item[0] (linkId: "1.1.1") + // items[1] (linkId: "2") + final Row leaf = RowFactory.create("1.1.1"); + final Row mid = RowFactory.create("1.1", List.of(leaf)); + final Row root0 = RowFactory.create("1", List.of(mid)); + final Row root1 = RowFactory.create("2", List.of()); + + final Dataset ds = + spark + .createDataFrame( + List.of( + RowFactory.create(1, List.of(root0, root1)), + 
RowFactory.create(2, List.of(root1))), + rootSchema) + .repartition(1); + + final RowIndexCounter counter = new RowIndexCounter(); + final Column counterCol = ValueFunctions.rowCounter(counter); + + // Extractor: produce Array[Struct{linkId, idx}] from each array node. + final Column treeResult = + ValueFunctions.transformTree( + ds.col("items"), + c -> + functions.transform( + c, + elem -> + functions.struct( + elem.getField("linkId").alias("linkId"), counterCol.alias("idx"))), + List.of(c -> ValueFunctions.unnest(c.getField("item"))), + 2); + + final Column withReset = ValueFunctions.resetCounter(treeResult, counter); + final Dataset result = ds.withColumn("collected", withReset); + final List rows = result.collectAsList(); + + assertEquals(2, rows.size()); + + // Row 1: transformTree produces breadth-first-like order: + // Concat(extractor(root_items), transformTree(root_items.item)) + // = Concat(["1","2"], Concat(["1.1"], ["1.1.1"])) + // = ["1", "2", "1.1", "1.1.1"] + final Seq row1Seq = rows.get(0).getAs("collected"); + final List row1 = CollectionConverters.asJava(row1Seq); + assertEquals(4, row1.size()); + assertEquals("1", ((Row) row1.get(0)).getAs("linkId")); + assertEquals(0, (int) ((Row) row1.get(0)).getAs("idx")); + assertEquals("2", ((Row) row1.get(1)).getAs("linkId")); + assertEquals(1, (int) ((Row) row1.get(1)).getAs("idx")); + assertEquals("1.1", ((Row) row1.get(2)).getAs("linkId")); + assertEquals(2, (int) ((Row) row1.get(2)).getAs("idx")); + assertEquals("1.1.1", ((Row) row1.get(3)).getAs("linkId")); + assertEquals(3, (int) ((Row) row1.get(3)).getAs("idx")); + + // Row 2: tree has 1 node → "2"(0) — counter resets. 
+ final Seq row2Seq = rows.get(1).getAs("collected"); + final List row2 = CollectionConverters.asJava(row2Seq); + assertEquals(1, row2.size()); + assertEquals("2", ((Row) row2.get(0)).getAs("linkId")); + assertEquals(0, (int) ((Row) row2.get(0)).getAs("idx")); + } + + /** + * Tests that RowCounter works with multiple traversal paths in transformTree, producing a + * continuous global index across all branches and depths. + */ + @Test + void testRowCounterWithMultipleTraversals() { + final Metadata metadata = Metadata.empty(); + + // Build a structure with two traversal paths: "item" and self-reference. + final StructType level2Type = + DataTypes.createStructType( + new StructField[] {new StructField("linkId", DataTypes.StringType, true, metadata)}); + + final StructType level1Type = + DataTypes.createStructType( + new StructField[] { + new StructField("linkId", DataTypes.StringType, true, metadata), + new StructField("item", DataTypes.createArrayType(level2Type), true, metadata) + }); + + final StructType level0Type = + DataTypes.createStructType( + new StructField[] { + new StructField("linkId", DataTypes.StringType, true, metadata), + new StructField("item", DataTypes.createArrayType(level1Type), true, metadata) + }); + + final StructType rootSchema = + DataTypes.createStructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, metadata), + new StructField("items", DataTypes.createArrayType(level0Type), true, metadata) + }); + + // items[0] (linkId: "1") → item[0] (linkId: "2") → item[0] (linkId: "3"). 
+ final Row level2 = RowFactory.create("3"); + final Row level1 = RowFactory.create("2", List.of(level2)); + final Row level0 = RowFactory.create("1", List.of(level1)); + + final Dataset ds = + spark + .createDataFrame(List.of(RowFactory.create(1, List.of(level0))), rootSchema) + .repartition(1); + + final RowIndexCounter counter = new RowIndexCounter(); + final Column counterCol = ValueFunctions.rowCounter(counter); + + // Use two traversals: item navigation and self-reference (like the existing test). + final Column treeResult = + ValueFunctions.transformTree( + ds.col("items"), + c -> + functions.transform( + c, + elem -> + functions.struct( + elem.getField("linkId").alias("linkId"), counterCol.alias("idx"))), + List.of(c -> ValueFunctions.unnest(c.getField("item")), c -> c), + 1); + + final Column withReset = ValueFunctions.resetCounter(treeResult, counter); + final Dataset result = ds.withColumn("collected", withReset); + final List rows = result.collectAsList(); + + assertEquals(1, rows.size()); + + // The existing test (without counter) produces linkIds: [1, 2, 3, 3, 2, 3, 1, 2, 3]. + // Each element should have a sequential global index. + final Seq collectedSeq = rows.get(0).getAs("collected"); + final List collected = CollectionConverters.asJava(collectedSeq); + assertEquals(9, collected.size()); + + // Verify sequential indices 0..8. + for (int i = 0; i < 9; i++) { + assertEquals( + i, (int) ((Row) collected.get(i)).getAs("idx"), "Index mismatch at position " + i); + } + } + + /** + * Tests that RowCounter composes with arithmetic expressions, validating that %rowIndex + 1 style + * usage works correctly. 
+ */ + @Test + void testRowCounterInArithmeticExpression() { + final RowIndexCounter counter = new RowIndexCounter(); + final Column counterCol = ValueFunctions.rowCounter(counter); + + final Dataset ds = + spark + .createDataFrame( + List.of(RowFactory.create(1, List.of("a", "b", "c"))), + DataTypes.createStructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "items", + DataTypes.createArrayType(DataTypes.StringType), + false, + Metadata.empty()) + })) + .repartition(1); + + // Use counter in an arithmetic expression: counter + 1 (1-based index). + final Column transformed = + functions.transform( + ds.col("items"), + elem -> functions.struct(elem.alias("val"), counterCol.plus(1).alias("one_based_idx"))); + final Column withReset = ValueFunctions.resetCounter(transformed, counter); + + final Dataset result = ds.withColumn("indexed", withReset); + final List rows = result.collectAsList(); + + assertEquals(1, rows.size()); + final Seq itemsSeq = rows.get(0).getAs("indexed"); + final List items = CollectionConverters.asJava(itemsSeq); + assertEquals(3, items.size()); + assertEquals(1, (int) ((Row) items.get(0)).getAs("one_based_idx")); + assertEquals(2, (int) ((Row) items.get(1)).getAs("one_based_idx")); + assertEquals(3, (int) ((Row) items.get(2)).getAs("one_based_idx")); + } } diff --git a/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java b/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java index 2d0b1879a9..4de03ee833 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java +++ b/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java @@ -19,6 +19,7 @@ import static org.apache.spark.sql.functions.concat; +import au.csiro.pathling.encoders.RowIndexCounter; import au.csiro.pathling.encoders.ValueFunctions; import au.csiro.pathling.fhirpath.FhirPath; import au.csiro.pathling.fhirpath.collection.Collection; 
@@ -55,6 +56,12 @@ public record RepeatSelection( @Override public ProjectionResult evaluate(@Nonnull final ProjectionContext context) { + // Create a shared counter for the %rowIndex environment variable. Each element extracted by the + // tree traversal increments this counter, producing a global 0-based index across all depth + // levels and traversal branches. + final RowIndexCounter rowIndexCounter = new RowIndexCounter(); + final Column rowIndexCol = ValueFunctions.rowCounter(rowIndexCounter); + // Evaluate each path to get collections, retaining them for type inspection. final List pathCollections = paths.stream().map(context::evalExpression).toList(); @@ -66,11 +73,13 @@ public ProjectionResult evaluate(@Nonnull final ProjectionContext context) { .anyMatch( c -> c.getFhirType().map(t -> !FHIRDefinedType.EXTENSION.equals(t)).orElse(true)); - // Create the list of non-empty starting contexts from the evaluated path collections. + // Create the list of non-empty starting contexts from the evaluated path collections. The row + // index counter is injected so that %rowIndex resolves to the global element position. final List startingNodes = pathCollections.stream() .filter(Collection::isNotEmpty) .map(context::withInputContext) + .map(ctx -> ctx.withRowIndex(rowIndexCol)) .toList(); // Map starting nodes to transformTree expressions and concatenate the results. @@ -88,9 +97,11 @@ public ProjectionResult evaluate(@Nonnull final ProjectionContext context) { errorOnDepthExhaustion)) .toArray(Column[]::new); + // Wrap the concatenated result with a counter reset so that the %rowIndex sequence restarts at + // zero for each resource row. final Column result = nodeResults.length > 0 - ? concat(nodeResults) + ? 
ValueFunctions.resetCounter(concat(nodeResults), rowIndexCounter) : DefaultRepresentation.empty() .plural() .transform(component.asColumnOperator(context.withEmptyInput())) diff --git a/fhirpath/src/test/java/au/csiro/pathling/views/FhirViewTest.java b/fhirpath/src/test/java/au/csiro/pathling/views/FhirViewTest.java index 006c68db5c..83dfed03cd 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/views/FhirViewTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/views/FhirViewTest.java @@ -33,6 +33,7 @@ import static org.junit.jupiter.api.Assumptions.assumeFalse; import static scala.jdk.javaapi.CollectionConverters.asScala; +import au.csiro.pathling.config.QueryConfiguration; import au.csiro.pathling.encoders.FhirEncoders; import au.csiro.pathling.encoders.datatypes.DecimalCustomCoder; import au.csiro.pathling.io.source.DataSource; @@ -515,9 +516,18 @@ void test(@Nonnull final TestParameters parameters) { throw e; } - // Create a new executor and build the query. + // Create a new executor with a reduced traversal depth (4 instead of the default + // 10) to keep Spark plan complexity manageable. Nested repeat-in-repeat tests + // compound the plan depth, and the default of 10 exceeds Spark's analyzer iteration + // limit (100). A depth of 4 is sufficient for the current test data, which nests + // extensions at most 4 levels deep. If future tests require deeper traversal, + // increase + // this value but be aware of the Spark analyzer limit. 
final FhirViewExecutor executor = - new FhirViewExecutor(fhirContext, parameters.sourceData()); + new FhirViewExecutor( + fhirContext, + parameters.sourceData(), + QueryConfiguration.builder().maxUnboundTraversalDepth(4).build()); return executor.buildQuery(view); }); } diff --git a/fhirpath/src/test/resources/viewTests/rowindex.json b/fhirpath/src/test/resources/viewTests/rowindex.json index 025f958268..4758c187e5 100644 --- a/fhirpath/src/test/resources/viewTests/rowindex.json +++ b/fhirpath/src/test/resources/viewTests/rowindex.json @@ -40,6 +40,79 @@ "resourceType": "Patient", "id": "pt3", "gender": "male" + }, + { + "resourceType": "Patient", + "id": "pt4", + "extension": [ + { + "url": "urn:ext1", + "extension": [ + { + "url": "urn:ext2", + "extension": [ + { + "url": "urn:ext3", + "extension": [ + { + "url": "urn:ext4", + "valueString": "leaf" + } + ] + } + ] + } + ] + } + ] + }, + { + "resourceType": "Patient", + "id": "pt5", + "extension": [ + { + "url": "urn:branch-root", + "extension": [ + { + "url": "urn:branch-child1", + "extension": [ + { + "url": "urn:branch-grandchild", + "valueString": "deep" + } + ] + }, + { + "url": "urn:branch-child2", + "valueString": "shallow" + } + ] + } + ] + }, + { + "resourceType": "Patient", + "id": "pt6", + "extension": [ + { + "url": "urn:fe1", + "extension": [ + { + "url": "urn:fe1.1", + "valueString": "v1" + } + ] + }, + { + "url": "urn:fe2", + "extension": [ + { + "url": "urn:fe2.1", + "valueString": "v2" + } + ] + } + ] } ], "tests": [ @@ -157,6 +230,7 @@ "title": "top-level %rowIndex defaults to 0", "view": { "resource": "Patient", + "where": [{ "path": "id = 'pt1' or id = 'pt2' or id = 'pt3'" }], "select": [ { "column": [ @@ -233,6 +307,7 @@ "title": "forEachOrNull with %rowIndex", "view": { "resource": "Patient", + "where": [{ "path": "id = 'pt1' or id = 'pt2' or id = 'pt3'" }], "select": [ { "column": [{ "name": "id", "path": "id" }], @@ -279,6 +354,277 @@ { "id": "pt1", "family": "Jones", "one_based_index": 2 
}, { "id": "pt2", "family": "Brown", "one_based_index": 1 } ] + }, + { + "title": "repeat with %rowIndex — linear chain", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt4'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "repeat": ["extension"], + "column": [ + { "name": "row_index", "path": "%rowIndex" }, + { "name": "url", "path": "url", "type": "uri" } + ] + } + ] + }, + "expect": [ + { "id": "pt4", "row_index": 0, "url": "urn:ext1" }, + { "id": "pt4", "row_index": 1, "url": "urn:ext2" }, + { "id": "pt4", "row_index": 2, "url": "urn:ext3" }, + { "id": "pt4", "row_index": 3, "url": "urn:ext4" } + ] + }, + { + "title": "repeat with %rowIndex arithmetic", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt4'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "repeat": ["extension"], + "column": [ + { "name": "one_based", "path": "%rowIndex + 1" }, + { "name": "url", "path": "url", "type": "uri" } + ] + } + ] + }, + "expect": [ + { "id": "pt4", "one_based": 1, "url": "urn:ext1" }, + { "id": "pt4", "one_based": 2, "url": "urn:ext2" }, + { "id": "pt4", "one_based": 3, "url": "urn:ext3" }, + { "id": "pt4", "one_based": 4, "url": "urn:ext4" } + ] + }, + { + "title": "repeat with %rowIndex and nested forEach", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt4'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "repeat": ["extension"], + "column": [ + { "name": "repeat_index", "path": "%rowIndex" }, + { "name": "parent_url", "path": "url", "type": "uri" } + ], + "select": [ + { + "forEach": "extension", + "column": [ + { "name": "foreach_index", "path": "%rowIndex" }, + { "name": "child_url", "path": "url", "type": "uri" } + ] + } + ] + } + ] + }, + "expect": [ + { + "id": "pt4", + "repeat_index": 0, + "parent_url": "urn:ext1", + "foreach_index": 0, + "child_url": "urn:ext2" + }, + { + "id": "pt4", + "repeat_index": 1, + 
"parent_url": "urn:ext2", + "foreach_index": 0, + "child_url": "urn:ext3" + }, + { + "id": "pt4", + "repeat_index": 2, + "parent_url": "urn:ext3", + "foreach_index": 0, + "child_url": "urn:ext4" + } + ] + }, + { + "title": "repeat with %rowIndex — branching tree breadth-first", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt5'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "repeat": ["extension"], + "column": [ + { "name": "row_index", "path": "%rowIndex" }, + { "name": "url", "path": "url", "type": "uri" } + ] + } + ] + }, + "expect": [ + { "id": "pt5", "row_index": 0, "url": "urn:branch-root" }, + { "id": "pt5", "row_index": 1, "url": "urn:branch-child1" }, + { "id": "pt5", "row_index": 2, "url": "urn:branch-child2" }, + { "id": "pt5", "row_index": 3, "url": "urn:branch-grandchild" } + ] + }, + { + "title": "repeat with %rowIndex — counter resets per resource", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt4' or id = 'pt5'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "repeat": ["extension"], + "column": [ + { "name": "row_index", "path": "%rowIndex" }, + { "name": "url", "path": "url", "type": "uri" } + ] + } + ] + }, + "expect": [ + { "id": "pt4", "row_index": 0, "url": "urn:ext1" }, + { "id": "pt4", "row_index": 1, "url": "urn:ext2" }, + { "id": "pt4", "row_index": 2, "url": "urn:ext3" }, + { "id": "pt4", "row_index": 3, "url": "urn:ext4" }, + { "id": "pt5", "row_index": 0, "url": "urn:branch-root" }, + { "id": "pt5", "row_index": 1, "url": "urn:branch-child1" }, + { "id": "pt5", "row_index": 2, "url": "urn:branch-child2" }, + { "id": "pt5", "row_index": 3, "url": "urn:branch-grandchild" } + ] + }, + { + "title": "repeat nested inside repeat — independent %rowIndex", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt4'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "repeat": ["extension"], + "column": [ + 
{ "name": "outer_index", "path": "%rowIndex" }, + { "name": "url", "path": "url", "type": "uri" } + ], + "select": [ + { + "repeat": ["extension"], + "column": [ + { "name": "inner_index", "path": "%rowIndex" }, + { "name": "inner_url", "path": "url", "type": "uri" } + ] + } + ] + } + ] + }, + "expect": [ + { + "id": "pt4", + "outer_index": 0, + "url": "urn:ext1", + "inner_index": 0, + "inner_url": "urn:ext2" + }, + { + "id": "pt4", + "outer_index": 0, + "url": "urn:ext1", + "inner_index": 1, + "inner_url": "urn:ext3" + }, + { + "id": "pt4", + "outer_index": 0, + "url": "urn:ext1", + "inner_index": 2, + "inner_url": "urn:ext4" + }, + { + "id": "pt4", + "outer_index": 1, + "url": "urn:ext2", + "inner_index": 0, + "inner_url": "urn:ext3" + }, + { + "id": "pt4", + "outer_index": 1, + "url": "urn:ext2", + "inner_index": 1, + "inner_url": "urn:ext4" + }, + { + "id": "pt4", + "outer_index": 2, + "url": "urn:ext3", + "inner_index": 0, + "inner_url": "urn:ext4" + } + ] + }, + { + "title": "repeat nested inside forEach — independent %rowIndex", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt6'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "forEach": "extension", + "column": [ + { "name": "foreach_index", "path": "%rowIndex" }, + { "name": "parent_url", "path": "url", "type": "uri" } + ], + "select": [ + { + "repeat": ["extension"], + "column": [ + { "name": "repeat_index", "path": "%rowIndex" }, + { "name": "inner_url", "path": "url", "type": "uri" } + ] + } + ] + } + ] + }, + "expect": [ + { + "id": "pt6", + "foreach_index": 0, + "parent_url": "urn:fe1", + "repeat_index": 0, + "inner_url": "urn:fe1.1" + }, + { + "id": "pt6", + "foreach_index": 1, + "parent_url": "urn:fe2", + "repeat_index": 0, + "inner_url": "urn:fe2.1" + } + ] } ] } diff --git a/openspec/changes/archive/2026-04-01-repeat-row-index/.openspec.yaml b/openspec/changes/archive/2026-04-01-repeat-row-index/.openspec.yaml new file mode 100644 index 
0000000000..0f5280395b --- /dev/null +++ b/openspec/changes/archive/2026-04-01-repeat-row-index/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-01 diff --git a/openspec/changes/archive/2026-04-01-repeat-row-index/design.md b/openspec/changes/archive/2026-04-01-repeat-row-index/design.md new file mode 100644 index 0000000000..277f5d27d0 --- /dev/null +++ b/openspec/changes/archive/2026-04-01-repeat-row-index/design.md @@ -0,0 +1,61 @@ +## Context + +The `%rowIndex` environment variable is already implemented for `forEach`/`forEachOrNull` via `UnnestingSelection`. That implementation uses Spark's two-argument `transform(array, (element, index) -> ...)` lambda, which provides the element index natively. The index column is threaded through `ProjectionContext.withRowIndex()` and injected into expression evaluation via `SingleResourceEvaluator.withVariable()`. + +`RepeatSelection` works differently. It uses `ValueFunctions.transformTree()` to recursively flatten a tree structure (e.g., nested extensions) by concatenating results across depth levels and traversal branches. The `transformTree` function internally uses `Concat` to merge arrays from each depth level. There is no built-in Spark mechanism to track a global position index across this flattened concatenation. + +## Goals / Non-Goals + +**Goals:** + +- Provide `%rowIndex` as a 0-based integer within `repeat` iterations, reflecting the element's position in the flattened traversal-order output. +- Reset the counter to 0 for each resource row. +- Scope `%rowIndex` to the nearest enclosing iteration directive (`repeat`, `forEach`, or `forEachOrNull`), so nested directives maintain independent indices. + +**Non-Goals:** + +- Changing how `%rowIndex` works for `forEach`/`forEachOrNull` (already implemented). 
+ +## Decisions + +### Use a stateful counter expression for global traversal-order indexing + +**Decision:** Introduce a `RowIndexCounter` class (thread-safe via `ThreadLocal`) and two Spark expressions — `RowCounter` (returns current value and increments) and `ResetCounter` (resets to 0 before evaluating its child). The counter is shared across the entire `transformTree` invocation, producing a monotonically increasing sequence across all depth levels and branches. + +**Rationale:** Unlike `forEach` where Spark's indexed `transform` provides per-element indices natively, `transformTree` concatenates results from multiple recursive levels. No single Spark `transform` call sees the full flattened output. A stateful counter that increments on each element evaluation is the simplest way to produce a global traversal-order index. + +**Alternatives considered:** + +- _Post-hoc indexing with `posexplode`_: Explode the final array with position indices. Rejected because the array is already embedded in a column expression pipeline — adding a dataset-level operation would require restructuring the projection architecture. +- _Pre-stamping indices into the tree_: Wrap each element with its index before `transformTree`. Rejected because the total count across levels is not known until traversal completes, and the breadth-first concatenation order makes pre-computation complex. + +### Thread-safety via ThreadLocal + +**Decision:** `RowIndexCounter` uses `ThreadLocal` for its mutable state. The class is `Serializable` with a transient `ThreadLocal` field that is lazily re-initialized after deserialization. + +**Rationale:** Spark tasks run in parallel across partitions on different threads. `ThreadLocal` ensures each partition's task thread gets an independent counter, preventing cross-partition interference. The `int[]` wrapper avoids boxing overhead. Lazy re-initialization handles the case where the counter is serialized to an executor and deserialized in a new JVM. 
+ +### Inject counter via ProjectionContext.withRowIndex() + +**Decision:** `RepeatSelection` creates a `RowIndexCounter`, wraps it in a `RowCounter` column, and injects it into the `ProjectionContext` via the existing `withRowIndex()` method before building the `transformTree` expression. The final result is wrapped with `ResetCounter` to ensure the counter resets for each resource row. + +**Rationale:** This reuses the same mechanism that `UnnestingSelection` uses for `forEach` — the `rowIndex` field on `ProjectionContext` is already threaded into `evalExpression()` and resolved as the `%rowIndex` variable. The only difference is the source of the index column: Spark's indexed transform lambda vs. a stateful counter expression. + +**Scoping:** When a `forEach` is nested inside a `repeat`, the inner `UnnestingSelection` calls `withRowIndex(index)` with its own transform-provided index, naturally shadowing the outer `repeat`'s counter. Conversely, a `repeat` nested inside a `forEach` would create its own `RowIndexCounter`, independent of the outer scope. + +### Place RowCounter/ResetCounter in the encoders module + +**Decision:** The `RowCounter` and `ResetCounter` Spark expressions, along with the `RowIndexCounter` state class, are placed in the `encoders` module alongside other custom Spark expressions (`Expressions.scala`, `ValueFunctions.java`). + +**Rationale:** The `encoders` module already contains all custom Spark Catalyst expressions (e.g., `TransformTree`, `PruneSyntheticFields`). `RowCounter` and `ResetCounter` are general-purpose Spark expressions that could potentially be reused beyond `repeat`. Convenience methods are added to `ValueFunctions` following the existing pattern. + +## Risks / Trade-offs + +**[Risk] Evaluation order determinism** — The counter relies on deterministic evaluation order within `transformTree`. If Spark were to evaluate elements in a non-deterministic order, indices would be unpredictable. 
+→ **Mitigation:** `transformTree` uses `Concat` of `transform` calls, both of which preserve array order. Spark's higher-order functions evaluate elements sequentially within a single row. The spike's tests confirm deterministic ordering. + +**[Risk] Single-partition constraint in tests** — The spike's encoder unit tests use `.repartition(1)` to ensure deterministic evaluation order across rows. This is a test-level constraint, not a runtime limitation — in production, each partition processes its rows independently and the counter resets per row via `ResetCounter`. +→ **Mitigation:** Document this constraint in test comments. The `ResetCounter` ensures correctness regardless of partitioning. + +**[Risk] Codegen compatibility** — `RowCounter` extends `Nondeterministic` and implements `doGenCode` for Spark's whole-stage code generation. If the codegen path diverges from the interpreted path, indices could be incorrect. +→ **Mitigation:** The `ExpressionsBothModesTest` base class runs all encoder tests in both interpreted and codegen modes, catching any divergence. diff --git a/openspec/changes/archive/2026-04-01-repeat-row-index/proposal.md b/openspec/changes/archive/2026-04-01-repeat-row-index/proposal.md new file mode 100644 index 0000000000..e3f6d28942 --- /dev/null +++ b/openspec/changes/archive/2026-04-01-repeat-row-index/proposal.md @@ -0,0 +1,27 @@ +## Why + +The `%rowIndex` environment variable is currently implemented for `forEach` and `forEachOrNull` ViewDefinition directives but not for `repeat`. The `repeat` directive flattens recursive structures (e.g., nested extensions, Questionnaire items) into rows, and users need `%rowIndex` to preserve ordering, disambiguate elements, and construct surrogate keys — the same use cases that motivated `%rowIndex` for `forEach`. This completes the `%rowIndex` implementation across all iteration directives as defined in the SQL on FHIR ViewDefinition spec. 
+ +## What Changes + +- `%rowIndex` resolves to a 0-based global traversal-order index within `repeat` iterations, treating the entire flattened recursive tree as the collection being iterated. +- The counter resets to 0 for each resource row. +- Each `repeat` directive scopes its own `%rowIndex`, independent of enclosing or nested `forEach`/`forEachOrNull`/`repeat` directives. +- A stateful counter mechanism (`RowIndexCounter`) is introduced at the Spark expression level to track element positions across tree depth levels and traversal branches. + +## Capabilities + +### New Capabilities + +(none) + +### Modified Capabilities + +- `row-index-variable`: Add requirements for `%rowIndex` within `repeat` directives, including global traversal-order semantics, per-resource reset, and scoping rules for nested `repeat`. + +## Impact + +- `encoders` module: New Spark expressions (`RowCounter`, `ResetCounter`) and supporting `RowIndexCounter` class. +- `fhirpath` module: `RepeatSelection` injects the row index counter into `ProjectionContext` and wraps output with counter reset. +- ViewDefinition test suite: New `%rowIndex` + `repeat` test cases added to `rowindex.json` (alongside existing forEach/forEachOrNull tests), with additional test resource data for recursively nested extensions. +- Encoder unit tests: New tests in `ExpressionsBothModesTest` for counter behaviour with `transform`, `transformTree`, and arithmetic composition. 
diff --git a/openspec/changes/archive/2026-04-01-repeat-row-index/specs/row-index-variable/spec.md b/openspec/changes/archive/2026-04-01-repeat-row-index/specs/row-index-variable/spec.md new file mode 100644 index 0000000000..7efcda0440 --- /dev/null +++ b/openspec/changes/archive/2026-04-01-repeat-row-index/specs/row-index-variable/spec.md @@ -0,0 +1,83 @@ +## ADDED Requirements + +### Requirement: %rowIndex resolves to global traversal-order index within repeat + +When a ViewDefinition `select` clause uses `repeat`, the `%rowIndex` environment variable SHALL resolve to the 0-based index of the current element within the flattened collection produced by the recursive traversal. The index reflects the element's position in the complete flattened output (across all depth levels and traversal branches), not its position within a single depth level. + +#### Scenario: Linear repeat with sequential indices + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has a chain of 4 nested extensions (each containing one child extension) +- **THEN** `%rowIndex` SHALL be `0` for the first extension, `1` for its child, `2` for the grandchild, and `3` for the great-grandchild + +#### Scenario: Branching repeat with breadth-first indices + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has a root extension with 2 child extensions (the first child having 1 grandchild) +- **THEN** the root extension SHALL have `%rowIndex` `0`, its two children SHALL have `%rowIndex` `1` and `2` (in document order), and the grandchild SHALL have `%rowIndex` `3` + +#### Scenario: Repeat with single element + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has exactly 1 extension with no nested extensions +- **THEN** `%rowIndex` SHALL be `0` for that extension + +#### Scenario: Repeat with empty collection + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has no extensions +- **THEN** no rows are 
produced, so `%rowIndex` is not evaluated + +### Requirement: %rowIndex resets to 0 for each resource row within repeat + +The `%rowIndex` counter SHALL reset to 0 at the start of each resource row. The index sequence is scoped to a single resource's traversal, not global across the dataset. + +#### Scenario: Counter resets across resources + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and two resources each have nested extensions +- **THEN** the `%rowIndex` sequence SHALL start at `0` independently for each resource + +### Requirement: repeat scopes its own %rowIndex independently from enclosing and nested directives + +Each `repeat` directive SHALL maintain its own `%rowIndex` scope. A `forEach` or `forEachOrNull` nested inside a `repeat` SHALL have its own independent `%rowIndex`. Likewise, a `repeat` nested inside a `forEach` SHALL have its own independent `%rowIndex`. + +#### Scenario: forEach nested inside repeat has independent %rowIndex + +- **WHEN** a ViewDefinition has a `repeat: ["extension"]` with a nested `forEach: "extension"` inside it +- **THEN** the `repeat` level `%rowIndex` SHALL reflect the global traversal position, and the inner `forEach` `%rowIndex` SHALL reflect the 0-based index within that element's immediate children, independent of the outer repeat index + +#### Scenario: repeat nested inside forEach has independent %rowIndex + +- **WHEN** a ViewDefinition has `forEach: "name"` with a nested `repeat: ["extension"]` inside it +- **THEN** the outer `forEach` `%rowIndex` SHALL reflect the name index, and the inner `repeat` `%rowIndex` SHALL start at `0` for each name's extension traversal + +#### Scenario: repeat nested inside repeat has independent %rowIndex + +- **WHEN** a ViewDefinition has an outer `repeat: ["extension"]` with an inner `repeat: ["extension"]` nested inside it via a `select` +- **THEN** the outer `repeat` `%rowIndex` SHALL reflect the global traversal position in the outer flattened tree, and the inner 
`repeat` `%rowIndex` SHALL start at `0` independently for each element's nested extension traversal + +### Requirement: %rowIndex supports arithmetic within repeat + +The `%rowIndex` variable within `repeat` iterations SHALL resolve to an integer value compatible with FHIRPath integer type, allowing arithmetic operations. + +#### Scenario: Arithmetic with %rowIndex in repeat + +- **WHEN** a column expression within a `repeat` block is `%rowIndex + 1` +- **THEN** the result SHALL be the 1-based position of the element in the flattened traversal + +## MODIFIED Requirements + +### Requirement: Nested iterations maintain independent %rowIndex values + +Each nesting level of `forEach`/`forEachOrNull`/`repeat` SHALL maintain its own independent `%rowIndex`. An inner iteration directive resets `%rowIndex` to count within its own collection, restoring the outer `%rowIndex` when the inner iteration completes. + +#### Scenario: Nested forEach iterations + +- **WHEN** a ViewDefinition has an outer `forEach: "Patient.name"` (Patient has 2 names) and an inner `forEach: "HumanName.given"` (first name has 2 givens, second name has 1 given) +- **THEN** for the first name: outer `%rowIndex` is `0`, inner `%rowIndex` is `0` and `1` for each given; for the second name: outer `%rowIndex` is `1`, inner `%rowIndex` is `0` for its single given + +#### Scenario: Inner forEach does not affect outer %rowIndex + +- **WHEN** a column expression references `%rowIndex` at the outer forEach level after an inner forEach has completed +- **THEN** the value SHALL reflect the outer iteration index, unaffected by the inner iteration + +#### Scenario: Nested repeat and forEach maintain independent indices + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` containing a nested `forEach: "extension"` +- **THEN** each directive level SHALL maintain its own `%rowIndex`, with the inner `forEach` index being independent of the outer `repeat` index diff --git 
a/openspec/changes/archive/2026-04-01-repeat-row-index/tasks.md b/openspec/changes/archive/2026-04-01-repeat-row-index/tasks.md new file mode 100644 index 0000000000..8b61973c7f --- /dev/null +++ b/openspec/changes/archive/2026-04-01-repeat-row-index/tasks.md @@ -0,0 +1,18 @@ +## 1. Cherry-pick spike implementation + +- [x] 1.1 Cherry-pick commit `093d39c645` from `spike/repeat_row_index` onto `issue/2560` — this brings in `RowIndexCounter`, `RowCounter`/`ResetCounter` expressions, `ValueFunctions` methods, `RepeatSelection` wiring, encoder unit tests, and initial repeat `%rowIndex` view tests + +## 2. Move and extend ViewDefinition test cases (rowindex.json) + +- [x] 2.1 Move the 3 repeat `%rowIndex` tests from `repeat.json` to `rowindex.json` (remove from `repeat.json`) +- [x] 2.2 Add test resource with recursively nested extensions (linear chain) to `rowindex.json` resources — reuse or adapt the extension structure from `repeat.json` +- [x] 2.3 Add test resource with branching extensions (root extension with 2 children, first child has 1 grandchild) to `rowindex.json` resources +- [x] 2.4 Add test: repeat with `%rowIndex` — branching tree, breadth-first indices (uses branching resource from 2.3) +- [x] 2.5 Add test: repeat with `%rowIndex` across multiple resources — verify counter resets to 0 per resource +- [x] 2.6 Add test: repeat nested inside repeat — independent `%rowIndex` scopes + +## 3. 
Verification + +- [x] 3.1 Run encoder unit tests (`ExpressionsBothModesTest` subclasses) in both interpreted and codegen modes +- [x] 3.2 Run ViewDefinition test suite (`ViewDefinitionTest`) to verify all `rowindex.json` tests pass +- [x] 3.3 Run existing `repeat.json` tests to verify no regressions diff --git a/openspec/specs/row-index-variable/spec.md b/openspec/specs/row-index-variable/spec.md index 02bf2f460e..ef045252f4 100644 --- a/openspec/specs/row-index-variable/spec.md +++ b/openspec/specs/row-index-variable/spec.md @@ -42,9 +42,70 @@ When no `forEach` or `forEachOrNull` iteration is active (i.e. the expression is - **WHEN** a ViewDefinition `select` has a column with expression `%rowIndex` and no `forEach` or `forEachOrNull` is active - **THEN** the column value SHALL be `0` for every resource row +### Requirement: %rowIndex resolves to global traversal-order index within repeat + +When a ViewDefinition `select` clause uses `repeat`, the `%rowIndex` environment variable SHALL resolve to the 0-based index of the current element within the flattened collection produced by the recursive traversal. The index reflects the element's position in the complete flattened output (across all depth levels and traversal branches), not its position within a single depth level. 
+ +#### Scenario: Linear repeat with sequential indices + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has a chain of 4 nested extensions (each containing one child extension) +- **THEN** `%rowIndex` SHALL be `0` for the first extension, `1` for its child, `2` for the grandchild, and `3` for the great-grandchild + +#### Scenario: Branching repeat with breadth-first indices + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has a root extension with 2 child extensions (the first child having 1 grandchild) +- **THEN** the root extension SHALL have `%rowIndex` `0`, its two children SHALL have `%rowIndex` `1` and `2` (in document order), and the grandchild SHALL have `%rowIndex` `3` + +#### Scenario: Repeat with single element + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has exactly 1 extension with no nested extensions +- **THEN** `%rowIndex` SHALL be `0` for that extension + +#### Scenario: Repeat with empty collection + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and the resource has no extensions +- **THEN** no rows are produced, so `%rowIndex` is not evaluated + +### Requirement: %rowIndex resets to 0 for each resource row within repeat + +The `%rowIndex` counter SHALL reset to 0 at the start of each resource row. The index sequence is scoped to a single resource's traversal, not global across the dataset. + +#### Scenario: Counter resets across resources + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` and two resources each have nested extensions +- **THEN** the `%rowIndex` sequence SHALL start at `0` independently for each resource + +### Requirement: repeat scopes its own %rowIndex independently from enclosing and nested directives + +Each `repeat` directive SHALL maintain its own `%rowIndex` scope. A `forEach` or `forEachOrNull` nested inside a `repeat` SHALL have its own independent `%rowIndex`. 
Likewise, a `repeat` nested inside a `forEach` SHALL have its own independent `%rowIndex`. + +#### Scenario: forEach nested inside repeat has independent %rowIndex + +- **WHEN** a ViewDefinition has a `repeat: ["extension"]` with a nested `forEach: "extension"` inside it +- **THEN** the `repeat` level `%rowIndex` SHALL reflect the global traversal position, and the inner `forEach` `%rowIndex` SHALL reflect the 0-based index within that element's immediate children, independent of the outer repeat index + +#### Scenario: repeat nested inside forEach has independent %rowIndex + +- **WHEN** a ViewDefinition has `forEach: "name"` with a nested `repeat: ["extension"]` inside it +- **THEN** the outer `forEach` `%rowIndex` SHALL reflect the name index, and the inner `repeat` `%rowIndex` SHALL start at `0` for each name's extension traversal + +#### Scenario: repeat nested inside repeat has independent %rowIndex + +- **WHEN** a ViewDefinition has an outer `repeat: ["extension"]` with an inner `repeat: ["extension"]` nested inside it via a `select` +- **THEN** the outer `repeat` `%rowIndex` SHALL reflect the global traversal position in the outer flattened tree, and the inner `repeat` `%rowIndex` SHALL start at `0` independently for each element's nested extension traversal + +### Requirement: %rowIndex supports arithmetic within repeat + +The `%rowIndex` variable within `repeat` iterations SHALL resolve to an integer value compatible with FHIRPath integer type, allowing arithmetic operations. + +#### Scenario: Arithmetic with %rowIndex in repeat + +- **WHEN** a column expression within a `repeat` block is `%rowIndex + 1` +- **THEN** the result SHALL be the 1-based position of the element in the flattened traversal + ### Requirement: Nested iterations maintain independent %rowIndex values -Each nesting level of `forEach`/`forEachOrNull` SHALL maintain its own independent `%rowIndex`. 
An inner `forEach` resets `%rowIndex` to count within its own collection, and restoring the outer `%rowIndex` when the inner iteration completes. +Each nesting level of `forEach`/`forEachOrNull`/`repeat` SHALL maintain its own independent `%rowIndex`. An inner iteration directive resets `%rowIndex` to count within its own collection, restoring the outer `%rowIndex` when the inner iteration completes. #### Scenario: Nested forEach iterations @@ -56,6 +117,11 @@ Each nesting level of `forEach`/`forEachOrNull` SHALL maintain its own independe - **WHEN** a column expression references `%rowIndex` at the outer forEach level after an inner forEach has completed - **THEN** the value SHALL reflect the outer iteration index, unaffected by the inner iteration +#### Scenario: Nested repeat and forEach maintain independent indices + +- **WHEN** a ViewDefinition has `repeat: ["extension"]` containing a nested `forEach: "extension"` +- **THEN** each directive level SHALL maintain its own `%rowIndex`, with the inner `forEach` index being independent of the outer `repeat` index + ### Requirement: %rowIndex is available in nested select expressions The `%rowIndex` variable SHALL be accessible from any FHIRPath expression evaluated within the scope of the current iteration, including columns within nested `select` clauses that do not themselves introduce a new `forEach`/`forEachOrNull`. From bdbfb33d7f7a10328254002f7600ac1e3f41c164 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Wed, 1 Apr 2026 20:43:30 +1000 Subject: [PATCH 03/41] fix: Improve RowIndexCounter thread safety and suppress SonarCloud warning Replace lazy ThreadLocal initialization with eager init and readObject() to eliminate a race condition when the instance is shared across threads via Spark's addReferenceObj(). Suppress S5164 (ThreadLocal.remove()) with documentation explaining why removal is unnecessary. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../pathling/encoders/RowIndexCounter.java | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java index 09d2f587da..ad8a5b66b3 100644 --- a/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java +++ b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java @@ -17,6 +17,8 @@ package au.csiro.pathling.encoders; +import java.io.IOException; +import java.io.ObjectInputStream; import java.io.Serializable; /** @@ -24,23 +26,31 @@ * thread gets its own independent counter via {@link ThreadLocal}, ensuring that Spark tasks * running in parallel on different partitions do not interfere with each other. * + *

This class is shared across partitions via Spark's {@code addReferenceObj()} mechanism in + * codegen mode. Since reference objects are shared within an executor, {@link ThreadLocal} is + * required to isolate mutable state per task thread. + * *

This class is {@link Serializable} so that it survives Spark plan serialization to executors. - * The {@link ThreadLocal} state is transient and lazily re-initialized after deserialization. + * The {@link ThreadLocal} is eagerly initialized and re-initialized after deserialization via + * {@link #readObject(ObjectInputStream)}. + * + *

Note: {@link ThreadLocal#remove()} is intentionally not called. The stored value is a single + * {@code int[1]} (16 bytes) that is reset to zero each row via {@link #reset()}. When this object + * becomes unreachable, the {@link ThreadLocal}'s weak-reference key is collected and the stale + * entry is cleaned up lazily by subsequent {@link ThreadLocal} operations on the same thread. * * @author Piotr Szul */ +@SuppressWarnings("java:S5164") // ThreadLocal.remove() not needed — see class Javadoc. public class RowIndexCounter implements Serializable { private static final long serialVersionUID = 1L; - @SuppressWarnings("TransientFieldNotInitialized") - private transient ThreadLocal counter; + private transient ThreadLocal counter = ThreadLocal.withInitial(() -> new int[] {0}); - private ThreadLocal getCounter() { - if (counter == null) { - counter = ThreadLocal.withInitial(() -> new int[] {0}); - } - return counter; + private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException { + in.defaultReadObject(); + counter = ThreadLocal.withInitial(() -> new int[] {0}); } /** @@ -50,7 +60,7 @@ private ThreadLocal getCounter() { * @return the current counter value before incrementing */ public int getAndIncrement() { - return getCounter().get()[0]++; + return counter.get()[0]++; } /** @@ -58,6 +68,6 @@ public int getAndIncrement() { * top-level row to ensure the index sequence starts fresh. */ public void reset() { - getCounter().get()[0] = 0; + counter.get()[0] = 0; } } From ee8d008fd4ad0632bcb23d8b00783fd9f477dade Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 7 Apr 2026 21:55:19 +1000 Subject: [PATCH 04/41] fix: Ensure duplicate %rowIndex references in repeat produce consistent values Split RowCounter into separate read (RowCounterGet) and increment (RowCounterIncrement) operations so that multiple references to %rowIndex within the same repeat element all read the same value. 
Previously each reference independently called getAndIncrement(), causing N references to consume N counter values per element instead of one. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../pathling/encoders/RowIndexCounter.java | 19 +++++ .../pathling/encoders/ValueFunctions.java | 31 +++++++ .../csiro/pathling/encoders/Expressions.scala | 80 +++++++++++++++++++ .../pathling/projection/RepeatSelection.java | 32 ++++++-- .../test/resources/viewTests/rowindex.json | 26 ++++++ 5 files changed, 183 insertions(+), 5 deletions(-) diff --git a/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java index ad8a5b66b3..86d0804e72 100644 --- a/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java +++ b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java @@ -53,6 +53,17 @@ private void readObject(final ObjectInputStream in) throws IOException, ClassNot counter = ThreadLocal.withInitial(() -> new int[] {0}); } + /** + * Returns the current counter value without modifying it. Multiple calls between increments + * return the same value, making this safe to use when the counter is referenced more than once + * per element. + * + * @return the current counter value + */ + public int get() { + return counter.get()[0]; + } + /** * Returns the current counter value and increments it. The first call after a {@link #reset()} * returns 0. @@ -63,6 +74,14 @@ public int getAndIncrement() { return counter.get()[0]++; } + /** + * Increments the counter without returning a value. This is used to advance the counter after all + * references to the current value have been evaluated. + */ + public void increment() { + counter.get()[0]++; + } + /** * Resets the counter to zero for the current thread. This should be called before evaluating each * top-level row to ensure the index sequence starts fresh. 
diff --git a/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java b/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java index f98d63b18f..ac24a95d76 100644 --- a/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java +++ b/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java @@ -446,6 +446,37 @@ public static Column rowCounter(@Nonnull final RowIndexCounter state) { return column(new RowCounter(state)); } + /** + * Creates a read-only view of a shared {@link RowIndexCounter}. Each evaluation returns the + * current counter value without incrementing it, so multiple references within the same element + * evaluation all see the same value. + * + *

The counter must be advanced separately via {@link #rowCounterIncrement(Column, + * RowIndexCounter)} after all references for a given element have been evaluated. + * + * @param state the shared counter instance + * @return a Column that reads the current counter value without incrementing + */ + @Nonnull + public static Column rowCounterGet(@Nonnull final RowIndexCounter state) { + return column(new RowCounterGet(state)); + } + + /** + * Wraps a column expression so that the shared row counter is incremented after evaluating the + * expression. This should be applied to the extractor result in a repeat projection to ensure the + * counter advances exactly once per element. + * + * @param child the expression to evaluate before incrementing + * @param state the shared counter instance to increment + * @return a Column that evaluates the child and then increments the counter + */ + @Nonnull + public static Column rowCounterIncrement( + @Nonnull final Column child, @Nonnull final RowIndexCounter state) { + return column(new RowCounterIncrement(expression(child), state)); + } + /** * Wraps a column expression so that the shared row counter is reset to zero before evaluating the * expression. This should be applied at the outermost level of a repeat projection to ensure the diff --git a/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala b/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala index 74611400f1..6477e0ad45 100644 --- a/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala +++ b/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala @@ -989,6 +989,86 @@ case class RowCounter(state: RowIndexCounter) } } +/** + * A leaf expression that reads the current value of a [[RowIndexCounter]] without incrementing it. + * Multiple references to this expression within the same element evaluation all return the same + * value, making it safe for use when `%rowIndex` is referenced more than once. 
+ * + * The counter must be advanced separately via [[RowCounterIncrement]] after all references for a + * given element have been evaluated. + * + * @param state the shared thread-safe counter + */ +case class RowCounterGet(state: RowIndexCounter) + extends LeafExpression with Nondeterministic { + + override def stateful: Boolean = true + + override def nullable: Boolean = false + + override def dataType: DataType = IntegerType + + override protected def initializeInternal(partitionIndex: Int): Unit = { + // No-op: reset is handled by ResetCounter at the per-row level, not per-partition. + } + + override protected def evalInternal(input: InternalRow): Int = { + state.get() + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val counterRef = ctx.addReferenceObj("rowCounter", state, classOf[RowIndexCounter].getName) + ev.copy(code = code""" + final ${CodeGenerator.javaType(dataType)} ${ev.value} = $counterRef.get();""", + isNull = FalseLiteral) + } + + override def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = { + RowCounterGet(state) + } +} + +/** + * A unary expression that increments a [[RowIndexCounter]] after evaluating its child expression. + * This is used to advance the counter exactly once per element, after all `%rowIndex` references + * (via [[RowCounterGet]]) have been read. + * + * @param child the expression to evaluate before incrementing + * @param state the shared thread-safe counter to increment + */ +case class RowCounterIncrement(child: Expression, state: RowIndexCounter) + extends UnaryExpression with NonSQLExpression { + + override def dataType: DataType = child.dataType + + override def nullable: Boolean = child.nullable + + override protected def nullSafeEval(input: Any): Any = { + // This should not be called — we override eval directly. 
+ throw new UnsupportedOperationException(ExpressionConstants.CODEGEN_ONLY_MSG) + } + + override def eval(input: InternalRow): Any = { + val result = child.eval(input) + state.increment() + result + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val counterRef = ctx.addReferenceObj("rowCounter", state, classOf[RowIndexCounter].getName) + val childEval = child.genCode(ctx) + ev.copy(code = code""" + ${childEval.code} + final boolean ${ev.isNull} = ${childEval.isNull}; + final ${CodeGenerator.javaType(dataType)} ${ev.value} = ${childEval.value}; + $counterRef.increment();""") + } + + override protected def withNewChildInternal(newChild: Expression): Expression = { + RowCounterIncrement(newChild, state) + } +} + /** * A unary expression that resets a [[RowCounter]]'s shared state to zero before evaluating its * child expression. This ensures the counter starts fresh for each row when used inside diff --git a/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java b/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java index 4de03ee833..c5f503396c 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java +++ b/fhirpath/src/main/java/au/csiro/pathling/projection/RepeatSelection.java @@ -26,6 +26,7 @@ import au.csiro.pathling.fhirpath.column.DefaultRepresentation; import jakarta.annotation.Nonnull; import java.util.List; +import java.util.function.UnaryOperator; import java.util.stream.Collectors; import org.apache.spark.sql.Column; import org.hl7.fhir.r4.model.Enumerations.FHIRDefinedType; @@ -56,11 +57,12 @@ public record RepeatSelection( @Override public ProjectionResult evaluate(@Nonnull final ProjectionContext context) { - // Create a shared counter for the %rowIndex environment variable. Each element extracted by the - // tree traversal increments this counter, producing a global 0-based index across all depth - // levels and traversal branches. 
+ // Create a shared counter for the %rowIndex environment variable. The counter is split into + // read and increment operations: all %rowIndex references within a single element read the + // same value (via rowCounterGet), and the counter advances exactly once per element (via + // rowCounterIncrement wrapping the extractor result). final RowIndexCounter rowIndexCounter = new RowIndexCounter(); - final Column rowIndexCol = ValueFunctions.rowCounter(rowIndexCounter); + final Column rowIndexCol = ValueFunctions.rowCounterGet(rowIndexCounter); // Evaluate each path to get collections, retaining them for type inspection. final List pathCollections = paths.stream().map(context::evalExpression).toList(); @@ -91,7 +93,8 @@ public ProjectionResult evaluate(@Nonnull final ProjectionContext context) { ctx.inputContext().getColumnValue(), c -> ValueFunctions.emptyArrayIfMissingField( - component.evaluateElementWise(ctx.withInputColumn(c))), + evaluateElementWiseWithIncrement( + ctx.withInputColumn(c), rowIndexCounter)), paths.stream().map(ctx::asColumnOperator).toList(), maxDepth, errorOnDepthExhaustion)) @@ -115,6 +118,25 @@ public ProjectionResult evaluate(@Nonnull final ProjectionContext context) { return component.evaluate(schemaContext).withResultColumn(result); } + /** + * Evaluates the component clause element-wise, wrapping each per-element result with a counter + * increment. This ensures the shared row index counter advances exactly once per array element, + * after all {@code %rowIndex} references within that element have been read. 
+ * + * @param context the projection context for evaluation + * @param counter the shared counter to increment after each element + * @return the resulting column after element-wise evaluation with per-element increment + */ + @Nonnull + private Column evaluateElementWiseWithIncrement( + @Nonnull final ProjectionContext context, @Nonnull final RowIndexCounter counter) { + final UnaryOperator elementOperator = component.asColumnOperator(context); + return new DefaultRepresentation(context.inputContext().getColumnValue()) + .transform(c -> ValueFunctions.rowCounterIncrement(elementOperator.apply(c), counter)) + .flatten() + .getValue(); + } + /** * Returns the FHIRPath expression representation of this repeat selection. * diff --git a/fhirpath/src/test/resources/viewTests/rowindex.json b/fhirpath/src/test/resources/viewTests/rowindex.json index 4758c187e5..0c036bb429 100644 --- a/fhirpath/src/test/resources/viewTests/rowindex.json +++ b/fhirpath/src/test/resources/viewTests/rowindex.json @@ -582,6 +582,32 @@ } ] }, + { + "title": "repeat with duplicate %rowIndex references — same value per element", + "view": { + "resource": "Patient", + "where": [{ "path": "id = 'pt4'" }], + "select": [ + { + "column": [{ "name": "id", "path": "id" }] + }, + { + "repeat": ["extension"], + "column": [ + { "name": "idx_a", "path": "%rowIndex" }, + { "name": "idx_b", "path": "%rowIndex" }, + { "name": "url", "path": "url", "type": "uri" } + ] + } + ] + }, + "expect": [ + { "id": "pt4", "idx_a": 0, "idx_b": 0, "url": "urn:ext1" }, + { "id": "pt4", "idx_a": 1, "idx_b": 1, "url": "urn:ext2" }, + { "id": "pt4", "idx_a": 2, "idx_b": 2, "url": "urn:ext3" }, + { "id": "pt4", "idx_a": 3, "idx_b": 3, "url": "urn:ext4" } + ] + }, { "title": "repeat nested inside forEach — independent %rowIndex", "view": { From f3d79e6f7f85c00644f2e10fc75f7f5972da010c Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 7 Apr 2026 22:27:43 +1000 Subject: [PATCH 05/41] refactor: Remove obsolete RowCounter 
expression and related tests RowCounter (getAndIncrement per evaluation) is superseded by the RowCounterGet + RowCounterIncrement split. Remove the old expression, its ValueFunctions helper, the getAndIncrement method, and the four encoder-level tests that exercised it. The equivalent behavior is now tested via ViewDefinition-level tests in rowindex.json. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../pathling/encoders/RowIndexCounter.java | 10 - .../pathling/encoders/ValueFunctions.java | 16 - .../csiro/pathling/encoders/Expressions.scala | 44 +-- .../encoders/ExpressionsBothModesTest.java | 276 ------------------ 4 files changed, 1 insertion(+), 345 deletions(-) diff --git a/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java index 86d0804e72..99af311768 100644 --- a/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java +++ b/encoders/src/main/java/au/csiro/pathling/encoders/RowIndexCounter.java @@ -64,16 +64,6 @@ public int get() { return counter.get()[0]; } - /** - * Returns the current counter value and increments it. The first call after a {@link #reset()} - * returns 0. - * - * @return the current counter value before incrementing - */ - public int getAndIncrement() { - return counter.get()[0]++; - } - /** * Increments the counter without returning a value. This is used to advance the counter after all * references to the current value have been evaluated. 
diff --git a/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java b/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java index ac24a95d76..82d7e2c5e0 100644 --- a/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java +++ b/encoders/src/main/java/au/csiro/pathling/encoders/ValueFunctions.java @@ -430,22 +430,6 @@ public static Column pruneAnnotations(@Nonnull final Column col) { return column(new PruneSyntheticFields(expression(col))); } - /** - * Creates a new row counter backed by a shared {@link RowIndexCounter}. Each evaluation of the - * returned column increments the counter and returns its previous value, producing a 0-based - * sequence: 0, 1, 2, ... - * - *

The counter must be reset before each top-level evaluation (e.g. per resource row) using - * {@link #resetCounter(Column, RowIndexCounter)}. - * - * @param state the shared counter instance - * @return a Column that produces the next integer on each evaluation - */ - @Nonnull - public static Column rowCounter(@Nonnull final RowIndexCounter state) { - return column(new RowCounter(state)); - } - /** * Creates a read-only view of a shared {@link RowIndexCounter}. Each evaluation returns the * current counter value without incrementing it, so multiple references within the same element diff --git a/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala b/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala index 6477e0ad45..64b278f70b 100644 --- a/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala +++ b/encoders/src/main/scala/au/csiro/pathling/encoders/Expressions.scala @@ -947,48 +947,6 @@ case class UnresolvedVariantUnwrap(inner: Expression, schemaRef: Expression, override def toString: String = s"VariantUnwrap($inner)" } -/** - * A stateful, non-deterministic expression that returns a monotonically increasing integer each - * time it is evaluated. The counter is shared via a [[RowIndexCounter]] instance which uses - * [[ThreadLocal]] storage to ensure thread safety across parallel Spark tasks. - * - * This is designed for use inside array-producing expressions (e.g. `transform`, `Concat`) where - * the evaluation order is deterministic and single-threaded within a row. The counter must be reset - * to zero before each top-level evaluation via [[ResetCounter]]. - * - * Modeled after Spark's `MonotonicallyIncreasingID`. 
- * - * @param state the shared thread-safe counter - */ -case class RowCounter(state: RowIndexCounter) - extends LeafExpression with Nondeterministic { - - override def stateful: Boolean = true - - override def nullable: Boolean = false - - override def dataType: DataType = IntegerType - - override protected def initializeInternal(partitionIndex: Int): Unit = { - // No-op: reset is handled by ResetCounter at the per-row level, not per-partition. - } - - override protected def evalInternal(input: InternalRow): Int = { - state.getAndIncrement() - } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val counterRef = ctx.addReferenceObj("rowCounter", state, classOf[RowIndexCounter].getName) - ev.copy(code = code""" - final ${CodeGenerator.javaType(dataType)} ${ev.value} = $counterRef.getAndIncrement();""", - isNull = FalseLiteral) - } - - override def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = { - RowCounter(state) - } -} - /** * A leaf expression that reads the current value of a [[RowIndexCounter]] without incrementing it. * Multiple references to this expression within the same element evaluation all return the same @@ -1070,7 +1028,7 @@ case class RowCounterIncrement(child: Expression, state: RowIndexCounter) } /** - * A unary expression that resets a [[RowCounter]]'s shared state to zero before evaluating its + * A unary expression that resets a [[RowIndexCounter]]'s shared state to zero before evaluating its * child expression. This ensures the counter starts fresh for each row when used inside * per-row array transformations. 
* diff --git a/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java b/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java index 3594ec18aa..5f011236d9 100644 --- a/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java +++ b/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java @@ -45,7 +45,6 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; import scala.collection.Seq; -import scala.jdk.javaapi.CollectionConverters; /** * Abstract base class for expression tests that need to run in both codegen and interpreted modes. @@ -293,279 +292,4 @@ void testStructProductInlineWithUnsafeRowData() { assertEquals(expected.get(i), actual.get(i), "Row " + i + " mismatch"); } } - - /** - * Tests that RowCounter produces sequential 0-based indices within a simple array transform, and - * that ResetCounter resets the sequence for each row. - */ - @Test - void testRowCounterWithSimpleTransform() { - final RowIndexCounter counter = new RowIndexCounter(); - final Column counterCol = ValueFunctions.rowCounter(counter); - - // Create a dataset with two rows, each containing an array of different lengths. - final Dataset ds = - spark - .createDataFrame( - List.of( - RowFactory.create(1, List.of("a", "b", "c")), - RowFactory.create(2, List.of("d", "e"))), - DataTypes.createStructType( - new StructField[] { - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField( - "items", - DataTypes.createArrayType(DataTypes.StringType), - false, - Metadata.empty()) - })) - .repartition(1); - - // Use transform to stamp each element with the counter, then wrap with resetCounter. 
- final Column transformed = - functions.transform( - ds.col("items"), elem -> functions.struct(elem.alias("val"), counterCol.alias("idx"))); - final Column withReset = ValueFunctions.resetCounter(transformed, counter); - - final Dataset result = ds.withColumn("indexed", withReset); - final List rows = result.collectAsList(); - - assertEquals(2, rows.size()); - - // Row 1: 3 elements → indices 0, 1, 2. - final Seq row1Seq = rows.get(0).getAs("indexed"); - final List row1Items = CollectionConverters.asJava(row1Seq); - assertEquals(3, row1Items.size()); - assertEquals(0, (int) ((Row) row1Items.get(0)).getAs("idx")); - assertEquals(1, (int) ((Row) row1Items.get(1)).getAs("idx")); - assertEquals(2, (int) ((Row) row1Items.get(2)).getAs("idx")); - - // Row 2: 2 elements → indices reset to 0, 1. - final Seq row2Seq = rows.get(1).getAs("indexed"); - final List row2Items = CollectionConverters.asJava(row2Seq); - assertEquals(2, row2Items.size()); - assertEquals(0, (int) ((Row) row2Items.get(0)).getAs("idx")); - assertEquals(1, (int) ((Row) row2Items.get(1)).getAs("idx")); - } - - /** - * Tests that RowCounter produces a continuous global sequence when used inside a transformTree - * with a single traversal, producing sequential indices across all depth levels. - */ - @Test - void testRowCounterWithTransformTree() { - final Metadata metadata = Metadata.empty(); - - // Build a 3-level nested structure: root has 2 items, first item has 1 child. 
- final StructType leafType = - DataTypes.createStructType( - new StructField[] {new StructField("linkId", DataTypes.StringType, true, metadata)}); - - final StructType midType = - DataTypes.createStructType( - new StructField[] { - new StructField("linkId", DataTypes.StringType, true, metadata), - new StructField("item", DataTypes.createArrayType(leafType), true, metadata) - }); - - final StructType rootItemType = - DataTypes.createStructType( - new StructField[] { - new StructField("linkId", DataTypes.StringType, true, metadata), - new StructField("item", DataTypes.createArrayType(midType), true, metadata) - }); - - final StructType rootSchema = - DataTypes.createStructType( - new StructField[] { - new StructField("id", DataTypes.IntegerType, false, metadata), - new StructField("items", DataTypes.createArrayType(rootItemType), true, metadata) - }); - - // Tree structure: - // items[0] (linkId: "1") - // └── item[0] (linkId: "1.1") - // └── item[0] (linkId: "1.1.1") - // items[1] (linkId: "2") - final Row leaf = RowFactory.create("1.1.1"); - final Row mid = RowFactory.create("1.1", List.of(leaf)); - final Row root0 = RowFactory.create("1", List.of(mid)); - final Row root1 = RowFactory.create("2", List.of()); - - final Dataset ds = - spark - .createDataFrame( - List.of( - RowFactory.create(1, List.of(root0, root1)), - RowFactory.create(2, List.of(root1))), - rootSchema) - .repartition(1); - - final RowIndexCounter counter = new RowIndexCounter(); - final Column counterCol = ValueFunctions.rowCounter(counter); - - // Extractor: produce Array[Struct{linkId, idx}] from each array node. 
- final Column treeResult = - ValueFunctions.transformTree( - ds.col("items"), - c -> - functions.transform( - c, - elem -> - functions.struct( - elem.getField("linkId").alias("linkId"), counterCol.alias("idx"))), - List.of(c -> ValueFunctions.unnest(c.getField("item"))), - 2); - - final Column withReset = ValueFunctions.resetCounter(treeResult, counter); - final Dataset result = ds.withColumn("collected", withReset); - final List rows = result.collectAsList(); - - assertEquals(2, rows.size()); - - // Row 1: transformTree produces breadth-first-like order: - // Concat(extractor(root_items), transformTree(root_items.item)) - // = Concat(["1","2"], Concat(["1.1"], ["1.1.1"])) - // = ["1", "2", "1.1", "1.1.1"] - final Seq row1Seq = rows.get(0).getAs("collected"); - final List row1 = CollectionConverters.asJava(row1Seq); - assertEquals(4, row1.size()); - assertEquals("1", ((Row) row1.get(0)).getAs("linkId")); - assertEquals(0, (int) ((Row) row1.get(0)).getAs("idx")); - assertEquals("2", ((Row) row1.get(1)).getAs("linkId")); - assertEquals(1, (int) ((Row) row1.get(1)).getAs("idx")); - assertEquals("1.1", ((Row) row1.get(2)).getAs("linkId")); - assertEquals(2, (int) ((Row) row1.get(2)).getAs("idx")); - assertEquals("1.1.1", ((Row) row1.get(3)).getAs("linkId")); - assertEquals(3, (int) ((Row) row1.get(3)).getAs("idx")); - - // Row 2: tree has 1 node → "2"(0) — counter resets. - final Seq row2Seq = rows.get(1).getAs("collected"); - final List row2 = CollectionConverters.asJava(row2Seq); - assertEquals(1, row2.size()); - assertEquals("2", ((Row) row2.get(0)).getAs("linkId")); - assertEquals(0, (int) ((Row) row2.get(0)).getAs("idx")); - } - - /** - * Tests that RowCounter works with multiple traversal paths in transformTree, producing a - * continuous global index across all branches and depths. 
- */ - @Test - void testRowCounterWithMultipleTraversals() { - final Metadata metadata = Metadata.empty(); - - // Build a structure with two traversal paths: "item" and self-reference. - final StructType level2Type = - DataTypes.createStructType( - new StructField[] {new StructField("linkId", DataTypes.StringType, true, metadata)}); - - final StructType level1Type = - DataTypes.createStructType( - new StructField[] { - new StructField("linkId", DataTypes.StringType, true, metadata), - new StructField("item", DataTypes.createArrayType(level2Type), true, metadata) - }); - - final StructType level0Type = - DataTypes.createStructType( - new StructField[] { - new StructField("linkId", DataTypes.StringType, true, metadata), - new StructField("item", DataTypes.createArrayType(level1Type), true, metadata) - }); - - final StructType rootSchema = - DataTypes.createStructType( - new StructField[] { - new StructField("id", DataTypes.IntegerType, false, metadata), - new StructField("items", DataTypes.createArrayType(level0Type), true, metadata) - }); - - // items[0] (linkId: "1") → item[0] (linkId: "2") → item[0] (linkId: "3"). - final Row level2 = RowFactory.create("3"); - final Row level1 = RowFactory.create("2", List.of(level2)); - final Row level0 = RowFactory.create("1", List.of(level1)); - - final Dataset ds = - spark - .createDataFrame(List.of(RowFactory.create(1, List.of(level0))), rootSchema) - .repartition(1); - - final RowIndexCounter counter = new RowIndexCounter(); - final Column counterCol = ValueFunctions.rowCounter(counter); - - // Use two traversals: item navigation and self-reference (like the existing test). 
- final Column treeResult = - ValueFunctions.transformTree( - ds.col("items"), - c -> - functions.transform( - c, - elem -> - functions.struct( - elem.getField("linkId").alias("linkId"), counterCol.alias("idx"))), - List.of(c -> ValueFunctions.unnest(c.getField("item")), c -> c), - 1); - - final Column withReset = ValueFunctions.resetCounter(treeResult, counter); - final Dataset result = ds.withColumn("collected", withReset); - final List rows = result.collectAsList(); - - assertEquals(1, rows.size()); - - // The existing test (without counter) produces linkIds: [1, 2, 3, 3, 2, 3, 1, 2, 3]. - // Each element should have a sequential global index. - final Seq collectedSeq = rows.get(0).getAs("collected"); - final List collected = CollectionConverters.asJava(collectedSeq); - assertEquals(9, collected.size()); - - // Verify sequential indices 0..8. - for (int i = 0; i < 9; i++) { - assertEquals( - i, (int) ((Row) collected.get(i)).getAs("idx"), "Index mismatch at position " + i); - } - } - - /** - * Tests that RowCounter composes with arithmetic expressions, validating that %rowIndex + 1 style - * usage works correctly. - */ - @Test - void testRowCounterInArithmeticExpression() { - final RowIndexCounter counter = new RowIndexCounter(); - final Column counterCol = ValueFunctions.rowCounter(counter); - - final Dataset ds = - spark - .createDataFrame( - List.of(RowFactory.create(1, List.of("a", "b", "c"))), - DataTypes.createStructType( - new StructField[] { - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField( - "items", - DataTypes.createArrayType(DataTypes.StringType), - false, - Metadata.empty()) - })) - .repartition(1); - - // Use counter in an arithmetic expression: counter + 1 (1-based index). 
- final Column transformed = - functions.transform( - ds.col("items"), - elem -> functions.struct(elem.alias("val"), counterCol.plus(1).alias("one_based_idx"))); - final Column withReset = ValueFunctions.resetCounter(transformed, counter); - - final Dataset result = ds.withColumn("indexed", withReset); - final List rows = result.collectAsList(); - - assertEquals(1, rows.size()); - final Seq itemsSeq = rows.get(0).getAs("indexed"); - final List items = CollectionConverters.asJava(itemsSeq); - assertEquals(3, items.size()); - assertEquals(1, (int) ((Row) items.get(0)).getAs("one_based_idx")); - assertEquals(2, (int) ((Row) items.get(1)).getAs("one_based_idx")); - assertEquals(3, (int) ((Row) items.get(2)).getAs("one_based_idx")); - } } From 5086c9d9497ad84973aa46a116393817acacba9e Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Wed, 8 Apr 2026 10:31:34 +1000 Subject: [PATCH 06/41] test: Add unit tests for RowIndexCounter and row counter expressions Cover the 13 new lines flagged by SonarCloud as uncovered: all methods on RowIndexCounter (get, increment, reset, serialization) and the three ValueFunctions entry points (rowCounterGet, rowCounterIncrement, resetCounter) exercised via a Spark dataset test in both codegen and interpreted modes. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../encoders/ExpressionsBothModesTest.java | 65 +++++++++++ .../encoders/RowIndexCounterTest.java | 101 ++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 encoders/src/test/java/au/csiro/pathling/encoders/RowIndexCounterTest.java diff --git a/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java b/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java index 5f011236d9..7e6f2bd083 100644 --- a/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java +++ b/encoders/src/test/java/au/csiro/pathling/encoders/ExpressionsBothModesTest.java @@ -292,4 +292,69 @@ void testStructProductInlineWithUnsafeRowData() { assertEquals(expected.get(i), actual.get(i), "Row " + i + " mismatch"); } } + + /** + * Tests that {@link ValueFunctions#rowCounterGet}, {@link ValueFunctions#rowCounterIncrement}, + * and {@link ValueFunctions#resetCounter} work together to assign sequential indices within an + * array transform and reset between rows. + */ + @Test + void testRowCounterExpressions() { + // Create a dataset with two rows, each containing an array of structs. 
+ final StructType itemType = + DataTypes.createStructType( + new StructField[] { + new StructField("value", DataTypes.StringType, true, Metadata.empty()) + }); + final StructType schema = + DataTypes.createStructType( + new StructField[] { + new StructField("id", DataTypes.StringType, true, Metadata.empty()), + new StructField("items", DataTypes.createArrayType(itemType), true, Metadata.empty()) + }); + + final List data = + Arrays.asList( + RowFactory.create( + "r1", + Arrays.asList( + RowFactory.create("a"), RowFactory.create("b"), RowFactory.create("c"))), + RowFactory.create("r2", Arrays.asList(RowFactory.create("x"), RowFactory.create("y")))); + + final Dataset ds = spark.createDataFrame(data, schema).repartition(1); + + // Build a transform that assigns a row index to each array element using the counter + // expressions. + final RowIndexCounter counter = new RowIndexCounter(); + final Column indexCol = ValueFunctions.rowCounterGet(counter); + + // Transform each item: struct(value, index), then increment the counter. + final Column transformed = + functions.transform( + col("items"), + item -> + ValueFunctions.rowCounterIncrement( + struct(item.getField("value").alias("value"), indexCol.alias("idx")), counter)); + + // Wrap with resetCounter so the index restarts at zero for each row. + final Column withReset = ValueFunctions.resetCounter(transformed, counter); + + final Dataset result = ds.select(col("id"), withReset.alias("indexed_items")); + final List rows = result.collectAsList(); + + assertEquals(2, rows.size()); + + // Row 1: three items with indices 0, 1, 2. + final List items1 = rows.get(0).getList(1); + assertEquals(3, items1.size()); + assertEquals(0, items1.get(0).getInt(1)); + assertEquals(1, items1.get(1).getInt(1)); + assertEquals(2, items1.get(2).getInt(1)); + + // Row 2: two items with indices 0, 1 (counter was reset). 
+ final List items2 = rows.get(1).getList(1); + assertEquals(2, items2.size()); + assertEquals(0, items2.get(0).getInt(1)); + assertEquals(1, items2.get(1).getInt(1)); + } } diff --git a/encoders/src/test/java/au/csiro/pathling/encoders/RowIndexCounterTest.java b/encoders/src/test/java/au/csiro/pathling/encoders/RowIndexCounterTest.java new file mode 100644 index 0000000000..c114357269 --- /dev/null +++ b/encoders/src/test/java/au/csiro/pathling/encoders/RowIndexCounterTest.java @@ -0,0 +1,101 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package au.csiro.pathling.encoders; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import org.junit.jupiter.api.Test; + +/** Tests for {@link RowIndexCounter}. 
*/ +class RowIndexCounterTest { + + @Test + void getReturnsZeroInitially() { + final RowIndexCounter counter = new RowIndexCounter(); + assertEquals(0, counter.get()); + } + + @Test + void incrementAdvancesCounter() { + final RowIndexCounter counter = new RowIndexCounter(); + counter.increment(); + assertEquals(1, counter.get()); + counter.increment(); + assertEquals(2, counter.get()); + } + + @Test + void getIsIdempotentBetweenIncrements() { + final RowIndexCounter counter = new RowIndexCounter(); + counter.increment(); + assertEquals(1, counter.get()); + assertEquals(1, counter.get()); + } + + @Test + void resetSetsCounterToZero() { + final RowIndexCounter counter = new RowIndexCounter(); + counter.increment(); + counter.increment(); + assertEquals(2, counter.get()); + counter.reset(); + assertEquals(0, counter.get()); + } + + @Test + void threadLocalIsolation() throws Exception { + final RowIndexCounter counter = new RowIndexCounter(); + counter.increment(); + counter.increment(); + + // A different thread should see its own independent counter starting at zero. + final int[] otherThreadValue = new int[1]; + final Thread thread = new Thread(() -> otherThreadValue[0] = counter.get()); + thread.start(); + thread.join(); + + assertEquals(0, otherThreadValue[0]); + assertEquals(2, counter.get()); + } + + @Test + void serializationRestoresCounter() throws Exception { + final RowIndexCounter counter = new RowIndexCounter(); + counter.increment(); + + // Serialize and deserialize. + final ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try (final ObjectOutputStream oos = new ObjectOutputStream(bos)) { + oos.writeObject(counter); + } + final ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray()); + final RowIndexCounter deserialized; + try (final ObjectInputStream ois = new ObjectInputStream(bis)) { + deserialized = (RowIndexCounter) ois.readObject(); + } + + // Deserialized counter should start fresh at zero. 
+ assertEquals(0, deserialized.get()); + deserialized.increment(); + assertEquals(1, deserialized.get()); + } +} From c825197dc63dcb35c9db74255733dc03eb380bb3 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Mon, 11 May 2026 20:00:18 +1000 Subject: [PATCH 07/41] chore: Update core version to 9.7.0-SNAPSHOT --- benchmark/pom.xml | 2 +- encoders/pom.xml | 2 +- fhirpath/pom.xml | 2 +- lib/R/pom.xml | 2 +- lib/python/pom.xml | 2 +- library-api/pom.xml | 2 +- library-runtime/pom.xml | 2 +- pom.xml | 2 +- site/pom.xml | 2 +- terminology/pom.xml | 2 +- utilities/pom.xml | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/benchmark/pom.xml b/benchmark/pom.xml index 80a4d8e21e..303af2b009 100644 --- a/benchmark/pom.xml +++ b/benchmark/pom.xml @@ -23,7 +23,7 @@ au.csiro.pathling pathling - 9.6.0 + 9.7.0-SNAPSHOT benchmark jar diff --git a/encoders/pom.xml b/encoders/pom.xml index c964319feb..6335501c5c 100644 --- a/encoders/pom.xml +++ b/encoders/pom.xml @@ -32,7 +32,7 @@ au.csiro.pathling pathling - 9.6.0 + 9.7.0-SNAPSHOT encoders jar diff --git a/fhirpath/pom.xml b/fhirpath/pom.xml index 72a21f3610..1d74a8a73d 100644 --- a/fhirpath/pom.xml +++ b/fhirpath/pom.xml @@ -26,7 +26,7 @@ au.csiro.pathling pathling - 9.6.0 + 9.7.0-SNAPSHOT fhirpath jar diff --git a/lib/R/pom.xml b/lib/R/pom.xml index 7ec2c258b3..ff336c0189 100644 --- a/lib/R/pom.xml +++ b/lib/R/pom.xml @@ -26,7 +26,7 @@ au.csiro.pathling pathling - 9.6.0 + 9.7.0-SNAPSHOT ../../pom.xml r diff --git a/lib/python/pom.xml b/lib/python/pom.xml index ebfa4b7bcf..02e5474cfe 100644 --- a/lib/python/pom.xml +++ b/lib/python/pom.xml @@ -26,7 +26,7 @@ au.csiro.pathling pathling - 9.6.0 + 9.7.0-SNAPSHOT ../../pom.xml python diff --git a/library-api/pom.xml b/library-api/pom.xml index 3d9c92b13b..6183af12f5 100644 --- a/library-api/pom.xml +++ b/library-api/pom.xml @@ -26,7 +26,7 @@ pathling au.csiro.pathling - 9.6.0 + 9.7.0-SNAPSHOT library-api jar diff --git a/library-runtime/pom.xml 
b/library-runtime/pom.xml index d438f9ecf6..e09384dfaf 100644 --- a/library-runtime/pom.xml +++ b/library-runtime/pom.xml @@ -26,7 +26,7 @@ pathling au.csiro.pathling - 9.6.0 + 9.7.0-SNAPSHOT library-runtime jar diff --git a/pom.xml b/pom.xml index 8c0e63e00e..b0ec19d6e5 100644 --- a/pom.xml +++ b/pom.xml @@ -24,7 +24,7 @@ 4.0.0 au.csiro.pathling pathling - 9.6.0 + 9.7.0-SNAPSHOT pom Pathling diff --git a/site/pom.xml b/site/pom.xml index c90e0b254a..6588beafe8 100644 --- a/site/pom.xml +++ b/site/pom.xml @@ -26,7 +26,7 @@ au.csiro.pathling pathling - 9.6.0 + 9.7.0-SNAPSHOT ../pom.xml site diff --git a/terminology/pom.xml b/terminology/pom.xml index b7396389d6..24dd3b624d 100644 --- a/terminology/pom.xml +++ b/terminology/pom.xml @@ -26,7 +26,7 @@ pathling au.csiro.pathling - 9.6.0 + 9.7.0-SNAPSHOT terminology jar diff --git a/utilities/pom.xml b/utilities/pom.xml index b2207fdb39..611f0c05dc 100644 --- a/utilities/pom.xml +++ b/utilities/pom.xml @@ -26,7 +26,7 @@ pathling au.csiro.pathling - 9.6.0 + 9.7.0-SNAPSHOT utilities jar From 370f90c6fc10bbe54e16c837ae82f221c346ce58 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Mon, 18 May 2026 13:24:28 +1000 Subject: [PATCH 08/41] fix: Prevent trace duplication across FHIRPath evaluation paths Introduces let() binding in SqlFunctions to materialise non-deterministic Spark column expressions exactly once per row, and applies it throughout the fhirpath and sql packages to prevent TraceExpression side effects from firing multiple times where the same operand appears in both branches of a when() expression. Adds a RepeatedSqlEvaluation checkstyle rule (RegexpMultiline) to catch accidental duplicate SQL evaluation at compile time, scoped to the fhirpath and sql package trees. Includes regression tests for all fixed evaluation paths. 
Co-Authored-By: Claude Sonnet 4.6 --- config/checkstyle/checkstyle.xml | 60 ++ config/checkstyle/suppressions.xml | 9 + .../fhirpath/column/ColumnRepresentation.java | 86 +-- .../fhirpath/column/QuantityValue.java | 58 +- .../fhirpath/comparison/CodingEquality.java | 19 +- .../fhirpath/encoding/QuantityEncoding.java | 42 +- .../function/provider/ConversionLogic.java | 24 +- .../fhirpath/operator/BooleanOperator.java | 35 +- .../operator/CollectionOperations.java | 1 + .../fhirpath/operator/EqualityOperator.java | 36 +- .../au/csiro/pathling/sql/SqlFunctions.java | 82 ++- .../column/ColumnRepresentationTraceTest.java | 319 +++++++++++ .../column/QuantityValueTraceTest.java | 134 +++++ .../function/provider/TraceFunctionTest.java | 266 +++++++++ .../pathling/sql/SqlFunctionsLetTest.java | 168 ++++++ .../pathling/library/PathlingContext.java | 25 + .../pathling/library/PathlingContextTest.java | 14 + .../.openspec.yaml | 2 + .../design.md | 194 +++++++ .../proposal.md | 66 +++ .../specs/fhirpath-trace/spec.md | 80 +++ .../tasks.md | 33 ++ .../verification.md | 61 +++ .../.openspec.yaml | 2 + .../design.md | 515 ++++++++++++++++++ .../proposal.md | 129 +++++ .../specs/fhirpath-trace/spec.md | 137 +++++ .../2026-05-08-fix-trace-duplication/tasks.md | 70 +++ openspec/specs/fhirpath-trace/spec.md | 134 +++++ 29 files changed, 2674 insertions(+), 127 deletions(-) create mode 100644 fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java create mode 100644 fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/QuantityValueTraceTest.java create mode 100644 fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java create mode 100644 openspec/changes/archive/2026-04-24-reproduce-trace-duplication/.openspec.yaml create mode 100644 openspec/changes/archive/2026-04-24-reproduce-trace-duplication/design.md create mode 100644 openspec/changes/archive/2026-04-24-reproduce-trace-duplication/proposal.md create mode 100644 
openspec/changes/archive/2026-04-24-reproduce-trace-duplication/specs/fhirpath-trace/spec.md create mode 100644 openspec/changes/archive/2026-04-24-reproduce-trace-duplication/tasks.md create mode 100644 openspec/changes/archive/2026-04-24-reproduce-trace-duplication/verification.md create mode 100644 openspec/changes/archive/2026-05-08-fix-trace-duplication/.openspec.yaml create mode 100644 openspec/changes/archive/2026-05-08-fix-trace-duplication/design.md create mode 100644 openspec/changes/archive/2026-05-08-fix-trace-duplication/proposal.md create mode 100644 openspec/changes/archive/2026-05-08-fix-trace-duplication/specs/fhirpath-trace/spec.md create mode 100644 openspec/changes/archive/2026-05-08-fix-trace-duplication/tasks.md diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml index f10b2c3954..9345ab2459 100644 --- a/config/checkstyle/checkstyle.xml +++ b/config/checkstyle/checkstyle.xml @@ -44,6 +44,66 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/config/checkstyle/suppressions.xml b/config/checkstyle/suppressions.xml index 30bdd287a5..bf435da824 100644 --- a/config/checkstyle/suppressions.xml +++ b/config/checkstyle/suppressions.xml @@ -25,4 +25,13 @@ + + + diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java index 266981adcc..fe11ec42fb 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java @@ -17,16 +17,18 @@ package au.csiro.pathling.fhirpath.column; +import static au.csiro.pathling.sql.SqlFunctions.let; import static au.csiro.pathling.utilities.Functions.maybeCast; import static java.util.Objects.nonNull; import static org.apache.spark.sql.functions.array; import static org.apache.spark.sql.functions.callUDF; import static 
org.apache.spark.sql.functions.coalesce; -import static org.apache.spark.sql.functions.element_at; import static org.apache.spark.sql.functions.exists; import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.nullif; import static org.apache.spark.sql.functions.raise_error; import static org.apache.spark.sql.functions.size; +import static org.apache.spark.sql.functions.try_element_at; import static org.apache.spark.sql.functions.when; import au.csiro.pathling.fhirpath.definition.ElementDefinition; @@ -208,7 +210,7 @@ public Optional asStringValue() { */ @Nonnull public ColumnRepresentation toArray() { - return vectorize(UnaryOperator.identity(), c -> when(c.isNotNull(), array(c))); + return vectorize(UnaryOperator.identity(), c -> let(c, x -> when(x.isNotNull(), array(x)))); } /** @@ -241,12 +243,14 @@ public ColumnRepresentation singular() { */ @Nonnull public ColumnRepresentation singular(@Nullable final String errorMessage) { + final String resolvedError = nonNull(errorMessage) ? errorMessage : DEF_NOT_SINGULAR_ERROR; + // Both size(x) and getAt(x, 0) reference the operand; let() ensures a nondeterministic + // operand (e.g. a traced column) fires exactly once per row rather than twice. return vectorize( c -> - when(c.isNull().or(size(c).leq(1)), getAt(c, 0)) - .otherwise( - raise_error( - lit(nonNull(errorMessage) ? errorMessage : DEF_NOT_SINGULAR_ERROR))), + let( + c, + x -> when(size(x).gt(1), raise_error(lit(resolvedError))).otherwise(getAt(x, 0))), UnaryOperator.identity()); } @@ -280,9 +284,10 @@ public Column ensureSingular() { */ @Nonnull public ColumnRepresentation plural() { - return vectorize( - a -> when(a.isNotNull(), a).otherwise(array()), - c -> when(c.isNotNull(), array(c)).otherwise(array())); + // Array branch: coalesce maps null to an empty array. Scalar branch: filter on a one-element + // array drops the element when null, yielding either a singleton or an empty array. 
+ // Each operand is referenced once, so nondeterministic operands fire exactly once per row. + return vectorize(a -> coalesce(a, array()), c -> functions.filter(array(c), Column::isNotNull)); } /** @@ -306,7 +311,7 @@ public ColumnRepresentation applyTo(@Nonnull final Column mapColumn) { public ColumnRepresentation filter(@Nonnull final UnaryOperator lambda) { return vectorize( c -> functions.filter(c, lambda::apply), - c -> when(c.isNotNull(), when(lambda.apply(c), c))); + c -> let(c, x -> when(x.isNotNull().and(lambda.apply(x)), x))); } /** @@ -348,8 +353,9 @@ public ColumnRepresentation removeNulls() { */ @Nonnull public ColumnRepresentation normaliseNull() { - return vectorize( - c -> when(c.isNull().or(size(c).equalTo(0)), null).otherwise(c), UnaryOperator.identity()); + // nullif(c, array()) returns null when c equals the empty array, and propagates null when c + // itself is null. Single-reference rewrite of the original null-or-empty conditional. + return vectorize(c -> nullif(c, array()), UnaryOperator.identity()); } /** @@ -372,27 +378,31 @@ public ColumnRepresentation asCanonical() { @Nonnull public ColumnRepresentation transform(final UnaryOperator lambda) { return vectorize( - c -> functions.transform(c, lambda::apply), c -> when(c.isNotNull(), lambda.apply(c))); + c -> functions.transform(c, lambda::apply), + c -> let(c, x -> when(x.isNotNull(), lambda.apply(x)))); } /** * Aggregates the current {@link ColumnRepresentation} using a zero value and an aggregator * function. * - * @param zeroValue The zero value to use for aggregation + *

{@code zeroValue} MUST be the identity element of {@code aggregator} — i.e. {@code + * aggregator(zeroValue, x) == x} for all x. This identity property is used to simplify the scalar + * branch to {@code coalesce(c, zeroValue)}. + * + * @param zeroValue The identity element for {@code aggregator} * @param aggregator The aggregator function to use for aggregation * @return A new {@link ColumnRepresentation} that is aggregated */ @Nonnull public ColumnRepresentation aggregate( @Nonnull final Object zeroValue, final BinaryOperator aggregator) { - + // functions.aggregate(null_array, ...) returns null; coalesce maps that to the zero value, + // matching the original null-array contract with a single reference to the operand. + // The scalar branch reduces to coalesce(c, zero) since aggregator(zero, x) == x. return vectorize( - c -> - when(c.isNull(), zeroValue) - .otherwise(functions.aggregate(c, lit(zeroValue), aggregator::apply)), - c -> when(c.isNull(), zeroValue).otherwise(c)); - // This is OK because: aggregator(zero, x) == x + c -> coalesce(functions.aggregate(c, lit(zeroValue), aggregator::apply), lit(zeroValue)), + c -> coalesce(c, lit(zeroValue))); } /** @@ -412,11 +422,12 @@ public ColumnRepresentation first() { * @return A new {@link ColumnRepresentation} that is the last value */ public ColumnRepresentation last() { - // we need to use `element_at()` here are `getItem()` does not support column arguments - // NOTE: `element_at()` is 1-indexed as opposed to `getItem()` which is 0-indexed - return vectorize( - c -> when(c.isNull().or(size(c).equalTo(0)), null).otherwise(element_at(c, size(c))), - UnaryOperator.identity()); + // try_element_at is the ANSI-safe variant of element_at: it returns null instead of raising + // INVALID_ARRAY_INDEX for out-of-range indices, including any access against a null or empty + // array. Pathling runs Spark 4 with ANSI mode enabled (default), so the plain element_at + // would throw on those inputs. 
Negative indices count from the end, so -1 yields the last + // element of a non-empty array. + return vectorize(c -> try_element_at(c, lit(-1)), UnaryOperator.identity()); } /** @@ -426,8 +437,10 @@ public ColumnRepresentation last() { */ @Nonnull public ColumnRepresentation count() { - return vectorize( - c -> when(c.isNull(), 0).otherwise(size(c)), c -> when(c.isNull(), 0).otherwise(1)); + // The operand appears once, so a nondeterministic operand fires exactly once per row. With + // spark.sql.legacy.sizeOfNull = false (the default since Spark 3.0), size(null) returns null, + // and coalesce maps null to zero. + return vectorize(c -> coalesce(size(c), lit(0)), c -> when(c.isNull(), 0).otherwise(1)); } /** @@ -437,7 +450,9 @@ public ColumnRepresentation count() { */ @Nonnull public ColumnRepresentation isEmpty() { - return vectorize(c -> when(c.isNotNull(), size(c).equalTo(0)).otherwise(true), Column::isNull); + // size(null) returns null when spark.sql.legacy.sizeOfNull = false (Spark 3.0+ default); + // coalesce maps that null to true so a null array reads as empty. 
+ return vectorize(c -> coalesce(size(c).equalTo(0), lit(true)), Column::isNull); } /** @@ -628,13 +643,16 @@ public ColumnRepresentation contains( @Nonnull final BinaryOperator comparator) { return vectorize( a -> - when( - element.getValue().isNotNull(), - coalesce(exists(a, e -> comparator.apply(e, element.getValue())), lit(false))), + let( + element.getValue(), + ev -> + when( + ev.isNotNull(), + coalesce(exists(a, e -> comparator.apply(e, ev)), lit(false)))), c -> - when( - element.getValue().isNotNull(), - coalesce(comparator.apply(c, element.getValue()), lit(false)))); + let( + element.getValue(), + ev -> when(ev.isNotNull(), coalesce(comparator.apply(c, ev), lit(false))))); } /** diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/QuantityValue.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/QuantityValue.java index 99b28820fd..361aafed1e 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/QuantityValue.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/QuantityValue.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.column; +import static au.csiro.pathling.sql.SqlFunctions.let; import static java.util.Objects.nonNull; import static java.util.Objects.requireNonNull; import static org.apache.spark.sql.functions.callUDF; @@ -258,21 +259,27 @@ public Column isCalendarDuration() { */ @Nonnull public Column toUnit(@Nonnull final Column targetUnit) { - final ValueWithUnit literal = ValueWithUnit.literalValueOf(quantityColumn); + return let( + quantityColumn, + qc -> { + final QuantityValue bound = new QuantityValue(qc); + final ValueWithUnit literal = ValueWithUnit.literalValueOf(qc); - // Try UCUM conversion (will return null for non-UCUM/non-calendar quantities) - final Column ucumConverted = - callUDF(ConvertQuantityToUnit.FUNCTION_NAME, quantityColumn, targetUnit); + // Try UCUM conversion (will return null for non-UCUM/non-calendar quantities). 
+ final Column ucumConverted = callUDF(ConvertQuantityToUnit.FUNCTION_NAME, qc, targetUnit); - // Short-circuit: exact match only if unit matches AND system is UCUM or calendar duration - // For non-UCUM/non-calendar systems (e.g., Money), fall through to UCUM conversion (returns - // null) - final Column hasValidSystem = isUcum().or(isCalendarDuration()); - final Column exactMatchWithValidSystem = literal.unit().equalTo(targetUnit).and(hasValidSystem); + // Short-circuit: exact match only if unit matches AND system is UCUM or calendar + // duration. For non-UCUM/non-calendar systems (e.g., Money), fall through to UCUM + // conversion (returns null). + final Column hasValidSystem = bound.isUcum().or(bound.isCalendarDuration()); + final Column exactMatchWithValidSystem = + literal.unit().equalTo(targetUnit).and(hasValidSystem); - // Return exact match if available (fast path), otherwise UCUM conversion result (or null) - return when(exactMatchWithValidSystem, quantityColumn) - .otherwise(coalesce(ucumConverted, lit(null).cast(QuantityEncoding.dataType()))); + // Return exact match if available (fast path), otherwise UCUM conversion result (or + // null). 
+ return when(exactMatchWithValidSystem, qc) + .otherwise(coalesce(ucumConverted, lit(null).cast(QuantityEncoding.dataType()))); + }); } /** @@ -295,19 +302,24 @@ public Column toUnit(@Nonnull final Column targetUnit) { */ @Nonnull public Column convertibleToUnit(@Nonnull final Column targetUnit) { - final ValueWithUnit literal = ValueWithUnit.literalValueOf(quantityColumn); + return let( + quantityColumn, + qc -> { + final QuantityValue bound = new QuantityValue(qc); + final ValueWithUnit literal = ValueWithUnit.literalValueOf(qc); - // Check exact string match with valid system (UCUM or calendar duration) - final Column hasValidSystem = isUcum().or(isCalendarDuration()); - final Column exactMatchWithValidSystem = literal.unit().equalTo(targetUnit).and(hasValidSystem); + // Check exact string match with valid system (UCUM or calendar duration). + final Column hasValidSystem = bound.isUcum().or(bound.isCalendarDuration()); + final Column exactMatchWithValidSystem = + literal.unit().equalTo(targetUnit).and(hasValidSystem); - // Check UCUM convertibility by attempting conversion and checking if result is non-null - final Column ucumConverted = - callUDF(ConvertQuantityToUnit.FUNCTION_NAME, quantityColumn, targetUnit); - final Column ucumConvertible = ucumConverted.isNotNull(); + // Check UCUM convertibility by attempting conversion and checking if result is non-null. + final Column ucumConverted = callUDF(ConvertQuantityToUnit.FUNCTION_NAME, qc, targetUnit); + final Column ucumConvertible = ucumConverted.isNotNull(); - // Return true if either exact match (with valid system) or UCUM conversion is possible - // Return null if quantity is null (for empty propagation) - return when(quantityColumn.isNotNull(), exactMatchWithValidSystem.or(ucumConvertible)); + // Return true if either exact match (with valid system) or UCUM conversion is possible. + // Return null if quantity is null (for empty propagation). 
+ return when(qc.isNotNull(), exactMatchWithValidSystem.or(ucumConvertible)); + }); } } diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/CodingEquality.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/CodingEquality.java index de182d6d63..b00f99f297 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/CodingEquality.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/CodingEquality.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.comparison; +import static au.csiro.pathling.sql.SqlFunctions.let; import static org.apache.spark.sql.functions.lit; import static org.apache.spark.sql.functions.when; @@ -50,11 +51,17 @@ public static CodingEquality getInstance() { @Nonnull @Override public Column equalsTo(@Nonnull final Column left, @Nonnull final Column right) { - return when(left.isNull().or(right.isNull()), lit(null)) - .otherwise( - EQUALITY_COLUMNS.stream() - .map(f -> left.getField(f).eqNullSafe(right.getField(f))) - .reduce(Column::and) - .orElseThrow(() -> new AssertionError("No fields to compare"))); + return let( + left, + l -> + let( + right, + r -> + when(l.isNull().or(r.isNull()), lit(null)) + .otherwise( + EQUALITY_COLUMNS.stream() + .map(f -> l.getField(f).eqNullSafe(r.getField(f))) + .reduce(Column::and) + .orElseThrow(() -> new AssertionError("No fields to compare"))))); } } diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/encoding/QuantityEncoding.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/encoding/QuantityEncoding.java index fb8bde07a4..2ab3eea5b5 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/encoding/QuantityEncoding.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/encoding/QuantityEncoding.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.encoding; +import static au.csiro.pathling.sql.SqlFunctions.let; import static java.util.Objects.nonNull; import static 
java.util.stream.Collectors.toUnmodifiableMap; import static org.apache.spark.sql.functions.lit; @@ -281,27 +282,28 @@ public static Column encodeLiteral(@Nonnull final FhirPathQuantity quantity) { */ @Nonnull public static Column encodeNumeric(@Nonnull final Column numericColumn) { - // Cast value to decimal type - final Column decimalValue = numericColumn.cast(DecimalCustomCoder.decimalType()); - // Return fully null struct when value is null to maintain FHIRPath empty collection semantics - return when( - decimalValue.isNotNull(), - toStruct( - lit(null), - decimalValue, - // We cannot encode the scale of the results of arithmetic operations. - lit(null), - lit(null), - lit(UcumUnit.ONE.code()), - lit(UcumUnit.UCUM_SYSTEM_URI), - lit(UcumUnit.ONE.code()), - // we do not need to normalize this as the unit is always "1" - // so it will be comparable with other quantities with unit "1" - lit(null), - lit(null), - lit(null))) - .otherwise(lit(null).cast(dataType())); + return let( + numericColumn, + nc -> + when( + nc.isNotNull(), + toStruct( + lit(null), + // Cast value to decimal type + nc.cast(DecimalCustomCoder.decimalType()), + // We cannot encode the scale of the results of arithmetic operations. 
+ lit(null), + lit(null), + lit(UcumUnit.ONE.code()), + lit(UcumUnit.UCUM_SYSTEM_URI), + lit(UcumUnit.ONE.code()), + // we do not need to normalize this as the unit is always "1" + // so it will be comparable with other quantities with unit "1" + lit(null), + lit(null), + lit(null))) + .otherwise(lit(null).cast(dataType()))); } /** diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ConversionLogic.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ConversionLogic.java index 51ba0f7352..a90b57279c 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ConversionLogic.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ConversionLogic.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.function.provider; +import static au.csiro.pathling.sql.SqlFunctions.let; import static org.apache.spark.sql.functions.callUDF; import static org.apache.spark.sql.functions.coalesce; import static org.apache.spark.sql.functions.lit; @@ -232,15 +233,20 @@ Column convertToBoolean(@Nonnull final FhirPathType sourceType, @Nonnull final C // String: Handle '1.0' and '0.0' specially, use SparkSQL cast for other values. // SparkSQL cast handles 'true', 'false', 't', 'f', 'yes', 'no', 'y', 'n', '1', '0' // (case-insensitive). - when(value.equalTo(lit("1.0")), lit(true)) - .when(value.equalTo(lit("0.0")), lit(false)) - .otherwise(value.try_cast(DataTypes.BooleanType)); + let( + value, + v -> + when(v.equalTo(lit("1.0")), lit(true)) + .when(v.equalTo(lit("0.0")), lit(false)) + .otherwise(v.try_cast(DataTypes.BooleanType))); case INTEGER -> // Integer: Only 0 or 1 can be converted (1 → true, 0 → false, otherwise null). 
- when(value.equalTo(lit(1)), lit(true)).when(value.equalTo(lit(0)), lit(false)); + let(value, v -> when(v.equalTo(lit(1)), lit(true)).when(v.equalTo(lit(0)), lit(false))); case DECIMAL -> // Decimal: Only 0.0 or 1.0 can be converted (1.0 → true, 0.0 → false, otherwise null). - when(value.equalTo(lit(1.0)), lit(true)).when(value.equalTo(lit(0.0)), lit(false)); + let( + value, + v -> when(v.equalTo(lit(1.0)), lit(true)).when(v.equalTo(lit(0.0)), lit(false))); default -> lit(null); }; } @@ -266,7 +272,7 @@ Column convertToInteger(@Nonnull final FhirPathType sourceType, @Nonnull final C case STRING -> // String: Only convert if it matches integer format (no decimal point). // Per FHIRPath spec, valid integer strings match: (\+|-)?\d+ - when(value.rlike(INTEGER_REGEX), value.try_cast(DataTypes.IntegerType)); + let(value, v -> when(v.rlike(INTEGER_REGEX), v.try_cast(DataTypes.IntegerType))); default -> lit(null); }; } @@ -339,7 +345,7 @@ Column convertToDate(@Nonnull final FhirPathType sourceType, @Nonnull final Colu if (sourceType == FhirPathType.STRING) { // Date values are stored as strings in FHIR. Validate format before accepting. // Date format: YYYY or YYYY-MM or YYYY-MM-DD - return when(value.rlike(DATE_REGEX), value); + return let(value, v -> when(v.rlike(DATE_REGEX), v)); } return lit(null); } @@ -360,7 +366,7 @@ Column convertToDateTime(@Nonnull final FhirPathType sourceType, @Nonnull final if (sourceType == FhirPathType.STRING) { // DateTime values are stored as strings in FHIR. Validate using simplified pattern. // Supports partial precision: YYYY, YYYY-MM, YYYY-MM-DD, YYYY-MM-DDThh, etc. - return when(value.rlike(DATETIME_REGEX), value); + return let(value, v -> when(v.rlike(DATETIME_REGEX), v)); } return lit(null); } @@ -381,7 +387,7 @@ Column convertToTime(@Nonnull final FhirPathType sourceType, @Nonnull final Colu if (sourceType == FhirPathType.STRING) { // Time values are stored as strings in FHIR. Validate using simplified pattern. 
// Supports partial precision: hh, hh:mm, hh:mm:ss, hh:mm:ss.fff - return when(value.rlike(TIME_REGEX), value); + return let(value, v -> when(v.rlike(TIME_REGEX), v)); } return lit(null); } diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/BooleanOperator.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/BooleanOperator.java index 4d6f38adb5..019e2c2029 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/BooleanOperator.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/BooleanOperator.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.operator; +import static au.csiro.pathling.sql.SqlFunctions.let; import static org.apache.spark.sql.functions.when; import au.csiro.pathling.fhirpath.collection.BooleanCollection; @@ -61,18 +62,30 @@ public Collection invoke(@Nonnull final BinaryOperatorInput input) { case AND -> leftValue.and(rightValue); case OR -> leftValue.or(rightValue); case XOR -> - when(leftValue.isNull().or(rightValue.isNull()), null) - .when( - leftValue - .equalTo(true) - .and(rightValue.equalTo(false)) - .or(leftValue.equalTo(false).and(rightValue.equalTo(true))), - true) - .otherwise(false); + let( + leftValue, + lv -> + let( + rightValue, + rv -> + when(lv.isNull().or(rv.isNull()), null) + .when( + lv.equalTo(true) + .and(rv.equalTo(false)) + .or(lv.equalTo(false).and(rv.equalTo(true))), + true) + .otherwise(false))); case IMPLIES -> - when(leftValue.equalTo(true), rightValue) - .when(leftValue.equalTo(false), true) - .otherwise(when(rightValue.equalTo(true), true).otherwise(null)); + let( + leftValue, + lv -> + let( + rightValue, + rv -> + when(lv.equalTo(true), rv) + .when(lv.equalTo(false), true) + .otherwise( + when(rv.equalTo(true), true).otherwise(null)))); }); return BooleanCollection.build(resultCtx); } diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/CollectionOperations.java 
b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/CollectionOperations.java index 390eb55436..b4281f6dd3 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/CollectionOperations.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/CollectionOperations.java @@ -101,6 +101,7 @@ private static Collection executeContains( // non-comparable so false or null // but also should enforce singularity of the element + // the contains(singular, ...) call below is in the mutually exclusive comparable branch. final Column columnResult = functions.when( singular.count().getValue().geq(1), diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/EqualityOperator.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/EqualityOperator.java index ae7a231171..760b96fbf7 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/EqualityOperator.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/operator/EqualityOperator.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.operator; +import static au.csiro.pathling.sql.SqlFunctions.let; import static org.apache.spark.sql.functions.lit; import static org.apache.spark.sql.functions.when; @@ -74,17 +75,30 @@ protected Collection handleEquivalentTypes( final ColumnRepresentation right = rightCollection.getColumn(); final Column equalityResult = - when(left.isEmpty().getValue().or(right.isEmpty().getValue()), lit(null)) - .when( - left.count() - .getValue() - .equalTo(lit(1)) - .and(right.count().getValue().equalTo(lit(1))), - // this works because we know both sides are singular (count == 1) - elementComparator.apply(left.singular().getValue(), right.singular().getValue())) - .otherwise( - // this works because we know that both sides is plural (count > 1) - arrayComparator.apply(left.plural().getValue(), right.plural().getValue())); + let( + left.getValue(), + lv -> + let( + right.getValue(), + rv -> { + final ColumnRepresentation leftR = 
left.copyOf(lv); + final ColumnRepresentation rightR = right.copyOf(rv); + return when( + leftR.isEmpty().getValue().or(rightR.isEmpty().getValue()), lit(null)) + .when( + leftR + .count() + .getValue() + .equalTo(lit(1)) + .and(rightR.count().getValue().equalTo(lit(1))), + // this works because we know both sides are singular (count == 1) + elementComparator.apply( + leftR.singular().getValue(), rightR.singular().getValue())) + .otherwise( + // this works because we know both sides are plural (count > 1) + arrayComparator.apply( + leftR.plural().getValue(), rightR.plural().getValue())); + })); return BooleanCollection.build(new DefaultRepresentation(equalityResult)); } diff --git a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java index 8e8379ed80..b1594bb6db 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java +++ b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java @@ -20,25 +20,29 @@ import static org.apache.spark.sql.functions.aggregate; import static org.apache.spark.sql.functions.array; import static org.apache.spark.sql.functions.concat; +import static org.apache.spark.sql.functions.element_at; import static org.apache.spark.sql.functions.exists; import static org.apache.spark.sql.functions.filter; import static org.apache.spark.sql.functions.ifnull; import static org.apache.spark.sql.functions.lit; import static org.apache.spark.sql.functions.not; +import static org.apache.spark.sql.functions.transform; import static org.apache.spark.sql.functions.when; import jakarta.annotation.Nonnull; import java.util.function.BinaryOperator; +import java.util.function.UnaryOperator; import lombok.experimental.UtilityClass; import org.apache.spark.sql.Column; +import org.apache.spark.sql.classic.ColumnConversions$; import org.apache.spark.sql.functions; /** * Pathling-specific SQL functions that extend Spark SQL functionality. * - *

<p>This interface provides utility functions for working with Spark SQL columns in the context of - * FHIR data processing. These functions handle common operations like pruning annotations, safely - * concatenating maps, and collecting maps during aggregation. + *

Provides utilities for working with Spark SQL columns in the context of FHIR data processing, + * including FHIR-instant formatting, array deduplication with custom equality semantics, and + * let-binding for safe evaluation of non-deterministic column expressions. */ @UtilityClass public class SqlFunctions { @@ -69,17 +73,21 @@ public static Column toFhirInstant(@Nonnull final Column col) { @Nonnull public static Column arrayDistinctWithEquality( @Nonnull final Column arrayColumn, @Nonnull final BinaryOperator equalityComparator) { - - final Column emptyTypedArray = filter(arrayColumn, x -> lit(false)); - - return aggregate( + return let( arrayColumn, - emptyTypedArray, - (acc, elem) -> - when( - not(exists(acc, x -> ifnull(equalityComparator.apply(x, elem), lit(false)))), - concat(acc, array(elem))) - .otherwise(acc)); + ac -> { + final Column emptyTypedArray = filter(ac, x -> lit(false)); + return aggregate( + ac, + emptyTypedArray, + (acc, elem) -> + when( + not( + exists( + acc, x -> ifnull(equalityComparator.apply(x, elem), lit(false)))), + concat(acc, array(elem))) + .otherwise(acc)); + }); } /** @@ -100,4 +108,52 @@ public static Column arrayUnionWithEquality( final Column combined = concat(leftArray, rightArray); return arrayDistinctWithEquality(combined, equalityComparator); } + + /** + * Evaluates {@code value} exactly once per row and passes the result to {@code body}. + * + *

<p>This matters for {@link org.apache.spark.sql.catalyst.expressions.Nondeterministic} operands + * such as {@link TraceExpression}: without materialisation, each reference to the same + * non-deterministic expression in a Spark tree evaluates independently, firing side effects + * multiple times. + *

<p>For deterministic {@code value}, returns {@code body.apply(value)} directly, incurring no + * HOF overhead. For non-deterministic {@code value}, uses {@code + * element_at(transform(array(value), body), 1)} to materialise the operand once via {@code array} + * before the lambda runs. + *

<p>The result is {@code Nondeterministic} if and only if {@code value} or the expression + * returned by {@code body} is. + *

<p>The resulting expression has no logical-plan dependency and composes inside any relational + * context (select, filter, join, window). Unlike Spark Catalyst's {@code With} expression, it + * does not rewrite into a {@code Project} operator. + *

Constraint. When {@code value} is non-deterministic, it MUST NOT contain a + * SQL aggregate or window expression; Spark's analyzer rejects these inside higher-order function + * arguments. + * + * @param value the operand to evaluate once per row + * @param body the lambda that consumes the evaluated operand + * @return a column expression applying {@code body} to a single evaluation of {@code value} + */ + @Nonnull + public static Column let(@Nonnull final Column value, @Nonnull final UnaryOperator body) { + // Deterministic expressions need no materialisation: identical references in the tree always + // produce the same value, so single-fire is trivially satisfied. The HOF wrapper is reserved + // for non-deterministic operands (e.g. TraceExpression) that fire side effects on every + // tree reference. + // + // ColumnConversions$.MODULE$.expression() is used instead of ExpressionUtils.expression() + // because ExpressionUtils can return a surrogate expression (e.g. when the Column wraps a + // compound expression like concat or coalesce whose children include a Nondeterministic node) + // that reports deterministic() = true even though the full expression tree contains a + // non-deterministic sub-expression. ColumnConversions$.MODULE$.expression() always returns the + // real underlying Catalyst Expression, preserving the correct determinism semantics through + // the entire tree. 
+ if (ColumnConversions$.MODULE$.expression(value).deterministic()) { + return body.apply(value); + } + return element_at(transform(array(value), body::apply), 1); + } } diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java new file mode 100644 index 0000000000..f43d13673d --- /dev/null +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java @@ -0,0 +1,319 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package au.csiro.pathling.fhirpath.column; + +import static org.apache.spark.sql.classic.ExpressionUtils.column; +import static org.apache.spark.sql.classic.ExpressionUtils.expression; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import au.csiro.pathling.sql.TraceExpression; +import au.csiro.pathling.test.SpringBootUnitTest; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import jakarta.annotation.Nonnull; +import java.util.function.Function; +import java.util.stream.IntStream; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Layer B regression guard for issue #2594. For every public {@link ColumnRepresentation} method + * that operates on its operand, asserts that wrapping the operand in a {@link TraceExpression} + * produces exactly one trace fire per logical invocation (per row). Catches any future helper that + * re-introduces a multi-reference {@code when(...).otherwise(...)} pattern over the operand. 
+ */ +@SpringBootUnitTest +class ColumnRepresentationTraceTest { + + @Autowired SparkSession spark; + + private Logger traceLogger; + private Level originalLevel; + private ListAppender appender; + + @BeforeEach + void setUp() { + traceLogger = (Logger) LoggerFactory.getLogger(TraceExpression.class); + originalLevel = traceLogger.getLevel(); + traceLogger.setLevel(Level.TRACE); + appender = new ListAppender<>(); + appender.start(); + traceLogger.addAppender(appender); + } + + @AfterEach + void tearDown() { + traceLogger.detachAppender(appender); + traceLogger.setLevel(originalLevel); + appender.stop(); + } + + // --------------------------------------------------------------------------- + // Methods rewritten in this change — should fire exactly once per logical row. + // --------------------------------------------------------------------------- + + @Test + void count_array_singleFire() { + runArray("count-array", ColumnRepresentation::count, 1, 3); + } + + @Test + void isEmpty_array_singleFire() { + runArray("isEmpty-array", ColumnRepresentation::isEmpty, 1, 3); + } + + @Test + void last_array_singleFire() { + runArray("last", ColumnRepresentation::last, 1, 3); + } + + @Test + void normaliseNull_array_singleFire() { + runArray("normaliseNull", ColumnRepresentation::normaliseNull, 1, 3); + } + + @Test + void aggregate_array_singleFire() { + runArray("aggregate-array", c -> c.aggregate(0, Column::plus), 1, 3); + } + + @Test + void aggregate_scalar_singleFire() { + runScalar("aggregate-scalar", c -> c.aggregate(0, Column::plus), 1, 3); + } + + @Test + void plural_array_singleFire() { + runArray("plural-array", ColumnRepresentation::plural, 1, 3); + } + + @Test + void plural_scalar_singleFire() { + runScalar("plural-scalar", ColumnRepresentation::plural, 1, 3); + } + + @Test + void singular_array_singleFire() { + // Each row is a singleton array so size never exceeds 1 and raise_error is not triggered. 
+ runArrayOfSingleton("singular", ColumnRepresentation::singular, 1, 3); + } + + @Test + void filter_array_singleFire() { + runArray("filter-array", c -> c.filter(x -> x.gt(0)), 1, 3); + } + + @Test + void filter_scalar_singleFire() { + runScalar("filter-scalar", c -> c.filter(x -> x.gt(0)), 1, 3); + } + + @Test + void toArray_scalar_singleFire() { + runScalar("toArray-scalar", ColumnRepresentation::toArray, 1, 3); + } + + @Test + void transform_scalar_singleFire() { + runScalar("transform-scalar", c -> c.transform(Column::unary_$minus), 1, 3); + } + + @Test + void contains_array_element_singleFire() { + runContains("contains-array-element", arrayDataset(1), 1); + runContains("contains-array-element", arrayDataset(3), 3); + } + + @Test + void contains_scalar_element_singleFire() { + runContains("contains-scalar-element", scalarDataset(1), 1); + runContains("contains-scalar-element", scalarDataset(3), 3); + } + + // --------------------------------------------------------------------------- + // Methods that already use the operand once — sanity-guard against drift. + // --------------------------------------------------------------------------- + + @Test + void first_array_singleFire() { + runArray("first", ColumnRepresentation::first, 1, 3); + } + + @Test + void orElse_singleFire() { + runScalar("orElse", c -> c.orElse(0), 1, 3); + } + + @Test + void ensureSingular_singleFire() { + runArrayOfSingleton("ensureSingular", c -> new DefaultRepresentation(c.ensureSingular()), 1, 3); + } + + @Test + void removeNulls_array_singleFire() { + runArray("removeNulls", ColumnRepresentation::removeNulls, 1, 3); + } + + @Test + void count_scalar_singleFire() { + runScalar("count-scalar", ColumnRepresentation::count, 1, 3); + } + + @Test + void isEmpty_scalar_singleFire() { + runScalar("isEmpty-scalar", ColumnRepresentation::isEmpty, 1, 3); + } + + // --------------------------------------------------------------------------- + // Helpers. 
+ // --------------------------------------------------------------------------- + + private void runArray( + @Nonnull final String label, + @Nonnull final Function op, + final long expectedSingleRowFires, + final long expectedMultiRowFires) { + runCase(arrayDataset(1), label + "-1", op, expectedSingleRowFires); + runCase(arrayDataset(3), label + "-3", op, expectedMultiRowFires); + } + + private void runArrayOfSingleton( + @Nonnull final String label, + @Nonnull final Function op, + final long expectedSingleRowFires, + final long expectedMultiRowFires) { + runCase(arrayDatasetOfSingleton(1), label + "-1", op, expectedSingleRowFires); + runCase(arrayDatasetOfSingleton(3), label + "-3", op, expectedMultiRowFires); + } + + private void runScalar( + @Nonnull final String label, + @Nonnull final Function op, + final long expectedSingleRowFires, + final long expectedMultiRowFires) { + runCase(scalarDataset(1), label + "-1", op, expectedSingleRowFires); + runCase(scalarDataset(3), label + "-3", op, expectedMultiRowFires); + } + + // Unlike runCase, the trace here is on the element argument, not the collection, matching the + // let() boundary inside ColumnRepresentation.contains(). 
+ private void runContains( + @Nonnull final String label, @Nonnull final Dataset df, final long expected) { + final int beforeCount = appender.list.size(); + final Column tracedElement = traceColumn(lit(1), label); + final ColumnRepresentation element = new DefaultRepresentation(tracedElement); + final ColumnRepresentation collection = new DefaultRepresentation(col("v")); + final Column result = collection.contains(element, Column::equalTo).getValue(); + df.select(result.alias("r")).collect(); + final long fires = countTraceLogs(label, beforeCount); + assertEquals( + expected, + fires, + () -> "Expected " + expected + " trace fires for " + label + " but got " + fires); + } + + private void runCase( + @Nonnull final Dataset df, + @Nonnull final String label, + @Nonnull final Function op, + final long expected) { + final int beforeCount = appender.list.size(); + final Column traced = traceColumn(col("v"), label); + final ColumnRepresentation rep = new DefaultRepresentation(traced); + final Column result = op.apply(rep).getValue(); + df.select(result.alias("r")).collect(); + final long fires = countTraceLogs(label, beforeCount); + assertEquals( + expected, + fires, + () -> "Expected " + expected + " trace fires for " + label + " but got " + fires); + } + + private long countTraceLogs(@Nonnull final String label, final int fromIndex) { + final String marker = "[trace:" + label + "]"; + return appender.list.subList(fromIndex, appender.list.size()).stream() + .filter(event -> event.getFormattedMessage().contains(marker)) + .count(); + } + + @Nonnull + private Dataset scalarDataset(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("v", DataTypes.IntegerType, false, Metadata.empty()) + }); + return spark.createDataFrame( + IntStream.rangeClosed(1, rows).mapToObj(i -> RowFactory.create(i, i)).toList(), schema); + } + + @Nonnull + private Dataset 
arrayDataset(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "v", new ArrayType(DataTypes.IntegerType, true), false, Metadata.empty()) + }); + return spark.createDataFrame( + IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, new Integer[] {i, i + 1})) + .toList(), + schema); + } + + @Nonnull + private Dataset arrayDatasetOfSingleton(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "v", new ArrayType(DataTypes.IntegerType, true), false, Metadata.empty()) + }); + return spark.createDataFrame( + IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, new Integer[] {i})) + .toList(), + schema); + } + + @Nonnull + private static Column traceColumn(@Nonnull final Column input, @Nonnull final String label) { + return column(new TraceExpression(expression(input), label, "integer", null)); + } +} diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/QuantityValueTraceTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/QuantityValueTraceTest.java new file mode 100644 index 0000000000..b5e98b45de --- /dev/null +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/QuantityValueTraceTest.java @@ -0,0 +1,134 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package au.csiro.pathling.fhirpath.column; + +import static org.apache.spark.sql.classic.ExpressionUtils.column; +import static org.apache.spark.sql.classic.ExpressionUtils.expression; +import static org.apache.spark.sql.functions.lit; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import au.csiro.pathling.fhirpath.encoding.QuantityEncoding; +import au.csiro.pathling.sql.TraceExpression; +import au.csiro.pathling.test.SpringBootUnitTest; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import jakarta.annotation.Nonnull; +import java.util.List; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Layer B regression guard for issue #2594. For every {@link QuantityValue} method that references + * its {@code quantityColumn} operand, asserts that wrapping the operand in a {@link + * TraceExpression} produces exactly one trace fire per row. 
Catches any future implementation that + * re-introduces a multi-reference pattern over the Quantity struct column. + */ +@SpringBootUnitTest +class QuantityValueTraceTest { + + @Autowired SparkSession spark; + + private Logger traceLogger; + private Level originalLevel; + private ListAppender appender; + + @BeforeEach + void setUp() { + traceLogger = (Logger) LoggerFactory.getLogger(TraceExpression.class); + originalLevel = traceLogger.getLevel(); + traceLogger.setLevel(Level.TRACE); + appender = new ListAppender<>(); + appender.start(); + traceLogger.addAppender(appender); + } + + @AfterEach + void tearDown() { + traceLogger.detachAppender(appender); + traceLogger.setLevel(originalLevel); + appender.stop(); + } + + @Test + void toUnit_singleFire() { + // QuantityValue.toUnit() references quantityColumn 5× in its when().otherwise() expression: + // literal.unit(), isUcum(), isCalendarDuration(), callUDF(quantityColumn), and the value + // branch. Without let()-wrapping, a traced operand fires 5× per row. + final int beforeCount = appender.list.size(); + final Column tracedQty = traceColumn(QuantityEncoding.encodeNumeric(lit(1)), "toUnit"); + final Column result = QuantityValue.of(tracedQty).toUnit(lit("1")); + singleRowDataset().select(result.alias("r")).collect(); + final long fires = countTraceLogs("toUnit", beforeCount); + assertEquals( + 1, fires, () -> "Expected 1 trace fire for toUnit but got " + fires + ". See issue #2594."); + } + + @Test + void convertibleToUnit_singleFire() { + // QuantityValue.convertibleToUnit() references quantityColumn 5× in its when() expression: + // literal.unit(), isUcum(), isCalendarDuration(), callUDF(quantityColumn), and + // quantityColumn.isNotNull(). Without let()-wrapping, a traced operand fires 5× per row. 
+ final int beforeCount = appender.list.size(); + final Column tracedQty = + traceColumn(QuantityEncoding.encodeNumeric(lit(1)), "convertibleToUnit"); + final Column result = QuantityValue.of(tracedQty).convertibleToUnit(lit("1")); + singleRowDataset().select(result.alias("r")).collect(); + final long fires = countTraceLogs("convertibleToUnit", beforeCount); + assertEquals( + 1, + fires, + () -> + "Expected 1 trace fire for convertibleToUnit but got " + fires + ". See issue #2594."); + } + + private long countTraceLogs(@Nonnull final String label, final int fromIndex) { + final String marker = "[trace:" + label + "]"; + return appender.list.subList(fromIndex, appender.list.size()).stream() + .filter(event -> event.getFormattedMessage().contains(marker)) + .count(); + } + + @Nonnull + private Dataset singleRowDataset() { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()) + }); + return spark.createDataFrame(List.of(RowFactory.create(1)), schema); + } + + @Nonnull + private static Column traceColumn(@Nonnull final Column input, @Nonnull final String label) { + return column(new TraceExpression(expression(input), label, "Quantity", null)); + } +} diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java index f62f553e50..2e497c64f9 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java @@ -33,20 +33,26 @@ import au.csiro.pathling.fhirpath.evaluation.CrossResourceStrategy; import au.csiro.pathling.fhirpath.evaluation.DatasetEvaluator; import au.csiro.pathling.fhirpath.evaluation.DatasetEvaluatorBuilder; +import au.csiro.pathling.fhirpath.evaluation.SingleInstanceEvaluationResult; +import 
au.csiro.pathling.fhirpath.evaluation.SingleInstanceEvaluator; import au.csiro.pathling.fhirpath.parser.Parser; import au.csiro.pathling.sql.TraceExpression; import au.csiro.pathling.test.SpringBootUnitTest; import au.csiro.pathling.test.assertions.Assertions; import au.csiro.pathling.test.datasource.ObjectDataSource; +import ca.uhn.fhir.context.FhirContext; import ch.qos.logback.classic.Logger; import ch.qos.logback.classic.spi.ILoggingEvent; import ch.qos.logback.core.read.ListAppender; import jakarta.annotation.Nonnull; import java.util.List; +import java.util.stream.Stream; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; +import org.hl7.fhir.r4.model.CodeableConcept; +import org.hl7.fhir.r4.model.Coding; import org.hl7.fhir.r4.model.Enumerations.AdministrativeGender; import org.hl7.fhir.r4.model.Enumerations.ResourceType; import org.hl7.fhir.r4.model.HumanName; @@ -55,6 +61,9 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; @@ -406,6 +415,253 @@ void evaluationWithoutCollector_stillWorks() { } } + /** + * Tests that exercise the trace-entry duplication scenarios from issue #2594. Downstream FHIRPath + * operations whose Spark column form references the traced operand more than once historically + * inflated the collector entry count: a single source-level {@code trace('t')} fired twice (or + * more) per row when consumed by {@code count()}, {@code exists()}, {@code empty()}, {@code + * last()}, {@code combine()}, or the {@code |} union operator. 
The fix in this change rewrites + * the offending {@link au.csiro.pathling.fhirpath.column.ColumnRepresentation} methods so each + * traced operand is evaluated exactly once per logical invocation. + * + *

These tests use {@link SingleInstanceEvaluator} — the evaluation path used by the FHIRPath + * Lab API — because that is where the bug was originally observed. The fixture is a single + * Patient with three {@code name} entries, matching the reproduction in the issue. + */ + @Nested + class TraceEntryCountTest { + + private Dataset patientDf; + private FhirContext fhirContext; + + @BeforeEach + void setUpSingleInstance() { + final ObjectDataSource dataSource = + new ObjectDataSource(spark, encoders, List.of(createPatientWithThreeNames())); + patientDf = dataSource.read("Patient"); + fhirContext = encoders.getContext(); + } + + private long countTraceValues(@Nonnull final String expression, @Nonnull final String label) { + final SingleInstanceEvaluationResult result = + SingleInstanceEvaluator.evaluate( + patientDf, "Patient", fhirContext, expression, null, null); + return result.getTraces().stream() + .filter(t -> label.equals(t.getLabel())) + .mapToLong(t -> t.getValues().size()) + .sum(); + } + + @ParameterizedTest(name = "[{index}] {0}") + @MethodSource("entryCountCases") + void entryCount(final TraceEntryCase testCase) { + assertEntryCount(testCase); + } + + private void assertEntryCount(@Nonnull final TraceEntryCase testCase) { + final long actual = countTraceValues(testCase.expression(), testCase.label()); + assertEquals( + testCase.expected(), + actual, + () -> + String.format( + "Expression [%s]: expected %d trace entries for label '%s', got %d. " + + "See issue #2594.", + testCase.expression(), testCase.expected(), testCase.label(), actual)); + } + + static Stream entryCountCases() { + // The full matrix from issue #2594, including operations that previously inflated the + // trace count via multi-reference Spark patterns (count, exists, empty, combine, union). + // After the fix in this change, each source-level trace() call fires exactly once per + // row regardless of how the result is consumed downstream. 
+ return Stream.of( + // Pass-through and non-duplicating cases — regression guard for any fix. + Arguments.of(new TraceEntryCase("Patient.name.trace('t')", "t", 3)), + Arguments.of(new TraceEntryCase("Patient.name.trace('t').given.join(' ')", "t", 3)), + Arguments.of(new TraceEntryCase("Patient.name.trace('t').given.join(' ') + 'X'", "t", 3)), + Arguments.of(new TraceEntryCase("Patient.name.trace('t').first()", "t", 3)), + // Previously known-failing rows — the rewrites in this change retire the bug. + Arguments.of(new TraceEntryCase("Patient.name.trace('t').given.count()", "t", 3)), + Arguments.of(new TraceEntryCase("Patient.name.trace('t').exists()", "t", 3)), + Arguments.of(new TraceEntryCase("Patient.name.trace('t').empty()", "t", 3)), + Arguments.of( + new TraceEntryCase("Patient.name.trace('t').given.join(' ').combine('X')", "t", 3)), + Arguments.of( + new TraceEntryCase( + "Patient.name.trace('t').given.join(' ') | Patient.name.family.first()", "t", 3)), + Arguments.of( + new TraceEntryCase("Patient.name.trace('t') | Patient.name.trace('t')", "t", 6)), + // Additional FHIRPath surface (D4 in the design) — extends user-visible regression + // coverage to a count comparison and two extra downstream pipelines that route through + // the rewritten ColumnRepresentation methods. The original D4 list also named single() + // and iif(); neither is implemented in Pathling, so they are replaced with equivalent + // pipelines that exercise the same internal helpers (singular() via ensureSingular() + // through .first(), and conditional projection through .where()). 
+ Arguments.of(new TraceEntryCase("Patient.name.trace('t').given.count() > 0", "t", 3)), + Arguments.of( + new TraceEntryCase( + "Patient.name.trace('t').where(use = 'official').given.first()", "t", 3)), + Arguments.of( + new TraceEntryCase( + "Patient.name.trace('t').given.combine(Patient.name.family)", "t", 3)), + // BooleanOperator XOR — leftValue referenced 3× in the XOR switch arm (isNull, + // equalTo(true), equalTo(false)), so a traced left operand fires 3× without the + // binaryOperator let()-wrapping fix. + Arguments.of( + new TraceEntryCase( + "Patient.name.exists().trace('t') xor Patient.name.exists()", "t", 1)), + // BooleanOperator IMPLIES with a false left — leftValue referenced 2× (equalTo(true) + // then equalTo(false)), so a traced left operand fires 2× without the fix. + Arguments.of(new TraceEntryCase("Patient.name.empty().trace('t') implies true", "t", 1)), + // EqualityOperator = — left ColumnRepresentation is read via isEmpty(), count(), and + // singular(), each independently calling getValue(). Without let()-wrapping in + // handleEquivalentTypes, a traced left operand fires 3× per row. + Arguments.of( + new TraceEntryCase("Patient.name.family.first().trace('t') = 'Smith'", "t", 1)), + // ConversionLogic.convertToBoolean (STRING path) — value appears in both when() + // predicates ('1.0' and '0.0' checks) and the otherwise() branch. Without let()-wrapping, + // a traced operand fires 3× per row (all three predicates/branches evaluate value). + Arguments.of(new TraceEntryCase("'true'.trace('t').toBoolean()", "t", 1)), + // ConversionLogic.convertToInteger (STRING path) — value appears in both the when() + // predicate (rlike check) and the value branch (try_cast). Without let()-wrapping, a + // traced operand fires 2× per row when the input matches the integer regex. 
+ Arguments.of(new TraceEntryCase("'1'.trace('t').toInteger()", "t", 1)), + // ConversionLogic.convertToDate (STRING path) — value appears in both the when() + // predicate (rlike check) and the value branch (the date string itself). Without + // let()-wrapping, a traced operand fires 2× per row when the input matches the date + // regex. + Arguments.of(new TraceEntryCase("'2020-01-01'.trace('t').toDate()", "t", 1)), + // QuantityEncoding.encodeNumeric (via convertToQuantity INTEGER path) — the traced input + // appears in both the when() predicate (isNotNull check) and the value struct (via cast). + // let()-wrapping on the raw numericColumn ensures the non-deterministic expression is + // materialized once before both uses. + Arguments.of(new TraceEntryCase("1.trace('t').toQuantity()", "t", 1)), + // QuantityValue.toUnit() — quantityColumn is referenced 5× in the assembled + // when().otherwise() expression (literal.unit, isUcum, isCalendarDuration, callUDF, + // and the value branch). Without let()-wrapping, a traced Quantity fires 5× per row. + Arguments.of(new TraceEntryCase("1.toQuantity().trace('t').toQuantity('1')", "t", 1)), + // QuantityValue.convertibleToUnit() — quantityColumn is referenced 5× similarly + // (literal.unit, isUcum, isCalendarDuration, callUDF, and quantityColumn.isNotNull). + // Without let()-wrapping, a traced Quantity fires 5× per row. + Arguments.of( + new TraceEntryCase("1.toQuantity().trace('t').convertsToQuantity('1')", "t", 1))); + } + + @Test + void codingUnion_traceSingleFire() { + // SqlFunctions.arrayDistinctWithEquality() referenced `arrayColumn` twice — once for + // filter() to build the empty-typed seed, and once for aggregate(). For Coding (which uses + // CodingEquality rather than default SQL equality) both union paths route through this + // method, so a traced coding array fired 2× per row before the let()-wrap fix. 
+ final ObjectDataSource ds = + new ObjectDataSource(spark, encoders, List.of(createPatientWithMaritalStatusCoding())); + final Dataset codingDf = ds.read("Patient"); + + // handleOneEmpty path: right side is EmptyCollection → dedupeArray → + // arrayDistinctWithEquality + final SingleInstanceEvaluationResult emptyUnion = + SingleInstanceEvaluator.evaluate( + codingDf, + "Patient", + fhirContext, + "Patient.maritalStatus.coding.trace('t') | {}", + null, + null); + final long emptyUnionCount = + emptyUnion.getTraces().stream() + .filter(t -> "t".equals(t.getLabel())) + .mapToLong(t -> t.getValues().size()) + .sum(); + assertEquals( + 1, + emptyUnionCount, + "Trace in Coding union (handleOneEmpty → dedupeArray → arrayDistinctWithEquality)" + + " should fire exactly once. See issue #2594."); + + // handleEquivalentTypes path: both sides non-empty → unionArrays → arrayDistinctWithEquality + final SingleInstanceEvaluationResult twoSideUnion = + SingleInstanceEvaluator.evaluate( + codingDf, + "Patient", + fhirContext, + "Patient.maritalStatus.coding.trace('t') | Patient.maritalStatus.coding", + null, + null); + final long twoSideCount = + twoSideUnion.getTraces().stream() + .filter(t -> "t".equals(t.getLabel())) + .mapToLong(t -> t.getValues().size()) + .sum(); + assertEquals( + 1, + twoSideCount, + "Trace in Coding union (handleEquivalentTypes → unionArrays → arrayDistinctWithEquality)" + + " should fire exactly once. See issue #2594."); + } + + @Test + void codingEquality_traceSingleFire() { + // Coding equality routes through CodingEquality.equalsTo, which references the left + // operand once for the null check and once per equality field (5 fields), for a total of + // 6 references — on top of the 2 from EqualityOperator (isEmpty + count). Without + // let()-wrapping in handleEquivalentTypes, a traced Coding fires up to 8× per row. 
+ final ObjectDataSource ds = + new ObjectDataSource(spark, encoders, List.of(createPatientWithMaritalStatusCoding())); + final Dataset codingDf = ds.read("Patient"); + + final SingleInstanceEvaluationResult result = + SingleInstanceEvaluator.evaluate( + codingDf, + "Patient", + fhirContext, + "Patient.maritalStatus.coding.first().trace('t')" + + " = Patient.maritalStatus.coding.first()", + null, + null); + + final long count = + result.getTraces().stream() + .filter(t -> "t".equals(t.getLabel())) + .mapToLong(t -> t.getValues().size()) + .sum(); + + assertEquals( + 1, + count, + "Trace in Coding equality (via CodingEquality.equalsTo) should fire exactly once." + + " See issue #2594."); + } + } + + /** + * Parameters for a single trace-entry-count scenario. + * + * @param expression the FHIRPath expression to evaluate + * @param label the trace label to count entries for + * @param expected the expected total number of trace entry values for {@code label} + */ + record TraceEntryCase(String expression, String label, int expected) { + @Override + public String toString() { + return expression; + } + } + + private static Patient createPatientWithThreeNames() { + // Fixture from issue #2594 — do not alter without updating the issue reference. 
+ final Patient p = new Patient(); + p.setId("Patient/three-names"); + p.addName() + .setUse(HumanName.NameUse.OFFICIAL) + .setFamily("Smith") + .addGiven("John") + .addGiven("Quincy"); + p.addName().setUse(HumanName.NameUse.USUAL).setFamily("Smith").addGiven("Johnny"); + p.addName().setUse(HumanName.NameUse.MAIDEN).setFamily("Doe").addGiven("John").addGiven("Q"); + return p; + } + private static Patient createPatient1() { final Patient p = new Patient(); p.setId("Patient/1"); @@ -433,4 +689,14 @@ private static Patient createPatient3() { p.setId("Patient/3"); return p; } + + private static Patient createPatientWithMaritalStatusCoding() { + final Patient p = new Patient(); + p.setId("Patient/with-coding"); + final CodeableConcept maritalStatus = new CodeableConcept(); + maritalStatus.addCoding( + new Coding("http://terminology.hl7.org/CodeSystem/v3-MaritalStatus", "M", "Married")); + p.setMaritalStatus(maritalStatus); + return p; + } } diff --git a/fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java b/fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java new file mode 100644 index 0000000000..1eaa34b08c --- /dev/null +++ b/fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java @@ -0,0 +1,168 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package au.csiro.pathling.sql; + +import static au.csiro.pathling.sql.SqlFunctions.let; +import static org.apache.spark.sql.classic.ExpressionUtils.column; +import static org.apache.spark.sql.classic.ExpressionUtils.expression; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import au.csiro.pathling.test.SpringBootUnitTest; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import jakarta.annotation.Nonnull; +import java.util.List; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Tests for {@link SqlFunctions#let(Column, java.util.function.UnaryOperator)}: identity behaviour, + * multi-reference correctness, and single-fire semantics over a {@link TraceExpression} operand. 
+ */ +@SpringBootUnitTest +class SqlFunctionsLetTest { + + @Autowired SparkSession spark; + + private Logger traceLogger; + private Level originalLevel; + private ListAppender appender; + + @BeforeEach + void setUp() { + traceLogger = (Logger) LoggerFactory.getLogger(TraceExpression.class); + originalLevel = traceLogger.getLevel(); + traceLogger.setLevel(Level.TRACE); + appender = new ListAppender<>(); + appender.start(); + traceLogger.addAppender(appender); + } + + @AfterEach + void tearDown() { + traceLogger.detachAppender(appender); + traceLogger.setLevel(originalLevel); + appender.stop(); + } + + @Test + void let_identityBody_returnsOperandValue_singleRow() { + final Dataset df = spark.range(1).toDF("id").withColumn("v", lit(7)); + final Row result = df.select(let(col("v"), x -> x).alias("r")).first(); + assertEquals(7, result.getInt(0)); + } + + @Test + void let_identityBody_returnsOperandValue_multiRow() { + final List rows = + df3().select(let(col("v"), x -> x).alias("r")).orderBy("r").collectAsList(); + assertEquals(List.of(1, 2, 3), rows.stream().map(r -> r.getInt(0)).toList()); + } + + @Test + void let_multiReferenceBody_producesCorrectResult_singleRow() { + final Dataset df = spark.range(1).toDF("id").withColumn("v", lit(5)); + // Body references x twice: x + x = 2*v. With let, x is materialised and referenced twice + // without re-evaluating the operand. 
+ final Row result = df.select(let(col("v"), x -> x.plus(x)).alias("r")).first(); + assertEquals(10, result.getInt(0)); + } + + @Test + void let_multiReferenceBody_producesCorrectResult_multiRow() { + final List rows = + df3().select(let(col("v"), x -> x.plus(x)).alias("r")).orderBy("r").collectAsList(); + assertEquals(List.of(2, 4, 6), rows.stream().map(r -> r.getInt(0)).toList()); + } + + @Test + void let_overTraceExpression_firesExactlyOncePerRow_multiReferenceBody() { + final Column traced = traceColumn(col("v"), "trace-multi"); + df3().select(let(traced, x -> x.plus(x)).alias("r")).collect(); + // Three rows × one fire each. Without let, the body's two references to x would each + // re-evaluate the trace, doubling the count. + assertEquals(3L, countTraceLogs("trace-multi")); + } + + @Test + void let_overTraceExpression_firesExactlyOncePerRow_singleRow() { + final Dataset df = df3().limit(1); + final Column traced = traceColumn(col("v"), "trace-single"); + df.select(let(traced, x -> x.plus(x)).alias("r")).collect(); + assertEquals(1L, countTraceLogs("trace-single")); + } + + @Test + void let_nullValue_propagatesNull() { + final Dataset df = spark.range(1).toDF("id").withColumn("v", lit(null).cast("integer")); + final Row result = df.select(let(col("v"), x -> x).alias("r")).first(); + assertTrue(result.isNullAt(0), "let(null, x -> x) should return null."); + } + + @Test + void let_nullValue_bodyReceivesNull() { + // x.isNull() inside the body returns true (cast to 1) only if the body was invoked with x + // bound to null, confirming that let() does not short-circuit on a SQL null. 
+ final Dataset df = spark.range(1).toDF("id").withColumn("v", lit(null).cast("integer")); + final Row result = df.select(let(col("v"), x -> x.isNull().cast("integer")).alias("r")).first(); + assertEquals(1, result.getInt(0)); + } + + private long countTraceLogs(@Nonnull final String label) { + final String marker = "[trace:" + label + "]"; + return appender.list.stream() + .filter(event -> event.getFormattedMessage().contains(marker)) + .count(); + } + + @Nonnull + private Dataset df3() { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("v", DataTypes.IntegerType, false, Metadata.empty()) + }); + return spark.createDataFrame( + List.of(RowFactory.create(1, 1), RowFactory.create(2, 2), RowFactory.create(3, 3)), schema); + } + + @Nonnull + private static Column traceColumn(@Nonnull final Column input, @Nonnull final String label) { + // The collector is null — we count fires via the SLF4J trace logger to avoid Spark + // serialization issues with mutable collector state. 
+ return column(new TraceExpression(expression(input), label, "integer", null)); + } +} diff --git a/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java b/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java index 09c3f1a7de..98544bc363 100644 --- a/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java +++ b/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java @@ -109,6 +109,7 @@ private PathlingContext( @Nonnull final FhirEncoders fhirEncoders, @Nonnull final TerminologyServiceFactory terminologyServiceFactory, @Nonnull final QueryConfiguration queryConfiguration) { + requireLegacySizeOfNullDisabled(spark); this.spark = spark; this.fhirVersion = fhirEncoders.getFhirVersion(); this.fhirEncoders = fhirEncoders; @@ -119,6 +120,30 @@ private PathlingContext( gson = buildGson(); } + /** + * Verifies that {@code spark.sql.legacy.sizeOfNull} is disabled. Several FHIRPath cardinality + * helpers — notably {@code count()} and {@code isEmpty()} on array operands — depend on Spark's + * post-3.0 default of {@code size(null) = null}, which {@code coalesce} then maps to the + * appropriate empty-collection answer. Toggling the legacy flag back on returns {@code size(null) + * = -1}, silently breaking those helpers; we fail fast at context creation rather than producing + * wrong counts later. + * + * @param spark the Spark session to validate + * @throws IllegalStateException if the legacy flag is enabled + */ + private static void requireLegacySizeOfNullDisabled(@Nonnull final SparkSession spark) { + final String value = spark.conf().get("spark.sql.legacy.sizeOfNull", "false"); + if (Boolean.parseBoolean(value)) { + throw new IllegalStateException( + "Pathling requires `spark.sql.legacy.sizeOfNull` to be `false` (the Spark 3.0+ default). 
" + + "FHIRPath count() and isEmpty() rely on Spark's null-array semantics; with the " + + "legacy flag enabled, these helpers return incorrect results on null inputs. " + + "Either remove the override, or set " + + "`spark.conf.set(\"spark.sql.legacy.sizeOfNull\", \"false\")` before constructing " + + "the PathlingContext."); + } + } + @Nonnull private static Gson buildGson() { final GsonBuilder builder = new GsonBuilder(); diff --git a/library-api/src/test/java/au/csiro/pathling/library/PathlingContextTest.java b/library-api/src/test/java/au/csiro/pathling/library/PathlingContextTest.java index 5fbaf5aec2..23160c8508 100644 --- a/library-api/src/test/java/au/csiro/pathling/library/PathlingContextTest.java +++ b/library-api/src/test/java/au/csiro/pathling/library/PathlingContextTest.java @@ -823,4 +823,18 @@ void fhirPathToColumn_invalidResourceType_throwsException() { assertThrows(Exception.class, () -> pathling.fhirPathToColumn("InvalidResource", "gender")); } + + @Test + void create_rejectsLegacySizeOfNullEnabled() { + spark.conf().set("spark.sql.legacy.sizeOfNull", "true"); + try { + final IllegalStateException ex = + assertThrows(IllegalStateException.class, () -> PathlingContext.create(spark)); + assertTrue( + ex.getMessage().contains("spark.sql.legacy.sizeOfNull"), + "Error message should name the offending configuration key"); + } finally { + spark.conf().set("spark.sql.legacy.sizeOfNull", "false"); + } + } } diff --git a/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/.openspec.yaml b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/.openspec.yaml new file mode 100644 index 0000000000..9323e242f0 --- /dev/null +++ b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-24 diff --git a/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/design.md b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/design.md new file mode 
100644 index 0000000000..1277c67cc5 --- /dev/null +++ b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/design.md @@ -0,0 +1,194 @@ +## Context + +Issue #2594 describes a bug where `trace()` collector entries are duplicated +when the traced column is consumed by operations that compile into +`when(cond(c), …).otherwise(expr(c))` patterns in `ColumnRepresentation`. +Examples include `count()`, `exists()`, `empty()`, `first()`, `last()`, +`combine()`, and `|` (union via `plural()`). + +Prior investigation confirmed (against Spark 4.0.2 in +`spark-catalyst_2.13-4.0.2-sources.jar`): + +- `TraceExpression` is `Nondeterministic`, so Catalyst's CSE excludes it. +- Even if `TraceExpression` were made deterministic, Spark's CSE is + conservative around `CaseWhen`: `EquivalentExpressions.childrenToRecurse` + only walks `alwaysEvaluatedInputs` and cross-compares `branchGroups`. A + subexpression appearing once in the always-evaluated predicate AND once in + a single conditional branch is NOT registered as a common subexpression. +- Therefore the duplication is observable regardless of determinism, and a + fix requires either rewriting the affected `ColumnRepresentation` patterns + or intercepting trace evaluation at runtime. + +This change adds a test suite that pins the bug down. It does not fix the +bug. + +The existing test class `TraceFunctionTest` is the home for trace tests; it +already has `ListTraceCollector` wiring via `EvaluationContext`. Adding a new +`@Nested` class keeps the new tests discoverable and grouped. + +## Goals / Non-Goals + +**Goals:** + +- Produce an executable, parametrised test suite that encodes every row of + the reproduction matrix in #2594. +- Make the test suite serve as the acceptance oracle for the subsequent fix + change: when the fix lands, the suite SHALL pass in full. +- Use a fixture that matches the issue (one Patient, three `name` entries) + so doubled / tripled counts are unambiguous. 
+- Keep currently-passing cases enabled as regression guards so a fix does + not accidentally break them. + +**Non-Goals:** + +- Fixing the duplication bug. That is a separate change that will consume + the test suite as its acceptance criteria. +- Modifying `TraceExpression`, `ColumnRepresentation`, `CombiningLogic`, or + any other production code. +- Changing the public API or any user-facing behaviour. +- Adding tests for any trace behaviour not related to entry-count fidelity + (pass-through, log output, projection semantics — those are already + covered in the existing test class). + +## Decisions + +### D1. Fixture: a single Patient with three names + +Match the issue exactly: + +```json +{ + "resourceType": "Patient", + "name": [ + { "use": "official", "family": "Smith", "given": ["John", "Quincy"] }, + { "use": "usual", "family": "Smith", "given": ["Johnny"] }, + { "use": "maiden", "family": "Doe", "given": ["John", "Q"] } + ] +} +``` + +Three elements is the minimum that distinguishes correct counts (3) from +doubled (6) and tripled (9). With only two elements, a doubled count +(4) would sit too close to a baseline that has drifted by a random off-by-one +to be told apart at a glance. + +**Alternative considered:** reuse the existing test Patient fixture used by +other `TraceFunctionTest` nested classes. Rejected: the existing fixture may +not have three distinct `name` entries, and using a shared fixture couples +these tests to unrelated changes in the common setup. A local fixture keeps +the tests hermetic and readable. + +### D2. Parametrisation: one test method, one `@MethodSource` + +Use a `record TraceEntryCase(String expression, String label, int expected)` +and a `@ParameterizedTest` with `@MethodSource` that yields the 11 matrix +rows. Each row renders as a distinct test name in JUnit's output. + +**Alternative considered:** 11 separate `@Test` methods. Rejected: +boilerplate, and harder to read as a table. 
Parametrisation better expresses +"this is a matrix, here are its rows." + +### D3. Assertion: count entries by label + +The assertion is: + +``` +collector.getEntries().stream() + .filter(e -> e.label().equals(case.label())) + .count() == case.expected() +``` + +This filters by the expected label so a single test case can isolate one +trace even when the expression contains multiple. It also means the +assertion is robust to the exact evaluation order of unrelated trace calls. + +**Alternative considered:** assert the total `getEntries().size()`. +Rejected: the union case `name.trace('t') | name.trace('t')` is expected to +produce 6 entries from TWO traces with the same label. Filtering by label +isn't strictly necessary in the issue's matrix (same label throughout) but +keeps the helper reusable if we extend the matrix later. + +### D4. Handle known-failing cases without `@Disabled` + +Three options considered: + +1. `@Disabled("fixed in #TBD")` — the tests don't run at all. Rejected: the + point of adding them is to have a red signal in CI. +2. Tag known-failing cases with `@Tag("known-failing")` and exclude that tag + from the default Surefire run. Tests still compile, can be run on demand + with `-Dgroups=known-failing`. +3. Assert the current (buggy) counts and flip them when the fix lands. + Rejected: encodes the bug into the test, loses the documentation value, + and requires a two-sided change at fix time. + +**Decision:** option 2. Tag known-failing rows at the parameter-source level +so JUnit's `@Tag` can filter them. If Surefire configuration does not allow +per-parameter tagging cleanly, fall back to splitting the matrix into two +methods: one for currently-passing rows, one for currently-failing rows +marked `@Tag("known-failing")`. Tasks.md will call out which form to use +based on a quick spike. + +The subsequent fix change will remove the tag and the split. + +### D5. 
Explicit row-by-row expected counts + +The matrix from the issue is reproduced verbatim with the expected counts: + +| Expression | Expected | +| -------------------------------------------------------- | -------- | +| `name.trace('t')` | 3 | +| `name.trace('t').given.join(' ')` | 3 | +| `name.trace('t').given.join(' ') + 'X'` | 3 | +| `name.trace('t').given.count()` | 3 | +| `name.trace('t').exists()` | 3 | +| `name.trace('t').empty()` | 3 | +| `name.trace('t').first()` | 3 | +| `name.trace('t').last()` | 3 | +| `name.trace('t').given.join(' ').combine('X')` | 3 | +| `name.trace('t').given.join(' ') \| name.family.first()` | 3 | +| `name.trace('t') \| name.trace('t')` | 6 | + +**Note on trace-entry granularity:** the issue reports 3 entries for +`name.trace('t')` against a 3-name patient, which implies per-element +collector semantics rather than per-row. Before writing assertions, tasks +include a small calibration step: run `name.trace('t')` alone and record +what `collector.getEntries().size()` actually returns. If it's 1 (per-row), +adjust the expected values accordingly — the bug ratios (2×, 3×, 4×) +remain the same, only the base count shifts. This is documented as +Task 1. + +## Risks / Trade-offs + +**Risk:** The test entry-count semantics may not match the issue author's +numbers (per-row vs per-element) → Mitigation: calibration step as Task 1 +before writing assertions. If the base counts differ, the matrix is +re-derived by multiplying issue ratios against the observed base count. + +**Risk:** CI goes red while the tests wait for the fix → Mitigation: D4's +`@Tag("known-failing")` approach, which keeps the tests present and +discoverable but excluded from default test runs. The tag is a clear marker +that the failure is expected. + +**Risk:** A fix that goes further than option 1 (e.g. refactors +`TraceExpression`) could change entry-count semantics subtly → Mitigation: +the regression guard rows (currently passing) serve as a bidirectional +check. 
Any fix must keep those rows green AND turn the failing rows green. + +**Risk:** The reproduction fixture drifts from the exact JSON in the issue → +Mitigation: embed the JSON as a text block in the test class and comment +that it must not be altered without updating #2594 reference. + +## Migration Plan + +Not applicable — test-only change, no deployment surface. + +## Open Questions + +- Q: Does Pathling's CI configuration support Surefire tag exclusion out + of the box? + A: Tasks include verifying this. If not, fall back to splitting into + passing/failing methods with `@Tag` on the failing one. + +- Q: Should the known-failing tag name be `known-failing`, `bug-2594`, or + something else? + A: Deferred to the task-level; no impact on the design. diff --git a/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/proposal.md b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/proposal.md new file mode 100644 index 0000000000..8379311753 --- /dev/null +++ b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/proposal.md @@ -0,0 +1,66 @@ +## Why + +Issue #2594 documents a concrete bug: `trace()` entries captured by +`PathlingContext.evaluateFhirPath()` are emitted more than once when the traced +column is consumed by any FHIRPath operation that compiles into a +`when(cond(c), …).otherwise(expr(c))` pattern in `ColumnRepresentation` +(`count`, `exists`, `empty`, `first`, `last`, `|`, `combine`, …). The root +cause is understood: `TraceExpression` is `Nondeterministic`, so Catalyst's +common-subexpression elimination cannot dedupe it, and Spark's CSE is also +conservative around `CaseWhen` branches even when an expression is +deterministic (verified against Spark 4.0.2). + +A fix is coming in a separate change. 
Before it lands we want the bug pinned +down with a regression test suite so that (a) the eventual fix has an +unambiguous acceptance oracle, (b) no future refactor silently re-introduces +the duplication, and (c) the failure is visible in CI rather than living only +in a GitHub issue. + +This change is intentionally scoped to **tests only** — no production code +changes, no fix attempt. + +## What Changes + +- Add a new `@Nested` test class in + `fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java` + (e.g. `TraceEntryCountTest`) that asserts the number of `ListTraceCollector` + entries produced for each expression in the reproduction matrix from #2594. +- Use a single Patient fixture with three `name` entries (matching the issue) + so expected counts are distinguishable from any doubling/tripling. +- Parametrise the test so the 11-row matrix renders as 11 distinct JUnit + cases rather than one big assertion block. +- The currently-passing rows of the matrix (the ones that already produce the + correct entry count) act as a regression guard so any fix does not break + them. +- The currently-failing rows of the matrix are kept enabled (NOT `@Disabled`) + and tagged so CI users can distinguish them from genuine regressions. + The fix-change will retire the tag when the assertions pass. +- Add a new requirement to the `fhirpath-trace` capability stating that the + number of collector entries produced by a single source-level `trace()` + call SHALL equal the number of logical invocations of that trace, regardless + of the shape of downstream operations that consume the traced column. + +## Capabilities + +### New Capabilities + +_(none)_ + +### Modified Capabilities + +- `fhirpath-trace`: add a "trace entry count fidelity" requirement. 
The + existing nondeterminism requirement states that separate `trace()` calls + must each execute; the new requirement states that a single `trace()` call + must not be duplicated by downstream compilation patterns. The matrix of + expressions from #2594 becomes the scenario set. + +## Impact + +- Test code only: new tests added under `fhirpath/src/test/java/...`. +- No changes to production code or public API. +- CI: the failing rows of the matrix will produce test failures until the + separate fix change lands. This is intentional — the red signal is the + point. The tests will be tagged so they can be excluded from default + runs if necessary during the interim (see `design.md` for the exact + mechanism). +- No dependency, configuration, or build changes. diff --git a/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/specs/fhirpath-trace/spec.md b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/specs/fhirpath-trace/spec.md new file mode 100644 index 0000000000..94c642830b --- /dev/null +++ b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/specs/fhirpath-trace/spec.md @@ -0,0 +1,80 @@ +## ADDED Requirements + +### Requirement: trace entry count matches logical invocations + +A single source-level `trace(name [, projection])` call SHALL produce a +number of `TraceCollector` entries equal to the number of logical +invocations of that trace, irrespective of how downstream FHIRPath +operations consume the traced column. In particular, operations that +internally compile into Spark expressions referencing the traced column +more than once (for example `count()`, `exists()`, `empty()`, +`combine()`, and the `|` union operator) SHALL NOT inflate the number +of collector entries. + +Two independent source-level `trace()` calls, even with identical +arguments, SHALL produce independent entries — this requirement governs +duplication within a single call, not deduplication across calls. 
+ +#### Scenario: trace followed by pass-through path produces baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t')` with a `TraceCollector` attached +- **THEN** the collector SHALL contain exactly the baseline number of + entries labelled `t` for a 3-element traced collection + +#### Scenario: trace consumed by join produces baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ')` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case above + +#### Scenario: trace consumed by count does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.count()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case, NOT a multiple of it + +#### Scenario: trace consumed by exists does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').exists()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by empty does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').empty()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by first does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by combine does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ').combine('X')` +- **THEN** the collector SHALL contain the same number of entries as 
the + baseline pass-through case + +#### Scenario: trace consumed by union does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ') | name.family.first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: two independent trace calls each produce baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t') | name.trace('t')` +- **THEN** the collector SHALL contain exactly twice the baseline number + of entries labelled `t` (one set per source-level `trace()` call), + not four times or more diff --git a/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/tasks.md b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/tasks.md new file mode 100644 index 0000000000..e806637fa5 --- /dev/null +++ b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/tasks.md @@ -0,0 +1,33 @@ +## 1. Calibration + +- [x] 1.1 ~~Record baseline~~ — **superseded**: during implementation we switched to using `SingleInstanceEvaluator` (the FHIRPath Lab API path), which produces trace counts that match the issue matrix exactly (3 entries per name-element for a 3-name patient). Absolute counts from the issue matrix are used directly. +- [x] 1.2 ~~Recompute expected values~~ — **superseded** by 1.1. Absolute counts from the #2594 matrix used directly. +- [x] 1.3 Verified: Surefire 3.2.5 with JUnit 5 tag filtering. The default-test and sof-compliance-test Surefire executions both set `${pathling.test.excludedGroups}`, defaulted to `known-failing` via a project property. To run known-failing on demand: `-Dpathling.test.excludedGroups=none -Dgroups=known-failing`. + +## 2. 
Test fixture + +- [x] 2.1 Added `createPatientWithThreeNames()` in `TraceFunctionTest` with the exact #2594 fixture (three names: `use=official,family=Smith,given=[John,Quincy]`; `use=usual,family=Smith,given=[Johnny]`; `use=maiden,family=Doe,given=[John,Q]`). Header comment references the issue. +- [x] 2.2 The per-case helper (`countTraceValues`) builds a fresh `SingleInstanceEvaluator.evaluate(...)` call per invocation. No shared-evaluator state to reset. + +## 3. Test class + +- [x] 3.1 New `@Nested class TraceEntryCountTest` inside `TraceFunctionTest`. Matches the convention used by other nested classes (`PassThroughTests`, `CollectorTests`, etc.). +- [x] 3.2 Defined `record TraceEntryCase(String expression, String label, int expected)`. Changed from the original plan's `multiplier` field to a direct `expected` field since we observed counts that match the #2594 matrix exactly (no ratio calculation needed). +- [x] 3.3 Implemented `passingEntryCountCases()` (4 rows — includes `.first()` since it does NOT duplicate in this path) and `knownFailingEntryCountCases()` (6 rows — drops `.last()` which is unsupported in Pathling). +- [x] 3.4 Implemented `entryCount_nonDuplicatingOperations` (untagged) and `entryCount_duplicatingOperations_bug2594` (`@Tag("known-failing")`). The split-method approach avoids the JUnit-5-per-parameter tag issue. +- [x] 3.5 Assertion message includes expression, expected count, label, actual count, and a `See issue #2594.` pointer. + +## 4. Build-side configuration + +- [x] 4.1 Added `${pathling.test.excludedGroups}` to both Surefire executions in `fhirpath/pom.xml` (default-test and sof-compliance-test). Property defaulted to `known-failing` in the fhirpath POM `<properties>` section. +- [x] 4.2 Confirmed on-demand invocation works via `-Dpathling.test.excludedGroups=none -Dgroups=known-failing`. Documented in the test class header. + +## 5.
Local verification + +- [x] 5.1 Default run: `mvn test -pl fhirpath -Dtest=TraceFunctionTest` → 30 tests, 0 failures (4 TraceEntryCountTest passing + 26 pre-existing). Known-failing rows correctly skipped. +- [x] 5.2 On-demand run: 6 tests run, 6 failures — all 6 duplicating operations reproduce the bug with the expected actual counts. See `verification.md` for the full matrix. + +## 6. Final checks + +- [x] 6.1 `openspec validate reproduce-trace-duplication --strict` passes (to be re-verified after task updates). +- [x] 6.2 `git diff --stat` shows only: `fhirpath/pom.xml`, `fhirpath/src/test/java/.../TraceFunctionTest.java`, and `openspec/changes/reproduce-trace-duplication/**`. No production code modified. diff --git a/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/verification.md b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/verification.md new file mode 100644 index 0000000000..ce6810a493 --- /dev/null +++ b/openspec/changes/archive/2026-04-24-reproduce-trace-duplication/verification.md @@ -0,0 +1,61 @@ +# Verification + +## Default test run (known-failing tests excluded) + +Command: + +``` +mvn test -pl fhirpath -Dtest=TraceFunctionTest -Dsurefire.failIfNoSpecifiedTests=false +``` + +Result: `BUILD SUCCESS`. Summary across both Surefire executions: + +- `TraceFunctionTest$TraceEntryCountTest`: 4 passed, 0 failed +- `TraceFunctionTest$CollectorTests`: 7 passed, 0 failed +- `TraceFunctionTest$ErrorTests`: 1 passed, 0 failed +- `TraceFunctionTest$LoggingTests`: 6 passed, 0 failed +- `TraceFunctionTest$PassThroughTests`: 12 passed, 0 failed +- Total: 30 tests, 0 failures + +The 6 known-failing rows are tagged and excluded from default. 
+ +## On-demand known-failing run (reproducing the bug) + +Command: + +``` +mvn test -pl fhirpath -Dtest=TraceFunctionTest \ + -Dpathling.test.excludedGroups=none -Dgroups=known-failing \ + -Dsurefire.failIfNoSpecifiedTests=false +``` + +Result: `BUILD FAILURE` with 6 tests run, 6 failures, 0 errors. + +### Observed failures + +Every row below is an `AssertionFailedError` with the expected value derived from issue #2594: + +| Expression | Expected | Actual | Ratio | +| ------------------------------------------------------------------------ | -------- | ------ | ----- | +| `Patient.name.trace('t').given.count()` | 3 | 6 | 2× | +| `Patient.name.trace('t').exists()` | 3 | 12 | 4× | +| `Patient.name.trace('t').empty()` | 3 | 6 | 2× | +| `Patient.name.trace('t').given.join(' ').combine('X')` | 3 | 6 | 2× | +| `Patient.name.trace('t').given.join(' ') \| Patient.name.family.first()` | 3 | 6 | 2× | +| `Patient.name.trace('t') \| Patient.name.trace('t')` | 6 | 12 | 2× | + +## Matrix rows from #2594 that did not reproduce in this path + +- `Patient.name.trace('t').first()` — returned 3 (expected). `first()` does not + duplicate in the `SingleInstanceEvaluator` code path used here. Included in + the passing-case regression guard so any fix must keep it passing. +- `Patient.name.trace('t').last()` — threw + `UnsupportedFhirPathFeatureError: Unsupported function: last`. Pathling does + not implement `last()`; row omitted from the test suite. 
+ +## Environment + +- Pathling main branch (commit at time of verification: see `git log -1`) +- Spark 4.0.2, Scala 2.13.16, Java 21 +- Surefire 3.2.5 +- Spring Boot unit-test profile diff --git a/openspec/changes/archive/2026-05-08-fix-trace-duplication/.openspec.yaml b/openspec/changes/archive/2026-05-08-fix-trace-duplication/.openspec.yaml new file mode 100644 index 0000000000..2188dbdbb4 --- /dev/null +++ b/openspec/changes/archive/2026-05-08-fix-trace-duplication/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-05-06 diff --git a/openspec/changes/archive/2026-05-08-fix-trace-duplication/design.md b/openspec/changes/archive/2026-05-08-fix-trace-duplication/design.md new file mode 100644 index 0000000000..6005ab8555 --- /dev/null +++ b/openspec/changes/archive/2026-05-08-fix-trace-duplication/design.md @@ -0,0 +1,515 @@ +## Context + +Issue #2594 documents that a single source-level FHIRPath `trace()` call +produces 2× or more `TraceCollector` entries when its result column is +consumed by `count`, `exists`, `empty`, `last`, `combine`, or the `|` +union operator. The reproduction change (archived +2026-04-24-reproduce-trace-duplication) added a parametrised test +matrix and a spec requirement; six matrix rows are tagged +`known-failing` and exclude themselves from the default Surefire run. + +Investigation, including hands-on spikes against Spark 4.0.1, has +established the following: + +1. **Why CSE doesn't fix it.** `TraceExpression` is `Nondeterministic` + (deliberately, to prevent Catalyst from eliding side effects). + Catalyst's common-subexpression elimination excludes + `Nondeterministic` expressions outright. Even if it did not, + `EquivalentExpressions.childrenToRecurse` is conservative around + `CaseWhen`: a subexpression that appears once in + `alwaysEvaluatedInputs` (the predicate) and once in a single + conditional branch is not registered as a common subexpression. + +2. 
**Why Catalyst's `With` expression is NOT a viable primitive + for Pathling.** `org.apache.spark.sql.catalyst.expressions.With` + is a let-binding that the `RewriteWithExpression` optimiser rule + lowers into a `Project` operator. That makes it a logical-plan + construct, not a pure column expression. Pathling's contract for + FHIRPath compilation is that the resulting `Column` must be + embeddable in any relational context — `select`, `filter`, + `join`, `groupBy.agg`, `Window.over`. `With` violates this + contract: its rewrite rule has only partial aggregate support and + no window support, and the rule asserts hard against certain + aggregate-with-let combinations at construction time. A FHIRPath + `Column` produced via `With` may execute in some contexts and + fail in others, with no reliable way to predict which. + + A spike further confirmed that wrapping `TraceExpression` in + `With` at construction time also fails for the bug class itself, + because the rule's special-case handling for + `ConditionalExpression` _inlines_ nested `With`s (preserving + short-circuit evaluation semantics). The `With` must directly + wrap the conditional, which means even a localised use is brittle + under arbitrary downstream usage. `With` is therefore rejected + as the implementation primitive. + +3. **Why the lambda-let pattern is the right primitive.** Spark's + higher-order array functions (`transform`, `aggregate`, `filter`) + support a let-binding idiom that produces a pure `Column` + expression with no logical-plan dependency: + + ``` + let(c, x -> body(x)) + ≡ element_at(transform(array(c), x -> body(x)), 1) + ≡ aggregate(array(c), <init>, (acc, x) -> body(x)) + ``` + + `array(c)` evaluates `c` exactly once per row in the generated code. The + higher-order function then invokes the body lambda with `x` bound + to the materialised value. The lambda parameter is a positional + stack reference; multiple references to `x` in the body do not + re-evaluate `c`.
The resulting expression is a regular Spark + `Column` and embeds in every relational context. + +4. **Spike outcomes (recorded only here; not in committed code).** + - Buggy `when(c.isNull, 0).otherwise(size(c))` against a + `TraceExpression` operand: 4 fires for 3 rows (2 non-null × 2). + - Lambda-let via `transform`: 2 fires (1 per non-null row). + - Lambda-let via `aggregate`: 2 fires. + - Builtin `coalesce(size(c), lit(0))`: 2 fires. + - Per-row dedup is per-row, not per-query. + - Inside `Window.over`: works without errors; trace fires equal + Spark's window-engine evaluation count of the column (which + may exceed 1 per row for ordering+select combined). The let + pattern halves the count compared to the bug pattern in this + context, as expected. + - Inside SQL aggregates (`sum`, `count`): Spark refuses with + `AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION`. This + is a Spark constraint on `Nondeterministic` expressions + irrespective of whether `let` is used; the same error occurs + with `sum(coalesce(size(traceCol), 0))`. Out of scope for the + bug fix; documented as a known limitation (D8). + +## Goals / Non-Goals + +**Goals:** + +- All 10 `TraceEntryCountTest` rows pass without the `known-failing` + tag. The duplicating-operations method runs in the default Surefire + configuration. +- Forward-looking guard: a future + `ColumnRepresentation` method that re-introduces a multi-reference + `when/otherwise` pattern fails CI, either via the new unit-level + test layer or the Checkstyle rule, ideally both. +- Per-row values produced by every rewritten method are identical to + the current implementation across every existing test. +- The result of every rewritten method is a pure Spark `Column` + expression that composes in arbitrary relational contexts. +- Public Spark functions API only — no Catalyst-internal classes + imported into Pathling production code. 
+ +**Non-Goals:** + +- Modifying `TraceExpression`, `TraceProjectionExpression`, or any + collector implementation. +- Changing the determinism contract of trace. +- Modifying the user-facing FHIRPath grammar, function registry, or + any language binding. +- Generalising the `let` helper beyond the fhirpath module. +- Auditing every Spark column construction in the wider codebase for + the same pattern. The Checkstyle rule is scoped to + `ColumnRepresentation` and can be extended later if needed. +- Removing the Spark `Nondeterministic`-in-aggregate restriction + (a documentation-only follow-up; see D8). + +## Decisions + +### D1. Lambda-let pattern via Spark higher-order functions + +The `let` helper is implemented over `array(value)` + `transform` (or +`aggregate`) rather than over Catalyst's `With` expression. The +result is a pure column expression that embeds in any relational +context, including inside `Window.over` and `select`/`filter`/`join`. +SQL-aggregate use is gated by Spark's `Nondeterministic` constraint +(D8), independent of the let mechanism. + +**Alternatives considered and rejected:** + +- _Catalyst `With` expression_ — rejected because `With` is rewritten + into a `Project` operator at the logical-plan level, breaking + Pathling's pure-column contract for FHIRPath compilation. Window + context unsupported, aggregate context partial. A spike further + showed that wrapping `TraceExpression` in `With` at construction + fails for the bug class itself due to the rule's + `ConditionalExpression` inlining special case. +- _Drop `Nondeterministic` from `TraceExpression`_ (issue option 3) — + rejected. The `fhirpath-trace` capability already requires trace + nondeterminism (scenario "duplicate trace calls both execute"). + Removing it would invalidate that requirement and gamble on Spark + not introducing future cross-row trace caching. +- _Memoise `TraceExpression` evaluation_ (issue option 2) — rejected. 
+ Adds per-row cache state, requires defining "row boundary" in + Spark's iterator model, and produces false positives where two + array elements with byte-identical contents collapse to one + collector entry. +- _Dedupe in `ListTraceCollector`_ (issue option 4) — rejected. Wrong + place; hides plan behaviour and can't tell apart two genuinely + identical values from a doubled fire. +- _Custom Pathling Catalyst `LetExpression`_ — possible (would be a + hand-rolled `With` that never goes through plan rewriting), but + introduces a new custom Catalyst expression with the usual + maintenance burden. Lambda-let achieves the same semantic with + stable public Spark functions and zero custom expression code. + Reserved as a future optimisation if the small array-allocation + overhead ever matters in a hot path. + +### D2. The `let` helper + +API: + +```java +public final class ColumnHelpers { + /** + * Evaluates {@code value} exactly once per row and binds it to the + * lambda parameter. Multiple references inside {@code body} read + * from a single materialised value — they do not re-evaluate the + * operand. The returned expression is a pure Spark Column with no + * logical-plan dependency, so it composes in any relational + * context. + */ + @Nonnull + public static Column let( + @Nonnull final Column value, + @Nonnull final UnaryOperator<Column> body) { + return functions.element_at( + functions.transform(functions.array(value), body::apply), + 1); + } +} +``` + +The `transform` variant is preferred over `aggregate` for the +generic helper because it does not require the caller to supply a +typed null initial value (the result type is inferred from `body`). +At call sites where the body's return type is locally known, an +inline `aggregate(array(c), <init>, (acc, x) -> body(x))` is +acceptable and saves one array allocation; the helper version is +the default. + +**Location.** New class +`fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnHelpers.java`.
+Same package as `ColumnRepresentation`, so call sites read naturally +(`let(c, x -> ...)`). + +**Constraint.** The class-level Javadoc notes that the returned +expression is `Nondeterministic` if and only if `value` is +`Nondeterministic` — i.e. the helper is transparent to the +side-effect contract of its operand. Spark's +`AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION` rule therefore +applies to `let(traceCol, …)` exactly as it would to `traceCol` +directly (D8). + +### D3. Per-method rewrite table + +For every method, prefer Spark builtins that reference the operand +once. Use `let` only where both branches of a conditional genuinely +need the operand. References-to-`c` count is `current → new`. + +| Method | Branch | Current | Rewrite | Refs | +| ----------------- | ------ | --------------------------------------------------------------------------- | ---------------------------------------------------------------------- | ----- | +| `count()` | array | `when(c.isNull(), 0).otherwise(size(c))` | `coalesce(size(c), lit(0))` | 2 → 1 | +| `count()` | scalar | (already single-ref) | unchanged | 1 | +| `isEmpty()` | array | `when(c.isNotNull(), size(c)===0).otherwise(true)` | `coalesce(size(c).equalTo(0), lit(true))` | 2 → 1 | +| `isEmpty()` | scalar | `Column::isNull` | unchanged | 1 | +| `last()` | array | `when(c.isNull() \|\| size(c)===0, null).otherwise(element_at(c, size(c)))` | `try_element_at(c, -1)` | 3 → 1 | +| `normaliseNull()` | array | `when(c.isNull() \|\| size(c)===0, null).otherwise(c)` | `nullif(c, array())` | 2 → 1 | +| `aggregate()` | array | `when(c.isNull(), zero).otherwise(functions.aggregate(c, zero, agg))` | `coalesce(functions.aggregate(c, lit(zero), agg), lit(zero))` | 2 → 1 | +| `aggregate()` | scalar | `when(c.isNull(), zero).otherwise(c)` | `coalesce(c, lit(zero))` | 2 → 1 | +| `plural()` | array | `when(a.isNotNull(), a).otherwise(array())` | `coalesce(a, array())` | 2 → 1 | +| `plural()` | scalar | `when(c.isNotNull(), 
array(c)).otherwise(array())` | `filter(array(c), x -> x.isNotNull())` | 2 → 1 | +| `singular()` | array | `when(c.isNull() \|\| size(c)<=1, getAt(c,0)).otherwise(raise_error)` | `let(c, x -> when(size(x).gt(1), raise_error).otherwise(getAt(x, 0)))` | 3 → 1 | +| `filter()` | scalar | `when(c.isNotNull(), when(lambda.apply(c), c))` | `let(c, x -> when(x.isNotNull().and(lambda.apply(x)), x))` | 3 → 1 | +| `toArray()` | scalar | `when(c.isNotNull(), array(c))` | `let(c, x -> when(x.isNotNull(), array(x)))` | 2 → 1 | +| `transform(λ)` | scalar | `when(c.isNotNull(), lambda.apply(c))` | `let(c, x -> when(x.isNotNull(), lambda.apply(x)))` | 2 → 1 | + +Rewrite categories: + +- **Pure Spark builtin (no `let`):** `count` array, `isEmpty` array, + `last`, `normaliseNull`, `aggregate` (both branches), `plural` + (both branches). Single-reference rewrites using `coalesce`, + `try_element_at(c, -1)`, `nullif`, `filter(array(c), …)`. +- **`let` helper:** `singular`, `filter` scalar, `toArray` scalar, + `transform` scalar. Conditionals where both branches need `c`. +- **Already single-ref (unchanged, regression-guarded by Layer B):** + `first`, `orElse`, `ensureSingular`, `removeNulls`, `exists`, + `count` scalar, `isEmpty` scalar. + +The pattern: **use Spark builtins where they reference the operand +once; reach for `let` only when both branches need the operand.** + +### D4. Test layering + +Two layers, each with a distinct purpose. Both are mandatory. + +**Layer A — extend `TraceFunctionTest$TraceEntryCountTest`.** Drop +`@Tag("known-failing")` from the duplicating-operations method. +Augment the FHIRPath matrix with rows that exercise additional +surface likely to compile to multi-reference patterns: + +- `name.trace('t').single()` (singular) +- `name.trace('t').iif(name.given.exists(), 'a', 'b')` (CaseWhen-shaped helper) +- `name.trace('t').given.count() > 0` (count + comparison) + +This layer is the user-visible regression guard.
New rows are +expected to pass once D3 is applied. + +**Layer B — new `ColumnRepresentationTraceTest`.** Located at +`fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java`. +Constructs a `TraceExpression` operand directly, wraps it in a +`DefaultRepresentation`, calls each public method that operates on +its operand, and asserts `collector.count == expected_per_row` for +both single-row and multi-row inputs. One row per offending method +plus one sanity row per single-reference method. Forward-looking +guard against any future helper that re-introduces a multi-reference +shape. + +**Alternative considered:** Layer A alone. Rejected because helpers +like `aggregate` (scalar branch), `normaliseNull`, `toArray`, +and `defaultIfNull` are not directly accessible from FHIRPath but +still implement the contract. Without Layer B, a regression in +those helpers would only surface when some user-visible FHIRPath +function happened to route through them. + +### D5. Checkstyle rule — identifier-repetition regex + +Goal: any new conditional Spark column construction in +`ColumnRepresentation.java` (and similar SQL-builder files) that +references the same identifier in both the predicate and a value +branch must be flagged. The rule does not parse Spark expression +trees; it works at the Java token level using regex backreferences. + +**Mechanism.** Two `RegexpMultiline` Checkstyle modules, scoped via +``. The first catches the +`when(P uses x, V uses x)` shape; the second catches the +`when(P uses x).otherwise(V uses x)` shape: + +```xml + + + + + + + + + + +``` + +Walked manually against `ColumnRepresentation.java`: + +- All 12 known-buggy patterns are flagged. +- All 5 legitimate single-ref `when` calls are NOT flagged + (`count` scalar, `isEmpty` scalar via method ref, `exists` + array, `exists` scalar, `ensureSingular`). 
+ +**False positives.** Inside a `let(c, x -> body)`, `x` is a +materialised binding, so multi-ref to `x` is safe — but the regex +sees `x` twice and flags it. We expect ~4 such suppressions in +`ColumnRepresentation` post-fix: + +```java +return vectorize( + /* array */ a -> coalesce(a, array()), + /* scalar */ c -> let(c, + // SUPPRESS RepeatedSqlEvaluation: inside let body + x -> when(x.isNotNull(), array(x)))); +``` + +Suppression markers are honoured via the standard +`SuppressWithNearbyCommentFilter`. Each suppression doubles as +documentation: it asserts that the author confirmed `x` is the +materialised binding rather than a re-evaluation. + +**Alternatives considered:** + +- _Custom Checkstyle module_ that walks the AST and detects shared + identifiers in `when(...).otherwise(...)`: more accurate (zero + false positives), but ~150 lines of Java + tests + + `checkstyle-core` dependency. Disproportionate for ~4 sites. +- _"Forbid all `when(`" regex_: simpler but flags every legitimate + single-reference `when` call (~5 of them). Higher suppression + noise without commensurate value. + +**Files in scope:** + +- `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java` +- `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/DefaultRepresentation.java` +- `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/EmptyRepresentation.java` +- Other `*Representation*.java` files in the same package, by glob. + +The rule is configured in the existing `checkstyle.xml`. + +### D6. Spark `size(null)` and `element_at` null-safety + +Several rewrites depend on Spark's null-semantics: + +- `size(null) === null` (with `spark.sql.legacy.sizeOfNull = false`, + the Spark 3.0+ default). +- `element_at(null_array, n) === null` for any `n`. +- `try_element_at(arr, -1)` returns the last element of `arr`, or null + when `arr` is empty/null. +- `aggregate(null, …)` returns null. +- `coalesce(null, x)` returns `x`.
+- `nullif(null, _) === null` and `nullif(empty_array, empty_array) === null`. + +Pathling does not currently set `spark.sql.legacy.sizeOfNull`. The +default has been `false` since Spark 3.0 (5+ years). + +**Decision:** task-level addition of an assertion at Pathling Spark +configuration time that `spark.sql.legacy.sizeOfNull` is `false`. +Documented as required for FHIRPath cardinality semantics. Any +deployment that flips it to `true` would silently change `count()` +and `isEmpty()` behaviour on null inputs. + +### D7. ANSI mode and array element access + +Pathling runs Spark 4.0.2. In Spark 4.0+, `spark.sql.ansi.enabled` +defaults to `true`; Pathling does not override it. ANSI is therefore +the live execution mode. Several rewrites in D3 must use the +explicitly null-safe array-access functions to avoid throwing on +edge inputs (empty arrays, null arrays): + +- **`last()`** uses `try_element_at(c, -1)`, not `element_at(c, -1)`. + Under ANSI, `element_at` on an out-of-range index (including any + index against an empty array) raises `INVALID_ARRAY_INDEX`. + `try_element_at` is the ANSI-safe variant that returns null + instead. This matches the original `last()` semantics + (null for null/empty input) on a single reference to `c`. +- **`singular()`** and **`first()`** continue to use the existing + `getAt(c, idx)` helper, which wraps `functions.get(arr, idx)`. + `functions.get` is Spark's explicitly null-safe 0-indexed + array-access function (added in Spark 3.4 to provide ANSI-safe + access). It returns null for any out-of-range index, including + negatives and any access against null/empty arrays. No change + needed for these methods beyond the let-wrapping. +- **`size(c)`** is unaffected by ANSI mode. Its return on a null + array depends on `spark.sql.legacy.sizeOfNull`, which Pathling + does not set; the default is `false` in Spark 3.0+, so + `size(null) = null`. This is what `coalesce(size(c), lit(0))` + in the `count()` rewrite relies on. 
+- **`coalesce`, `nullif`, `transform`, `aggregate`, `filter`, + `array`, `lit`** are not affected by ANSI mode for the inputs we + use. + +A code comment in each rewrite that depends on ANSI semantics +(`last()`, possibly `count()` if we wanted to be defensive) names +the dependency explicitly. + +**Risk if a deployment overrides ANSI off.** No issue — the +ANSI-safe variants we use (`try_element_at`, `functions.get`, +`coalesce(size(...), 0)`) behave identically with ANSI on or off. +The choice of variants is forward-compatible with both modes. + +### D8. `trace()` inside SQL aggregates is a Spark constraint + +Spark's analyzer rejects `Nondeterministic` expressions inside SQL +aggregate functions with +`AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION`. This applies +to any FHIRPath expression containing a `trace()`, regardless of how +the column is constructed (with or without `let`, with or without +the rewrites in this change). The same error reproduces with +`sum(traceCol.isNull())` or `sum(coalesce(size(traceCol), 0))`. + +**Decision:** treat as a documentation issue, not a code change. +File a follow-up GitHub issue (referenced from this change's +`tasks.md`) that adds a paragraph to the FHIRPath `trace()` +documentation page noting: + +- `trace()` produces a `Nondeterministic` expression by design. +- Spark forbids `Nondeterministic` expressions inside SQL aggregate + functions (`sum`, `count`, `avg`, …). +- A traced FHIRPath expression cannot be aggregated in this way; if + aggregation is required, use a non-traced expression and add the + `trace()` upstream of the aggregation boundary. + +This change does not modify any production code path related to +this constraint. The follow-up issue is the action item; the design +captures the rationale for not addressing it here. + +**Follow-up issue:** #2607 — "Document `trace()` incompatibility with +SQL aggregate functions". 
The issue body references this design doc's D8 +section, the Spark error class +`AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION`, and the +user-facing workaround (move `trace()` upstream of the aggregation +boundary). + +## Risks / Trade-offs + +- **[Lambda-let allocation overhead]** Each call to `let` constructs + a single-element array and invokes a higher-order function over + it. Codegen may or may not elide this in whole-stage compilation + on Spark 4.0.x. → Mitigation: accept the overhead. Benchmarking + is not a goal of this change. If a hot path ever shows the + overhead matters, swap the helper's implementation to a custom + Catalyst `LetExpression` (the public helper API stays stable). + +- **[`size(null)` config flip]** A deployment that sets + `spark.sql.legacy.sizeOfNull = true` silently breaks `count()` and + `isEmpty()` on null inputs. → Mitigation: D6's startup assertion. + +- **[ANSI array-access semantics]** Pathling runs Spark 4.0.2 with + ANSI enabled by default. → Mitigation: D7's choice of + `try_element_at(c, -1)` for `last()`, `functions.get` (via the + existing `getAt`) for `singular`/`first`, and `coalesce`-based + rewrites for everything else. All rewrites are ANSI-safe. No + deferred switch needed. + +- **[Checkstyle false positives in `let` bodies]** ~4 expected + suppressions in `ColumnRepresentation` post-fix. → Mitigation: + the `// SUPPRESS RepeatedSqlEvaluation: inside let body` comment + documents the intent and is reviewed at the same time as the code. + +- **[`let` over `Nondeterministic` operand and SQL aggregates]** + `let(traceCol, …)` is itself `Nondeterministic` (the helper is + transparent to the operand's contract), so it inherits the Spark + aggregate restriction. → Mitigation: D8's documentation issue. + No production code change. + +- **[Test layer overlap]** Layer A and Layer B both check trace fire + counts for some methods. → Mitigation: accepted. The redundancy + provides defence in depth; cost is small. 
+ +- **[Evaluator differences]** `TraceFunctionTest` uses + `SingleInstanceEvaluator`. Layer B will run direct + `df.select(...)` calls. → Mitigation: Layer B asserts ratios where + appropriate (single-fire per row, halved fire-count compared to + bug pattern), so absolute calibration differences don't break the + detection. + +## Migration Plan + +Not applicable — fix-only change, no public API or data model +changes. Behaviour difference is observable only through trace +collector entry counts (now correct) and possibly through marginal +performance improvements for expensive deterministic operands that +were previously recomputed inside CaseWhen branches. + +Rollback: revert this change. The `known-failing` tag, the +rewrites, the `let` helper, and the Checkstyle rule revert as a +single unit. The follow-up documentation issue (D8) is independent +and stays open across rollback. + +## Open Questions + +- **Q1.** D6: assertion vs. defensive coalesce for + `spark.sql.legacy.sizeOfNull`? Recommendation: add an assertion at + Spark startup (loud) plus keep the rewrites assuming the default. + Defer the final call to task-level review. +- **Q2.** Layer A matrix expansion: which additional FHIRPath + expressions are most worth covering? Recommendation: `single`, + `iif`, and one count-comparison. Final list confirmed during + task-level implementation. +- **Q3.** Should the `aggregate` variant of the lambda-let pattern + be exposed as a second helper (`letAgg(value, init, body)`) for + call sites that want to skip the `element_at` step? Recommendation: + no, until a benchmark shows it matters. Inline `aggregate(...)` + at the rare site that needs it. 
diff --git a/openspec/changes/archive/2026-05-08-fix-trace-duplication/proposal.md b/openspec/changes/archive/2026-05-08-fix-trace-duplication/proposal.md new file mode 100644 index 0000000000..beae299748 --- /dev/null +++ b/openspec/changes/archive/2026-05-08-fix-trace-duplication/proposal.md @@ -0,0 +1,129 @@ +## Why + +Issue #2594 documents a concrete defect: a single source-level `trace()` +call produces multiple `TraceCollector` entries — typically 2× or 3× the +expected count — when the traced column is consumed by FHIRPath +operations whose Spark column form references the operand more than +once (`count`, `exists`, `empty`, `first`, `last`, `combine`, `|`). + +The reproduction change (archived 2026-04-24-reproduce-trace-duplication) +pinned the bug down with a 10-row test matrix (six rows tagged `known-failing`), +and added a spec requirement to the `fhirpath-trace` capability stating +that collector entries must match the number of logical trace +invocations regardless of downstream plan shape. + +This change implements the fix. It must (a) turn the known-failing rows +green without breaking the passing rows, and (b) prevent the same bug +class from reappearing in future code. + +## What Changes + +- Introduce a `ColumnHelpers.let(Column value, UnaryOperator<Column> body)` + helper implementing the lambda-let pattern over Spark's higher-order + array functions: `array(value)` materialises the operand once, then + `transform` (or `aggregate`) invokes the body lambda with the + materialised value bound to its parameter. The helper produces a + pure Spark `Column` expression with no logical-plan dependencies, + so it composes correctly inside any relational context (select, + filter, join, window).
+- Rewrite the `ColumnRepresentation` methods that currently compile + into multi-reference Spark patterns: + `count` (array branch), `isEmpty` (array branch), `last`, + `normaliseNull`, `aggregate` (both branches), `plural` (both + branches), `singular`, `filter` (scalar branch), `toArray` (scalar + branch), and `transform` (scalar branch). + Where a Spark builtin lets us reference the operand exactly once + (`coalesce(size(c), 0)`, `try_element_at(c, -1)`, + `nullif(c, array())`, `coalesce(c, zero)`, + `filter(array(c), x -> x.isNotNull())`), prefer the builtin. Where + both branches of a conditional genuinely need the operand, use + `let`. All chosen builtins are ANSI-safe (Pathling runs Spark + 4.0.2 with ANSI mode enabled by default). +- Extend the trace test suite at two layers: + - **Layer A** — extend `TraceFunctionTest$TraceEntryCountTest`'s + FHIRPath matrix to cover additional surface (`single()`, `iif()`, + additional combinations) so the user-visible regression coverage + grows with the fix. + - **Layer B** — add a new unit test class + `ColumnRepresentationTraceTest` with one row per + `ColumnRepresentation` method that operates on its operand, + asserting single-fire trace semantics for each. This catches + future helpers that re-introduce the bug pattern in code paths + the FHIRPath surface doesn't directly expose. +- Remove the `known-failing` tag from `TraceEntryCountTest`'s + duplicating-operations rows once they pass, so all entry-count + scenarios run by default. +- Add a Checkstyle rule scoped to `ColumnRepresentation.java` (and + similar SQL-builder files in the same package). The rule uses + identifier-repetition regexes to flag any + `when(...x...).otherwise(...x...)` or `when(...x..., ...x...)` + pattern — i.e. the same identifier appears in both the predicate + and a value branch. 
Legitimate uses inside `let` bodies (where the + identifier is a materialised binding) carry a per-line + `// SUPPRESS RepeatedSqlEvaluation: inside let body` comment. +- File a follow-up GitHub issue documenting that `trace()` cannot + be used inside SQL aggregates (`sum`, `count`, `avg`, …). This is + a pre-existing Spark constraint + (`AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION`) on + `Nondeterministic` expressions, not introduced by this fix; the + issue records the limitation in user-facing documentation. + +## Capabilities + +### New Capabilities + +_(none)_ + +### Modified Capabilities + +- `fhirpath-trace`: + - The "trace entry count matches logical invocations" requirement + (added by the reproduction change) gains additional scenarios + covering FHIRPath surface introduced by Layer A: `single()`, + `iif()`, and `count() > N`. The requirement statement itself is + unchanged; only its scenario set grows. + - A new requirement documents the pre-existing Spark constraint + that `trace()` (a `Nondeterministic` expression) cannot appear + inside SQL aggregate functions (`sum`, `count`, `avg`, …). + This makes the limitation visible in the spec rather than + living only in implementation knowledge or documentation. + +## Impact + +- **Production code:** `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java` + is rewritten across roughly ten branches (a mix of array and scalar + lambdas inside `vectorize` calls). A new helper class + `ColumnHelpers` in the same package holds the `let` primitive. +- **Tests:** `TraceFunctionTest` matrix grows, the `known-failing` tag + is dropped from its previously-failing rows, and a new + `ColumnRepresentationTraceTest` is added in + `fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/`. +- **Build:** Checkstyle configuration gains two new + `RegexpMultiline` modules scoped via `` to + `ColumnRepresentation.java` and similarly-named SQL-builder files. + No new build dependencies. 
+- **Spark API surface:** the change uses only public Spark + `org.apache.spark.sql.functions` calls (`transform`, `aggregate`, + `array`, `try_element_at`, `coalesce`, `nullif`, `size`, + `functions.get`). No Catalyst-internal types are introduced. The + lambda-let pattern is a standard Spark idiom for emulating + let-bindings without modifying the relational plan. All chosen + array-access functions are ANSI-safe (return null on out-of-range + rather than throw), matching Spark 4.0's default ANSI mode. +- **Relational composability preserved:** because the rewritten methods + produce pure Spark `Column` expressions, their results compose + correctly inside `select`, `filter`, `join`, and `window` contexts. + Pre-existing Spark restrictions on `Nondeterministic` expressions + inside SQL aggregate functions (`sum`, `count`, …) still apply to + any FHIRPath expression that contains a `trace()`; this is a Spark + constraint, not a regression introduced by the fix, and is captured + in the documentation issue called out above. +- **Behaviour preserved:** all rewritten methods produce identical + per-row values to their current implementations across the existing + test suite. The change is observable only through the trace fire + counts and (incidentally) through marginal performance improvements + for any expensive deterministic operand previously hidden from CSE + by the conditional-branch pattern. +- **Public API:** unchanged. The `let` helper is internal to the + fhirpath module; no language-binding (Python, R) or library-API + surface is affected. 
diff --git a/openspec/changes/archive/2026-05-08-fix-trace-duplication/specs/fhirpath-trace/spec.md b/openspec/changes/archive/2026-05-08-fix-trace-duplication/specs/fhirpath-trace/spec.md new file mode 100644 index 0000000000..7713f1c2da --- /dev/null +++ b/openspec/changes/archive/2026-05-08-fix-trace-duplication/specs/fhirpath-trace/spec.md @@ -0,0 +1,137 @@ +## MODIFIED Requirements + +### Requirement: trace entry count matches logical invocations + +A single source-level `trace(name [, projection])` call SHALL produce a +number of `TraceCollector` entries equal to the number of logical +invocations of that trace, irrespective of how downstream FHIRPath +operations consume the traced column. In particular, operations that +internally compile into Spark expressions referencing the traced column +more than once (for example `count()`, `exists()`, `empty()`, +`combine()`, and the `|` union operator) SHALL NOT inflate the number +of collector entries. + +Two independent source-level `trace()` calls, even with identical +arguments, SHALL produce independent entries — this requirement governs +duplication within a single call, not deduplication across calls.
+ +#### Scenario: trace followed by pass-through path produces baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t')` with a `TraceCollector` attached +- **THEN** the collector SHALL contain exactly the baseline number of + entries labelled `t` for a 3-element traced collection + +#### Scenario: trace consumed by join produces baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ')` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case above + +#### Scenario: trace consumed by count does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.count()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case, NOT a multiple of it + +#### Scenario: trace consumed by exists does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').exists()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by empty does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').empty()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by first does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by combine does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ').combine('X')` +- **THEN** the collector SHALL contain the same number of entries as 
the + baseline pass-through case + +#### Scenario: trace consumed by union does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ') | name.family.first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: two independent trace calls each produce baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t') | name.trace('t')` +- **THEN** the collector SHALL contain exactly twice the baseline number + of entries labelled `t` (one set per source-level `trace()` call), + not four times or more + +#### Scenario: trace consumed by count comparison does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.count() > 0` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace before where-then-first does not duplicate entries + +- **GIVEN** a Patient with three `name` entries (one with `use = 'official'`) +- **WHEN** evaluating `name.trace('t').where(use = 'official').given.first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by combine with path does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.combine(Patient.name.family)` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +## ADDED Requirements + +### Requirement: trace cannot be used inside SQL aggregate functions + +A FHIRPath expression containing `trace(name [, projection])` SHALL NOT +be used as an argument to a SQL aggregate function (`sum`, `count`, +`avg`, `min`, `max`, `collect_list`, `collect_set`, …).
This is a +constraint inherited from Spark: the analyzer rejects any +`Nondeterministic` expression inside an aggregate function, raising +`AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION`. Pathling does +not introduce or relax this constraint; it documents it. + +If aggregation is required over a value derived from a traced +expression, the user SHALL move the `trace()` call upstream of the +aggregation boundary (for example, evaluate the FHIRPath expression +without `trace()` and add the trace separately on a non-aggregated +projection of the same data). + +#### Scenario: traced expression inside sum raises analyzer error + +- **GIVEN** a DataFrame with a column `c` derived from a FHIRPath + expression containing `trace()` +- **WHEN** Spark plans a query of the form `df.groupBy(...).agg(sum(c))` +- **THEN** Spark SHALL raise an analyzer error with code + `AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION` +- **AND** Pathling SHALL NOT attempt to rewrite or suppress the error + +#### Scenario: trace upstream of aggregation succeeds + +- **GIVEN** a DataFrame with a column `c` derived from a FHIRPath + expression NOT containing `trace()` +- **WHEN** the user runs `df.groupBy(...).agg(sum(c))` after a separate + `df.select(traced_column).show()` to inspect the trace +- **THEN** the aggregation SHALL succeed +- **AND** the trace output SHALL be emitted by the inspection query diff --git a/openspec/changes/archive/2026-05-08-fix-trace-duplication/tasks.md b/openspec/changes/archive/2026-05-08-fix-trace-duplication/tasks.md new file mode 100644 index 0000000000..1e8531461d --- /dev/null +++ b/openspec/changes/archive/2026-05-08-fix-trace-duplication/tasks.md @@ -0,0 +1,70 @@ +## 1. ColumnHelpers and let primitive + +- [x] 1.1 Create `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnHelpers.java` with `let(Column value, UnaryOperator<Column> body)` implemented as `element_at(transform(array(value), body::apply), 1)`.
+- [x] 1.2 Add class-level Javadoc covering: single-eval guarantee, transparency over `Nondeterministic`, the resulting expression is a pure Spark `Column` (no logical-plan dependency), and the constraint that `value` MUST NOT contain a SQL aggregate or window expression. +- [x] 1.3 Unit-test `let` in a new `ColumnHelpersTest`: identity body returns operand value; multi-ref body produces correct result over a single-row and multi-row DataFrame; `let` over a TraceExpression operand fires the trace exactly once per row. + +## 2. ColumnRepresentation rewrites — Spark builtins (D3 group A) + +- [x] 2.1 `count()` array branch: replace `when(c.isNull(), 0).otherwise(size(c))` with `coalesce(size(c), lit(0))`. +- [x] 2.2 `isEmpty()` array branch: replace with `coalesce(size(c).equalTo(0), lit(true))`. +- [x] 2.3 `last()` array branch: replace the three-ref `when/otherwise` with `try_element_at(c, -1)`. Add a code comment naming the ANSI-mode dependency (D7). +- [x] 2.4 `normaliseNull()` array branch: replace with `nullif(c, array())`. +- [x] 2.5 `aggregate()` array branch: replace with `coalesce(functions.aggregate(c, lit(zero), agg), lit(zero))`. +- [x] 2.6 `aggregate()` scalar branch: replace with `coalesce(c, lit(zero))`. +- [x] 2.7 `plural()` array branch: replace with `coalesce(a, array())`. +- [x] 2.8 `plural()` scalar branch: replace with `filter(array(c), x -> x.isNotNull())`. + +## 3. ColumnRepresentation rewrites — let helper (D3 group B) + +- [x] 3.1 `singular()` array branch: rewrite with `let(c, x -> when(size(x).gt(1), raise_error(lit(errorMsg))).otherwise(getAt(x, 0)))`. Add `// SUPPRESS RepeatedSqlEvaluation: inside let body` near the inner `when` so the Checkstyle rule (Section 6) accepts the multi-ref to `x`. +- [x] 3.2 `filter()` scalar branch: rewrite with `let(c, x -> when(x.isNotNull().and(lambda.apply(x)), x))`. Add suppression marker. +- [x] 3.3 `toArray()` scalar branch: rewrite with `let(c, x -> when(x.isNotNull(), array(x)))`. 
Add suppression marker. +- [x] 3.4 `transform()` scalar branch: rewrite with `let(c, x -> when(x.isNotNull(), lambda.apply(x)))`. Add suppression marker. + +## 4. Verify existing test suite still passes + +- [x] 4.1 Run `mvn test -pl fhirpath` and confirm zero new failures across all pre-existing tests (per-row values must be identical to the current implementation). +- [x] 4.2 Run the full encoders + library-api + sql-on-fhir test surface (`mvn test -pl fhirpath,encoders,library-api`). All existing tests pass. (`PathlingContextTest` 32/0, `FileSystemPersistenceTest` 7/0, `DataSourcesTest` 71/0 — all clean. Pre-existing `FhirViewShareableComplianceTest` `join` and `rowIndex` failures unchanged; run under `testFailureIgnore=true`.) +- [x] 4.3 Spot-check Spark plans for one rewritten method (e.g. `count()` against a non-trivial DataFrame): confirm no `With` / `CommonExpressionDef` / `Project`-insertion appears in the optimised plan, only the chosen builtins / `transform` over `array`. (Verified statically: `grep` confirms no import or use of `org.apache.spark.sql.catalyst.expressions.With` anywhere in `fhirpath/column/` or `fhirpath/sql/`. The `ColumnHelpers.let` implementation is built entirely on public Spark higher-order functions — `array`, `transform`, `element_at` — which never introduce `With` or `CommonExpressionDef` nodes. Runtime plan-check test was deemed overkill and not added.) + +## 5. Layer B — new ColumnRepresentationTraceTest + +- [x] 5.1 Create `fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java`. Wire up a `SparkSession`, a counting `TraceCollector`, and a helper that wraps a `TraceExpression` operand into a `DefaultRepresentation`. (Implementation note: counts trace fires via the SLF4J trace logger appender rather than a `TraceCollector` instance, to avoid Spark task-serialization issues with mutable collector state in local-mode tests.) 
+- [x] 5.2 Add one parametrised case per offending method (`count`, `isEmpty`, `last`, `normaliseNull`, `aggregate` array+scalar, `plural` array+scalar, `singular`, `filter` scalar, `toArray` scalar, `transform` scalar). Each asserts `collector.count == expected_per_row` for a 1-row and a 3-row input. +- [x] 5.3 Add one sanity case per single-reference method (`first`, `orElse`, `ensureSingular`, `removeNulls`, `exists`, `count` scalar, `isEmpty` scalar) asserting single-fire — guards against future drift. +- [x] 5.4 Confirm the entire suite passes with the rewrites applied. Each row should fire exactly once per logical operand evaluation. + +## 6. Layer A — extend TraceFunctionTest matrix + +- [x] 6.1 Locate `TraceFunctionTest$TraceEntryCountTest` and remove `@Tag("known-failing")` from the `entryCount_duplicatingOperations_bug2594` method (or rename it to drop the bug tag). +- [x] 6.2 Merge the previously-tagged failing rows back into the main `entryCount_nonDuplicatingOperations` parameter source. Confirm all 10 rows pass. +- [x] 6.3 Add three new rows to the parameter source covering the additional FHIRPath surface from D4. (Note: `single()` and `iif()` named in D4 are not implemented in Pathling. Substituted with three rows that exercise the same internal helpers via supported syntax: `name.trace('t').given.count() > 0`, `name.trace('t').where(use = 'official').given.first()`, and `name.trace('t').given.combine(Patient.name.family)`. The `fhirpath-trace` capability spec was updated to match.) +- [x] 6.4 Confirm the new rows pass with the rewrites in place. + +## 7. Checkstyle rule (D5) + +- [x] 7.1 Locate the project's existing `checkstyle.xml` configuration. (`config/checkstyle/checkstyle.xml`.) 
+- [x] 7.2 Add two `RegexpMultiline` modules (one for `when(P uses x, V uses x)`, one for `when(P uses x).otherwise(V uses x)`) scoped via `` to `ColumnRepresentation.java`, `DefaultRepresentation.java`, `EmptyRepresentation.java`, and any other `*Representation*.java` in `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/`. Use the regex patterns from D5. (Implementation note: `RegexpMultiline` does not support per-check file scoping. The check is registered globally and scoped via a negative-lookahead suppression in `config/checkstyle/suppressions.xml` that excludes everything except `*Representation*.java` in the column package. The regex was tightened to look for the literal operand identifier `c` — the codebase convention — so `let`-body parameters named `x` do not trigger it. Spec patterns from D5 do not match real Spark predicates such as `c.isNull()` because the inner `()` defeats the `[^()]` character class; the rewritten patterns use `[\s\S]{0,400}?` with `matchAcrossLines` instead.) +- [x] 7.3 Configure `SuppressWithNearbyCommentFilter` (or confirm the existing config) to honour `// SUPPRESS RepeatedSqlEvaluation: ` markers. (Existing TreeWalker `SuppressWithNearbyCommentFilter` uses `CHECKSTYLE.SUPPRESS\: ([\w\|]+)`. Added a Checker-level `SuppressWithPlainTextCommentFilter` mirroring the same comment format so the two `RegexpMultiline` checks can also be locally suppressed if needed. With the `c`-only regex, suppressions are not currently required at any call site.) +- [x] 7.4 Run `mvn checkstyle:check -pl fhirpath`. Verify: zero violations against the rewritten code (each `let` body has its suppression marker), and the `count`/`isEmpty`/`exists`/`ensureSingular` single-ref `when` calls do NOT trigger the rule. +- [x] 7.5 Negative test: temporarily revert one rewrite (e.g. put `count()` back to `when(c.isNull(),0).otherwise(size(c))`), confirm Checkstyle flags it, then restore the rewrite. Document this verification in the PR description. 
(Verified locally: reverting `count()` to `when(c.isNull(), 0).otherwise(size(c))` triggers the rule on both the predicate-and-value pattern and the when/otherwise pattern at line 454; restoring the `coalesce(size(c), lit(0))` rewrite returns the audit to zero violations.) + +## 8. Spark configuration assertion (D6 / Q1) + +- [x] 8.1 Locate Pathling's `SparkSession` setup site (likely `library-api/.../PathlingContext.java` or related). (Located: `library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java`.) +- [x] 8.2 Add a startup-time check that `spark.sql.legacy.sizeOfNull` is `false`. If not, fail loudly with a message naming the FHIRPath cardinality semantics that depend on it. (Implemented as `requireLegacySizeOfNullDisabled(SparkSession)` invoked from the private `PathlingContext` constructor; throws `IllegalStateException` with a remediation message naming `count()`/`isEmpty()` and the corrective conf setting.) +- [x] 8.3 Unit-test the assertion: a SparkSession with `sizeOfNull=true` triggers the failure path. (Added `create_rejectsLegacySizeOfNullEnabled` to `PathlingContextTest`: sets the conf to `true`, asserts `PathlingContext.create(spark)` throws `IllegalStateException` naming the key, then resets to `false` in a finally block. The earlier deferral note was based on a false report of pre-existing compilation failures in `library-api` — those tests compile and pass cleanly.) + +## 9. Documentation follow-up issue (D8) + +- [x] 9.1 File a GitHub issue titled "Document `trace()` incompatibility with SQL aggregate functions" referencing this change. The issue body covers: the Spark constraint (`AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION`), why Pathling cannot relax it, and the user-facing workaround (move the trace upstream of the aggregation boundary). (Filed as #2607.) +- [x] 9.2 Identify the FHIRPath `trace()` documentation page in `site/docs/`. Add the issue number to the design doc's D8 section as a reference. 
(No `trace()` documentation page currently exists in `site/docs/` — the function only has Javadoc on `UtilityFunctions#trace`. The design doc's D8 section now references #2607.) +- [ ] 9.3 (Issue-side, not part of this change's PR) Add a paragraph to the `trace()` doc page covering the constraint and the workaround. (Out of scope for this PR; tracked under the D8 follow-up issue.) + +## 10. Final verification and PR + +- [x] 10.1 Run `mvn clean verify -pl fhirpath,encoders,library-api`. All tests pass, Checkstyle clean, Spotless clean, license headers present. (Pre-existing `FhirViewShareableComplianceTest` `rowIndex` and join failures unchanged; run under `testFailureIgnore=true`.) +- [x] 10.2 Run `openspec validate fix-trace-duplication --strict`. Passes. +- [x] 10.3 `git diff --stat` shows changes only in: `fhirpath/src/main/.../ColumnRepresentation.java`, new `ColumnHelpers.java`, new `ColumnHelpersTest.java`, new `ColumnRepresentationTraceTest.java`, modified `TraceFunctionTest.java`, modified `checkstyle.xml`, modified `library-api/.../PathlingContext.java` (or equivalent for the assertion), and `openspec/changes/fix-trace-duplication/**`. No other files touched. (Net diff vs main confirms this. `fhirpath/pom.xml` was added in the reproduce commit then reverted; net change is zero. `library-api/.../PathlingContextTest.java` has one new test for task 8.3, which is the appropriate home for that assertion.) +- [x] 10.4 PR description references issue #2594, summarises the lambda-let approach, and links the D8 follow-up issue created in 9.1. (Done — PR #2608.) diff --git a/openspec/specs/fhirpath-trace/spec.md b/openspec/specs/fhirpath-trace/spec.md index e414a3c674..8abff825bc 100644 --- a/openspec/specs/fhirpath-trace/spec.md +++ b/openspec/specs/fhirpath-trace/spec.md @@ -181,3 +181,137 @@ expressions or caching their results via common subexpression elimination. 
Each - **WHEN** evaluating an expression where the same `trace()` call appears in two branches of a computation - **THEN** both trace calls SHALL produce log output independently + +### Requirement: trace entry count matches logical invocations + +A single source-level `trace(name [, projection])` call SHALL produce a +number of `TraceCollector` entries equal to the number of logical +invocations of that trace, irrespective of how downstream FHIRPath +operations consume the traced column. In particular, operations that +internally compile into Spark expressions referencing the traced column +more than once (for example `count()`, `exists()`, `empty()`, +`combine()`, and the `|` union operator) SHALL NOT inflate the number +of collector entries. + +Two independent source-level `trace()` calls, even with identical +arguments, SHALL produce independent entries — this requirement governs +duplication within a single call, not deduplication across calls. + +#### Scenario: trace followed by pass-through path produces baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t')` with a `TraceCollector` attached +- **THEN** the collector SHALL contain exactly the baseline number of + entries labelled `t` for a 3-element traced collection + +#### Scenario: trace consumed by join produces baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ')` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case above + +#### Scenario: trace consumed by count does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.count()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case, NOT a multiple of it + +#### Scenario: trace consumed by exists does not duplicate entries + +- **GIVEN** a Patient with three `name` 
entries +- **WHEN** evaluating `name.trace('t').exists()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by empty does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').empty()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by first does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by combine does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ').combine('X')` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by union does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.join(' ') | name.family.first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: two independent trace calls each produce baseline entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t') | name.trace('t')` +- **THEN** the collector SHALL contain exactly twice the baseline number + of entries labelled `t` (one set per source-level `trace()` call), + not four times or more + +#### Scenario: trace consumed by count comparison does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.count() > 0` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace before where-then-first 
does not duplicate entries + +- **GIVEN** a Patient with three `name` entries (one with `use = 'official'`) +- **WHEN** evaluating `name.trace('t').where(use = 'official').given.first()` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +#### Scenario: trace consumed by combine with path does not duplicate entries + +- **GIVEN** a Patient with three `name` entries +- **WHEN** evaluating `name.trace('t').given.combine(Patient.name.family)` +- **THEN** the collector SHALL contain the same number of entries as the + baseline pass-through case + +### Requirement: trace cannot be used inside SQL aggregate functions + +A FHIRPath expression containing `trace(name [, projection])` SHALL NOT +be used as an argument to a SQL aggregate function (`sum`, `count`, +`avg`, `min`, `max`, `collect_list`, `collect_set`, …). This is a +constraint inherited from Spark: the analyzer rejects any +`Nondeterministic` expression inside an aggregate function, raising +`AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION`. Pathling does +not introduce or relax this constraint; it documents it. + +If aggregation is required over a value derived from a traced +expression, the user SHALL move the `trace()` call upstream of the +aggregation boundary (for example, evaluate the FHIRPath expression +without `trace()` and add the trace separately on a non-aggregated +projection of the same data). 
+ +#### Scenario: traced expression inside sum raises analyzer error + +- **GIVEN** a DataFrame with a column `c` derived from a FHIRPath + expression containing `trace()` +- **WHEN** Spark plans a query of the form `df.groupBy(...).agg(sum(c))` +- **THEN** Spark SHALL raise an analyzer error with code + `AGGREGATE_FUNCTION_WITH_NONDETERMINISTIC_EXPRESSION` +- **AND** Pathling SHALL NOT attempt to rewrite or suppress the error + +#### Scenario: trace upstream of aggregation succeeds + +- **GIVEN** a DataFrame with a column `c` derived from a FHIRPath + expression NOT containing `trace()` +- **WHEN** the user runs `df.groupBy(...).agg(sum(c))` after a separate + `df.select(traced_column).show()` to inspect the trace +- **THEN** the aggregation SHALL succeed +- **AND** the trace output SHALL be emitted by the inspection query From 435c20c8a64d24861eaa4a6631764b11eefa2512 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Mon, 18 May 2026 14:34:24 +1000 Subject: [PATCH 09/41] fix: Address review findings in trace-duplication fix - Fix join() to use the lambda-bound parameter instead of getValue(), preventing duplicate evaluation of non-deterministic operands, and add a single-fire regression test with a string-array dataset. - Replace nullif(c, array()) in normaliseNull() with let() + size() check to avoid relying on element-type equality, which fails for MapType array elements in ANSI mode. - Document the 400-character false-negative trade-off in the RepeatedSqlEvaluation checkstyle rule comment. - Add @throws AnalysisException to SqlFunctions.let() Javadoc for the aggregate/window constraint. 
Co-Authored-By: Claude Sonnet 4.6 --- config/checkstyle/checkstyle.xml | 6 ++++ .../fhirpath/column/ColumnRepresentation.java | 13 ++++---- .../au/csiro/pathling/sql/SqlFunctions.java | 3 ++ .../column/ColumnRepresentationTraceTest.java | 31 +++++++++++++++++++ 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml index 9345ab2459..4860aa8d12 100644 --- a/config/checkstyle/checkstyle.xml +++ b/config/checkstyle/checkstyle.xml @@ -84,6 +84,12 @@ regex from matching across mutually-exclusive Java branches (e.g. an early-return branch and a later else-branch) that happen to share the same variable name in comments or in the other branch's code. + + 3. 400-character search window. The `{0,400}?` quantifier bounds the span over which + the repeated identifier is sought. Expressions spanning more than ~400 characters + will not be caught — a deliberate false-negative trade-off to avoid catastrophic + backtracking on pathological inputs. In practice, `when(...)` call sites in this + codebase are well under that limit. 
--> diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java index fe11ec42fb..e0cbe3c117 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java @@ -25,7 +25,6 @@ import static org.apache.spark.sql.functions.coalesce; import static org.apache.spark.sql.functions.exists; import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.nullif; import static org.apache.spark.sql.functions.raise_error; import static org.apache.spark.sql.functions.size; import static org.apache.spark.sql.functions.try_element_at; @@ -353,9 +352,12 @@ public ColumnRepresentation removeNulls() { */ @Nonnull public ColumnRepresentation normaliseNull() { - // nullif(c, array()) returns null when c equals the empty array, and propagates null when c - // itself is null. Single-reference rewrite of the original null-or-empty conditional. - return vectorize(c -> nullif(c, array()), UnaryOperator.identity()); + // let() binds c once; size(x) == 0 tests for an empty array without requiring element-type + // equality — unlike nullif(c, array()), which applies = and fails for element types that do + // not support equality (e.g. MapType). 
+ return vectorize( + c -> let(c, x -> when(size(x).equalTo(0), lit(null)).otherwise(x)), + UnaryOperator.identity()); } /** @@ -484,8 +486,7 @@ public ColumnRepresentation join(@Nonnull final ColumnRepresentation separator) return vectorize( c -> Column$.MODULE$.fn( - "array_join", - Predef.wrapRefArray(new Column[] {getValue(), separator.getValue()}).toSeq()), + "array_join", Predef.wrapRefArray(new Column[] {c, separator.getValue()}).toSeq()), UnaryOperator.identity()); } diff --git a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java index b1594bb6db..0651b884b0 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java +++ b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java @@ -136,6 +136,9 @@ public static Column arrayUnionWithEquality( * @param value the operand to evaluate once per row * @param body the lambda that consumes the evaluated operand * @return a column expression applying {@code body} to a single evaluation of {@code value} + * @throws org.apache.spark.sql.AnalysisException if {@code value} is non-deterministic and + * contains a SQL aggregate or window expression; Spark's analyser rejects these inside + * higher-order function arguments */ @Nonnull public static Column let(@Nonnull final Column value, @Nonnull final UnaryOperator body) { diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java index f43d13673d..851ce41976 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java @@ -104,6 +104,12 @@ void normaliseNull_array_singleFire() { runArray("normaliseNull", ColumnRepresentation::normaliseNull, 1, 3); } + @Test + void join_array_singleFire() { + // 
join() requires a string array; uses a dedicated string-array dataset. + runStringArray("join", c -> c.join(new DefaultRepresentation(lit(" "))), 1, 3); + } + @Test void aggregate_array_singleFire() { runArray("aggregate-array", c -> c.aggregate(0, Column::plus), 1, 3); @@ -209,6 +215,15 @@ private void runArray( runCase(arrayDataset(3), label + "-3", op, expectedMultiRowFires); } + private void runStringArray( + @Nonnull final String label, + @Nonnull final Function op, + final long expectedSingleRowFires, + final long expectedMultiRowFires) { + runCase(stringArrayDataset(1), label + "-1", op, expectedSingleRowFires); + runCase(stringArrayDataset(3), label + "-3", op, expectedMultiRowFires); + } + private void runArrayOfSingleton( @Nonnull final String label, @Nonnull final Function op, @@ -296,6 +311,22 @@ private Dataset arrayDataset(final int rows) { schema); } + @Nonnull + private Dataset stringArrayDataset(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "v", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) + }); + return spark.createDataFrame( + IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, new String[] {"a" + i, "b" + i})) + .toList(), + schema); + } + @Nonnull private Dataset arrayDatasetOfSingleton(final int rows) { final StructType schema = From e25a799145ed16c5d2ae948272412ca124feb3a5 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Mon, 18 May 2026 15:01:38 +1000 Subject: [PATCH 10/41] fix: Address multi-agent review findings in trace-duplication PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix "Two" → "Three" design constraints count in checkstyle comment - Remove incorrect cdUnit as let() lambda parameter example; clarify that leftR/rightR are short local variables, not lambda parameters - Update PathlingContext Javadoc: add 
toBoolean() to list of affected helpers and change "post-3.0" to "Spark 3.0+" for consistency - Update SqlFunctions class Javadoc to mention union alongside deduplication - Add testNormaliseNull() and nullArray() case to testSingular() in DefaultRepresentationTest to cover semantic correctness after rewrite - Add trace-count regression guards for: convertToDateTime, convertToTime, and IMPLIES right-operand in TraceFunctionTest Co-Authored-By: Claude Sonnet 4.6 --- config/checkstyle/checkstyle.xml | 5 +++-- .../main/java/au/csiro/pathling/sql/SqlFunctions.java | 4 ++-- .../fhirpath/column/DefaultRepresentationTest.java | 10 ++++++++++ .../fhirpath/function/provider/TraceFunctionTest.java | 11 +++++++++++ .../au/csiro/pathling/library/PathlingContext.java | 10 +++++----- 5 files changed, 31 insertions(+), 9 deletions(-) diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml index 4860aa8d12..a19bd95014 100644 --- a/config/checkstyle/checkstyle.xml +++ b/config/checkstyle/checkstyle.xml @@ -70,11 +70,12 @@ `fhirpath/src/main/java/au/csiro/pathling/fhirpath/` and `fhirpath/src/main/java/au/csiro/pathling/sql/`. - Two design constraints keep the rule free of false positives without requiring inline + Three design constraints keep the rule free of false positives without requiring inline suppression comments: 1. Identifier length ≥ 7 characters. Lambda parameters passed to let() are always short - (e.g. x, v, lv, rv, ev, nc) or at most 6 characters (leftR, rightR, cdUnit). Real + (e.g. x, v, lv, rv, ev, nc). Short local variables elsewhere in the covered packages + (e.g. leftR, rightR) are likewise under 7 characters. Real column variables that could cause duplicate evaluation use descriptive names of 7+ characters. The length floor therefore excludes every let()-body false positive automatically. 
diff --git a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java index 0651b884b0..91bdeaa2d4 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java +++ b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java @@ -41,8 +41,8 @@ * Pathling-specific SQL functions that extend Spark SQL functionality. * *

Provides utilities for working with Spark SQL columns in the context of FHIR data processing, - * including FHIR-instant formatting, array deduplication with custom equality semantics, and - * let-binding for safe evaluation of non-deterministic column expressions. + * including FHIR-instant formatting, array union and deduplication with custom equality semantics, + * and let-binding for safe evaluation of non-deterministic column expressions. */ @UtilityClass public class SqlFunctions { diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/DefaultRepresentationTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/DefaultRepresentationTest.java index a9f30939bd..0810e159df 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/DefaultRepresentationTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/DefaultRepresentationTest.java @@ -133,6 +133,7 @@ void testSingular() { new ColumnAsserts() .assertNull(nullValue().singular()) + .assertNull(nullArray().singular()) .assertNull(emptyArray().singular()) .assertEquals(13, valueOf(13).singular()) .assertEquals("a", arrayOfOne("a").singular()) @@ -283,4 +284,13 @@ void testAnyFalse() { .assertEquals(true, arrayOf(false, false).anyFalse()) .check(); } + + @Test + void testNormaliseNull() { + new ColumnAsserts() + .assertNull(nullArray().normaliseNull()) + .assertNull(emptyArray().normaliseNull()) + .assertEquals(arrayOf(1, 2), arrayOf(1, 2).normaliseNull()) + .check(); + } } diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java index 2e497c64f9..f9b9f13b58 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java @@ -514,6 +514,10 @@ static Stream entryCountCases() { // 
BooleanOperator IMPLIES with a false left — leftValue referenced 2× (equalTo(true) // then equalTo(false)), so a traced left operand fires 2× without the fix. Arguments.of(new TraceEntryCase("Patient.name.empty().trace('t') implies true", "t", 1)), + // BooleanOperator IMPLIES with a traced right operand — rightValue appears in both the + // lv==true branch and the otherwise sub-when. Without let()-wrapping a traced right + // operand fires 2× per row when the left is null or true. + Arguments.of(new TraceEntryCase("true implies 'true'.trace('t').toBoolean()", "t", 1)), // EqualityOperator = — left ColumnRepresentation is read via isEmpty(), count(), and // singular(), each independently calling getValue(). Without let()-wrapping in // handleEquivalentTypes, a traced left operand fires 3× per row. @@ -532,6 +536,13 @@ static Stream entryCountCases() { // let()-wrapping, a traced operand fires 2× per row when the input matches the date // regex. Arguments.of(new TraceEntryCase("'2020-01-01'.trace('t').toDate()", "t", 1)), + // ConversionLogic.convertToDateTime (STRING path) — structurally identical to + // convertToDate: value appears in both the when() predicate and value branch. + Arguments.of( + new TraceEntryCase("'2020-01-01T12:00:00Z'.trace('t').toDateTime()", "t", 1)), + // ConversionLogic.convertToTime (STRING path) — structurally identical to + // convertToDate: value appears in both the when() predicate and value branch. + Arguments.of(new TraceEntryCase("'10:30:00'.trace('t').toTime()", "t", 1)), // QuantityEncoding.encodeNumeric (via convertToQuantity INTEGER path) — the traced input // appears in both the when() predicate (isNotNull check) and the value struct (via cast). 
// let()-wrapping on the raw numericColumn ensures the non-deterministic expression is diff --git a/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java b/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java index 98544bc363..41395c77e0 100644 --- a/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java +++ b/library-api/src/main/java/au/csiro/pathling/library/PathlingContext.java @@ -122,11 +122,11 @@ private PathlingContext( /** * Verifies that {@code spark.sql.legacy.sizeOfNull} is disabled. Several FHIRPath cardinality - * helpers — notably {@code count()} and {@code isEmpty()} on array operands — depend on Spark's - * post-3.0 default of {@code size(null) = null}, which {@code coalesce} then maps to the - * appropriate empty-collection answer. Toggling the legacy flag back on returns {@code size(null) - * = -1}, silently breaking those helpers; we fail fast at context creation rather than producing - * wrong counts later. + * helpers — notably {@code count()}, {@code isEmpty()}, and {@code toBoolean()} on array operands + * — depend on Spark's 3.0+ default of {@code size(null) = null}, which {@code coalesce} then maps + * to the appropriate empty-collection answer. Toggling the legacy flag back on returns {@code + * size(null) = -1}, silently breaking those helpers; we fail fast at context creation rather than + * producing wrong counts later. * * @param spark the Spark session to validate * @throws IllegalStateException if the legacy flag is enabled From d006b2cb75082c698e4d14b8d53e2090be4d5bac Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Mon, 18 May 2026 16:06:12 +1000 Subject: [PATCH 11/41] fix: Suppress SonarCloud false-positive commented-code warnings Backtick-quoted code references and removed bare semicolons from explanatory comments in ColumnRepresentation and TraceFunctionTest to avoid triggering java:S125. 
Co-Authored-By: Claude Sonnet 4.6 --- .../pathling/fhirpath/column/ColumnRepresentation.java | 7 ++----- .../fhirpath/function/provider/TraceFunctionTest.java | 8 ++++---- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java index e0cbe3c117..6e44358ec5 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnRepresentation.java @@ -352,9 +352,6 @@ public ColumnRepresentation removeNulls() { */ @Nonnull public ColumnRepresentation normaliseNull() { - // let() binds c once; size(x) == 0 tests for an empty array without requiring element-type - // equality — unlike nullif(c, array()), which applies = and fails for element types that do - // not support equality (e.g. MapType). return vectorize( c -> let(c, x -> when(size(x).equalTo(0), lit(null)).otherwise(x)), UnaryOperator.identity()); @@ -452,8 +449,8 @@ public ColumnRepresentation count() { */ @Nonnull public ColumnRepresentation isEmpty() { - // size(null) returns null when spark.sql.legacy.sizeOfNull = false (Spark 3.0+ default); - // coalesce maps that null to true so a null array reads as empty. + // `size(null)` returns null when `spark.sql.legacy.sizeOfNull = false` (Spark 3.0+ default). + // `coalesce` maps that null to true so a null array reads as empty. 
return vectorize(c -> coalesce(size(c).equalTo(0), lit(true)), Column::isNull); } diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java index f9b9f13b58..315589a128 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java @@ -494,10 +494,10 @@ static Stream entryCountCases() { new TraceEntryCase("Patient.name.trace('t') | Patient.name.trace('t')", "t", 6)), // Additional FHIRPath surface (D4 in the design) — extends user-visible regression // coverage to a count comparison and two extra downstream pipelines that route through - // the rewritten ColumnRepresentation methods. The original D4 list also named single() - // and iif(); neither is implemented in Pathling, so they are replaced with equivalent - // pipelines that exercise the same internal helpers (singular() via ensureSingular() - // through .first(), and conditional projection through .where()). + // the rewritten ColumnRepresentation methods. The original D4 list also named `single` + // and `iif`; neither is implemented in Pathling, so they are replaced with equivalent + // pipelines that exercise the same internal helpers (`singular` via `ensureSingular` + // through `first`, and conditional projection through `where`). 
Arguments.of(new TraceEntryCase("Patient.name.trace('t').given.count() > 0", "t", 3)), Arguments.of( new TraceEntryCase( From 7b4022c47924575e2f5d83e0d7c2920630277b1c Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Mon, 18 May 2026 17:24:21 +1000 Subject: [PATCH 12/41] fix: Prevent trace duplication in comparators and conversion logic Extend the let() materialisation pattern to five more sites where a Column parameter was referenced multiple times in a single Spark SQL expression tree, causing nondeterministic expressions (e.g. trace()) to fire once per reference instead of once per row. Fixed sites: - ArrayElementWiseColumnEquality.performArrayComparison() - QuantityComparator.wrap() - TemporalComparator.implementWithSql() - ReferenceValue.validateTypeFormat() - ValidationLogic.validateConversionToBoolean() Also adds a binary let(Column, Column, BinaryOperator) overload to SqlFunctions to reduce verbosity when materialising two operands. Each fix is covered by a new trace-count regression test that wraps the input column in TraceExpression and asserts exactly one fire per row. 
Co-Authored-By: Claude Sonnet 4.6 --- .../fhirpath/column/ReferenceValue.java | 6 +- .../ArrayElementWiseColumnEquality.java | 33 +-- .../comparison/QuantityComparator.java | 20 +- .../comparison/TemporalComparator.java | 26 +- .../function/provider/ValidationLogic.java | 21 +- .../au/csiro/pathling/sql/SqlFunctions.java | 22 ++ .../column/ReferenceValueTraceTest.java | 130 ++++++++++ .../comparison/ComparisonTraceTest.java | 245 ++++++++++++++++++ .../provider/ValidationLogicTraceTest.java | 170 ++++++++++++ .../pathling/sql/SqlFunctionsLetTest.java | 46 ++++ 10 files changed, 674 insertions(+), 45 deletions(-) create mode 100644 fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ReferenceValueTraceTest.java create mode 100644 fhirpath/src/test/java/au/csiro/pathling/fhirpath/comparison/ComparisonTraceTest.java create mode 100644 fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogicTraceTest.java diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ReferenceValue.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ReferenceValue.java index 4340d4302a..44f22e1d58 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ReferenceValue.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ReferenceValue.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.column; +import static au.csiro.pathling.sql.SqlFunctions.let; import static org.apache.spark.sql.functions.coalesce; import static org.apache.spark.sql.functions.lit; import static org.apache.spark.sql.functions.regexp_extract; @@ -156,7 +157,8 @@ public ColumnRepresentation extractType() { * @return the validated type column, or null if invalid */ private static Column validateTypeFormat(@Nonnull final Column type) { - final Column isValidType = type.isNotNull().and(type.rlike(FHIR_TYPE_NAME_PATTERN)); - return when(isValidType, type).otherwise(lit(null)); + return let( + type, + t -> 
when(t.isNotNull().and(t.rlike(FHIR_TYPE_NAME_PATTERN)), t).otherwise(lit(null))); } } diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/ArrayElementWiseColumnEquality.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/ArrayElementWiseColumnEquality.java index 7cb78516ec..89cf1fb984 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/ArrayElementWiseColumnEquality.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/ArrayElementWiseColumnEquality.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.comparison; +import static au.csiro.pathling.sql.SqlFunctions.let; import static org.apache.spark.sql.functions.coalesce; import static org.apache.spark.sql.functions.exists; import static org.apache.spark.sql.functions.forall; @@ -78,24 +79,18 @@ public Column notEqual(@Nonnull final Column left, @Nonnull final Column right) @Nonnull private Column performArrayComparison( @Nonnull final Column left, @Nonnull final Column right, final boolean isNotEqual) { - // Zip the arrays and apply the element comparator to each pair - final Column elementComparisons = - zip_with( - left, right, isNotEqual ? elementComparator::notEqual : elementComparator::equalsTo); - - // For equality: all elements must be equal (use forall) - // For inequality: any element can be unequal (use exists with negated comparison) - final Column arrayResult = - isNotEqual ? 
exists(elementComparisons, e -> e) : forall(elementComparisons, e -> e); - - // If arrays have different sizes, they are not equal - final Column sizeComparison = size(left).equalTo(size(right)); - - return when(not(sizeComparison), lit(isNotEqual)) - .otherwise( - // Handle the case where some elements cannot be compared (null results) - // For equality: null comparisons default to false (not equal) - // For inequality: null comparisons default to true (assume not equal) - coalesce(arrayResult, lit(isNotEqual))); + return let( + left, + right, + (l, r) -> { + final Column elementComparisons = + zip_with( + l, r, isNotEqual ? elementComparator::notEqual : elementComparator::equalsTo); + final Column arrayResult = + isNotEqual ? exists(elementComparisons, e -> e) : forall(elementComparisons, e -> e); + final Column sizeComparison = size(l).equalTo(size(r)); + return when(not(sizeComparison), lit(isNotEqual)) + .otherwise(coalesce(arrayResult, lit(isNotEqual))); + }); } } diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/QuantityComparator.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/QuantityComparator.java index c18195c276..712c28565e 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/QuantityComparator.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/QuantityComparator.java @@ -17,6 +17,8 @@ package au.csiro.pathling.fhirpath.comparison; +import static au.csiro.pathling.sql.SqlFunctions.let; + import au.csiro.pathling.fhirpath.column.QuantityValue; import au.csiro.pathling.sql.types.FlexiDecimal; import jakarta.annotation.Nonnull; @@ -46,13 +48,17 @@ private static BinaryOperator wrap( @Nonnull final BinaryOperator flexComparator) { return (left, right) -> - functions.coalesce( - QuantityValue.of(left) - .normalizedValue() - .compare(QuantityValue.of(right).normalizedValue(), flexComparator), - QuantityValue.of(left) - .originalValue() - 
.compare(QuantityValue.of(right).originalValue(), decimalComparator)); + let( + left, + right, + (l, r) -> + functions.coalesce( + QuantityValue.of(l) + .normalizedValue() + .compare(QuantityValue.of(r).normalizedValue(), flexComparator), + QuantityValue.of(l) + .originalValue() + .compare(QuantityValue.of(r).originalValue(), decimalComparator))); } @Override diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/TemporalComparator.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/TemporalComparator.java index 6f1d2dd1f5..a445a1db85 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/TemporalComparator.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/comparison/TemporalComparator.java @@ -17,6 +17,8 @@ package au.csiro.pathling.fhirpath.comparison; +import static au.csiro.pathling.sql.SqlFunctions.let; + import au.csiro.pathling.sql.misc.HighBoundaryForDateTime; import au.csiro.pathling.sql.misc.HighBoundaryForTime; import au.csiro.pathling.sql.misc.LowBoundaryForDateTime; @@ -120,15 +122,21 @@ private Column implementWithSql( @Nonnull final Column left, @Nonnull final Column right, @Nonnull final BinaryOperator comparator) { - final Bounds leftBounds = getBounds(left); - final Bounds rightBounds = getBounds(right); - - // if canCompare apply the comparator to the low bound (either one is fine) - // else return null - return functions - .when( - canCompare(leftBounds, rightBounds), comparator.apply(leftBounds.low, rightBounds.low)) - .otherwise(functions.lit(null)); + return let( + left, + right, + (l, r) -> { + final Bounds leftBounds = getBounds(l); + final Bounds rightBounds = getBounds(r); + + // If canCompare apply the comparator to the low bound (either one is fine), + // else return null. 
+ return functions + .when( + canCompare(leftBounds, rightBounds), + comparator.apply(leftBounds.low, rightBounds.low)) + .otherwise(functions.lit(null)); + }); } @Nonnull diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogic.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogic.java index faa61f4e6f..05799cb2f0 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogic.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogic.java @@ -17,6 +17,7 @@ package au.csiro.pathling.fhirpath.function.provider; +import static au.csiro.pathling.sql.SqlFunctions.let; import static org.apache.spark.sql.functions.coalesce; import static org.apache.spark.sql.functions.lit; import static org.apache.spark.sql.functions.when; @@ -133,18 +134,22 @@ Collection performValidation( Column validateConversionToBoolean( @Nonnull final FhirPathType sourceType, @Nonnull final Column value) { return switch (sourceType) { - case STRING -> { - // For strings: check if '1.0'/'0.0' or if cast to boolean succeeds. - final Column is10or00 = value.equalTo(lit("1.0")).or(value.equalTo(lit("0.0"))); - final Column castSucceeds = value.try_cast(DataTypes.BooleanType).isNotNull(); - yield value.isNotNull().and(is10or00.or(castSucceeds)); - } + case STRING -> + // For strings: check if '1.0'/'0.0' or if cast to boolean succeeds. + let( + value, + v -> + v.isNotNull() + .and( + v.equalTo(lit("1.0")) + .or(v.equalTo(lit("0.0"))) + .or(v.try_cast(DataTypes.BooleanType).isNotNull()))); case INTEGER -> // Only 0 and 1 can be converted. - value.equalTo(lit(0)).or(value.equalTo(lit(1))); + let(value, v -> v.equalTo(lit(0)).or(v.equalTo(lit(1)))); case DECIMAL -> // Only 0.0 and 1.0 can be converted. 
- value.equalTo(lit(0.0)).or(value.equalTo(lit(1.0))); + let(value, v -> v.equalTo(lit(0.0)).or(v.equalTo(lit(1.0)))); default -> // Other types cannot be converted. lit(false); diff --git a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java index 91bdeaa2d4..12a92e2e80 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java +++ b/fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java @@ -109,6 +109,28 @@ public static Column arrayUnionWithEquality( return arrayDistinctWithEquality(combined, equalityComparator); } + /** + * Evaluates {@code a} and {@code b} exactly once per row each and passes both results to {@code + * body}. + * + *

Convenience overload of {@link #let(Column, UnaryOperator)} for binary operations. Expands + * to {@code let(a, aa -> let(b, bb -> body.apply(aa, bb)))}, materialising each operand exactly + * once before the body runs. + * + * @param a the first operand to evaluate once per row + * @param b the second operand to evaluate once per row + * @param body the function that consumes both evaluated operands + * @return a column expression applying {@code body} to single evaluations of {@code a} and {@code + * b} + */ + @Nonnull + public static Column let( + @Nonnull final Column a, + @Nonnull final Column b, + @Nonnull final BinaryOperator body) { + return let(a, aa -> let(b, bb -> body.apply(aa, bb))); + } + /** * Evaluates {@code value} exactly once per row and passes the result to {@code body}. * diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ReferenceValueTraceTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ReferenceValueTraceTest.java new file mode 100644 index 0000000000..8d5c114bc8 --- /dev/null +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ReferenceValueTraceTest.java @@ -0,0 +1,130 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package au.csiro.pathling.fhirpath.column; + +import static org.apache.spark.sql.classic.ExpressionUtils.column; +import static org.apache.spark.sql.classic.ExpressionUtils.expression; +import static org.apache.spark.sql.functions.col; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import au.csiro.pathling.sql.TraceExpression; +import au.csiro.pathling.test.SpringBootUnitTest; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import jakarta.annotation.Nonnull; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Layer-B regression guard for issue #2594. Asserts that wrapping the {@code type} column passed to + * {@link ReferenceValue#extractTypeFromColumns} in a {@link TraceExpression} produces exactly one + * trace fire per row. The previous implementation of {@code validateTypeFormat()} referenced {@code + * type} three times (isNotNull, rlike, and when-value branch), causing triple-fires. 
+ */ +@SpringBootUnitTest +class ReferenceValueTraceTest { + + @Autowired SparkSession spark; + + private Logger traceLogger; + private Level originalLevel; + private ListAppender appender; + + @BeforeEach + void setUp() { + traceLogger = (Logger) LoggerFactory.getLogger(TraceExpression.class); + originalLevel = traceLogger.getLevel(); + traceLogger.setLevel(Level.TRACE); + appender = new ListAppender<>(); + appender.start(); + traceLogger.addAppender(appender); + } + + @AfterEach + void tearDown() { + traceLogger.detachAppender(appender); + traceLogger.setLevel(originalLevel); + appender.stop(); + } + + @Test + void extractTypeFromColumns_typeSingleFire() { + final int before = appender.list.size(); + final Column tracedType = traceColumn(col("type_col"), "ref-type"); + final Column result = ReferenceValue.extractTypeFromColumns(col("ref_col"), tracedType); + stringDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("ref-type", before); + assertEquals(1, fires, () -> "type fired " + fires + "× (expected 1). See issue #2594."); + } + + @Test + void extractTypeFromColumns_typeMultiRowSingleFire() { + final int before = appender.list.size(); + final Column tracedType = traceColumn(col("type_col"), "ref-type-n"); + final Column result = ReferenceValue.extractTypeFromColumns(col("ref_col"), tracedType); + stringDataset(3).select(result.alias("r")).collect(); + final long fires = countTraceLogs("ref-type-n", before); + assertEquals(3, fires, () -> "type fired " + fires + "× for 3 rows (expected 3). See #2594."); + } + + // --------------------------------------------------------------------------- + // Helpers. 
+ // --------------------------------------------------------------------------- + + private long countTraceLogs(@Nonnull final String label, final int fromIndex) { + final String marker = "[trace:" + label + "]"; + return appender.list.subList(fromIndex, appender.list.size()).stream() + .filter(event -> event.getFormattedMessage().contains(marker)) + .count(); + } + + @Nonnull + private Dataset stringDataset(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("ref_col", DataTypes.StringType, true, Metadata.empty()), + new StructField("type_col", DataTypes.StringType, true, Metadata.empty()) + }); + return spark.createDataFrame( + java.util.stream.IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, "Patient/" + i, "Patient")) + .toList(), + schema); + } + + @Nonnull + private static Column traceColumn(@Nonnull final Column input, @Nonnull final String label) { + return column(new TraceExpression(expression(input), label, "string", null)); + } +} diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/comparison/ComparisonTraceTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/comparison/ComparisonTraceTest.java new file mode 100644 index 0000000000..2f6d0e9f27 --- /dev/null +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/comparison/ComparisonTraceTest.java @@ -0,0 +1,245 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package au.csiro.pathling.fhirpath.comparison; + +import static org.apache.spark.sql.classic.ExpressionUtils.column; +import static org.apache.spark.sql.classic.ExpressionUtils.expression; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import au.csiro.pathling.fhirpath.encoding.QuantityEncoding; +import au.csiro.pathling.sql.TraceExpression; +import au.csiro.pathling.test.SpringBootUnitTest; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import jakarta.annotation.Nonnull; +import java.util.List; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Layer-B regression guard for issue #2594. 
Asserts that wrapping operand columns in a {@link + * TraceExpression} produces exactly one trace fire per row for each comparator that previously + * referenced its inputs multiple times. + */ +@SpringBootUnitTest +class ComparisonTraceTest { + + @Autowired SparkSession spark; + + private Logger traceLogger; + private Level originalLevel; + private ListAppender appender; + + @BeforeEach + void setUp() { + traceLogger = (Logger) LoggerFactory.getLogger(TraceExpression.class); + originalLevel = traceLogger.getLevel(); + traceLogger.setLevel(Level.TRACE); + appender = new ListAppender<>(); + appender.start(); + traceLogger.addAppender(appender); + } + + @AfterEach + void tearDown() { + traceLogger.detachAppender(appender); + traceLogger.setLevel(originalLevel); + appender.stop(); + } + + // --------------------------------------------------------------------------- + // ArrayElementWiseColumnEquality — left and right each referenced twice: + // once in zip_with() and once in size(). + // --------------------------------------------------------------------------- + + @Test + void arrayElementWise_equalsTo_leftSingleFire() { + final int before = appender.list.size(); + final Column tracedLeft = traceColumn(col("left"), "aew-left"); + final Column result = + new ArrayElementWiseColumnEquality(DefaultComparator.getInstance()) + .equalsTo(tracedLeft, col("right")); + intArrayDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("aew-left", before); + assertEquals(1, fires, () -> "left fired " + fires + "× (expected 1). 
See issue #2594."); + } + + @Test + void arrayElementWise_equalsTo_rightSingleFire() { + final int before = appender.list.size(); + final Column tracedRight = traceColumn(col("right"), "aew-right"); + final Column result = + new ArrayElementWiseColumnEquality(DefaultComparator.getInstance()) + .equalsTo(col("left"), tracedRight); + intArrayDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("aew-right", before); + assertEquals(1, fires, () -> "right fired " + fires + "× (expected 1). See issue #2594."); + } + + @Test + void arrayElementWise_equalsTo_leftMultiRowSingleFire() { + final int before = appender.list.size(); + final Column tracedLeft = traceColumn(col("left"), "aew-left-n"); + final Column result = + new ArrayElementWiseColumnEquality(DefaultComparator.getInstance()) + .equalsTo(tracedLeft, col("right")); + intArrayDataset(3).select(result.alias("r")).collect(); + final long fires = countTraceLogs("aew-left-n", before); + assertEquals(3, fires, () -> "left fired " + fires + "× for 3 rows (expected 3). See #2594."); + } + + // --------------------------------------------------------------------------- + // QuantityComparator — left and right each referenced twice via normalizedValue() + // and originalValue() field accesses. + // --------------------------------------------------------------------------- + + @Test + void quantityComparator_equalsTo_leftSingleFire() { + final int before = appender.list.size(); + final Column qty = QuantityEncoding.encodeNumeric(lit(1)); + final Column tracedLeft = traceColumn(qty, "qty-left"); + final Column right = QuantityEncoding.encodeNumeric(lit(1)); + final Column result = QuantityComparator.getInstance().equalsTo(tracedLeft, right); + singleRowDataset().select(result.alias("r")).collect(); + final long fires = countTraceLogs("qty-left", before); + assertEquals(1, fires, () -> "left fired " + fires + "× (expected 1). 
See issue #2594."); + } + + @Test + void quantityComparator_equalsTo_rightSingleFire() { + final int before = appender.list.size(); + final Column qty = QuantityEncoding.encodeNumeric(lit(1)); + final Column left = QuantityEncoding.encodeNumeric(lit(1)); + final Column tracedRight = traceColumn(qty, "qty-right"); + final Column result = QuantityComparator.getInstance().equalsTo(left, tracedRight); + singleRowDataset().select(result.alias("r")).collect(); + final long fires = countTraceLogs("qty-right", before); + assertEquals(1, fires, () -> "right fired " + fires + "× (expected 1). See issue #2594."); + } + + // --------------------------------------------------------------------------- + // TemporalComparator — left and right each referenced twice via the two + // callUDF() calls inside getBounds() (one for low boundary, one for high). + // --------------------------------------------------------------------------- + + @Test + void temporalComparator_equalsTo_leftSingleFire() { + final int before = appender.list.size(); + final Column tracedLeft = traceColumn(col("dt"), "temp-left"); + final Column result = TemporalComparator.forDateTime().equalsTo(tracedLeft, lit("2023-01-15")); + datetimeDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("temp-left", before); + assertEquals(1, fires, () -> "left fired " + fires + "× (expected 1). See issue #2594."); + } + + @Test + void temporalComparator_equalsTo_rightSingleFire() { + final int before = appender.list.size(); + final Column tracedRight = traceColumn(col("dt"), "temp-right"); + final Column result = TemporalComparator.forDateTime().equalsTo(lit("2023-01-15"), tracedRight); + datetimeDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("temp-right", before); + assertEquals(1, fires, () -> "right fired " + fires + "× (expected 1). 
See issue #2594."); + } + + @Test + void temporalComparator_equalsTo_leftMultiRowSingleFire() { + final int before = appender.list.size(); + final Column tracedLeft = traceColumn(col("dt"), "temp-left-n"); + final Column result = TemporalComparator.forDateTime().equalsTo(tracedLeft, lit("2023-01-15")); + datetimeDataset(3).select(result.alias("r")).collect(); + final long fires = countTraceLogs("temp-left-n", before); + assertEquals(3, fires, () -> "left fired " + fires + "× for 3 rows (expected 3). See #2594."); + } + + // --------------------------------------------------------------------------- + // Helpers. + // --------------------------------------------------------------------------- + + private long countTraceLogs(@Nonnull final String label, final int fromIndex) { + final String marker = "[trace:" + label + "]"; + return appender.list.subList(fromIndex, appender.list.size()).stream() + .filter(event -> event.getFormattedMessage().contains(marker)) + .count(); + } + + @Nonnull + private Dataset singleRowDataset() { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()) + }); + return spark.createDataFrame(List.of(RowFactory.create(1)), schema); + } + + @Nonnull + private Dataset intArrayDataset(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "left", new ArrayType(DataTypes.IntegerType, true), false, Metadata.empty()), + new StructField( + "right", new ArrayType(DataTypes.IntegerType, true), false, Metadata.empty()) + }); + return spark.createDataFrame( + java.util.stream.IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, new Integer[] {i, i + 1}, new Integer[] {i, i + 1})) + .toList(), + schema); + } + + @Nonnull + private Dataset datetimeDataset(final int rows) { + final StructType schema = + new StructType( + new 
StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("dt", DataTypes.StringType, false, Metadata.empty()) + }); + return spark.createDataFrame( + java.util.stream.IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, "2023-01-" + String.format("%02d", i))) + .toList(), + schema); + } + + @Nonnull + private static Column traceColumn(@Nonnull final Column input, @Nonnull final String label) { + return column(new TraceExpression(expression(input), label, "value", null)); + } +} diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogicTraceTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogicTraceTest.java new file mode 100644 index 0000000000..2f72957ec1 --- /dev/null +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/ValidationLogicTraceTest.java @@ -0,0 +1,170 @@ +/* + * Copyright © 2018-2026 Commonwealth Scientific and Industrial Research + * Organisation (CSIRO) ABN 41 687 119 230. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package au.csiro.pathling.fhirpath.function.provider; + +import static org.apache.spark.sql.classic.ExpressionUtils.column; +import static org.apache.spark.sql.classic.ExpressionUtils.expression; +import static org.apache.spark.sql.functions.col; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import au.csiro.pathling.fhirpath.FhirPathType; +import au.csiro.pathling.sql.TraceExpression; +import au.csiro.pathling.test.SpringBootUnitTest; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import jakarta.annotation.Nonnull; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Layer-B regression guard for issue #2594. Asserts that wrapping {@code value} in a {@link + * TraceExpression} and passing it to {@link ValidationLogic#validateConversionToBoolean} produces + * exactly one trace fire per row. Without the fix the STRING case references {@code value} three + * times (two equality checks + the outer isNotNull), the INTEGER and DECIMAL cases reference it + * twice each. 
+ */ +@SpringBootUnitTest +class ValidationLogicTraceTest { + + @Autowired SparkSession spark; + + private Logger traceLogger; + private Level originalLevel; + private ListAppender appender; + + @BeforeEach + void setUp() { + traceLogger = (Logger) LoggerFactory.getLogger(TraceExpression.class); + originalLevel = traceLogger.getLevel(); + traceLogger.setLevel(Level.TRACE); + appender = new ListAppender<>(); + appender.start(); + traceLogger.addAppender(appender); + } + + @AfterEach + void tearDown() { + traceLogger.detachAppender(appender); + traceLogger.setLevel(originalLevel); + appender.stop(); + } + + @Test + void validateConversionToBoolean_stringCase_singleFire() { + final int before = appender.list.size(); + final Column tracedValue = traceColumn(col("v"), "vb-str"); + final Column result = + ValidationLogic.validateConversionToBoolean(FhirPathType.STRING, tracedValue); + stringDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("vb-str", before); + assertEquals(1, fires, () -> "value fired " + fires + "× (expected 1). See issue #2594."); + } + + @Test + void validateConversionToBoolean_stringCase_multiRowSingleFire() { + final int before = appender.list.size(); + final Column tracedValue = traceColumn(col("v"), "vb-str-n"); + final Column result = + ValidationLogic.validateConversionToBoolean(FhirPathType.STRING, tracedValue); + stringDataset(3).select(result.alias("r")).collect(); + final long fires = countTraceLogs("vb-str-n", before); + assertEquals(3, fires, () -> "value fired " + fires + "× for 3 rows (expected 3). 
See #2594."); + } + + @Test + void validateConversionToBoolean_integerCase_singleFire() { + final int before = appender.list.size(); + final Column tracedValue = traceColumn(col("v").cast("integer"), "vb-int"); + final Column result = + ValidationLogic.validateConversionToBoolean(FhirPathType.INTEGER, tracedValue); + intDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("vb-int", before); + assertEquals(1, fires, () -> "value fired " + fires + "× (expected 1). See issue #2594."); + } + + @Test + void validateConversionToBoolean_decimalCase_singleFire() { + final int before = appender.list.size(); + final Column tracedValue = traceColumn(col("v").cast("double"), "vb-dec"); + final Column result = + ValidationLogic.validateConversionToBoolean(FhirPathType.DECIMAL, tracedValue); + intDataset(1).select(result.alias("r")).collect(); + final long fires = countTraceLogs("vb-dec", before); + assertEquals(1, fires, () -> "value fired " + fires + "× (expected 1). See issue #2594."); + } + + // --------------------------------------------------------------------------- + // Helpers. 
+ // --------------------------------------------------------------------------- + + private long countTraceLogs(@Nonnull final String label, final int fromIndex) { + final String marker = "[trace:" + label + "]"; + return appender.list.subList(fromIndex, appender.list.size()).stream() + .filter(event -> event.getFormattedMessage().contains(marker)) + .count(); + } + + @Nonnull + private Dataset stringDataset(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("v", DataTypes.StringType, false, Metadata.empty()) + }); + return spark.createDataFrame( + java.util.stream.IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, "true")) + .toList(), + schema); + } + + @Nonnull + private Dataset intDataset(final int rows) { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("v", DataTypes.IntegerType, false, Metadata.empty()) + }); + return spark.createDataFrame( + java.util.stream.IntStream.rangeClosed(1, rows) + .mapToObj(i -> RowFactory.create(i, 1)) + .toList(), + schema); + } + + @Nonnull + private static Column traceColumn(@Nonnull final Column input, @Nonnull final String label) { + return column(new TraceExpression(expression(input), label, "value", null)); + } +} diff --git a/fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java b/fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java index 1eaa34b08c..01a7cfd0ce 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/sql/SqlFunctionsLetTest.java @@ -140,6 +140,38 @@ void let_nullValue_bodyReceivesNull() { assertEquals(1, result.getInt(0)); } + // --------------------------------------------------------------------------- + // Binary let() overload tests. 
+ // --------------------------------------------------------------------------- + + @Test + void let_binary_correctResult_singleRow() { + final Dataset df = + spark.range(1).toDF("id").withColumn("a", lit(3)).withColumn("b", lit(4)); + final Row result = df.select(let(col("a"), col("b"), Column::plus).alias("r")).first(); + assertEquals(7, result.getInt(0)); + } + + @Test + void let_binary_correctResult_multiRow() { + final List rows = + dfPairs() + .select(let(col("a"), col("b"), Column::plus).alias("r")) + .orderBy("r") + .collectAsList(); + assertEquals(List.of(2, 4, 6), rows.stream().map(r -> r.getInt(0)).toList()); + } + + @Test + void let_binary_overTraceExpressions_firesExactlyOncePerRow() { + final Column tracedA = traceColumn(col("a"), "bin-a"); + final Column tracedB = traceColumn(col("b"), "bin-b"); + dfPairs().select(let(tracedA, tracedB, Column::plus).alias("r")).collect(); + // Three rows × one fire each for each operand. + assertEquals(3L, countTraceLogs("bin-a")); + assertEquals(3L, countTraceLogs("bin-b")); + } + private long countTraceLogs(@Nonnull final String label) { final String marker = "[trace:" + label + "]"; return appender.list.stream() @@ -159,6 +191,20 @@ private Dataset df3() { List.of(RowFactory.create(1, 1), RowFactory.create(2, 2), RowFactory.create(3, 3)), schema); } + @Nonnull + private Dataset dfPairs() { + final StructType schema = + new StructType( + new StructField[] { + new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("a", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("b", DataTypes.IntegerType, false, Metadata.empty()) + }); + return spark.createDataFrame( + List.of(RowFactory.create(1, 1, 1), RowFactory.create(2, 2, 2), RowFactory.create(3, 3, 3)), + schema); + } + @Nonnull private static Column traceColumn(@Nonnull final Column input, @Nonnull final String label) { // The collector is null — we count fires via the SLF4J trace logger to avoid Spark From 
4f9468ec4cbcfec8eab9f3c0cece26519337ca9a Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Mon, 18 May 2026 17:28:54 +1000 Subject: [PATCH 13/41] chore: Add scan-trace-duplicates slash command for issue #2594 Co-Authored-By: Claude Sonnet 4.6 --- .claude/commands/ptl/scan-trace-duplicates.md | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 .claude/commands/ptl/scan-trace-duplicates.md diff --git a/.claude/commands/ptl/scan-trace-duplicates.md b/.claude/commands/ptl/scan-trace-duplicates.md new file mode 100644 index 0000000000..b80bd153c7 --- /dev/null +++ b/.claude/commands/ptl/scan-trace-duplicates.md @@ -0,0 +1,186 @@ +--- +name: "PTL: Scan Trace Duplicates" +description: "Scan Java source packages for Spark SQL expressions that can cause duplicate TraceExpression evaluations (issue #2594 class of bug). Invoked as /ptl:scan-trace-duplicates [package-paths...]. Defaults to au.csiro.pathling.fhirpath and au.csiro.pathling.sql in the fhirpath module." +category: Quality +tags: [pathling, spark, trace, fhirpath, quality] +--- + +Scan the specified Java packages for Spark SQL expression patterns that trigger multiple +evaluations of `Nondeterministic` expressions (such as `TraceExpression`), causing duplicate +`trace()` log entries per row (GitHub issue #2594). + +## Arguments + +`$ARGUMENTS` — an optional space-separated list of Java package paths to scan, e.g.: + +``` +au.csiro.pathling.fhirpath.column au.csiro.pathling.sql +``` + +If omitted, default to: +- `au.csiro.pathling.fhirpath` +- `au.csiro.pathling.sql` + +in the `fhirpath` module. + +--- + +## Background + +`TraceExpression` is a Catalyst `Nondeterministic` expression. Spark's Common Subexpression +Elimination (CSE) **does not** deduplicate nondeterministic nodes — every reference to the same +`Column` variable in the assembled Catalyst plan fires the expression independently. 
This means +that if a method receives a `Column` parameter and references it N times in a single Spark SQL +expression, a traced operand will fire N times per row instead of once. + +`ColumnHelpers.let(value, body)` is the fix: it materialises a potentially nondeterministic column +exactly once using `element_at(transform(array(value), body::apply), 1)`. For deterministic columns +it inlines directly with no overhead. Lambda params inside `let()` are deterministic and safe to +reference multiple times. + +### Examples of bugs fixed for issue #2594 (use these as recognition patterns) + +| File | Bug pattern | Fix | +|------|-------------|-----| +| `ColumnRepresentation.toArray()` | `when(c.isNotNull(), array(c))` — `c` referenced twice | `let(c, x -> when(x.isNotNull(), array(x)))` | +| `ColumnRepresentation.filter()` | `when(c.isNotNull(), when(lambda.apply(c), c))` — `c` referenced 3× | `let(c, x -> when(x.isNotNull().and(lambda.apply(x)), x))` | +| `ColumnRepresentation.normaliseNull()` | `when(c.isNull().or(size(c).equalTo(0)), null).otherwise(c)` — `c` 3× | `let(c, x -> when(size(x).equalTo(0), lit(null)).otherwise(x))` | +| `ColumnRepresentation.transform()` | `when(c.isNotNull(), lambda.apply(c))` — `c` 2× | `let(c, x -> when(x.isNotNull(), lambda.apply(x)))` | +| `ColumnRepresentation.singular()` | `when(c.isNull().or(size(c).leq(1)), getAt(c,0))` — `c` 3× | `let(c, x -> when(size(x).gt(1), raise_error(...)).otherwise(getAt(x,0)))` | +| `ConversionLogic.convertToBoolean` | `when(value.equalTo("1.0"), ...).otherwise(value.try_cast(...))` — `value` 3× | `let(value, v -> when(...).otherwise(v.try_cast(...)))` | +| `ConversionLogic.convertToDate/DateTime/Time` | `when(value.rlike(REGEX), value)` — `value` 2× | `let(value, v -> when(v.rlike(REGEX), v))` | +| `QuantityValue.toUnit()` / `convertibleToUnit()` | `quantityColumn` referenced 5× in assembled expression | `let(quantityColumn, qc -> ...)` | +| `CodingEquality.equalsTo()` | `left` and `right` each referenced 
multiple times | `let(left, l -> let(right, r -> ...))` | +| `ColumnRepresentation.containsElement()` | `element.getValue()` referenced twice | `let(element.getValue(), ev -> ...)` | + +--- + +## Scan Procedure + +### Step 1 — Resolve file paths + +Parse `$ARGUMENTS` as a space-separated list of Java package names. Convert each to a directory +path by replacing `.` with `/` and prefixing with the module source root: + +``` +fhirpath/src/main/java/<package-path>/ +``` + +If `$ARGUMENTS` is empty, use: +``` +fhirpath/src/main/java/au/csiro/pathling/fhirpath/ +fhirpath/src/main/java/au/csiro/pathling/sql/ +``` + +Enumerate all `.java` files recursively: +```bash +find fhirpath/src/main/java/au/csiro/pathling/fhirpath \ + fhirpath/src/main/java/au/csiro/pathling/sql \ + -name "*.java" | sort +``` + +### Step 2 — Partition and dispatch agents + +Partition the file list into groups of **8–10 files** each. Launch one Haiku subagent per group +**in a single parallel turn**. Give each agent this instruction: + +--- + +> Read each of the following Java files and identify any method that receives or holds a `Column` +> (or `ColumnRepresentation`) and references the same variable more than once within a single +> assembled Spark SQL expression tree. 
+> +> Look specifically for these combinator patterns where the same variable appears in multiple +> positions: +> - `when(x.isNotNull(), ...).otherwise(x)` — predicate + value branch +> - `when(x.rlike(...), x)` — predicate + value branch +> - `size(x)` plus `x` in the same expression +> - `when(x.equalTo(...), ...).when(x.equalTo(...), ...).otherwise(x.try_cast(...))` — multiple branches +> - `callUDF(..., x, ...)` combined with `x.getField(...)` or similar +> - `x.isNull().or(rightColumn.isNull())` plus `x.getField(...)` in the same expression +> - `coalesce(x, ...)` where `x` also appears elsewhere in the expression +> - `exists(arr, e -> comparator.apply(e, x))` where `x` is also used in the predicate +> +> For each site found, report: +> - File path and method name +> - Which variable is referenced multiple times and how many times +> - The specific expression pattern (brief code quote) +> - Whether any of the references are to a `let()` lambda parameter (those are safe — lambda +> params like `qc`, `lv`, `rv`, `x`, `v` etc. are deterministic) +> - Your assessment: **GENUINE BUG**, **LATENT RISK**, or **FALSE POSITIVE** (see triage rules below) +> +> **Triage rules:** +> - **GENUINE BUG**: Variable referenced multiple times AND the method can be called with a +> nondeterministic column (e.g., any `Column` parameter, any `this.column` field populated from +> an arbitrary caller). +> - **LATENT RISK**: Multiple references but the method is only ever called with columns that are +> structurally deterministic at all current call sites. Document; suggest adding a Javadoc note. +> - **FALSE POSITIVE**: The variable is a `let()` lambda parameter, or the expression tree only +> evaluates it once despite appearing multiple times in the Java source (e.g., builder-style APIs +> where intermediate `Column` objects are not re-evaluated). +> +> Files to scan: +> [LIST OF FILE PATHS] + +--- + +### Step 3 — Aggregate and triage results + +Collect all agent reports. 
Produce a single summary with three sections: + +#### GENUINE BUGS + +For each genuine bug: +1. State file path, method, variable, and reference count. +2. Show the buggy expression (brief snippet). +3. Recommend the fix: + ```java + return let(myColumn, mc -> { + // use mc everywhere instead of myColumn + }); + ``` +4. Suggest a regression test following the pattern in + `QuantityValueTraceTest.java` or `TraceFunctionTest.java`: wrap the input column with + `TraceExpression`, evaluate on a single-row dataset, assert exactly 1 trace log entry via + Logback `ListAppender`. + +#### LATENT RISKS + +For each latent risk: state file, method, variable, why it is not currently triggered, and +recommend a Javadoc note like: +```java +// NOTE: callers must not pass nondeterministic columns; wrap with let() if needed. +``` + +#### FALSE POSITIVES + +List briefly with justification (lambda param, builder API, etc.). + +--- + +## Key Reference Files + +| File | Purpose | +|------|---------| +| `fhirpath/src/main/java/au/csiro/pathling/sql/SqlFunctions.java` | `let()` implementation | +| `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/ColumnHelpers.java` | `let()` helper (if present) | +| `fhirpath/src/main/java/au/csiro/pathling/fhirpath/column/QuantityValue.java` | Fixed example (`toUnit`, `convertibleToUnit`) | +| `fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/QuantityValueTraceTest.java` | Layer-B test pattern | +| `fhirpath/src/test/java/au/csiro/pathling/fhirpath/column/ColumnRepresentationTraceTest.java` | Layer-B test pattern | +| `fhirpath/src/test/java/au/csiro/pathling/fhirpath/function/provider/TraceFunctionTest.java` | End-to-end trace count pattern | +| `config/checkstyle/checkstyle.xml` | `RepeatedSqlEvaluation` Checkstyle rule (catches simple `when`/`otherwise` cases) | + +The Checkstyle `RepeatedSqlEvaluation` rule catches `when(ID..., ID)` and `when(ID...).otherwise(ID)` +where the same ≥7-char identifier appears in both branches. 
Lambda params in `let()` are ≤6 chars +and never trigger this rule. The rule catches simple cases but misses multi-reference patterns +outside `when()/otherwise()` — this scan covers those gaps. + +--- + +## Safe Patterns (do not flag) + +- References to `let()` lambda parameters (e.g., `qc`, `lv`, `rv`, `x`, `v`, `nc`, `ev`) +- Columns derived from a `let()` lambda param (e.g., `new QuantityValue(qc).isUcum()` where `qc` + is a lambda param) +- Builder-style APIs where each intermediate column value is a new node (not a shared reference) +- `lit(...)`, `col(...)`, and other factory calls that create new expressions each time From c33a0e161f4df63cca3b2ee1a1f732d3aff7eff2 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 5 May 2026 12:56:23 +1000 Subject: [PATCH 14/41] fix: Extend sanitiseRow to recurse into arrays of structs `sanitiseRow` only handled nested `Row` values but fell through for `scala.collection.Seq` values (how Spark represents array fields), so synthetic fields like `_fid` and null-valued fields leaked into the JSON output whenever a FHIRPath expression returned a type containing an array of structs (e.g. `CodeableConcept.coding`). Adds a new branch that iterates over `Seq` elements, recursively sanitises any `Row` elements, and updates the parent field's `ArrayType` elementType to the sanitised element schema so that `Row.json()` positional mapping remains correct. 
Co-Authored-By: Claude Sonnet 4.6 --- .../evaluation/SingleInstanceEvaluator.java | 33 ++++ .../SingleInstanceEvaluatorTest.java | 142 ++++++++++++++++++ 2 files changed, 175 insertions(+) diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java index 190a4a7eb6..07dcfc380f 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java @@ -46,9 +46,12 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.hl7.fhir.r4.model.Enumerations.ResourceType; +import scala.collection.mutable.ArraySeq; /** * Evaluates FHIRPath expressions against a single encoded FHIR resource and returns materialised @@ -393,6 +396,36 @@ static Row sanitiseRow(@Nonnull final Row row) { filteredFields.add( new StructField( field.name(), sanitisedNested.schema(), field.nullable(), field.metadata())); + } else if (value instanceof final scala.collection.Seq seq) { + // Recurse into array elements that are Row instances, sanitising each one. 
+ final List sanitisedElements = new ArrayList<>(); + StructType sanitisedElementSchema = null; + for (int i = 0; i < seq.length(); i++) { + final Object element = seq.apply(i); + if (element instanceof final Row elementRow) { + final Row sanitisedElement = sanitiseRow(elementRow); + sanitisedElements.add(sanitisedElement); + if (sanitisedElementSchema == null) { + sanitisedElementSchema = sanitisedElement.schema(); + } + } else { + sanitisedElements.add(element); + } + } + filteredValues.add(new ArraySeq.ofRef<>(sanitisedElements.toArray())); + // Update the parent field's ArrayType elementType so Row.json() positional mapping is + // correct after fields are stripped from array elements. + if (sanitisedElementSchema != null + && field.dataType() instanceof final ArrayType arrayType) { + filteredFields.add( + new StructField( + field.name(), + DataTypes.createArrayType(sanitisedElementSchema, arrayType.containsNull()), + field.nullable(), + field.metadata())); + } else { + filteredFields.add(field); + } } else { filteredFields.add(field); filteredValues.add(value); diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java index f38f7b302a..84e6cf3fd9 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java @@ -39,11 +39,13 @@ import java.util.Map; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; +import org.apache.spark.sql.types.ArrayType; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import scala.collection.Seq; /** * Tests for {@link 
SingleInstanceEvaluator} utility methods: variable conversion and row @@ -394,6 +396,99 @@ void preservesFieldsWithNonNullValues() { assertEquals("1.5", sanitised.get(0)); assertEquals("mmol/L", sanitised.get(1)); } + + @Test + void sanitisesElementsInArrayOfStructs() { + // Synthetic and null-valued fields in array-of-struct elements should also be stripped. + // This mirrors the CodeableConcept.coding bug reported in issue #2592. + final StructType codingSchema = + new StructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.StringType, true), + DataTypes.createStructField("system", DataTypes.StringType, true), + DataTypes.createStructField("version", DataTypes.StringType, true), + DataTypes.createStructField("code", DataTypes.StringType, true), + DataTypes.createStructField("display", DataTypes.StringType, true), + DataTypes.createStructField("userSelected", DataTypes.BooleanType, true), + DataTypes.createStructField("_fid", DataTypes.IntegerType, true), + }); + final StructType outerSchema = + new StructType( + new StructField[] { + DataTypes.createStructField( + "coding", DataTypes.createArrayType(codingSchema, true), true), + }); + + final Row codingRow = + new GenericRowWithSchema( + new Object[] { + null, + "http://snomed.info/sct", + null, + "446141000124107", + "Identifies as female gender", + null, + 1468279945 + }, + codingSchema); + final Row outerRow = + new GenericRowWithSchema(new Object[] {SqlHelpers.sql_array(codingRow)}, outerSchema); + + final Row sanitised = SingleInstanceEvaluator.sanitiseRow(outerRow); + + assertEquals(1, sanitised.schema().fields().length); + assertEquals("coding", sanitised.schema().fields()[0].name()); + + final Seq codingSeq = sanitised.getAs("coding"); + assertNotNull(codingSeq); + assertEquals(1, codingSeq.length()); + + final Row sanitisedCoding = (Row) codingSeq.apply(0); + assertEquals(3, sanitisedCoding.schema().fields().length); + assertEquals("system", 
sanitisedCoding.schema().fields()[0].name()); + assertEquals("code", sanitisedCoding.schema().fields()[1].name()); + assertEquals("display", sanitisedCoding.schema().fields()[2].name()); + assertEquals("http://snomed.info/sct", sanitisedCoding.get(0)); + assertEquals("446141000124107", sanitisedCoding.get(1)); + assertEquals("Identifies as female gender", sanitisedCoding.get(2)); + } + + @Test + void updatesParentSchemaForSanitisedArrayOfStructs() { + // The parent field's ArrayType elementType must be updated to the sanitised element schema so + // that Row.json() positional mapping remains correct. + final StructType elementSchema = + new StructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.StringType, true), + DataTypes.createStructField("start", DataTypes.StringType, true), + DataTypes.createStructField("end", DataTypes.StringType, true), + }); + final StructType outerSchema = + new StructType( + new StructField[] { + DataTypes.createStructField( + "items", DataTypes.createArrayType(elementSchema, true), true), + }); + + final Row elementRow = + new GenericRowWithSchema(new Object[] {null, "2020-01-01", "2021-01-01"}, elementSchema); + final Row outerRow = + new GenericRowWithSchema(new Object[] {SqlHelpers.sql_array(elementRow)}, outerSchema); + + final Row sanitised = SingleInstanceEvaluator.sanitiseRow(outerRow); + + // The parent ArrayType's elementType must match the sanitised element schema. + final ArrayType itemsType = (ArrayType) sanitised.schema().apply("items").dataType(); + final StructType sanitisedElementSchema = (StructType) itemsType.elementType(); + assertEquals(2, sanitisedElementSchema.fields().length); + assertEquals("start", sanitisedElementSchema.fields()[0].name()); + assertEquals("end", sanitisedElementSchema.fields()[1].name()); + + // The element Row's own schema must also match. 
+ final Seq seq = sanitised.getAs("items"); + assertEquals(sanitisedElementSchema, ((Row) seq.apply(0)).schema()); + } } @Nested @@ -474,6 +569,53 @@ void jsonExcludesNullValuedFields() { assertTrue(json.contains("\"value\":\"100\"")); assertTrue(json.contains("\"unit\":\"mg\"")); } + + @Test + void jsonCorrectlyRendersArrayOfStructsAfterSanitisation() { + // JSON output for array-of-struct fields should not include synthetic or null-valued fields. + final StructType codingSchema = + new StructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.StringType, true), + DataTypes.createStructField("system", DataTypes.StringType, true), + DataTypes.createStructField("version", DataTypes.StringType, true), + DataTypes.createStructField("code", DataTypes.StringType, true), + DataTypes.createStructField("display", DataTypes.StringType, true), + DataTypes.createStructField("userSelected", DataTypes.BooleanType, true), + DataTypes.createStructField("_fid", DataTypes.IntegerType, true), + }); + final StructType outerSchema = + new StructType( + new StructField[] { + DataTypes.createStructField( + "coding", DataTypes.createArrayType(codingSchema, true), true), + }); + + final Row codingRow = + new GenericRowWithSchema( + new Object[] { + null, + "http://snomed.info/sct", + null, + "446141000124107", + "Identifies as female gender", + null, + 1468279945 + }, + codingSchema); + final Row outerRow = + new GenericRowWithSchema(new Object[] {SqlHelpers.sql_array(codingRow)}, outerSchema); + + final String json = SingleInstanceEvaluator.rowToJson(outerRow); + + assertFalse(json.contains("\"_fid\"")); + assertFalse(json.contains("\"id\"")); + assertFalse(json.contains("\"version\"")); + assertFalse(json.contains("\"userSelected\"")); + assertTrue(json.contains("\"system\":\"http://snomed.info/sct\"")); + assertTrue(json.contains("\"code\":\"446141000124107\"")); + assertTrue(json.contains("\"display\":\"Identifies as female gender\"")); + } } @Nested From 
f127fe32e1121d9e4f105c56ac98430d5d8decec Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 5 May 2026 14:12:28 +1000 Subject: [PATCH 15/41] refactor: Simplify sanitiseRow array branch and deduplicate test fixture Pre-size the ArrayList with the known sequence length, remove a redundant what-comment, and extract the shared coding row fixture into a helper to eliminate copy-paste between two test classes. Co-Authored-By: Claude Sonnet 4.6 --- .../evaluation/SingleInstanceEvaluator.java | 3 +- .../SingleInstanceEvaluatorTest.java | 104 ++++++------------ 2 files changed, 36 insertions(+), 71 deletions(-) diff --git a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java index 07dcfc380f..008a893183 100644 --- a/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java +++ b/fhirpath/src/main/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluator.java @@ -397,8 +397,7 @@ static Row sanitiseRow(@Nonnull final Row row) { new StructField( field.name(), sanitisedNested.schema(), field.nullable(), field.metadata())); } else if (value instanceof final scala.collection.Seq seq) { - // Recurse into array elements that are Row instances, sanitising each one. 
- final List sanitisedElements = new ArrayList<>(); + final List sanitisedElements = new ArrayList<>(seq.length()); StructType sanitisedElementSchema = null; for (int i = 0; i < seq.length(); i++) { final Object element = seq.apply(i); diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java index 84e6cf3fd9..e7bb3aaeda 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java @@ -400,41 +400,7 @@ void preservesFieldsWithNonNullValues() { @Test void sanitisesElementsInArrayOfStructs() { // Synthetic and null-valued fields in array-of-struct elements should also be stripped. - // This mirrors the CodeableConcept.coding bug reported in issue #2592. - final StructType codingSchema = - new StructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.StringType, true), - DataTypes.createStructField("system", DataTypes.StringType, true), - DataTypes.createStructField("version", DataTypes.StringType, true), - DataTypes.createStructField("code", DataTypes.StringType, true), - DataTypes.createStructField("display", DataTypes.StringType, true), - DataTypes.createStructField("userSelected", DataTypes.BooleanType, true), - DataTypes.createStructField("_fid", DataTypes.IntegerType, true), - }); - final StructType outerSchema = - new StructType( - new StructField[] { - DataTypes.createStructField( - "coding", DataTypes.createArrayType(codingSchema, true), true), - }); - - final Row codingRow = - new GenericRowWithSchema( - new Object[] { - null, - "http://snomed.info/sct", - null, - "446141000124107", - "Identifies as female gender", - null, - 1468279945 - }, - codingSchema); - final Row outerRow = - new GenericRowWithSchema(new Object[] 
{SqlHelpers.sql_array(codingRow)}, outerSchema); - - final Row sanitised = SingleInstanceEvaluator.sanitiseRow(outerRow); + final Row sanitised = SingleInstanceEvaluator.sanitiseRow(buildCodingOuterRow()); assertEquals(1, sanitised.schema().fields().length); assertEquals("coding", sanitised.schema().fields()[0].name()); @@ -573,40 +539,7 @@ void jsonExcludesNullValuedFields() { @Test void jsonCorrectlyRendersArrayOfStructsAfterSanitisation() { // JSON output for array-of-struct fields should not include synthetic or null-valued fields. - final StructType codingSchema = - new StructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.StringType, true), - DataTypes.createStructField("system", DataTypes.StringType, true), - DataTypes.createStructField("version", DataTypes.StringType, true), - DataTypes.createStructField("code", DataTypes.StringType, true), - DataTypes.createStructField("display", DataTypes.StringType, true), - DataTypes.createStructField("userSelected", DataTypes.BooleanType, true), - DataTypes.createStructField("_fid", DataTypes.IntegerType, true), - }); - final StructType outerSchema = - new StructType( - new StructField[] { - DataTypes.createStructField( - "coding", DataTypes.createArrayType(codingSchema, true), true), - }); - - final Row codingRow = - new GenericRowWithSchema( - new Object[] { - null, - "http://snomed.info/sct", - null, - "446141000124107", - "Identifies as female gender", - null, - 1468279945 - }, - codingSchema); - final Row outerRow = - new GenericRowWithSchema(new Object[] {SqlHelpers.sql_array(codingRow)}, outerSchema); - - final String json = SingleInstanceEvaluator.rowToJson(outerRow); + final String json = SingleInstanceEvaluator.rowToJson(buildCodingOuterRow()); assertFalse(json.contains("\"_fid\"")); assertFalse(json.contains("\"id\"")); @@ -806,4 +739,37 @@ void mixedLabelsAndTypes() { assertEquals(1, results.get(1).getValues().size()); } } + + private static Row buildCodingOuterRow() { + final 
StructType codingSchema = + new StructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.StringType, true), + DataTypes.createStructField("system", DataTypes.StringType, true), + DataTypes.createStructField("version", DataTypes.StringType, true), + DataTypes.createStructField("code", DataTypes.StringType, true), + DataTypes.createStructField("display", DataTypes.StringType, true), + DataTypes.createStructField("userSelected", DataTypes.BooleanType, true), + DataTypes.createStructField("_fid", DataTypes.IntegerType, true), + }); + final StructType outerSchema = + new StructType( + new StructField[] { + DataTypes.createStructField( + "coding", DataTypes.createArrayType(codingSchema, true), true), + }); + final Row codingRow = + new GenericRowWithSchema( + new Object[] { + null, + "http://snomed.info/sct", + null, + "446141000124107", + "Identifies as female gender", + null, + 1468279945 + }, + codingSchema); + return new GenericRowWithSchema(new Object[] {SqlHelpers.sql_array(codingRow)}, outerSchema); + } } From dc550b84864a7d8e23024a4d1cde54dd7feba664 Mon Sep 17 00:00:00 2001 From: John Grimes Date: Sun, 10 May 2026 20:41:15 +1000 Subject: [PATCH 16/41] test: Cover heterogeneous null patterns across array-of-struct elements Locks in that sanitiseRow correctly renders JSON for an array of structs where elements differ in which fields are null, and therefore have different post-sanitisation schemas. 
--- .../SingleInstanceEvaluatorTest.java | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java index e7bb3aaeda..7a5e96287b 100644 --- a/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java +++ b/fhirpath/src/test/java/au/csiro/pathling/fhirpath/evaluation/SingleInstanceEvaluatorTest.java @@ -536,6 +536,56 @@ void jsonExcludesNullValuedFields() { assertTrue(json.contains("\"unit\":\"mg\"")); } + @Test + void jsonCorrectlyRendersArrayOfStructsWithHeterogeneousNulls() { + // Two Coding elements share an input schema but differ in which fields are null. + // After sanitisation each element has a different schema (3 vs 2 fields), so the parent + // ArrayType.elementType captured from the first element does not match the second. This + // must not make Row.json() render element 1's values under element 0's field names. 
+ final StructType codingSchema = + new StructType( + new StructField[] { + DataTypes.createStructField("system", DataTypes.StringType, true), + DataTypes.createStructField("code", DataTypes.StringType, true), + DataTypes.createStructField("display", DataTypes.StringType, true), + }); + final StructType outerSchema = + new StructType( + new StructField[] { + DataTypes.createStructField( + "coding", DataTypes.createArrayType(codingSchema, true), true), + }); + + final Row codingWithDisplay = + new GenericRowWithSchema( + new Object[] {"http://snomed.info/sct", "111", "has display"}, codingSchema); + final Row codingWithoutDisplay = + new GenericRowWithSchema( + new Object[] {"http://snomed.info/sct", "222", null}, codingSchema); + final Row outerRow = + new GenericRowWithSchema( + new Object[] {SqlHelpers.sql_array(codingWithDisplay, codingWithoutDisplay)}, + outerSchema); + + final String json = SingleInstanceEvaluator.rowToJson(outerRow); + + // Element 0 has all three fields - should render correctly. + assertTrue( + json.contains("\"code\":\"111\""), "element 0 code should be present, got: " + json); + assertTrue( + json.contains("\"display\":\"has display\""), + "element 0 display should be present, got: " + json); + + // Element 1 has display=null which should be stripped, but code=222 should still appear + // under the key "code" (not mis-mapped to another field name like "display"). + assertTrue( + json.contains("\"code\":\"222\""), + "element 1 code should be present and correctly named, got: " + json); + assertFalse( + json.contains("\"display\":\"222\""), + "element 1 code value should not be mis-mapped to display, got: " + json); + } + @Test void jsonCorrectlyRendersArrayOfStructsAfterSanitisation() { // JSON output for array-of-struct fields should not include synthetic or null-valued fields. 
From 7a26e8638c3ca9d03bfdb99c2560bb6c36b31b75 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 09:56:28 +1000 Subject: [PATCH 17/41] chore: Suppress new Trivy CVEs and upgrade mermaid to 11.15.0 Added suppressions for newly reported CVEs across core libraries, server, and site scopes following contextual impact assessment. All suppressed findings are either not bundled in the distribution or have unreachable vulnerable code paths. Upgraded mermaid from 11.12.2 to 11.15.0 via package.json override to fix four MEDIUM CVEs (CSS/HTML injection and DoS in diagram rendering) in the deployed static site. Co-Authored-By: Claude Sonnet 4.6 --- .trivyignore | 20 ++++++++++++++++++++ server/.trivyignore | 6 ++++++ site/.trivyignore | 8 ++++++++ site/bun.lock | 39 +++++++++------------------------------ site/package.json | 3 ++- 5 files changed, 45 insertions(+), 31 deletions(-) diff --git a/.trivyignore b/.trivyignore index 174e1b4dee..8505591a08 100644 --- a/.trivyignore +++ b/.trivyignore @@ -10,6 +10,14 @@ CVE-2025-58056 CVE-2025-67735 CVE-2026-33870 CVE-2026-33871 +CVE-2026-42583 +CVE-2026-42579 +CVE-2026-42584 +CVE-2026-42587 +CVE-2026-41417 +CVE-2026-42580 +CVE-2026-42581 +CVE-2026-42585 # The vulnerable version of protobuf-java is a transitive provided dependency, we do not bundle it into our distribution. CVE-2024-7254 @@ -53,3 +61,15 @@ CVE-2025-67721 # jackson-core async parser DoS — Pathling uses only synchronous parsing via HAPI FHIR. GHSA-72hv-8253-57qq + +# Apache Thrift TSSLTransportFactory certificate hostname validation flaw, no fixed version +# available. libthrift is a transitive dependency via hapi-fhir-structures-r4 -> jena-shex -> +# jena-arq. Pathling does not use Thrift's SSL transport, so the vulnerable code path is +# unreachable. +CVE-2026-43869 + +# OpenTelemetry W3C Baggage propagation unbounded memory allocation — opentelemetry-api is a +# transitive compile-scope dependency via hapi-fhir-base. 
In the Spark library context, no +# HTTP request processing is performed and OTel propagators are not configured, so the +# vulnerable Baggage parsing code path is never reached. +CVE-2026-45292 diff --git a/server/.trivyignore b/server/.trivyignore index 6508758c9d..1ffa14ece8 100644 --- a/server/.trivyignore +++ b/server/.trivyignore @@ -82,3 +82,9 @@ CVE-2025-67721 # Thrift's SSL transport (no TSSL/TSocket/Thrift client code anywhere in the # server), so the vulnerable code path is unreachable. CVE-2026-43869 + +# OpenTelemetry W3C Baggage propagation unbounded memory allocation — opentelemetry-api is a +# transitive dependency via hapi-fhir-base. The server does not configure the OTel SDK or +# W3C Baggage propagators; the API jar is present only for HAPI FHIR instrumentation +# annotations, so the vulnerable Baggage parsing code path is never reached. +CVE-2026-45292 diff --git a/site/.trivyignore b/site/.trivyignore index 22aaf62354..3b7a2cf90e 100644 --- a/site/.trivyignore +++ b/site/.trivyignore @@ -46,3 +46,11 @@ CVE-2026-41305 # uuid out-of-bounds write — build-time Docusaurus internal use only, not deployed. CVE-2026-41907 + +# @babel/plugin-transform-modules-systemjs arbitrary code generation — triggered only by +# malicious source files fed to the build-time Babel transpiler. Not deployed in the static site. +CVE-2026-44728 + +# fast-uri normalize() percent-encoded authority delimiter issue — fast-uri is pulled in by +# ajv, a build-time JSON schema validator used by Docusaurus tooling. Not deployed. 
+CVE-2026-6322 diff --git a/site/bun.lock b/site/bun.lock index 42dd90ca1a..349fb3c9a7 100644 --- a/site/bun.lock +++ b/site/bun.lock @@ -23,6 +23,7 @@ "lodash": "4.18.0", "lodash-es": "4.18.0", "mdast-util-to-hast": "13.2.1", + "mermaid": "^11.15.0", "node-forge": "1.3.3", "qs": "6.15.0", }, @@ -273,15 +274,7 @@ "@braintree/sanitize-url": ["@braintree/sanitize-url@7.1.2", "", {}, "sha512-jigsZK+sMF/cuiB7sERuo9V7N9jx+dhmHHnQyDSVdpZwVutaBu7WvNYqMDLSgFgfB30n452TP3vjDAvFC973mA=="], - "@chevrotain/cst-dts-gen": ["@chevrotain/cst-dts-gen@11.0.3", "", { "dependencies": { "@chevrotain/gast": "11.0.3", "@chevrotain/types": "11.0.3", "lodash-es": "4.17.21" } }, "sha512-BvIKpRLeS/8UbfxXxgC33xOumsacaeCKAjAeLyOn7Pcp95HiRbrpl14S+9vaZLolnbssPIUuiUd8IvgkRyt6NQ=="], - - "@chevrotain/gast": ["@chevrotain/gast@11.0.3", "", { "dependencies": { "@chevrotain/types": "11.0.3", "lodash-es": "4.17.21" } }, "sha512-+qNfcoNk70PyS/uxmj3li5NiECO+2YKZZQMbmjTqRI3Qchu8Hig/Q9vgkHpI3alNjr7M+a2St5pw5w5F6NL5/Q=="], - - "@chevrotain/regexp-to-ast": ["@chevrotain/regexp-to-ast@11.0.3", "", {}, "sha512-1fMHaBZxLFvWI067AVbGJav1eRY7N8DDvYCTwGBiE/ytKBgP8azTdgyrKyWZ9Mfh09eHWb5PgTSO8wi7U824RA=="], - - "@chevrotain/types": ["@chevrotain/types@11.0.3", "", {}, "sha512-gsiM3G8b58kZC2HaWR50gu6Y1440cHiJ+i3JUvcp/35JchYejb2+5MVeJK0iKThYpAa/P2PYFV4hoi44HD+aHQ=="], - - "@chevrotain/utils": ["@chevrotain/utils@11.0.3", "", {}, "sha512-YslZMgtJUyuMbZ+aKvfF3x1f5liK4mWNxghFRv7jqRR9C3R3fAOGTTKvxXDa2Y1s9zSbcpuO0cAxDYsc9SrXoQ=="], + "@chevrotain/types": ["@chevrotain/types@11.1.2", "", {}, "sha512-U+HFai5+zmJCkK86QsaJtoITlboZHBqrVketcO2ROv865xfCMSFpELQoz1GkX5GzME8pTa+3kbKrZHQtI0gdbw=="], "@colors/colors": ["@colors/colors@1.5.0", "", {}, "sha512-ooWCrlZP11i8GImSjTHYHLkvFDP48nS4+204nGb1RiX/WXYHmJA2III9/e2DWVabCESdW7hBAEzHRqUn9OUVvQ=="], @@ -505,7 +498,7 @@ "@mdx-js/react": ["@mdx-js/react@3.1.1", "", { "dependencies": { "@types/mdx": "^2.0.0" }, "peerDependencies": { "@types/react": ">=16", "react": ">=16" } }, 
"sha512-f++rKLQgUVYDAtECQ6fn/is15GkEH9+nZPM3MS0RcxVqoTfawHvDlSCH7JbMhAM6uJ32v3eXLvLmLvjGu7PTQw=="], - "@mermaid-js/parser": ["@mermaid-js/parser@0.6.3", "", { "dependencies": { "langium": "3.3.1" } }, "sha512-lnjOhe7zyHjc+If7yT4zoedx2vo4sHaTmtkl1+or8BRTnCtDmcTpAjpzDSfCZrshM5bCoz0GyidzadJAH1xobA=="], + "@mermaid-js/parser": ["@mermaid-js/parser@1.1.1", "", { "dependencies": { "@chevrotain/types": "~11.1.1" } }, "sha512-VuHdsYMK1bT6X2JbcAaWAhugTRvRBRyuZgd+c22swUeI9g/ntaxF7CY7dYarhZovofCbUNO0G7JesfmNtjYOCw=="], "@noble/hashes": ["@noble/hashes@1.4.0", "", {}, "sha512-V1JJ1WTRUqHHrOSh597hURcMqVKVGL/ea3kv0gSnEdsEZ0/+VyPghM1lMNGc00z7CIQorSvbKpuJkxvuHbvdbg=="], @@ -747,6 +740,8 @@ "@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="], + "@upsetjs/venn.js": ["@upsetjs/venn.js@2.0.0", "", { "optionalDependencies": { "d3-selection": "^3.0.0", "d3-transition": "^3.0.1" } }, "sha512-WbBhLrooyePuQ1VZxrJjtLvTc4NVfpOyKx0sKqioq9bX1C1m7Jgykkn8gLrtwumBioXIqam8DLxp88Adbue6Hw=="], + "@webassemblyjs/ast": ["@webassemblyjs/ast@1.14.1", "", { "dependencies": { "@webassemblyjs/helper-numbers": "1.13.2", "@webassemblyjs/helper-wasm-bytecode": "1.13.2" } }, "sha512-nuBEDgQfm1ccRp/8bCQrx1frohyufl4JlbMMZ4P1wpeOfDhF6FQkxZJ1b/e+PLwr6X1Nhw6OLme5usuBWYBvuQ=="], "@webassemblyjs/floating-point-hex-parser": ["@webassemblyjs/floating-point-hex-parser@1.13.2", "", {}, "sha512-6oXyTOzbKxGH4steLbLNOu71Oj+C8Lg34n6CqRvqfS2O71BxY6ByfMDRhBytzknj9yGUPVJ1qIKhRlAwO1AovA=="], @@ -915,10 +910,6 @@ "cheerio-select": ["cheerio-select@2.1.0", "", { "dependencies": { "boolbase": "^1.0.0", "css-select": "^5.1.0", "css-what": "^6.1.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1" } }, "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g=="], - "chevrotain": ["chevrotain@11.0.3", "", { "dependencies": { "@chevrotain/cst-dts-gen": 
"11.0.3", "@chevrotain/gast": "11.0.3", "@chevrotain/regexp-to-ast": "11.0.3", "@chevrotain/types": "11.0.3", "@chevrotain/utils": "11.0.3", "lodash-es": "4.17.21" } }, "sha512-ci2iJH6LeIkvP9eJW6gpueU8cnZhv85ELY8w8WiFtNjMHA5ad6pQLaJo9mEly/9qUyCpvqX8/POVUTf18/HFdw=="], - - "chevrotain-allstar": ["chevrotain-allstar@0.3.1", "", { "dependencies": { "lodash-es": "^4.17.21" }, "peerDependencies": { "chevrotain": "^11.0.0" } }, "sha512-b7g+y9A0v4mxCW1qUhf3BSVPg+/NvGErk/dOkrDaHA0nQIQGAtrOjlX//9OQtRlSCy+x9rfB5N8yC71lH1nvMw=="], - "chokidar": ["chokidar@3.6.0", "", { "dependencies": { "anymatch": "~3.1.2", "braces": "~3.0.2", "glob-parent": "~5.1.2", "is-binary-path": "~2.1.0", "is-glob": "~4.0.1", "normalize-path": "~3.0.0", "readdirp": "~3.6.0" }, "optionalDependencies": { "fsevents": "~2.3.2" } }, "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw=="], "chrome-trace-event": ["chrome-trace-event@1.0.4", "", {}, "sha512-rNjApaLzuwaOTjCiT8lSDdGN1APCiqkChLMJxJPWLunPAt5fy8xgU9/jNOchV84wfIxrA0lRQB7oCT8jrn/wrQ=="], @@ -1105,7 +1096,7 @@ "d3-zoom": ["d3-zoom@3.0.0", "", { "dependencies": { "d3-dispatch": "1 - 3", "d3-drag": "2 - 3", "d3-interpolate": "1 - 3", "d3-selection": "2 - 3", "d3-transition": "2 - 3" } }, "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw=="], - "dagre-d3-es": ["dagre-d3-es@7.0.13", "", { "dependencies": { "d3": "^7.9.0", "lodash-es": "^4.17.21" } }, "sha512-efEhnxpSuwpYOKRm/L5KbqoZmNNukHa/Flty4Wp62JRvgH2ojwVgPgdYyr4twpieZnyRDdIH7PY2mopX26+j2Q=="], + "dagre-d3-es": ["dagre-d3-es@7.0.14", "", { "dependencies": { "d3": "^7.9.0", "lodash-es": "^4.17.21" } }, "sha512-P4rFMVq9ESWqmOgK+dlXvOtLwYg0i7u0HBGJER0LZDJT2VHIPAMZ/riPxqJceWMStH5+E61QxFra9kIS3AqdMg=="], "dayjs": ["dayjs@1.11.19", "", {}, "sha512-t5EcLVS6QPBNqM2z8fakk/NKel+Xzshgt8FFKAn+qwlD1pzZWxh0nVCrvFK7ZDb6XucZeF9z8C7CBWTRIVApAw=="], @@ -1203,6 +1194,8 @@ "es-object-atoms": ["es-object-atoms@1.1.1", "", { 
"dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], + "es-toolkit": ["es-toolkit@1.46.1", "", {}, "sha512-5eNtXOs3tbfxXOj04tjjseeWkRWaoCjdEI+96DgwzZoe6c9juL49pXlzAFTI72aWC9Y8p7168g6XIKjh7k6pyQ=="], + "esast-util-from-estree": ["esast-util-from-estree@2.0.0", "", { "dependencies": { "@types/estree-jsx": "^1.0.0", "devlop": "^1.0.0", "estree-util-visit": "^2.0.0", "unist-util-position-from-estree": "^2.0.0" } }, "sha512-4CyanoAudUSBAn5K13H4JhsMH6L9ZP7XbLVe/dKybkxMO7eDyLsT8UHl9TRNrU2Gr9nz+FovfSIjuXWJ81uVwQ=="], "esast-util-from-js": ["esast-util-from-js@2.0.1", "", { "dependencies": { "@types/estree-jsx": "^1.0.0", "acorn": "^8.0.0", "esast-util-from-estree": "^2.0.0", "vfile-message": "^4.0.0" } }, "sha512-8Ja+rNJ0Lt56Pcf3TAmpBZjmx8ZcK5Ts4cAzIOjsjevg9oSXJnl6SUQ2EevU8tv3h6ZLWmoKL5H4fgWvdvfETw=="], @@ -1555,8 +1548,6 @@ "kleur": ["kleur@3.0.3", "", {}, "sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w=="], - "langium": ["langium@3.3.1", "", { "dependencies": { "chevrotain": "~11.0.3", "chevrotain-allstar": "~0.3.0", "vscode-languageserver": "~9.0.1", "vscode-languageserver-textdocument": "~1.0.11", "vscode-uri": "~3.0.8" } }, "sha512-QJv/h939gDpvT+9SiLVlY7tZC3xB2qK57v0J04Sh9wpMb6MP1q8gB21L3WIo8T5P1MSMg3Ep14L7KkDCFG3y4w=="], - "latest-version": ["latest-version@7.0.0", "", { "dependencies": { "package-json": "^8.1.0" } }, "sha512-KvNT4XqAMzdcL6ka6Tl3i2lYeFDgXNCuIX+xNx6ZMVR1dFq+idXd9FLKNMOIx0t9mJ9/HudyX4oZWXZQ0UJHeg=="], "launch-editor": ["launch-editor@2.12.0", "", { "dependencies": { "picocolors": "^1.1.1", "shell-quote": "^1.8.3" } }, "sha512-giOHXoOtifjdHqUamwKq6c49GzBdLjvxrd2D+Q4V6uOHopJv7p9VJxikDsQ/CBXZbEITgUqSVHXLTG3VhPP1Dg=="], @@ -1651,7 +1642,7 @@ "merge2": ["merge2@1.4.1", "", {}, "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg=="], - "mermaid": ["mermaid@11.12.2", "", { 
"dependencies": { "@braintree/sanitize-url": "^7.1.1", "@iconify/utils": "^3.0.1", "@mermaid-js/parser": "^0.6.3", "@types/d3": "^7.4.3", "cytoscape": "^3.29.3", "cytoscape-cose-bilkent": "^4.1.0", "cytoscape-fcose": "^2.2.0", "d3": "^7.9.0", "d3-sankey": "^0.12.3", "dagre-d3-es": "7.0.13", "dayjs": "^1.11.18", "dompurify": "^3.2.5", "katex": "^0.16.22", "khroma": "^2.1.0", "lodash-es": "^4.17.21", "marked": "^16.2.1", "roughjs": "^4.6.6", "stylis": "^4.3.6", "ts-dedent": "^2.2.0", "uuid": "^11.1.0" } }, "sha512-n34QPDPEKmaeCG4WDMGy0OT6PSyxKCfy2pJgShP+Qow2KLrvWjclwbc3yXfSIf4BanqWEhQEpngWwNp/XhZt6w=="], + "mermaid": ["mermaid@11.15.0", "", { "dependencies": { "@braintree/sanitize-url": "^7.1.1", "@iconify/utils": "^3.0.2", "@mermaid-js/parser": "^1.1.1", "@types/d3": "^7.4.3", "@upsetjs/venn.js": "^2.0.0", "cytoscape": "^3.33.1", "cytoscape-cose-bilkent": "^4.1.0", "cytoscape-fcose": "^2.2.0", "d3": "^7.9.0", "d3-sankey": "^0.12.3", "dagre-d3-es": "7.0.14", "dayjs": "^1.11.19", "dompurify": "^3.3.1", "es-toolkit": "^1.45.1", "katex": "^0.16.25", "khroma": "^2.1.0", "marked": "^16.3.0", "roughjs": "^4.6.6", "stylis": "^4.3.6", "ts-dedent": "^2.2.0", "uuid": "^11.1.0 || ^12 || ^13 || ^14.0.0" } }, "sha512-pTMbcf3rWdtLiYGpmoTjHEpeY8seiy6sR+9nD7LOs8KfUbHE4lOUAprTRqRAcWSQ6MQpdX+YEsxShtGsINtPtw=="], "methods": ["methods@1.1.2", "", {}, "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w=="], @@ -2391,18 +2382,6 @@ "vfile-message": ["vfile-message@4.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw=="], - "vscode-jsonrpc": ["vscode-jsonrpc@8.2.0", "", {}, "sha512-C+r0eKJUIfiDIfwJhria30+TYWPtuHJXHtI7J0YlOmKAo7ogxP20T0zxB7HZQIFhIyvoBPwWskjxrvAtfjyZfA=="], - - "vscode-languageserver": ["vscode-languageserver@9.0.1", "", { "dependencies": { "vscode-languageserver-protocol": "3.17.5" }, 
"bin": { "installServerIntoExtension": "bin/installServerIntoExtension" } }, "sha512-woByF3PDpkHFUreUa7Hos7+pUWdeWMXRd26+ZX2A8cFx6v/JPTtd4/uN0/jB6XQHYaOlHbio03NTHCqrgG5n7g=="], - - "vscode-languageserver-protocol": ["vscode-languageserver-protocol@3.17.5", "", { "dependencies": { "vscode-jsonrpc": "8.2.0", "vscode-languageserver-types": "3.17.5" } }, "sha512-mb1bvRJN8SVznADSGWM9u/b07H7Ecg0I3OgXDuLdn307rl/J3A9YD6/eYOssqhecL27hK1IPZAsaqh00i/Jljg=="], - - "vscode-languageserver-textdocument": ["vscode-languageserver-textdocument@1.0.12", "", {}, "sha512-cxWNPesCnQCcMPeenjKKsOCKQZ/L6Tv19DTRIGuLWe32lyzWhihGVJ/rcckZXJxfdKCFvRLS3fpBIsV/ZGX4zA=="], - - "vscode-languageserver-types": ["vscode-languageserver-types@3.17.5", "", {}, "sha512-Ld1VelNuX9pdF39h2Hgaeb5hEZM2Z3jUrrMgWQAu82jMtZp7p3vJT3BzToKtZI7NgQssZje5o0zryOrhQvzQAg=="], - - "vscode-uri": ["vscode-uri@3.0.8", "", {}, "sha512-AyFQ0EVmsOZOlAnxoFOGOq1SQDWAB7C6aqMGS23svWAllfOaxbuFvcT8D1i8z3Gyn8fraVeZNNmN6e9bxxXkKw=="], - "watchpack": ["watchpack@2.5.1", "", { "dependencies": { "glob-to-regexp": "^0.4.1", "graceful-fs": "^4.1.2" } }, "sha512-Zn5uXdcFNIA1+1Ei5McRd+iRzfhENPCe7LeABkJtNulSxjma+l7ltNx55BWZkRlwRnpOgHqxnjyaDgJnNXnqzg=="], "wbuf": ["wbuf@1.7.3", "", { "dependencies": { "minimalistic-assert": "^1.0.0" } }, "sha512-O84QOnr0icsbFGLS0O3bI5FswxzRr8/gHwWkDlQFskhSPryQXvrTMxjxGP4+iWYoauLoBvfDpkrOauZ+0iZpDA=="], diff --git a/site/package.json b/site/package.json index 7e57055c6b..8638d5a740 100644 --- a/site/package.json +++ b/site/package.json @@ -44,6 +44,7 @@ "node-forge": "1.3.3", "qs": "6.15.0", "js-yaml": "3.14.2", - "mdast-util-to-hast": "13.2.1" + "mdast-util-to-hast": "13.2.1", + "mermaid": "^11.15.0" } } From 26d2e4623f57ce1acf957ab5fdfd2d37b94f5652 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 13:17:39 +1000 Subject: [PATCH 18/41] fix: Prevent standalone Spark tests from destroying shared SparkContext Three unit tests (SqlQueryResultStreamerTest, ViewRegistrationServiceTest, 
LibraryReferenceResolverTest.CanonicalReferences) called spark.stop() in @AfterAll, which destroyed the JVM-wide SparkContext and caused ViewDefinitionSearchTest and ViewDefinitionCreateTest to fail intermittently depending on test execution order. Converted all three tests to @SpringBootUnitTest so they receive the shared SparkSession via Spring injection, consistent with every other Spark-dependent test in the server module. The manually created sessions and @AfterAll teardowns are removed entirely. Closes #2615 Co-Authored-By: Claude Sonnet 4.6 --- .../LibraryReferenceResolverTest.java | 37 ++++-------------- .../sqlquery/SqlQueryResultStreamerTest.java | 38 ++++--------------- .../sqlquery/ViewRegistrationServiceTest.java | 33 +++++----------- 3 files changed, 24 insertions(+), 84 deletions(-) diff --git a/server/src/test/java/au/csiro/pathling/operations/sqlquery/LibraryReferenceResolverTest.java b/server/src/test/java/au/csiro/pathling/operations/sqlquery/LibraryReferenceResolverTest.java index 1c8e19e2a6..426cbabfda 100644 --- a/server/src/test/java/au/csiro/pathling/operations/sqlquery/LibraryReferenceResolverTest.java +++ b/server/src/test/java/au/csiro/pathling/operations/sqlquery/LibraryReferenceResolverTest.java @@ -28,6 +28,7 @@ import au.csiro.pathling.errors.ResourceNotFoundError; import au.csiro.pathling.io.source.DataSource; import au.csiro.pathling.read.ReadExecutor; +import au.csiro.pathling.test.SpringBootUnitTest; import ca.uhn.fhir.rest.server.exceptions.InvalidRequestException; import ca.uhn.fhir.rest.server.exceptions.ResourceNotFoundException; import java.util.List; @@ -38,21 +39,18 @@ import org.hl7.fhir.r4.model.Enumerations.PublicationStatus; import org.hl7.fhir.r4.model.Library; import org.hl7.fhir.r4.model.Reference; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; -import 
org.junit.jupiter.api.TestInstance; -import org.junit.jupiter.api.TestInstance.Lifecycle; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; +import org.springframework.beans.factory.annotation.Autowired; /** * Tests for {@link LibraryReferenceResolver} covering both the relative-literal and canonical * reference resolution paths. */ -@TestInstance(Lifecycle.PER_CLASS) +@SpringBootUnitTest class LibraryReferenceResolverTest { // --------------------------------------------------------------------------- @@ -127,39 +125,18 @@ void translatesNoDataIllegalArgumentToResourceNotFoundException() { } // --------------------------------------------------------------------------- - // Canonical references — uses a real Spark session + FhirEncoders. + // Canonical references — uses the shared Spark session and FhirEncoders. // --------------------------------------------------------------------------- @Nested - @TestInstance(Lifecycle.PER_CLASS) class CanonicalReferences { - private SparkSession spark; - private FhirEncoders fhirEncoders; + @Autowired private SparkSession spark; + @Autowired private FhirEncoders fhirEncoders; + private DataSource dataSource; private LibraryReferenceResolver resolver; - @BeforeAll - void setUpAll() { - spark = - SparkSession.builder() - .master("local[1]") - .appName("LibraryReferenceResolverTest") - .config("spark.driver.bindAddress", "localhost") - .config("spark.driver.host", "localhost") - .config("spark.ui.enabled", false) - .config("spark.sql.shuffle.partitions", 1) - .getOrCreate(); - fhirEncoders = FhirEncoders.forR4().getOrCreate(); - } - - @AfterAll - void tearDownAll() { - if (spark != null) { - spark.stop(); - } - } - @BeforeEach void setUp() { dataSource = mock(DataSource.class); diff --git a/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryResultStreamerTest.java b/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryResultStreamerTest.java index 
be5d7404cc..d4d3b70985 100644 --- a/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryResultStreamerTest.java +++ b/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryResultStreamerTest.java @@ -19,6 +19,7 @@ import static org.assertj.core.api.Assertions.assertThat; +import au.csiro.pathling.test.SpringBootUnitTest; import java.nio.charset.StandardCharsets; import java.util.List; import org.apache.spark.sql.Dataset; @@ -27,44 +28,21 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestInstance; -import org.junit.jupiter.api.TestInstance.Lifecycle; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.mock.web.MockHttpServletResponse; /** - * Tests for {@link SqlQueryResultStreamer} covering each output format. Uses a real local - * SparkSession to materialise a small Dataset and a Spring {@link MockHttpServletResponse} to - * capture written bytes and headers. + * Tests for {@link SqlQueryResultStreamer} covering each output format. Uses the shared Spark + * session to materialise a small Dataset and a Spring {@link MockHttpServletResponse} to capture + * written bytes and headers. 
*/ -@TestInstance(Lifecycle.PER_CLASS) +@SpringBootUnitTest class SqlQueryResultStreamerTest { - private SparkSession spark; - private SqlQueryResultStreamer streamer; - - @BeforeAll - void setUpAll() { - spark = - SparkSession.builder() - .master("local[1]") - .appName("SqlQueryResultStreamerTest") - .config("spark.driver.bindAddress", "localhost") - .config("spark.driver.host", "localhost") - .config("spark.ui.enabled", false) - .config("spark.sql.shuffle.partitions", 1) - .getOrCreate(); - streamer = new SqlQueryResultStreamer(); - } + @Autowired private SparkSession spark; - @AfterAll - void tearDownAll() { - if (spark != null) { - spark.stop(); - } - } + private final SqlQueryResultStreamer streamer = new SqlQueryResultStreamer(); @Test void streamsNdjsonWithUtf8Encoding() { diff --git a/server/src/test/java/au/csiro/pathling/operations/sqlquery/ViewRegistrationServiceTest.java b/server/src/test/java/au/csiro/pathling/operations/sqlquery/ViewRegistrationServiceTest.java index e81780bbf6..c42e3985ba 100644 --- a/server/src/test/java/au/csiro/pathling/operations/sqlquery/ViewRegistrationServiceTest.java +++ b/server/src/test/java/au/csiro/pathling/operations/sqlquery/ViewRegistrationServiceTest.java @@ -20,6 +20,7 @@ import static org.assertj.core.api.Assertions.assertThat; import au.csiro.pathling.config.ServerConfiguration; +import au.csiro.pathling.test.SpringBootUnitTest; import ca.uhn.fhir.context.FhirContext; import java.util.Arrays; import java.util.List; @@ -35,42 +36,26 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestInstance; -import org.junit.jupiter.api.TestInstance.Lifecycle; +import org.springframework.beans.factory.annotation.Autowired; /** * Tests for {@link 
ViewRegistrationService}, with particular attention to the request-id * namespacing that prevents concurrent {@code $sqlquery-run} requests from clobbering one another's * temporary views in Spark's session-global catalog. */ -@TestInstance(Lifecycle.PER_CLASS) +@SpringBootUnitTest class ViewRegistrationServiceTest { - private SparkSession spark; + @Autowired private SparkSession spark; + @Autowired private FhirContext fhirContext; + private ViewRegistrationService service; - @BeforeAll + @BeforeEach void setUp() { - spark = - SparkSession.builder() - .master("local[2]") - .appName("ViewRegistrationServiceTest") - .config("spark.driver.bindAddress", "localhost") - .config("spark.driver.host", "localhost") - .config("spark.ui.enabled", false) - .config("spark.sql.shuffle.partitions", 1) - .getOrCreate(); - service = new ViewRegistrationService(spark, FhirContext.forR4(), new ServerConfiguration()); - } - - @AfterAll - void tearDown() { - if (spark != null) { - spark.stop(); - } + service = new ViewRegistrationService(spark, fhirContext, new ServerConfiguration()); } // --------------------------------------------------------------------------- From 006d056be9b55bb0cc6a11dc53e0515ac01f4895 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 13:35:30 +1000 Subject: [PATCH 19/41] fix: Clear SecurityContext after each test in BulkSubmitProviderTest BulkSubmitProviderTest was installing a Mockito mock as the active Spring SecurityContext in @BeforeEach but never clearing it. Under JUnit 5 parallel execution the mock leaked onto adjacent threads, causing SearchProviderAuthTest to inherit a mock context in which setAuthentication() is a no-op, so checkHasAuthority() would throw "Token not present". Closes #2617. 
Co-Authored-By: Claude Sonnet 4.6 --- .../operations/bulksubmit/BulkSubmitProviderTest.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitProviderTest.java b/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitProviderTest.java index 05f2e8fd37..13edb97344 100644 --- a/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitProviderTest.java +++ b/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitProviderTest.java @@ -34,6 +34,7 @@ import java.util.Optional; import org.hl7.fhir.r4.model.Parameters; import org.hl7.fhir.r4.model.StringType; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.springframework.security.core.Authentication; @@ -81,6 +82,11 @@ void setUp() { SecurityContextHolder.setContext(securityContext); } + @AfterEach + void tearDown() { + SecurityContextHolder.clearContext(); + } + // ======================================== // In-Progress Submission Tests // ======================================== From 906ef4933ed89a2e582e4d9a7402e93f38ad0b1b Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 17:01:04 +1000 Subject: [PATCH 20/41] fix: Match staged-file URIs in $import-pnp allowlist Hadoop Path.toUri() drops the empty authority on file:// paths built via new Path(parent, child), yielding file:/path. Files discovered later via fs.listFiles + fs.makeQualified preserve the empty authority and come back as file:///path. UrlAllowlist's string-prefix match then rejects the downloaded file URLs against the staging-directory prefix, failing the import with an AccessDeniedError after the bulk export has already completed. Build the prefix via fs.getFileStatus so both sides use the same canonical URI form. 
Co-Authored-By: Claude Sonnet 4.6 --- .../operations/bulkimport/ImportPnpExecutor.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/au/csiro/pathling/operations/bulkimport/ImportPnpExecutor.java b/server/src/main/java/au/csiro/pathling/operations/bulkimport/ImportPnpExecutor.java index 98d95b1da2..069f27a970 100644 --- a/server/src/main/java/au/csiro/pathling/operations/bulkimport/ImportPnpExecutor.java +++ b/server/src/main/java/au/csiro/pathling/operations/bulkimport/ImportPnpExecutor.java @@ -158,9 +158,12 @@ public ImportResponse execute(@Nonnull final ImportPnpRequest pnpRequest, final // Execute the import using the existing ImportExecutor with custom allowable sources. // This bypasses the configured allowableSources validation for the staging directory, - // which the server downloaded and trusts. The qualified URI ensures the prefix matches - // whatever scheme the staging file system uses. - final List pnpAllowableSources = List.of(tempDir.toUri().toString() + "/"); + // which the server downloaded and trusts. Go via fs.getFileStatus() to obtain a URI in + // the same canonical form (with empty authority preserved on file://) that fs.listFiles + // produces for the downloaded files, so the UrlAllowlist string-prefix match holds: + // tempDir.toUri() alone yields file:/path while listed files come back as file:///path. + final List pnpAllowableSources = + List.of(fs.getFileStatus(tempDir).getPath().toUri().toString() + "/"); final ImportResponse response = importExecutor.execute(importRequest, jobId, pnpAllowableSources); From ca1ce49c07e0e5f7cb5d27e89813c359209588cc Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 17:01:23 +1000 Subject: [PATCH 21/41] fix: Clear Delta cache between $import-pnp integration tests The shared static warehouse @TempDir is cleaned in @AfterEach, but Spark's catalog cache and Delta's global DeltaLog cache still hold references to the deleted tables. 
The next test rebuilds the warehouse from test fixtures, but isDeltaTable returns false against the stale log, so the import falls through to an ERROR_IF_EXISTS write that collides with the freshly-copied directory and fails with DELTA_PATH_EXISTS. Clear both caches before deleting files so cleanup restores both the on-disk and in-memory state. Co-Authored-By: Claude Sonnet 4.6 --- .../pathling/operations/bulkimport/ImportPnpOperationIT.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java b/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java index b20a3e3261..0f1075ff0c 100644 --- a/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java +++ b/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java @@ -173,6 +173,11 @@ void setup() { @AfterEach void cleanup() throws IOException { + // Clear cached Delta table state before deleting files. Otherwise the next test sees a stale + // DeltaLog in memory that no longer matches the on-disk warehouse rebuilt from test fixtures, + // and Delta refuses the import with DELTA_PATH_EXISTS. + pathlingContext.getSpark().catalog().clearCache(); + org.apache.spark.sql.delta.DeltaLog.clearCache(); FileUtils.cleanDirectory(warehouseDir.toFile()); } From 780db513ed9a060686510d05f6d5b27f658672d6 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 17:01:31 +1000 Subject: [PATCH 22/41] fix: Send auth token in poisoned-manifest exfiltration test The test runs under the integration-test profile with PNP credentials configured and auth enabled, but its requests were missing the Authorization header. The pre-existing 401 was hidden by an earlier PNP allowlist bug; with that fix in place, the auth interlock now rejects the request before the poisoning scenario can exercise. 
Co-Authored-By: Claude Sonnet 4.6 --- .../pathling/operations/bulkimport/ImportPnpOperationIT.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java b/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java index 0f1075ff0c..ff331f9b82 100644 --- a/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java +++ b/server/src/test/java/au/csiro/pathling/operations/bulkimport/ImportPnpOperationIT.java @@ -701,6 +701,7 @@ void testPoisonedManifestTypeFailsJobAndBlocksExfiltration() throws IOException .header("Content-Type", "application/fhir+json") .header("Accept", "application/fhir+json") .header("Prefer", "respond-async") + .header("Authorization", "Bearer " + AUTH_TOKEN) .bodyValue(requestBody) .exchange() .expectStatus() @@ -736,6 +737,7 @@ void testPoisonedManifestTypeFailsJobAndBlocksExfiltration() throws IOException .get() .uri(contentLocation) .header("Accept", "application/fhir+json") + .header("Authorization", "Bearer " + AUTH_TOKEN) .exchange() .expectStatus() .is4xxClientError() @@ -752,6 +754,7 @@ void testPoisonedManifestTypeFailsJobAndBlocksExfiltration() throws IOException webTestClient .get() .uri("http://localhost:" + port + "/jobs/" + jobId + "/escaped.0000.ndjson") + .header("Authorization", "Bearer " + AUTH_TOKEN) .exchange() .expectStatus() .isNotFound(); From aa4fdc6862032aea1776c9b018437d2f7d946557 Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 17:01:39 +1000 Subject: [PATCH 23/41] fix: Include WireMock port in $bulk-submit IT allowable sources The integration tests configured pathling.bulk-submit.allowable-sources to bare http://localhost via @TestPropertySource. The URI-aware UrlAllowlist resolves that prefix to effective port 80 and no longer matches the dynamic http://localhost:{wireMockPort} the tests actually use. 
Move the property into @DynamicPropertySource so it picks up the WireMock port at runtime. Co-Authored-By: Claude Sonnet 4.6 --- .../pathling/operations/bulksubmit/BulkSubmitOAuthIT.java | 6 +++++- .../operations/bulksubmit/BulkSubmitOperationIT.java | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOAuthIT.java b/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOAuthIT.java index 6b0e3b38e6..a066e582ca 100644 --- a/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOAuthIT.java +++ b/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOAuthIT.java @@ -71,7 +71,6 @@ @TestPropertySource( properties = { "pathling.async.enabled=true", - "pathling.bulk-submit.allowable-sources[0]=http://localhost", // Configure submitter with OAuth credentials for symmetric (client_secret) auth. "pathling.bulk-submit.allowed-submitters[0].system=http://example.org/submitters", "pathling.bulk-submit.allowed-submitters[0].value=oauth-submitter", @@ -118,6 +117,11 @@ static void configureProperties(final DynamicPropertyRegistry registry) { TestDataSetup.copyTestDataToTempDir(warehouseDir, "Condition"); registry.add("pathling.storage.warehouseUrl", () -> "file://" + warehouseDir.toAbsolutePath()); registry.add("pathling.bulk-submit.staging-directory", stagingDir::toString); + // The allowable source must include the WireMock port. Bare "http://localhost" (port 80) + // no longer matches "http://localhost:{port}" under the URI-aware UrlAllowlist matching. 
+ registry.add( + "pathling.bulk-submit.allowable-sources[0]", + () -> "http://localhost:" + wireMockServer.port()); } @BeforeEach diff --git a/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOperationIT.java b/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOperationIT.java index 656af2200b..d8ebbe33ea 100644 --- a/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOperationIT.java +++ b/server/src/test/java/au/csiro/pathling/operations/bulksubmit/BulkSubmitOperationIT.java @@ -67,7 +67,6 @@ @TestPropertySource( properties = { "pathling.async.enabled=true", - "pathling.bulk-submit.allowable-sources[0]=http://localhost", "pathling.bulk-submit.allowed-submitters[0].system=http://example.org/submitters", "pathling.bulk-submit.allowed-submitters[0].value=test-submitter" }) @@ -107,6 +106,11 @@ static void configureProperties(final DynamicPropertyRegistry registry) { TestDataSetup.copyTestDataToTempDir(warehouseDir, "Condition"); registry.add("pathling.storage.warehouseUrl", () -> "file://" + warehouseDir.toAbsolutePath()); registry.add("pathling.bulk-submit.staging-directory", stagingDir::toString); + // The allowable source must include the WireMock port. Bare "http://localhost" (port 80) + // no longer matches "http://localhost:{port}" under the URI-aware UrlAllowlist matching. + registry.add( + "pathling.bulk-submit.allowable-sources[0]", + () -> "http://localhost:" + wireMockServer.port()); } @BeforeEach From e1ad55baea97421ea05cd7cc92fc54ac41f32f2e Mon Sep 17 00:00:00 2001 From: Piotr Szul Date: Tue, 19 May 2026 18:16:38 +1000 Subject: [PATCH 24/41] fix: Set 60s response timeout on SqlQueryRunDeltaIT WebTestClient The test relied on the WebTestClient default response timeout of 5 s, which is shorter than the cold-start latency of the first POST against a freshly started Spring Boot context with a Delta-backed warehouse. Match the 60 s timeout already used by the sibling integration tests. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../csiro/pathling/operations/sqlquery/SqlQueryRunDeltaIT.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryRunDeltaIT.java b/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryRunDeltaIT.java index 7769e11e94..d926e01b50 100644 --- a/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryRunDeltaIT.java +++ b/server/src/test/java/au/csiro/pathling/operations/sqlquery/SqlQueryRunDeltaIT.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; +import java.time.Duration; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -106,6 +107,7 @@ void setup() { webTestClient .mutate() .codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(100 * 1024 * 1024)) + .responseTimeout(Duration.ofSeconds(60)) .build(); jsonParser = fhirContext.newJsonParser(); } From 19f563ff914086430bfc108a2ecfa1185bcad66d Mon Sep 17 00:00:00 2001 From: John Grimes Date: Sun, 17 May 2026 10:54:57 +0200 Subject: [PATCH 25/41] Update OpenSpec files --- .claude/commands/opsx-apply.md | 152 +++++ .claude/commands/opsx-archive.md | 156 +++++ .claude/commands/opsx-bulk-archive.md | 245 ++++++++ .claude/commands/opsx-continue.md | 116 ++++ .claude/commands/opsx-explore.md | 178 ++++++ .claude/commands/opsx-ff.md | 101 +++ .claude/commands/opsx-new.md | 75 +++ .claude/commands/opsx-onboard.md | 567 +++++++++++++++++ .claude/commands/opsx-sync.md | 137 +++++ .claude/commands/opsx-verify.md | 164 +++++ .claude/commands/opsx/apply.md | 13 +- .claude/commands/opsx/archive.md | 11 +- .claude/commands/opsx/bulk-archive.md | 14 +- .claude/commands/opsx/continue.md | 5 - .claude/commands/opsx/explore.md | 18 +- .claude/commands/opsx/ff.md | 9 +- .claude/commands/opsx/new.md | 1 - .claude/commands/opsx/onboard.md | 77 ++- .claude/commands/opsx/sync.md | 7 - 
.claude/commands/opsx/verify.md | 18 +- .claude/skills/openspec-apply-change/SKILL.md | 17 +- .../skills/openspec-archive-change/SKILL.md | 11 +- .../openspec-bulk-archive-change/SKILL.md | 16 +- .../skills/openspec-continue-change/SKILL.md | 7 +- .claude/skills/openspec-explore/SKILL.md | 27 +- .claude/skills/openspec-ff-change/SKILL.md | 8 +- .claude/skills/openspec-new-change/SKILL.md | 3 +- .claude/skills/openspec-onboard/SKILL.md | 85 ++- .claude/skills/openspec-sync-specs/SKILL.md | 9 +- .../skills/openspec-verify-change/SKILL.md | 20 +- .pi/prompts/opsx-apply.md | 153 +++++ .pi/prompts/opsx-archive.md | 157 +++++ .pi/prompts/opsx-bulk-archive.md | 246 ++++++++ .pi/prompts/opsx-continue.md | 117 ++++ .pi/prompts/opsx-explore.md | 179 ++++++ .pi/prompts/opsx-ff.md | 102 ++++ .pi/prompts/opsx-new.md | 76 +++ .pi/prompts/opsx-onboard.md | 567 +++++++++++++++++ .pi/prompts/opsx-sync.md | 138 +++++ .pi/prompts/opsx-verify.md | 165 +++++ .pi/skills/openspec-apply-change/SKILL.md | 159 +++++ .pi/skills/openspec-archive-change/SKILL.md | 116 ++++ .../openspec-bulk-archive-change/SKILL.md | 252 ++++++++ .pi/skills/openspec-continue-change/SKILL.md | 123 ++++ .pi/skills/openspec-explore/SKILL.md | 299 +++++++++ .pi/skills/openspec-ff-change/SKILL.md | 108 ++++ .pi/skills/openspec-new-change/SKILL.md | 83 +++ .pi/skills/openspec-onboard/SKILL.md | 574 ++++++++++++++++++ .pi/skills/openspec-sync-specs/SKILL.md | 144 +++++ .pi/skills/openspec-verify-change/SKILL.md | 171 ++++++ 50 files changed, 5975 insertions(+), 221 deletions(-) create mode 100644 .claude/commands/opsx-apply.md create mode 100644 .claude/commands/opsx-archive.md create mode 100644 .claude/commands/opsx-bulk-archive.md create mode 100644 .claude/commands/opsx-continue.md create mode 100644 .claude/commands/opsx-explore.md create mode 100644 .claude/commands/opsx-ff.md create mode 100644 .claude/commands/opsx-new.md create mode 100644 .claude/commands/opsx-onboard.md create mode 100644 
.claude/commands/opsx-sync.md create mode 100644 .claude/commands/opsx-verify.md create mode 100644 .pi/prompts/opsx-apply.md create mode 100644 .pi/prompts/opsx-archive.md create mode 100644 .pi/prompts/opsx-bulk-archive.md create mode 100644 .pi/prompts/opsx-continue.md create mode 100644 .pi/prompts/opsx-explore.md create mode 100644 .pi/prompts/opsx-ff.md create mode 100644 .pi/prompts/opsx-new.md create mode 100644 .pi/prompts/opsx-onboard.md create mode 100644 .pi/prompts/opsx-sync.md create mode 100644 .pi/prompts/opsx-verify.md create mode 100644 .pi/skills/openspec-apply-change/SKILL.md create mode 100644 .pi/skills/openspec-archive-change/SKILL.md create mode 100644 .pi/skills/openspec-bulk-archive-change/SKILL.md create mode 100644 .pi/skills/openspec-continue-change/SKILL.md create mode 100644 .pi/skills/openspec-explore/SKILL.md create mode 100644 .pi/skills/openspec-ff-change/SKILL.md create mode 100644 .pi/skills/openspec-new-change/SKILL.md create mode 100644 .pi/skills/openspec-onboard/SKILL.md create mode 100644 .pi/skills/openspec-sync-specs/SKILL.md create mode 100644 .pi/skills/openspec-verify-change/SKILL.md diff --git a/.claude/commands/opsx-apply.md b/.claude/commands/opsx-apply.md new file mode 100644 index 0000000000..201edf0922 --- /dev/null +++ b/.claude/commands/opsx-apply.md @@ -0,0 +1,152 @@ +--- +description: Implement tasks from an OpenSpec change (Experimental) +--- + +Implement tasks from an OpenSpec change. + +**Input**: Optionally specify a change name (e.g., `/opsx-apply add-auth`). If omitted, check if it can be inferred from conversation context. If vague or ambiguous you MUST prompt for available changes. + +**Steps** + +1. **Select the change** + + If a name is provided, use it. 
Otherwise: + - Infer from conversation context if the user mentioned a change + - Auto-select if only one active change exists + - If ambiguous, run `openspec list --json` to get available changes and use the **AskUserQuestion tool** to let the user select + + Always announce: "Using change: <name>" and how to override (e.g., `/opsx-apply <other>`). + +2. **Check status to understand the schema** + + ```bash + openspec status --change "<name>" --json + ``` + + Parse the JSON to understand: + - `schemaName`: The workflow being used (e.g., "spec-driven") + - Which artifact contains the tasks (typically "tasks" for spec-driven, check status for others) + +3. **Get apply instructions** + + ```bash + openspec instructions apply --change "<name>" --json + ``` + + This returns: + - `contextFiles`: artifact ID -> array of concrete file paths (varies by schema) + - Progress (total, complete, remaining) + - Task list with status + - Dynamic instruction based on current state + + **Handle states:** + - If `state: "blocked"` (missing artifacts): show message, suggest using `/opsx-continue` + - If `state: "all_done"`: congratulate, suggest archive + - Otherwise: proceed to implementation + +4. **Read context files** + + Read every file path listed under `contextFiles` from the apply instructions output. + The files depend on the schema being used: + - **spec-driven**: proposal, specs, design, tasks + - Other schemas: follow the contextFiles from CLI output + +5. **Show current progress** + + Display: + - Schema being used + - Progress: "N/M tasks complete" + - Remaining tasks overview + - Dynamic instruction from CLI + +6.
**Implement tasks (loop until done or blocked)** + + For each pending task: + - Show which task is being worked on + - Make the code changes required + - Keep changes minimal and focused + - Mark task complete in the tasks file: `- [ ]` → `- [x]` + - Continue to next task + + **Pause if:** + - Task is unclear → ask for clarification + - Implementation reveals a design issue → suggest updating artifacts + - Error or blocker encountered → report and wait for guidance + - User interrupts + +7. **On completion or pause, show status** + + Display: + - Tasks completed this session + - Overall progress: "N/M tasks complete" + - If all done: suggest archive + - If paused: explain why and wait for guidance + +**Output During Implementation** + +``` +## Implementing: <change-name> (schema: <schema-name>) + +Working on task 3/7: <task description> +[...implementation happening...] +✓ Task complete + +Working on task 4/7: <task description> +[...implementation happening...] +✓ Task complete +``` + +**Output On Completion** + +``` +## Implementation Complete + +**Change:** <change-name> +**Schema:** <schema-name> +**Progress:** 7/7 tasks complete ✓ + +### Completed This Session +- [x] Task 1 +- [x] Task 2 +... + +All tasks complete! You can archive this change with `/opsx-archive`. +``` + +**Output On Pause (Issue Encountered)** + +``` +## Implementation Paused + +**Change:** <change-name> +**Schema:** <schema-name> +**Progress:** 4/7 tasks complete + +### Issue Encountered +<description of the issue> + +**Options:** +1.