From 8a4412c77726872fc5c3685f063a3e902c1ae328 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Sat, 23 May 2026 01:26:36 +0000 Subject: [PATCH] Register DISTINCT_COUNT_APPROX logical marker in PPLFuncImpTable PPL parser fails with "Cannot resolve function: DISTINCT_COUNT_APPROX" on any execution path that does not construct OpenSearchExecutionEngine (unified-query / analytics-engine / DataFusion), because the UDAF was only registered to PPLFuncImpTable.aggExternalFunctionRegistry as a side effect of that constructor. Add a logical marker class DistinctCountApproxLogicalAggFunction in core, expose it as PPLBuiltinOperators.DISTINCT_COUNT_APPROX, and register it inside PPLFuncImpTable.AggBuilder.populate() alongside other built-in aggregates. The marker has no JVM execution: init / add / result throw UnsupportedOperationException, mirroring the pattern already used by RelevanceQueryFunction.RelevanceQueryImplementor for match-family functions which similarly have no JVM semantics. Behavior on V3 path is preserved: OpenSearchExecutionEngine still registers the real HyperLogLog++ DistinctCountApproxAggFunction in aggExternalFunctionRegistry, and getImplementation() consults that external registry first, so the marker is overridden whenever the V3 constructor has run. AggregateAnalyzer continues to translate the operator to OpenSearch cardinality DSL via the BuiltinFunctionName switch which is independent of the wrapped SqlAggFunction instance. Operand metadata for the marker is null to match the existing external registration's permissive type policy and avoid introducing new type rejections that would surface as regressions in existing dc IT. Signed-off-by: Songkan Tang --- ...DistinctCountApproxLogicalAggFunction.java | 61 +++++++++++++++++++ .../function/PPLBuiltinOperators.java | 17 ++++++ .../expression/function/PPLFuncImpTable.java | 8 +++ 3 files changed, 86 insertions(+) create mode 100644 core/src/main/java/org/opensearch/sql/calcite/udf/udaf/DistinctCountApproxLogicalAggFunction.java diff --git a/core/src/main/java/org/opensearch/sql/calcite/udf/udaf/DistinctCountApproxLogicalAggFunction.java b/core/src/main/java/org/opensearch/sql/calcite/udf/udaf/DistinctCountApproxLogicalAggFunction.java new file mode 100644 index 00000000000..4a18fe35a5d --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/calcite/udf/udaf/DistinctCountApproxLogicalAggFunction.java @@ -0,0 +1,61 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.udf.udaf; + +import org.opensearch.sql.calcite.udf.UserDefinedAggFunction; + +/** + * Logical marker UDAF for {@code DISTINCT_COUNT_APPROX}. Lets the PPL parser produce a RelNode that + * contains the operator without committing to a JVM execution path; backends are expected to push + * it down or rewrite it before execution: + * + * + * + *

This class deliberately throws on every method. Reaching a method body means a backend either + * failed to push down or did not register an adapter — that is a configuration bug, not a runtime + * fallback. {@code RelevanceQueryFunction.RelevanceQueryImplementor} (used by {@code match}, {@code + * match_phrase}, etc.) follows the same pattern for relevance search functions that have no JVM + * execution semantics. + */ +public class DistinctCountApproxLogicalAggFunction + implements UserDefinedAggFunction { + + private static final String NOT_EXECUTABLE = + "DISTINCT_COUNT_APPROX logical marker reached Enumerable execution; " + + "an engine-specific implementation must be registered or rewritten before execution."; + + @Override + public MarkerAccumulator init() { + throw new UnsupportedOperationException(NOT_EXECUTABLE); + } + + @Override + public MarkerAccumulator add(MarkerAccumulator acc, Object... values) { + throw new UnsupportedOperationException(NOT_EXECUTABLE); + } + + @Override + public Object result(MarkerAccumulator accumulator) { + throw new UnsupportedOperationException(NOT_EXECUTABLE); + } + + /** Placeholder accumulator. Never actually constructed because {@link #init()} throws. */ + public static class MarkerAccumulator implements Accumulator { + @Override + public Object value(Object... argList) { + throw new UnsupportedOperationException(NOT_EXECUTABLE); + } + } +} diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java index 4b7ee41f016..60172e70c84 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java @@ -29,6 +29,7 @@ import org.apache.calcite.sql.type.SqlTypeTransforms; import org.apache.calcite.sql.util.ReflectiveSqlOperatorTable; import org.apache.calcite.util.BuiltInMethod; +import org.opensearch.sql.calcite.udf.udaf.DistinctCountApproxLogicalAggFunction; import org.opensearch.sql.calcite.udf.udaf.FirstAggFunction; import org.opensearch.sql.calcite.udf.udaf.LastAggFunction; import org.opensearch.sql.calcite.udf.udaf.ListAggFunction; @@ -508,6 +509,22 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable { PPLReturnTypes.STRING_ARRAY, PPLOperandTypes.ANY_SCALAR_OPTIONAL_INTEGER); + /** + * Logical marker for {@code DISTINCT_COUNT_APPROX} (also exposed as {@code dc} and {@code + * distinct_count} aliases). PPL parser uses this to produce a RelNode; backends override or + * rewrite it before execution. {@code OpenSearchExecutionEngine} registers a real HyperLogLog++ + * implementation in the external registry of {@code PPLFuncImpTable}, which has lookup precedence + * and serves the OpenSearch V3 path. Other backends (DataFusion / analytics-engine) rewrite the + * operator on their own. Operand metadata is {@code null} to match the existing external + * registration's permissive policy and avoid introducing new type rejections. + */ + public static final SqlAggFunction DISTINCT_COUNT_APPROX = + createUserDefinedAggFunction( + DistinctCountApproxLogicalAggFunction.class, + "DISTINCT_COUNT_APPROX", + ReturnTypes.BIGINT_FORCE_NULLABLE, + null); + public static final SqlOperator ENHANCED_COALESCE = new EnhancedCoalesceFunction().toUDF("COALESCE"); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index c8f50c60596..6c8aecffec1 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -60,6 +60,7 @@ import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_WEEK; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DAY_OF_YEAR; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DEGREES; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.DISTINCT_COUNT_APPROX; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DIVIDE; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DIVIDEFUNCTION; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DUR2SEC; @@ -1426,6 +1427,13 @@ void populate() { registerOperator(INTERNAL_PATTERN, PPLBuiltinOperators.INTERNAL_PATTERN); registerOperator(LIST, PPLBuiltinOperators.LIST); registerOperator(VALUES, PPLBuiltinOperators.VALUES); + // Logical marker so PPL parser succeeds on dc()/distinct_count()/distinct_count_approx() + // regardless of which execution path the query takes. OpenSearchExecutionEngine registers + // a real HyperLogLog++ implementation in aggExternalFunctionRegistry which overrides this + // marker via the external-first lookup precedence in getImplementation(). Other backends + // (DataFusion / analytics-engine) rewrite the operator before substrait emission and never + // execute the marker. + registerOperator(DISTINCT_COUNT_APPROX, PPLBuiltinOperators.DISTINCT_COUNT_APPROX); register( AVG,