Merge pull request ClickHouse#92811 from Blargian/aggregate_functions_5

Blargian · web-flow · commit 0c359bad5cd4 · 2025-12-24T14:16:30.000Z
Docs: move aggregate functions docs to source (#5)
diff --git a/src/AggregateFunctions/AggregateFunctionAny.cpp b/src/AggregateFunctions/AggregateFunctionAny.cpp
@@ -395,12 +395,103 @@ AggregateFunctionPtr createAggregateFunctionAnyLast(
 
 void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
 {
+    /// any documentation
+    FunctionDocumentation::Description description = R"(
+Selects the first encountered value of a column.
+
+:::warning
+As a query can be executed in arbitrary order, the result of this function is non-deterministic. If you need an arbitrary but deterministic result, use functions min or max.
+:::
+
+By default, the function never returns NULL, i.e. ignores NULL values in the input column.
+However, if the function is used with the `RESPECT NULLS` modifier, it returns the first value read, regardless of whether it is NULL.
+
+**Implementation details**
+
+In some cases, you can rely on the order of execution.
+This applies to cases when `SELECT` comes from a subquery that uses `ORDER BY`.
+
+When a `SELECT` query has the `GROUP BY` clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the `SELECT`, `HAVING`, and `ORDER BY` clauses be calculated from keys or from aggregate functions.
+In other words, each column selected from the table must be used either in keys or inside aggregate functions.
+To get behavior like in MySQL, you can put the other columns in the `any` aggregate function.
+    )";
+    FunctionDocumentation::Syntax syntax = "any(column) [RESPECT NULLS]";
+    FunctionDocumentation::Arguments arguments = {
+        {"column", "The column name.", {"Any"}}
+    };
+    FunctionDocumentation::ReturnedValue returned_value = {R"(
+Returns the first value encountered.
+
+:::note
+The return type of the function is the same as the input, except for LowCardinality which is discarded.
+This means that given no rows as input it will return the default value of that type (0 for integers, or NULL for a Nullable column).
+You might use the -OrNull combinator to modify this behaviour.
+:::
+    )",
+    {"Any"}
+    };
+    FunctionDocumentation::Examples examples = {
+    {
+        "Usage example",
+        R"(
+CREATE TABLE tab (city Nullable(String)) ENGINE=Memory;
+INSERT INTO tab (city) VALUES (NULL), ('Amsterdam'), ('New York'), ('Tokyo'), ('Valencia'), (NULL);
+SELECT any(city), anyRespectNulls(city) FROM tab;
+        )",
+        R"(
+┌─any(city)─┬─anyRespectNulls(city)─┐
+│ Amsterdam │ ᴺᵁᴸᴸ                  │
+└───────────┴───────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation documentation = {description, syntax, arguments, {}, returned_value, examples, introduced_in, category};
+
     AggregateFunctionProperties default_properties = {.returns_default_when_only_null = false, .is_order_dependent = true};
 
-    factory.registerFunction("any", {createAggregateFunctionAny, default_properties});
+    factory.registerFunction("any", {createAggregateFunctionAny, default_properties, documentation});
     factory.registerAlias("any_value", "any", AggregateFunctionFactory::Case::Insensitive);
     factory.registerAlias("first_value", "any", AggregateFunctionFactory::Case::Insensitive);
-    factory.registerFunction("anyLast", {createAggregateFunctionAnyLast, default_properties});
+
+    /// anyLast documentation
+    FunctionDocumentation::Description anyLast_description = R"(
+Selects the last encountered value of a column.
+
+:::warning
+As a query can be executed in arbitrary order, the result of this function is non-deterministic.
+If you need an arbitrary but deterministic result, use functions min or max.
+:::
+
+By default, the function never returns NULL, i.e. ignores NULL values in the input column.
+However, if the function is used with the `RESPECT NULLS` modifier, it returns the last value read, regardless of whether it is NULL.
+    )";
+    FunctionDocumentation::Syntax anyLast_syntax = "anyLast(column) [RESPECT NULLS]";
+    FunctionDocumentation::Arguments anyLast_arguments = {
+        {"column", "The column name.", {"Any"}}
+    };
+    FunctionDocumentation::ReturnedValue anyLast_returned_value = {"The last value encountered.", {"Any"}};
+    FunctionDocumentation::Examples anyLast_examples = {
+    {
+        "Usage example",
+        R"(
+CREATE TABLE tab (city Nullable(String)) ENGINE=Memory;
+INSERT INTO tab (city) VALUES ('Amsterdam'), (NULL), ('New York'), ('Tokyo'), ('Valencia'), (NULL);
+SELECT anyLast(city), anyLastRespectNulls(city) FROM tab;
+        )",
+        R"(
+┌─anyLast(city)─┬─anyLastRespectNulls(city)─┐
+│ Valencia      │ ᴺᵁᴸᴸ                      │
+└───────────────┴───────────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::IntroducedIn anyLast_introduced_in = {1, 1};
+    FunctionDocumentation::Category anyLast_category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation anyLast_documentation = {anyLast_description, anyLast_syntax, anyLast_arguments, {}, anyLast_returned_value, anyLast_examples, anyLast_introduced_in, anyLast_category};
+
+    factory.registerFunction("anyLast", {createAggregateFunctionAnyLast, default_properties, anyLast_documentation}, AggregateFunctionFactory::Case::Sensitive);
     factory.registerAlias("last_value", "anyLast", AggregateFunctionFactory::Case::Insensitive);
 }
 }
diff --git a/src/AggregateFunctions/AggregateFunctionFourthMoment.cpp b/src/AggregateFunctions/AggregateFunctionFourthMoment.cpp
@@ -9,8 +9,72 @@ template <typename T> using AggregateFunctionFourthMoment = AggregateFunctionVar
 
 void registerAggregateFunctionsStatisticsFourthMoment(AggregateFunctionFactory & factory)
 {
-    factory.registerFunction("kurtSamp", createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtSamp>);
-    factory.registerFunction("kurtPop", createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtPop>);
+    FunctionDocumentation::Description description_samp = R"(
+Computes the [sample kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence.
+
+It represents an unbiased estimate of the kurtosis of a random variable if the passed values form a sample of it.
+    )";
+    FunctionDocumentation::Syntax syntax_samp = R"(
+kurtSamp(expr)
+    )";
+    FunctionDocumentation::Arguments arguments_samp = {
+        {"expr", "[Expression](/sql-reference/syntax#expressions) returning a number.", {"(U)Int*", "Float*", "Decimal"}}
+    };
+    FunctionDocumentation::Parameters parameters = {};
+    FunctionDocumentation::ReturnedValue returned_value_samp = {"Returns the kurtosis of the given distribution. If `n <= 1` (`n` is the size of the sample), then the function returns `nan`.", {"Float64"}};
+    FunctionDocumentation::Examples examples_samp = {
+    {
+        "Computing sample kurtosis",
+        R"(
+CREATE TABLE test_data (x Float64) ENGINE = Memory;
+INSERT INTO test_data VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);
+
+SELECT kurtSamp(x) FROM test_data;
+        )",
+        R"(
+┌────────kurtSamp(x)─┐
+│ 1.4383636363636365 │
+└────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::IntroducedIn introduced_in_samp = {20, 1};
+    FunctionDocumentation::Category category_samp = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation documentation_samp = {description_samp, syntax_samp, arguments_samp, {}, returned_value_samp, examples_samp, introduced_in_samp, category_samp};
+
+    factory.registerFunction("kurtSamp", {createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtSamp>, {}, documentation_samp});
+
+    FunctionDocumentation::Description description = R"(
+Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence.
+    )";
+    FunctionDocumentation::Syntax syntax = R"(
+kurtPop(expr)
+    )";
+    FunctionDocumentation::Arguments arguments = {
+        {"expr", "[Expression](/sql-reference/syntax#expressions) returning a number.", {"(U)Int*", "Float*", "Decimal"}}
+    };
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns the kurtosis of the given distribution.", {"Float64"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Computing kurtosis",
+        R"(
+CREATE TABLE test_data (x Float64) ENGINE = Memory;
+INSERT INTO test_data VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);
+
+SELECT kurtPop(x) FROM test_data;
+        )",
+        R"(
+┌─────────kurtPop(x)─┐
+│ 1.7757575757575756 │
+└────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
+
+    factory.registerFunction("kurtPop", {createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtPop>, {}, documentation});
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.cpp b/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.cpp
@@ -353,7 +353,77 @@ AggregateFunctionPtr createAggregateFunctionKolmogorovSmirnovTest(
 
 void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory)
 {
-    factory.registerFunction("kolmogorovSmirnovTest", createAggregateFunctionKolmogorovSmirnovTest, AggregateFunctionFactory::Case::Insensitive);
+    FunctionDocumentation::Description description = R"(
+Applies Kolmogorov-Smirnov's test to samples from two populations.
+
+Values of both samples are in the `sample_data` column. If `sample_index` equals 0, then the value in that row belongs to the sample from the first population. Otherwise, it belongs to the sample from the second population.
+Samples must belong to continuous, one-dimensional probability distributions.
+    )";
+    FunctionDocumentation::Syntax syntax = R"(
+kolmogorovSmirnovTest([alternative, computation_method])(sample_data, sample_index)
+    )";
+    FunctionDocumentation::Arguments arguments = {
+        {"sample_data", "Sample data.", {"(U)Int*", "Float*", "Decimal"}},
+        {"sample_index", "Sample index.", {"(U)Int*"}}
+    };
+    FunctionDocumentation::Parameters parameters = {
+        {"alternative", "Alternative hypothesis. (Optional, default: 'two-sided'.) Let `F(x)` and `G(x)` be the CDFs of the first and second distributions respectively. 'two-sided': The null hypothesis is that samples come from the same distribution, i.e. `F(x) = G(x)` for all x. And the alternative is that the distributions are not identical. 'greater': The null hypothesis is that values in the first sample are stochastically smaller than those in the second one, i.e. the CDF of the first distribution lies above and hence to the left of that for the second one. Which in fact means that `F(x) >= G(x)` for all x. And the alternative in this case is that `F(x) < G(x)` for at least one x. 'less': The null hypothesis is that values in the first sample are stochastically greater than those in the second one, i.e. the CDF of the first distribution lies below and hence to the right of that for the second one. Which in fact means that `F(x) <= G(x)` for all x. And the alternative in this case is that `F(x) > G(x)` for at least one x.", {"String"}},
+        {"computation_method", "The method used to compute the p-value. (Optional, default: 'auto'.) 'exact': calculation is performed using the precise probability distribution of the test statistic. Compute-intensive and wasteful except for small samples. 'asymp' ('asymptotic'): calculation is performed using an approximation. For large sample sizes, the exact and asymptotic p-values are very similar. 'auto': the 'exact' method is used when the maximum number of samples is less than 10,000.", {"String"}}
+    };
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns a tuple with two elements: a calculated statistic and a calculated p-value.", {"Tuple(Float64, Float64)"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Same distribution test",
+        R"(
+SELECT kolmogorovSmirnovTest('less', 'exact')(value, num)
+FROM
+(
+    SELECT
+        randNormal(0, 10) AS value,
+        0 AS num
+    FROM numbers(10000)
+    UNION ALL
+    SELECT
+        randNormal(0, 10) AS value,
+        1 AS num
+    FROM numbers(10000)
+)
+        )",
+        R"(
+┌─kolmogorovSmirnovTest('less', 'exact')(value, num)─┐
+│ (0.009899999999999996,0.37528595205132287)         │
+└────────────────────────────────────────────────────┘
+        )"
+    },
+    {
+        "Different distributions test",
+        R"(
+SELECT kolmogorovSmirnovTest('two-sided', 'exact')(value, num)
+FROM
+(
+    SELECT
+        randStudentT(10) AS value,
+        0 AS num
+    FROM numbers(100)
+    UNION ALL
+    SELECT
+        randNormal(0, 10) AS value,
+        1 AS num
+    FROM numbers(100)
+)
+        )",
+        R"(
+┌─kolmogorovSmirnovTest('two-sided', 'exact')(value, num)─┐
+│ (0.4100000000000002,6.61735760482795e-8)                │
+└─────────────────────────────────────────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {23, 4};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
+
+    factory.registerFunction("kolmogorovSmirnovTest", {createAggregateFunctionKolmogorovSmirnovTest, {}, documentation}, AggregateFunctionFactory::Case::Insensitive);
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionLargestTriangleThreeBuckets.cpp b/src/AggregateFunctions/AggregateFunctionLargestTriangleThreeBuckets.cpp
@@ -357,7 +357,51 @@ createAggregateFunctionLargestTriangleThreeBuckets(const std::string & name, con
 
 void registerAggregateFunctionLargestTriangleThreeBuckets(AggregateFunctionFactory & factory)
 {
-    factory.registerFunction(AggregateFunctionLargestTriangleThreeBuckets::name, createAggregateFunctionLargestTriangleThreeBuckets);
+    FunctionDocumentation::Description description = R"(
+Applies the [Largest-Triangle-Three-Buckets](https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf) algorithm to the input data.
+The algorithm is used for downsampling time series data for visualization.
+It is designed to operate on series sorted by x coordinate.
+It works by dividing the sorted series into buckets and then finding the largest triangle in each bucket.
+The number of buckets is equal to the number of points in the resulting series.
+The function will sort data by `x` and then apply the downsampling algorithm to the sorted data.
+
+NaNs are ignored in the provided series, meaning that any NaN values will be excluded from the analysis.
+This ensures that the function operates only on valid numerical data.
+    )";
+    FunctionDocumentation::Syntax syntax = R"(
+largestTriangleThreeBuckets(n)(x, y)
+    )";
+    FunctionDocumentation::Arguments arguments = {
+        {"x", "x coordinate.", {"(U)Int*", "Float*", "Decimal", "Date", "Date32", "DateTime", "DateTime64"}},
+        {"y", "y coordinate.", {"(U)Int*", "Float*", "Decimal", "Date", "Date32", "DateTime", "DateTime64"}}
+    };
+    FunctionDocumentation::Parameters parameters = {
+        {"n", "Number of points in the resulting series.", {"UInt64"}}
+    };
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of tuples with two elements.", {"Array(Tuple(Float64, Float64))"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Downsampling time series data",
+        R"(
+CREATE TABLE largestTriangleThreeBuckets_test (x Float64, y Float64) ENGINE = Memory;
+INSERT INTO largestTriangleThreeBuckets_test VALUES
+    (1.0, 10.0), (2.0, 20.0), (3.0, 15.0), (8.0, 60.0), (9.0, 55.0),
+    (10.0, 70.0), (4.0, 30.0), (5.0, 40.0), (6.0, 35.0), (7.0, 50.0);
+
+SELECT largestTriangleThreeBuckets(4)(x, y) FROM largestTriangleThreeBuckets_test;
+        )",
+        R"(
+┌────────largestTriangleThreeBuckets(4)(x, y)───────────┐
+│           [(1,10),(3,15),(9,55),(10,70)]              │
+└───────────────────────────────────────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {23, 10};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
+
+    factory.registerFunction(AggregateFunctionLargestTriangleThreeBuckets::name, {createAggregateFunctionLargestTriangleThreeBuckets, {}, documentation});
     factory.registerAlias("lttb", AggregateFunctionLargestTriangleThreeBuckets::name);
 }
 
diff --git a/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp b/src/AggregateFunctions/AggregateFunctionMannWhitney.cpp
@@ -274,7 +274,49 @@ AggregateFunctionPtr createAggregateFunctionMannWhitneyUTest(
 
 void registerAggregateFunctionMannWhitney(AggregateFunctionFactory & factory)
 {
-    factory.registerFunction("mannWhitneyUTest", createAggregateFunctionMannWhitneyUTest);
+    FunctionDocumentation::Description description = R"(
+Applies the Mann-Whitney rank test to samples from two populations.
+
+Values of both samples are in the `sample_data` column.
+If `sample_index` equals 0, then the value in that row belongs to the sample from the first population.
+Otherwise it belongs to the sample from the second population.
+The null hypothesis is that two populations are stochastically equal.
+Also one-sided hypotheses can be tested.
+This test does not assume that the data are normally distributed.
+    )";
+    FunctionDocumentation::Syntax syntax = R"(
+mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_index)
+    )";
+    FunctionDocumentation::Arguments arguments = {
+        {"sample_data", "Sample data.", {"(U)Int*", "Float*", "Decimal*"}},
+        {"sample_index", "Sample index.", {"(U)Int*"}}
+    };
+    FunctionDocumentation::Parameters parameters = {
+        {"alternative", "Optional. Alternative hypothesis. 'two-sided' (default): two populations are not stochastically equal. 'greater': values in the first sample are stochastically greater than those in the second sample. 'less': values in the first sample are stochastically less than those in the second sample.", {"String"}},
+        {"continuity_correction", "Optional. If not 0 then continuity correction in the normal approximation for the p-value is applied. The default value is 1.", {"UInt64"}}
+    };
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns a tuple with two elements: calculated U-statistic and calculated p-value.", {"Tuple(Float64, Float64)"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Mann-Whitney U test example",
+        R"(
+CREATE TABLE mww_ttest (sample_data Float64, sample_index UInt8) ENGINE = Memory;
+INSERT INTO mww_ttest VALUES (10, 0), (11, 0), (12, 0), (1, 1), (2, 1), (3, 1);
+
+SELECT mannWhitneyUTest('greater')(sample_data, sample_index) FROM mww_ttest;
+        )",
+        R"(
+┌─mannWhitneyUTest('greater')(sample_data, sample_index)─┐
+│ (9,0.04042779918503192)                                │
+└────────────────────────────────────────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::IntroducedIn introduced_in = {21, 1};
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
+
+    factory.registerFunction("mannWhitneyUTest", {createAggregateFunctionMannWhitneyUTest, {}, documentation});
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp b/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp
diff --git a/src/AggregateFunctions/AggregateFunctionsMinMax.cpp b/src/AggregateFunctions/AggregateFunctionsMinMax.cpp