Merge pull request ClickHouse#92397 from Blargian/aggregate_functions_1

alexey-milovidov · web-flow · commit 35b3b465ee32 · 2025-12-22T19:26:16.000+01:00
Docs: move aggregate functions docs to source (#1)
diff --git a/src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp b/src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp
@@ -184,7 +184,70 @@ createAggregateFunctionAvgWeighted(const std::string & name, const DataTypes & a
 
 void registerAggregateFunctionAvgWeighted(AggregateFunctionFactory & factory)
 {
-    factory.registerFunction("avgWeighted", createAggregateFunctionAvgWeighted);
+    FunctionDocumentation::Description description = R"(
+Calculates the [weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean).
+    )";
+    FunctionDocumentation::Syntax syntax = "avgWeighted(x, weight)";
+    FunctionDocumentation::Arguments arguments = {
+        {"x", "Values.", {"(U)Int*", "Float*"}},
+        {"weight", "Weights of the values.", {"(U)Int*", "Float*"}}
+    };
+    FunctionDocumentation::Parameters parameters = {};
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns `NaN` if all the weights are equal to 0 or no rows are supplied, otherwise returns the weighted mean.", {"Float64"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Usage example",
+        R"(
+SELECT avgWeighted(x, w)
+FROM VALUES('x Int8, w Int8', (4, 1), (1, 0), (10, 2))
+        )",
+        R"(
+┌─avgWeighted(x, w)─┐
+│                 8 │
+└───────────────────┘
+        )"
+    },
+    {
+        "Mixed integer and float weights",
+        R"(
+SELECT avgWeighted(x, w)
+FROM VALUES('x Int8, w Float64', (4, 1), (1, 0), (10, 2))
+        )",
+        R"(
+┌─avgWeighted(x, w)─┐
+│                 8 │
+└───────────────────┘
+        )"
+    },
+    {
+        "All weights are zero returns NaN",
+        R"(
+SELECT avgWeighted(x, w)
+FROM VALUES('x Int8, w Int8', (0, 0), (1, 0), (10, 0))
+        )",
+        R"(
+┌─avgWeighted(x, w)─┐
+│               nan │
+└───────────────────┘
+        )"
+    },
+    {
+        "Empty table returns NaN",
+        R"(
+CREATE TABLE test (t UInt8) ENGINE = Memory;
+SELECT avgWeighted(t, t) FROM test
+        )",
+        R"(
+┌─avgWeighted(t, t)─┐
+│               nan │
+└───────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
+    factory.registerFunction("avgWeighted", {createAggregateFunctionAvgWeighted, AggregateFunctionProperties{}, documentation });
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionBoundingRatio.cpp b/src/AggregateFunctions/AggregateFunctionBoundingRatio.cpp
@@ -202,7 +202,57 @@ AggregateFunctionPtr createAggregateFunctionRate(const std::string & name, const
 
 void registerAggregateFunctionRate(AggregateFunctionFactory & factory)
 {
-    factory.registerFunction("boundingRatio", createAggregateFunctionRate);
+    FunctionDocumentation::Description description = R"(
+Calculates the slope between the leftmost and rightmost points across a group of values.
+    )";
+    FunctionDocumentation::Syntax syntax = "boundingRatio(x, y)";
+    FunctionDocumentation::Arguments arguments = {
+        {"x", "X-coordinate values.", {"(U)Int*", "Float*", "Decimal"}},
+        {"y", "Y-coordinate values.", {"(U)Int*", "Float*", "Decimal"}}
+    };
+    FunctionDocumentation::Parameters parameters = {};
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns the slope of the line between the leftmost and rightmost points, or `NaN` if the data is empty.", {"Float64"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Sample data",
+        R"(
+SELECT
+    number,
+    number * 1.5
+FROM numbers(10)
+        )",
+        R"(
+┌─number─┬─multiply(number, 1.5)─┐
+│      0 │                     0 │
+│      1 │                   1.5 │
+│      2 │                     3 │
+│      3 │                   4.5 │
+│      4 │                     6 │
+│      5 │                   7.5 │
+│      6 │                     9 │
+│      7 │                  10.5 │
+│      8 │                    12 │
+│      9 │                  13.5 │
+└────────┴───────────────────────┘
+        )"
+    },
+    {
+        "Usage example",
+        R"(
+SELECT boundingRatio(number, number * 1.5)
+FROM numbers(10)
+        )",
+        R"(
+┌─boundingRatio(number, multiply(number, 1.5))─┐
+│                                          1.5 │
+└──────────────────────────────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
+    factory.registerFunction("boundingRatio", {createAggregateFunctionRate, AggregateFunctionProperties{}, documentation});
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp b/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp
@@ -180,8 +180,61 @@ AggregateFunctionPtr createAggregateFunctionCategoricalIV(
 
 void registerAggregateFunctionCategoricalIV(AggregateFunctionFactory & factory)
 {
+    FunctionDocumentation::Description description = R"(
+Calculates the information value (IV) for categorical features in relation to a binary target variable.
+
+For each category, the function computes: `(P(tag = 1) - P(tag = 0)) × (log(P(tag = 1)) - log(P(tag = 0)))`
+
+where:
+- P(tag = 1) is the probability that the target equals 1 for the given category
+- P(tag = 0) is the probability that the target equals 0 for the given category
+
+Information Value is a statistic used to measure the strength of a categorical feature's relationship with a binary target variable in predictive modeling.
+Higher absolute values indicate stronger predictive power.
+
+The result indicates how much each discrete (categorical) feature `[category1, category2, ...]` contributes to a learning model which predicts the value of `tag`.
+    )";
+    FunctionDocumentation::Syntax syntax = "categoricalInformationValue(category1[, category2, ...], tag)";
+    FunctionDocumentation::Arguments arguments = {
+        {"category1, category2, ...", "One or more categorical features to analyze. Each category should contain discrete values.", {"UInt8"}},
+        {"tag", "Binary target variable for prediction. Should contain values 0 and 1.", {"UInt8"}}
+    };
+    FunctionDocumentation::Parameters parameters = {};
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of Float64 values representing the information value for each unique combination of categories. Each value indicates the predictive strength of that category combination for the target variable.", {"Array(Float64)"}};
+    FunctionDocumentation::Examples examples =
+    {
+    {
+        "Basic usage analyzing age groups vs mobile usage",
+        R"(
+-- Using the metrica.hits dataset (available on https://sql.clickhouse.com/) to analyze age-mobile relationship
+SELECT categoricalInformationValue(Age < 15, IsMobile)
+FROM metrica.hits;
+        )",
+        R"(
+[0.0014814694805292418]
+        )"
+    },
+    {
+        "Multiple categorical features with user demographics",
+        R"(
+SELECT categoricalInformationValue(
+    Sex,                 -- 0=male, 1=female
+    toUInt8(Age < 25),   -- 0=25+, 1=under 25
+    toUInt8(IsMobile)    -- 0=desktop, 1=mobile
+) AS iv_values
+FROM metrica.hits
+WHERE Sex IN (0, 1);
+        )",
+        R"(
+[0.00018965785460692887,0.004973668839403392]
+        )"
+    }
+    };
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
     AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
-    factory.registerFunction("categoricalInformationValue", { createAggregateFunctionCategoricalIV, properties });
+    factory.registerFunction("categoricalInformationValue", { createAggregateFunctionCategoricalIV, properties, documentation });
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp b/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp
@@ -47,13 +47,54 @@ struct ContingencyData : CrossTabData
 
 void registerAggregateFunctionContingency(AggregateFunctionFactory & factory)
 {
+    FunctionDocumentation::Description description = R"(
+The `contingency` function calculates the [contingency coefficient](https://en.wikipedia.org/wiki/Contingency_table#Cram%C3%A9r's_V_and_the_contingency_coefficient_C), a value that measures the association between two columns in a table.
+The computation is similar to the [`cramersV`](./cramersv.md) function but with a different denominator in the square root.
+    )";
+    FunctionDocumentation::Syntax syntax = "contingency(column1, column2)";
+    FunctionDocumentation::Arguments arguments = {
+        {"column1", "First column to compare.", {"Any"}},
+        {"column2", "Second column to compare.", {"Any"}}
+    };
+    FunctionDocumentation::Parameters docs_parameters = {};
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns a value between 0 and 1. The larger the result, the closer the association of the two columns.", {"Float64"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Comparison with cramersV",
+        R"(
+SELECT
+    cramersV(a, b),
+    contingency(a, b)
+FROM
+(
+    SELECT
+        number % 10 AS a,
+        number % 4 AS b
+    FROM
+        numbers(150)
+)
+        )",
+        R"(
+┌──────cramersV(a, b)─┬───contingency(a, b)─┐
+│ 0.41171788506213564 │ 0.05812725261759165 │
+└─────────────────────┴─────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation::IntroducedIn introduced_in = {22, 1};
+    FunctionDocumentation documentation = {description, syntax, arguments, docs_parameters, returned_value, examples, introduced_in, category};
     factory.registerFunction(ContingencyData::getName(),
+    {
         [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
         {
             assertBinary(name, argument_types);
             assertNoParameters(name, parameters);
             return std::make_shared<AggregateFunctionCrossTab<ContingencyData>>(argument_types);
-        });
+        },
+        {},
+        documentation
+    });
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionCorr.cpp b/src/AggregateFunctions/AggregateFunctionCorr.cpp
@@ -9,7 +9,52 @@ template <typename T1, typename T2> using AggregateFunctionCorr = AggregateFunct
 
 void registerAggregateFunctionsStatisticsCorr(AggregateFunctionFactory & factory)
 {
-    factory.registerFunction("corr", createAggregateFunctionStatisticsBinary<AggregateFunctionCorr, StatisticsFunctionKind::corr>, AggregateFunctionFactory::Case::Insensitive);
+    FunctionDocumentation::Description description = R"(
+Calculates the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient):
+
+$$
+\frac{\Sigma{(x - \bar{x})(y - \bar{y})}}{\sqrt{\Sigma{(x - \bar{x})^2} * \Sigma{(y - \bar{y})^2}}}
+$$
+
+:::note
+This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the [`corrStable`](../reference/corrstable.md) function. It is slower but provides a more accurate result.
+:::
+    )";
+    FunctionDocumentation::Syntax syntax = "corr(x, y)";
+    FunctionDocumentation::Arguments arguments = {
+        {"x", "First variable.", {"(U)Int*", "Float*"}},
+        {"y", "Second variable.", {"(U)Int*", "Float*"}}
+    };
+    FunctionDocumentation::Parameters parameters = {};
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns the Pearson correlation coefficient.", {"Float64"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Basic correlation calculation",
+        R"(
+DROP TABLE IF EXISTS series;
+CREATE TABLE series
+(
+    i UInt32,
+    x_value Float64,
+    y_value Float64
+)
+ENGINE = Memory;
+INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6, -4.4),(2, -9.6, 3),(3, -1.3, -4),(4, 5.3, 9.7),(5, 4.4, 0.037),(6, -8.6, -7.8),(7, 5.1, 9.3),(8, 7.9, -3.6),(9, -8.2, 0.62),(10, -3, 7.3);
+
+SELECT corr(x_value, y_value)
+FROM series
+        )",
+        R"(
+┌─corr(x_value, y_value)─┐
+│     0.1730265755453256 │
+└────────────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
+    factory.registerFunction("corr", {createAggregateFunctionStatisticsBinary<AggregateFunctionCorr, StatisticsFunctionKind::corr>, AggregateFunctionProperties{}, documentation }, AggregateFunctionFactory::Case::Insensitive);
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionCount.cpp b/src/AggregateFunctions/AggregateFunctionCount.cpp
@@ -253,8 +253,73 @@ AggregateFunctionPtr createAggregateFunctionCount(const std::string & name, cons
 
 void registerAggregateFunctionCount(AggregateFunctionFactory & factory)
 {
+    FunctionDocumentation::Description description = R"(
+Counts the number of rows or not-NULL values.
+
+ClickHouse supports the following syntaxes for `count`:
+- `count(expr)` or `COUNT(DISTINCT expr)`.
+- `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific.
+
+**Details**
+
+ClickHouse supports the `COUNT(DISTINCT ...)` syntax.
+The behavior of this construction depends on the [`count_distinct_implementation`](../../../operations/settings/settings.md#count_distinct_implementation) setting.
+It defines which of the [uniq*](/sql-reference/aggregate-functions/reference/uniq) functions is used to perform the operation.
+The default is the [uniqExact](/sql-reference/aggregate-functions/reference/uniqexact) function.
+
+The `SELECT count() FROM table` query is optimized by default using metadata from MergeTree.
+If you need to use row-level security, disable optimization using the [`optimize_trivial_count_query`](/operations/settings/settings#optimize_trivial_count_query) setting.
+
+However, the `SELECT count(nullable_column) FROM table` query can be optimized by enabling the [`optimize_functions_to_subcolumns`](/operations/settings/settings#optimize_functions_to_subcolumns) setting.
+With `optimize_functions_to_subcolumns = 1` the function reads only the [`null`](../../../sql-reference/data-types/nullable.md#finding-null) subcolumn instead of reading and processing the whole column data.
+The query `SELECT count(n) FROM table` transforms to `SELECT sum(NOT n.null) FROM table`.
+
+:::tip Improving COUNT(DISTINCT expr) performance
+If your `COUNT(DISTINCT expr)` query is slow, consider adding a [`GROUP BY`](/sql-reference/statements/select/group-by) clause as this improves parallelization.
+You can also use a [projection](../../../sql-reference/statements/alter/projection.md) to create an index on the target column used with `COUNT(DISTINCT target_col)`.
+:::
+    )";
+    FunctionDocumentation::Syntax syntax = "count([expr])";
+    FunctionDocumentation::Arguments arguments = {
+        {"expr", "Optional. An expression. The function counts how many times this expression returned not null.", {"Expression"}}
+    };
+    FunctionDocumentation::Parameters parameters = {};
+    FunctionDocumentation::ReturnedValue returned_value = {"Returns the row count if the function is called without arguments, otherwise returns a count of how many times the passed expression returned not null.", {"UInt64"}};
+    FunctionDocumentation::Examples examples = {
+    {
+        "Basic row count",
+        R"(
+SELECT count() FROM t
+        )",
+        R"(
+┌─count()─┐
+│       5 │
+└─────────┘
+        )"
+    },
+    {
+        "COUNT(DISTINCT) example",
+        R"(
+-- This example shows that `count(DISTINCT num)` is performed by the `uniqExact` function according to the `count_distinct_implementation` setting value.
+SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation';
+SELECT count(DISTINCT num) FROM t
+        )",
+        R"(
+┌─name──────────────────────────┬─value─────┐
+│ count_distinct_implementation │ uniqExact │
+└───────────────────────────────┴───────────┘
+┌─uniqExact(num)─┐
+│              3 │
+└────────────────┘
+        )"
+    }
+    };
+    FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
+    FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
+    FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
     AggregateFunctionProperties properties = { .returns_default_when_only_null = true, .is_order_dependent = false };
-    factory.registerFunction("count", {createAggregateFunctionCount, properties}, AggregateFunctionFactory::Case::Insensitive);
+
+    factory.registerFunction("count", {createAggregateFunctionCount, properties, documentation}, AggregateFunctionFactory::Case::Insensitive);
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionCovar.cpp b/src/AggregateFunctions/AggregateFunctionCovar.cpp
diff --git a/src/AggregateFunctions/AggregateFunctionStatistics.cpp b/src/AggregateFunctions/AggregateFunctionStatistics.cpp
diff --git a/src/AggregateFunctions/AggregateFunctionVarianceMatrix.cpp b/src/AggregateFunctions/AggregateFunctionVarianceMatrix.cpp