Skip to content

Commit 35b3b46

Browse files
Merge pull request ClickHouse#92397 from Blargian/aggregate_functions_1
Docs: move aggregate functions docs to source (#1)
2 parents ef81cdc + 8071a20 commit 35b3b46

9 files changed

Lines changed: 501 additions & 14 deletions

src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp

Lines changed: 64 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -184,7 +184,70 @@ createAggregateFunctionAvgWeighted(const std::string & name, const DataTypes & a
184184

185185
/// Registers the `avgWeighted` aggregate function in `factory` together with its in-source documentation
/// (description, syntax, arguments, parameters, returned value, examples, version introduced and category).
/// NOTE(review): this text is a rendered diff; the bare `NNN+` / `NNN-` / `NNNNNN` lines look like diff
/// gutter markers rather than C++ — confirm against the repository file.
void registerAggregateFunctionAvgWeighted(AggregateFunctionFactory & factory)
186186
{
/// NOTE(review): the call under the `187-` marker appears to be the pre-change registration that the
/// call under `250+` replaces — confirm; both appear here only because the diff shows both sides.
187-
factory.registerFunction("avgWeighted", createAggregateFunctionAvgWeighted);
187+
/// The fields below are combined into one `FunctionDocumentation` object near the end of the function.
FunctionDocumentation::Description description = R"(
188+
Calculates the [weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean).
189+
)";
190+
FunctionDocumentation::Syntax syntax = "avgWeighted(x, weight)";
191+
/// Each entry is {argument name, description, list of accepted type names}.
FunctionDocumentation::Arguments arguments = {
192+
{"x", "Values.", {"(U)Int*", "Float*"}},
193+
{"weight", "Weights of the values.", {"(U)Int*", "Float*"}}
194+
};
195+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
196+
FunctionDocumentation::ReturnedValue returned_value = {"Returns `NaN` if all the weights are equal to 0 or the supplied weights parameter is empty, or the weighted mean otherwise.", {"Float64"}};
197+
/// Each example is {title, query text, expected output text}.
FunctionDocumentation::Examples examples = {
198+
{
199+
"Usage example",
200+
R"(
201+
SELECT avgWeighted(x, w)
202+
FROM VALUES('x Int8, w Int8', (4, 1), (1, 0), (10, 2))
203+
)",
204+
R"(
205+
┌─avgWeighted(x, w)─┐
206+
│ 8 │
207+
└───────────────────┘
208+
)"
209+
},
210+
{
211+
"Mixed integer and float weights",
212+
R"(
213+
SELECT avgWeighted(x, w)
214+
FROM VALUES('x Int8, w Float64', (4, 1), (1, 0), (10, 2))
215+
)",
216+
R"(
217+
┌─avgWeighted(x, w)─┐
218+
│ 8 │
219+
└───────────────────┘
220+
)"
221+
},
222+
{
223+
"All weights are zero returns NaN",
224+
R"(
225+
SELECT avgWeighted(x, w)
226+
FROM VALUES('x Int8, w Int8', (0, 0), (1, 0), (10, 0))
227+
)",
228+
R"(
229+
┌─avgWeighted(x, w)─┐
230+
│ nan │
231+
└───────────────────┘
232+
)"
233+
},
234+
{
235+
"Empty table returns NaN",
236+
R"(
237+
CREATE TABLE test (t UInt8) ENGINE = Memory;
238+
SELECT avgWeighted(t, t) FROM test
239+
)",
240+
R"(
241+
┌─avgWeighted(t, t)─┐
242+
│ nan │
243+
└───────────────────┘
244+
)"
245+
}
246+
};
247+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
248+
/// Presumably the (major, minor) release in which the function first appeared — TODO confirm.
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
249+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
250+
/// Registers the creator with default-constructed properties and the documentation built above.
factory.registerFunction("avgWeighted", {createAggregateFunctionAvgWeighted, AggregateFunctionProperties{}, documentation });
188251
}
189252

190253
}

src/AggregateFunctions/AggregateFunctionBoundingRatio.cpp

Lines changed: 51 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,57 @@ AggregateFunctionPtr createAggregateFunctionRate(const std::string & name, const
202202

203203
/// Registers the `boundingRatio` aggregate function in `factory` together with its in-source documentation.
/// NOTE(review): this text is a rendered diff; the bare `NNN+` / `NNN-` / `NNNNNN` lines look like diff
/// gutter markers rather than C++ — confirm against the repository file.
void registerAggregateFunctionRate(AggregateFunctionFactory & factory)
204204
{
/// NOTE(review): the call under the `205-` marker appears to be the pre-change registration that the
/// call under `255+` replaces — confirm; both appear here only because the diff shows both sides.
205-
factory.registerFunction("boundingRatio", createAggregateFunctionRate);
205+
/// The fields below are combined into one `FunctionDocumentation` object near the end of the function.
FunctionDocumentation::Description description = R"(
206+
Calculates the slope between the leftmost and rightmost points across a group of values.
207+
)";
208+
FunctionDocumentation::Syntax syntax = "boundingRatio(x, y)";
209+
/// Each entry is {argument name, description, list of accepted type names}.
FunctionDocumentation::Arguments arguments = {
210+
{"x", "X-coordinate values.", {"(U)Int*", "Float*", "Decimal"}},
211+
{"y", "Y-coordinate values.", {"(U)Int*", "Float*", "Decimal"}}
212+
};
213+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
214+
FunctionDocumentation::ReturnedValue returned_value = {"Returns the slope of the line between the leftmost and rightmost points, otherwise returns `NaN` if the data is empty.", {"Float64"}};
215+
/// Two examples: the first only displays the sample data, the second applies `boundingRatio` to it.
FunctionDocumentation::Examples examples = {
216+
{
217+
"Sample data",
218+
R"(
219+
SELECT
220+
number,
221+
number * 1.5
222+
FROM numbers(10)
223+
)",
224+
R"(
225+
┌─number─┬─multiply(number, 1.5)─┐
226+
│ 0 │ 0 │
227+
│ 1 │ 1.5 │
228+
│ 2 │ 3 │
229+
│ 3 │ 4.5 │
230+
│ 4 │ 6 │
231+
│ 5 │ 7.5 │
232+
│ 6 │ 9 │
233+
│ 7 │ 10.5 │
234+
│ 8 │ 12 │
235+
│ 9 │ 13.5 │
236+
└────────┴───────────────────────┘
237+
)"
238+
},
239+
{
240+
"Usage example",
241+
R"(
242+
SELECT boundingRatio(number, number * 1.5)
243+
FROM numbers(10)
244+
)",
245+
R"(
246+
┌─boundingRatio(number, multiply(number, 1.5))─┐
247+
│ 1.5 │
248+
└──────────────────────────────────────────────┘
249+
)"
250+
}
251+
};
252+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
253+
/// Presumably the (major, minor) release in which the function first appeared — TODO confirm.
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
254+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
255+
/// Registers the creator with default-constructed properties and the documentation built above.
factory.registerFunction("boundingRatio", {createAggregateFunctionRate, AggregateFunctionProperties{}, documentation});
206256
}
207257

208258
}

src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp

Lines changed: 54 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,8 +180,61 @@ AggregateFunctionPtr createAggregateFunctionCategoricalIV(
180180

181181
/// Registers the `categoricalInformationValue` aggregate function in `factory` together with its
/// in-source documentation and with `returns_default_when_only_null` enabled in its properties.
/// NOTE(review): this text is a rendered diff; the bare `NNN+` / `NNN-` / `NNNNNN` lines look like diff
/// gutter markers rather than C++ — confirm against the repository file.
void registerAggregateFunctionCategoricalIV(AggregateFunctionFactory & factory)
182182
{
183+
/// The fields below are combined into one `FunctionDocumentation` object near the end of the function.
FunctionDocumentation::Description description = R"(
184+
Calculates the information value (IV) for categorical features in relation to a binary target variable.
185+
186+
For each category, the function computes: `(P(tag = 1) - P(tag = 0)) × (log(P(tag = 1)) - log(P(tag = 0)))`
187+
188+
where:
189+
- P(tag = 1) is the probability that the target equals 1 for the given category
190+
- P(tag = 0) is the probability that the target equals 0 for the given category
191+
192+
Information Value is a statistic used to measure the strength of a categorical feature's relationship with a binary target variable in predictive modeling.
193+
Higher absolute values indicate stronger predictive power.
194+
195+
The result indicates how much each discrete (categorical) feature `[category1, category2, ...]` contributes to a learning model which predicts the value of `tag`.
196+
)";
197+
/// The optional group closes before the comma so that the one-category form reads `(category1, tag)`.
FunctionDocumentation::Syntax syntax = "categoricalInformationValue(category1[, category2, ...], tag)";
198+
/// Each entry is {argument name, description, list of accepted type names}.
FunctionDocumentation::Arguments arguments = {
199+
{"category1, category2, ...", "One or more categorical features to analyze. Each category should contain discrete values.", {"UInt8"}},
200+
{"tag", "Binary target variable for prediction. Should contain values 0 and 1.", {"UInt8"}}
201+
};
202+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
203+
/// Worded to match the examples below: one category yields one value, two categories yield two.
FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of Float64 values, one per category argument, in argument order. Each value is the information value of that category with respect to the target variable.", {"Array(Float64)"}};
204+
/// Each example is {title, query text, expected output text}.
FunctionDocumentation::Examples examples =
205+
{
206+
{
207+
"Basic usage analyzing age groups vs mobile usage",
208+
R"(
209+
-- Using the metrica.hits dataset (available on https://sql.clickhouse.com/) to analyze age-mobile relationship
210+
SELECT categoricalInformationValue(Age < 15, IsMobile)
211+
FROM metrica.hits;
212+
)",
213+
R"(
214+
[0.0014814694805292418]
215+
)"
216+
},
217+
{
218+
"Multiple categorical features with user demographics",
219+
R"(
220+
SELECT categoricalInformationValue(
221+
Sex, -- 0=male, 1=female
222+
toUInt8(Age < 25), -- 0=25+, 1=under 25
223+
toUInt8(IsMobile) -- 0=desktop, 1=mobile
224+
) AS iv_values
225+
FROM metrica.hits
226+
WHERE Sex IN (0, 1);
227+
)",
228+
R"(
229+
[0.00018965785460692887,0.004973668839403392]
230+
)"
231+
}
232+
};
233+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
234+
/// Presumably the (major, minor) release in which the function first appeared — TODO confirm.
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
235+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
183236
AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
/// NOTE(review): the call under the `184-` marker appears to be the pre-change registration that the
/// call under `237+` replaces — confirm; both appear here only because the diff shows both sides.
184-
factory.registerFunction("categoricalInformationValue", { createAggregateFunctionCategoricalIV, properties });
237+
factory.registerFunction("categoricalInformationValue", { createAggregateFunctionCategoricalIV, properties, documentation });
185238
}
186239

187240
}

src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp

Lines changed: 42 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,13 +47,54 @@ struct ContingencyData : CrossTabData
4747

4848
/// Registers the aggregate function named by `ContingencyData::getName()` in `factory`, together with
/// its in-source documentation.
/// NOTE(review): this text is a rendered diff; the bare `NN+` / `NN-` / `NNNN` lines look like diff
/// gutter markers rather than C++ — confirm against the repository file.
void registerAggregateFunctionContingency(AggregateFunctionFactory & factory)
4949
{
50+
/// The fields below are combined into one `FunctionDocumentation` object before the registration call.
FunctionDocumentation::Description description = R"(
51+
The `contingency` function calculates the [contingency coefficient](https://en.wikipedia.org/wiki/Contingency_table#Cram%C3%A9r's_V_and_the_contingency_coefficient_C), a value that measures the association between two columns in a table.
52+
The computation is similar to the [`cramersV`](./cramersv.md) function but with a different denominator in the square root.
53+
)";
54+
FunctionDocumentation::Syntax syntax = "contingency(column1, column2)";
55+
/// Each entry is {argument name, description, list of accepted type names}.
FunctionDocumentation::Arguments arguments = {
56+
{"column1", "First column to compare.", {"Any"}},
57+
{"column2", "Second column to compare.", {"Any"}}
58+
};
59+
/// Named `docs_parameters`, presumably to keep it distinct from the creator lambda's own `parameters`
/// argument further down — TODO confirm intent.
FunctionDocumentation::Parameters docs_parameters = {};
60+
FunctionDocumentation::ReturnedValue returned_value = {"Returns a value between 0 and 1. The larger the result, the closer the association of the two columns.", {"Float64"}};
61+
/// Each example is {title, query text, expected output text}.
FunctionDocumentation::Examples examples = {
62+
{
63+
"Comparison with cramersV",
64+
R"(
65+
SELECT
66+
cramersV(a, b),
67+
contingency(a, b)
68+
FROM
69+
(
70+
SELECT
71+
number % 10 AS a,
72+
number % 4 AS b
73+
FROM
74+
numbers(150)
75+
)
76+
)",
77+
R"(
78+
┌──────cramersV(a, b)─┬───contingency(a, b)─┐
79+
│ 0.41171788506213564 │ 0.05812725261759165 │
80+
└─────────────────────┴─────────────────────┘
81+
)"
82+
}
83+
};
84+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
85+
/// Presumably the (major, minor) release in which the function first appeared — TODO confirm.
FunctionDocumentation::IntroducedIn introduced_in = {22, 1};
86+
FunctionDocumentation documentation = {description, syntax, arguments, docs_parameters, returned_value, examples, introduced_in, category};
5087
/// The registration bundles three items: the creator lambda, an empty `{}` for the second slot
/// (presumably default function properties — TODO confirm), and the documentation.
factory.registerFunction(ContingencyData::getName(),
88+
{
5189
/// Creator: requires exactly two arguments and no parameters, then builds the cross-tab aggregate.
[](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
5290
{
5391
assertBinary(name, argument_types);
5492
assertNoParameters(name, parameters);
5593
return std::make_shared<AggregateFunctionCrossTab<ContingencyData>>(argument_types);
/// NOTE(review): the `});` under the `56-` marker appears to be the pre-change closing of the call,
/// replaced by the lines under `94+` to `97+` — confirm.
56-
});
94+
},
95+
{},
96+
documentation
97+
});
5798
}
5899

59100
}

src/AggregateFunctions/AggregateFunctionCorr.cpp

Lines changed: 46 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,52 @@ template <typename T1, typename T2> using AggregateFunctionCorr = AggregateFunct
99

1010
/// Registers the `corr` aggregate function in `factory` with case-insensitive name matching, together
/// with its in-source documentation.
/// NOTE(review): this text is a rendered diff; the bare `NN+` / `NN-` / `NNNN` lines look like diff
/// gutter markers rather than C++ — confirm against the repository file.
void registerAggregateFunctionsStatisticsCorr(AggregateFunctionFactory & factory)
1111
{
/// NOTE(review): the call under the `12-` marker appears to be the pre-change registration that the
/// call under `57+` replaces — confirm; both appear here only because the diff shows both sides.
12-
factory.registerFunction("corr", createAggregateFunctionStatisticsBinary<AggregateFunctionCorr, StatisticsFunctionKind::corr>, AggregateFunctionFactory::Case::Insensitive);
12+
/// The fields below are combined into one `FunctionDocumentation` object near the end of the function.
FunctionDocumentation::Description description = R"(
13+
Calculates the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient):
14+
15+
$$
16+
\frac{\Sigma{(x - \bar{x})(y - \bar{y})}}{\sqrt{\Sigma{(x - \bar{x})^2} * \Sigma{(y - \bar{y})^2}}}
17+
$$
18+
19+
:::note
20+
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the [`corrStable`](../reference/corrstable.md) function. It is slower but provides a more accurate result.
21+
:::
22+
)";
23+
FunctionDocumentation::Syntax syntax = "corr(x, y)";
24+
/// Each entry is {argument name, description, list of accepted type names}.
FunctionDocumentation::Arguments arguments = {
25+
{"x", "First variable.", {"(U)Int*", "Float*"}},
26+
{"y", "Second variable.", {"(U)Int*", "Float*"}}
27+
};
28+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
29+
FunctionDocumentation::ReturnedValue returned_value = {"Returns the Pearson correlation coefficient.", {"Float64"}};
30+
/// Each example is {title, query text, expected output text}.
FunctionDocumentation::Examples examples = {
31+
{
32+
"Basic correlation calculation",
33+
R"(
34+
DROP TABLE IF EXISTS series;
35+
CREATE TABLE series
36+
(
37+
i UInt32,
38+
x_value Float64,
39+
y_value Float64
40+
)
41+
ENGINE = Memory;
42+
INSERT INTO series(i, x_value, y_value) VALUES (1, 5.6, -4.4),(2, -9.6, 3),(3, -1.3, -4),(4, 5.3, 9.7),(5, 4.4, 0.037),(6, -8.6, -7.8),(7, 5.1, 9.3),(8, 7.9, -3.6),(9, -8.2, 0.62),(10, -3, 7.3);
43+
44+
SELECT corr(x_value, y_value)
45+
FROM series
46+
)",
47+
R"(
48+
┌─corr(x_value, y_value)─┐
49+
│ 0.1730265755453256 │
50+
└────────────────────────┘
51+
)"
52+
}
53+
};
54+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
55+
/// Presumably the (major, minor) release in which the function first appeared — TODO confirm.
FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
56+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
57+
/// Registers the creator with default-constructed properties and the documentation built above.
factory.registerFunction("corr", {createAggregateFunctionStatisticsBinary<AggregateFunctionCorr, StatisticsFunctionKind::corr>, AggregateFunctionProperties{}, documentation }, AggregateFunctionFactory::Case::Insensitive);
1358
}
1459

1560
}

src/AggregateFunctions/AggregateFunctionCount.cpp

Lines changed: 66 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -253,8 +253,73 @@ AggregateFunctionPtr createAggregateFunctionCount(const std::string & name, cons
253253

254254
/// Registers the `count` aggregate function in `factory` with case-insensitive name matching, together
/// with its in-source documentation and with properties `returns_default_when_only_null = true` and
/// `is_order_dependent = false`.
/// NOTE(review): this text is a rendered diff; the bare `NNN+` / `NNN-` / `NNNNNN` lines look like diff
/// gutter markers rather than C++ — confirm against the repository file.
void registerAggregateFunctionCount(AggregateFunctionFactory & factory)
255255
{
256+
/// The fields below are combined into one `FunctionDocumentation` object near the end of the function.
FunctionDocumentation::Description description = R"(
257+
Counts the number of rows or not-NULL values.
258+
259+
ClickHouse supports the following syntaxes for `count`:
260+
- `count(expr)` or `COUNT(DISTINCT expr)`.
261+
- `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific.
262+
263+
**Details**
264+
265+
ClickHouse supports the `COUNT(DISTINCT ...)` syntax.
266+
The behavior of this construction depends on the [`count_distinct_implementation`](../../../operations/settings/settings.md#count_distinct_implementation) setting.
267+
It defines which of the [uniq*](/sql-reference/aggregate-functions/reference/uniq) functions is used to perform the operation.
268+
The default is the [uniqExact](/sql-reference/aggregate-functions/reference/uniqexact) function.
269+
270+
The `SELECT count() FROM table` query is optimized by default using metadata from MergeTree.
271+
If you need to use row-level security, disable optimization using the [`optimize_trivial_count_query`](/operations/settings/settings#optimize_trivial_count_query) setting.
272+
273+
However `SELECT count(nullable_column) FROM table` query can be optimized by enabling the [`optimize_functions_to_subcolumns`](/operations/settings/settings#optimize_functions_to_subcolumns) setting.
274+
With `optimize_functions_to_subcolumns = 1` the function reads only [`null`](../../../sql-reference/data-types/nullable.md#finding-null) subcolumn instead of reading and processing the whole column data.
275+
The query `SELECT count(n) FROM table` transforms to `SELECT sum(NOT n.null) FROM table`.
276+
277+
:::tip Improving COUNT(DISTINCT expr) performance
278+
If your `COUNT(DISTINCT expr)` query is slow, consider adding a [`GROUP BY`](/sql-reference/statements/select/group-by) clause as this improves parallelization.
279+
You can also use a [projection](../../../sql-reference/statements/alter/projection.md) to create an index on the target column used with `COUNT(DISTINCT target_col)`.
280+
:::
281+
)";
282+
FunctionDocumentation::Syntax syntax = "count([expr])";
283+
/// Each entry is {argument name, description, list of accepted type names}.
FunctionDocumentation::Arguments arguments = {
284+
{"expr", "Optional. An expression. The function counts how many times this expression returned not null.", {"Expression"}}
285+
};
286+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
287+
/// Says "arguments" because `expr` is listed under `arguments` above, while `parameters` is empty.
FunctionDocumentation::ReturnedValue returned_value = {"Returns the row count if the function is called without arguments, otherwise returns a count of how many times the passed expression returned not null.", {"UInt64"}};
288+
/// Each example is {title, query text, expected output text}.
FunctionDocumentation::Examples examples = {
289+
{
290+
"Basic row count",
291+
R"(
292+
SELECT count() FROM t
293+
)",
294+
R"(
295+
┌─count()─┐
296+
│ 5 │
297+
└─────────┘
298+
)"
299+
},
300+
{
301+
"COUNT(DISTINCT) example",
302+
R"(
303+
-- This example shows that `count(DISTINCT num)` is performed by the `uniqExact` function according to the `count_distinct_implementation` setting value.
304+
SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation';
305+
SELECT count(DISTINCT num) FROM t
306+
)",
307+
R"(
308+
┌─name──────────────────────────┬─value─────┐
309+
│ count_distinct_implementation │ uniqExact │
310+
└───────────────────────────────┴───────────┘
311+
┌─uniqExact(num)─┐
312+
│ 3 │
313+
└────────────────┘
314+
)"
315+
}
316+
};
317+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
318+
/// Presumably the (major, minor) release in which the function first appeared — TODO confirm.
FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
319+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
256320
AggregateFunctionProperties properties = { .returns_default_when_only_null = true, .is_order_dependent = false };
/// NOTE(review): the call under the `257-` marker appears to be the pre-change registration that the
/// call under `322+` replaces — confirm; both appear here only because the diff shows both sides.
257-
factory.registerFunction("count", {createAggregateFunctionCount, properties}, AggregateFunctionFactory::Case::Insensitive);
321+
322+
factory.registerFunction("count", {createAggregateFunctionCount, properties, documentation}, AggregateFunctionFactory::Case::Insensitive);
258323
}
259324

260325
}

0 commit comments

Comments
 (0)