Skip to content

Commit 0c359ba

Browse files
authored
Merge pull request ClickHouse#92811 from Blargian/aggregate_functions_5
Docs: move aggregate functions docs to source (#5)
2 parents 9af09f1 + 86ad2c9 commit 0c359ba

7 files changed

Lines changed: 506 additions & 13 deletions

src/AggregateFunctions/AggregateFunctionAny.cpp

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -395,12 +395,103 @@ AggregateFunctionPtr createAggregateFunctionAnyLast(
395395

396396
void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
397397
{
398+
/// any documentation
399+
FunctionDocumentation::Description description = R"(
400+
Selects the first encountered value of a column.
401+
402+
:::warning
403+
As a query can be executed in arbitrary order, the result of this function is non-deterministic. If you need an arbitrary but deterministic result, use functions min or max.
404+
:::
405+
406+
By default, the function never returns NULL, i.e. ignores NULL values in the input column.
407+
However, if the function is used with the `RESPECT NULLS` modifier, it returns the first value read, regardless of whether it is NULL or not.
408+
409+
**Implementation details**
410+
411+
In some cases, you can rely on the order of execution.
412+
This applies to cases when `SELECT` comes from a subquery that uses `ORDER BY`.
413+
414+
When a `SELECT` query has the `GROUP BY` clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the `SELECT`, `HAVING`, and `ORDER BY` clauses be calculated from keys or from aggregate functions.
415+
In other words, each column selected from the table must be used either in keys or inside aggregate functions.
416+
To get behavior like in MySQL, you can put the other columns in the `any` aggregate function.
417+
)";
418+
FunctionDocumentation::Syntax syntax = "any(column) [RESPECT NULLS]";
419+
FunctionDocumentation::Arguments arguments = {
420+
{"column", "The column name.", {"Any"}}
421+
};
422+
FunctionDocumentation::ReturnedValue returned_value = {R"(
423+
Returns the first value encountered.
424+
425+
:::note
426+
The return type of the function is the same as the input, except for LowCardinality which is discarded.
427+
This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column).
428+
You might use the -OrNull combinator to modify this behaviour.
429+
:::
430+
)",
431+
{"Any"}
432+
};
433+
FunctionDocumentation::Examples examples = {
434+
{
435+
"Usage example",
436+
R"(
437+
CREATE TABLE tab (city Nullable(String)) ENGINE=Memory;
438+
INSERT INTO tab (city) VALUES (NULL), ('Amsterdam'), ('New York'), ('Tokyo'), ('Valencia'), (NULL);
439+
SELECT any(city), anyRespectNulls(city) FROM tab;
440+
)",
441+
R"(
442+
┌─any(city)─┬─anyRespectNulls(city)─┐
443+
│ Amsterdam │ ᴺᵁᴸᴸ │
444+
└───────────┴───────────────────────┘
445+
)"
446+
}
447+
};
448+
FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
449+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
450+
FunctionDocumentation documentation = {description, syntax, arguments, {}, returned_value, examples, introduced_in, category};
451+
398452
AggregateFunctionProperties default_properties = {.returns_default_when_only_null = false, .is_order_dependent = true};
399453

400-
factory.registerFunction("any", {createAggregateFunctionAny, default_properties});
454+
factory.registerFunction("any", {createAggregateFunctionAny, default_properties, documentation});
401455
factory.registerAlias("any_value", "any", AggregateFunctionFactory::Case::Insensitive);
402456
factory.registerAlias("first_value", "any", AggregateFunctionFactory::Case::Insensitive);
403-
factory.registerFunction("anyLast", {createAggregateFunctionAnyLast, default_properties});
457+
458+
/// anyLast documentation
459+
FunctionDocumentation::Description anyLast_description = R"(
460+
Selects the last encountered value of a column.
461+
462+
:::warning
463+
As a query can be executed in arbitrary order, the result of this function is non-deterministic.
464+
If you need an arbitrary but deterministic result, use functions min or max.
465+
:::
466+
467+
By default, the function never returns NULL, i.e. ignores NULL values in the input column.
468+
However, if the function is used with the `RESPECT NULLS` modifier, it returns the last value read, regardless of whether it is NULL or not.
469+
)";
470+
FunctionDocumentation::Syntax anyLast_syntax = "anyLast(column) [RESPECT NULLS]";
471+
FunctionDocumentation::Arguments anyLast_arguments = {
472+
{"column", "The column name.", {"Any"}}
473+
};
474+
FunctionDocumentation::ReturnedValue anyLast_returned_value = {"The last value encountered.", {"Any"}};
475+
FunctionDocumentation::Examples anyLast_examples = {
476+
{
477+
"Usage example",
478+
R"(
479+
CREATE TABLE tab (city Nullable(String)) ENGINE=Memory;
480+
INSERT INTO tab (city) VALUES ('Amsterdam'), (NULL), ('New York'), ('Tokyo'), ('Valencia'), (NULL);
481+
SELECT anyLast(city), anyLastRespectNulls(city) FROM tab;
482+
)",
483+
R"(
484+
┌─anyLast(city)─┬─anyLastRespectNulls(city)─┐
485+
│ Valencia │ ᴺᵁᴸᴸ │
486+
└───────────────┴───────────────────────────┘
487+
)"
488+
}
489+
};
490+
FunctionDocumentation::IntroducedIn anyLast_introduced_in = {1, 1};
491+
FunctionDocumentation::Category anyLast_category = FunctionDocumentation::Category::AggregateFunction;
492+
FunctionDocumentation anyLast_documentation = {anyLast_description, anyLast_syntax, anyLast_arguments, {}, anyLast_returned_value, anyLast_examples, anyLast_introduced_in, anyLast_category};
493+
494+
factory.registerFunction("anyLast", {createAggregateFunctionAnyLast, default_properties, anyLast_documentation}, AggregateFunctionFactory::Case::Sensitive);
404495
factory.registerAlias("last_value", "anyLast", AggregateFunctionFactory::Case::Insensitive);
405496
}
406497
}

src/AggregateFunctions/AggregateFunctionFourthMoment.cpp

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,72 @@ template <typename T> using AggregateFunctionFourthMoment = AggregateFunctionVar
99

1010
void registerAggregateFunctionsStatisticsFourthMoment(AggregateFunctionFactory & factory)
1111
{
12-
factory.registerFunction("kurtSamp", createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtSamp>);
13-
factory.registerFunction("kurtPop", createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtPop>);
12+
FunctionDocumentation::Description description_samp = R"(
13+
Computes the [sample kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence.
14+
15+
It represents an unbiased estimate of the kurtosis of a random variable if the passed values form a sample of it.
16+
)";
17+
FunctionDocumentation::Syntax syntax_samp = R"(
18+
kurtSamp(expr)
19+
)";
20+
FunctionDocumentation::Arguments arguments_samp = {
21+
{"expr", "[Expression](/sql-reference/syntax#expressions) returning a number.", {"(U)Int*", "Float*", "Decimal"}}
22+
};
23+
FunctionDocumentation::Parameters parameters = {};
24+
FunctionDocumentation::ReturnedValue returned_value_samp = {"Returns the kurtosis of the given distribution. If `n <= 1` (`n` is the size of the sample), then the function returns `nan`.", {"Float64"}};
25+
FunctionDocumentation::Examples examples_samp = {
26+
{
27+
"Computing sample kurtosis",
28+
R"(
29+
CREATE TABLE test_data (x Float64) ENGINE = Memory;
30+
INSERT INTO test_data VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);
31+
32+
SELECT kurtSamp(x) FROM test_data;
33+
)",
34+
R"(
35+
┌────────kurtSamp(x)─┐
36+
│ 1.4383636363636365 │
37+
└────────────────────┘
38+
)"
39+
}
40+
};
41+
FunctionDocumentation::IntroducedIn introduced_in_samp = {20, 1};
42+
FunctionDocumentation::Category category_samp = FunctionDocumentation::Category::AggregateFunction;
43+
FunctionDocumentation documentation_samp = {description_samp, syntax_samp, arguments_samp, {}, returned_value_samp, examples_samp, introduced_in_samp, category_samp};
44+
45+
factory.registerFunction("kurtSamp", {createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtSamp>, {}, documentation_samp});
46+
47+
FunctionDocumentation::Description description = R"(
48+
Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence.
49+
)";
50+
FunctionDocumentation::Syntax syntax = R"(
51+
kurtPop(expr)
52+
)";
53+
FunctionDocumentation::Arguments arguments = {
54+
{"expr", "[Expression](/sql-reference/syntax#expressions) returning a number.", {"(U)Int*", "Float*", "Decimal"}}
55+
};
56+
FunctionDocumentation::ReturnedValue returned_value = {"Returns the kurtosis of the given distribution.", {"Float64"}};
57+
FunctionDocumentation::Examples examples = {
58+
{
59+
"Computing kurtosis",
60+
R"(
61+
CREATE TABLE test_data (x Float64) ENGINE = Memory;
62+
INSERT INTO test_data VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);
63+
64+
SELECT kurtPop(x) FROM test_data;
65+
)",
66+
R"(
67+
┌─────────kurtPop(x)─┐
68+
│ 1.7757575757575756 │
69+
└────────────────────┘
70+
)"
71+
}
72+
};
73+
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
74+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
75+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
76+
77+
factory.registerFunction("kurtPop", {createAggregateFunctionStatisticsUnary<AggregateFunctionFourthMoment, StatisticsFunctionKind::kurtPop>, {}, documentation});
1478
}
1579

1680
}

src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.cpp

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,77 @@ AggregateFunctionPtr createAggregateFunctionKolmogorovSmirnovTest(
353353

354354
void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory)
355355
{
356-
factory.registerFunction("kolmogorovSmirnovTest", createAggregateFunctionKolmogorovSmirnovTest, AggregateFunctionFactory::Case::Insensitive);
356+
FunctionDocumentation::Description description = R"(
357+
Applies Kolmogorov-Smirnov's test to samples from two populations.
358+
359+
Values of both samples are in the `sample_data` column. If `sample_index` equals 0 then the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population.
360+
Samples must belong to continuous, one-dimensional probability distributions.
361+
)";
362+
FunctionDocumentation::Syntax syntax = R"(
363+
kolmogorovSmirnovTest([alternative, computation_method])(sample_data, sample_index)
364+
)";
365+
FunctionDocumentation::Arguments arguments = {
366+
{"sample_data", "Sample data.", {"(U)Int*", "Float*", "Decimal"}},
367+
{"sample_index", "Sample index.", {"(U)Int*"}}
368+
};
369+
FunctionDocumentation::Parameters parameters = {
370+
{"alternative", "Alternative hypothesis. (Optional, default: 'two-sided'.) Let `F(x)` and `G(x)` be the CDFs of the first and second distributions respectively. 'two-sided': The null hypothesis is that samples come from the same distribution, i.e. `F(x) = G(x)` for all x. And the alternative is that the distributions are not identical. 'greater': The null hypothesis is that values in the first sample are stochastically smaller than those in the second one, i.e. the CDF of the first distribution lies above and hence to the left of that for the second one. Which in fact means that `F(x) >= G(x)` for all x. And the alternative in this case is that `F(x) < G(x)` for at least one x. 'less': The null hypothesis is that values in the first sample are stochastically greater than those in the second one, i.e. the CDF of the first distribution lies below and hence to the right of that for the second one. Which in fact means that `F(x) <= G(x)` for all x. And the alternative in this case is that `F(x) > G(x)` for at least one x.", {"String"}},
371+
{"computation_method", "The method used to compute the p-value. (Optional, default: 'auto'.) 'exact': calculation is performed using the precise probability distribution of the test statistic. Compute-intensive and wasteful except for small samples. 'asymp' ('asymptotic'): calculation is performed using an approximation. For large sample sizes, the exact and asymptotic p-values are very similar. 'auto': the 'exact' method is used when the maximum number of samples is less than 10,000.", {"String"}}
372+
};
373+
FunctionDocumentation::ReturnedValue returned_value = {"Returns a tuple with two elements: a calculated statistic and a calculated p-value.", {"Tuple(Float64, Float64)"}};
374+
FunctionDocumentation::Examples examples = {
375+
{
376+
"Same distribution test",
377+
R"(
378+
SELECT kolmogorovSmirnovTest('less', 'exact')(value, num)
379+
FROM
380+
(
381+
SELECT
382+
randNormal(0, 10) AS value,
383+
0 AS num
384+
FROM numbers(10000)
385+
UNION ALL
386+
SELECT
387+
randNormal(0, 10) AS value,
388+
1 AS num
389+
FROM numbers(10000)
390+
)
391+
)",
392+
R"(
393+
┌─kolmogorovSmirnovTest('less', 'exact')(value, num)─┐
394+
│ (0.009899999999999996,0.37528595205132287) │
395+
└────────────────────────────────────────────────────┘
396+
)"
397+
},
398+
{
399+
"Different distributions test",
400+
R"(
401+
SELECT kolmogorovSmirnovTest('two-sided', 'exact')(value, num)
402+
FROM
403+
(
404+
SELECT
405+
randStudentT(10) AS value,
406+
0 AS num
407+
FROM numbers(100)
408+
UNION ALL
409+
SELECT
410+
randNormal(0, 10) AS value,
411+
1 AS num
412+
FROM numbers(100)
413+
)
414+
)",
415+
R"(
416+
┌─kolmogorovSmirnovTest('two-sided', 'exact')(value, num)─┐
417+
│ (0.4100000000000002,6.61735760482795e-8) │
418+
└─────────────────────────────────────────────────────────┘
419+
)"
420+
}
421+
};
422+
FunctionDocumentation::IntroducedIn introduced_in = {23, 4};
423+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
424+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
425+
426+
factory.registerFunction("kolmogorovSmirnovTest", {createAggregateFunctionKolmogorovSmirnovTest, {}, documentation}, AggregateFunctionFactory::Case::Insensitive);
357427
}
358428

359429
}

src/AggregateFunctions/AggregateFunctionLargestTriangleThreeBuckets.cpp

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,51 @@ createAggregateFunctionLargestTriangleThreeBuckets(const std::string & name, con
357357

358358
void registerAggregateFunctionLargestTriangleThreeBuckets(AggregateFunctionFactory & factory)
359359
{
360-
factory.registerFunction(AggregateFunctionLargestTriangleThreeBuckets::name, createAggregateFunctionLargestTriangleThreeBuckets);
360+
FunctionDocumentation::Description description = R"(
361+
Applies the [Largest-Triangle-Three-Buckets](https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf) algorithm to the input data.
362+
The algorithm is used for downsampling time series data for visualization.
363+
It is designed to operate on series sorted by x coordinate.
364+
It works by dividing the sorted series into buckets and then finding the largest triangle in each bucket.
365+
The number of buckets is equal to the number of points in the resulting series.
366+
The function will sort data by `x` and then apply the downsampling algorithm to the sorted data.
367+
368+
NaNs are ignored in the provided series, meaning that any NaN values will be excluded from the analysis.
369+
This ensures that the function operates only on valid numerical data.
370+
)";
371+
FunctionDocumentation::Syntax syntax = R"(
372+
largestTriangleThreeBuckets(n)(x, y)
373+
)";
374+
FunctionDocumentation::Arguments arguments = {
375+
{"x", "x coordinate.", {"(U)Int*", "Float*", "Decimal", "Date", "Date32", "DateTime", "DateTime64"}},
376+
{"y", "y coordinate.", {"(U)Int*", "Float*", "Decimal", "Date", "Date32", "DateTime", "DateTime64"}}
377+
};
378+
FunctionDocumentation::Parameters parameters = {
379+
{"n", "Number of points in the resulting series.", {"UInt64"}}
380+
};
381+
FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of tuples with two elements.", {"Array(Tuple(Float64, Float64))"}};
382+
FunctionDocumentation::Examples examples = {
383+
{
384+
"Downsampling time series data",
385+
R"(
386+
CREATE TABLE largestTriangleThreeBuckets_test (x Float64, y Float64) ENGINE = Memory;
387+
INSERT INTO largestTriangleThreeBuckets_test VALUES
388+
(1.0, 10.0), (2.0, 20.0), (3.0, 15.0), (8.0, 60.0), (9.0, 55.0),
389+
(10.0, 70.0), (4.0, 30.0), (5.0, 40.0), (6.0, 35.0), (7.0, 50.0);
390+
391+
SELECT largestTriangleThreeBuckets(4)(x, y) FROM largestTriangleThreeBuckets_test;
392+
)",
393+
R"(
394+
┌────────largestTriangleThreeBuckets(4)(x, y)───────────┐
395+
│ [(1,10),(3,15),(9,55),(10,70)] │
396+
└───────────────────────────────────────────────────────┘
397+
)"
398+
}
399+
};
400+
FunctionDocumentation::IntroducedIn introduced_in = {23, 10};
401+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
402+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
403+
404+
factory.registerFunction(AggregateFunctionLargestTriangleThreeBuckets::name, {createAggregateFunctionLargestTriangleThreeBuckets, {}, documentation});
361405
factory.registerAlias("lttb", AggregateFunctionLargestTriangleThreeBuckets::name);
362406
}
363407

src/AggregateFunctions/AggregateFunctionMannWhitney.cpp

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,49 @@ AggregateFunctionPtr createAggregateFunctionMannWhitneyUTest(
274274

275275
void registerAggregateFunctionMannWhitney(AggregateFunctionFactory & factory)
276276
{
277-
factory.registerFunction("mannWhitneyUTest", createAggregateFunctionMannWhitneyUTest);
277+
FunctionDocumentation::Description description = R"(
278+
Applies the Mann-Whitney rank test to samples from two populations.
279+
280+
Values of both samples are in the `sample_data` column.
281+
If `sample_index` equals 0 then the value in that row belongs to the sample from the first population.
282+
Otherwise it belongs to the sample from the second population.
283+
The null hypothesis is that two populations are stochastically equal.
284+
One-sided hypotheses can also be tested.
285+
This test does not assume that the data are normally distributed.
286+
)";
287+
FunctionDocumentation::Syntax syntax = R"(
288+
mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_index)
289+
)";
290+
FunctionDocumentation::Arguments arguments = {
291+
{"sample_data", "Sample data.", {"(U)Int*", "Float*", "Decimal*"}},
292+
{"sample_index", "Sample index.", {"(U)Int*"}}
293+
};
294+
FunctionDocumentation::Parameters parameters = {
295+
{"alternative", "Optional. Alternative hypothesis. 'two-sided' (default): two populations are not stochastically equal. 'greater': values in the first sample are stochastically greater than those in the second sample. 'less': values in the first sample are stochastically less than those in the second sample.", {"String"}},
296+
{"continuity_correction", "Optional. If not 0 then continuity correction in the normal approximation for the p-value is applied. The default value is 1.", {"UInt64"}}
297+
};
298+
FunctionDocumentation::ReturnedValue returned_value = {"Returns a tuple with two elements: calculated U-statistic and calculated p-value.", {"Tuple(Float64, Float64)"}};
299+
FunctionDocumentation::Examples examples = {
300+
{
301+
"Mann-Whitney U test example",
302+
R"(
303+
CREATE TABLE mww_ttest (sample_data Float64, sample_index UInt8) ENGINE = Memory;
304+
INSERT INTO mww_ttest VALUES (10, 0), (11, 0), (12, 0), (1, 1), (2, 1), (3, 1);
305+
306+
SELECT mannWhitneyUTest('greater')(sample_data, sample_index) FROM mww_ttest;
307+
)",
308+
R"(
309+
┌─mannWhitneyUTest('greater')(sample_data, sample_index)─┐
310+
│ (9,0.04042779918503192) │
311+
└────────────────────────────────────────────────────────┘
312+
)"
313+
}
314+
};
315+
FunctionDocumentation::IntroducedIn introduced_in = {21, 1};
316+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
317+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
318+
319+
factory.registerFunction("mannWhitneyUTest", {createAggregateFunctionMannWhitneyUTest, {}, documentation});
278320
}
279321

280322
}

0 commit comments

Comments
 (0)