-
Notifications
You must be signed in to change notification settings - Fork 618
feat(tdigest): implement TDIGEST.TRIMMED_MEAN command #3312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: unstable
Are you sure you want to change the base?
Changes from all commits
6a2198f
7799524
4394482
178b1ee
8b2bb67
cf5451a
3f01bd6
af7b44f
a7f773f
75c60a7
f9ca36d
7d02463
1cf77d3
8a3e9b1
fedf4b8
6afbab9
6fb6a9e
f5b60b0
49cdccb
9b7ac7c
592558d
cfccf80
5960a92
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,11 @@ | |
| constexpr auto kInfoObservations = "Observations"; | ||
| constexpr auto kInfoTotalCompressions = "Total compressions"; | ||
| constexpr auto kNan = "nan"; | ||
|
|
||
| constexpr const char *errParseLowCutQuantile = "error parsing low_cut_percentile"; | ||
| constexpr const char *errParseHighCutQuantile = "error parsing high_cut_percentile"; | ||
| constexpr const char *errCutQuantileRange = "low_cut_percentile and high_cut_percentile should be in [0,1]"; | ||
| constexpr const char *errLowCutQuantileLess = "low_cut_percentile should be lower than high_cut_percentile"; | ||
| } // namespace | ||
|
|
||
| class CommandTDigestCreate : public Commander { | ||
|
|
@@ -492,6 +497,67 @@ | |
| TDigestMergeOptions options_; | ||
| }; | ||
|
|
||
| class CommandTDigestTrimmedMean : public Commander { | ||
| public: | ||
| Status Parse(const std::vector<std::string> &args) override { | ||
|
Check failure on line 502 in src/commands/cmd_tdigest.cc
|
||
| if (args.size() != 4) { | ||
| return {Status::RedisParseErr, errWrongNumOfArguments}; | ||
| } | ||
|
|
||
| key_name_ = args[1]; | ||
|
|
||
| auto low_cut_quantile = ParseFloat(args[2]); | ||
| if (!low_cut_quantile || std::isnan(*low_cut_quantile)) { | ||
| return {Status::RedisParseErr, errParseLowCutQuantile}; | ||
| } | ||
| low_cut_quantile_ = *low_cut_quantile; | ||
|
|
||
| auto high_cut_quantile = ParseFloat(args[3]); | ||
| if (!high_cut_quantile || std::isnan(*high_cut_quantile)) { | ||
| return {Status::RedisParseErr, errParseHighCutQuantile}; | ||
| } | ||
| high_cut_quantile_ = *high_cut_quantile; | ||
|
|
||
| if (!std::isfinite(low_cut_quantile_) || low_cut_quantile_ < 0.0 || low_cut_quantile_ > 1.0) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using a string validation before numeric validation maybe a better way to avoid the unstable comparison of float numbers.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tested Redis's actual behavior and found that it accepts numeric forms like To preserve Redis compatibility, I kept numeric parsing and aligned the validation with Redis. Could you please let me know if this approach would be acceptable? 👀
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for your update. In this case we could keep the float number validation. Maybe we could add a decimal parser in future.😊 |
||
| return {Status::RedisParseErr, errCutQuantileRange}; | ||
| } | ||
| if (!std::isfinite(high_cut_quantile_) || high_cut_quantile_ < 0.0 || high_cut_quantile_ > 1.0) { | ||
| return {Status::RedisParseErr, errCutQuantileRange}; | ||
| } | ||
| if (low_cut_quantile_ >= high_cut_quantile_) { | ||
| return {Status::RedisParseErr, errLowCutQuantileLess}; | ||
| } | ||
|
Comment on lines
+509
to
+529
|
||
|
|
||
| return Status::OK(); | ||
| } | ||
|
|
||
| Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override { | ||
| TDigest tdigest(srv->storage, conn->GetNamespace()); | ||
| TDigestTrimmedMeanResult result; | ||
|
|
||
| auto s = tdigest.TrimmedMean(ctx, key_name_, low_cut_quantile_, high_cut_quantile_, &result); | ||
| if (!s.ok()) { | ||
|
Check warning on line 539 in src/commands/cmd_tdigest.cc
|
||
| if (s.IsNotFound()) { | ||
| return {Status::RedisExecErr, errKeyNotFound}; | ||
| } | ||
| return {Status::RedisExecErr, s.ToString()}; | ||
| } | ||
|
|
||
| if (!result.mean.has_value()) { | ||
| *output = redis::BulkString(kNan); | ||
| } else { | ||
| *output = redis::BulkString(util::Float2String(*result.mean)); | ||
| } | ||
|
|
||
| return Status::OK(); | ||
| } | ||
|
|
||
| private: | ||
| std::string key_name_; | ||
| double low_cut_quantile_; | ||
| double high_cut_quantile_; | ||
| }; | ||
|
|
||
| std::vector<CommandKeyRange> GetMergeKeyRange(const std::vector<std::string> &args) { | ||
| auto numkeys = ParseInt<int>(args[2], 10).ValueOr(0); | ||
| return {{1, 1, 1}, {3, 2 + numkeys, 1}}; | ||
|
|
@@ -507,6 +573,7 @@ | |
| MakeCmdAttr<CommandTDigestByRevRank>("tdigest.byrevrank", -3, "read-only", 1, 1, 1), | ||
| MakeCmdAttr<CommandTDigestByRank>("tdigest.byrank", -3, "read-only", 1, 1, 1), | ||
| MakeCmdAttr<CommandTDigestQuantile>("tdigest.quantile", -3, "read-only", 1, 1, 1), | ||
| MakeCmdAttr<CommandTDigestTrimmedMean>("tdigest.trimmed_mean", 4, "read-only", 1, 1, 1), | ||
| MakeCmdAttr<CommandTDigestReset>("tdigest.reset", 2, "write", 1, 1, 1), | ||
| MakeCmdAttr<CommandTDigestMerge>("tdigest.merge", -4, "write", GetMergeKeyRange)); | ||
| } // namespace redis | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -309,3 +309,46 @@ | |
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| template <typename TD> | ||
| inline StatusOr<double> TDigestTrimmedMean(TD&& td, double low_cut_quantile, double high_cut_quantile) { | ||
|
Check failure on line 314 in src/types/tdigest.h
|
||
| if (td.Size() == 0) { | ||
| return std::numeric_limits<double>::quiet_NaN(); | ||
| } | ||
|
|
||
| const double total_weight = td.TotalWeight(); | ||
| const double leftmost_weight = std::floor(total_weight * low_cut_quantile); | ||
| const double rightmost_weight = std::ceil(total_weight * high_cut_quantile); | ||
|
|
||
| double count_done = 0.0; | ||
| double trimmed_sum = 0.0; | ||
| double trimmed_count = 0.0; | ||
|
|
||
| auto iter = td.Begin(); | ||
| while (iter->Valid()) { | ||
| auto centroid = GET_OR_RET(iter->GetCentroid()); | ||
| const double n_weight = centroid.weight; | ||
| double count_add = n_weight; | ||
|
|
||
| // Keep only the portion of this centroid that overlaps with the trimmed rank range. | ||
| count_add -= std::min(std::max(0.0, leftmost_weight - count_done), count_add); | ||
| count_add = std::min(std::max(0.0, rightmost_weight - count_done), count_add); | ||
|
Comment on lines
+334
to
+335
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we have a comment on this? It may be hard to understand at first glance.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, will add it. |
||
|
|
||
| count_done += n_weight; | ||
|
|
||
| trimmed_sum += centroid.mean * count_add; | ||
| trimmed_count += count_add; | ||
|
|
||
| if (count_done >= rightmost_weight) { | ||
| break; | ||
| } | ||
|
|
||
| iter->Next(); | ||
| } | ||
|
|
||
| if (trimmed_count == 0.0) { | ||
| return std::numeric_limits<double>::quiet_NaN(); | ||
| } | ||
|
|
||
| return trimmed_sum / trimmed_count; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -524,3 +524,81 @@ TEST_F(RedisTDigestTest, ByRank_And_ByRevRank) { | |
| EXPECT_EQ(result[0], 1.0) << "Rank 0 should be minimum"; | ||
| EXPECT_TRUE(std::isinf(result[3])) << "Rank >= total_weight should be infinity"; | ||
| } | ||
|
|
||
| TEST_F(RedisTDigestTest, TrimmedMean) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add cases for invalid arguments and more unordered and complex inputs. |
||
| std::string test_digest_name = "test_digest_trimmed_mean" + std::to_string(util::GetTimeStampMS()); | ||
| bool exists = false; | ||
| auto status = tdigest_->Create(*ctx_, test_digest_name, {100}, &exists); | ||
| ASSERT_FALSE(exists); | ||
| ASSERT_TRUE(status.ok()); | ||
|
|
||
| std::vector<double> values = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; | ||
| status = tdigest_->Add(*ctx_, test_digest_name, values); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
|
|
||
| redis::TDigestTrimmedMeanResult result; | ||
| status = tdigest_->TrimmedMean(*ctx_, test_digest_name, 0.1, 0.9, &result); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
| ASSERT_TRUE(result.mean.has_value()); | ||
| EXPECT_NEAR(*result.mean, 5.5, 0.01); | ||
|
|
||
| status = tdigest_->TrimmedMean(*ctx_, test_digest_name, 0.0, 1.0, &result); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
| ASSERT_TRUE(result.mean.has_value()); | ||
| EXPECT_NEAR(*result.mean, 5.5, 0.01); | ||
|
|
||
| status = tdigest_->TrimmedMean(*ctx_, test_digest_name, 0.25, 0.75, &result); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
| ASSERT_TRUE(result.mean.has_value()); | ||
| EXPECT_NEAR(*result.mean, 5.5, 0.01); | ||
| } | ||
|
|
||
| TEST_F(RedisTDigestTest, TrimmedMeanEmptyDigest) { | ||
| std::string test_digest_name = "test_digest_trimmed_mean_empty" + std::to_string(util::GetTimeStampMS()); | ||
| bool exists = false; | ||
| auto status = tdigest_->Create(*ctx_, test_digest_name, {100}, &exists); | ||
| ASSERT_FALSE(exists); | ||
| ASSERT_TRUE(status.ok()); | ||
|
|
||
| redis::TDigestTrimmedMeanResult result; | ||
| status = tdigest_->TrimmedMean(*ctx_, test_digest_name, 0.1, 0.9, &result); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
| ASSERT_FALSE(result.mean.has_value()); | ||
| } | ||
|
|
||
| TEST_F(RedisTDigestTest, TrimmedMeanUnorderedInput) { | ||
| std::string test_digest_name = "test_digest_trimmed_mean_unordered" + std::to_string(util::GetTimeStampMS()); | ||
| bool exists = false; | ||
| auto status = tdigest_->Create(*ctx_, test_digest_name, {100}, &exists); | ||
| ASSERT_FALSE(exists); | ||
| ASSERT_TRUE(status.ok()); | ||
|
|
||
| std::vector<double> values = {5, 2, 8, 1, 9, 3, 7, 4, 6, 10}; | ||
| status = tdigest_->Add(*ctx_, test_digest_name, values); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
|
|
||
| redis::TDigestTrimmedMeanResult result; | ||
| status = tdigest_->TrimmedMean(*ctx_, test_digest_name, 0.1, 0.9, &result); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
| ASSERT_TRUE(result.mean.has_value()); | ||
| EXPECT_NEAR(*result.mean, 5.5, 0.01); | ||
| } | ||
|
|
||
| TEST_F(RedisTDigestTest, TrimmedMeanComplexInput) { | ||
| std::string test_digest_name = "test_digest_trimmed_mean_complex" + std::to_string(util::GetTimeStampMS()); | ||
| bool exists = false; | ||
| auto status = tdigest_->Create(*ctx_, test_digest_name, {100}, &exists); | ||
| ASSERT_FALSE(exists); | ||
| ASSERT_TRUE(status.ok()); | ||
|
|
||
| std::vector<double> values = {-10, 5, -3, 5, 0, 5, 3, -5, 10, -10}; | ||
| status = tdigest_->Add(*ctx_, test_digest_name, values); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
|
|
||
| redis::TDigestTrimmedMeanResult result; | ||
| status = tdigest_->TrimmedMean(*ctx_, test_digest_name, 0.2, 0.8, &result); | ||
| ASSERT_TRUE(status.ok()) << status.ToString(); | ||
| ASSERT_TRUE(result.mean.has_value()); | ||
| ASSERT_FALSE(std::isnan(*result.mean)); | ||
| EXPECT_NEAR(*result.mean, 5.0 / 6.0, 0.01); | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Check the validation of
high_cut_quantileandlow_cut_quantile.The parameter validation should be done in the earliest step rather than in the command processing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the review! I've moved the validation to Parse stage and removed the duplicate checks in the
TDigestTrimmedMeanmethod.