From 1565f0d5a892a7881fb8c648a3c5d4b23379964a Mon Sep 17 00:00:00 2001 From: Anas Khan <83116240+anxkhn@users.noreply.github.com> Date: Sun, 28 Jun 2026 23:22:03 +0530 Subject: [PATCH] Include offending series labels in missing-metric-name push errors When a pushed series has no __name__ label, the error returned to the client only said "no metric name label" / "sample missing metric name" with no indication of which series caused it. Operators had to bisect by stopping Prometheus instances one by one to find the culprit. Enrich both user-facing push paths so the offending series labels are always included: - noMetricNameError now carries the series and renders it via formatLabelSet, matching every sibling validation error in the same file. This covers the validation (400) path used when metric name enforcement is enabled. - The fatal error from tokenForLabels (the shard_by_all_labels=false path that produced the original 500 with no context) is wrapped with the series labels. Fixes #5802 Signed-off-by: Anas Khan <83116240+anxkhn@users.noreply.github.com> --- CHANGELOG.md | 1 + pkg/distributor/distributor.go | 2 +- pkg/distributor/distributor_test.go | 31 +++++++++++++++++++++++++++- pkg/util/validation/errors.go | 12 +++++++---- pkg/util/validation/validate.go | 2 +- pkg/util/validation/validate_test.go | 14 ++++++++++++- 6 files changed, 54 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5afffdfc10b..a58097eaed7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ * [ENHANCEMENT] Ring: Cache `ShuffleShardWithLookback` subrings. The cached entry is invalidated on topology change or once `now` reaches the earliest `RegisteredTimestamp + lookbackPeriod` of any included instance. #7628 * [ENHANCEMENT] Query Frontend: Rename `time_taken` field to `time_taken_ms` and make it return millisecond count. #7649 * [ENHANCEMENT] Update prometheus alertmanager version to v0.33.0. #7647 +* [ENHANCEMENT] Distributor: Include the offending series labels in the "sample missing metric name" and "no metric name label" push errors so operators can identify which series is missing a metric name. #7657 * [BUGFIX] Querier: Fix queryWithRetry and labelsWithRetry returning (nil, nil) on cancelled context by propagating ctx.Err(). #7370 * [BUGFIX] Metrics Helper: Fix non-deterministic bucket order in merged histograms by sorting buckets after map iteration, matching Prometheus client library behavior. #7380 * [BUGFIX] Distributor: Return HTTP 401 Unauthorized when tenant ID resolution fails in the Prometheus Remote Write 2.0 path. #7389 diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index edf9b750b87..6fd382ffb63 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1208,7 +1208,7 @@ func (d *Distributor) prepareSeriesKeys(ctx context.Context, req *cortexpb.Write // label and dropped labels (if any) key, err := d.tokenForLabels(userID, ts.Labels) if err != nil { - return nil, nil, nil, nil, 0, 0, 0, 0, nil, err + return nil, nil, nil, nil, 0, 0, 0, 0, nil, errors.Wrapf(err, "failed to compute sharding token for series %s", cortexpb.FromLabelAdaptersToMetric(ts.Labels).String()) } validatedSeries, validationErr := d.validateSeries(*ts, userID, skipLabelNameValidation, limits) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 2cb0b1522f3..d0ac4f30593 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -2186,7 +2186,36 @@ func TestDistributor_Push_LabelRemoval_RemovingNameLabelWillError(t *testing.T) req := mockWriteRequest([]labels.Labels{tc.inputSeries}, 1, 1, false) _, err = ds[0].Push(ctx, req) require.Error(t, err) - assert.Equal(t, "rpc error: code = Code(400) desc = sample missing metric name", err.Error()) + assert.Equal(t, `rpc error: code = Code(400) desc = sample missing metric name metric: "{cluster=\"one\"}"`, err.Error()) +} + +// TestDistributor_Push_NoMetricNameShardingErrorReportsSeries covers the path that +// the original bug report hit (issue #5802): with metric name enforcement disabled +// and shard_by_all_labels=false, a series without a metric name fails while computing +// its sharding token. The resulting error must name the offending series so operators +// can identify it instead of only seeing "no metric name label". +func TestDistributor_Push_NoMetricNameShardingErrorReportsSeries(t *testing.T) { + t.Parallel() + ctx := user.InjectOrgID(context.Background(), "user") + + var limits validation.Limits + flagext.DefaultValues(&limits) + limits.EnforceMetricName = false + + ds, _, _, _ := prepare(t, prepConfig{ + numIngesters: 2, + happyIngesters: 2, + numDistributors: 1, + shardByAllLabels: false, + limits: &limits, + }) + + inputSeries := labels.FromStrings("foo", "bar", "team", "a") + req := mockWriteRequest([]labels.Labels{inputSeries}, 1, 1, false) + _, err := ds[0].Push(ctx, req) + require.Error(t, err) + assert.Contains(t, err.Error(), "no metric name label") + assert.Contains(t, err.Error(), `{foo="bar", team="a"}`) } func TestDistributor_Push_ShouldGuaranteeShardingTokenConsistencyOverTheTime(t *testing.T) { diff --git a/pkg/util/validation/errors.go b/pkg/util/validation/errors.go index 2efc077655a..af47cc6a4ec 100644 --- a/pkg/util/validation/errors.go +++ b/pkg/util/validation/errors.go @@ -134,14 +134,18 @@ func (e *tooManyLabelsError) Error() string { len(e.series), e.limit, cortexpb.FromLabelAdaptersToMetric(e.series).String()) } -type noMetricNameError struct{} +type noMetricNameError struct { + series []cortexpb.LabelAdapter +} -func newNoMetricNameError() ValidationError { - return &noMetricNameError{} +func newNoMetricNameError(series []cortexpb.LabelAdapter) ValidationError { + return &noMetricNameError{ + series: series, + } } func (e *noMetricNameError) Error() string { - return "sample missing metric name" + return fmt.Sprintf("sample missing metric name metric: %.200q", formatLabelSet(e.series)) } type invalidMetricNameError struct { diff --git a/pkg/util/validation/validate.go b/pkg/util/validation/validate.go index a2f7bc395df..73b89d24992 100644 --- a/pkg/util/validation/validate.go +++ b/pkg/util/validation/validate.go @@ -285,7 +285,7 @@ func ValidateMetricName(limits *Limits, ls []cortexpb.LabelAdapter, nameValidati } unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { - return newNoMetricNameError(), missingMetricName + return newNoMetricNameError(ls), missingMetricName } if !nameValidationScheme.IsValidMetricName(unsafeMetricName) { return newInvalidMetricNameError(unsafeMetricName), invalidMetricName diff --git a/pkg/util/validation/validate_test.go b/pkg/util/validation/validate_test.go index 6832ee61f0e..9aa1657bee4 100644 --- a/pkg/util/validation/validate_test.go +++ b/pkg/util/validation/validate_test.go @@ -81,7 +81,8 @@ func TestValidateMetricName(t *testing.T) { expectErr error reason string }{ - {map[model.LabelName]model.LabelValue{}, newNoMetricNameError(), missingMetricName}, + {map[model.LabelName]model.LabelValue{}, newNoMetricNameError(cortexpb.FromMetricsToLabelAdapters(model.Metric{})), missingMetricName}, + {map[model.LabelName]model.LabelValue{"foo": "bar"}, newNoMetricNameError(cortexpb.FromMetricsToLabelAdapters(model.Metric{"foo": "bar"})), missingMetricName}, {map[model.LabelName]model.LabelValue{model.MetricNameLabel: ""}, newInvalidMetricNameError(""), invalidMetricName}, {map[model.LabelName]model.LabelValue{model.MetricNameLabel: " "}, newInvalidMetricNameError(" "), invalidMetricName}, {map[model.LabelName]model.LabelValue{model.MetricNameLabel: "test.\xc5.metric"}, newInvalidMetricNameError("test.\xc5.metric"), invalidMetricName}, @@ -102,6 +103,17 @@ func TestValidateMetricName(t *testing.T) { assert.Empty(t, reason) } +func TestNoMetricNameError_ReportsOffendingSeries(t *testing.T) { + cfg := new(Limits) + cfg.EnforceMetricName = true + + err, reason := ValidateMetricName(cfg, cortexpb.FromMetricsToLabelAdapters(model.Metric{"foo": "bar"}), model.LegacyValidation) + require.Error(t, err) + assert.Equal(t, missingMetricName, reason) + // The error must surface the offending series so operators can identify it (see issue #5802). + assert.Equal(t, `sample missing metric name metric: "{foo=\"bar\"}"`, err.Error()) +} + func TestValidateLabels(t *testing.T) { cfg := new(Limits) userID := "testUser"