From 6017e00d5911653b4cda70ce71bdc874976747cb Mon Sep 17 00:00:00 2001 From: Pierre Gerbelot Date: Tue, 3 Feb 2026 15:41:21 +0100 Subject: [PATCH 1/4] feat(obs): add support for envoy --- .../lib/domains-observability-data-access.ts | 35 +++ .../alerting-creation-flow.tsx | 39 ++- .../summary-step/alert-queries.ts | 84 +++++++ .../summary-step/summary-step.tsx | 30 ++- .../use-http-route-name.ts | 20 ++ .../card-http-errors.spec.tsx | 146 ++++++++++- .../card-http-errors/card-http-errors.tsx | 51 +++- .../card-percentile-99/card-percentile-99.tsx | 27 +- .../instance-http-errors-chart.tsx | 85 ++++++- .../network-request-duration-chart.tsx | 235 +++++++++++++++--- .../network-request-size-chart.tsx | 171 +++++++++++-- .../network-request-status-chart.tsx | 156 +++++++++--- .../service-dashboard/service-dashboard.tsx | 20 +- 13 files changed, 981 insertions(+), 118 deletions(-) create mode 100644 libs/domains/observability/feature/src/lib/hooks/use-http-route-name/use-http-route-name.ts diff --git a/libs/domains/observability/data-access/src/lib/domains-observability-data-access.ts b/libs/domains/observability/data-access/src/lib/domains-observability-data-access.ts index ef80bafcc73..6f6cf6de864 100644 --- a/libs/domains/observability/data-access/src/lib/domains-observability-data-access.ts +++ b/libs/domains/observability/data-access/src/lib/domains-observability-data-access.ts @@ -91,6 +91,41 @@ export const observability = createQueryKeys('observability', { return response.data.metrics && (JSON.parse(response.data.metrics).data[0] as string) }, }), + httpRouteName: ({ + clusterId, + serviceId, + startDate, + endDate, + }: { + clusterId: string + serviceId: string + startDate: string + endDate: string + }) => ({ + queryKey: ['httpPortName', clusterId, serviceId], + async queryFn() { + const endpoint = `api/v1/label/httproute_name/values?match[]=kube_httproute_labels{qovery_com_associated_service_id="${serviceId}"}` + const response = await clusterApi.getClusterMetrics( + clusterId, + endpoint, + endpoint, + '', + startDate, + endDate, + undefined, + undefined, + undefined, + 'True', + 'True', + undefined, + 'prometheus', + 'false', + 'service_overview', + 'httpRouteName' + ) + return response.data.metrics && (JSON.parse(response.data.metrics).data[0] as string) + }, + }), hpaName: ({ clusterId, serviceId, diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/alerting-creation-flow.tsx b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/alerting-creation-flow.tsx index 51a3a96e809..629350418cc 100644 --- a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/alerting-creation-flow.tsx +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/alerting-creation-flow.tsx @@ -8,6 +8,7 @@ import { ErrorBoundary, FunnelFlow } from '@qovery/shared/ui' import { useContainerName } from '../../hooks/use-container-name/use-container-name' import { useCreateAlertRule } from '../../hooks/use-create-alert-rule/use-create-alert-rule' import { useHpaName } from '../../hooks/use-hpa-name/use-hpa-name' +import { useHttpRouteName } from '../../hooks/use-http-route-name/use-http-route-name' import { useIngressName } from '../../hooks/use-ingress-name/use-ingress-name' import { generateConditionDescription } from '../../util-alerting/generate-condition-description' import { type AlertConfiguration, type MetricCategory } from './alerting-creation-flow.types' @@ -16,7 +17,9 @@ import { QUERY_CPU, 
QUERY_HPA_ISSUE, QUERY_HTTP_ERROR, + QUERY_HTTP_ERROR_COMBINED, QUERY_HTTP_LATENCY, + QUERY_HTTP_LATENCY_COMBINED, QUERY_INSTANCE_RESTART, QUERY_MEMORY, QUERY_MISSING_INSTANCE, @@ -43,6 +46,7 @@ interface AlertingCreationFlowContextInterface { totalSteps: number containerName?: string ingressName?: string + httpRouteName?: string onNavigateToMetric: (index: number) => void onComplete: (alerts: AlertConfiguration[]) => Promise isLoading: boolean @@ -103,6 +107,7 @@ export function AlertingCreationFlow({ endDate: now.toISOString(), }) + // NGINX: Fetch nginx ingress name (to remove when migrating to envoy) const { data: ingressName } = useIngressName({ clusterId: environment.cluster_id, serviceId: service.id, @@ -110,6 +115,14 @@ export function AlertingCreationFlow({ endDate: now.toISOString(), }) + // ENVOY: Fetch envoy HTTPRoute name + const { data: httpRouteName } = useHttpRouteName({ + clusterId: environment.cluster_id, + serviceId: service.id, + startDate: oneHourAgo.toISOString(), + endDate: now.toISOString(), + }) + const hasAutoscaling = (service?.serviceType === 'APPLICATION' || service?.serviceType === 'CONTAINER') && service?.min_running_instances !== service?.max_running_instances @@ -155,7 +168,8 @@ export function AlertingCreationFlow({ service?.min_running_instances !== service?.max_running_instances if (!containerName) return - if (hasPublicPort && !ingressName) return + // For HTTP alerts, require at least one of nginx or envoy to be present + if (hasPublicPort && !ingressName && !httpRouteName) return if (hasAutoscaling && !hpaName) return try { @@ -201,8 +215,26 @@ export function AlertingCreationFlow({ .with('memory', () => QUERY_MEMORY(containerName)) .with('missing_instance', () => QUERY_MISSING_INSTANCE(containerName)) .with('instance_restart', () => QUERY_INSTANCE_RESTART(containerName)) - .with('http_error', () => (ingressName ? QUERY_HTTP_ERROR(ingressName) : '')) - .with('http_latency', () => (ingressName ? QUERY_HTTP_LATENCY(ingressName) : '')) + .with('http_error', () => { + // Use combined query if both sources available, otherwise fallback to single source + if (ingressName && httpRouteName) { + return QUERY_HTTP_ERROR_COMBINED(ingressName, httpRouteName) + } + if (ingressName) { + return QUERY_HTTP_ERROR(ingressName) + } + return '' + }) + .with('http_latency', () => { + // Use combined query if both sources available, otherwise fallback to single source + if (ingressName && httpRouteName) { + return QUERY_HTTP_LATENCY_COMBINED(ingressName, httpRouteName) + } + if (ingressName) { + return QUERY_HTTP_LATENCY(ingressName) + } + return '' + }) .with('hpa_limit', () => (hpaName ? 
QUERY_HPA_ISSUE(hpaName) : '')) .otherwise(() => ''), }, @@ -237,6 +269,7 @@ export function AlertingCreationFlow({ totalSteps, containerName, ingressName, + httpRouteName, onNavigateToMetric: handleNavigateToMetric, onComplete: handleComplete, isLoading, diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts index 667974f76f6..15e1c213592 100644 --- a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts @@ -5,6 +5,7 @@ export const QUERY_MEMORY = (containerName: string) => ` container_memory_working_set_bytes{container="${containerName}"} / on(pod, namespace, container) kube_pod_container_resource_requests{resource="memory", container="${containerName}"} ` +// NGINX: Query for nginx HTTP error rate (to remove when migrating to envoy) export const QUERY_HTTP_ERROR = (ingressName: string) => ` ( sum by (ingress, namespace) ( @@ -22,6 +23,48 @@ export const QUERY_HTTP_ERROR = (ingressName: string) => ` ) )` +// ENVOY: Query for envoy HTTP error rate +export const QUERY_HTTP_ERROR_ENVOY = (httpRouteName: string) => ` +( + sum by (httproute_name, namespace) ( + rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", + httproute_name="${httpRouteName}", + envoy_response_code_class="5" + }[1m]) + ) +) +/ +( + sum by (httproute_name, namespace) ( + rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", + httproute_name="${httpRouteName}" + }[1m]) + ) +)` + +// Combined nginx + envoy HTTP error rate (aggregates both sources) +export const QUERY_HTTP_ERROR_COMBINED = (ingressName: string, httpRouteName: string) => ` +( + sum( + rate(nginx_ingress_controller_requests{ingress="${ingressName}", status=~"5.."}[1m]) + ) + + + sum( + rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", httproute_name="${httpRouteName}", envoy_response_code_class="5"}[1m]) + ) +) +/ +( + sum( + rate(nginx_ingress_controller_requests{ingress="${ingressName}"}[1m]) + ) + + + sum( + rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", httproute_name="${httpRouteName}"}[1m]) + ) +)` + +// NGINX: Query for nginx HTTP latency p99 (to remove when migrating to envoy) export const QUERY_HTTP_LATENCY = (ingressName: string) => ` histogram_quantile( 0.99, @@ -34,6 +77,47 @@ export const QUERY_HTTP_LATENCY = (ingressName: string) => ` ) )` +// ENVOY: Query for envoy HTTP latency p99 +export const QUERY_HTTP_LATENCY_ENVOY = (httpRouteName: string) => ` +histogram_quantile( + 0.99, + sum by (namespace, httproute_name, le) ( + rate( + envoy_cluster_upstream_rq_time_bucket{ + envoy_cluster_name=~".*", + httproute_name="${httpRouteName}" + }[1m] + ) + ) +) / 1000` + +// Combined nginx + envoy HTTP latency p99 (takes max of both sources) +export const QUERY_HTTP_LATENCY_COMBINED = (ingressName: string, httpRouteName: string) => ` +max( + histogram_quantile( + 0.99, + sum by (namespace, ingress, le) ( + rate( + nginx_ingress_controller_request_duration_seconds_bucket{ + ingress="${ingressName}" + }[1m] + ) + ) + ) + or + histogram_quantile( + 0.99, + sum by (namespace, httproute_name, le) ( + rate( + envoy_cluster_upstream_rq_time_bucket{ + envoy_cluster_name=~".*", + httproute_name="${httpRouteName}" + }[1m] + ) + ) + ) / 1000 +)` + export const QUERY_INSTANCE_RESTART = (containerName: 
string) => ` ( increase( diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx index 5673c67ebe6..d35a6637d54 100644 --- a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx @@ -13,7 +13,9 @@ import { ALERTING_CREATION_EDIT, ALERTING_CREATION_METRIC } from '../router' import { QUERY_CPU, QUERY_HTTP_ERROR, + QUERY_HTTP_ERROR_COMBINED, QUERY_HTTP_LATENCY, + QUERY_HTTP_LATENCY_COMBINED, QUERY_INSTANCE_RESTART, QUERY_MEMORY, QUERY_MISSING_INSTANCE, @@ -138,6 +140,7 @@ export function SummaryStep() { selectedMetrics, containerName, ingressName, + httpRouteName, } = useAlertingCreationFlowContext() useEffect(() => { @@ -160,7 +163,10 @@ export function SummaryStep() { const handleConfirm = async () => { const activeAlerts = alerts.filter((alert) => !alert.skipped) - if (!service || !environment || !containerName || !ingressName) return + if (!service || !environment || !containerName) return + // For HTTP alerts, require at least one of nginx or envoy to be present + const hasPublicPort = activeAlerts.some((alert) => alert.tag === 'http_error' || alert.tag === 'http_latency') + if (hasPublicPort && !ingressName && !httpRouteName) return try { setIsCreatingAlertRule(true) @@ -190,8 +196,26 @@ export function SummaryStep() { .with('memory', () => QUERY_MEMORY(containerName)) .with('missing_instance', () => QUERY_MISSING_INSTANCE(containerName)) .with('instance_restart', () => QUERY_INSTANCE_RESTART(containerName)) - .with('http_error', () => QUERY_HTTP_ERROR(ingressName)) - .with('http_latency', () => QUERY_HTTP_LATENCY(ingressName)) + .with('http_error', () => { + // Use combined query if both sources available, otherwise fallback to single source + if (ingressName && httpRouteName) { + return QUERY_HTTP_ERROR_COMBINED(ingressName, httpRouteName) + } + if (ingressName) { + return QUERY_HTTP_ERROR(ingressName) + } + return '' + }) + .with('http_latency', () => { + // Use combined query if both sources available, otherwise fallback to single source + if (ingressName && httpRouteName) { + return QUERY_HTTP_LATENCY_COMBINED(ingressName, httpRouteName) + } + if (ingressName) { + return QUERY_HTTP_LATENCY(ingressName) + } + return '' + }) .otherwise(() => ''), }, for_duration: alert.for_duration, diff --git a/libs/domains/observability/feature/src/lib/hooks/use-http-route-name/use-http-route-name.ts b/libs/domains/observability/feature/src/lib/hooks/use-http-route-name/use-http-route-name.ts new file mode 100644 index 00000000000..1ae80987137 --- /dev/null +++ b/libs/domains/observability/feature/src/lib/hooks/use-http-route-name/use-http-route-name.ts @@ -0,0 +1,20 @@ +import { useQuery } from '@tanstack/react-query' +import { observability } from '@qovery/domains/observability/data-access' + +export interface UseHttpRouteNameProps { + clusterId: string + serviceId: string + startDate: string + endDate: string + enabled?: boolean +} + +// Retrieves the http route name associated with a specific service (http managed by envoy) +export function useHttpRouteName({ clusterId, serviceId, enabled = true, startDate, endDate }: UseHttpRouteNameProps) { + return useQuery({ + ...observability.httpRouteName({ clusterId, serviceId, startDate, endDate }), + enabled: enabled && 
Boolean(clusterId && serviceId), + }) +} + +export default useHttpRouteName diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.spec.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.spec.tsx index 37046d86a1b..0d142f1efa0 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.spec.tsx +++ b/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.spec.tsx @@ -32,6 +32,7 @@ describe('CardHTTPErrors', () => { clusterId: 'test-cluster-id', containerName: 'test-container-name', ingressName: 'test-ingress-name', + httpRouteName: 'test-httproute-name', } beforeEach(() => { @@ -39,6 +40,7 @@ describe('CardHTTPErrors', () => { }) it('should render successfully with loading state', () => { + // Mock all 4 calls (nginx error, nginx total, envoy error, envoy total) as loading useInstantMetrics.mockReturnValue(createMockUseInstantMetricsReturn(undefined, true)) const { baseElement } = renderWithProviders( @@ -52,6 +54,7 @@ describe('CardHTTPErrors', () => { it('should render with no errors (GREEN status)', () => { useInstantMetrics + // NGINX error requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -63,6 +66,31 @@ describe('CardHTTPErrors', () => { }, }) ) + // NGINX total requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '100'], + }, + ], + }, + }) + ) + // ENVOY error requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '0'], + }, + ], + }, + }) + ) + // ENVOY total requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -87,6 +115,7 @@ describe('CardHTTPErrors', () => { it('should render with errors (RED status) and show modal link', () => { useInstantMetrics + // NGINX error requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -98,6 +127,7 @@ describe('CardHTTPErrors', () => { }, }) ) + // NGINX total requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -109,6 +139,30 @@ describe('CardHTTPErrors', () => { }, }) ) + // ENVOY error requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '0'], + }, + ], + }, + }) + ) + // ENVOY total requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '0'], + }, + ], + }, + }) + ) renderWithProviders( @@ -125,6 +179,23 @@ describe('CardHTTPErrors', () => { it('should handle empty metrics data', () => { useInstantMetrics + // NGINX error requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [], + }, + }) + ) + // NGINX total requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [], + }, + }) + ) + // ENVOY error requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -132,6 +203,7 @@ describe('CardHTTPErrors', () => { }, }) ) + // ENVOY total requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -151,7 +223,13 @@ describe('CardHTTPErrors', () => { it('should handle undefined metrics data', () => { useInstantMetrics + // NGINX error requests + .mockReturnValueOnce(createMockUseInstantMetricsReturn()) + // NGINX total requests + 
.mockReturnValueOnce(createMockUseInstantMetricsReturn()) + // ENVOY error requests .mockReturnValueOnce(createMockUseInstantMetricsReturn()) + // ENVOY total requests .mockReturnValueOnce(createMockUseInstantMetricsReturn()) renderWithProviders( @@ -167,7 +245,9 @@ describe('CardHTTPErrors', () => { let callCount = 0 useInstantMetrics.mockImplementation(() => { callCount++ + // Calls: 1=nginx errors, 2=nginx total, 3=envoy errors, 4=envoy total if (callCount === 1) { + // NGINX error requests return createMockUseInstantMetricsReturn({ data: { result: [ @@ -177,7 +257,8 @@ describe('CardHTTPErrors', () => { ], }, }) - } else { + } else if (callCount === 2) { + // NGINX total requests return createMockUseInstantMetricsReturn({ data: { result: [ @@ -187,6 +268,28 @@ describe('CardHTTPErrors', () => { ], }, }) + } else if (callCount === 3) { + // ENVOY error requests + return createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '0'], + }, + ], + }, + }) + } else { + // ENVOY total requests + return createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '0'], + }, + ], + }, + }) } }) @@ -209,7 +312,13 @@ describe('CardHTTPErrors', () => { it('should call useInstantMetrics with correct parameters', () => { useInstantMetrics + // NGINX error requests .mockReturnValueOnce(createMockUseInstantMetricsReturn()) + // NGINX total requests + .mockReturnValueOnce(createMockUseInstantMetricsReturn()) + // ENVOY error requests + .mockReturnValueOnce(createMockUseInstantMetricsReturn()) + // ENVOY total requests .mockReturnValueOnce(createMockUseInstantMetricsReturn()) renderWithProviders( @@ -218,7 +327,10 @@ describe('CardHTTPErrors', () => { ) - expect(useInstantMetrics).toHaveBeenCalledTimes(2) + // Now called 4 times: nginx errors, nginx total, envoy errors, envoy total + expect(useInstantMetrics).toHaveBeenCalledTimes(4) + + // Check nginx calls expect(useInstantMetrics).toHaveBeenCalledWith({ clusterId: 'test-cluster-id', query: expect.stringContaining('nginx:req_inc:5m'), @@ -230,10 +342,15 @@ describe('CardHTTPErrors', () => { const calledQuery = useInstantMetrics.mock.calls[0][0].query expect(calledQuery).toContain('test-ingress-name') + + // Check envoy calls + const envoyQuery = useInstantMetrics.mock.calls[2][0].query + expect(envoyQuery).toContain('test-httproute-name') }) it('should not show modal link when there are no errors', () => { useInstantMetrics + // NGINX error requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -245,6 +362,7 @@ describe('CardHTTPErrors', () => { }, }) ) + // NGINX total requests .mockReturnValueOnce( createMockUseInstantMetricsReturn({ data: { @@ -256,6 +374,30 @@ describe('CardHTTPErrors', () => { }, }) ) + // ENVOY error requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '0'], + }, + ], + }, + }) + ) + // ENVOY total requests + .mockReturnValueOnce( + createMockUseInstantMetricsReturn({ + data: { + result: [ + { + value: [1234567890, '0'], + }, + ], + }, + }) + ) renderWithProviders( diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.tsx index 7403e573511..83135984f39 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.tsx +++ 
b/libs/domains/observability/feature/src/lib/service/service-dashboard/card-http-errors/card-http-errors.tsx @@ -6,6 +6,7 @@ import { useDashboardContext } from '../../../util-filter/dashboard-context' import { CardMetric } from '../card-metric/card-metric' import { InstanceHTTPErrorsChart } from '../instance-http-errors-chart/instance-http-errors-chart' +// NGINX: Queries for nginx metrics (to remove when migrating to envoy) const queryErrorRequest = (timeRange: string, ingressName: string) => ` sum(sum_over_time( (nginx:req_inc:5m_by_status{ingress="${ingressName}", status=~"5.."})[${timeRange}:5m] @@ -18,20 +19,36 @@ const queryTotalRequest = (timeRange: string, ingressName: string) => ` ) ` +// ENVOY: Queries for envoy metrics +const queryEnvoyErrorRequest = (timeRange: string, httpRouteName: string) => ` + sum(sum_over_time( + (envoy_proxy:req_inc:5m_by_status{httproute_name="${httpRouteName}", envoy_response_code=~"5.."})[${timeRange}:5m] + )) +` + +const queryEnvoyTotalRequest = (timeRange: string, httpRouteName: string) => ` + sum_over_time( + (envoy_proxy:req_inc:5m{httproute_name="${httpRouteName}"})[${timeRange}:5m] + ) +` + export function CardHTTPErrors({ serviceId, clusterId, containerName, ingressName, + httpRouteName, }: { serviceId: string clusterId: string containerName: string ingressName: string + httpRouteName: string }) { const { queryTimeRange, startTimestamp, endTimestamp } = useDashboardContext() const [isModalOpen, setIsModalOpen] = useState(false) + // NGINX: Fetch nginx metrics (to remove when migrating to envoy) const { data: metricsErrorRequest, isLoading: isLoadingMetrics } = useInstantMetrics({ clusterId, query: queryErrorRequest(queryTimeRange, ingressName), @@ -50,8 +67,33 @@ export function CardHTTPErrors({ metricShortName: 'card_req_all_number', }) - const errorRaw = Math.round(metricsErrorRequest?.data?.result[0]?.value[1]) - const totalRequest = Math.round(metricsTotalRequest?.data?.result[0]?.value[1]) || 0 + // ENVOY: Fetch envoy metrics + const { data: metricsEnvoyErrorRequest, isLoading: isLoadingMetricsEnvoyError } = useInstantMetrics({ + clusterId, + query: queryEnvoyErrorRequest(queryTimeRange, httpRouteName), + startTimestamp, + endTimestamp, + boardShortName: 'service_overview', + metricShortName: 'card_envoy_req_errors_number', + }) + + const { data: metricsEnvoyTotalRequest, isLoading: isLoadingMetricsEnvoyTotal } = useInstantMetrics({ + clusterId, + query: queryEnvoyTotalRequest(queryTimeRange, httpRouteName), + startTimestamp, + endTimestamp, + boardShortName: 'service_overview', + metricShortName: 'card_envoy_req_all_number', + }) + + // Aggregate nginx + envoy metrics + const nginxErrors = Math.round(metricsErrorRequest?.data?.result[0]?.value[1]) || 0 + const nginxTotal = Math.round(metricsTotalRequest?.data?.result[0]?.value[1]) || 0 + const envoyErrors = Math.round(metricsEnvoyErrorRequest?.data?.result[0]?.value[1]) || 0 + const envoyTotal = Math.round(metricsEnvoyTotalRequest?.data?.result[0]?.value[1]) || 0 + + const errorRaw = nginxErrors + envoyErrors + const totalRequest = nginxTotal + envoyTotal const errorRate = Math.ceil(totalRequest > 0 ? 
100 * (errorRaw / totalRequest) : 0) || 0 const isError = errorRate > 0 @@ -63,7 +105,9 @@ export function CardHTTPErrors({ setIsModalOpen(true) : undefined} hasModalLink={isError} /> @@ -80,6 +124,7 @@ export function CardHTTPErrors({ serviceId={serviceId} containerName={containerName} ingressName={ingressName} + httpRouteName={httpRouteName} /> diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/card-percentile-99/card-percentile-99.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/card-percentile-99/card-percentile-99.tsx index 66d80b97346..064c49256cb 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/card-percentile-99/card-percentile-99.tsx +++ b/libs/domains/observability/feature/src/lib/service/service-dashboard/card-percentile-99/card-percentile-99.tsx @@ -5,22 +5,31 @@ import { useDashboardContext } from '../../../util-filter/dashboard-context' import { CardMetric } from '../card-metric/card-metric' import NetworkRequestDurationChart from '../network-request-duration-chart/network-request-duration-chart' +// NGINX: Query for nginx metrics (to remove when migrating to envoy) const query = (timeRange: string, ingressName: string) => ` max_over_time(nginx:request_p99:5m{ingress="${ingressName}"}[${timeRange}]) ` +// ENVOY: Query for envoy metrics +const queryEnvoy = (timeRange: string, httpRouteName: string) => ` + max_over_time(envoy_proxy:request_p99:5m{httproute_name="${httpRouteName}"}[${timeRange}]) +` + export function CardPercentile99({ serviceId, clusterId, ingressName, + httpRouteName, }: { serviceId: string clusterId: string ingressName: string + httpRouteName: string }) { const { queryTimeRange, startTimestamp, endTimestamp } = useDashboardContext() const [isModalOpen, setIsModalOpen] = useState(false) + // NGINX: Fetch nginx metrics (to remove when migrating to envoy) const { data: metrics, isLoading: isLoadingMetrics } = useInstantMetrics({ clusterId, query: query(queryTimeRange, ingressName), @@ -30,7 +39,20 @@ export function CardPercentile99({ metricShortName: 'card_p99_count', }) - const value = Math.round(Number(metrics?.data?.result[0]?.value[1]) * 1000) || 0 + // ENVOY: Fetch envoy metrics + const { data: metricsEnvoy, isLoading: isLoadingMetricsEnvoy } = useInstantMetrics({ + clusterId, + query: queryEnvoy(queryTimeRange, httpRouteName), + startTimestamp, + endTimestamp, + boardShortName: 'service_overview', + metricShortName: 'card_envoy_p99_count', + }) + + // Use max of both sources (nginx values are in seconds, envoy in ms) + const nginxValue = Math.round(Number(metrics?.data?.result[0]?.value[1]) * 1000) || 0 + const envoyValue = Math.round(Number(metricsEnvoy?.data?.result[0]?.value[1])) || 0 + const value = Math.max(nginxValue, envoyValue) const defaultThreshold = 250 const isError = value > defaultThreshold @@ -43,7 +65,7 @@ export function CardPercentile99({ title={title} description={description} status={isError ? 
'RED' : 'GREEN'} - isLoading={isLoadingMetrics} + isLoading={isLoadingMetrics || isLoadingMetricsEnvoy} onClick={() => setIsModalOpen(true)} hasModalLink /> @@ -55,6 +77,7 @@ export function CardPercentile99({ serviceId={serviceId} isFullscreen ingressName={ingressName} + httpRouteName={httpRouteName} /> diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/instance-http-errors-chart/instance-http-errors-chart.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/instance-http-errors-chart/instance-http-errors-chart.tsx index 847216b8621..9e481faa09d 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/instance-http-errors-chart/instance-http-errors-chart.tsx +++ b/libs/domains/observability/feature/src/lib/service/service-dashboard/instance-http-errors-chart/instance-http-errors-chart.tsx @@ -8,6 +8,7 @@ import { addTimeRangePadding } from '../../../util-chart/add-time-range-padding' import { processMetricsData } from '../../../util-chart/process-metrics-data' import { useDashboardContext } from '../../../util-filter/dashboard-context' +// NGINX: Query for nginx metrics (to remove when migrating to envoy) const query = (ingressName: string) => ` 100 * sum by (status) ( @@ -23,15 +24,33 @@ clamp_min( ) > 0 ` +// ENVOY: Query for envoy metrics +const queryEnvoy = (httpRouteName: string) => ` +100 * +sum by (envoy_response_code) ( + envoy_proxy:req_rate:5m_by_status{httproute_name="${httpRouteName}", envoy_response_code=~"5.."} +) +/ +ignoring(envoy_response_code) group_left +clamp_min( + sum( + envoy_proxy:req_rate:5m{httproute_name="${httpRouteName}"} + ), + 1 +) > 0 +` + export function InstanceHTTPErrorsChart({ clusterId, serviceId, ingressName, + httpRouteName, }: { clusterId: string serviceId: string containerName: string ingressName: string + httpRouteName: string }) { const { startTimestamp, endTimestamp, useLocalTime, timeRange } = useDashboardContext() @@ -53,6 +72,7 @@ export function InstanceHTTPErrorsChart({ setLegendSelectedKeys(new Set()) } + // NGINX: Fetch nginx metrics (to remove when migrating to envoy) const { data: metricsHttpStatusErrorRatio, isLoading: isLoadingHttpStatusErrorRatio } = useMetrics({ clusterId, startTimestamp, @@ -63,19 +83,47 @@ export function InstanceHTTPErrorsChart({ metricShortName: 'http_errors', }) + // ENVOY: Fetch envoy metrics + const { data: metricsEnvoyHttpStatusErrorRatio, isLoading: isLoadingEnvoyHttpStatusErrorRatio } = useMetrics({ + clusterId, + startTimestamp, + endTimestamp, + query: queryEnvoy(httpRouteName), + timeRange, + boardShortName: 'service_overview', + metricShortName: 'envoy_http_errors', + }) + const chartData = useMemo(() => { - // Merge healthy and unhealthy metrics into a single timeSeriesMap + // Check if we have data from either source + if (!metricsHttpStatusErrorRatio?.data?.result && !metricsEnvoyHttpStatusErrorRatio?.data?.result) { + return [] + } + + // Merge nginx and envoy metrics into a single timeSeriesMap const timeSeriesMap = new Map< number, { timestamp: number; time: string; fullTime: string; [key: string]: string | number | null } >() - // Process ratio of HTTP status error + // NGINX: Process nginx HTTP status error ratio (to remove when migrating to envoy) if (metricsHttpStatusErrorRatio?.data?.result) { processMetricsData( metricsHttpStatusErrorRatio, timeSeriesMap, - (_, index) => JSON.stringify(metricsHttpStatusErrorRatio.data.result[index].metric), + (_, index) => JSON.stringify({ 
...metricsHttpStatusErrorRatio.data.result[index].metric, source: 'nginx' }), + (value) => parseFloat(value), + useLocalTime + ) + } + + // ENVOY: Process envoy HTTP status error ratio + if (metricsEnvoyHttpStatusErrorRatio?.data?.result) { + processMetricsData( + metricsEnvoyHttpStatusErrorRatio, + timeSeriesMap, + (_, index) => + JSON.stringify({ ...metricsEnvoyHttpStatusErrorRatio.data.result[index].metric, source: 'envoy' }), (value) => parseFloat(value), useLocalTime ) @@ -84,19 +132,36 @@ export function InstanceHTTPErrorsChart({ // Convert map to sorted array and add time range padding const baseChartData = Array.from(timeSeriesMap.values()).sort((a, b) => a.timestamp - b.timestamp) return addTimeRangePadding(baseChartData, startTimestamp, endTimestamp, useLocalTime) - }, [metricsHttpStatusErrorRatio, useLocalTime, startTimestamp, endTimestamp]) + }, [metricsHttpStatusErrorRatio, metricsEnvoyHttpStatusErrorRatio, useLocalTime, startTimestamp, endTimestamp]) const seriesNames = useMemo(() => { - if (!metricsHttpStatusErrorRatio?.data?.result) return [] - return metricsHttpStatusErrorRatio.data.result.map((_: unknown, index: number) => - JSON.stringify(metricsHttpStatusErrorRatio.data.result[index].metric) - ) as string[] - }, [metricsHttpStatusErrorRatio]) + const names: string[] = [] + + // NGINX: Extract nginx series names (to remove when migrating to envoy) + if (metricsHttpStatusErrorRatio?.data?.result) { + names.push( + ...metricsHttpStatusErrorRatio.data.result.map((_: unknown, index: number) => + JSON.stringify({ ...metricsHttpStatusErrorRatio.data.result[index].metric, source: 'nginx' }) + ) + ) + } + + // ENVOY: Extract envoy series names + if (metricsEnvoyHttpStatusErrorRatio?.data?.result) { + names.push( + ...metricsEnvoyHttpStatusErrorRatio.data.result.map((_: unknown, index: number) => + JSON.stringify({ ...metricsEnvoyHttpStatusErrorRatio.data.result[index].metric, source: 'envoy' }) + ) + ) + } + + return names + }, [metricsHttpStatusErrorRatio, metricsEnvoyHttpStatusErrorRatio]) return ( ` nginx:request_p50:5m{ingress="${ingressName}"} ` @@ -19,16 +20,31 @@ const queryDuration99 = (ingressName: string) => ` nginx:request_p99:5m{ingress="${ingressName}"} ` +// ENVOY: Queries for envoy metrics +const queryEnvoyDuration50 = (httpRouteName: string) => ` + envoy_proxy:request_p50:5m{httproute_name="${httpRouteName}"} +` + +const queryEnvoyDuration95 = (httpRouteName: string) => ` + envoy_proxy:request_p95:5m{httproute_name="${httpRouteName}"} +` + +const queryEnvoyDuration99 = (httpRouteName: string) => ` + envoy_proxy:request_p99:5m{httproute_name="${httpRouteName}"} +` + export function NetworkRequestDurationChart({ clusterId, serviceId, isFullscreen, ingressName, + httpRouteName, }: { clusterId: string serviceId: string isFullscreen?: boolean ingressName: string + httpRouteName: string }) { const { startTimestamp, endTimestamp, useLocalTime, timeRange } = useDashboardContext() @@ -50,6 +66,7 @@ export function NetworkRequestDurationChart({ setLegendSelectedKeys(new Set()) } + // NGINX: Fetch nginx metrics (to remove when migrating to envoy) const { data: metrics50, isLoading: isLoadingMetrics50 } = useMetrics({ clusterId, startTimestamp, @@ -80,8 +97,40 @@ export function NetworkRequestDurationChart({ metricShortName: 'network_p95', }) + // ENVOY: Fetch envoy metrics + const { data: metricsEnvoy50, isLoading: isLoadingMetricsEnvoy50 } = useMetrics({ + clusterId, + startTimestamp, + endTimestamp, + timeRange, + query: queryEnvoyDuration50(httpRouteName), + 
boardShortName: 'service_overview', + metricShortName: 'envoy_p50', + }) + + const { data: metricsEnvoy99, isLoading: isLoadingMetricsEnvoy99 } = useMetrics({ + clusterId, + startTimestamp, + endTimestamp, + timeRange, + query: queryEnvoyDuration99(httpRouteName), + boardShortName: 'service_overview', + metricShortName: 'envoy_p99', + }) + + const { data: metricsEnvoy95, isLoading: isLoadingMetricsEnvoy95 } = useMetrics({ + clusterId, + startTimestamp, + endTimestamp, + timeRange, + query: queryEnvoyDuration95(httpRouteName), + boardShortName: 'service_overview', + metricShortName: 'envoy_p95', + }) + const chartData = useMemo(() => { - if (!metrics99?.data?.result) { + // Check if we have data from either source + if (!metrics99?.data?.result && !metricsEnvoy99?.data?.result) { return [] } @@ -90,39 +139,90 @@ export function NetworkRequestDurationChart({ { timestamp: number; time: string; fullTime: string; [key: string]: string | number | null } >() - // Process network duration 99th percentile metrics - processMetricsData( - metrics99, - timeSeriesMap, - () => '99th percentile', - (value) => parseFloat(value) * 1000, // Convert to ms - useLocalTime - ) - - // Process network duration 99th percentile metrics - processMetricsData( - metrics95, - timeSeriesMap, - () => '95th percentile', - (value) => parseFloat(value) * 1000, // Convert to ms - useLocalTime - ) - - // Process network duration 0.5th percentile metrics - processMetricsData( - metrics50, - timeSeriesMap, - () => '50th percentile', - (value) => parseFloat(value) * 1000, // Convert to ms - useLocalTime - ) + // NGINX: Process nginx duration metrics (to remove when migrating to envoy) + if (metrics99?.data?.result) { + processMetricsData( + metrics99, + timeSeriesMap, + () => '99th percentile (nginx)', + (value) => parseFloat(value) * 1000, // Convert to ms + useLocalTime + ) + } + + if (metrics95?.data?.result) { + processMetricsData( + metrics95, + timeSeriesMap, + () => '95th percentile (nginx)', + (value) => parseFloat(value) * 1000, // Convert to ms + useLocalTime + ) + } + + if (metrics50?.data?.result) { + processMetricsData( + metrics50, + timeSeriesMap, + () => '50th percentile (nginx)', + (value) => parseFloat(value) * 1000, // Convert to ms + useLocalTime + ) + } + + // ENVOY: Process envoy duration metrics + if (metricsEnvoy99?.data?.result) { + processMetricsData( + metricsEnvoy99, + timeSeriesMap, + () => '99th percentile (envoy)', + (value) => parseFloat(value), // Already in ms + useLocalTime + ) + } + + if (metricsEnvoy95?.data?.result) { + processMetricsData( + metricsEnvoy95, + timeSeriesMap, + () => '95th percentile (envoy)', + (value) => parseFloat(value), // Already in ms + useLocalTime + ) + } + + if (metricsEnvoy50?.data?.result) { + processMetricsData( + metricsEnvoy50, + timeSeriesMap, + () => '50th percentile (envoy)', + (value) => parseFloat(value), // Already in ms + useLocalTime + ) + } const baseChartData = Array.from(timeSeriesMap.values()).sort((a, b) => a.timestamp - b.timestamp) return addTimeRangePadding(baseChartData, startTimestamp, endTimestamp, useLocalTime) - }, [metrics99, metrics95, metrics50, useLocalTime, startTimestamp, endTimestamp]) + }, [ + metrics99, + metrics95, + metrics50, + metricsEnvoy99, + metricsEnvoy95, + metricsEnvoy50, + useLocalTime, + startTimestamp, + endTimestamp, + ]) - const isLoadingMetrics = isLoadingMetrics99 || isLoadingMetrics50 || isLoadingMetrics95 + const isLoadingMetrics = + isLoadingMetrics99 || + isLoadingMetrics50 || + isLoadingMetrics95 || + 
isLoadingMetricsEnvoy99 || + isLoadingMetricsEnvoy50 || + isLoadingMetricsEnvoy95 return ( 0 ? handleResetLegend : undefined} > + {/* NGINX: Lines for nginx metrics (to remove when migrating to envoy) */} 0 && !legendSelectedKeys.has('50th percentile') ? true : false} + hide={legendSelectedKeys.size > 0 && !legendSelectedKeys.has('50th percentile (nginx)')} /> 0 && !legendSelectedKeys.has('95th percentile') ? true : false} + hide={legendSelectedKeys.size > 0 && !legendSelectedKeys.has('95th percentile (nginx)')} /> 0 && !legendSelectedKeys.has('99th percentile') ? true : false} + hide={legendSelectedKeys.size > 0 && !legendSelectedKeys.has('99th percentile (nginx)')} + /> + {/* ENVOY: Lines for envoy metrics */} + 0 && !legendSelectedKeys.has('50th percentile (envoy)')} + /> + 0 && !legendSelectedKeys.has('95th percentile (envoy)')} + /> + 0 && !legendSelectedKeys.has('99th percentile (envoy)')} /> {!isLoadingMetrics && chartData.length > 0 && ( } + content={(props) => { + const nginxSeries = ['50th percentile (nginx)', '95th percentile (nginx)', '99th percentile (nginx)'] + const envoySeries = ['50th percentile (envoy)', '95th percentile (envoy)', '99th percentile (envoy)'] + + return ( +
+ {props.payload?.some((item) => nginxSeries.includes(item.dataKey as string)) && ( + nginxSeries.includes(item.dataKey as string))} + /> + )} + {props.payload?.some((item) => envoySeries.includes(item.dataKey as string)) && ( + envoySeries.includes(item.dataKey as string))} + /> + )} +
+ ) + }} /> )}
diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-size-chart/network-request-size-chart.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-size-chart/network-request-size-chart.tsx index 4f0052ac722..7034f881eb6 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-size-chart/network-request-size-chart.tsx +++ b/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-size-chart/network-request-size-chart.tsx @@ -7,6 +7,7 @@ import { addTimeRangePadding } from '../../../util-chart/add-time-range-padding' import { processMetricsData } from '../../../util-chart/process-metrics-data' import { useDashboardContext } from '../../../util-filter/dashboard-context' +// NGINX: Queries for nginx metrics (to remove when migrating to envoy) const queryResponseSize = (ingressName: string) => ` sum(nginx:resp_bytes_rate:5m{ingress="${ingressName}"}) ` @@ -15,14 +16,25 @@ const queryRequestSize = (ingressName: string) => ` sum(nginx:req_bytes_rate:5m{ingress="${ingressName}"}) ` +// ENVOY: Queries for envoy metrics +const queryEnvoyResponseSize = (httpRouteName: string) => ` + sum(envoy_proxy:resp_bytes_rate:5m{httproute_name="${httpRouteName}"}) +` + +const queryEnvoyRequestSize = (httpRouteName: string) => ` + sum(envoy_proxy:req_bytes_rate:5m{httproute_name="${httpRouteName}"}) +` + export function NetworkRequestSizeChart({ clusterId, serviceId, ingressName, + httpRouteName, }: { clusterId: string serviceId: string ingressName: string + httpRouteName: string }) { const { startTimestamp, endTimestamp, useLocalTime, timeRange } = useDashboardContext() @@ -44,6 +56,7 @@ export function NetworkRequestSizeChart({ setLegendSelectedKeys(new Set()) } + // NGINX: Fetch nginx metrics (to remove when migrating to envoy) const { data: metricsResponseSize, isLoading: isLoadingMetricsResponseSize } = useMetrics({ clusterId, startTimestamp, @@ -64,8 +77,30 @@ export function NetworkRequestSizeChart({ metricShortName: 'network_req_size', }) + // ENVOY: Fetch envoy metrics + const { data: metricsEnvoyResponseSize, isLoading: isLoadingMetricsEnvoyResponseSize } = useMetrics({ + clusterId, + startTimestamp, + endTimestamp, + timeRange, + query: queryEnvoyResponseSize(httpRouteName), + boardShortName: 'service_overview', + metricShortName: 'envoy_resp_size', + }) + + const { data: metricsEnvoyRequestSize, isLoading: isLoadingMetricsEnvoyRequestSize } = useMetrics({ + clusterId, + startTimestamp, + endTimestamp, + timeRange, + query: queryEnvoyRequestSize(httpRouteName), + boardShortName: 'service_overview', + metricShortName: 'envoy_req_size', + }) + const chartData = useMemo(() => { - if (!metricsResponseSize?.data?.result) { + // Check if we have data from either source + if (!metricsResponseSize?.data?.result && !metricsEnvoyResponseSize?.data?.result) { return [] } @@ -74,30 +109,66 @@ export function NetworkRequestSizeChart({ { timestamp: number; time: string; fullTime: string; [key: string]: string | number | null } >() - // Process network response size metrics - processMetricsData( - metricsResponseSize, - timeSeriesMap, - () => 'Response size', - (value) => parseFloat(value), // Convert to bytes - useLocalTime - ) - - // Process network request size metrics - processMetricsData( - metricsRequestSize, - timeSeriesMap, - () => 'Request size', - (value) => parseFloat(value), // Convert to bytes - useLocalTime - ) + // NGINX: Process nginx size metrics (to 
remove when migrating to envoy) + if (metricsResponseSize?.data?.result) { + processMetricsData( + metricsResponseSize, + timeSeriesMap, + () => 'Response size (nginx)', + (value) => parseFloat(value), + useLocalTime + ) + } + + if (metricsRequestSize?.data?.result) { + processMetricsData( + metricsRequestSize, + timeSeriesMap, + () => 'Request size (nginx)', + (value) => parseFloat(value), + useLocalTime + ) + } + + // ENVOY: Process envoy size metrics + if (metricsEnvoyResponseSize?.data?.result) { + processMetricsData( + metricsEnvoyResponseSize, + timeSeriesMap, + () => 'Response size (envoy)', + (value) => parseFloat(value), + useLocalTime + ) + } + + if (metricsEnvoyRequestSize?.data?.result) { + processMetricsData( + metricsEnvoyRequestSize, + timeSeriesMap, + () => 'Request size (envoy)', + (value) => parseFloat(value), + useLocalTime + ) + } const baseChartData = Array.from(timeSeriesMap.values()).sort((a, b) => a.timestamp - b.timestamp) return addTimeRangePadding(baseChartData, startTimestamp, endTimestamp, useLocalTime) - }, [metricsResponseSize, metricsRequestSize, useLocalTime, startTimestamp, endTimestamp]) + }, [ + metricsResponseSize, + metricsRequestSize, + metricsEnvoyResponseSize, + metricsEnvoyRequestSize, + useLocalTime, + startTimestamp, + endTimestamp, + ]) - const isLoadingMetrics = isLoadingMetricsResponseSize || isLoadingMetricsRequestSize + const isLoadingMetrics = + isLoadingMetricsResponseSize || + isLoadingMetricsRequestSize || + isLoadingMetricsEnvoyResponseSize || + isLoadingMetricsEnvoyRequestSize return ( 0 ? handleResetLegend : undefined} > + {/* NGINX: Lines for nginx metrics (to remove when migrating to envoy) */} 0 && !legendSelectedKeys.has('Response size') ? true : false} + hide={legendSelectedKeys.size > 0 && !legendSelectedKeys.has('Response size (nginx)')} /> 0 && !legendSelectedKeys.has('Request size') ? true : false} + hide={legendSelectedKeys.size > 0 && !legendSelectedKeys.has('Request size (nginx)')} + /> + {/* ENVOY: Lines for envoy metrics */} + 0 && !legendSelectedKeys.has('Response size (envoy)')} + /> + 0 && !legendSelectedKeys.has('Request size (envoy)')} /> {!isLoadingMetrics && chartData.length > 0 && ( } + content={(props) => { + const nginxSeries = ['Request size (nginx)', 'Response size (nginx)'] + const envoySeries = ['Request size (envoy)', 'Response size (envoy)'] + + return ( +
+ {props.payload?.some((item) => nginxSeries.includes(item.dataKey as string)) && ( + nginxSeries.includes(item.dataKey as string))} + /> + )} + {props.payload?.some((item) => envoySeries.includes(item.dataKey as string)) && ( + envoySeries.includes(item.dataKey as string))} + /> + )} +
+ ) + }} /> )}
diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-status-chart/network-request-status-chart.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-status-chart/network-request-status-chart.tsx index 1032c44dda3..fe75a87704d 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-status-chart/network-request-status-chart.tsx +++ b/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-status-chart/network-request-status-chart.tsx @@ -8,18 +8,26 @@ import { addTimeRangePadding } from '../../../util-chart/add-time-range-padding' import { processMetricsData } from '../../../util-chart/process-metrics-data' import { useDashboardContext } from '../../../util-filter/dashboard-context' +// NGINX: Query for nginx metrics (to remove when migrating to envoy) const query = (ingressName: string) => ` sum by(path,status)(nginx:req_rate:5m_by_path_status{ingress="${ingressName}"}) > 0 ` +// ENVOY: Query for envoy metrics +const queryEnvoy = (httpRouteName: string) => ` + sum by(envoy_response_code)(envoy_proxy:req_rate:5m_by_status{httproute_name="${httpRouteName}"}) > 0 +` + export function NetworkRequestStatusChart({ clusterId, serviceId, ingressName, + httpRouteName, }: { clusterId: string serviceId: string ingressName: string + httpRouteName: string }) { const { startTimestamp, endTimestamp, useLocalTime, timeRange } = useDashboardContext() @@ -41,6 +49,7 @@ export function NetworkRequestStatusChart({ setLegendSelectedKeys(new Set()) } + // NGINX: Fetch nginx metrics (to remove when migrating to envoy) const { data: metrics, isLoading: isLoadingMetrics } = useMetrics({ clusterId, startTimestamp, @@ -51,8 +60,20 @@ export function NetworkRequestStatusChart({ metricShortName: 'network_req_status', }) + // ENVOY: Fetch envoy metrics + const { data: metricsEnvoy, isLoading: isLoadingMetricsEnvoy } = useMetrics({ + clusterId, + startTimestamp, + endTimestamp, + timeRange, + query: queryEnvoy(httpRouteName), + boardShortName: 'service_overview', + metricShortName: 'envoy_req_status', + }) + const chartData = useMemo(() => { - if (!metrics?.data?.result) { + // Check if we have data from either source + if (!metrics?.data?.result && !metricsEnvoy?.data?.result) { return [] } @@ -61,31 +82,66 @@ export function NetworkRequestStatusChart({ { timestamp: number; time: string; fullTime: string; [key: string]: string | number | null } >() - // Process network request metrics - processMetricsData( - metrics, - timeSeriesMap, - (_, index) => JSON.stringify(metrics.data.result[index].metric), - (value) => parseFloat(value), - useLocalTime - ) + // NGINX: Process nginx metrics (to remove when migrating to envoy) + if (metrics?.data?.result) { + processMetricsData( + metrics, + timeSeriesMap, + (_, index) => JSON.stringify({ ...metrics.data.result[index].metric, source: 'nginx' }), + (value) => parseFloat(value), + useLocalTime + ) + } + + // ENVOY: Process envoy metrics + if (metricsEnvoy?.data?.result) { + processMetricsData( + metricsEnvoy, + timeSeriesMap, + (_, index) => JSON.stringify({ ...metricsEnvoy.data.result[index].metric, source: 'envoy' }), + (value) => parseFloat(value), + useLocalTime + ) + } const baseChartData = Array.from(timeSeriesMap.values()).sort((a, b) => a.timestamp - b.timestamp) return addTimeRangePadding(baseChartData, startTimestamp, endTimestamp, useLocalTime) - }, [metrics, useLocalTime, startTimestamp, endTimestamp]) + }, [metrics, 
metricsEnvoy, useLocalTime, startTimestamp, endTimestamp]) const seriesNames = useMemo(() => { - if (!metrics?.data?.result) return [] - return metrics.data.result.map((_: unknown, index: number) => - JSON.stringify(metrics.data.result[index].metric) - ) as string[] - }, [metrics]) + const names: string[] = [] + + // NGINX: Extract nginx series names (to remove when migrating to envoy) + if (metrics?.data?.result) { + names.push( + ...metrics.data.result.map((_: unknown, index: number) => + JSON.stringify({ ...metrics.data.result[index].metric, source: 'nginx' }) + ) + ) + } + + // ENVOY: Extract envoy series names + if (metricsEnvoy?.data?.result) { + names.push( + ...metricsEnvoy.data.result + .filter((result: any) => { + const code = result.metric?.envoy_response_code + return code !== 'undefined' && code !== undefined && code !== '' + }) + .map((result: any) => JSON.stringify({ ...result.metric, source: 'envoy' })) + ) + } + + return names + }, [metrics, metricsEnvoy]) + + const isLoading = isLoadingMetrics || isLoadingMetricsEnvoy return ( 0 && !legendSelectedKeys.has(name) ? true : false} + hide={legendSelectedKeys.size > 0 && !legendSelectedKeys.has(name)} /> ))} - {!isLoadingMetrics && chartData.length > 0 && ( + {!isLoading && chartData.length > 0 && ( ( - { - const { path, status } = JSON.parse(value) - return `path: "${path}" status: "${status}"` - }} - {...props} - /> - )} + content={(props) => { + // Group series by source + const nginxSeries = seriesNames.filter((name) => { + try { + const metric = JSON.parse(name) + return metric.source === 'nginx' + } catch { + return false + } + }) + + const envoySeries = seriesNames.filter((name) => { + try { + const metric = JSON.parse(name) + return metric.source === 'envoy' + } catch { + return false + } + }) + + const formatter = (value: string) => { + const metric = JSON.parse(value) + const { source } = metric + + if (source === 'nginx') { + const { path, status } = metric + return `path: "${path}" status: "${status}" (nginx)` + } else { + const { envoy_response_code } = metric + return `status: "${envoy_response_code}" (envoy)` + } + } + + return ( +
+ {nginxSeries.length > 0 && ( + nginxSeries.includes(item.dataKey as string))} + /> + )} + {envoySeries.length > 0 && ( + envoySeries.includes(item.dataKey as string))} + /> + )} +
+ ) + }} /> )}
diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/service-dashboard.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/service-dashboard.tsx index 93ab304933a..d108791b90d 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/service-dashboard.tsx +++ b/libs/domains/observability/feature/src/lib/service/service-dashboard/service-dashboard.tsx @@ -7,6 +7,7 @@ import { useService } from '@qovery/domains/services/feature' import { Button, Callout, Chart, Heading, Icon, InputSelectSmall, Section, Tooltip } from '@qovery/shared/ui' import { useContainerName } from '../../hooks/use-container-name/use-container-name' import { useEnvironment } from '../../hooks/use-environment/use-environment' +import { useHttpRouteName } from '../../hooks/use-http-route-name/use-http-route-name' import { useIngressName } from '../../hooks/use-ingress-name/use-ingress-name' import { useNamespace } from '../../hooks/use-namespace/use-namespace' import { usePodNames } from '../../hooks/use-pod-names/use-pod-names' @@ -105,6 +106,14 @@ function ServiceDashboardContent() { endDate: now.toISOString(), }) + const { data: httpRouteName = '' } = useHttpRouteName({ + clusterId: environment?.cluster_id ?? '', + serviceId: serviceId, + enabled: hasPublicPort, + startDate: oneHourAgo.toISOString(), + endDate: now.toISOString(), + }) + if ((!containerName && isFetchedContainerName) || (!namespace && isFetchedNamespace)) { return (
@@ -231,6 +240,7 @@ function ServiceDashboardContent() { serviceId={serviceId} containerName={containerName} ingressName={ingressName} + httpRouteName={httpRouteName} /> )} {hasOnlyPrivatePorts && ( @@ -242,7 +252,12 @@ function ServiceDashboardContent() { )} {hasStorage && } {hasPublicPort && ( - + )} {hasOnlyPrivatePorts && (
@@ -301,6 +317,7 @@ function ServiceDashboardContent() { clusterId={environment.cluster_id} serviceId={serviceId} ingressName={ingressName} + httpRouteName={httpRouteName} />
@@ -308,6 +325,7 @@ function ServiceDashboardContent() { clusterId={environment.cluster_id} serviceId={serviceId} ingressName={ingressName} + httpRouteName={httpRouteName} />
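To summarize how the identifiers gathered in this patch feed the HTTP alert queries: the creation flow uses the combined nginx+envoy query when both the ingress name and the HTTPRoute name resolve, falls back to the nginx-only query when only the ingress is known, and yields an empty query (alert not created) otherwise. A minimal sketch under those assumptions; pickHttpErrorQuery and the abbreviated query builders below are illustrative, not repo code.

// Abbreviated placeholders, not the full PromQL from alert-queries.ts.
const QUERY_HTTP_ERROR = (ingressName: string): string => `nginx 5xx ratio for ingress "${ingressName}"`
const QUERY_HTTP_ERROR_COMBINED = (ingressName: string, httpRouteName: string): string =>
  `combined nginx "${ingressName}" + envoy "${httpRouteName}" 5xx ratio`

// Hypothetical helper mirroring the match() branches in alerting-creation-flow.tsx above.
function pickHttpErrorQuery(ingressName?: string, httpRouteName?: string): string {
  if (ingressName && httpRouteName) return QUERY_HTTP_ERROR_COMBINED(ingressName, httpRouteName)
  if (ingressName) return QUERY_HTTP_ERROR(ingressName)
  return '' // neither source resolved: the HTTP alert is skipped
}

console.log(pickHttpErrorQuery('my-ingress', 'my-route')) // combined query
console.log(pickHttpErrorQuery('my-ingress')) // nginx-only query
console.log(pickHttpErrorQuery()) // ''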
From 58e4086ff260f97e5c2f93899491988bd075f44f Mon Sep 17 00:00:00 2001 From: Pierre Gerbelot Date: Tue, 3 Feb 2026 16:24:45 +0100 Subject: [PATCH 2/4] feat(alert): adapt http alert to envoy --- .../summary-step/alert-queries.ts | 74 ++++++++++--------- .../summary-step/summary-step.tsx | 20 ++--- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts index 15e1c213592..a56b1a96260 100644 --- a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts @@ -42,26 +42,30 @@ export const QUERY_HTTP_ERROR_ENVOY = (httpRouteName: string) => ` ) )` -// Combined nginx + envoy HTTP error rate (aggregates both sources) +// Combined nginx + envoy HTTP error rate (takes max of both sources to detect worst case) export const QUERY_HTTP_ERROR_COMBINED = (ingressName: string, httpRouteName: string) => ` -( - sum( - rate(nginx_ingress_controller_requests{ingress="${ingressName}", status=~"5.."}[1m]) - ) - + - sum( - rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", httproute_name="${httpRouteName}", envoy_response_code_class="5"}[1m]) - ) -) -/ -( - sum( - rate(nginx_ingress_controller_requests{ingress="${ingressName}"}[1m]) - ) - + - sum( - rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", httproute_name="${httpRouteName}"}[1m]) - ) +max( + # NGINX error rate + ( + sum by (namespace) ( + rate(nginx_ingress_controller_requests{ingress="${ingressName}", status=~"5.."}[1m]) + ) + ) / ( + sum by (namespace) ( + rate(nginx_ingress_controller_requests{ingress="${ingressName}"}[1m]) + ) + ) or vector(0), + + # ENVOY error rate + ( + sum by (namespace) ( + rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", httproute_name="${httpRouteName}", envoy_response_code_class="5"}[1m]) + ) + ) / ( + sum by (namespace) ( + rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~".*", httproute_name="${httpRouteName}"}[1m]) + ) + ) or vector(0) )` // NGINX: Query for nginx HTTP latency p99 (to remove when migrating to envoy) @@ -91,31 +95,35 @@ histogram_quantile( ) ) / 1000` -// Combined nginx + envoy HTTP latency p99 (takes max of both sources) +// Combined nginx + envoy HTTP latency p99 (takes max of both sources to detect worst case) export const QUERY_HTTP_LATENCY_COMBINED = (ingressName: string, httpRouteName: string) => ` max( + # NGINX p99 latency (in seconds) histogram_quantile( 0.99, - sum by (namespace, ingress, le) ( + sum by (namespace, le) ( rate( nginx_ingress_controller_request_duration_seconds_bucket{ ingress="${ingressName}" }[1m] ) ) - ) - or - histogram_quantile( - 0.99, - sum by (namespace, httproute_name, le) ( - rate( - envoy_cluster_upstream_rq_time_bucket{ - envoy_cluster_name=~".*", - httproute_name="${httpRouteName}" - }[1m] + ) or vector(0), + + # ENVOY p99 latency (convert ms to seconds) + ( + histogram_quantile( + 0.99, + sum by (namespace, le) ( + rate( + envoy_cluster_upstream_rq_time_bucket{ + envoy_cluster_name=~".*", + httproute_name="${httpRouteName}" + }[1m] + ) ) - ) - ) / 1000 + ) / 1000 + ) or vector(0) )` export const QUERY_INSTANCE_RESTART = (containerName: string) => ` diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx 
b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx index d35a6637d54..3d59a6084b7 100644 --- a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.tsx @@ -12,9 +12,7 @@ import { type AlertConfiguration } from '../alerting-creation-flow.types' import { ALERTING_CREATION_EDIT, ALERTING_CREATION_METRIC } from '../router' import { QUERY_CPU, - QUERY_HTTP_ERROR, QUERY_HTTP_ERROR_COMBINED, - QUERY_HTTP_LATENCY, QUERY_HTTP_LATENCY_COMBINED, QUERY_INSTANCE_RESTART, QUERY_MEMORY, @@ -197,22 +195,16 @@ export function SummaryStep() { .with('missing_instance', () => QUERY_MISSING_INSTANCE(containerName)) .with('instance_restart', () => QUERY_INSTANCE_RESTART(containerName)) .with('http_error', () => { - // Use combined query if both sources available, otherwise fallback to single source - if (ingressName && httpRouteName) { - return QUERY_HTTP_ERROR_COMBINED(ingressName, httpRouteName) - } - if (ingressName) { - return QUERY_HTTP_ERROR(ingressName) + // Use combined query - works for nginx-only, envoy-only, or both + if (ingressName || httpRouteName) { + return QUERY_HTTP_ERROR_COMBINED(ingressName || '', httpRouteName || '') } return '' }) .with('http_latency', () => { - // Use combined query if both sources available, otherwise fallback to single source - if (ingressName && httpRouteName) { - return QUERY_HTTP_LATENCY_COMBINED(ingressName, httpRouteName) - } - if (ingressName) { - return QUERY_HTTP_LATENCY(ingressName) + // Use combined query - works for nginx-only, envoy-only, or both + if (ingressName || httpRouteName) { + return QUERY_HTTP_LATENCY_COMBINED(ingressName || '', httpRouteName || '') } return '' }) From 3cad55dbb655b7e8256e9e604719c929db56ee97 Mon Sep 17 00:00:00 2001 From: Pierre Gerbelot Date: Tue, 3 Feb 2026 19:16:35 +0100 Subject: [PATCH 3/4] add test --- .../summary-step/alert-queries.spec.ts | 392 ++++++++++++++++++ .../summary-step/alert-queries.ts | 6 +- .../summary-step/summary-step.spec.tsx | 52 +++ 3 files changed, 446 insertions(+), 4 deletions(-) create mode 100644 libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.spec.ts diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.spec.ts b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.spec.ts new file mode 100644 index 00000000000..a649be18d95 --- /dev/null +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.spec.ts @@ -0,0 +1,392 @@ +import { + QUERY_CPU, + QUERY_HTTP_ERROR, + QUERY_HTTP_ERROR_COMBINED, + QUERY_HTTP_ERROR_ENVOY, + QUERY_HTTP_LATENCY, + QUERY_HTTP_LATENCY_COMBINED, + QUERY_HTTP_LATENCY_ENVOY, + QUERY_INSTANCE_RESTART, + QUERY_MEMORY, + QUERY_MISSING_INSTANCE, +} from './alert-queries' + +describe('QUERY_CPU', () => { + it('should generate CPU query with correct metric and labels', () => { + const query = QUERY_CPU('app-container') + + expect(query).toContain('container_cpu_usage_seconds_total') + expect(query).toContain('container="app-container"') + expect(query).toContain('kube_pod_container_resource_requests') + expect(query).toContain('resource="cpu"') + expect(query).toContain('rate(') + expect(query).toContain('[1m]') + }) + + it('should calculate CPU as ratio of usage 
to requests', () => { + const query = QUERY_CPU('app-container') + + expect(query).toContain('/') + expect(query).toContain('on(pod, namespace, container)') + }) +}) + +describe('QUERY_MEMORY', () => { + it('should generate memory query with correct metric and labels', () => { + const query = QUERY_MEMORY('app-container') + + expect(query).toContain('container_memory_working_set_bytes') + expect(query).toContain('container="app-container"') + expect(query).toContain('kube_pod_container_resource_requests') + expect(query).toContain('resource="memory"') + }) + + it('should calculate memory as ratio of usage to requests', () => { + const query = QUERY_MEMORY('app-container') + + expect(query).toContain('/') + expect(query).toContain('on(pod, namespace, container)') + }) +}) + +describe('QUERY_HTTP_ERROR', () => { + it('should generate nginx error rate query with correct metric', () => { + const query = QUERY_HTTP_ERROR('api-ingress') + + expect(query).toContain('nginx_ingress_controller_requests') + expect(query).toContain('ingress="api-ingress"') + expect(query).toContain('status=~"5.."') + expect(query).toContain('rate(') + expect(query).toContain('[1m]') + }) + + it('should aggregate by ingress and namespace', () => { + const query = QUERY_HTTP_ERROR('api-ingress') + + expect(query).toContain('sum by (ingress, namespace)') + }) + + it('should calculate error rate as ratio (errors / total)', () => { + const query = QUERY_HTTP_ERROR('api-ingress') + + expect(query).toContain('/') + const parts = query.split('/') + expect(parts.length).toBeGreaterThan(1) + }) +}) + +describe('QUERY_HTTP_ERROR_ENVOY', () => { + it('should generate envoy error rate query with correct metric', () => { + const query = QUERY_HTTP_ERROR_ENVOY('api-route') + + expect(query).toContain('envoy_cluster_upstream_rq_xx') + expect(query).toContain('httproute_name="api-route"') + expect(query).toContain('envoy_response_code_class="5"') + expect(query).toContain('rate(') + expect(query).toContain('[1m]') + }) + + it('should aggregate by httproute_name and namespace', () => { + const query = QUERY_HTTP_ERROR_ENVOY('api-route') + + expect(query).toContain('sum by (httproute_name, namespace)') + }) + + it('should calculate error rate as ratio', () => { + const query = QUERY_HTTP_ERROR_ENVOY('api-route') + + expect(query).toContain('/') + }) +}) + +describe('QUERY_HTTP_ERROR_COMBINED', () => { + const testIngressName = 'api-ingress' + const testRouteName = 'api-route' + + it('should use max() to take worst case between sources', () => { + const query = QUERY_HTTP_ERROR_COMBINED(testIngressName, testRouteName) + + expect(query).toMatch(/^max\(/) + }) + + it('should include both nginx and envoy metrics', () => { + const query = QUERY_HTTP_ERROR_COMBINED(testIngressName, testRouteName) + + expect(query).toContain('nginx_ingress_controller_requests') + expect(query).toContain('envoy_cluster_upstream_rq_xx') + expect(query).toContain(`ingress="${testIngressName}"`) + expect(query).toContain(`httproute_name="${testRouteName}"`) + }) + + it('should use or vector(0) for both sources', () => { + const query = QUERY_HTTP_ERROR_COMBINED(testIngressName, testRouteName) + + const matches = query.match(/or vector\(0\)/g) + expect(matches).toHaveLength(2) + }) + + it('should handle nginx-only case with empty envoy identifier', () => { + const query = QUERY_HTTP_ERROR_COMBINED(testIngressName, '') + + expect(query).toContain(`ingress="${testIngressName}"`) + expect(query).toContain('httproute_name=""') + expect(query).toContain('or vector(0)') 
+ }) + + it('should handle envoy-only case with empty nginx identifier', () => { + const query = QUERY_HTTP_ERROR_COMBINED('', testRouteName) + + expect(query).toContain('ingress=""') + expect(query).toContain(`httproute_name="${testRouteName}"`) + expect(query).toContain('or vector(0)') + }) + + it('should aggregate by namespace for both sources', () => { + const query = QUERY_HTTP_ERROR_COMBINED(testIngressName, testRouteName) + + expect(query).toContain('sum by (namespace)') + }) + + it('should filter 5xx errors for both sources', () => { + const query = QUERY_HTTP_ERROR_COMBINED(testIngressName, testRouteName) + + expect(query).toContain('status=~"5.."') + expect(query).toContain('envoy_response_code_class="5"') + }) +}) + +describe('QUERY_HTTP_LATENCY', () => { + it('should calculate p99 latency using histogram_quantile', () => { + const query = QUERY_HTTP_LATENCY('api-ingress') + + expect(query).toContain('histogram_quantile') + expect(query).toContain('0.99') + }) + + it('should use nginx request duration bucket metric', () => { + const query = QUERY_HTTP_LATENCY('api-ingress') + + expect(query).toContain('nginx_ingress_controller_request_duration_seconds_bucket') + expect(query).toContain('ingress="api-ingress"') + }) + + it('should aggregate by namespace, ingress, and le', () => { + const query = QUERY_HTTP_LATENCY('api-ingress') + + expect(query).toContain('sum by (namespace, ingress, le)') + }) + + it('should use 1m rate window', () => { + const query = QUERY_HTTP_LATENCY('api-ingress') + + expect(query).toContain('rate(') + expect(query).toContain('[1m]') + }) +}) + +describe('QUERY_HTTP_LATENCY_ENVOY', () => { + it('should calculate p99 latency using histogram_quantile', () => { + const query = QUERY_HTTP_LATENCY_ENVOY('api-route') + + expect(query).toContain('histogram_quantile') + expect(query).toContain('0.99') + }) + + it('should use envoy request time bucket metric', () => { + const query = QUERY_HTTP_LATENCY_ENVOY('api-route') + + expect(query).toContain('envoy_cluster_upstream_rq_time_bucket') + expect(query).toContain('httproute_name="api-route"') + }) + + it('should convert milliseconds to seconds', () => { + const query = QUERY_HTTP_LATENCY_ENVOY('api-route') + + expect(query).toContain('/ 1000') + }) + + it('should aggregate by namespace, httproute_name, and le', () => { + const query = QUERY_HTTP_LATENCY_ENVOY('api-route') + + expect(query).toContain('sum by (namespace, httproute_name, le)') + }) +}) + +describe('QUERY_HTTP_LATENCY_COMBINED', () => { + const testIngressName = 'api-ingress' + const testRouteName = 'api-route' + + it('should use max() to take worst case between sources', () => { + const query = QUERY_HTTP_LATENCY_COMBINED(testIngressName, testRouteName) + + expect(query).toMatch(/^max\(/) + }) + + it('should include both nginx and envoy latency metrics', () => { + const query = QUERY_HTTP_LATENCY_COMBINED(testIngressName, testRouteName) + + expect(query).toContain('nginx_ingress_controller_request_duration_seconds_bucket') + expect(query).toContain('envoy_cluster_upstream_rq_time_bucket') + expect(query).toContain(`ingress="${testIngressName}"`) + expect(query).toContain(`httproute_name="${testRouteName}"`) + }) + + it('should calculate p99 for both sources', () => { + const query = QUERY_HTTP_LATENCY_COMBINED(testIngressName, testRouteName) + + const matches = query.match(/histogram_quantile\s*\(\s*0\.99/g) + expect(matches).toHaveLength(2) + }) + + it('should use or vector(0) for both sources', () => { + const query = 
QUERY_HTTP_LATENCY_COMBINED(testIngressName, testRouteName) + + const matches = query.match(/or vector\(0\)/g) + expect(matches).toHaveLength(2) + }) + + it('should convert envoy latency from milliseconds to seconds', () => { + const query = QUERY_HTTP_LATENCY_COMBINED(testIngressName, testRouteName) + + expect(query).toContain('/ 1000') + }) + + it('should handle nginx-only case', () => { + const query = QUERY_HTTP_LATENCY_COMBINED(testIngressName, '') + + expect(query).toContain(`ingress="${testIngressName}"`) + expect(query).toContain('httproute_name=""') + }) + + it('should handle envoy-only case', () => { + const query = QUERY_HTTP_LATENCY_COMBINED('', testRouteName) + + expect(query).toContain('ingress=""') + expect(query).toContain(`httproute_name="${testRouteName}"`) + }) + + it('should aggregate by namespace and le for histogram', () => { + const query = QUERY_HTTP_LATENCY_COMBINED(testIngressName, testRouteName) + + expect(query).toContain('sum by (namespace, le)') + }) +}) + +describe('QUERY_INSTANCE_RESTART', () => { + it('should track container restarts using increase', () => { + const query = QUERY_INSTANCE_RESTART('app-container') + + expect(query).toContain('kube_pod_container_status_restarts_total') + expect(query).toContain('container="app-container"') + expect(query).toContain('increase(') + }) +}) + +describe('QUERY_MISSING_INSTANCE', () => { + it('should check deployment replicas ratio', () => { + const query = QUERY_MISSING_INSTANCE('app-container') + + expect(query).toContain('kube_deployment_status_replicas_available') + expect(query).toContain('kube_deployment_spec_replicas') + expect(query).toContain('deployment="app-container"') + }) +}) + +describe('HTTP Query Consistency', () => { + it('should use consistent 1m rate window across all HTTP queries', () => { + const httpQueries = [ + QUERY_HTTP_ERROR('api-ingress'), + QUERY_HTTP_ERROR_ENVOY('api-route'), + QUERY_HTTP_ERROR_COMBINED('api-ingress', 'api-route'), + QUERY_HTTP_LATENCY('api-ingress'), + QUERY_HTTP_LATENCY_ENVOY('api-route'), + QUERY_HTTP_LATENCY_COMBINED('api-ingress', 'api-route'), + ] + + httpQueries.forEach((query) => { + expect(query).toContain('[1m]') + }) + }) + + it('should use max() aggregation for all combined queries', () => { + const combinedQueries = [ + QUERY_HTTP_ERROR_COMBINED('api-ingress', 'api-route'), + QUERY_HTTP_LATENCY_COMBINED('api-ingress', 'api-route'), + ] + + combinedQueries.forEach((query) => { + expect(query).toMatch(/^max\(/) + }) + }) + + it('should include or vector(0) for resilience in combined queries', () => { + const combinedQueries = [ + QUERY_HTTP_ERROR_COMBINED('api-ingress', 'api-route'), + QUERY_HTTP_LATENCY_COMBINED('api-ingress', 'api-route'), + ] + + combinedQueries.forEach((query) => { + const matches = query.match(/or vector\(0\)/g) + expect(matches).toHaveLength(2) + }) + }) +}) + +describe('Special Characters in Identifiers', () => { + it('should handle ingress names with hyphens and dots', () => { + const query = QUERY_HTTP_ERROR('my-app-v2.0') + + expect(query).toContain('ingress="my-app-v2.0"') + }) + + it('should handle route names with underscores and hyphens', () => { + const query = QUERY_HTTP_ERROR_ENVOY('my-route_v1.0-beta') + + expect(query).toContain('httproute_name="my-route_v1.0-beta"') + }) + + it('should handle container names with version numbers', () => { + const query = QUERY_CPU('app-container-v2.0') + + expect(query).toContain('container="app-container-v2.0"') + }) +}) + +describe('Query Structure Validation', () => { + it('should 
calculate error rates as division (errors / total)', () => { + const errorQueries = [ + QUERY_HTTP_ERROR('api-ingress'), + QUERY_HTTP_ERROR_ENVOY('api-route'), + QUERY_HTTP_ERROR_COMBINED('api-ingress', 'api-route'), + ] + + errorQueries.forEach((query) => { + expect(query).toContain('/') + const parts = query.split('/') + expect(parts.length).toBeGreaterThan(1) + }) + }) + + it('should use histogram_quantile for all latency queries', () => { + const latencyQueries = [ + QUERY_HTTP_LATENCY('api-ingress'), + QUERY_HTTP_LATENCY_ENVOY('api-route'), + QUERY_HTTP_LATENCY_COMBINED('api-ingress', 'api-route'), + ] + + latencyQueries.forEach((query) => { + expect(query).toContain('histogram_quantile') + expect(query).toContain('0.99') + }) + }) + + it('should calculate resource usage as ratio to requests', () => { + const resourceQueries = [QUERY_CPU('app-container'), QUERY_MEMORY('app-container')] + + resourceQueries.forEach((query) => { + expect(query).toContain('/') + expect(query).toContain('kube_pod_container_resource_requests') + expect(query).toContain('on(pod, namespace, container)') + }) + }) +}) diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts index a56b1a96260..cc630c886e2 100644 --- a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/alert-queries.ts @@ -43,8 +43,7 @@ export const QUERY_HTTP_ERROR_ENVOY = (httpRouteName: string) => ` )` // Combined nginx + envoy HTTP error rate (takes max of both sources to detect worst case) -export const QUERY_HTTP_ERROR_COMBINED = (ingressName: string, httpRouteName: string) => ` -max( +export const QUERY_HTTP_ERROR_COMBINED = (ingressName: string, httpRouteName: string) => `max( # NGINX error rate ( sum by (namespace) ( @@ -96,8 +95,7 @@ histogram_quantile( ) / 1000` // Combined nginx + envoy HTTP latency p99 (takes max of both sources to detect worst case) -export const QUERY_HTTP_LATENCY_COMBINED = (ingressName: string, httpRouteName: string) => ` -max( +export const QUERY_HTTP_LATENCY_COMBINED = (ingressName: string, httpRouteName: string) => `max( # NGINX p99 latency (in seconds) histogram_quantile( 0.99, diff --git a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.spec.tsx b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.spec.tsx index d655563a068..34123873495 100644 --- a/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.spec.tsx +++ b/libs/domains/observability/feature/src/lib/alerting/alerting-creation-flow/summary-step/summary-step.spec.tsx @@ -138,4 +138,56 @@ describe('SummaryStep', () => { expect(screen.getByRole('button', { name: /confirm and create/i })).toBeEnabled() }) + + describe('HTTP Alert Query Selection', () => { + // Mock hooks for testing query selection logic + beforeEach(() => { + jest.clearAllMocks() + }) + + it('should use COMBINED query when both nginx and envoy are available', () => { + jest.mock('../../../hooks/use-ingress-name/use-ingress-name', () => ({ + useIngressName: () => ({ data: 'my-ingress' }), + })) + + jest.mock('../../../hooks/use-http-route-name/use-http-route-name', () => ({ + useHttpRouteName: () => ({ data: 'my-route' }), + })) 
+ + const httpErrorAlert = createAlert({ + tag: 'http_error', + name: 'HTTP Error Alert', + }) + + renderWithContext([httpErrorAlert], ['http_error']) + + expect(screen.getByText('HTTP Error Alert')).toBeInTheDocument() + }) + + it('should handle http_latency alerts', () => { + const httpLatencyAlert = createAlert({ + tag: 'http_latency', + name: 'HTTP Latency Alert', + }) + + renderWithContext([httpLatencyAlert], ['http_latency']) + + expect(screen.getByText('HTTP Latency Alert')).toBeInTheDocument() + }) + + it('should handle multiple alert types including HTTP', () => { + const alerts = [ + createAlert({ id: 'alert-1', name: 'CPU Alert', tag: 'cpu' }), + createAlert({ id: 'alert-2', name: 'HTTP Error Alert', tag: 'http_error' }), + createAlert({ id: 'alert-3', name: 'HTTP Latency Alert', tag: 'http_latency' }), + ] + + renderWithContext(alerts, ['cpu', 'http_error', 'http_latency']) + + expect(screen.getByText('Alerts included in creation (3)')).toBeInTheDocument() + expect(screen.getByText('CPU Alert')).toBeInTheDocument() + expect(screen.getByText('HTTP Error Alert')).toBeInTheDocument() + expect(screen.getByText('HTTP Latency Alert')).toBeInTheDocument() + }) + }) }) From cabe48fce0f00e44bf5351b8e645cf38efd3c8dd Mon Sep 17 00:00:00 2001 From: Pierre Gerbelot Date: Tue, 3 Feb 2026 19:46:16 +0100 Subject: [PATCH 4/4] remove negative padding --- .../network-request-duration-chart.tsx | 2 +- .../network-request-size-chart/network-request-size-chart.tsx | 2 +- .../network-request-status-chart.tsx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-duration-chart/network-request-duration-chart.tsx b/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-duration-chart/network-request-duration-chart.tsx index 87755cb8d71..89b923c461b 100644 --- a/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-duration-chart/network-request-duration-chart.tsx +++ b/libs/domains/observability/feature/src/lib/service/service-dashboard/network-request-duration-chart/network-request-duration-chart.tsx @@ -313,7 +313,7 @@ export function NetworkRequestDurationChart({ const envoySeries = ['50th percentile (envoy)', '95th percentile (envoy)', '99th percentile (envoy)'] return ( -
+      <div …>
         {props.payload?.some((item) => nginxSeries.includes(item.dataKey as string)) && (

[Truncated in this extract: the matching one-line <div> wrapper changes of PATCH 4/4 in network-request-size-chart.tsx and network-request-status-chart.tsx. Only their trailing context lines survive:]
         {props.payload?.some((item) => nginxSeries.includes(item.dataKey as string)) && (
         {nginxSeries.length > 0 && (
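A note on the combined expressions introduced in PATCH 2/4: in stock Prometheus (and in Mimir, which reuses the same PromQL engine) max(...) is an aggregation operator that takes a single instant vector, so a strict parser rejects the two-argument form max(nginxExpr, envoyExpr). If the alerting backend is a plain PromQL engine, the same worst-of-both-sources idea can be written as a union of the two ratios, each tagged with a distinguishing label so that `or` keeps both series. The sketch below is written under that assumption, reusing the metric and label names from alert-queries.ts; it is not part of the patch and the constant name is hypothetical.

// Hypothetical strict-PromQL variant of QUERY_HTTP_ERROR_COMBINED: label_replace
// gives each source's error ratio a distinct `source` label so `or` unions them,
// and the outer max() then picks the worst of the two. `or vector(0)` still
// guards against a source that exposes no series at all.
export const QUERY_HTTP_ERROR_COMBINED_STRICT = (ingressName: string, httpRouteName: string) => `max(
  label_replace(
    (
      sum(rate(nginx_ingress_controller_requests{ingress="${ingressName}", status=~"5.."}[1m]))
      /
      sum(rate(nginx_ingress_controller_requests{ingress="${ingressName}"}[1m]))
    ) or vector(0),
    "source", "nginx", "", ""
  )
  or
  label_replace(
    (
      sum(rate(envoy_cluster_upstream_rq_xx{httproute_name="${httpRouteName}", envoy_response_code_class="5"}[1m]))
      /
      sum(rate(envoy_cluster_upstream_rq_xx{httproute_name="${httpRouteName}"}[1m]))
    ) or vector(0),
    "source", "envoy", "", ""
  )
)`

The latency variant would follow the same pattern, with the envoy histogram_quantile branch divided by 1000 before the union so both branches stay in seconds.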
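PATCH 2/4 also collapses the per-source branches in summary-step.tsx into a single call that passes an empty string for whichever source is missing: an empty matcher value ({ingress=""} or {httproute_name=""}) simply selects no series, and the vector(0) fallback keeps the other branch meaningful. Isolating that rule as a pure helper would make it trivially unit-testable; the helper below is a hypothetical sketch (its name and signature are not in the patch) that only reuses the patch's own exports.

import { QUERY_HTTP_ERROR_COMBINED, QUERY_HTTP_LATENCY_COMBINED } from './alert-queries'

// Hypothetical extraction of the match() branches shared by summary-step.tsx and
// alerting-creation-flow.tsx: HTTP alerts always go through the combined query,
// and a missing source is encoded as an empty identifier.
export function buildHttpAlertQuery(
  tag: 'http_error' | 'http_latency',
  ingressName?: string,
  httpRouteName?: string
): string {
  // Neither an nginx ingress nor an envoy HTTPRoute was found: there is no public
  // HTTP traffic to alert on, so return an empty query and let the caller skip the rule.
  if (!ingressName && !httpRouteName) return ''

  return tag === 'http_error'
    ? QUERY_HTTP_ERROR_COMBINED(ingressName ?? '', httpRouteName ?? '')
    : QUERY_HTTP_LATENCY_COMBINED(ingressName ?? '', httpRouteName ?? '')
}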
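The "Special Characters in Identifiers" tests cover dots, hyphens and underscores, which are safe because PromQL label matchers compare values literally. The identifiers are interpolated directly into double-quoted matchers, however, so a value containing a double quote or a backslash would corrupt the query string. If such values can ever reach these builders, a small guard is enough; this helper is hypothetical and not part of the patch.

// Inside a double-quoted PromQL matcher only backslashes, double quotes and
// newlines need escaping.
export const escapePromLabelValue = (value: string): string =>
  value.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n')

// Usage sketch: QUERY_HTTP_ERROR(escapePromLabelValue(ingressName))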
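One caveat on the new summary-step.spec.tsx case "should use COMBINED query when both nginx and envoy are available": jest.mock() only takes effect when it sits at module scope of the spec file, where babel-jest hoists it above the imports. Called inside the it() body, after the component has already been imported, it leaves the real hooks in place, so that test currently only asserts that the alert renders. A minimal sketch of the hoisted form, assuming the hook paths used elsewhere in this series:

// At module scope of summary-step.spec.tsx, before any test runs; babel-jest
// hoists these calls above the imports so the component under test receives
// the mocked hook values.
jest.mock('../../../hooks/use-ingress-name/use-ingress-name', () => ({
  useIngressName: () => ({ data: 'my-ingress' }),
}))
jest.mock('../../../hooks/use-http-route-name/use-http-route-name', () => ({
  useHttpRouteName: () => ({ data: 'my-route' }),
}))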