diff --git a/.gitignore b/.gitignore index 02ddfdc..7031f56 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,7 @@ __pycache__/ *.db .ruff_cache .coverage +coverage.xml .vscode/ -.hypothesis/ \ No newline at end of file +.hypothesis/ +.venv/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d19624..58b0b3d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,14 @@ repos: - id: resolver-unit-tests name: Resolver unit tests language: system - entry: bash -lc 'python -m pytest -q' + entry: >- + bash -lc 'R="$(git rev-parse --show-toplevel 2>/dev/null || echo "$PWD")"; + cd "$R" || exit 1; + P=""; for c in "$PWD/../.venv/bin/python" "$PWD/../venv/bin/python" "$PWD/.venv/bin/python" "$PWD/venv/bin/python" "$(command -v python3)" "$(command -v python)"; do + [[ -n "$c" && -x "$c" ]] || continue; + "$c" -c "import pytest" 2>/dev/null && { P="$c"; break; }; done; + [[ -n "$P" ]] || { echo "error: no Python with pytest; use .venv and pip install -e .[dev]" >&2; exit 1; }; + "$P" -m pytest -q' pass_filenames: false files: \.py$ exclude: (^|/)tests/ @@ -14,13 +21,14 @@ repos: name: Resolver mypy language: system entry: >- - bash -lc 'APP_ENV=development - ENVIRONMENT=development - RESOLVER_DATABASE_URL=postgresql://ci_user:ci_password_123@localhost:5432/resolver - RESOLVER_EXPECTED_SERVICE_TOKEN=ci_resolver_expected_service_token_123456789 - RESOLVER_CONTEXT_VERIFY_KEY=ci_resolver_context_verify_key_12345678901234567890 - PYTHONPATH=$PWD - mypy --config-file "$PWD/pyproject.toml" "$@"' -- + bash -lc 'R="$(git rev-parse --show-toplevel 2>/dev/null || echo "$PWD")"; + cd "$R" || exit 1; + P=""; for c in "$PWD/../.venv/bin/python" "$PWD/../venv/bin/python" "$PWD/.venv/bin/python" "$PWD/venv/bin/python" "$(command -v python3)" "$(command -v python)"; do + [[ -n "$c" && -x "$c" ]] || continue; + "$c" -c "import mypy, numpy, sqlalchemy, pydantic, httpx" 2>/dev/null && { P="$c"; break; }; done; + [[ -n "$P" ]] || { echo "error: no Python with mypy + resolver deps; pip install -e .[dev] in resolver/" >&2; exit 1; }; + APP_ENV=development ENVIRONMENT=development RESOLVER_DATABASE_URL=postgresql://ci_user:ci_password_123@localhost:5432/resolver RESOLVER_EXPECTED_SERVICE_TOKEN=ci_resolver_expected_service_token_123456789 RESOLVER_CONTEXT_VERIFY_KEY=ci_resolver_context_verify_key_12345678901234567890 PYTHONPATH="$PWD" "$P" -m mypy --python-executable "$P" --cache-dir /tmp/resolver-mypy-cache --config-file "$PWD/pyproject.toml" .' + pass_filenames: false files: \.py$ exclude: (^|/)tests/ types: [python] @@ -29,13 +37,14 @@ repos: name: Resolver pylint language: system entry: >- - bash -lc 'APP_ENV=development - ENVIRONMENT=development - RESOLVER_DATABASE_URL=postgresql://ci_user:ci_password_123@localhost:5432/resolver - RESOLVER_EXPECTED_SERVICE_TOKEN=ci_resolver_expected_service_token_123456789 - RESOLVER_CONTEXT_VERIFY_KEY=ci_resolver_context_verify_key_12345678901234567890 - PYTHONPATH=$PWD - pylint --rcfile "$PWD/pyproject.toml" "$@"' -- + bash -lc 'R="$(git rev-parse --show-toplevel 2>/dev/null || echo "$PWD")"; + cd "$R" || exit 1; + P=""; for c in "$PWD/../.venv/bin/python" "$PWD/../venv/bin/python" "$PWD/.venv/bin/python" "$PWD/venv/bin/python" "$(command -v python3)" "$(command -v python)"; do + [[ -n "$c" && -x "$c" ]] || continue; + "$c" -c "import pylint, numpy, sqlalchemy, pydantic, httpx" 2>/dev/null && { P="$c"; break; }; done; + [[ -n "$P" ]] || { echo "error: no Python with pylint + resolver deps; pip install -e .[dev] in resolver/" >&2; exit 1; }; + APP_ENV=development ENVIRONMENT=development RESOLVER_DATABASE_URL=postgresql://ci_user:ci_password_123@localhost:5432/resolver RESOLVER_EXPECTED_SERVICE_TOKEN=ci_resolver_expected_service_token_123456789 RESOLVER_CONTEXT_VERIFY_KEY=ci_resolver_context_verify_key_12345678901234567890 PYTHONPATH="$PWD" "$P" -m pylint --rcfile "$PWD/pyproject.toml" .' + pass_filenames: false files: \.py$ exclude: (^|/)tests/ types: [python] diff --git a/CHANGELOG.md b/CHANGELOG.md index 817456d..9a53b81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. ### Changed +- Pinned resolver runtime dependencies in `pyproject.toml` to explicit `==` versions for reproducible installs/builds. - Applied a clean pylint reformat/refactor pass across resolver with safe line-wrapping/readability updates. - Enforced strict naming consistency for module state, enum members, and internal variables/constants to align with configured pylint rules. - Removed legacy uppercase alias usage from tests and aligned analyzer compatibility exports with strict snake_case lint policy. diff --git a/api/requests/__init__.py b/api/requests/__init__.py index b699ae9..d97bc32 100644 --- a/api/requests/__init__.py +++ b/api/requests/__init__.py @@ -18,14 +18,14 @@ from .traces import TraceRequest __all__ = [ - "AnalyzeRequest", "AnalyzeJobCreateRequest", - "MetricRequest", + "AnalyzeRequest", "ChangepointRequest", + "CorrelateRequest", + "DeploymentEventRequest", "LogRequest", - "TraceRequest", + "MetricRequest", "SloRequest", - "CorrelateRequest", "TopologyRequest", - "DeploymentEventRequest", + "TraceRequest", ] diff --git a/api/requests/_time_range.py b/api/requests/_time_range.py index b21de97..82ba0cb 100644 --- a/api/requests/_time_range.py +++ b/api/requests/_time_range.py @@ -4,8 +4,6 @@ from __future__ import annotations -from typing import List, Optional - from pydantic import BaseModel, Field, model_validator @@ -14,12 +12,12 @@ class TimeRangeRequest(BaseModel): start: int end: int step: str = "15s" - services: List[str] = Field(default_factory=list) - log_query: Optional[str] = None - metric_queries: Optional[List[str]] = None + services: list[str] = Field(default_factory=list) + log_query: str | None = None + metric_queries: list[str] | None = None @model_validator(mode="after") - def validate_time_range(self) -> "TimeRangeRequest": + def validate_time_range(self) -> TimeRangeRequest: if self.start >= self.end: raise ValueError("start must be less than end") return self diff --git a/api/requests/analyze.py b/api/requests/analyze.py index a0ae33e..068c920 100644 --- a/api/requests/analyze.py +++ b/api/requests/analyze.py @@ -10,16 +10,14 @@ from __future__ import annotations -from typing import Optional - from pydantic import Field from ._time_range import TimeRangeRequest class AnalyzeRequest(TimeRangeRequest): - config_yaml: Optional[str] = None - sensitivity: Optional[float] = Field(default=3.0, ge=1.0, le=6.0) + config_yaml: str | None = None + sensitivity: float | None = Field(default=3.0, ge=1.0, le=6.0) apdex_threshold_ms: float = 500.0 slo_target: float = Field(default=0.999, ge=0.0, le=1.0) correlation_window_seconds: float = Field(default=60.0, ge=10.0, le=600.0) diff --git a/api/requests/metrics.py b/api/requests/metrics.py index 6397ed3..1ab7714 100644 --- a/api/requests/metrics.py +++ b/api/requests/metrics.py @@ -10,8 +10,6 @@ from __future__ import annotations -from typing import Optional - from pydantic import BaseModel, Field @@ -21,7 +19,7 @@ class MetricRequest(BaseModel): start: int end: int step: str = "15s" - sensitivity: Optional[float] = Field(default=3.0, ge=1.0, le=6.0) + sensitivity: float | None = Field(default=3.0, ge=1.0, le=6.0) class ChangepointRequest(BaseModel): diff --git a/api/requests/slo.py b/api/requests/slo.py index 1e3f644..df3acad 100644 --- a/api/requests/slo.py +++ b/api/requests/slo.py @@ -10,8 +10,6 @@ from __future__ import annotations -from typing import Optional - from pydantic import BaseModel, Field, model_validator @@ -22,11 +20,11 @@ class SloRequest(BaseModel): end: int step: str = "15s" target_availability: float = Field(default=0.999, ge=0.0, le=1.0) - error_query: Optional[str] = None - total_query: Optional[str] = None + error_query: str | None = None + total_query: str | None = None @model_validator(mode="after") - def validate_time_range(self) -> "SloRequest": + def validate_time_range(self) -> SloRequest: if self.start >= self.end: raise ValueError("start must be less than end") return self diff --git a/api/requests/traces.py b/api/requests/traces.py index 78a5d28..506fb59 100644 --- a/api/requests/traces.py +++ b/api/requests/traces.py @@ -10,8 +10,6 @@ from __future__ import annotations -from typing import Optional - from pydantic import BaseModel @@ -19,5 +17,5 @@ class TraceRequest(BaseModel): tenant_id: str start: int end: int - service: Optional[str] = None + service: str | None = None apdex_threshold_ms: float = 500.0 diff --git a/api/responses/__init__.py b/api/responses/__init__.py index 76cdeb3..4dfcb2c 100644 --- a/api/responses/__init__.py +++ b/api/responses/__init__.py @@ -56,26 +56,26 @@ } __all__ = [ - "NpModel", - "MetricAnomaly", - "LogBurst", - "LogPattern", - "ServiceLatency", - "ErrorPropagation", - "RootCause", - "SloBurnAlert", - "BudgetStatus", "AnalysisQuality", "AnalysisReport", - "MetricSeriesDistributionStats", - "JobStatus", "AnalyzeConfigTemplateResponse", "AnalyzeJobCreateResponse", - "AnalyzeJobSummary", "AnalyzeJobListResponse", "AnalyzeJobResultResponse", - "AnalyzeReportResponse", + "AnalyzeJobSummary", "AnalyzeReportDeleteResponse", + "AnalyzeReportResponse", + "BudgetStatus", + "ErrorPropagation", + "JobStatus", + "LogBurst", + "LogPattern", + "MetricAnomaly", + "MetricSeriesDistributionStats", + "NpModel", + "RootCause", + "ServiceLatency", + "SloBurnAlert", ] diff --git a/api/responses/analysis.py b/api/responses/analysis.py index 5b6d64f..4488ec3 100644 --- a/api/responses/analysis.py +++ b/api/responses/analysis.py @@ -10,8 +10,6 @@ from __future__ import annotations -from typing import Dict, List, Optional - from pydantic import Field from engine.causal.bayesian import BayesianScore @@ -24,12 +22,12 @@ from engine.ml.clustering import AnomalyCluster from engine.ml.ranking import RankedCause -from .base import NpModel from .anomalies import MetricAnomaly +from .base import NpModel from .logs import LogBurst, LogPattern -from .traces import ErrorPropagation, ServiceLatency -from .slo import SloBurnAlert from .rca import RootCause +from .slo import SloBurnAlert +from .traces import ErrorPropagation, ServiceLatency class MetricSeriesDistributionStats(NpModel): @@ -51,8 +49,8 @@ class MetricSeriesDistributionStats(NpModel): class AnalysisQuality(NpModel): - anomaly_density: Dict[str, float] = Field(default_factory=dict) - suppression_counts: Dict[str, int] = Field(default_factory=dict) + anomaly_density: dict[str, float] = Field(default_factory=dict) + suppression_counts: dict[str, int] = Field(default_factory=dict) gating_profile: str confidence_calibration_version: str @@ -62,23 +60,23 @@ class AnalysisReport(NpModel): start: int end: int duration_seconds: int - metric_anomalies: List[MetricAnomaly] - log_bursts: List[LogBurst] - log_patterns: List[LogPattern] - service_latency: List[ServiceLatency] - error_propagation: List[ErrorPropagation] - slo_alerts: List[SloBurnAlert] = [] - root_causes: List[RootCause] - ranked_causes: List[RankedCause] = [] - change_points: List[ChangePoint] = [] - log_metric_links: List[LogMetricLink] = [] - forecasts: List[TrajectoryForecast] = [] - degradation_signals: List[DegradationSignal] = [] - anomaly_clusters: List[AnomalyCluster] = [] - granger_results: List[GrangerResult] = [] - bayesian_scores: List[BayesianScore] = [] - analysis_warnings: List[str] = [] + metric_anomalies: list[MetricAnomaly] + log_bursts: list[LogBurst] + log_patterns: list[LogPattern] + service_latency: list[ServiceLatency] + error_propagation: list[ErrorPropagation] + slo_alerts: list[SloBurnAlert] = [] + root_causes: list[RootCause] + ranked_causes: list[RankedCause] = [] + change_points: list[ChangePoint] = [] + log_metric_links: list[LogMetricLink] = [] + forecasts: list[TrajectoryForecast] = [] + degradation_signals: list[DegradationSignal] = [] + anomaly_clusters: list[AnomalyCluster] = [] + granger_results: list[GrangerResult] = [] + bayesian_scores: list[BayesianScore] = [] + analysis_warnings: list[str] = [] overall_severity: Severity summary: str - quality: Optional[AnalysisQuality] = None - metric_series_statistics: List[MetricSeriesDistributionStats] = Field(default_factory=list) + quality: AnalysisQuality | None = None + metric_series_statistics: list[MetricSeriesDistributionStats] = Field(default_factory=list) diff --git a/api/responses/anomalies.py b/api/responses/anomalies.py index 5d1920a..23d0445 100644 --- a/api/responses/anomalies.py +++ b/api/responses/anomalies.py @@ -10,8 +10,6 @@ from __future__ import annotations -from typing import Tuple - from engine.enums import ChangeType, Severity from .base import NpModel @@ -25,7 +23,7 @@ class MetricAnomaly(NpModel): z_score: float mad_score: float isolation_score: float - expected_range: Tuple[float, float] + expected_range: tuple[float, float] severity: Severity description: str iqr_score: float = 0.0 diff --git a/api/responses/jobs.py b/api/responses/jobs.py index 291ccb4..a99b9f6 100644 --- a/api/responses/jobs.py +++ b/api/responses/jobs.py @@ -12,7 +12,6 @@ from datetime import datetime from enum import Enum -from typing import Optional from pydantic import BaseModel, Field @@ -42,18 +41,18 @@ class AnalyzeJobSummary(BaseModel): report_id: str status: JobStatus created_at: datetime - started_at: Optional[datetime] = None - finished_at: Optional[datetime] = None - duration_ms: Optional[int] = None - error: Optional[str] = None - summary_preview: Optional[str] = None + started_at: datetime | None = None + finished_at: datetime | None = None + duration_ms: int | None = None + error: str | None = None + summary_preview: str | None = None tenant_id: str requested_by: str class AnalyzeJobListResponse(BaseModel): items: list[AnalyzeJobSummary] - next_cursor: Optional[str] = None + next_cursor: str | None = None class AnalyzeJobResultResponse(BaseModel): @@ -62,7 +61,7 @@ class AnalyzeJobResultResponse(BaseModel): status: JobStatus tenant_id: str requested_by: str - result: Optional[JSONDict] = None + result: JSONDict | None = None class AnalyzeReportResponse(BaseModel): @@ -71,7 +70,7 @@ class AnalyzeReportResponse(BaseModel): status: JobStatus tenant_id: str requested_by: str - result: Optional[JSONDict] = None + result: JSONDict | None = None class AnalyzeReportDeleteResponse(BaseModel): diff --git a/api/responses/rca.py b/api/responses/rca.py index 71e9045..eddbf9b 100644 --- a/api/responses/rca.py +++ b/api/responses/rca.py @@ -10,12 +10,9 @@ from __future__ import annotations -from typing import Dict, List, Optional - from pydantic import ConfigDict, Field from custom_types.json import JSONDict - from engine.enums import Severity, Signal from .base import NpModel @@ -26,13 +23,13 @@ class ApiRootCause(NpModel): hypothesis: str confidence: float = Field(ge=0.0, le=1.0) - evidence: List[str] - contributing_signals: List[Signal] + evidence: list[str] + contributing_signals: list[Signal] recommended_action: str severity: Severity - corroboration_summary: Optional[str] = None + corroboration_summary: str | None = None suppression_diagnostics: JSONDict = Field(default_factory=dict) - selection_score_components: Dict[str, float] = Field(default_factory=dict) + selection_score_components: dict[str, float] = Field(default_factory=dict) # Backward-compatible alias for existing imports. diff --git a/api/responses/slo.py b/api/responses/slo.py index 323773f..3f532a1 100644 --- a/api/responses/slo.py +++ b/api/responses/slo.py @@ -12,4 +12,4 @@ from engine.slo.models import BudgetStatus, SloBurnAlert -__all__ = ["SloBurnAlert", "BudgetStatus"] +__all__ = ["BudgetStatus", "SloBurnAlert"] diff --git a/api/responses/traces.py b/api/responses/traces.py index 0f6e4a6..12e7049 100644 --- a/api/responses/traces.py +++ b/api/responses/traces.py @@ -10,8 +10,6 @@ from __future__ import annotations -from typing import List, Optional - from engine.enums import Severity from .base import NpModel @@ -27,12 +25,12 @@ class ServiceLatency(NpModel): error_rate: float sample_count: int severity: Severity - window_start: Optional[float] = None - window_end: Optional[float] = None + window_start: float | None = None + window_end: float | None = None class ErrorPropagation(NpModel): source_service: str - affected_services: List[str] + affected_services: list[str] error_rate: float severity: Severity diff --git a/api/routes/__init__.py b/api/routes/__init__.py index dd993aa..f57cdaf 100644 --- a/api/routes/__init__.py +++ b/api/routes/__init__.py @@ -12,19 +12,19 @@ from fastapi import APIRouter -from api.routes.health import router as health_router from api.routes.analyze import router as analyze_router -from api.routes.metrics import router as metrics_router -from api.routes.logs import router as logs_router -from api.routes.traces import router as traces_router +from api.routes.causal import router as causal_router from api.routes.correlation import router as correlation_router -from api.routes.slo import router as slo_router -from api.routes.topology import router as topology_router from api.routes.events import router as events_router from api.routes.forecast import router as forecast_router -from api.routes.causal import router as causal_router -from api.routes.ml import router as ml_router +from api.routes.health import router as health_router from api.routes.jobs import router as jobs_router +from api.routes.logs import router as logs_router +from api.routes.metrics import router as metrics_router +from api.routes.ml import router as ml_router +from api.routes.slo import router as slo_router +from api.routes.topology import router as topology_router +from api.routes.traces import router as traces_router router = APIRouter() diff --git a/api/routes/analyze.py b/api/routes/analyze.py index 6e19474..038ac73 100644 --- a/api/routes/analyze.py +++ b/api/routes/analyze.py @@ -13,11 +13,12 @@ from __future__ import annotations from fastapi import APIRouter, Depends -from api.routes.exception import handle_exceptions + from api.requests import AnalyzeRequest from api.responses import AnalysisReport, AnalyzeConfigTemplateResponse -from services.analyze_service import run_analysis +from api.routes.exception import handle_exceptions from services.analysis_config_service import analysis_config_service +from services.analyze_service import run_analysis from services.security_service import require_permission_dependency router = APIRouter(tags=["RCA"]) diff --git a/api/routes/causal.py b/api/routes/causal.py index 4c2a0ee..f23fa7d 100644 --- a/api/routes/causal.py +++ b/api/routes/causal.py @@ -9,27 +9,25 @@ from __future__ import annotations import numpy as np -from typing import Dict - from fastapi import APIRouter, Depends, Query from api.requests import AnalyzeRequest, CorrelateRequest from api.routes.common import coerce_query_value, get_provider, safe_call from api.routes.exception import handle_exceptions from config import DEFAULT_METRIC_QUERIES, DEFAULT_SERVICE_NAME +from custom_types.json import JSONDict from datasources.provider import DataSourceProvider from engine import anomaly from engine.causal import CausalGraph, bayesian_score, test_all_pairs from engine.fetcher import fetch_metrics from engine.registry import get_registry -from custom_types.json import JSONDict from services.security_service import enforce_request_tenant, require_permission_dependency from store import granger as granger_store router = APIRouter(tags=["Causal"]) -def _select_top_variance_series(series_map: Dict[str, list[float]], max_series: int) -> Dict[str, list[float]]: +def _select_top_variance_series(series_map: dict[str, list[float]], max_series: int) -> dict[str, list[float]]: ranked: list[tuple[str, float]] = [] for name, values in series_map.items(): @@ -46,8 +44,8 @@ def _select_top_variance_series(series_map: Dict[str, list[float]], max_series: return {name: values for name, values in series_map.items() if name in selected} -def _common_causes_for_roots(causal_graph: CausalGraph, roots: list[str]) -> Dict[str, list[str]]: - common: Dict[str, list[str]] = {} +def _common_causes_for_roots(causal_graph: CausalGraph, roots: list[str]) -> dict[str, list[str]]: + common: dict[str, list[str]] = {} for idx, root_a in enumerate(roots): for root_b in roots[idx + 1 :]: pair_key = f"{root_a}|{root_b}" @@ -81,7 +79,7 @@ async def granger_causality( provider = get_provider(req.tenant_id) metrics_raw = await _fetch_requested_metrics(provider, req) - series_map: Dict[str, list[float]] = {} + series_map: dict[str, list[float]] = {} for query_string, resp in metrics_raw: for metric_name, _, vals in anomaly.iter_series(resp, query_hint=query_string): series_key = f"{query_string}::{metric_name}" diff --git a/api/routes/common.py b/api/routes/common.py index db9f6b6..e2963fc 100644 --- a/api/routes/common.py +++ b/api/routes/common.py @@ -14,8 +14,8 @@ from __future__ import annotations -from collections.abc import Callable -from typing import Awaitable, Protocol, TypeVar +from collections.abc import Awaitable, Callable +from typing import Protocol, TypeVar from fastapi import HTTPException diff --git a/api/routes/correlation.py b/api/routes/correlation.py index 1a5fb3e..53da7ba 100644 --- a/api/routes/correlation.py +++ b/api/routes/correlation.py @@ -16,13 +16,13 @@ from api.routes.common import get_provider from api.routes.exception import handle_exceptions from config import DEFAULT_METRIC_QUERIES +from custom_types.json import JSONDict from engine import anomaly, logs from engine.correlation import correlate, link_logs_to_metrics from engine.fetcher import fetch_metrics from engine.log_query import build_log_query from engine.registry import get_registry from services.security_service import enforce_request_tenant, require_permission_dependency -from custom_types.json import JSONDict router = APIRouter(tags=["Correlation"]) diff --git a/api/routes/events.py b/api/routes/events.py index 73a2f92..b0e188d 100644 --- a/api/routes/events.py +++ b/api/routes/events.py @@ -9,16 +9,14 @@ from __future__ import annotations -from typing import Dict, List - from fastapi import APIRouter, Depends, HTTPException -from engine.events.models import DeploymentEvent -from api.routes.exception import handle_exceptions -from services.security_service import enforce_request_tenant, get_context_tenant, require_permission_dependency -from engine.registry import get_registry from api.requests import DeploymentEventRequest +from api.routes.exception import handle_exceptions from custom_types.json import JSONDict +from engine.events.models import DeploymentEvent +from engine.registry import get_registry +from services.security_service import enforce_request_tenant, get_context_tenant, require_permission_dependency router = APIRouter(tags=["Events"]) @@ -29,7 +27,7 @@ dependencies=[Depends(require_permission_dependency("create:rca"))], ) @handle_exceptions -async def register_deployment(req: DeploymentEventRequest, tenant_id: str | None = None) -> Dict[str, str]: +async def register_deployment(req: DeploymentEventRequest, tenant_id: str | None = None) -> dict[str, str]: req = enforce_request_tenant(req) tid = get_context_tenant(tenant_id or req.tenant_id) if not isinstance(tid, str) or not tid.strip(): @@ -48,7 +46,7 @@ async def register_deployment(req: DeploymentEventRequest, tenant_id: str | None dependencies=[Depends(require_permission_dependency("read:rca"))], ) @handle_exceptions -async def list_deployments(tenant_id: str) -> List[JSONDict]: +async def list_deployments(tenant_id: str) -> list[JSONDict]: return [ { "service": item["service"], @@ -69,7 +67,7 @@ async def list_deployments(tenant_id: str) -> List[JSONDict]: dependencies=[Depends(require_permission_dependency("delete:rca"))], ) @handle_exceptions -async def clear_deployments(tenant_id: str) -> Dict[str, str]: +async def clear_deployments(tenant_id: str) -> dict[str, str]: resolved_tenant = get_context_tenant(tenant_id) await get_registry().clear_events(resolved_tenant) return {"status": "cleared", "tenant_id": resolved_tenant} diff --git a/api/routes/exception.py b/api/routes/exception.py index 7b49be0..af29d86 100644 --- a/api/routes/exception.py +++ b/api/routes/exception.py @@ -19,8 +19,10 @@ from __future__ import annotations import inspect +from collections.abc import Awaitable, Callable from functools import wraps -from typing import Awaitable, Callable, TypeVar, cast +from typing import TypeVar, cast + from fastapi import HTTPException F = TypeVar("F", bound=Callable[..., object]) diff --git a/api/routes/forecast.py b/api/routes/forecast.py index 1664203..c44513f 100644 --- a/api/routes/forecast.py +++ b/api/routes/forecast.py @@ -8,18 +8,16 @@ from __future__ import annotations -from typing import List - from fastapi import APIRouter, Depends, Query from api.requests import CorrelateRequest from api.routes.common import coerce_query_value, fetch_requested_metrics, get_provider from api.routes.exception import handle_exceptions -from engine import anomaly from config import FORECAST_THRESHOLDS +from custom_types.json import JSONDict +from engine import anomaly from engine.forecast import analyze_degradation, forecast from services.security_service import enforce_request_tenant, require_permission_dependency -from custom_types.json import JSONDict router = APIRouter(tags=["Forecast"]) @@ -46,7 +44,7 @@ async def metric_trajectory( provider = get_provider(req.tenant_id) metrics_raw = await fetch_requested_metrics(provider, req) - results: List[JSONDict] = [] + results: list[JSONDict] = [] for query_string, resp in metrics_raw: for metric_name, ts, vals in anomaly.iter_series(resp, query_hint=query_string): threshold = next((v for k, v in FORECAST_THRESHOLDS.items() if k in metric_name), None) diff --git a/api/routes/health.py b/api/routes/health.py index 17d0762..5852d31 100644 --- a/api/routes/health.py +++ b/api/routes/health.py @@ -9,9 +9,10 @@ from __future__ import annotations from fastapi import APIRouter -from store.client import get_redis, is_using_fallback + from api.routes.exception import handle_exceptions from custom_types.json import JSONDict +from store.client import get_redis, is_using_fallback router = APIRouter(tags=["Health"]) diff --git a/api/routes/jobs.py b/api/routes/jobs.py index 1bd44fb..8a1e7e0 100644 --- a/api/routes/jobs.py +++ b/api/routes/jobs.py @@ -5,20 +5,20 @@ from __future__ import annotations from fastapi import APIRouter, HTTPException, Query, status + from api.requests import AnalyzeJobCreateRequest from api.responses import ( - JobStatus, AnalyzeJobCreateResponse, AnalyzeJobListResponse, AnalyzeJobResultResponse, AnalyzeJobSummary, AnalyzeReportDeleteResponse, AnalyzeReportResponse, + JobStatus, ) from api.responses.jobs import AnalyzeJobSummary as JobView -from services.security_service import ensure_permission, get_internal_context -from services.security_service import InternalContext from services.rca_job_service import rca_job_service +from services.security_service import InternalContext, ensure_permission, get_internal_context router = APIRouter(tags=["RCA Jobs"]) diff --git a/api/routes/logs.py b/api/routes/logs.py index c0fbd27..6f572d5 100644 --- a/api/routes/logs.py +++ b/api/routes/logs.py @@ -8,25 +8,25 @@ from __future__ import annotations -from typing import List from fastapi import APIRouter, Depends + +from api.requests import LogRequest +from api.responses import LogBurst, LogPattern from api.routes.common import get_provider, safe_call, to_nanoseconds from api.routes.exception import handle_exceptions -from services.security_service import enforce_request_tenant, require_permission_dependency from engine import logs -from api.requests import LogRequest -from api.responses import LogBurst, LogPattern +from services.security_service import enforce_request_tenant, require_permission_dependency router = APIRouter(tags=["Logs"]) @router.post( "/anomalies/logs/patterns", - response_model=List[LogPattern], + response_model=list[LogPattern], dependencies=[Depends(require_permission_dependency("read:rca"))], ) @handle_exceptions -async def log_patterns(req: LogRequest) -> List[LogPattern]: +async def log_patterns(req: LogRequest) -> list[LogPattern]: req = enforce_request_tenant(req) raw = await safe_call( get_provider(req.tenant_id).query_logs( @@ -38,11 +38,11 @@ async def log_patterns(req: LogRequest) -> List[LogPattern]: @router.post( "/anomalies/logs/bursts", - response_model=List[LogBurst], + response_model=list[LogBurst], dependencies=[Depends(require_permission_dependency("read:rca"))], ) @handle_exceptions -async def log_bursts(req: LogRequest) -> List[LogBurst]: +async def log_bursts(req: LogRequest) -> list[LogBurst]: req = enforce_request_tenant(req) raw = await safe_call( get_provider(req.tenant_id).query_logs( diff --git a/api/routes/metrics.py b/api/routes/metrics.py index 569ed8b..ac57341 100644 --- a/api/routes/metrics.py +++ b/api/routes/metrics.py @@ -8,26 +8,27 @@ from __future__ import annotations -from typing import List from fastapi import APIRouter, Depends + +from api.requests import ChangepointRequest, MetricRequest +from api.responses import MetricAnomaly from api.routes.common import get_provider, safe_call from api.routes.exception import handle_exceptions -from services.security_service import enforce_request_tenant, require_permission_dependency from engine import anomaly -from engine.changepoint import detect as changepoint_detect, ChangePoint -from api.requests import MetricRequest, ChangepointRequest -from api.responses import MetricAnomaly +from engine.changepoint import ChangePoint +from engine.changepoint import detect as changepoint_detect +from services.security_service import enforce_request_tenant, require_permission_dependency router = APIRouter(tags=["Metrics"]) @router.post( "/anomalies/metrics", - response_model=List[MetricAnomaly], + response_model=list[MetricAnomaly], dependencies=[Depends(require_permission_dependency("read:rca"))], ) @handle_exceptions -async def metric_anomalies(req: MetricRequest) -> List[MetricAnomaly]: +async def metric_anomalies(req: MetricRequest) -> list[MetricAnomaly]: req = enforce_request_tenant(req) raw = await safe_call( get_provider(req.tenant_id).query_metrics(query=req.query, start=req.start, end=req.end, step=req.step) @@ -41,17 +42,17 @@ async def metric_anomalies(req: MetricRequest) -> List[MetricAnomaly]: @router.post( "/changepoints", - response_model=List[ChangePoint], + response_model=list[ChangePoint], dependencies=[Depends(require_permission_dependency("read:rca"))], ) @handle_exceptions -async def metric_changepoints(req: ChangepointRequest) -> List[ChangePoint]: +async def metric_changepoints(req: ChangepointRequest) -> list[ChangePoint]: req = enforce_request_tenant(req) raw = await safe_call( get_provider(req.tenant_id).query_metrics(query=req.query, start=req.start, end=req.end, step=req.step) ) - results: List[ChangePoint] = [] + results: list[ChangePoint] = [] for metric_name, ts, vals in anomaly.iter_series(raw, query_hint=req.query): threshold_sigma = float(req.threshold_sigma) try: diff --git a/api/routes/ml.py b/api/routes/ml.py index 80cde00..213d716 100644 --- a/api/routes/ml.py +++ b/api/routes/ml.py @@ -12,10 +12,10 @@ from api.routes.common import safe_call from api.routes.exception import handle_exceptions +from custom_types.json import JSONDict from engine.enums import Signal from engine.registry import get_registry from services.security_service import get_context_tenant, require_permission_dependency -from custom_types.json import JSONDict router = APIRouter(tags=["ML"]) diff --git a/api/routes/slo.py b/api/routes/slo.py index 6aca2a8..f3d657e 100644 --- a/api/routes/slo.py +++ b/api/routes/slo.py @@ -9,15 +9,18 @@ from __future__ import annotations import logging + from fastapi import APIRouter, Depends + +from api.requests import SloRequest from api.routes.common import get_provider, safe_call from api.routes.exception import handle_exceptions -from services.security_service import enforce_request_tenant, require_permission_dependency -from engine import anomaly -from engine.slo import evaluate as slo_evaluate, remaining_minutes -from api.requests import SloRequest from config import settings from custom_types.json import JSONDict +from engine import anomaly +from engine.slo import evaluate as slo_evaluate +from engine.slo import remaining_minutes +from services.security_service import enforce_request_tenant, require_permission_dependency router = APIRouter(tags=["SLO"]) log = logging.getLogger(__name__) diff --git a/api/routes/topology.py b/api/routes/topology.py index d65f230..cbeb1d2 100644 --- a/api/routes/topology.py +++ b/api/routes/topology.py @@ -10,12 +10,13 @@ from __future__ import annotations from fastapi import APIRouter, Depends + +from api.requests import TopologyRequest from api.routes.common import get_provider, safe_call from api.routes.exception import handle_exceptions -from services.security_service import enforce_request_tenant, require_permission_dependency -from engine.topology import DependencyGraph -from api.requests import TopologyRequest from custom_types.json import JSONDict +from engine.topology import DependencyGraph +from services.security_service import enforce_request_tenant, require_permission_dependency router = APIRouter(tags=["Topology"]) diff --git a/api/routes/traces.py b/api/routes/traces.py index f70ff6a..ed52897 100644 --- a/api/routes/traces.py +++ b/api/routes/traces.py @@ -8,26 +8,26 @@ from __future__ import annotations -from typing import List from fastapi import APIRouter, Depends -from api.routes.common import get_provider, safe_call -from api.routes.exception import handle_exceptions -from services.security_service import enforce_request_tenant, require_permission_dependency -from engine import traces + from api.requests import TraceRequest from api.responses import ServiceLatency +from api.routes.common import get_provider, safe_call +from api.routes.exception import handle_exceptions from datasources.types import TraceFilters +from engine import traces +from services.security_service import enforce_request_tenant, require_permission_dependency router = APIRouter(tags=["Traces"]) @router.post( "/anomalies/traces", - response_model=List[ServiceLatency], + response_model=list[ServiceLatency], dependencies=[Depends(require_permission_dependency("read:rca"))], ) @handle_exceptions -async def trace_anomalies(req: TraceRequest) -> List[ServiceLatency]: +async def trace_anomalies(req: TraceRequest) -> list[ServiceLatency]: req = enforce_request_tenant(req) filters: TraceFilters = {} if req.service: diff --git a/config.py b/config.py index 9be3cc2..1d4bc12 100644 --- a/config.py +++ b/config.py @@ -10,13 +10,12 @@ import os import sys -from typing import Dict, List, Tuple, Optional from pydantic import model_validator from pydantic_settings import BaseSettings -def _to_bool(value: Optional[str], default: bool = False) -> bool: +def _to_bool(value: str | None, default: bool = False) -> bool: if value is None: return default return str(value).strip().lower() in ("1", "true", "yes", "on") @@ -30,11 +29,11 @@ def _is_production_env() -> bool: return _env_name() in {"prod", "production"} -def _normalized_secret(value: Optional[str]) -> str: +def _normalized_secret(value: str | None) -> str: return str(value or "").strip().lower() -def _is_weak_secret(value: Optional[str]) -> bool: +def _is_weak_secret(value: str | None) -> bool: normalized = _normalized_secret(value) if not normalized: return True @@ -45,7 +44,7 @@ def _is_weak_secret(value: Optional[str]) -> bool: ALLOWED_CONTEXT_ALGORITHMS = {"HS256", "HS384", "HS512"} -def _parse_context_algorithms(raw: Optional[str]) -> list[str]: +def _parse_context_algorithms(raw: str | None) -> list[str]: values = [str(v).strip().upper() for v in str(raw or "HS256").split(",") if str(v).strip()] return values or ["HS256"] @@ -161,7 +160,7 @@ def _parse_context_algorithms(raw: Optional[str]) -> list[str]: DATASOURCE_TIMEOUT = 30 HEALTH_PATH = "/ready" -DEFAULT_WEIGHTS: Dict[str, float] = { +DEFAULT_WEIGHTS: dict[str, float] = { "metrics": 0.30, "logs": 0.35, "traces": 0.35, @@ -214,7 +213,7 @@ class Settings(BaseSettings): cusum_threshold: float = float(os.getenv("RESOLVER_CUSUM_THRESHOLD", "6.0")) min_samples: int = int(os.getenv("RESOLVER_MIN_SAMPLES", "12")) - burst_ratio_thresholds: List[Tuple[float, str]] = [ + burst_ratio_thresholds: list[tuple[float, str]] = [ (10.0, "critical"), (5.0, "high"), (2.5, "medium"), @@ -279,7 +278,7 @@ class Settings(BaseSettings): # rca heuristics rca_window_seconds: float = 300.0 - rca_weights: Dict[str, float] = {"metrics": 0.40, "logs": 0.25, "traces": 0.35} + rca_weights: dict[str, float] = {"metrics": 0.40, "logs": 0.25, "traces": 0.35} rca_deploy_score_cutoff: float = 0.65 rca_errorprop_max: float = 0.95 rca_baseline_base: float = 0.5 @@ -324,7 +323,7 @@ class Settings(BaseSettings): events_window_seconds: float = 300.0 - bayesian_priors: Dict[str, float] = { + bayesian_priors: dict[str, float] = { "deployment": 0.35, "resource_exhaustion": 0.20, "dependency_failure": 0.20, @@ -333,7 +332,7 @@ class Settings(BaseSettings): "slo_burn": 0.03, "unknown": 0.02, } - bayesian_likelihoods: Dict[str, Dict[str, float]] = { + bayesian_likelihoods: dict[str, dict[str, float]] = { "deployment": { "has_deployment_event": 0.95, "has_metric_spike": 0.70, @@ -422,7 +421,7 @@ class Settings(BaseSettings): rca_log_pattern_score: float = 0.6 # SLO burn windows: list of (label, window_seconds, threshold, severity) - slo_burn_windows: List[Tuple[str, float, float, str]] = [ + slo_burn_windows: list[tuple[str, float, float, str]] = [ ("1h", 3600, 14.4, "critical"), ("6h", 21600, 6.0, "high"), ("1d", 86400, 3.0, "medium"), @@ -438,17 +437,17 @@ class Settings(BaseSettings): slo_default_target_availability: float = 0.999 # anomaly detection thresholds - anomaly_z_thresholds: List[Tuple[float, float]] = [ + anomaly_z_thresholds: list[tuple[float, float]] = [ (4.5, 0.5), (3.5, 0.35), (3.0, 0.2), ] - anomaly_mad_thresholds: List[Tuple[float, float]] = [ + anomaly_mad_thresholds: list[tuple[float, float]] = [ (6.0, 0.35), (4.5, 0.25), (3.5, 0.15), ] - anomaly_iqr_score_thresholds: List[Tuple[float, float]] = [ + anomaly_iqr_score_thresholds: list[tuple[float, float]] = [ (4.0, 0.35), (3.0, 0.25), (2.0, 0.15), diff --git a/connectors/loki.py b/connectors/loki.py index 4eff259..042f9eb 100644 --- a/connectors/loki.py +++ b/connectors/loki.py @@ -1,5 +1,4 @@ import re -from typing import Optional import httpx @@ -18,7 +17,7 @@ def __init__( base_url: str, tenant_id: str, timeout: int = DATASOURCE_TIMEOUT, - headers: Optional[dict[str, str]] = None, + headers: dict[str, str] | None = None, ) -> None: super().__init__(tenant_id, base_url, timeout, headers) @@ -37,7 +36,7 @@ async def query_range( query: str, start: int, end: int, - limit: Optional[int] = None, + limit: int | None = None, ) -> JSONDict: params: dict[str, str | int | float | bool] = { "query": self._normalize_query(query), diff --git a/connectors/mimir.py b/connectors/mimir.py index b2c44ce..4ec80c3 100644 --- a/connectors/mimir.py +++ b/connectors/mimir.py @@ -1,5 +1,3 @@ -from typing import Optional - import httpx from config import DATASOURCE_TIMEOUT, HEALTH_PATH @@ -18,7 +16,7 @@ def __init__( base_url: str, tenant_id: str, timeout: int = DATASOURCE_TIMEOUT, - headers: Optional[dict[str, str]] = None, + headers: dict[str, str] | None = None, ) -> None: super().__init__(tenant_id, base_url, timeout, headers) diff --git a/connectors/tempo.py b/connectors/tempo.py index 6196ac5..8236e50 100644 --- a/connectors/tempo.py +++ b/connectors/tempo.py @@ -1,5 +1,3 @@ -from typing import Optional - import httpx from config import DATASOURCE_TIMEOUT, HEALTH_PATH @@ -17,7 +15,7 @@ def __init__( base_url: str, tenant_id: str, timeout: int = DATASOURCE_TIMEOUT, - headers: Optional[dict[str, str]] = None, + headers: dict[str, str] | None = None, ) -> None: super().__init__(tenant_id, base_url, timeout, headers) @@ -27,7 +25,7 @@ async def query_range( filters: TraceFilters, start: int, end: int, - limit: Optional[int] = None, + limit: int | None = None, ) -> JSONDict: params: dict[str, str | int | float | bool] = {"start": start, "end": end, **filters} if limit is not None: diff --git a/custom_types/__init__.py b/custom_types/__init__.py index b1ea7b9..d5adf54 100644 --- a/custom_types/__init__.py +++ b/custom_types/__init__.py @@ -12,4 +12,4 @@ from .json import JSONDict, JSONList, JSONScalar, JSONValue -__all__ = ["JSONScalar", "JSONValue", "JSONDict", "JSONList"] +__all__ = ["JSONDict", "JSONList", "JSONScalar", "JSONValue"] diff --git a/custom_types/json.py b/custom_types/json.py index b6026c7..a39156a 100644 --- a/custom_types/json.py +++ b/custom_types/json.py @@ -12,6 +12,7 @@ from collections.abc import Mapping, Sequence from typing import TYPE_CHECKING, TypeAlias, TypeGuard + from typing_extensions import TypeAliasType JSONScalar: TypeAlias = str | int | float | bool | None @@ -40,4 +41,4 @@ def is_json_object(value: object) -> TypeGuard[JSONDict]: return isinstance(value, dict) and all(isinstance(key, str) and is_json_value(item) for key, item in value.items()) -__all__ = ["JSONScalar", "JSONValue", "JSONDict", "JSONList", "is_json_value", "is_json_object"] +__all__ = ["JSONDict", "JSONList", "JSONScalar", "JSONValue", "is_json_object", "is_json_value"] diff --git a/database.py b/database.py index 8fc8b1d..58bc4a5 100644 --- a/database.py +++ b/database.py @@ -10,23 +10,34 @@ from __future__ import annotations +import logging import os import re -import logging +from collections.abc import Iterator from contextlib import contextmanager -from typing import Callable, Iterator, Optional +from typing import Protocol from sqlalchemy import create_engine, text -from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.engine import Engine from sqlalchemy.engine.url import make_url +from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.orm import Session, sessionmaker from db_models import Base logger = logging.getLogger(__name__) -_ENGINE: Optional[Engine] = None -_SESSION_FACTORY: Optional[Callable[[], Session]] = None + + +class _SessionFactory(Protocol): + def __call__(self) -> Session: ... + + +def _new_session(factory: _SessionFactory) -> Session: + return factory() + + +_ENGINE: Engine | None = None +_SESSION_FACTORY: _SessionFactory | None = None def _ensure_postgres_database_exists(database_url: str) -> None: @@ -68,19 +79,23 @@ def init_database(database_url: str) -> None: pool_timeout=int(os.getenv("RESOLVER_DB_POOL_TIMEOUT", "30")), pool_recycle=int(os.getenv("RESOLVER_DB_POOL_RECYCLE", "1800")), ) - session_factory = sessionmaker(bind=engine, autoflush=False, expire_on_commit=False) + factory = sessionmaker(bind=engine, autoflush=False, expire_on_commit=False) globals()["_ENGINE"] = engine - globals()["_SESSION_FACTORY"] = session_factory + globals()["_SESSION_FACTORY"] = factory + + +def _require_session_factory() -> _SessionFactory: + factory = _SESSION_FACTORY + if factory is None or not callable(factory): + raise RuntimeError("Database not initialized") + return factory @contextmanager def get_db_session() -> Iterator[Session]: - if _ENGINE is None or _SESSION_FACTORY is None: - raise RuntimeError("Database not initialized") - session_factory = _SESSION_FACTORY - if not callable(session_factory): + if _ENGINE is None: raise RuntimeError("Database not initialized") - session = session_factory() + session = _new_session(_require_session_factory()) try: yield session session.commit() diff --git a/datasources/base.py b/datasources/base.py index defdcd2..e34f450 100644 --- a/datasources/base.py +++ b/datasources/base.py @@ -9,7 +9,7 @@ """ from abc import ABC, abstractmethod -from typing import Optional + import httpx from datasources.types import JSONDict, TraceFilters @@ -18,7 +18,7 @@ class BaseConnector(ABC): health_path: str = "" - def __init__(self, tenant_id: str, base_url: str, timeout: int = 30, headers: Optional[dict[str, str]] = None): + def __init__(self, tenant_id: str, base_url: str, timeout: int = 30, headers: dict[str, str] | None = None): self.tenant_id = tenant_id self.base_url = str(base_url).rstrip("/") self.timeout = timeout @@ -42,19 +42,17 @@ async def aclose(self) -> None: class LogsConnector(BaseConnector): - @abstractmethod async def query_range( self, query: str, start: int, end: int, - limit: Optional[int] = None, + limit: int | None = None, ) -> JSONDict: ... class MetricsConnector(BaseConnector): - @abstractmethod async def query_range( self, @@ -66,12 +64,11 @@ async def query_range( class TracesConnector(BaseConnector): - @abstractmethod async def query_range( self, filters: TraceFilters, start: int, end: int, - limit: Optional[int] = None, + limit: int | None = None, ) -> JSONDict: ... diff --git a/datasources/data_config.py b/datasources/data_config.py index c8be02d..a9ce739 100644 --- a/datasources/data_config.py +++ b/datasources/data_config.py @@ -10,18 +10,19 @@ from pydantic import field_validator from pydantic_settings import BaseSettings + from config import ( LOGS_BACKEND_LOKI, METRICS_BACKEND_MIMIR, - TRACES_BACKEND_TEMPO, + RESOLVER_CONNECTOR_TIMEOUT, RESOLVER_LOGS_BACKEND, RESOLVER_LOGS_LOKI_URL, RESOLVER_METRICS_BACKEND, RESOLVER_METRICS_MIMIR_URL, + RESOLVER_STARTUP_TIMEOUT, RESOLVER_TRACES_BACKEND, RESOLVER_TRACES_TEMPO_URL, - RESOLVER_CONNECTOR_TIMEOUT, - RESOLVER_STARTUP_TIMEOUT, + TRACES_BACKEND_TEMPO, ) diff --git a/datasources/factory.py b/datasources/factory.py index 97c7b78..3c5fdeb 100644 --- a/datasources/factory.py +++ b/datasources/factory.py @@ -10,6 +10,7 @@ from __future__ import annotations +from config import LOGS_BACKEND_LOKI, METRICS_BACKEND_MIMIR, TRACES_BACKEND_TEMPO from connectors.loki import LokiConnector from connectors.mimir import MimirConnector from connectors.tempo import TempoConnector @@ -18,27 +19,20 @@ class DataSourceFactory: - @staticmethod def create_logs(config: DataSourceSettings, tenant_id: str) -> LogsConnector: - from config import LOGS_BACKEND_LOKI - if config.logs_backend == LOGS_BACKEND_LOKI: return LokiConnector(config.loki_url, tenant_id, timeout=config.connector_timeout) raise ValueError("Unsupported logs backend") @staticmethod def create_metrics(config: DataSourceSettings, tenant_id: str) -> MetricsConnector: - from config import METRICS_BACKEND_MIMIR - if config.metrics_backend == METRICS_BACKEND_MIMIR: return MimirConnector(config.mimir_url, tenant_id, timeout=config.connector_timeout) raise ValueError("Unsupported metrics backend") @staticmethod def create_traces(config: DataSourceSettings, tenant_id: str) -> TracesConnector: - from config import TRACES_BACKEND_TEMPO - if config.traces_backend == TRACES_BACKEND_TEMPO: return TempoConnector(config.tempo_url, tenant_id, timeout=config.connector_timeout) raise ValueError("Unsupported traces backend") diff --git a/datasources/helpers.py b/datasources/helpers.py index e8e1787..0c6468e 100644 --- a/datasources/helpers.py +++ b/datasources/helpers.py @@ -10,59 +10,132 @@ from __future__ import annotations -from typing import Optional +from dataclasses import dataclass +from typing import Any, Protocol, cast + import httpx + from datasources.exceptions import DataSourceUnavailable, InvalidQuery, QueryTimeout from datasources.types import JSONDict, QueryParams +class AsyncGetClient(Protocol): + async def get(self, url: str, **kwargs: object) -> Any: ... + + +@dataclass(frozen=True) +class FetchRequestOptions: + params: QueryParams | None = None + headers: dict[str, str] | None = None + timeout: int = 30 + client: AsyncGetClient | None = None + + +@dataclass(frozen=True) +class FetchErrorMessages: + invalid_msg: str + timeout_msg: str + unavailable_msg: str + + +_DEFAULT_JSON_MESSAGES = FetchErrorMessages( + invalid_msg="query failed", + timeout_msg="query timed out", + unavailable_msg="Cannot reach data source at", +) + +_DEFAULT_TEXT_MESSAGES = FetchErrorMessages( + invalid_msg="request failed", + timeout_msg="request timed out", + unavailable_msg="Cannot reach data source at", +) + + +def _coerce_fetch_options( + options: FetchRequestOptions | None, + legacy_kwargs: dict[str, object], +) -> FetchRequestOptions: + base = options or FetchRequestOptions() + + params = legacy_kwargs.pop("params", base.params) + headers = legacy_kwargs.pop("headers", base.headers) + timeout_raw = legacy_kwargs.pop("timeout", base.timeout) + client_raw = legacy_kwargs.pop("client", base.client) + + timeout = int(cast(int | str | bytes | bytearray, timeout_raw)) + + client = client_raw if hasattr(client_raw, "get") else base.client + + return FetchRequestOptions( + params=cast(QueryParams | None, params), + headers=cast(dict[str, str] | None, headers), + timeout=timeout, + client=client, + ) + + +def _coerce_error_messages( + messages: FetchErrorMessages | None, + legacy_kwargs: dict[str, object], + defaults: FetchErrorMessages, +) -> FetchErrorMessages: + base = messages or defaults + invalid_raw = legacy_kwargs.pop("invalid_msg", base.invalid_msg) + timeout_raw = legacy_kwargs.pop("timeout_msg", base.timeout_msg) + unavailable_raw = legacy_kwargs.pop("unavailable_msg", base.unavailable_msg) + return FetchErrorMessages( + invalid_msg=str(invalid_raw), + timeout_msg=str(timeout_raw), + unavailable_msg=str(unavailable_raw), + ) + + async def fetch_json( url: str, - params: Optional[QueryParams] = None, - headers: Optional[dict[str, str]] = None, - timeout: int = 30, - client: Optional[httpx.AsyncClient] = None, - invalid_msg: str = "query failed", - timeout_msg: str = "query timed out", - unavailable_msg: str = "Cannot reach data source at", + options: FetchRequestOptions | None = None, + messages: FetchErrorMessages | None = None, + **legacy_kwargs: object, ) -> JSONDict: + parsed_options = _coerce_fetch_options(options, dict(legacy_kwargs)) + parsed_messages = _coerce_error_messages(messages, dict(legacy_kwargs), _DEFAULT_JSON_MESSAGES) + try: - if client is None: - async with httpx.AsyncClient(timeout=timeout) as owned_client: - resp = await owned_client.get(url, params=params, headers=headers) + if parsed_options.client is None: + async with httpx.AsyncClient(timeout=parsed_options.timeout) as owned_client: + resp = await owned_client.get(url, params=parsed_options.params, headers=parsed_options.headers) else: - resp = await client.get(url, params=params, headers=headers) + resp = await parsed_options.client.get(url, params=parsed_options.params, headers=parsed_options.headers) resp.raise_for_status() payload = resp.json() return payload if isinstance(payload, dict) else {} except httpx.HTTPStatusError as e: - raise InvalidQuery(f"{invalid_msg} [{e.response.status_code}]: {e.response.text}") from e + raise InvalidQuery(f"{parsed_messages.invalid_msg} [{e.response.status_code}]: {e.response.text}") from e except httpx.TimeoutException as e: - raise QueryTimeout(timeout_msg) from e + raise QueryTimeout(parsed_messages.timeout_msg) from e except httpx.RequestError as e: - raise DataSourceUnavailable(f"{unavailable_msg} {url}") from e + raise DataSourceUnavailable(f"{parsed_messages.unavailable_msg} {url}") from e async def fetch_text( url: str, - headers: Optional[dict[str, str]] = None, - timeout: int = 30, - client: Optional[httpx.AsyncClient] = None, - invalid_msg: str = "request failed", - timeout_msg: str = "request timed out", - unavailable_msg: str = "Cannot reach data source at", + options: FetchRequestOptions | None = None, + messages: FetchErrorMessages | None = None, + **legacy_kwargs: object, ) -> str: + parsed_options = _coerce_fetch_options(options, dict(legacy_kwargs)) + parsed_messages = _coerce_error_messages(messages, dict(legacy_kwargs), _DEFAULT_TEXT_MESSAGES) + try: - if client is None: - async with httpx.AsyncClient(timeout=timeout) as owned_client: - resp = await owned_client.get(url, headers=headers) + if parsed_options.client is None: + async with httpx.AsyncClient(timeout=parsed_options.timeout) as owned_client: + resp = await owned_client.get(url, headers=parsed_options.headers) else: - resp = await client.get(url, headers=headers) + resp = await parsed_options.client.get(url, headers=parsed_options.headers) resp.raise_for_status() return resp.text except httpx.HTTPStatusError as e: - raise InvalidQuery(f"{invalid_msg} [{e.response.status_code}]: {e.response.text}") from e + raise InvalidQuery(f"{parsed_messages.invalid_msg} [{e.response.status_code}]: {e.response.text}") from e except httpx.TimeoutException as e: - raise QueryTimeout(timeout_msg) from e + raise QueryTimeout(parsed_messages.timeout_msg) from e except httpx.RequestError as e: - raise DataSourceUnavailable(f"{unavailable_msg} {url}") from e + raise DataSourceUnavailable(f"{parsed_messages.unavailable_msg} {url}") from e diff --git a/datasources/provider.py b/datasources/provider.py index 1c0c792..d8cf4f8 100644 --- a/datasources/provider.py +++ b/datasources/provider.py @@ -8,8 +8,6 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from typing import Optional - from .base import LogsConnector, MetricsConnector, TracesConnector from .data_config import DataSourceSettings from .factory import DataSourceFactory @@ -27,13 +25,13 @@ def __init__(self, tenant_id: str, settings: DataSourceSettings) -> None: self.metrics = DataSourceFactory.create_metrics(settings, tenant_id) self.traces = DataSourceFactory.create_traces(settings, tenant_id) - async def query_logs(self, query: str, start: int, end: int, limit: Optional[int] = None) -> JSONDict: + async def query_logs(self, query: str, start: int, end: int, limit: int | None = None) -> JSONDict: return await self.logs.query_range(query=query, start=start, end=end, limit=limit) async def query_metrics(self, query: str, start: int, end: int, step: str) -> JSONDict: return await self.metrics.query_range(query=query, start=start, end=end, step=step) - async def query_traces(self, filters: TraceFilters, start: int, end: int, limit: Optional[int] = None) -> JSONDict: + async def query_traces(self, filters: TraceFilters, start: int, end: int, limit: int | None = None) -> JSONDict: return await self.traces.query_range(filters=filters, start=start, end=end, limit=limit) async def aclose(self) -> None: diff --git a/datasources/retry.py b/datasources/retry.py index ebaba71..c4ce4bb 100644 --- a/datasources/retry.py +++ b/datasources/retry.py @@ -13,8 +13,9 @@ import asyncio import inspect import time +from collections.abc import Awaitable, Callable from functools import wraps -from typing import Awaitable, Callable, Type, Tuple, TypeVar, cast +from typing import TypeVar, cast F = TypeVar("F", bound=Callable[..., object]) @@ -24,7 +25,7 @@ def retry( attempts: int = 3, delay: float = 1.0, backoff: float = 2.0, - exceptions: Tuple[Type[Exception], ...] = (Exception,), + exceptions: tuple[type[Exception], ...] = (Exception,), ) -> Callable[[F], F]: def decorator(func: F) -> F: is_async = inspect.iscoroutinefunction(func) diff --git a/engine/__init__.py b/engine/__init__.py index 05bd4ad..5123946 100644 --- a/engine/__init__.py +++ b/engine/__init__.py @@ -8,6 +8,6 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.enums import Severity, Signal, ChangeType, RcaCategory +from engine.enums import ChangeType, RcaCategory, Severity, Signal -__all__ = ["Severity", "Signal", "ChangeType", "RcaCategory"] +__all__ = ["ChangeType", "RcaCategory", "Severity", "Signal"] diff --git a/engine/analyze/helpers.py b/engine/analyze/helpers.py index e2c4354..c7cef56 100644 --- a/engine/analyze/helpers.py +++ b/engine/analyze/helpers.py @@ -1,8 +1,6 @@ """ Analyzer Helpers. - Copyright (c) 2026 Stefan Kumarasinghe - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -11,37 +9,44 @@ from __future__ import annotations import asyncio -from collections import defaultdict -from collections.abc import Callable, Sequence import dataclasses import logging import math -from typing import Dict, List, TypeAlias, TypeVar, Tuple +from collections import defaultdict +from collections.abc import Callable, Sequence +from typing import Any, TypeAlias, TypeVar import httpx import numpy as np +from api.requests import AnalyzeRequest from api.responses import ( AnalysisQuality, LogBurst, LogPattern, MetricAnomaly, MetricSeriesDistributionStats, +) +from api.responses import ( RootCause as RootCauseModel, ) -from api.requests import AnalyzeRequest from config import FORECAST_THRESHOLDS, SLO_ERROR_QUERY, SLO_TOTAL_QUERY, settings from custom_types.json import JSONDict from datasources.provider import DataSourceProvider +from engine import anomaly from engine.analyze.filters import ( filter_metric_response_by_services as _filter_metric_response_by_services, +) +from engine.analyze.filters import ( normalize_services as _normalize_services, ) -from engine import anomaly +from engine.analyze.series import select_granger_series as _select_granger_series_impl +from engine.analyze.series import slo_series_pairs as _slo_series_pairs_impl from engine.anomaly.stats import compute_series_distribution_stats from engine.baseline import compute as baseline_compute -from engine.changepoint import detect as changepoint_detect, ChangePoint from engine.causal.granger import GrangerResult +from engine.changepoint import ChangePoint +from engine.changepoint import detect as changepoint_detect from engine.enums import Severity, Signal from engine.events.registry import DeploymentEvent, EventRegistry from engine.fetcher import fetch_metrics @@ -68,6 +73,39 @@ _SortKey: TypeAlias = tuple[float | int | str, ...] | float | int | str +@dataclasses.dataclass(frozen=True) +class AnalyzerOutputInputs: + metric_anomalies: list[MetricAnomaly] + change_points: list[ChangePoint] + root_causes: list[RootCauseModel] + ranked_causes: list[RankedCause] + anomaly_clusters: list[AnomalyCluster] + granger_results: list[GrangerResult] + warnings: list[str] + + +@dataclasses.dataclass(frozen=True) +class PrecisionQualityGateInputs: + metric_anomalies: list[MetricAnomaly] + change_points: list[ChangePoint] + root_causes: list[RootCauseModel] + ranked_causes: list[RankedCause] + duration_seconds: float + suppression_counts: dict[str, int] + warnings: list[str] + + +@dataclasses.dataclass(frozen=True) +class MetricSeriesJob: + req: AnalyzeRequest + query_string: str + metric_name: str + ts: list[float] + vals: list[float] + z_threshold: float + analysis_window_seconds: float + + def _to_root_cause_model(rc: object) -> RootCauseModel: def _normalize_signals(values: list[object]) -> list[Signal]: normalized: list[Signal] = [] @@ -133,12 +171,6 @@ def _series_key(query_string: str, metric_name: str) -> str: return f"{query_string}::{metric_name}" -def _trim_to_len(values: list[float], target_len: int) -> list[float]: - if len(values) == target_len: - return values - return values[:target_len] - - def _dedupe_metric_anomalies(items: list[MetricAnomaly]) -> list[MetricAnomaly]: selected: dict[tuple[str, int, str], MetricAnomaly] = {} for item in items: @@ -162,7 +194,13 @@ def _dedupe_metric_anomalies(items: list[MetricAnomaly]) -> list[MetricAnomaly]: return sorted(selected.values(), key=lambda a: (a.timestamp, a.metric_name)) -def _dedupe_change_points(items: List[ChangePoint]) -> List[ChangePoint]: +def _trim_to_len(values: list[float], target_len: int) -> list[float]: + if len(values) == target_len: + return values + return values[:target_len] + + +def _dedupe_change_points(items: list[ChangePoint]) -> list[ChangePoint]: selected: dict[tuple[str, int, str], ChangePoint] = {} for item in items: key = ( @@ -216,24 +254,28 @@ def _cap_list( def _limit_analyzer_output( - *, - metric_anomalies: list[MetricAnomaly], - change_points: List[ChangePoint], - root_causes: list[RootCauseModel], - ranked_causes: list[RankedCause], - anomaly_clusters: list[AnomalyCluster], - granger_results: list[GrangerResult], - warnings: list[str], + inputs: AnalyzerOutputInputs | None = None, + **legacy_kwargs: Any, ) -> tuple[ list[MetricAnomaly], - List[ChangePoint], + list[ChangePoint], list[RootCauseModel], list[RankedCause], list[AnomalyCluster], list[GrangerResult], ]: + if inputs is None: + inputs = AnalyzerOutputInputs( + metric_anomalies=legacy_kwargs.get("metric_anomalies", []), + change_points=legacy_kwargs.get("change_points", []), + root_causes=legacy_kwargs.get("root_causes", []), + ranked_causes=legacy_kwargs.get("ranked_causes", []), + anomaly_clusters=legacy_kwargs.get("anomaly_clusters", []), + granger_results=legacy_kwargs.get("granger_results", []), + warnings=legacy_kwargs.get("warnings", []), + ) metric_anomalies_limited = _cap_list( - metric_anomalies, + inputs.metric_anomalies, settings.analyzer_max_metric_anomalies, key_func=lambda item: ( getattr(getattr(item, "severity", Severity.LOW), "weight", lambda: 0)(), @@ -241,60 +283,54 @@ def _limit_analyzer_output( float(getattr(item, "timestamp", 0.0)), ), ) - if len(metric_anomalies_limited) < len(metric_anomalies): - warnings.append( - f"Metric anomalies capped to top {len(metric_anomalies_limited)} from {len(metric_anomalies)} " + if len(metric_anomalies_limited) < len(inputs.metric_anomalies): + inputs.warnings.append( + f"Metric anomalies capped to top {len(metric_anomalies_limited)} from {len(inputs.metric_anomalies)} " "by severity and z-score." ) change_points_limited = _cap_list( - change_points, + inputs.change_points, settings.analyzer_max_change_points, key_func=lambda item: (float(getattr(item, "magnitude", 0.0)), float(getattr(item, "timestamp", 0.0))), ) - if len(change_points_limited) < len(change_points): - warnings.append( - f"Change points capped to top {len(change_points_limited)} from {len(change_points)} by magnitude." + if len(change_points_limited) < len(inputs.change_points): + inputs.warnings.append( + f"Change points capped to top {len(change_points_limited)} from {len(inputs.change_points)} by magnitude." ) root_causes_limited = _cap_list( - root_causes, + inputs.root_causes, settings.analyzer_max_root_causes, key_func=lambda item: float(getattr(item, "confidence", 0.0)), ) - if len(root_causes_limited) < len(root_causes): - warnings.append(f"Root causes capped to top {len(root_causes_limited)} by confidence.") + if len(root_causes_limited) < len(inputs.root_causes): + inputs.warnings.append(f"Root causes capped to top {len(root_causes_limited)} by confidence.") ranked_limited = _cap_list( - ranked_causes, + inputs.ranked_causes, settings.analyzer_max_root_causes, key_func=lambda item: float(getattr(item, "final_score", 0.0)), ) clusters_limited = _cap_list( - anomaly_clusters, + inputs.anomaly_clusters, settings.analyzer_max_clusters, key_func=lambda item: int(getattr(item, "size", 0)), ) - if len(clusters_limited) < len(anomaly_clusters): - warnings.append(f"Anomaly clusters capped to top {len(clusters_limited)} by size.") + if len(clusters_limited) < len(inputs.anomaly_clusters): + inputs.warnings.append(f"Anomaly clusters capped to top {len(clusters_limited)} by size.") granger_limited = _cap_list( - granger_results, + inputs.granger_results, settings.analyzer_max_granger_pairs, key_func=lambda item: float(getattr(item, "strength", 0.0)), ) - if len(granger_limited) < len(granger_results): - warnings.append(f"Granger pairs capped to top {len(granger_limited)} by strength.") - - return ( - metric_anomalies_limited, - change_points_limited, - root_causes_limited, - ranked_limited, - clusters_limited, - granger_limited, - ) + if len(granger_limited) < len(inputs.granger_results): + inputs.warnings.append(f"Granger pairs capped to top {len(granger_limited)} by strength.") + + ma, cp, rc = metric_anomalies_limited, change_points_limited, root_causes_limited + return ma, cp, rc, ranked_limited, clusters_limited, granger_limited def _signal_key(value: object) -> str: @@ -444,20 +480,32 @@ def _filter_log_bursts_for_precision_rca( def _apply_precision_quality_gates( - *, - metric_anomalies: list[MetricAnomaly], - change_points: List[ChangePoint], - root_causes: list[RootCauseModel], - ranked_causes: list[RankedCause], - duration_seconds: float, - suppression_counts: dict[str, int], - warnings: list[str], -) -> tuple[list[MetricAnomaly], List[ChangePoint], list[RootCauseModel], list[RankedCause], AnalysisQuality]: - profile = str(getattr(settings, "quality_gating_profile", "precision_strict_v1")).strip() or "precision_strict_v1" - is_precision = _is_precision_profile() + inputs: PrecisionQualityGateInputs | None = None, + **legacy_kwargs: Any, +) -> tuple[list[MetricAnomaly], list[ChangePoint], list[RootCauseModel], list[RankedCause], AnalysisQuality]: + kw = legacy_kwargs + inputs = inputs or PrecisionQualityGateInputs( + metric_anomalies=kw.get("metric_anomalies", []), + change_points=kw.get("change_points", []), + root_causes=kw.get("root_causes", []), + ranked_causes=kw.get("ranked_causes", []), + duration_seconds=float(kw.get("duration_seconds", 0.0)), + suppression_counts=kw.get("suppression_counts", {}), + warnings=kw.get("warnings", []), + ) + metric_anomalies, change_points, root_causes, ranked_causes, duration_seconds, suppression_counts, warnings = ( + inputs.metric_anomalies, + inputs.change_points, + inputs.root_causes, + inputs.ranked_causes, + inputs.duration_seconds, + inputs.suppression_counts, + inputs.warnings, + ) + hours = max(float(duration_seconds) / 3600.0, 1.0 / 60.0) - if is_precision and metric_anomalies: + if _is_precision_profile() and metric_anomalies: max_density = max(0.0, float(getattr(settings, "quality_max_anomaly_density_per_metric_per_hour", 0.0))) if max_density > 0: keep_per_metric = max(1, int(math.ceil(max_density * hours))) @@ -492,7 +540,7 @@ def _apply_precision_quality_gates( f"Quality gate suppressed {suppressed} metric anomaly(ies) above density cap " f"{max_density}/metric/hour." ) - if is_precision and change_points: + if _is_precision_profile() and change_points: max_density_cp = max( 0.0, float(getattr(settings, "quality_max_change_point_density_per_metric_per_hour", 0.0)), @@ -503,7 +551,7 @@ def _apply_precision_quality_gates( for change_point in change_points: metric_name = str(getattr(change_point, "metric_name", "metric")).strip() or "metric" by_metric_cp[metric_name].append(change_point) - filtered_cp: List[ChangePoint] = [] + filtered_cp: list[ChangePoint] = [] suppressed_cp = 0 for change_point_items in by_metric_cp.values(): if len(change_point_items) <= keep_per_metric_cp: @@ -534,7 +582,7 @@ def _apply_precision_quality_gates( max_without = max(1, int(getattr(settings, "quality_max_root_causes_without_multisignal", 1))) low_conf_cutoff = max(float(getattr(settings, "rca_min_confidence_display", 0.05)), 0.10) - if is_precision: + if _is_precision_profile(): filtered_root_causes: list[RootCauseModel] = [] suppressed_low_conf = 0 for cause in root_causes: @@ -542,8 +590,7 @@ def _apply_precision_quality_gates( suppressed_low_conf += 1 continue filtered_root_causes.append(cause) - if filtered_root_causes: - root_causes = filtered_root_causes + root_causes = filtered_root_causes or root_causes if suppressed_low_conf > 0: suppression_counts["low_confidence_root_causes"] = ( suppression_counts.get("low_confidence_root_causes", 0) + suppressed_low_conf @@ -582,7 +629,11 @@ def _apply_precision_quality_gates( if not getattr(cause, "corroboration_summary", None): cause.corroboration_summary = _root_cause_corroboration_summary(cause) diagnostics = dict(getattr(cause, "suppression_diagnostics", {}) or {}) - diagnostics.setdefault("gating_profile", profile) + diagnostics.setdefault( + "gating_profile", + str(getattr(settings, "quality_gating_profile", "precision_strict_v1")).strip() + or "precision_strict_v1", + ) signal_count = _root_cause_signal_count(cause) diagnostics.setdefault("signal_count", signal_count) diagnostics["min_corroboration_signals"] = min_corr @@ -592,7 +643,8 @@ def _apply_precision_quality_gates( quality = AnalysisQuality( anomaly_density=_compute_anomaly_density(metric_anomalies, duration_seconds), suppression_counts={k: int(v) for k, v in suppression_counts.items() if int(v) > 0}, - gating_profile=profile, + gating_profile=str(getattr(settings, "quality_gating_profile", "precision_strict_v1")).strip() + or "precision_strict_v1", confidence_calibration_version=str( getattr(settings, "quality_confidence_calibration_version", "calib_2026_02_25") ), @@ -601,14 +653,18 @@ def _apply_precision_quality_gates( async def _process_one_metric_series( - req: AnalyzeRequest, - query_string: str, - metric_name: str, - ts: list[float], - vals: list[float], - z_threshold: float, - analysis_window_seconds: float, -) -> tuple[list[MetricAnomaly], List[ChangePoint], TrajectoryForecast | None, DegradationSignal | None]: + **legacy_kwargs: Any, +) -> tuple[list[MetricAnomaly], list[ChangePoint], TrajectoryForecast | None, DegradationSignal | None]: + job = MetricSeriesJob( + req=legacy_kwargs["req"], + query_string=legacy_kwargs["query_string"], + metric_name=legacy_kwargs["metric_name"], + ts=legacy_kwargs["ts"], + vals=legacy_kwargs["vals"], + z_threshold=legacy_kwargs["z_threshold"], + analysis_window_seconds=legacy_kwargs["analysis_window_seconds"], + ) + req, metric_name, ts, vals, z_threshold = job.req, job.metric_name, job.ts, job.vals, job.z_threshold try: # result is persisted by store; value not used later _ = await baseline_store.compute_and_persist(req.tenant_id, metric_name, ts, vals, z_threshold) @@ -624,18 +680,25 @@ async def _process_one_metric_series( ) sigma_multiplier = max(1.0, sigma_multiplier) try: - change_points = changepoint_detect(ts, vals, threshold_sigma=sigma_multiplier, metric_name=metric_name) + change_points = changepoint_detect( + job.ts, + job.vals, + threshold_sigma=sigma_multiplier, + metric_name=metric_name, + ) except TypeError: # Backward-compatible path for monkeypatched/legacy detector signatures. - change_points = changepoint_detect(ts, vals, sigma_multiplier) + change_points = changepoint_detect(job.ts, job.vals, sigma_multiplier) - threshold = next((v for k, v in FORECAST_THRESHOLDS.items() if k in query_string), None) - if threshold and analysis_window_seconds >= float(getattr(settings, "analyzer_forecast_min_window_seconds", 0.0)): + threshold = next((v for k, v in FORECAST_THRESHOLDS.items() if k in job.query_string), None) + if threshold and job.analysis_window_seconds >= float( + getattr(settings, "analyzer_forecast_min_window_seconds", 0.0) + ): fc = forecast(metric_name, ts, vals, threshold, req.forecast_horizon_seconds) else: fc = None - if analysis_window_seconds >= float(getattr(settings, "analyzer_degradation_min_window_seconds", 0.0)): + if job.analysis_window_seconds >= float(getattr(settings, "analyzer_degradation_min_window_seconds", 0.0)): deg = analyze_degradation(metric_name, ts, vals) else: deg = None @@ -646,27 +709,27 @@ async def _process_one_metric_series( async def _process_metrics( provider: DataSourceProvider, req: AnalyzeRequest, - all_metric_queries: List[str], + all_metric_queries: list[str], z_threshold: float, analysis_window_seconds: float, -) -> Tuple[ +) -> tuple[ list[MetricAnomaly], - List[ChangePoint], + list[ChangePoint], list[TrajectoryForecast], list[DegradationSignal], - Dict[str, List[float]], + dict[str, list[float]], list[MetricSeriesDistributionStats], ]: metrics_raw = await fetch_metrics(provider, all_metric_queries, req.start, req.end, req.step) requested_services = _normalize_services(req.services) if requested_services: - filtered_metrics_raw: List[Tuple[str, JSONDict]] = [] + filtered_metrics_raw: list[tuple[str, JSONDict]] = [] for query_string, resp in metrics_raw: filtered_resp = _filter_metric_response_by_services(resp, requested_services) filtered_metrics_raw.append((query_string, filtered_resp if isinstance(filtered_resp, dict) else {})) metrics_raw = filtered_metrics_raw - series_list: List[Tuple[str, str, list[float], list[float]]] = [ + series_list: list[tuple[str, str, list[float], list[float]]] = [ (query_string, metric_name, ts, vals) for query_string, resp in metrics_raw for metric_name, ts, vals in anomaly.iter_series(resp, query_hint=query_string) @@ -680,25 +743,18 @@ async def _process_metrics( distribution_by_key[sk] = row distribution_stats = list(distribution_by_key.values()) + shared_kwargs = {"req": req, "z_threshold": z_threshold, "analysis_window_seconds": analysis_window_seconds} tasks = [ - _process_one_metric_series( - req, - query_string, - metric_name, - ts, - vals, - z_threshold, - analysis_window_seconds, - ) - for query_string, metric_name, ts, vals in series_list + _process_one_metric_series(**shared_kwargs, query_string=q, metric_name=m, ts=t, vals=v) + for q, m, t, v in series_list ] processed = await asyncio.gather(*tasks, return_exceptions=True) metric_anomalies: list[MetricAnomaly] = [] - change_points: List[ChangePoint] = [] + change_points: list[ChangePoint] = [] forecasts: list[TrajectoryForecast] = [] degradation_signals: list[DegradationSignal] = [] - series_map: Dict[str, List[float]] = {} + series_map: dict[str, list[float]] = {} for (query_string, metric_name, _ts, vals), result in zip(series_list, processed): series_map[_series_key(query_string, metric_name)] = vals @@ -721,45 +777,14 @@ def _slo_series_pairs( tot_raw: anomaly.series.WrappedMimirResponse, warnings: list[str], ) -> list[tuple[list[float], list[float], list[float]]]: - err_series = list(anomaly.iter_series(err_raw, query_hint=SLO_ERROR_QUERY)) - tot_series = list(anomaly.iter_series(tot_raw, query_hint=SLO_TOTAL_QUERY)) - - if len(err_series) != len(tot_series): - warnings.append( - f"SLO series mismatch: errors={len(err_series)} totals={len(tot_series)}. " - f"Using first {min(len(err_series), len(tot_series))} pair(s)." - ) + return _slo_series_pairs_impl( + err_raw, + tot_raw, + warnings, + error_query=SLO_ERROR_QUERY, + total_query=SLO_TOTAL_QUERY, + ) - pairs = [] - for idx in range(min(len(err_series), len(tot_series))): - _, err_ts, err_vals = err_series[idx] - _, _tot_ts, tot_vals = tot_series[idx] - if len(err_vals) != len(tot_vals): - n = min(len(err_vals), len(tot_vals)) - warnings.append(f"SLO sample length mismatch at pair {idx}: errors={len(err_vals)} totals={len(tot_vals)}.") - err_vals = _trim_to_len(err_vals, n) - tot_vals = _trim_to_len(tot_vals, n) - err_ts = _trim_to_len(err_ts, n) - if err_vals and tot_vals and err_ts: - pairs.append((err_ts, err_vals, tot_vals)) - return pairs - - -def _select_granger_series(series_map: Dict[str, List[float]]) -> Dict[str, List[float]]: - min_samples = max(2, int(settings.analyzer_granger_min_samples)) - max_series = max(2, int(settings.analyzer_granger_max_series)) - - eligible: list[tuple[str, float]] = [] - for name, values in series_map.items(): - arr = np.array(values, dtype=float) - finite = arr[np.isfinite(arr)] - if finite.size < min_samples: - continue - var = float(np.var(finite)) - if var <= 0: - continue - eligible.append((name, var)) - eligible.sort(key=lambda x: x[1], reverse=True) - selected_names = {name for name, _ in eligible[:max_series]} - return {name: vals for name, vals in series_map.items() if name in selected_names} +def _select_granger_series(series_map: dict[str, list[float]]) -> dict[str, list[float]]: + return _select_granger_series_impl(series_map) diff --git a/engine/analyze/series.py b/engine/analyze/series.py new file mode 100644 index 0000000..ea6a290 --- /dev/null +++ b/engine/analyze/series.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import numpy as np + +from config import SLO_ERROR_QUERY, SLO_TOTAL_QUERY, settings +from engine import anomaly + + +def slo_series_pairs( + err_raw: anomaly.series.WrappedMimirResponse, + tot_raw: anomaly.series.WrappedMimirResponse, + warnings: list[str], + *, + error_query: str = SLO_ERROR_QUERY, + total_query: str = SLO_TOTAL_QUERY, +) -> list[tuple[list[float], list[float], list[float]]]: + err_series = list(anomaly.iter_series(err_raw, query_hint=error_query)) + tot_series = list(anomaly.iter_series(tot_raw, query_hint=total_query)) + + if len(err_series) != len(tot_series): + warnings.append( + f"SLO series mismatch: errors={len(err_series)} totals={len(tot_series)}. " + f"Using first {min(len(err_series), len(tot_series))} pair(s)." + ) + + pairs = [] + for idx in range(min(len(err_series), len(tot_series))): + _, err_ts, err_vals = err_series[idx] + _, _tot_ts, tot_vals = tot_series[idx] + if len(err_vals) != len(tot_vals): + n = min(len(err_vals), len(tot_vals)) + warnings.append(f"SLO sample length mismatch at pair {idx}: errors={len(err_vals)} totals={len(tot_vals)}.") + err_vals = err_vals[:n] + tot_vals = tot_vals[:n] + err_ts = err_ts[:n] + if err_vals and tot_vals and err_ts: + pairs.append((err_ts, err_vals, tot_vals)) + return pairs + + +def select_granger_series(series_map: dict[str, list[float]]) -> dict[str, list[float]]: + min_samples = max(2, int(settings.analyzer_granger_min_samples)) + max_series = max(2, int(settings.analyzer_granger_max_series)) + + eligible: list[tuple[str, float]] = [] + for name, values in series_map.items(): + arr = np.array(values, dtype=float) + finite = arr[np.isfinite(arr)] + if finite.size < min_samples: + continue + var = float(np.var(finite)) + if var <= 0: + continue + eligible.append((name, var)) + + eligible.sort(key=lambda x: x[1], reverse=True) + selected_names = {name for name, _ in eligible[:max_series]} + return {name: vals for name, vals in series_map.items() if name in selected_names} diff --git a/engine/analyzer.py b/engine/analyzer.py index 5f8a41f..76627e8 100644 --- a/engine/analyzer.py +++ b/engine/analyzer.py @@ -20,12 +20,30 @@ import httpx from api.requests import AnalyzeRequest -from api.responses import AnalysisReport, RootCause as RootCauseModel, SloBurnAlert as SloBurnAlertModel +from api.responses import ( + AnalysisQuality, + AnalysisReport, + ErrorPropagation, + LogBurst, + LogPattern, + MetricAnomaly, + MetricSeriesDistributionStats, + ServiceLatency, +) +from api.responses import ( + RootCause as RootCauseModel, +) +from api.responses import ( + SloBurnAlert as SloBurnAlertModel, +) from config import DEFAULT_METRIC_QUERIES, SLO_ERROR_QUERY, SLO_TOTAL_QUERY, settings from datasources.provider import DataSourceProvider from engine import anomaly, logs, rca, traces -from engine.anomaly.series import WrappedMimirResponse +from engine.analyze.filters import filter_metric_response_by_services as _filter_metric_response_by_services +from engine.analyze.filters import normalize_services as _normalize_services from engine.analyze.helpers import ( + AnalyzerOutputInputs, + PrecisionQualityGateInputs, _apply_precision_quality_gates, _build_compat_registry, _build_selection_score_components, @@ -39,17 +57,18 @@ _slo_series_pairs, _to_root_cause_model, ) -from engine.analyze.filters import filter_metric_response_by_services as _filter_metric_response_by_services -from engine.analyze.filters import normalize_services as _normalize_services +from engine.anomaly.series import WrappedMimirResponse +from engine.causal import BayesianScore, CausalGraph, GrangerResult, bayesian_score, test_all_pairs +from engine.changepoint import ChangePoint from engine.changepoint import detect as changepoint_detect -from engine.causal import CausalGraph, bayesian_score, test_all_pairs -from engine.correlation import correlate, link_logs_to_metrics +from engine.correlation import CorrelatedEvent, LogMetricLink, correlate, link_logs_to_metrics from engine.dedup import group_metric_anomalies from engine.enums import Severity from engine.forecast.degradation import DegradationSignal from engine.forecast.trajectory import TrajectoryForecast -from engine.ml import RankedCause, cluster, rank -from engine.registry import get_registry +from engine.log_query import build_log_query +from engine.ml import AnomalyCluster, RankedCause, cluster, rank +from engine.registry import TenantRegistry, TenantState, get_registry from engine.slo import evaluate as slo_evaluate from engine.slo.models import SloBurnAlert from engine.topology import DependencyGraph @@ -76,6 +95,36 @@ ) +@dataclasses.dataclass(frozen=True) +class AnalyzerRuntimeState: + warnings: list[str] + suppression_counts: dict[str, int] + + +@dataclasses.dataclass(frozen=True) +class CorrelateStageInputs: + metric_anomalies: list[MetricAnomaly] + log_bursts: list[LogBurst] + rca_log_bursts: list[LogBurst] + service_latency: list[ServiceLatency] + + +@dataclasses.dataclass(frozen=True) +class CausalStageInputs: + series_map: dict[str, list[float]] + metric_anomalies: list[MetricAnomaly] + rca_log_bursts: list[LogBurst] + log_patterns: list[LogPattern] + service_latency: list[ServiceLatency] + error_propagation: list[ErrorPropagation] + correlated_events: list[CorrelatedEvent] + graph: DependencyGraph + change_points: list[ChangePoint] + forecasts: list[TrajectoryForecast] + degradation_signals: list[DegradationSignal] + anomaly_clusters: list[AnomalyCluster] + + def _overall_severity(*groups: Sequence[object]) -> Severity: best = Severity.LOW for group in groups: @@ -120,31 +169,17 @@ def _summary(report: AnalysisReport) -> str: def _build_log_query(services: list[str] | None, requested_log_query: str | None) -> str: - from engine.log_query import build_log_query - return build_log_query(services, requested_log_query) -async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisReport: - started = time.perf_counter() - registry = get_registry() - tenant_id = req.tenant_id - normalized_services = [str(service or "").strip() for service in (req.services or []) if str(service or "").strip()] - req.services = normalized_services - primary_service = normalized_services[0] if normalized_services else None - warnings: list[str] = [] - suppression_counts: dict[str, int] = {} - analysis_window_seconds = float(max(0, req.end - req.start)) - - log_query = _build_log_query(req.services, req.log_query) - trace_filters: dict[str, str | int | float | bool] = {"service.name": primary_service} if primary_service else {} - all_metric_queries = list(dict.fromkeys((req.metric_queries or []) + DEFAULT_METRIC_QUERIES)) - - if req.sensitivity: - z_threshold = 1.0 + req.sensitivity * settings.analyzer_sensitivity_factor - else: - z_threshold = settings.baseline_zscore_threshold - +async def _fetch_parallel_observations( + provider: DataSourceProvider, + req: AnalyzeRequest, + *, + log_query: str, + trace_filters: dict[str, str | int | float | bool], + warnings: list[str], +) -> tuple[object, object, object, object]: fetch_started = time.perf_counter() try: logs_raw, traces_raw, slo_errors_raw, slo_total_raw = await asyncio.wait_for( @@ -171,7 +206,24 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo slo_errors_raw = TimeoutError("slo error fetch timeout") slo_total_raw = TimeoutError("slo total fetch timeout") log.debug("analyzer stage=fetch duration=%.4fs", time.perf_counter() - fetch_started) - + return logs_raw, traces_raw, slo_errors_raw, slo_total_raw + + +async def _run_metrics_stage( + provider: DataSourceProvider, + req: AnalyzeRequest, + all_metric_queries: list[str], + z_threshold: float, + analysis_window_seconds: float, + state: AnalyzerRuntimeState, +) -> tuple[ + list[MetricAnomaly], + list[ChangePoint], + list[TrajectoryForecast], + list[DegradationSignal], + dict[str, list[float]], + list[MetricSeriesDistributionStats], +]: metrics_started = time.perf_counter() try: ( @@ -186,10 +238,8 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo timeout=float(settings.analyzer_metrics_timeout_seconds), ) except TimeoutError: - msg = ( - f"Metrics stage timed out after {settings.analyzer_metrics_timeout_seconds}s; " "returning partial report." - ) - warnings.append(msg) + msg = f"Metrics stage timed out after {settings.analyzer_metrics_timeout_seconds}s; returning partial report." + state.warnings.append(msg) log.warning(msg) ( metric_anomalies, @@ -201,7 +251,7 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo ) = ([], [], [], [], {}, []) except _RECOVERABLE_ANALYSIS_ERRORS as exc: msg = f"Metrics unavailable: {exc}" - warnings.append(msg) + state.warnings.append(msg) log.warning(msg) ( metric_anomalies, @@ -218,19 +268,29 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo forecasts = _dedupe_by_metric_with_severity(forecasts) degradation_signals = _dedupe_by_metric_with_severity(degradation_signals) if raw_metric_anomaly_count > len(metric_anomalies): - suppression_counts["duplicate_metric_anomalies"] = raw_metric_anomaly_count - len(metric_anomalies) - warnings.append( + state.suppression_counts["duplicate_metric_anomalies"] = raw_metric_anomaly_count - len(metric_anomalies) + state.warnings.append( f"Deduplicated metric anomalies from {raw_metric_anomaly_count} to {len(metric_anomalies)} " "to reduce duplicate series noise." ) if raw_change_point_count > len(change_points): - suppression_counts["duplicate_change_points"] = raw_change_point_count - len(change_points) - warnings.append( + state.suppression_counts["duplicate_change_points"] = raw_change_point_count - len(change_points) + state.warnings.append( f"Deduplicated change points from {raw_change_point_count} to {len(change_points)} " "to reduce duplicate series noise." ) log.debug("analyzer stage=metrics duration=%.4fs", time.perf_counter() - metrics_started) + return metric_anomalies, change_points, forecasts, degradation_signals, series_map, metric_series_statistics + +async def _run_logs_stage( + provider: DataSourceProvider, + req: AnalyzeRequest, + *, + log_query: str, + logs_raw: object, + warnings: list[str], +) -> tuple[list[LogBurst], list[LogPattern]]: logs_started = time.perf_counter() log_bursts, log_patterns = [], [] if isinstance(logs_raw, dict): @@ -298,7 +358,18 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo warnings.append(msg) log.warning(msg) log.debug("analyzer stage=logs duration=%.4fs", time.perf_counter() - logs_started) - + return log_bursts, log_patterns + + +async def _run_traces_stage( + provider: DataSourceProvider, + req: AnalyzeRequest, + *, + primary_service: str | None, + trace_filters: dict[str, str | int | float | bool], + traces_raw: object, + warnings: list[str], +) -> tuple[list[ServiceLatency], list[ErrorPropagation], DependencyGraph]: traces_started = time.perf_counter() service_latency, error_propagation = [], [] graph = DependencyGraph() @@ -342,7 +413,17 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo warnings.append(msg) log.warning(msg) log.debug("analyzer stage=traces duration=%.4fs", time.perf_counter() - traces_started) + return service_latency, error_propagation, graph + +def _run_slo_stage( + req: AnalyzeRequest, + *, + primary_service: str | None, + slo_errors_raw: object, + slo_total_raw: object, + warnings: list[str], +) -> list[SloBurnAlertModel]: slo_started = time.perf_counter() slo_alerts_raw: list[SloBurnAlert] = [] if isinstance(slo_errors_raw, dict) and isinstance(slo_total_raw, dict): @@ -364,37 +445,95 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo warnings.append("SLO metrics unavailable for one or both queries.") slo_alerts = [SloBurnAlertModel(**dataclasses.asdict(a)) for a in slo_alerts_raw] log.debug("analyzer stage=slo duration=%.4fs", time.perf_counter() - slo_started) + return slo_alerts + +def _normalize_ranked_root_causes( + ranked_causes: list[RankedCause], + warnings: list[str], + suppression_counts: dict[str, int], +) -> tuple[list[RootCauseModel], list[RankedCause]]: + pydantic_root_causes: list[RootCauseModel] = [] + ranked_valid: list[RankedCause] = [] + hypothesis_to_ranked: dict[str, object] = {} + for item in ranked_causes: + try: + root_cause_model = _to_root_cause_model(item.root_cause) + pydantic_root_causes.append(root_cause_model) + ranked_valid.append(item) + hypothesis = str(root_cause_model.hypothesis) + current = hypothesis_to_ranked.get(hypothesis) + if current is None or float(getattr(item, "final_score", 0.0)) > float( + getattr(current, "final_score", 0.0) + ): + hypothesis_to_ranked[hypothesis] = item + except (AttributeError, TypeError, ValueError) as exc: + suppression_counts["invalid_root_cause_drops"] = suppression_counts.get("invalid_root_cause_drops", 0) + 1 + warnings.append(f"Dropped invalid root cause model during normalization: {exc}") + for cause in pydantic_root_causes: + ranked_item = hypothesis_to_ranked.get(str(cause.hypothesis)) + if ranked_item is None: + continue + cause.selection_score_components = _build_selection_score_components(ranked_item, cause) + return pydantic_root_causes, ranked_valid + + +async def _run_correlate_cluster_stage( + tenant_id: str, + req: AnalyzeRequest, + registry: TenantRegistry, + inputs: CorrelateStageInputs, +) -> tuple[list[LogMetricLink], TenantState, list[CorrelatedEvent], list[AnomalyCluster]]: correlate_started = time.perf_counter() - rca_log_bursts = _filter_log_bursts_for_precision_rca( - log_bursts=log_bursts, - log_patterns=log_patterns, - suppression_counts=suppression_counts, - warnings=warnings, - ) - # Keep raw links for investigation UX; filtered bursts are used for RCA correlation/scoring only. - log_metric_links = link_logs_to_metrics(metric_anomalies, log_bursts) - # fetch tenant-specific weights used to compute confidence + log_metric_links = link_logs_to_metrics(inputs.metric_anomalies, inputs.log_bursts) state = await registry.get_state(tenant_id) correlated_events = correlate( - metric_anomalies, - rca_log_bursts, - service_latency, + inputs.metric_anomalies, + inputs.rca_log_bursts, + inputs.service_latency, window_seconds=req.correlation_window_seconds, weight_fn=state.weighted_confidence, ) - anomaly_clusters = cluster(metric_anomalies) + anomaly_clusters = cluster(inputs.metric_anomalies) log.debug("analyzer stage=correlate duration=%.4fs", time.perf_counter() - correlate_started) - + return log_metric_links, state, correlated_events, anomaly_clusters + + +async def _run_causal_rank_and_quality( + tenant_id: str, + primary_service: str | None, + req: AnalyzeRequest, + *, + registry: TenantRegistry, + inputs: CausalStageInputs, + state: AnalyzerRuntimeState, +) -> tuple[ + list[MetricAnomaly], + list[ChangePoint], + list[RootCauseModel], + list[RankedCause], + list[AnomalyCluster], + list[GrangerResult], + list[TrajectoryForecast], + list[DegradationSignal], + AnalysisQuality, + list[BayesianScore], +]: causal_started = time.perf_counter() - series_for_granger = _select_granger_series(series_map) + metric_anomalies = inputs.metric_anomalies + change_points = inputs.change_points + forecasts = inputs.forecasts + degradation_signals = inputs.degradation_signals + anomaly_clusters = inputs.anomaly_clusters + + series_for_granger = _select_granger_series(inputs.series_map) granger_started = time.perf_counter() fresh_granger = ( test_all_pairs(series_for_granger, max_lag=settings.granger_max_lag) if len(series_for_granger) >= 2 else [] ) granger_elapsed = time.perf_counter() - granger_started if granger_elapsed > float(settings.analyzer_causal_timeout_seconds): - warnings.append( + state.warnings.append( f"Causal granger stage exceeded target {settings.analyzer_causal_timeout_seconds}s " f"(actual {granger_elapsed:.2f}s)." ) @@ -405,7 +544,7 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo timeout=1.0, ) except _RECOVERABLE_ANALYSIS_ERRORS as exc: - warnings.append(f"Failed to persist granger results: {exc}") + state.warnings.append(f"Failed to persist granger results: {exc}") causal_graph = CausalGraph() causal_graph.from_granger_results(fresh_granger) @@ -423,45 +562,27 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo bayesian_scores = bayesian_score( has_deployment_event=bool(deployment_events), has_metric_spike=bool(metric_anomalies), - has_log_burst=bool(rca_log_bursts), - has_latency_spike=bool(service_latency), - has_error_propagation=bool(error_propagation), + has_log_burst=bool(inputs.rca_log_bursts), + has_latency_spike=bool(inputs.service_latency), + has_error_propagation=bool(inputs.error_propagation), ) root_causes = rca.generate( metric_anomalies, - rca_log_bursts, - log_patterns, - service_latency, - error_propagation, - correlated_events=correlated_events, - graph=graph, + inputs.rca_log_bursts, + inputs.log_patterns, + inputs.service_latency, + inputs.error_propagation, + correlated_events=inputs.correlated_events, + graph=inputs.graph, event_registry=_build_compat_registry(deployment_events), ) - ranked_causes = rank(root_causes, correlated_events) - pydantic_root_causes: list[RootCauseModel] = [] - ranked_valid: list[RankedCause] = [] - hypothesis_to_ranked: dict[str, object] = {} - for item in ranked_causes: - try: - root_cause_model = _to_root_cause_model(item.root_cause) - pydantic_root_causes.append(root_cause_model) - ranked_valid.append(item) - hypothesis = str(root_cause_model.hypothesis) - current = hypothesis_to_ranked.get(hypothesis) - if current is None or float(getattr(item, "final_score", 0.0)) > float( - getattr(current, "final_score", 0.0) - ): - hypothesis_to_ranked[hypothesis] = item - except (AttributeError, TypeError, ValueError) as exc: - suppression_counts["invalid_root_cause_drops"] = suppression_counts.get("invalid_root_cause_drops", 0) + 1 - warnings.append(f"Dropped invalid root cause model during normalization: {exc}") - ranked_causes = ranked_valid - for cause in pydantic_root_causes: - ranked_item = hypothesis_to_ranked.get(str(cause.hypothesis)) - if ranked_item is None: - continue - cause.selection_score_components = _build_selection_score_components(ranked_item, cause) + ranked_causes = rank(root_causes, inputs.correlated_events) + pydantic_root_causes, ranked_causes = _normalize_ranked_root_causes( + ranked_causes, + state.warnings, + state.suppression_counts, + ) ( metric_anomalies, change_points, @@ -470,24 +591,154 @@ async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisRepo anomaly_clusters, fresh_granger, ) = _limit_analyzer_output( - metric_anomalies=metric_anomalies, - change_points=change_points, - root_causes=pydantic_root_causes, - ranked_causes=ranked_causes, - anomaly_clusters=anomaly_clusters, - granger_results=fresh_granger, - warnings=warnings, + AnalyzerOutputInputs( + metric_anomalies=metric_anomalies, + change_points=change_points, + root_causes=pydantic_root_causes, + ranked_causes=ranked_causes, + anomaly_clusters=anomaly_clusters, + granger_results=fresh_granger, + warnings=state.warnings, + ) ) metric_anomalies, change_points, pydantic_root_causes, ranked_causes, quality = _apply_precision_quality_gates( - metric_anomalies=metric_anomalies, - change_points=change_points, - root_causes=pydantic_root_causes, - ranked_causes=ranked_causes, - duration_seconds=float(req.end - req.start), + PrecisionQualityGateInputs( + metric_anomalies=metric_anomalies, + change_points=change_points, + root_causes=pydantic_root_causes, + ranked_causes=ranked_causes, + duration_seconds=float(req.end - req.start), + suppression_counts=state.suppression_counts, + warnings=state.warnings, + ) + ) + log.debug("analyzer stage=causal duration=%.4fs", time.perf_counter() - causal_started) + return ( + metric_anomalies, + change_points, + pydantic_root_causes, + ranked_causes, + anomaly_clusters, + fresh_granger, + forecasts, + degradation_signals, + quality, + bayesian_scores, + ) + + +async def run(provider: DataSourceProvider, req: AnalyzeRequest) -> AnalysisReport: + started = time.perf_counter() + registry = get_registry() + tenant_id = req.tenant_id + normalized_services = [str(service or "").strip() for service in (req.services or []) if str(service or "").strip()] + req.services = normalized_services + primary_service = normalized_services[0] if normalized_services else None + warnings: list[str] = [] + suppression_counts: dict[str, int] = {} + state = AnalyzerRuntimeState(warnings=warnings, suppression_counts=suppression_counts) + analysis_window_seconds = float(max(0, req.end - req.start)) + + log_query = _build_log_query(req.services, req.log_query) + trace_filters: dict[str, str | int | float | bool] = {"service.name": primary_service} if primary_service else {} + all_metric_queries = list(dict.fromkeys((req.metric_queries or []) + DEFAULT_METRIC_QUERIES)) + + if req.sensitivity: + z_threshold = 1.0 + req.sensitivity * settings.analyzer_sensitivity_factor + else: + z_threshold = settings.baseline_zscore_threshold + + logs_raw, traces_raw, slo_errors_raw, slo_total_raw = await _fetch_parallel_observations( + provider, req, log_query=log_query, trace_filters=trace_filters, warnings=warnings + ) + + ( + metric_anomalies, + change_points, + forecasts, + degradation_signals, + series_map, + metric_series_statistics, + ) = await _run_metrics_stage( + provider, + req, + all_metric_queries, + z_threshold, + analysis_window_seconds, + state, + ) + + log_bursts, log_patterns = await _run_logs_stage( + provider, req, log_query=log_query, logs_raw=logs_raw, warnings=warnings + ) + + service_latency, error_propagation, graph = await _run_traces_stage( + provider, + req, + primary_service=primary_service, + trace_filters=trace_filters, + traces_raw=traces_raw, + warnings=warnings, + ) + + slo_alerts = _run_slo_stage( + req, + primary_service=primary_service, + slo_errors_raw=slo_errors_raw, + slo_total_raw=slo_total_raw, + warnings=warnings, + ) + + rca_log_bursts = _filter_log_bursts_for_precision_rca( + log_bursts=log_bursts, + log_patterns=log_patterns, suppression_counts=suppression_counts, warnings=warnings, ) - log.debug("analyzer stage=causal duration=%.4fs", time.perf_counter() - causal_started) + log_metric_links, _, correlated_events, anomaly_clusters = await _run_correlate_cluster_stage( + tenant_id, + req, + registry, + CorrelateStageInputs( + metric_anomalies=metric_anomalies, + log_bursts=log_bursts, + rca_log_bursts=rca_log_bursts, + service_latency=service_latency, + ), + ) + + ( + metric_anomalies, + change_points, + pydantic_root_causes, + ranked_causes, + anomaly_clusters, + fresh_granger, + forecasts, + degradation_signals, + quality, + bayesian_scores, + ) = await _run_causal_rank_and_quality( + tenant_id, + primary_service, + req, + registry=registry, + inputs=CausalStageInputs( + series_map=series_map, + metric_anomalies=metric_anomalies, + rca_log_bursts=rca_log_bursts, + log_patterns=log_patterns, + service_latency=service_latency, + error_propagation=error_propagation, + correlated_events=correlated_events, + graph=graph, + change_points=change_points, + forecasts=forecasts, + degradation_signals=degradation_signals, + anomaly_clusters=anomaly_clusters, + ), + state=state, + ) severity = _overall_severity( metric_anomalies, diff --git a/engine/anomaly/detection.py b/engine/anomaly/detection.py index 2d7d1f5..596f584 100644 --- a/engine/anomaly/detection.py +++ b/engine/anomaly/detection.py @@ -12,15 +12,16 @@ from __future__ import annotations -from importlib import import_module import math -from typing import Callable, List, Protocol +from collections.abc import Callable, Sequence +from importlib import import_module +from typing import Protocol import numpy as np -from engine.enums import ChangeType, Severity from api.responses import MetricAnomaly from config import settings +from engine.enums import ChangeType, Severity linregress: Callable[[np.ndarray, np.ndarray], tuple[float, float, float, float, float]] = import_module( "scipy.stats" @@ -142,7 +143,7 @@ def _is_precision_profile() -> bool: return str(getattr(settings, "quality_gating_profile", "")).strip().lower().startswith("precision") -def _apply_density_cap(anomalies: List[MetricAnomaly], timestamps: np.ndarray) -> List[MetricAnomaly]: +def _apply_density_cap(anomalies: list[MetricAnomaly], timestamps: np.ndarray) -> list[MetricAnomaly]: if not anomalies: return anomalies max_density = float(getattr(settings, "quality_max_anomaly_density_per_metric_per_hour", 0.0)) @@ -172,7 +173,7 @@ def _apply_density_cap(anomalies: List[MetricAnomaly], timestamps: np.ndarray) - return sorted(kept, key=lambda a: a.timestamp) -def _compress_runs(anomalies: List[MetricAnomaly]) -> List[MetricAnomaly]: +def _compress_runs(anomalies: list[MetricAnomaly]) -> list[MetricAnomaly]: if not anomalies or len(anomalies) <= settings.anomaly_run_keep_max: return anomalies @@ -225,10 +226,10 @@ def _compress_runs(anomalies: List[MetricAnomaly]) -> List[MetricAnomaly]: def detect( metric: str, - timestamps: List[float], - values: List[float], + timestamps: Sequence[float], + values: Sequence[float], sensitivity: float | None = None, -) -> List[MetricAnomaly]: +) -> list[MetricAnomaly]: if len(values) < settings.min_samples: return [] @@ -278,7 +279,7 @@ def detect( slope, *_ = linregress(np.arange(len(clean)), clean) - anomalies: List[MetricAnomaly] = [] + anomalies: list[MetricAnomaly] = [] for t, v, z, m, c, iso_l, iso_s in zip(ts, arr, z_scores, mad_scores, cusum_flags, iso_labels, iso_scores): iq = _iqr_score_value(float(v), med, iqr) tukey = _tukey_outlier_class(float(v), q1, q3, iqr) diff --git a/engine/anomaly/series.py b/engine/anomaly/series.py index 8f201c5..6875843 100644 --- a/engine/anomaly/series.py +++ b/engine/anomaly/series.py @@ -13,8 +13,8 @@ import logging import re -from collections.abc import Mapping -from typing import Iterator, Optional, TypeAlias +from collections.abc import Iterator, Mapping +from typing import TypeAlias log = logging.getLogger(__name__) @@ -76,7 +76,7 @@ } -def _metric_hint_from_query(query_hint: Optional[str]) -> Optional[str]: +def _metric_hint_from_query(query_hint: str | None) -> str | None: text = str(query_hint or "").strip() if not text: return None @@ -93,7 +93,7 @@ def _metric_hint_from_query(query_hint: Optional[str]) -> Optional[str]: return None -def _fallback_metric_name(metric: MetricRecord, query_hint: Optional[str]) -> str: +def _fallback_metric_name(metric: MetricRecord, query_hint: str | None) -> str: hinted = _metric_hint_from_query(query_hint) if hinted: return hinted @@ -109,7 +109,7 @@ def _fallback_metric_name(metric: MetricRecord, query_hint: Optional[str]) -> st def iter_series( mimir_response: WrappedMimirResponse, - query_hint: Optional[str] = None, + query_hint: str | None = None, ) -> Iterator[tuple[str, list[float], list[float]]]: if isinstance(mimir_response, tuple): if len(mimir_response) == 2 and isinstance(mimir_response[1], dict): diff --git a/engine/anomaly/stats.py b/engine/anomaly/stats.py index 61aabd5..d072c3d 100644 --- a/engine/anomaly/stats.py +++ b/engine/anomaly/stats.py @@ -11,7 +11,7 @@ from __future__ import annotations import math -from typing import List, cast +from typing import cast import numpy as np from numpy.typing import NDArray @@ -20,7 +20,7 @@ from config import settings -def _finite_array(vals: List[float]) -> NDArray[np.float64]: +def _finite_array(vals: list[float]) -> NDArray[np.float64]: arr = np.asarray(vals, dtype=float) finite = arr[np.isfinite(arr)] return cast(NDArray[np.float64], finite) @@ -60,7 +60,7 @@ def _sample_excess_kurtosis(vals: NDArray[np.float64]) -> float: def compute_series_distribution_stats( series_key: str, metric_name: str, - vals: List[float], + vals: list[float], ) -> MetricSeriesDistributionStats | None: finite = _finite_array(vals) n = int(finite.size) diff --git a/engine/baseline/compute.py b/engine/baseline/compute.py index 015af35..4678e07 100644 --- a/engine/baseline/compute.py +++ b/engine/baseline/compute.py @@ -13,9 +13,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple import numpy as np + from config import settings @@ -25,15 +25,15 @@ class Baseline: std: float lower: float upper: float - seasonal_mean: Optional[float] = None + seasonal_mean: float | None = None sample_count: int = 0 -def _hour_buckets(ts: List[float]) -> List[int]: +def _hour_buckets(ts: list[float]) -> list[int]: return [(int(t) % 86400) // 3600 for t in ts] -def compute(ts: List[float], vals: List[float], z_threshold: float | None = None) -> Baseline: +def compute(ts: list[float], vals: list[float], z_threshold: float | None = None) -> Baseline: if z_threshold is None: z_threshold = settings.baseline_zscore_threshold arr = np.array(vals, dtype=float) @@ -44,11 +44,11 @@ def compute(ts: List[float], vals: List[float], z_threshold: float | None = None s = float(np.std(arr)) or 1.0 return Baseline(mean=m, std=s, lower=m - z_threshold * s, upper=m + z_threshold * s, sample_count=n) - seasonal_mean: Optional[float] = None + seasonal_mean: float | None = None if n >= settings.baseline_seasonal_min_samples: buckets = _hour_buckets(ts) - bucket_map: Dict[int, List[float]] = {} + bucket_map: dict[int, list[float]] = {} for b, v in zip(buckets, vals): bucket_map.setdefault(b, []).append(v) hour_avgs = {h: float(np.mean(v)) for h, v in bucket_map.items()} @@ -70,6 +70,6 @@ def compute(ts: List[float], vals: List[float], z_threshold: float | None = None ) -def score(val: float, baseline: Baseline) -> Tuple[bool, float]: +def score(val: float, baseline: Baseline) -> tuple[bool, float]: z = abs(val - baseline.mean) / baseline.std if baseline.std else 0.0 return (val < baseline.lower or val > baseline.upper), round(z, 3) diff --git a/engine/causal/__init__.py b/engine/causal/__init__.py index fa50710..382d6b5 100644 --- a/engine/causal/__init__.py +++ b/engine/causal/__init__.py @@ -10,20 +10,25 @@ http://www.apache.org/licenses/LICENSE-2.0 """ +from engine.causal.bayesian import BayesianScore +from engine.causal.bayesian import score as bayesian_score from engine.causal.granger import ( GrangerResult, - granger_pair_analysis as test_pair, +) +from engine.causal.granger import ( granger_multiple_pairs as test_all_pairs, ) -from engine.causal.bayesian import BayesianScore, score as bayesian_score +from engine.causal.granger import ( + granger_pair_analysis as test_pair, +) from engine.causal.graph import CausalGraph, InterventionResult __all__ = [ - "GrangerResult", - "test_pair", - "test_all_pairs", "BayesianScore", - "bayesian_score", "CausalGraph", + "GrangerResult", "InterventionResult", + "bayesian_score", + "test_all_pairs", + "test_pair", ] diff --git a/engine/causal/bayesian.py b/engine/causal/bayesian.py index f65d978..ad17eb6 100644 --- a/engine/causal/bayesian.py +++ b/engine/causal/bayesian.py @@ -14,19 +14,16 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Dict, List - -from engine.enums import RcaCategory - from config import settings +from engine.enums import RcaCategory -def _configured_priors() -> Dict[RcaCategory, float]: +def _configured_priors() -> dict[RcaCategory, float]: return {RcaCategory(k): v for k, v in settings.bayesian_priors.items()} -def _configured_likelihoods() -> Dict[RcaCategory, Dict[str, float]]: +def _configured_likelihoods() -> dict[RcaCategory, dict[str, float]]: return {RcaCategory(k): v for k, v in settings.bayesian_likelihoods.items()} @@ -44,8 +41,8 @@ def score( has_log_burst: bool, has_latency_spike: bool, has_error_propagation: bool, -) -> List[BayesianScore]: - evidence: Dict[str, bool] = { +) -> list[BayesianScore]: + evidence: dict[str, bool] = { "has_deployment_event": has_deployment_event, "has_metric_spike": has_metric_spike, "has_log_burst": has_log_burst, @@ -56,7 +53,7 @@ def score( priors = _configured_priors() likelihood_map = _configured_likelihoods() - raw_posteriors: Dict[RcaCategory, float] = {} + raw_posteriors: dict[RcaCategory, float] = {} for category, prior in priors.items(): likelihood = 1.0 likelihoods = likelihood_map.get(category, {}) diff --git a/engine/causal/granger.py b/engine/causal/granger.py index 9de72c0..2b3688d 100644 --- a/engine/causal/granger.py +++ b/engine/causal/granger.py @@ -14,7 +14,6 @@ from dataclasses import dataclass from importlib import import_module -from typing import Dict, List, Optional, Tuple import numpy as np @@ -34,7 +33,7 @@ class GrangerResult: strength: float -def _ols(x: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, float]: +def _ols(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, float]: coeffs, _, _, _ = np.linalg.lstsq(x, y, rcond=None) predicted = x @ coeffs ss_res = float(np.sum((y - predicted) ** 2)) @@ -51,12 +50,12 @@ def _lag_matrix(series: np.ndarray, max_lag: int) -> np.ndarray: def granger_pair_analysis( cause_name: str, - cause_vals: List[float], + cause_vals: list[float], effect_name: str, - effect_vals: List[float], + effect_vals: list[float], max_lag: int | None = None, p_threshold: float | None = None, -) -> Optional[GrangerResult]: +) -> GrangerResult | None: if max_lag is None: max_lag = settings.granger_max_lag if p_threshold is None: @@ -105,16 +104,16 @@ def granger_pair_analysis( def granger_multiple_pairs( - series_map: Dict[str, List[float]], + series_map: dict[str, list[float]], max_lag: int | None = None, p_threshold: float | None = None, -) -> List[GrangerResult]: +) -> list[GrangerResult]: if max_lag is None: max_lag = settings.granger_max_lag if p_threshold is None: p_threshold = settings.granger_p_threshold names = list(series_map.keys()) - results: List[GrangerResult] = [] + results: list[GrangerResult] = [] for i, cause in enumerate(names): for j, effect in enumerate(names): diff --git a/engine/causal/graph.py b/engine/causal/graph.py index 5b8e48e..e8a18d0 100644 --- a/engine/causal/graph.py +++ b/engine/causal/graph.py @@ -12,10 +12,9 @@ from __future__ import annotations from collections import defaultdict, deque -from config import settings from dataclasses import dataclass, field -from typing import Dict, List, Set, Tuple +from config import settings from engine.causal.granger import GrangerResult @@ -30,16 +29,16 @@ class CausalEdge: @dataclass class InterventionResult: target: str - expected_effect_on: Dict[str, float] = field(default_factory=dict) - causal_path: List[str] = field(default_factory=list) + expected_effect_on: dict[str, float] = field(default_factory=dict) + causal_path: list[str] = field(default_factory=list) total_effect: float = 0.0 class CausalGraph: def __init__(self) -> None: - self._edges: List[CausalEdge] = [] - self._forward: Dict[str, List[CausalEdge]] = defaultdict(list) - self._reverse: Dict[str, Set[str]] = defaultdict(set) + self._edges: list[CausalEdge] = [] + self._forward: dict[str, list[CausalEdge]] = defaultdict(list) + self._reverse: dict[str, set[str]] = defaultdict(set) def add_edge(self, cause: str, effect: str, strength: float, lag_seconds: float = 0.0) -> None: edge = CausalEdge(cause=cause, effect=effect, strength=strength, lag_seconds=lag_seconds) @@ -52,15 +51,15 @@ def from_granger_results(self, results: list[GrangerResult]) -> None: if r.is_causal: self.add_edge(r.cause_metric, r.effect_metric, r.strength) - def topological_sort(self) -> List[str]: + def topological_sort(self) -> list[str]: nodes = self.all_nodes() - in_degree: Dict[str, int] = {n: 0 for n in nodes} + in_degree: dict[str, int] = {n: 0 for n in nodes} for node in self._forward: for edge in self._forward[node]: in_degree[edge.effect] = in_degree.get(edge.effect, 0) + 1 queue = deque(n for n in nodes if in_degree.get(n, 0) == 0) - order: List[str] = [] + order: list[str] = [] while queue: node = queue.popleft() order.append(node) @@ -71,7 +70,7 @@ def topological_sort(self) -> List[str]: return order - def root_causes(self) -> List[str]: + def root_causes(self) -> list[str]: all_effects = {e.effect for e in self._edges} all_causes = set(self._forward) return sorted(all_causes - all_effects) @@ -79,10 +78,10 @@ def root_causes(self) -> List[str]: def simulate_intervention(self, target: str, max_depth: int | None = None) -> InterventionResult: if max_depth is None: max_depth = settings.causal_graph_max_depth - effects: Dict[str, float] = {} - path: List[str] = [] - queue: deque[Tuple[str, float, int]] = deque([(target, 1.0, 0)]) - seen: Set[str] = {target} + effects: dict[str, float] = {} + path: list[str] = [] + queue: deque[tuple[str, float, int]] = deque([(target, 1.0, 0)]) + seen: set[str] = {target} while queue: node, cumulative_strength, depth = queue.popleft() @@ -106,9 +105,9 @@ def simulate_intervention(self, target: str, max_depth: int | None = None) -> In total_effect=round(sum(effects.values()), settings.causal_round_precision), ) - def find_common_causes(self, node_a: str, node_b: str) -> List[str]: - def ancestors(node: str) -> Set[str]: - seen: Set[str] = set() + def find_common_causes(self, node_a: str, node_b: str) -> list[str]: + def ancestors(node: str) -> set[str]: + seen: set[str] = set() q: deque[str] = deque([node]) while q: n = q.popleft() @@ -120,5 +119,5 @@ def ancestors(node: str) -> Set[str]: return sorted(ancestors(node_a) & ancestors(node_b)) - def all_nodes(self) -> Set[str]: + def all_nodes(self) -> set[str]: return set(self._forward) | {e.effect for e in self._edges} diff --git a/engine/changepoint/cusum.py b/engine/changepoint/cusum.py index 3682b0e..a830b63 100644 --- a/engine/changepoint/cusum.py +++ b/engine/changepoint/cusum.py @@ -11,11 +11,12 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass -from typing import List import numpy as np +from config import settings from engine.enums import ChangeType @@ -31,8 +32,6 @@ class ChangePoint: def _classify(before: float, after: float, std: float) -> ChangeType: - from config import settings - delta = after - before relative = abs(delta) / (abs(before) + 1e-9) if relative > settings.cusum_relative_cutoff: @@ -42,9 +41,7 @@ def _classify(before: float, after: float, std: float) -> ChangeType: return ChangeType.DRIFT -def _detect_oscillation(arr: np.ndarray, window: int | None = None) -> List[int]: - from config import settings - +def _detect_oscillation(arr: np.ndarray, window: int | None = None) -> list[int]: if window is None: window = settings.cusum_window sign_changes = np.diff(np.sign(np.diff(arr))) @@ -56,13 +53,11 @@ def _detect_oscillation(arr: np.ndarray, window: int | None = None) -> List[int] def detect( - ts: List[float], - vals: List[float], + ts: Sequence[float], + vals: Sequence[float], threshold_sigma: float | None = None, metric_name: str = "metric", -) -> List[ChangePoint]: - from config import settings - +) -> list[ChangePoint]: if threshold_sigma is None: threshold_sigma = settings.cusum_threshold_sigma if len(vals) < 10: @@ -79,7 +74,7 @@ def detect( k = settings.cusum_k * sigma h = threshold_sigma * sigma cusum_pos = cusum_neg = 0.0 - results: List[ChangePoint] = [] + results: list[ChangePoint] = [] for i in range(1, len(arr)): cusum_pos = max(0.0, cusum_pos + arr[i] - mu - k) diff --git a/engine/correlation/__init__.py b/engine/correlation/__init__.py index ba92e35..b2cdd37 100644 --- a/engine/correlation/__init__.py +++ b/engine/correlation/__init__.py @@ -9,7 +9,7 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.correlation.temporal import CorrelatedEvent, correlate from engine.correlation.signals import LogMetricLink, link_logs_to_metrics +from engine.correlation.temporal import CorrelatedEvent, correlate -__all__ = ["CorrelatedEvent", "correlate", "LogMetricLink", "link_logs_to_metrics"] +__all__ = ["CorrelatedEvent", "LogMetricLink", "correlate", "link_logs_to_metrics"] diff --git a/engine/correlation/signals.py b/engine/correlation/signals.py index afcda37..5c26201 100644 --- a/engine/correlation/signals.py +++ b/engine/correlation/signals.py @@ -13,9 +13,8 @@ from __future__ import annotations from dataclasses import dataclass -from typing import List -from api.responses import MetricAnomaly, LogBurst +from api.responses import LogBurst, MetricAnomaly from config import settings @@ -30,13 +29,13 @@ class LogMetricLink: def link_logs_to_metrics( - metric_anomalies: List[MetricAnomaly], - log_bursts: List[LogBurst], + metric_anomalies: list[MetricAnomaly], + log_bursts: list[LogBurst], max_lag_seconds: float | None = None, -) -> List[LogMetricLink]: +) -> list[LogMetricLink]: if max_lag_seconds is None: max_lag_seconds = settings.max_lag_seconds - links: List[LogMetricLink] = [] + links: list[LogMetricLink] = [] for anomaly in metric_anomalies: for burst in log_bursts: diff --git a/engine/correlation/temporal.py b/engine/correlation/temporal.py index 67dc513..179c1c1 100644 --- a/engine/correlation/temporal.py +++ b/engine/correlation/temporal.py @@ -12,12 +12,12 @@ from __future__ import annotations -from dataclasses import dataclass, field import math import re -from typing import List, Set, Callable +from collections.abc import Callable +from dataclasses import dataclass, field -from api.responses import MetricAnomaly, LogBurst, ServiceLatency +from api.responses import LogBurst, MetricAnomaly, ServiceLatency from config import settings @@ -25,9 +25,9 @@ class CorrelatedEvent: window_start: float window_end: float - metric_anomalies: List[MetricAnomaly] = field(default_factory=list) - log_bursts: List[LogBurst] = field(default_factory=list) - service_latency: List[ServiceLatency] = field(default_factory=list) + metric_anomalies: list[MetricAnomaly] = field(default_factory=list) + log_bursts: list[LogBurst] = field(default_factory=list) + service_latency: list[ServiceLatency] = field(default_factory=list) signal_count: int = 0 confidence: float = 0.0 @@ -95,19 +95,19 @@ def _safe_float(value: object) -> float | None: def correlate( - metric_anomalies: List[MetricAnomaly], - log_bursts: List[LogBurst], - service_latency: List[ServiceLatency], + metric_anomalies: list[MetricAnomaly], + log_bursts: list[LogBurst], + service_latency: list[ServiceLatency], window_seconds: float | None = None, *, weight_fn: Callable[[float, float, float], float] | None = None, -) -> List[CorrelatedEvent]: +) -> list[CorrelatedEvent]: if window_seconds is None: window_seconds = settings.correlation_window_seconds anchor_candidates: list[object] = [a.timestamp for a in metric_anomalies] anchor_candidates.extend(getattr(b, "start", getattr(b, "window_start", None)) for b in log_bursts) - anchor_times: List[float] = [] + anchor_times: list[float] = [] for value in anchor_candidates: parsed = _safe_float(value) if parsed is not None: @@ -117,8 +117,8 @@ def correlate( if not anchor_times: return [] - events: List[CorrelatedEvent] = [] - used: Set[float] = set() + events: list[CorrelatedEvent] = [] + used: set[float] = set() for anchor in anchor_times: if anchor in used: diff --git a/engine/dedup/grouping.py b/engine/dedup/grouping.py index c12c3df..a124571 100644 --- a/engine/dedup/grouping.py +++ b/engine/dedup/grouping.py @@ -12,7 +12,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Generic, List, TypeVar +from typing import Generic, TypeVar from api.responses import MetricAnomaly from config import settings @@ -23,22 +23,22 @@ @dataclass class AnomalyGroup(Generic[T]): representative: T - members: List[T] = field(default_factory=list) + members: list[T] = field(default_factory=list) count: int = 1 def group_metric_anomalies( - anomalies: List[MetricAnomaly], + anomalies: list[MetricAnomaly], time_window: float | None = None, by_metric: bool = True, -) -> List[AnomalyGroup[MetricAnomaly]]: +) -> list[AnomalyGroup[MetricAnomaly]]: if time_window is None: time_window = settings.dedup_time_window if not anomalies: return [] sorted_a = sorted(anomalies, key=lambda a: a.timestamp) - groups: List[AnomalyGroup[MetricAnomaly]] = [] + groups: list[AnomalyGroup[MetricAnomaly]] = [] current = AnomalyGroup(representative=sorted_a[0], members=[sorted_a[0]]) for a in sorted_a[1:]: diff --git a/engine/enums.py b/engine/enums.py index 9969cd9..18d687c 100644 --- a/engine/enums.py +++ b/engine/enums.py @@ -9,7 +9,9 @@ """ from __future__ import annotations + from enum import Enum + from config import SEVERITY_WEIGHTS, settings diff --git a/engine/events/models.py b/engine/events/models.py index 6fa01f5..eb9d879 100644 --- a/engine/events/models.py +++ b/engine/events/models.py @@ -4,8 +4,6 @@ from __future__ import annotations -from typing import Dict - from pydantic import BaseModel, Field @@ -16,4 +14,4 @@ class DeploymentEvent(BaseModel): author: str = "" environment: str = "production" source: str = "unknown" - metadata: Dict[str, str] = Field(default_factory=dict) + metadata: dict[str, str] = Field(default_factory=dict) diff --git a/engine/events/registry.py b/engine/events/registry.py index a47ad9a..694670f 100644 --- a/engine/events/registry.py +++ b/engine/events/registry.py @@ -12,8 +12,6 @@ from __future__ import annotations -from typing import List - from engine.events.models import DeploymentEvent __all__ = ["DeploymentEvent", "EventRegistry"] @@ -21,18 +19,18 @@ class EventRegistry: def __init__(self) -> None: - self._events: List[DeploymentEvent] = [] + self._events: list[DeploymentEvent] = [] def register(self, event: DeploymentEvent) -> None: self._events.append(event) - def in_window(self, start: float, end: float) -> List[DeploymentEvent]: + def in_window(self, start: float, end: float) -> list[DeploymentEvent]: return [e for e in self._events if start <= e.timestamp <= end] - def for_service(self, service: str) -> List[DeploymentEvent]: + def for_service(self, service: str) -> list[DeploymentEvent]: return [e for e in self._events if e.service == service] - def list_all(self) -> List[DeploymentEvent]: + def list_all(self) -> list[DeploymentEvent]: return list(self._events) def clear(self) -> None: diff --git a/engine/fetcher.py b/engine/fetcher.py index c23a1d6..2a360a9 100644 --- a/engine/fetcher.py +++ b/engine/fetcher.py @@ -13,13 +13,12 @@ import asyncio import logging import re -from typing import Dict, List, Tuple import httpx -from datasources.provider import DataSourceProvider -from custom_types.json import JSONDict from config import settings +from custom_types.json import JSONDict +from datasources.provider import DataSourceProvider log = logging.getLogger(__name__) @@ -32,10 +31,10 @@ def _extract_metric_names(query: str) -> list[str]: async def _scrape_and_fill( provider: DataSourceProvider, - queries: List[str], + queries: list[str], start: int, end: int, -) -> List[Tuple[str, JSONDict]]: +) -> list[tuple[str, JSONDict]]: metrics_backend = getattr(provider, "metrics", None) scrape_func = getattr(metrics_backend, "scrape", None) if not callable(scrape_func): @@ -43,11 +42,11 @@ async def _scrape_and_fill( try: text = await scrape_func() - except (httpx.HTTPError, asyncio.TimeoutError, OSError, TypeError, ValueError) as exc: + except (TimeoutError, httpx.HTTPError, OSError, TypeError, ValueError) as exc: log.warning("scrape_and_fill: scrape failed: %s", exc) return [] - metrics: Dict[str, float] = {} + metrics: dict[str, float] = {} for line in text.splitlines(): line = line.strip() if not line or line.startswith("#"): @@ -65,7 +64,7 @@ async def _scrape_and_fill( log.debug("scrape_and_fill: scrape returned no parseable metrics") return [] - results: List[Tuple[str, JSONDict]] = [] + results: list[tuple[str, JSONDict]] = [] for q in queries: candidates = {n for n in _extract_metric_names(q) if n in metrics} for name in candidates: @@ -93,11 +92,11 @@ async def _scrape_and_fill( async def fetch_metrics( provider: DataSourceProvider, - queries: List[str], + queries: list[str], start: int, end: int, step: str, -) -> List[Tuple[str, JSONDict]]: +) -> list[tuple[str, JSONDict]]: max_parallel = max(1, int(settings.analyzer_max_parallel_metric_queries)) sem = asyncio.Semaphore(max_parallel) @@ -107,7 +106,7 @@ async def _query(q: str) -> JSONDict: raw = await asyncio.gather(*[_query(q) for q in queries], return_exceptions=True) - pairs: List[Tuple[str, JSONDict]] = [] + pairs: list[tuple[str, JSONDict]] = [] any_results = False for q, r in zip(queries, raw): diff --git a/engine/forecast/__init__.py b/engine/forecast/__init__.py index f6962e3..34bff02 100644 --- a/engine/forecast/__init__.py +++ b/engine/forecast/__init__.py @@ -10,7 +10,8 @@ http://www.apache.org/licenses/LICENSE-2.0 """ +from engine.forecast.degradation import DegradationSignal +from engine.forecast.degradation import analyze as analyze_degradation from engine.forecast.trajectory import TrajectoryForecast, forecast -from engine.forecast.degradation import DegradationSignal, analyze as analyze_degradation -__all__ = ["TrajectoryForecast", "forecast", "DegradationSignal", "analyze_degradation"] +__all__ = ["DegradationSignal", "TrajectoryForecast", "analyze_degradation", "forecast"] diff --git a/engine/forecast/degradation.py b/engine/forecast/degradation.py index 1ea3b8e..f634e9b 100644 --- a/engine/forecast/degradation.py +++ b/engine/forecast/degradation.py @@ -11,13 +11,13 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass -from typing import List, Optional import numpy as np -from engine.enums import Severity from config import settings +from engine.enums import Severity @dataclass(frozen=True) @@ -31,7 +31,7 @@ class DegradationSignal: is_accelerating: bool -def _ema(vals: List[float], alpha: float | None = None) -> np.ndarray: +def _ema(vals: Sequence[float], alpha: float | None = None) -> np.ndarray: if alpha is None: alpha = settings.forecast_ema_alpha result = np.zeros(len(vals)) @@ -58,10 +58,10 @@ def _is_counter_like_metric(metric_name: str) -> bool: def analyze( metric_name: str, - ts: List[float], - vals: List[float], + ts: Sequence[float], + vals: Sequence[float], min_degradation_rate: float | None = None, -) -> Optional[DegradationSignal]: +) -> DegradationSignal | None: if min_degradation_rate is None: min_degradation_rate = settings.forecast_min_degradation_rate if len(vals) < settings.forecast_degradation_min_length: diff --git a/engine/forecast/trajectory.py b/engine/forecast/trajectory.py index 437596d..9a3087e 100644 --- a/engine/forecast/trajectory.py +++ b/engine/forecast/trajectory.py @@ -11,13 +11,13 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass -from typing import List, Optional import numpy as np -from engine.enums import Severity from config import settings +from engine.enums import Severity @dataclass(frozen=True) @@ -26,13 +26,13 @@ class TrajectoryForecast: current_value: float slope_per_second: float predicted_value_at_horizon: float - time_to_threshold_seconds: Optional[float] + time_to_threshold_seconds: float | None breach_threshold: float confidence: float severity: Severity -def _linear_fit(ts: List[float], vals: List[float]) -> tuple[float, float]: +def _linear_fit(ts: Sequence[float], vals: Sequence[float]) -> tuple[float, float]: t = np.array(ts, dtype=float) v = np.array(vals, dtype=float) t_norm = t - t[0] @@ -40,7 +40,7 @@ def _linear_fit(ts: List[float], vals: List[float]) -> tuple[float, float]: return float(slope), float(intercept) -def _r_squared(ts: List[float], vals: List[float], slope: float, intercept: float) -> float: +def _r_squared(ts: Sequence[float], vals: Sequence[float], slope: float, intercept: float) -> float: t_norm = np.array(ts, dtype=float) - ts[0] v = np.array(vals, dtype=float) predicted = slope * t_norm + intercept @@ -51,11 +51,11 @@ def _r_squared(ts: List[float], vals: List[float], slope: float, intercept: floa def forecast( metric_name: str, - ts: List[float], - vals: List[float], + ts: Sequence[float], + vals: Sequence[float], threshold: float, horizon_seconds: float | None = None, -) -> Optional[TrajectoryForecast]: +) -> TrajectoryForecast | None: if horizon_seconds is None: horizon_seconds = settings.forecast_trajectory_horizon_cutoff if len(vals) < settings.forecast_trajectory_min_length: @@ -71,7 +71,7 @@ def forecast( current = slope * now_offset + intercept predicted_at_horizon = slope * (now_offset + horizon_seconds) + intercept - time_to_threshold: Optional[float] = None + time_to_threshold: float | None = None if slope > 0 and current < threshold: time_to_threshold = (threshold - current) / slope elif slope < 0 and current > threshold: diff --git a/engine/logs/__init__.py b/engine/logs/__init__.py index 9810e66..74bcca1 100644 --- a/engine/logs/__init__.py +++ b/engine/logs/__init__.py @@ -12,4 +12,4 @@ from engine.logs.frequency import detect_bursts from engine.logs.patterns import analyze -__all__ = ["detect_bursts", "analyze"] +__all__ = ["analyze", "detect_bursts"] diff --git a/engine/logs/frequency.py b/engine/logs/frequency.py index 8f2f213..c213532 100644 --- a/engine/logs/frequency.py +++ b/engine/logs/frequency.py @@ -14,15 +14,13 @@ from __future__ import annotations import re -from collections.abc import Mapping -from typing import Iterator, List, Tuple +from collections.abc import Iterator, Mapping import numpy as np -from engine.enums import Severity from api.responses import LogBurst - from config import settings +from engine.enums import Severity _BURST_RATIO_THRESHOLDS = [(thr, Severity(sev)) for thr, sev in settings.burst_ratio_thresholds] @@ -42,7 +40,7 @@ ) -def _iter_entries(loki_response: Mapping[str, object]) -> Iterator[Tuple[float, str]]: +def _iter_entries(loki_response: Mapping[str, object]) -> Iterator[tuple[float, str]]: data = loki_response.get("data") if not isinstance(data, dict): return @@ -62,7 +60,7 @@ def _iter_entries(loki_response: Mapping[str, object]) -> Iterator[Tuple[float, yield float(ts_ns) / 1e9, line -def _is_benign_repetitive_window(lines: List[str]) -> bool: +def _is_benign_repetitive_window(lines: list[str]) -> bool: if len(lines) < 3: return False adverse = sum(1 for line in lines if _ADVERSE_RE.search(str(line))) @@ -72,7 +70,7 @@ def _is_benign_repetitive_window(lines: List[str]) -> bool: return (benign / len(lines)) >= 0.6 -def detect_bursts(loki_response: Mapping[str, object], window_seconds: float | None = None) -> List[LogBurst]: +def detect_bursts(loki_response: Mapping[str, object], window_seconds: float | None = None) -> list[LogBurst]: if window_seconds is None: window_seconds = settings.logs_frequency_window_seconds entries = sorted(_iter_entries(loki_response), key=lambda x: x[0]) @@ -87,7 +85,7 @@ def detect_bursts(loki_response: Mapping[str, object], window_seconds: float | N baseline_rate = len(timestamps) / total_duration - windows: List[Tuple[float, float, int, bool]] = [] + windows: list[tuple[float, float, int, bool]] = [] i = 0 while i < len(timestamps): w_start = timestamps[i] @@ -101,7 +99,7 @@ def detect_bursts(loki_response: Mapping[str, object], window_seconds: float | N if not windows: return [] - bursts: List[LogBurst] = [] + bursts: list[LogBurst] = [] for w_start, w_end, count, benign_window in windows: rate = count / window_seconds ratio = rate / baseline_rate if baseline_rate > 0 else 0.0 diff --git a/engine/logs/patterns.py b/engine/logs/patterns.py index 51cc2dd..6aeab3c 100644 --- a/engine/logs/patterns.py +++ b/engine/logs/patterns.py @@ -14,13 +14,12 @@ import math import re from collections import Counter, defaultdict -from collections.abc import Mapping -from typing import Iterator, List, Tuple, TypedDict +from collections.abc import Iterator, Mapping +from typing import TypedDict +from api.responses import LogPattern from config import settings - from engine.enums import Severity -from api.responses import LogPattern _NOISE = re.compile(settings.logs_noise_regex, re.I) @@ -40,7 +39,7 @@ class PatternBucket(TypedDict): tokens: list[str] -def _iter_entries(loki_response: Mapping[str, object]) -> Iterator[Tuple[float, str]]: +def _iter_entries(loki_response: Mapping[str, object]) -> Iterator[tuple[float, str]]: data = loki_response.get("data") if not isinstance(data, dict): return @@ -71,7 +70,7 @@ def _classify(line: str) -> Severity: return Severity.LOW -def _entropy(tokens: List[str]) -> float: +def _entropy(tokens: list[str]) -> float: if not tokens: return 0.0 counts = Counter(tokens) @@ -79,7 +78,7 @@ def _entropy(tokens: List[str]) -> float: return -sum((c / total) * math.log2(c / total) for c in counts.values()) -def analyze(loki_response: Mapping[str, object]) -> List[LogPattern]: +def analyze(loki_response: Mapping[str, object]) -> list[LogPattern]: buckets: dict[str, PatternBucket] = defaultdict( lambda: { "count": 0, @@ -105,7 +104,7 @@ def analyze(loki_response: Mapping[str, object]) -> List[LogPattern]: if len(b["tokens"]) < settings.logs_token_cap: b["tokens"].extend(key.split()) - results: List[LogPattern] = [] + results: list[LogPattern] = [] for pattern, b in buckets.items(): if b["first"] == float("inf"): continue diff --git a/engine/ml/__init__.py b/engine/ml/__init__.py index 2f9662d..409a42d 100644 --- a/engine/ml/__init__.py +++ b/engine/ml/__init__.py @@ -13,4 +13,4 @@ from engine.ml.ranking import RankedCause, rank from engine.ml.weights import SignalWeights, get_weights -__all__ = ["AnomalyCluster", "cluster", "RankedCause", "rank", "SignalWeights", "get_weights"] +__all__ = ["AnomalyCluster", "RankedCause", "SignalWeights", "cluster", "get_weights", "rank"] diff --git a/engine/ml/clustering.py b/engine/ml/clustering.py index 46ba425..3b9df24 100644 --- a/engine/ml/clustering.py +++ b/engine/ml/clustering.py @@ -13,7 +13,6 @@ from dataclasses import dataclass from importlib import import_module -from typing import List import numpy as np @@ -24,15 +23,15 @@ @dataclass class AnomalyCluster: cluster_id: int - members: List[MetricAnomaly] + members: list[MetricAnomaly] centroid_timestamp: float centroid_value: float - metric_names: List[str] + metric_names: list[str] size: int is_noise: bool = False -def _feature_matrix(anomalies: List[MetricAnomaly]) -> np.ndarray: +def _feature_matrix(anomalies: list[MetricAnomaly]) -> np.ndarray: ts_arr = np.array([a.timestamp for a in anomalies], dtype=float) val_arr = np.array([a.value for a in anomalies], dtype=float) ts_norm = (ts_arr - ts_arr.min()) / (np.ptp(ts_arr) + 1e-9) @@ -41,10 +40,10 @@ def _feature_matrix(anomalies: List[MetricAnomaly]) -> np.ndarray: def _cluster_one_metric( - anomalies: List[MetricAnomaly], + anomalies: list[MetricAnomaly], eps: float, min_samples: int, -) -> List[AnomalyCluster]: +) -> list[AnomalyCluster]: if len(anomalies) < min_samples: return [] @@ -57,11 +56,11 @@ def _cluster_one_metric( model = dbscan_factory(eps=eps, min_samples=min_samples, metric="euclidean") labels = model.fit_predict(x) - clusters: dict[int, List[MetricAnomaly]] = {} + clusters: dict[int, list[MetricAnomaly]] = {} for label, anomaly in zip(labels, anomalies): clusters.setdefault(int(label), []).append(anomaly) - result: List[AnomalyCluster] = [] + result: list[AnomalyCluster] = [] for cid, members in clusters.items(): result.append( AnomalyCluster( @@ -79,10 +78,10 @@ def _cluster_one_metric( def cluster( - anomalies: List[MetricAnomaly], + anomalies: list[MetricAnomaly], eps: float | None = None, min_samples: int | None = None, -) -> List[AnomalyCluster]: +) -> list[AnomalyCluster]: if not anomalies: return [] if eps is None: @@ -90,11 +89,11 @@ def cluster( if min_samples is None: min_samples = settings.ml_cluster_min_samples - by_metric: dict[str, List[MetricAnomaly]] = {} + by_metric: dict[str, list[MetricAnomaly]] = {} for a in anomalies: by_metric.setdefault(a.metric_name or "", []).append(a) - combined: List[AnomalyCluster] = [] + combined: list[AnomalyCluster] = [] next_cluster_id = 0 for _metric_key in sorted(by_metric.keys()): part = _cluster_one_metric(by_metric[_metric_key], eps, min_samples) @@ -115,7 +114,7 @@ def cluster( return sorted(combined, key=lambda c: c.size, reverse=True) -def _fallback_cluster(anomalies: List[MetricAnomaly]) -> List[AnomalyCluster]: +def _fallback_cluster(anomalies: list[MetricAnomaly]) -> list[AnomalyCluster]: if not anomalies: return [] return [ diff --git a/engine/ml/ranking.py b/engine/ml/ranking.py index a0d2027..a200501 100644 --- a/engine/ml/ranking.py +++ b/engine/ml/ranking.py @@ -14,13 +14,13 @@ from dataclasses import dataclass from importlib import import_module -from typing import List, Optional, Protocol +from typing import Protocol import numpy as np from config import settings -from engine.rca.hypothesis import RootCause from engine.correlation.temporal import CorrelatedEvent +from engine.rca.hypothesis import RootCause @dataclass(frozen=True) @@ -31,7 +31,7 @@ class RankedCause: feature_importance: dict[str, float] -def _extract_features(cause: RootCause, event: Optional[CorrelatedEvent] = None) -> List[float]: +def _extract_features(cause: RootCause, event: CorrelatedEvent | None = None) -> list[float]: return [ cause.confidence, cause.severity.weight() / settings.ranking_severity_divisor, @@ -58,7 +58,7 @@ def _extract_features(cause: RootCause, event: Optional[CorrelatedEvent] = None) ] -def _ranking_pseudo_labels(causes: List[RootCause]) -> list[int]: +def _ranking_pseudo_labels(causes: list[RootCause]) -> list[int]: """ Top half of hypotheses by rule confidence = positive class (avoids trivial single-class RF). """ @@ -101,9 +101,9 @@ def __call__( def rank( - causes: List[RootCause], - correlated_events: Optional[List[CorrelatedEvent]] = None, -) -> List[RankedCause]: + causes: list[RootCause], + correlated_events: list[CorrelatedEvent] | None = None, +) -> list[RankedCause]: if not causes: return [] @@ -114,13 +114,13 @@ def rank( events_map[a.metric_name] = ev feature_matrix = [] - event_refs: List[Optional[CorrelatedEvent]] = [] + event_refs: list[CorrelatedEvent | None] = [] for cause in causes: ref_metric = next( (s.split(":")[1] for s in cause.contributing_signals if s.startswith("metric:")), None, ) - event_ref: Optional[CorrelatedEvent] = events_map.get(ref_metric) if ref_metric else None + event_ref: CorrelatedEvent | None = events_map.get(ref_metric) if ref_metric else None event_refs.append(event_ref) feature_matrix.append(_extract_features(cause, event_ref)) @@ -151,7 +151,7 @@ def rank( ml_scores = np.array([c.confidence for c in causes]) importances_global = None - results: List[RankedCause] = [] + results: list[RankedCause] = [] for i, cause in enumerate(causes): ms = float(ml_scores[i]) if importances_global is not None: diff --git a/engine/ml/weights.py b/engine/ml/weights.py index f15a428..f6ca50f 100644 --- a/engine/ml/weights.py +++ b/engine/ml/weights.py @@ -11,14 +11,11 @@ from __future__ import annotations -from dataclasses import dataclass, field from collections.abc import Mapping -from typing import Dict, Union +from dataclasses import dataclass, field +from config import DEFAULT_WEIGHTS, REGISTRY_ALPHA, settings from engine.enums import Signal -from config import DEFAULT_WEIGHTS, REGISTRY_ALPHA - -from config import settings if settings.default_weight_fallback and settings.default_weight_fallback > 0.0: _DEFAULT_FALLBACK = settings.default_weight_fallback @@ -26,7 +23,7 @@ _DEFAULT_FALLBACK = 1.0 / len(Signal) -def _key(signal: Union[Signal, str]) -> str: +def _key(signal: Signal | str) -> str: return signal.value if isinstance(signal, Signal) else signal @@ -38,17 +35,17 @@ def _coerce_weight(value: object) -> float: raise TypeError(f"Unsupported weight value: {type(value).__name__}") -def _normalise_weights(raw: Mapping[Signal | str, object]) -> Dict[str, float]: +def _normalise_weights(raw: Mapping[Signal | str, object]) -> dict[str, float]: return {_key(k): _coerce_weight(v) for k, v in raw.items()} @dataclass class SignalWeights: - weights: Dict[str, float] = field(default_factory=lambda: _normalise_weights(DEFAULT_WEIGHTS)) + weights: dict[str, float] = field(default_factory=lambda: _normalise_weights(DEFAULT_WEIGHTS)) alpha: float = REGISTRY_ALPHA update_count: int = 0 - def update(self, signal: Union[Signal, str], was_correct: bool) -> None: + def update(self, signal: Signal | str, was_correct: bool) -> None: k = _key(signal) reward = 1.0 if was_correct else 0.0 current = self.weights.get(k, _DEFAULT_FALLBACK) @@ -61,7 +58,7 @@ def _normalize(self) -> None: for k in self.weights: self.weights[k] = self.weights[k] / total - def get(self, signal: Union[Signal, str]) -> float: + def get(self, signal: Signal | str) -> float: return self.weights.get(_key(signal), _DEFAULT_FALLBACK) def weighted_confidence( diff --git a/engine/rca/__init__.py b/engine/rca/__init__.py index c1a4587..9765907 100644 --- a/engine/rca/__init__.py +++ b/engine/rca/__init__.py @@ -2,6 +2,6 @@ Engine module for rca -> init . """ -from engine.rca.hypothesis import generate, RootCause +from engine.rca.hypothesis import RootCause, generate -__all__ = ["generate", "RootCause"] +__all__ = ["RootCause", "generate"] diff --git a/engine/rca/hypothesis.py b/engine/rca/hypothesis.py index 1fe6447..991f422 100644 --- a/engine/rca/hypothesis.py +++ b/engine/rca/hypothesis.py @@ -11,29 +11,29 @@ from __future__ import annotations -from dataclasses import dataclass, field import re -from typing import List, Optional +from dataclasses import dataclass, field from pydantic import ConfigDict + from api.responses import ( - MetricAnomaly, + ErrorPropagation, LogBurst, LogPattern, + MetricAnomaly, ServiceLatency, - ErrorPropagation, ) +from config import settings from engine.correlation.temporal import CorrelatedEvent +from engine.enums import RcaCategory, Severity from engine.events.registry import DeploymentEvent, EventRegistry -from engine.topology.graph import DependencyGraph from engine.rca.scoring import ( + categorize, score_correlated_event, score_deployment_correlation, score_error_propagation, - categorize, ) -from engine.enums import Severity, RcaCategory -from config import settings +from engine.topology.graph import DependencyGraph _METRIC_LABEL_RE = re.compile(r"\{([^}]*)\}") _PROCESS_NAME_KEYS = ( @@ -58,11 +58,11 @@ class HypothesisRootCause: confidence: float severity: Severity category: RcaCategory - evidence: List[str] = field(default_factory=list) - contributing_signals: List[str] = field(default_factory=list) - affected_services: List[str] = field(default_factory=list) + evidence: list[str] = field(default_factory=list) + contributing_signals: list[str] = field(default_factory=list) + affected_services: list[str] = field(default_factory=list) recommended_action: str = "" - deployment: Optional[DeploymentEvent] = None + deployment: DeploymentEvent | None = None corroboration_summary: str = "" @@ -70,6 +70,48 @@ class HypothesisRootCause: RootCause = HypothesisRootCause +@dataclass(frozen=True) +class RcaSignalInputs: + metric_anomalies: list[MetricAnomaly] = field(default_factory=list) + log_bursts: list[LogBurst] = field(default_factory=list) + log_patterns: list[LogPattern] = field(default_factory=list) + service_latency: list[ServiceLatency] = field(default_factory=list) + error_propagation: list[ErrorPropagation] = field(default_factory=list) + + +def _coerce_signal_inputs( + signal_inputs: RcaSignalInputs | list[MetricAnomaly] | None, + legacy_signal_groups: tuple[object, ...], +) -> RcaSignalInputs: + if isinstance(signal_inputs, RcaSignalInputs): + return signal_inputs + + groups: list[object] = [] + if signal_inputs is not None: + groups.append(signal_inputs) + groups.extend(legacy_signal_groups) + + def _as_list(value: object) -> list[object]: + return value if isinstance(value, list) else [] + + if not groups: + return RcaSignalInputs() + + metric_anomalies = _as_list(groups[0]) + log_bursts = _as_list(groups[1]) if len(groups) > 1 else [] + log_patterns = _as_list(groups[2]) if len(groups) > 2 else [] + service_latency = _as_list(groups[3]) if len(groups) > 3 else [] + error_propagation = _as_list(groups[4]) if len(groups) > 4 else [] + + return RcaSignalInputs( + metric_anomalies=[item for item in metric_anomalies if isinstance(item, MetricAnomaly)], + log_bursts=[item for item in log_bursts if isinstance(item, LogBurst)], + log_patterns=[item for item in log_patterns if isinstance(item, LogPattern)], + service_latency=[item for item in service_latency if isinstance(item, ServiceLatency)], + error_propagation=[item for item in error_propagation if isinstance(item, ErrorPropagation)], + ) + + def _anomaly_impact_rank(anomaly: MetricAnomaly) -> tuple[float, float, float]: """ Higher tuple = more important for narrative selection (matches report intent). @@ -84,7 +126,7 @@ def _anomaly_impact_rank(anomaly: MetricAnomaly) -> tuple[float, float, float]: return (weight, z, mad) -def _metric_names_for_hypothesis(metric_anomalies: List[MetricAnomaly], limit: int = 2) -> List[str]: +def _metric_names_for_hypothesis(metric_anomalies: list[MetricAnomaly], limit: int = 2) -> list[str]: """ Pick metric names to cite in the hypothesis from a correlated event. @@ -107,7 +149,7 @@ def _metric_names_for_hypothesis(metric_anomalies: List[MetricAnomaly], limit: i return ordered[:limit] -def _process_entities_for_hypothesis(metric_anomalies: List[MetricAnomaly], limit: int = 2) -> List[str]: +def _process_entities_for_hypothesis(metric_anomalies: list[MetricAnomaly], limit: int = 2) -> list[str]: """ Top process hotspots by anomaly strength, not lexicographic entity string. """ @@ -128,7 +170,7 @@ def _process_entities_for_hypothesis(metric_anomalies: List[MetricAnomaly], limi return ordered[:limit] -def _evidence_score(entries: List[str]) -> float: +def _evidence_score(entries: list[str]) -> float: total = 0.0 for entry in entries: text = str(entry) @@ -141,7 +183,7 @@ def _evidence_score(entries: List[str]) -> float: return total -def _dedupe_causes(causes: List[RootCause]) -> List[RootCause]: +def _dedupe_causes(causes: list[RootCause]) -> list[RootCause]: selected: dict[tuple[str, str], RootCause] = {} for cause in causes: key = (str(cause.category.value), str(cause.hypothesis)) @@ -166,7 +208,7 @@ def _dedupe_causes(causes: List[RootCause]) -> List[RootCause]: return list(selected.values()) -def _signals_from_event(event: CorrelatedEvent) -> List[str]: +def _signals_from_event(event: CorrelatedEvent) -> list[str]: signals: list[str] = [] metric_names = list(dict.fromkeys(a.metric_name for a in event.metric_anomalies if a.metric_name)) if metric_names: @@ -225,7 +267,7 @@ def _process_entity_from_metric_name(metric_name: str) -> str: return process_name -def _corroboration_summary(signals: List[str]) -> str: +def _corroboration_summary(signals: list[str]) -> str: roots = [] for signal in signals: text = str(signal or "").strip().lower() @@ -247,7 +289,9 @@ def _corroboration_summary(signals: List[str]) -> str: return f"{len(unique)} corroborating signal(s): {', '.join(unique)}" -def _action_for_category(category: RcaCategory, service: str = "") -> str: +def _action_for_category(category: RcaCategory | None, service: str = "") -> str: + if category is None: + return "Investigate correlated signals." actions = { RcaCategory.DEPLOYMENT: f"Rollback recent deployment for {service or 'affected service'}.", RcaCategory.RESOURCE_EXHAUSTION: "Check resource limits, scale horizontally or increase quotas.", @@ -261,17 +305,15 @@ def _action_for_category(category: RcaCategory, service: str = "") -> str: def generate( - metric_anomalies: List[MetricAnomaly], - log_bursts: List[LogBurst], - log_patterns: List[LogPattern], - service_latency: List[ServiceLatency], - error_propagation: List[ErrorPropagation], - correlated_events: Optional[List[CorrelatedEvent]] = None, - graph: Optional[DependencyGraph] = None, - event_registry: Optional[EventRegistry] = None, -) -> List[RootCause]: - _ = (metric_anomalies, log_bursts, service_latency) - causes: List[RootCause] = [] + signal_inputs: RcaSignalInputs | list[MetricAnomaly] | None, + *legacy_signal_groups: object, + correlated_events: list[CorrelatedEvent] | None = None, + graph: DependencyGraph | None = None, + event_registry: EventRegistry | None = None, +) -> list[RootCause]: + inputs = _coerce_signal_inputs(signal_inputs, legacy_signal_groups) + _ = (inputs.metric_anomalies, inputs.log_bursts, inputs.service_latency) + causes: list[RootCause] = [] deployments = event_registry.list_all() if event_registry else [] for event in correlated_events or []: @@ -284,7 +326,7 @@ def generate( deploy_score = score_deployment_correlation(event.window_start, deployments) confidence = round(min(settings.rca_score_cap, base_score + deploy_score * 0.2), 3) - deploy_event: Optional[DeploymentEvent] = None + deploy_event: DeploymentEvent | None = None window_seconds = float(settings.rca_deploy_window_seconds) window_start = float(event.window_start) - window_seconds window_end = float(event.window_start) + window_seconds @@ -302,7 +344,7 @@ def _deployment_distance( if nearby_deploys: deploy_event = min(nearby_deploys, key=_deployment_distance) - affected: List[str] = [] + affected: list[str] = [] root_svc = "" if event.service_latency and graph: root_svc = event.service_latency[0].service @@ -354,7 +396,7 @@ def _deployment_distance( ) ) - for prop in error_propagation: + for prop in inputs.error_propagation: svc = prop.source_service affected = getattr(prop, "affected_services", []) conf = score_error_propagation([prop]) @@ -373,22 +415,21 @@ def _deployment_distance( ) ) - critical_patterns = [p for p in log_patterns if p.severity.weight() >= settings.rca_severity_weight_threshold] + critical_patterns = [ + p for p in inputs.log_patterns if p.severity.weight() >= settings.rca_severity_weight_threshold + ] if critical_patterns: causes.append( RootCause( hypothesis=( - f"[log_pattern] {len(critical_patterns)} critical pattern(s): " - f"{critical_patterns[0].pattern[:80]}" + f"[log_pattern] {len(critical_patterns)} critical pattern(s): {critical_patterns[0].pattern[:80]}" ), confidence=settings.rca_log_pattern_score, severity=Severity.HIGH, category=RcaCategory.UNKNOWN, contributing_signals=[f"log:{p.pattern[:40]}" for p in critical_patterns[:3]], recommended_action="Review high-severity log patterns for error root cause.", - corroboration_summary=_corroboration_summary( - [f"log:{p.pattern[:40]}" for p in critical_patterns[:3]] - ), + corroboration_summary=_corroboration_summary([f"log:{p.pattern[:40]}" for p in critical_patterns[:3]]), ) ) diff --git a/engine/rca/scoring.py b/engine/rca/scoring.py index e7fccb6..4a4be56 100644 --- a/engine/rca/scoring.py +++ b/engine/rca/scoring.py @@ -12,18 +12,17 @@ from __future__ import annotations import math -from typing import List from api.responses import ErrorPropagation +from config import settings from engine.correlation.temporal import CorrelatedEvent -from engine.events.registry import DeploymentEvent from engine.enums import RcaCategory -from config import settings +from engine.events.registry import DeploymentEvent def score_deployment_correlation( anomaly_ts: float, - deployments: List[DeploymentEvent], + deployments: list[DeploymentEvent], window_seconds: float | None = None, ) -> float: if window_seconds is None: @@ -69,7 +68,7 @@ def score_correlated_event(event: CorrelatedEvent) -> float: def categorize( event: CorrelatedEvent, - deployments: List[DeploymentEvent], + deployments: list[DeploymentEvent], ) -> RcaCategory: deploy_score = score_deployment_correlation(event.window_start, deployments) if deployments else 0.0 diff --git a/engine/registry.py b/engine/registry.py index dcdccdf..05a0667 100644 --- a/engine/registry.py +++ b/engine/registry.py @@ -10,22 +10,22 @@ from __future__ import annotations -import math import logging -from typing import Dict, List, Union +import math +from config import DEFAULT_WEIGHTS, REGISTRY_ALPHA from engine.enums import Signal -from store import events as event_store, weights as weight_store from engine.events.registry import DeploymentEvent -from config import DEFAULT_WEIGHTS, REGISTRY_ALPHA +from store import events as event_store +from store import weights as weight_store log = logging.getLogger(__name__) SIGNAL_KEYS: tuple[Signal, ...] = (Signal.METRICS, Signal.LOGS, Signal.TRACES) -def _default_weights() -> Dict[Signal, float]: - defaults: Dict[Signal, float] = {} +def _default_weights() -> dict[Signal, float]: + defaults: dict[Signal, float] = {} for signal in SIGNAL_KEYS: configured = DEFAULT_WEIGHTS.get(signal.value) if isinstance(configured, (int, float)) and math.isfinite(float(configured)) and float(configured) >= 0.0: @@ -38,7 +38,7 @@ def _default_weights() -> Dict[Signal, float]: return defaults -def _coerce_weights(raw: object) -> Dict[Signal, float]: +def _coerce_weights(raw: object) -> dict[Signal, float]: weights = _default_weights() if not isinstance(raw, dict): return weights @@ -69,7 +69,7 @@ def _coerce_weights(raw: object) -> Dict[Signal, float]: return weights -def _serialize_weights(weights: Dict[Signal, float]) -> Dict[str, float]: +def _serialize_weights(weights: dict[Signal, float]) -> dict[str, float]: return {k.value: v for k, v in weights.items()} @@ -89,18 +89,18 @@ def _coerce_update_count(value: object) -> int: class TenantState: - __slots__ = ("_weights", "_update_count") + __slots__ = ("_update_count", "_weights") - def __init__(self, weights: Dict[Signal, float], update_count: int) -> None: - self._weights: Dict[Signal, float] = _coerce_weights(weights) + def __init__(self, weights: dict[Signal, float], update_count: int) -> None: + self._weights: dict[Signal, float] = _coerce_weights(weights) self._update_count = update_count @property - def weights(self) -> Dict[Signal, float]: + def weights(self) -> dict[Signal, float]: return dict(self._weights) @property - def weights_serializable(self) -> Dict[str, float]: + def weights_serializable(self) -> dict[str, float]: return _serialize_weights(self._weights) @property @@ -150,7 +150,7 @@ def reset(self) -> None: class TenantRegistry: def __init__(self) -> None: - self._states: Dict[str, TenantState] = {} + self._states: dict[str, TenantState] = {} async def get_state(self, tenant_id: str) -> TenantState: if tenant_id not in self._states: @@ -165,7 +165,7 @@ async def get_state(self, tenant_id: str) -> TenantState: self._states[tenant_id] = state return self._states[tenant_id] - async def update_weight(self, tenant_id: str, signal: Union[Signal, str], was_correct: bool) -> TenantState: + async def update_weight(self, tenant_id: str, signal: Signal | str, was_correct: bool) -> TenantState: if isinstance(signal, str): signal = Signal(signal) state = await self.get_state(tenant_id) @@ -183,13 +183,13 @@ async def reset_weights(self, tenant_id: str) -> TenantState: async def register_event(self, tenant_id: str, event: DeploymentEvent) -> None: await event_store.append(tenant_id, event) - async def get_events(self, tenant_id: str) -> List[event_store.StoredEvent]: + async def get_events(self, tenant_id: str) -> list[event_store.StoredEvent]: return await event_store.load(tenant_id) async def clear_events(self, tenant_id: str) -> None: await event_store.clear(tenant_id) - async def events_in_window(self, tenant_id: str, start: float, end: float) -> List[event_store.StoredEvent]: + async def events_in_window(self, tenant_id: str, start: float, end: float) -> list[event_store.StoredEvent]: events = await event_store.load(tenant_id) return [e for e in events if start <= e["timestamp"] <= end] diff --git a/engine/slo/__init__.py b/engine/slo/__init__.py index 6bb22fb..0d55fea 100644 --- a/engine/slo/__init__.py +++ b/engine/slo/__init__.py @@ -9,7 +9,7 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.slo.burn import SloBurnAlert, evaluate from engine.slo.budget import BudgetStatus, remaining_minutes +from engine.slo.burn import SloBurnAlert, evaluate -__all__ = ["SloBurnAlert", "evaluate", "BudgetStatus", "remaining_minutes"] +__all__ = ["BudgetStatus", "SloBurnAlert", "evaluate", "remaining_minutes"] diff --git a/engine/slo/budget.py b/engine/slo/budget.py index 3e92f29..b4de08a 100644 --- a/engine/slo/budget.py +++ b/engine/slo/budget.py @@ -11,16 +11,14 @@ from __future__ import annotations -from typing import List - from config import settings from engine.slo.models import BudgetStatus def remaining_minutes( service: str, - error_counts: List[float], - total_counts: List[float], + error_counts: list[float], + total_counts: list[float], target_availability: float = 0.999, ) -> BudgetStatus: total = sum(total_counts) diff --git a/engine/slo/burn.py b/engine/slo/burn.py index e700187..179ebca 100644 --- a/engine/slo/burn.py +++ b/engine/slo/burn.py @@ -10,15 +10,15 @@ from __future__ import annotations -from typing import List, Tuple +from collections.abc import Sequence +from config import settings from engine.enums import Severity from engine.slo.models import SloBurnAlert -from config import settings -def _get_windows() -> List[Tuple[str, float, float, Severity]]: - windows: List[Tuple[str, float, float, Severity]] = [] +def _get_windows() -> list[tuple[str, float, float, Severity]]: + windows: list[tuple[str, float, float, Severity]] = [] for label, window_s, thr, sev in settings.slo_burn_windows: sev_enum = Severity.LOW if isinstance(sev, Severity): @@ -38,11 +38,11 @@ def _get_windows() -> List[Tuple[str, float, float, Severity]]: def evaluate( service: str, - error_counts: List[float], - total_counts: List[float], - ts: List[float], + error_counts: Sequence[float], + total_counts: Sequence[float], + ts: Sequence[float], target_availability: float = settings.slo_default_target_availability, -) -> List[SloBurnAlert]: +) -> list[SloBurnAlert]: if not error_counts or not total_counts or len(ts) < 2: return [] @@ -64,7 +64,7 @@ def evaluate( return [] burn_rate = error_rate / allowed_error_rate - alerts: List[SloBurnAlert] = [] + alerts: list[SloBurnAlert] = [] for label, window_s, threshold, sev in _get_windows(): if duration < window_s * 0.5: diff --git a/engine/topology/__init__.py b/engine/topology/__init__.py index 9207181..0ab055c 100644 --- a/engine/topology/__init__.py +++ b/engine/topology/__init__.py @@ -5,6 +5,6 @@ service impact propagation. """ -from engine.topology.graph import DependencyGraph, BlastRadius +from engine.topology.graph import BlastRadius, DependencyGraph -__all__ = ["DependencyGraph", "BlastRadius"] +__all__ = ["BlastRadius", "DependencyGraph"] diff --git a/engine/topology/graph.py b/engine/topology/graph.py index 6d0d9c2..89544b8 100644 --- a/engine/topology/graph.py +++ b/engine/topology/graph.py @@ -12,22 +12,22 @@ from __future__ import annotations from collections import defaultdict, deque -from config import settings from dataclasses import dataclass -from typing import Dict, List, Set + +from config import settings @dataclass(frozen=True) class BlastRadius: root_service: str - affected_downstream: List[str] + affected_downstream: list[str] depth: int class DependencyGraph: def __init__(self) -> None: - self._forward: Dict[str, Set[str]] = defaultdict(set) - self._reverse: Dict[str, Set[str]] = defaultdict(set) + self._forward: dict[str, set[str]] = defaultdict(set) + self._reverse: dict[str, set[str]] = defaultdict(set) def add_call(self, caller: str, callee: str) -> None: if caller == callee or not caller or not callee: @@ -96,8 +96,8 @@ def _attr_value(attributes: list[dict[str, object]], key: str) -> str: def blast_radius(self, root: str, max_depth: int | None = None) -> BlastRadius: if max_depth is None: max_depth = settings.topology_max_depth - affected: List[str] = [] - seen: Set[str] = {root} + affected: list[str] = [] + seen: set[str] = {root} queue: deque[tuple[str, int]] = deque([(root, 0)]) while queue: @@ -112,9 +112,9 @@ def blast_radius(self, root: str, max_depth: int | None = None) -> BlastRadius: return BlastRadius(root_service=root, affected_downstream=affected, depth=max_depth) - def find_upstream_roots(self, service: str) -> List[str]: - roots: List[str] = [] - seen: Set[str] = set() + def find_upstream_roots(self, service: str) -> list[str]: + roots: list[str] = [] + seen: set[str] = set() queue: deque[str] = deque([service]) while queue: @@ -130,12 +130,12 @@ def find_upstream_roots(self, service: str) -> List[str]: return roots - def critical_path(self, source: str, target: str) -> List[str]: + def critical_path(self, source: str, target: str) -> list[str]: if source == target: return [source] - queue: deque[List[str]] = deque([[source]]) - seen: Set[str] = set() + queue: deque[list[str]] = deque([[source]]) + seen: set[str] = set() while queue: path = queue.popleft() @@ -150,5 +150,5 @@ def critical_path(self, source: str, target: str) -> List[str]: return [] - def all_services(self) -> Set[str]: + def all_services(self) -> set[str]: return set(self._forward) | set(self._reverse) diff --git a/engine/traces/__init__.py b/engine/traces/__init__.py index f4ec536..eb40d4d 100644 --- a/engine/traces/__init__.py +++ b/engine/traces/__init__.py @@ -8,7 +8,7 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.traces.latency import analyze from engine.traces.errors import detect_propagation +from engine.traces.latency import analyze __all__ = ["analyze", "detect_propagation"] diff --git a/engine/traces/common.py b/engine/traces/common.py index e2bb944..ceb9324 100644 --- a/engine/traces/common.py +++ b/engine/traces/common.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable from custom_types.json import JSONDict diff --git a/engine/traces/errors.py b/engine/traces/errors.py index d7c96ec..438b36b 100644 --- a/engine/traces/errors.py +++ b/engine/traces/errors.py @@ -12,7 +12,6 @@ from collections import defaultdict from collections.abc import Mapping -from typing import List from api.responses import ErrorPropagation from config import settings @@ -21,7 +20,7 @@ from engine.traces.common import iter_trace_spans, span_has_error -def detect_propagation(tempo_response: Mapping[str, object]) -> List[ErrorPropagation]: +def detect_propagation(tempo_response: Mapping[str, object]) -> list[ErrorPropagation]: service_errors: dict[str, int] = defaultdict(int) service_total: dict[str, int] = defaultdict(int) graph = DependencyGraph() @@ -52,7 +51,7 @@ def detect_propagation(tempo_response: Mapping[str, object]) -> List[ErrorPropag if not sources: return [] - results: List[ErrorPropagation] = [] + results: list[ErrorPropagation] = [] for source in sources: affected_services = sorted(graph.blast_radius(source).affected_downstream) diff --git a/engine/traces/latency.py b/engine/traces/latency.py index c461469..d9d43bc 100644 --- a/engine/traces/latency.py +++ b/engine/traces/latency.py @@ -11,14 +11,15 @@ from __future__ import annotations from collections import defaultdict -from typing import List, TypedDict +from typing import TypedDict import numpy as np -from engine.enums import Severity from api.responses import ServiceLatency -from custom_types.json import JSONDict from config import settings +from custom_types.json import JSONDict +from engine.enums import Severity +from engine.traces.common import iter_trace_spans, span_has_error class LatencyBucket(TypedDict): @@ -114,7 +115,7 @@ def _severity(p99: float, error_rate: float, apdex: float) -> Severity: return Severity.from_score(min(score, 1.0)) -def analyze(tempo_response: JSONDict, apdex_t_ms: float | None = None) -> List[ServiceLatency]: +def analyze(tempo_response: JSONDict, apdex_t_ms: float | None = None) -> list[ServiceLatency]: if apdex_t_ms is None: apdex_t_ms = settings.trace_latency_apdex_t_ms @@ -156,12 +157,10 @@ def analyze(tempo_response: JSONDict, apdex_t_ms: float | None = None) -> List[S current_end = bucket["window_end"] bucket["window_end"] = end_s if current_end is None else max(float(current_end), end_s) - from engine.traces.common import iter_trace_spans, span_has_error - if any(span_has_error(span) for span in iter_trace_spans(trace)): bucket["errors"] += 1 - results: List[ServiceLatency] = [] + results: list[ServiceLatency] = [] for key, bucket in buckets.items(): durations = np.array(bucket["durations"], dtype=float) diff --git a/main.py b/main.py index 795d578..3371970 100644 --- a/main.py +++ b/main.py @@ -15,24 +15,24 @@ import logging import sys import time +from collections.abc import AsyncIterator from contextlib import asynccontextmanager -from typing import AsyncIterator, Dict, Optional import httpx import uvicorn from fastapi import FastAPI -from fastapi.routing import APIRoute from fastapi.responses import JSONResponse +from fastapi.routing import APIRoute from pydantic import BaseModel, Field from api.routes import router from api.routes.common import close_providers -from middleware.openapi import install_custom_openapi -from services.security_service import InternalAuthMiddleware -from config import Settings, settings -from database import init_database, init_db, dispose_database +from config import LOGS_BACKEND_LOKI, METRICS_BACKEND_MIMIR, TRACES_BACKEND_TEMPO, Settings, settings +from database import dispose_database, init_database, init_db from datasources.exceptions import BackendStartupTimeout +from middleware.openapi import install_custom_openapi from services.rca_job_service import rca_job_service +from services.security_service import InternalAuthMiddleware logging.basicConfig( level=logging.INFO, @@ -42,7 +42,7 @@ log = logging.getLogger(__name__) _BACKEND_READY = False -_BACKEND_STATUS: Dict[str, str] = {} +_BACKEND_STATUS: dict[str, str] = {} OPENAPI_TAGS = [ {"name": "Health", "description": "Service and backend readiness endpoints."}, {"name": "RCA", "description": "Root cause analysis workflows and templates."}, @@ -75,7 +75,7 @@ def _generate_operation_id(route: APIRoute) -> str: class ResolverReadyResponse(BaseModel): ready: bool = Field(description="Whether resolver dependencies are currently ready.") - backends: Dict[str, str] = Field( + backends: dict[str, str] = Field( default_factory=dict, description="Per-backend readiness details keyed by backend name.", ) @@ -85,7 +85,7 @@ async def wait_for( name: str, url: str, timeout: float, - headers: Optional[Dict[str, str]] = None, + headers: dict[str, str] | None = None, accept_status: tuple[int, ...] = (200, 204, 404), ) -> None: deadline = time.monotonic() + timeout @@ -99,7 +99,7 @@ async def wait_for( log.info("%s ready (attempt %d, status %d)", name, attempt, resp.status_code) return log.debug("%s probe returned %d (attempt %d)", name, resp.status_code, attempt) - except (httpx.RequestError, asyncio.TimeoutError) as exc: + except (TimeoutError, httpx.RequestError) as exc: log.debug("%s not reachable (attempt %d): %s", name, attempt, exc) await asyncio.sleep(2) raise BackendStartupTimeout(f"{name} did not become ready within {timeout}s") @@ -109,12 +109,6 @@ async def _wait_for_all_bg(data_settings: Settings, tenant_id: str) -> None: scope = {"X-Scope-OrgID": tenant_id} checks: list[tuple[str, str, dict[str, str], tuple[int, ...]]] = [] - from config import ( - LOGS_BACKEND_LOKI, - METRICS_BACKEND_MIMIR, - TRACES_BACKEND_TEMPO, - ) - # Logs if data_settings.logs_backend == LOGS_BACKEND_LOKI: checks.append( diff --git a/middleware/openapi.py b/middleware/openapi.py index 677e341..02f6b5f 100644 --- a/middleware/openapi.py +++ b/middleware/openapi.py @@ -10,8 +10,8 @@ from __future__ import annotations -from http import HTTPStatus import re +from http import HTTPStatus from typing import Any from fastapi import FastAPI diff --git a/pyproject.toml b/pyproject.toml index e129697..40cda71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,12 @@ build-backend = "hatchling.build" [project] name = "resolver" -version = "0.0.2" +version = "0.0.3" description = "Resolver RCA and analysis service for the Observantio platform." readme = "README.md" requires-python = ">=3.11" license = "Apache-2.0" -authors = [ - { name = "Observantio" } -] +authors = [{ name = "Observantio" }] keywords = ["observability", "rca", "fastapi", "telemetry", "analysis"] classifiers = [ "Development Status :: 4 - Beta", @@ -20,23 +18,49 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.11", ] +dependencies = [ + "fastapi>=0.111.0", + "starlette>=0.49.1", + "uvicorn[standard]>=0.29.0", + "httpx>=0.27.0", + "pydantic>=2.7.0", + "pydantic-settings>=2.3.0", + "pyjwt>=2.10.0", + "numpy>=1.26.0", + "scipy>=1.13.0", + "scikit-learn>=1.5.0", + "sqlalchemy>=2.0.25", + "psycopg2-binary>=2.9.9", + "redis>=5.0.0", +] + +[project.urls] +Repository = "https://github.com/observantio/resolver" +Documentation = "https://observantio.github.io/pitch" +Changelog = "https://github.com/observantio/resolver/blob/main/CHANGELOG.md" [project.optional-dependencies] dev = [ "mypy>=1.18.2", - "types-PyYAML>=6.0.12.20250915", "pylint>=3.3.0", + "ruff>=0.9.0", "pytest>=9.0.0", "pytest-asyncio>=1.3.0", "pytest-cov>=7.0.0", ] +schemathesis = [ + "schemathesis>=4.14.2,<5", +] + [tool.pytest.ini_options] minversion = "9.0" testpaths = ["tests"] pythonpath = ["."] +asyncio_mode = "auto" filterwarnings = [ - "ignore::pytest.PytestRemovedIn9Warning", + "ignore::DeprecationWarning:pydantic\\..*", + "ignore::DeprecationWarning:starlette\\..*", ] [tool.coverage.run] @@ -85,13 +109,26 @@ omit = [ ] [tool.coverage.report] -exclude_also = [ - "if TYPE_CHECKING:", -] +exclude_also = ["if TYPE_CHECKING:"] fail_under = 100 show_missing = true skip_covered = false +[tool.ruff] +line-length = 120 +target-version = "py311" + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +line-ending = "auto" + +[tool.ruff.lint] +select = ["E", "F", "I", "UP", "B", "SIM", "N", "PTH", "RUF", "ERA", "T20"] + +[tool.ruff.lint.isort] +known-first-party = ["resolver"] + [tool.mypy] python_version = "3.11" pretty = true @@ -113,51 +150,25 @@ disallow_any_generics = true no_implicit_optional = true implicit_reexport = false explicit_package_bases = true -mypy_path = ["."] files = ["."] exclude = [ - "^(tests/|build/|dist/|venv/|\\.venv/|__pycache__/|migrations/)", + "^(build|dist|venv|\\.venv|__pycache__)/", + "^tests/", ] -[[tool.mypy.overrides]] -module = ["tests.*"] -disallow_untyped_defs = false -disallow_incomplete_defs = false -check_untyped_defs = true - -[[tool.mypy.overrides]] -module = ["*.migrations.*"] -ignore_errors = true - -[[tool.mypy.overrides]] -module = ["requests.*", "setuptools.*"] -ignore_missing_imports = true - [tool.pylint.main] jobs = 1 +extension-pkg-allow-list = ["sqlalchemy"] ignore = [".git", "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache", ".venv", "venv", "build", "dist", "tmp", "vendor", "tests"] -init-hook = "import os,sys; cwd=os.getcwd(); sys.path.insert(0,cwd) if cwd not in sys.path else None" +init-hook = "import os, sys; cwd = os.getcwd(); sys.path.insert(0, cwd) if cwd not in sys.path else None" [tool.pylint.messages_control] disable = [ "missing-module-docstring", "missing-class-docstring", "missing-function-docstring", - "too-many-arguments", - "too-many-positional-arguments", "too-many-locals", "too-many-branches", - "protected-access", - "unused-argument", - "logging-fstring-interpolation", - "unnecessary-dunder-call", - "not-callable", - "import-error", - "no-name-in-module", - "no-member", - "import-outside-toplevel", - "wrong-import-order", - "ungrouped-imports", ] [tool.pylint.format] @@ -167,18 +178,17 @@ max-module-lines = 800 [tool.pylint.design] max-args = 6 max-positional-arguments = 6 -max-attributes = 200 -max-public-methods = 60 -max-returns = 30 -max-nested-blocks = 8 -max-statements = 250 +max-attributes = 35 +max-public-methods = 25 +max-returns = 10 +max-nested-blocks = 5 +max-statements = 80 min-public-methods = 0 [tool.pylint.basic] variable-naming-style = "snake_case" argument-naming-style = "snake_case" attr-naming-style = "snake_case" - const-naming-style = "UPPER_CASE" class-const-naming-style = "UPPER_CASE" @@ -186,5 +196,4 @@ class-const-naming-style = "UPPER_CASE" min-similarity-lines = 8 [tool.hatch.build.targets.wheel] -# Service layout does not include a top-level package named "resolver". -bypass-selection = true +packages = ["resolver"] diff --git a/services/analysis_config_service.py b/services/analysis_config_service.py index 4a8cb12..50cb2c8 100644 --- a/services/analysis_config_service.py +++ b/services/analysis_config_service.py @@ -11,10 +11,10 @@ import copy import math import sys +from collections.abc import AsyncIterator from contextlib import asynccontextmanager from dataclasses import dataclass from types import ModuleType -from typing import AsyncIterator, Optional import yaml from fastapi import HTTPException, status @@ -85,12 +85,12 @@ class _RequestOverrideModel(BaseModel): model_config = ConfigDict(extra="forbid") - step: Optional[str] = None - sensitivity: Optional[float] = Field(default=None, ge=1.0, le=6.0) - apdex_threshold_ms: Optional[float] = None - slo_target: Optional[float] = Field(default=None, ge=0.0, le=1.0) - correlation_window_seconds: Optional[float] = Field(default=None, ge=10.0, le=600.0) - forecast_horizon_seconds: Optional[float] = Field(default=None, ge=60.0, le=86400.0) + step: str | None = None + sensitivity: float | None = Field(default=None, ge=1.0, le=6.0) + apdex_threshold_ms: float | None = None + slo_target: float | None = Field(default=None, ge=0.0, le=1.0) + correlation_window_seconds: float | None = Field(default=None, ge=10.0, le=600.0) + forecast_horizon_seconds: float | None = Field(default=None, ge=60.0, le=86400.0) class _ConfigDocumentModel(BaseModel): @@ -302,7 +302,7 @@ def prepare_request( self, req: AnalyzeRequest, *, - explicit_fields: Optional[set[str]] = None, + explicit_fields: set[str] | None = None, ) -> PreparedAnalysisRequest: document = self._parse_document(getattr(req, "config_yaml", None)) request_updates = document.request.model_dump(exclude_none=True) diff --git a/services/analyze_service.py b/services/analyze_service.py index 7237afa..f23d612 100644 --- a/services/analyze_service.py +++ b/services/analyze_service.py @@ -17,7 +17,8 @@ def get_provider(tenant_id: str) -> DataSourceProvider: - from api.routes.common import get_provider as route_get_provider + # Local import avoids circular deps: api.routes → analyze → analyze_service → api.routes.common + from api.routes.common import get_provider as route_get_provider # pylint: disable=import-outside-toplevel return route_get_provider(tenant_id) diff --git a/services/rca_job_service.py b/services/rca_job_service.py index 0218190..10cd2af 100644 --- a/services/rca_job_service.py +++ b/services/rca_job_service.py @@ -15,22 +15,21 @@ import json import logging import uuid -from datetime import datetime, timedelta, timezone -from typing import Optional +from datetime import UTC, datetime, timedelta from fastapi import HTTPException, status from sqlalchemy import and_, or_, select +from api.requests import AnalyzeRequest from api.responses import JobStatus from api.responses.jobs import AnalyzeJobSummary as JobView -from api.requests import AnalyzeRequest -from services.analyze_service import run_analysis -from services.analysis_config_service import analysis_config_service -from services.security_service import InternalContext from config import settings from custom_types.json import JSONDict from database import get_db_session from db_models import RcaJob, RcaReport +from services.analysis_config_service import analysis_config_service +from services.analyze_service import run_analysis +from services.security_service import InternalContext _JOB_EXECUTION_ERRORS = ( asyncio.TimeoutError, @@ -44,18 +43,18 @@ def _utcnow() -> datetime: - return datetime.now(timezone.utc) + return datetime.now(UTC) def _coerce_datetime(value: object) -> datetime: if isinstance(value, datetime): if value.tzinfo is None: - return value.replace(tzinfo=timezone.utc) + return value.replace(tzinfo=UTC) return value return _utcnow() -def _coerce_optional_datetime(value: object) -> Optional[datetime]: +def _coerce_optional_datetime(value: object) -> datetime | None: if value is None: return None return _coerce_datetime(value) @@ -89,7 +88,7 @@ def _encode_cursor(*, created_at: datetime, job_id: str) -> str: return base64.urlsafe_b64encode(raw).decode("ascii") -def _decode_cursor(cursor: Optional[str]) -> tuple[Optional[datetime], Optional[str]]: +def _decode_cursor(cursor: str | None) -> tuple[datetime | None, str | None]: if not cursor: return None, None try: @@ -97,7 +96,7 @@ def _decode_cursor(cursor: Optional[str]) -> tuple[Optional[datetime], Optional[ payload = json.loads(raw) created_at = datetime.fromisoformat(str(payload.get("created_at"))) if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) + created_at = created_at.replace(tzinfo=UTC) job_id = str(payload.get("job_id") or "").strip() if not job_id: return None, None @@ -244,7 +243,7 @@ async def _run_job(self, *, job_id: str) -> None: async with self._lock: self._tasks.pop(job_id, None) - def _get_job_row(self, job_id: str) -> Optional[RcaJob]: + def _get_job_row(self, job_id: str) -> RcaJob | None: with get_db_session() as db: return db.get(RcaJob, job_id) @@ -314,11 +313,11 @@ async def list_jobs( self, *, ctx: InternalContext, - status_filter: Optional[JobStatus], + status_filter: JobStatus | None, limit: int, - cursor: Optional[str], - ) -> tuple[list[JobView], Optional[str]]: - def _list() -> tuple[list[JobView], Optional[str]]: + cursor: str | None, + ) -> tuple[list[JobView], str | None]: + def _list() -> tuple[list[JobView], str | None]: with get_db_session() as db: page_size = max(1, min(100, int(limit))) stmt = select(RcaJob).where( @@ -364,8 +363,8 @@ def _get() -> JobView: return await asyncio.to_thread(_get) - async def get_job_result(self, *, job_id: str, ctx: InternalContext) -> tuple[JobView, Optional[JSONDict]]: - def _get() -> tuple[JobView, Optional[JSONDict]]: + async def get_job_result(self, *, job_id: str, ctx: InternalContext) -> tuple[JobView, JSONDict | None]: + def _get() -> tuple[JobView, JSONDict | None]: with get_db_session() as db: row = db.get(RcaJob, job_id) if row is None or row.status == JobStatus.DELETED.value: @@ -381,8 +380,8 @@ def _get() -> tuple[JobView, Optional[JSONDict]]: return await asyncio.to_thread(_get) - async def get_report(self, *, report_id: str, ctx: InternalContext) -> tuple[JobView, Optional[JSONDict]]: - def _get() -> tuple[JobView, Optional[JSONDict]]: + async def get_report(self, *, report_id: str, ctx: InternalContext) -> tuple[JobView, JSONDict | None]: + def _get() -> tuple[JobView, JSONDict | None]: with get_db_session() as db: report = db.get(RcaReport, report_id) if report is None: @@ -401,7 +400,7 @@ def _get() -> tuple[JobView, Optional[JSONDict]]: return await asyncio.to_thread(_get) async def delete_report(self, *, report_id: str, ctx: InternalContext) -> None: - task_to_cancel: Optional[asyncio.Task[None]] = None + task_to_cancel: asyncio.Task[None] | None = None def _delete() -> str: with get_db_session() as db: diff --git a/services/security_service.py b/services/security_service.py index cb0496d..20a8bab 100644 --- a/services/security_service.py +++ b/services/security_service.py @@ -8,14 +8,14 @@ from __future__ import annotations -from collections.abc import Awaitable, Callable -from contextvars import ContextVar, Token -from dataclasses import dataclass -from hmac import compare_digest import logging import threading import time -from typing import Mapping, Optional, TypeVar +from collections.abc import Awaitable, Callable, Mapping +from contextvars import ContextVar, Token +from dataclasses import dataclass +from hmac import compare_digest +from typing import TypeVar import jwt from fastapi import HTTPException, Request, status @@ -26,7 +26,7 @@ from config import ALLOWED_CONTEXT_ALGORITHMS, settings -_context_var: ContextVar["InternalContext | None"] = ContextVar("resolver_internal_context", default=None) +_context_var: ContextVar[InternalContext | None] = ContextVar("resolver_internal_context", default=None) log = logging.getLogger(__name__) _jti_seen_lock = threading.Lock() _jti_seen_cache: dict[str, float] = {} @@ -160,7 +160,7 @@ def get_internal_context() -> InternalContext | None: return _context_var.get() -def get_context_tenant(default_tenant: Optional[str] = None) -> str: +def get_context_tenant(default_tenant: str | None = None) -> str: ctx = get_internal_context() if ctx: return ctx.tenant_id diff --git a/store/__init__.py b/store/__init__.py index 867eb6e..da913b5 100644 --- a/store/__init__.py +++ b/store/__init__.py @@ -8,16 +8,16 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from store.client import redis_get, redis_set, redis_delete, is_using_fallback -from store import baseline, weights, granger, events +from store import baseline, events, granger, weights +from store.client import is_using_fallback, redis_delete, redis_get, redis_set __all__ = [ + "baseline", + "events", + "granger", + "is_using_fallback", + "redis_delete", "redis_get", "redis_set", - "redis_delete", - "is_using_fallback", - "baseline", "weights", - "granger", - "events", ] diff --git a/store/baseline.py b/store/baseline.py index b9c9d44..a690d92 100644 --- a/store/baseline.py +++ b/store/baseline.py @@ -13,12 +13,11 @@ import json import logging from json import JSONDecodeError -from typing import List, Optional -from engine.baseline.compute import Baseline, compute -from store.client import redis_get, redis_set from config import BASELINE_TTL, BLEND_ALPHA +from engine.baseline.compute import Baseline, compute from store import keys +from store.client import redis_get, redis_set log = logging.getLogger(__name__) @@ -62,7 +61,7 @@ def _blend(cached: Baseline, fresh: Baseline) -> Baseline: ) -async def load(tenant_id: str, metric_name: str) -> Optional[Baseline]: +async def load(tenant_id: str, metric_name: str) -> Baseline | None: try: raw = await redis_get(keys.baseline(tenant_id, metric_name)) if raw: @@ -82,8 +81,8 @@ async def save(tenant_id: str, metric_name: str, baseline: Baseline) -> None: async def compute_and_persist( tenant_id: str, metric_name: str, - ts: List[float], - vals: List[float], + ts: list[float], + vals: list[float], z_threshold: float = 3.0, ) -> Baseline: fresh = compute(ts, vals, z_threshold=z_threshold) diff --git a/store/client.py b/store/client.py index ad0e4fd..5368ee9 100644 --- a/store/client.py +++ b/store/client.py @@ -12,11 +12,14 @@ import asyncio import fnmatch -from importlib import import_module import logging import time +from collections.abc import AsyncIterator +from importlib import import_module from types import ModuleType -from typing import AsyncIterator, Optional, Protocol, cast +from typing import Protocol, cast + +from config import REDIS_URL, settings RedisError: type[Exception] = OSError _redis_exceptions_module: ModuleType | None @@ -41,7 +44,7 @@ async def execute(self) -> object: ... class RedisClientProtocol(Protocol): async def ping(self) -> object: ... - async def get(self, key: str) -> Optional[str]: ... + async def get(self, key: str) -> str | None: ... async def setex(self, key: str, ttl: int, value: str) -> object: ... async def set(self, key: str, value: str) -> object: ... async def delete(self, key: str) -> object: ... @@ -50,26 +53,23 @@ async def lrange(self, key: str, start: int, end: int) -> list[str]: ... def scan_iter(self, pattern: str) -> AsyncIterator[str]: ... -_REDIS_CLIENT: Optional[RedisClientProtocol] = None +_REDIS_CLIENT: RedisClientProtocol | None = None _fallback: dict[str, str] = {} _fallback_lists: dict[str, list[str]] = {} _USING_FALLBACK = False _init_lock = asyncio.Lock() _RETRY_AFTER_MONOTONIC: float = 0.0 +_REDIS_OP_TIMEOUT_SECONDS = 0.5 try: - from config import settings - _MAX_FALLBACK_SIZE = int(settings.store_fallback_max_items) _REDIS_RETRY_COOLDOWN_SECONDS = float(settings.store_redis_retry_cooldown_seconds) - _REDIS_OP_TIMEOUT_SECONDS = 0.5 except (ImportError, AttributeError, TypeError, ValueError): _MAX_FALLBACK_SIZE = 10_000 _REDIS_RETRY_COOLDOWN_SECONDS = 10.0 - _REDIS_OP_TIMEOUT_SECONDS = 0.5 -async def get_redis() -> Optional[RedisClientProtocol]: +async def get_redis() -> RedisClientProtocol | None: if _REDIS_CLIENT is not None: return _REDIS_CLIENT if time.monotonic() < _RETRY_AFTER_MONOTONIC: @@ -79,7 +79,6 @@ async def get_redis() -> Optional[RedisClientProtocol]: async with _init_lock: try: aioredis = import_module("redis.asyncio") - from config import REDIS_URL client = cast( RedisClientProtocol, @@ -96,7 +95,7 @@ async def get_redis() -> Optional[RedisClientProtocol]: globals()["_USING_FALLBACK"] = False log.info("Redis connected: %s", REDIS_URL) return _REDIS_CLIENT - except (ImportError, ModuleNotFoundError, RedisError, asyncio.TimeoutError, OSError) as exc: + except (TimeoutError, ImportError, ModuleNotFoundError, RedisError, OSError) as exc: globals()["_RETRY_AFTER_MONOTONIC"] = time.monotonic() + max(0.0, _REDIS_RETRY_COOLDOWN_SECONDS) if not _USING_FALLBACK: log.warning("Redis unavailable (%s) — using in-memory fallback", exc) @@ -104,18 +103,18 @@ async def get_redis() -> Optional[RedisClientProtocol]: return None -async def redis_get(key: str) -> Optional[str]: +async def redis_get(key: str) -> str | None: client = await get_redis() if client is None: return _fallback.get(key) try: return await asyncio.wait_for(client.get(key), timeout=_REDIS_OP_TIMEOUT_SECONDS) - except (RedisError, asyncio.TimeoutError, OSError) as exc: + except (TimeoutError, RedisError, OSError) as exc: log.debug("Redis GET error %s: %s", key, exc) return _fallback.get(key) -async def redis_set(key: str, value: str, ttl: Optional[int] = None) -> None: +async def redis_set(key: str, value: str, ttl: int | None = None) -> None: client = await get_redis() if client is None: if len(_fallback) < _MAX_FALLBACK_SIZE: @@ -126,7 +125,7 @@ async def redis_set(key: str, value: str, ttl: Optional[int] = None) -> None: await asyncio.wait_for(client.setex(key, ttl, value), timeout=_REDIS_OP_TIMEOUT_SECONDS) else: await asyncio.wait_for(client.set(key, value), timeout=_REDIS_OP_TIMEOUT_SECONDS) - except (RedisError, asyncio.TimeoutError, OSError) as exc: + except (TimeoutError, RedisError, OSError) as exc: log.debug("Redis SET error %s: %s", key, exc) if len(_fallback) < _MAX_FALLBACK_SIZE: _fallback[key] = value @@ -140,13 +139,13 @@ async def redis_delete(key: str) -> None: return try: await asyncio.wait_for(client.delete(key), timeout=_REDIS_OP_TIMEOUT_SECONDS) - except (RedisError, asyncio.TimeoutError, OSError) as exc: + except (TimeoutError, RedisError, OSError) as exc: log.debug("Redis DEL error %s: %s", key, exc) _fallback.pop(key, None) _fallback_lists.pop(key, None) -async def redis_rpush(key: str, value: str, ttl: Optional[int] = None, max_len: Optional[int] = None) -> None: +async def redis_rpush(key: str, value: str, ttl: int | None = None, max_len: int | None = None) -> None: client = await get_redis() if client is None: lst = _fallback_lists.setdefault(key, []) @@ -162,7 +161,7 @@ async def redis_rpush(key: str, value: str, ttl: Optional[int] = None, max_len: if ttl: pipe.expire(key, ttl) await asyncio.wait_for(pipe.execute(), timeout=_REDIS_OP_TIMEOUT_SECONDS) - except (RedisError, asyncio.TimeoutError, OSError) as exc: + except (TimeoutError, RedisError, OSError) as exc: log.debug("Redis RPUSH error %s: %s", key, exc) lst = _fallback_lists.setdefault(key, []) lst.append(value) @@ -176,7 +175,7 @@ async def redis_lrange(key: str) -> list[str]: return list(_fallback_lists.get(key, [])) try: return await asyncio.wait_for(client.lrange(key, 0, -1), timeout=_REDIS_OP_TIMEOUT_SECONDS) - except (RedisError, asyncio.TimeoutError, OSError) as exc: + except (TimeoutError, RedisError, OSError) as exc: log.debug("Redis LRANGE error %s: %s", key, exc) return list(_fallback_lists.get(key, [])) @@ -191,7 +190,7 @@ async def _scan_keys() -> list[str]: return [key async for key in client.scan_iter(pattern)] return await asyncio.wait_for(_scan_keys(), timeout=1.0) - except (RedisError, asyncio.TimeoutError, OSError) as exc: + except (TimeoutError, RedisError, OSError) as exc: log.debug("Redis SCAN error %s: %s", pattern, exc) return [k for k in _fallback if fnmatch.fnmatch(k, pattern)] diff --git a/store/events.py b/store/events.py index 629c7c6..3e65ef2 100644 --- a/store/events.py +++ b/store/events.py @@ -13,12 +13,12 @@ import json import logging from json import JSONDecodeError -from typing import List, TypedDict +from typing import TypedDict -from engine.events.models import DeploymentEvent -from store.client import redis_lrange, redis_rpush, redis_delete from config import EVENTS_TTL +from engine.events.models import DeploymentEvent from store import keys +from store.client import redis_delete, redis_lrange, redis_rpush log = logging.getLogger(__name__) @@ -90,7 +90,7 @@ def _serialise(event: DeploymentEvent) -> str: ) -async def load(tenant_id: str) -> List[StoredEvent]: +async def load(tenant_id: str) -> list[StoredEvent]: try: items = await redis_lrange(keys.events(tenant_id)) events: list[StoredEvent] = [] diff --git a/store/granger.py b/store/granger.py index b471951..23bf20a 100644 --- a/store/granger.py +++ b/store/granger.py @@ -14,12 +14,12 @@ import json import logging from json import JSONDecodeError -from typing import Dict, List, TypedDict +from typing import TypedDict -from engine.causal.granger import GrangerResult -from store.client import redis_get, redis_set from config import GRANGER_TTL +from engine.causal.granger import GrangerResult from store import keys +from store.client import redis_get, redis_set log = logging.getLogger(__name__) @@ -90,7 +90,7 @@ def _coerce_record(value: object) -> GrangerRecord | None: } -async def load(tenant_id: str, service: str) -> List[GrangerRecord]: +async def load(tenant_id: str, service: str) -> list[GrangerRecord]: try: raw = await redis_get(keys.granger(tenant_id, service)) if raw: @@ -112,10 +112,10 @@ async def save_and_merge( tenant_id: str, service: str, fresh_results: list[GrangerResult], -) -> List[GrangerRecord]: +) -> list[GrangerRecord]: cached = await load(tenant_id, service) - stored: Dict[str, GrangerRecord] = {_pair_key(r["cause_metric"], r["effect_metric"]): r for r in cached} + stored: dict[str, GrangerRecord] = {_pair_key(r["cause_metric"], r["effect_metric"]): r for r in cached} for r in fresh_results: pk = _pair_key(r.cause_metric, r.effect_metric) existing = stored.get(pk) @@ -138,9 +138,9 @@ async def save_and_merge( return merged -async def load_all_services(tenant_id: str, services: List[str]) -> List[GrangerRecord]: +async def load_all_services(tenant_id: str, services: list[str]) -> list[GrangerRecord]: per_service = await asyncio.gather(*[load(tenant_id, svc) for svc in services]) - all_results: Dict[str, GrangerRecord] = {} + all_results: dict[str, GrangerRecord] = {} for svc_results in per_service: for r in svc_results: pk = _pair_key(r["cause_metric"], r["effect_metric"]) diff --git a/store/weights.py b/store/weights.py index 35ac54e..3d7e356 100644 --- a/store/weights.py +++ b/store/weights.py @@ -13,12 +13,12 @@ import json import logging from json import JSONDecodeError -from typing import Dict, Optional, TypedDict +from typing import TypedDict -from custom_types.json import is_json_object -from store.client import redis_get, redis_set, redis_delete from config import WEIGHTS_TTL +from custom_types.json import is_json_object from store import keys +from store.client import redis_delete, redis_get, redis_set log = logging.getLogger(__name__) @@ -28,7 +28,7 @@ class StoredWeights(TypedDict): update_count: int -async def load(tenant_id: str) -> Optional[StoredWeights]: +async def load(tenant_id: str) -> StoredWeights | None: try: raw = await redis_get(keys.weights(tenant_id)) if raw: @@ -58,7 +58,7 @@ async def load(tenant_id: str) -> Optional[StoredWeights]: return None -async def save(tenant_id: str, weight_map: Dict[str, float], update_count: int) -> None: +async def save(tenant_id: str, weight_map: dict[str, float], update_count: int) -> None: payload = {"weights": weight_map, "update_count": update_count} try: await redis_set(keys.weights(tenant_id), json.dumps(payload), ttl=WEIGHTS_TTL) diff --git a/tests/conftest.py b/tests/conftest.py index 57b369e..99ccf30 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,6 +10,7 @@ import os import sys + import pytest ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) @@ -22,6 +23,7 @@ @pytest.fixture(autouse=True) def clear_fallback(monkeypatch): import store.client as client + _fallback.clear() client._REDIS_CLIENT = None client._USING_FALLBACK = False @@ -40,10 +42,10 @@ async def fake_delete(key: str): monkeypatch.setattr(client, "redis_set", fake_set) monkeypatch.setattr(client, "redis_delete", fake_delete) - import store.weights as wstore import store.baseline as bstore - import store.granger as gstore import store.events as estore + import store.granger as gstore + import store.weights as wstore for mod in (wstore, bstore, gstore, estore): for name in ("redis_get", "redis_set", "redis_delete"): diff --git a/tests/test_analysis_config_service.py b/tests/test_analysis_config_service.py index d599199..6eab43d 100644 --- a/tests/test_analysis_config_service.py +++ b/tests/test_analysis_config_service.py @@ -238,9 +238,9 @@ async def fake_run(provider, req): "severity_weights": {"low": 1, "medium": 5, "high": 7, "critical": 11}, "mad_threshold": 9.5, } - assert analyzer_module.DEFAULT_METRIC_QUERIES == original_metric_queries - assert registry_module.DEFAULT_WEIGHTS == original_default_weights - assert enums_module.SEVERITY_WEIGHTS == original_severity_weights + assert original_metric_queries == analyzer_module.DEFAULT_METRIC_QUERIES + assert original_default_weights == registry_module.DEFAULT_WEIGHTS + assert original_severity_weights == enums_module.SEVERITY_WEIGHTS assert analyzer_module.settings.mad_threshold == original_mad_threshold diff --git a/tests/test_analyze_helpers_filters_edges.py b/tests/test_analyze_helpers_filters_edges.py index 0c3e48c..46f272c 100644 --- a/tests/test_analyze_helpers_filters_edges.py +++ b/tests/test_analyze_helpers_filters_edges.py @@ -16,9 +16,9 @@ import pytest from api.requests import AnalyzeRequest -from api.responses import LogBurst, LogPattern, MetricAnomaly, RootCause as RootCauseModel -from engine.analyze import filters -from engine.analyze import helpers +from api.responses import LogBurst, LogPattern, MetricAnomaly +from api.responses import RootCause as RootCauseModel +from engine.analyze import filters, helpers from engine.changepoint import ChangePoint from engine.enums import ChangeType, Severity, Signal @@ -217,7 +217,9 @@ def test_signal_scoring_and_periodic_detection_branches(monkeypatch): assert helpers._is_strongly_periodic_log_bursts([]) is False non_periodic = [ - LogBurst(window_start=v, window_end=v + 5.0, rate_per_second=1.0, baseline_rate=0.1, ratio=10, severity=Severity.LOW) + LogBurst( + window_start=v, window_end=v + 5.0, rate_per_second=1.0, baseline_rate=0.1, ratio=10, severity=Severity.LOW + ) for v in [1.0, 2.0, 2.0, 3.0] ] assert helpers._is_strongly_periodic_log_bursts(non_periodic) is False @@ -328,7 +330,9 @@ async def _fake_fetch_metrics(*_args, **_kwargs): monkeypatch.setattr(helpers, "fetch_metrics", _fake_fetch_metrics) monkeypatch.setattr(helpers, "_normalize_services", lambda _s: {"svc"}) - monkeypatch.setattr(helpers, "_filter_metric_response_by_services", lambda resp, _svc: resp if resp != {"bad": 1} else []) + monkeypatch.setattr( + helpers, "_filter_metric_response_by_services", lambda resp, _svc: resp if resp != {"bad": 1} else [] + ) def _iter_series(resp, query_hint=None): if query_hint == "q1": @@ -336,7 +340,11 @@ def _iter_series(resp, query_hint=None): return iter([("m2", [1.0, 2.0], [20.0, 21.0])]) monkeypatch.setattr(helpers.anomaly, "iter_series", _iter_series) - monkeypatch.setattr(helpers, "compute_series_distribution_stats", lambda sk, mn, vals: None if mn == "m1" else SimpleNamespace(series_key=sk)) + monkeypatch.setattr( + helpers, + "compute_series_distribution_stats", + lambda sk, mn, vals: None if mn == "m1" else SimpleNamespace(series_key=sk), + ) async def _process_one(req, query_string, metric_name, ts, vals, z_threshold, analysis_window_seconds): if metric_name == "m1": @@ -468,40 +476,71 @@ def test_periodic_and_log_burst_filter_remaining_branches(monkeypatch): # median out of accepted range (>180) wide = [ - LogBurst(window_start=v, window_end=v + 5.0, rate_per_second=1.0, baseline_rate=0.1, ratio=10, severity=Severity.LOW) + LogBurst( + window_start=v, window_end=v + 5.0, rate_per_second=1.0, baseline_rate=0.1, ratio=10, severity=Severity.LOW + ) for v in [0.0, 500.0, 1000.0, 1500.0] ] assert helpers._is_strongly_periodic_log_bursts(wide) is False # high coefficient of variation noisy = [ - LogBurst(window_start=v, window_end=v + 5.0, rate_per_second=1.0, baseline_rate=0.1, ratio=10, severity=Severity.LOW) + LogBurst( + window_start=v, window_end=v + 5.0, rate_per_second=1.0, baseline_rate=0.1, ratio=10, severity=Severity.LOW + ) for v in [0.0, 20.0, 200.0, 230.0, 500.0] ] assert helpers._is_strongly_periodic_log_bursts(noisy) is False bursts = [ - LogBurst(window_start=1000.0 + i * 60.0, window_end=1005.0 + i * 60.0, rate_per_second=1.0, baseline_rate=0.1, ratio=10, severity=Severity.LOW) + LogBurst( + window_start=1000.0 + i * 60.0, + window_end=1005.0 + i * 60.0, + rate_per_second=1.0, + baseline_rate=0.1, + ratio=10, + severity=Severity.LOW, + ) for i in range(4) ] - patterns = [LogPattern(pattern="x", count=4, first_seen=1.0, last_seen=2.0, rate_per_minute=1.0, entropy=0.1, severity=Severity.LOW, sample="x")] + patterns = [ + LogPattern( + pattern="x", + count=4, + first_seen=1.0, + last_seen=2.0, + rate_per_minute=1.0, + entropy=0.1, + severity=Severity.LOW, + sample="x", + ) + ] suppression_counts: dict[str, int] = {} warnings: list[str] = [] monkeypatch.setattr(helpers.settings, "quality_gating_profile", "recall") - assert helpers._filter_log_bursts_for_precision_rca( - log_bursts=bursts, log_patterns=patterns, suppression_counts=suppression_counts, warnings=warnings - ) == bursts + assert ( + helpers._filter_log_bursts_for_precision_rca( + log_bursts=bursts, log_patterns=patterns, suppression_counts=suppression_counts, warnings=warnings + ) + == bursts + ) monkeypatch.setattr(helpers.settings, "quality_gating_profile", "precision_strict_v1") - assert helpers._filter_log_bursts_for_precision_rca( - log_bursts=bursts, log_patterns=[], suppression_counts=suppression_counts, warnings=warnings - ) == bursts + assert ( + helpers._filter_log_bursts_for_precision_rca( + log_bursts=bursts, log_patterns=[], suppression_counts=suppression_counts, warnings=warnings + ) + == bursts + ) # non-periodic path under precision + low severity patterns - assert helpers._filter_log_bursts_for_precision_rca( - log_bursts=noisy, log_patterns=patterns, suppression_counts=suppression_counts, warnings=warnings - ) == noisy + assert ( + helpers._filter_log_bursts_for_precision_rca( + log_bursts=noisy, log_patterns=patterns, suppression_counts=suppression_counts, warnings=warnings + ) + == noisy + ) @pytest.mark.asyncio diff --git a/tests/test_analyzer_quality.py b/tests/test_analyzer_quality.py index 8ff6009..c8d7749 100644 --- a/tests/test_analyzer_quality.py +++ b/tests/test_analyzer_quality.py @@ -13,7 +13,8 @@ from types import SimpleNamespace -from api.responses import LogBurst, LogPattern, MetricAnomaly, RootCause as RootCauseModel +from api.responses import LogBurst, LogPattern, MetricAnomaly +from api.responses import RootCause as RootCauseModel from engine.analyzer import ( _apply_precision_quality_gates, _build_log_query, diff --git a/tests/test_anomaly_detection.py b/tests/test_anomaly_detection.py index d7d4809..27b019f 100644 --- a/tests/test_anomaly_detection.py +++ b/tests/test_anomaly_detection.py @@ -10,19 +10,18 @@ """ import numpy as np -import pytest from api.responses import MetricAnomaly from engine.anomaly.detection import ( _apply_density_cap, + _change_type, + _compress_runs, + _cusum_changepoints, _iqr_score_value, _mad_scores, - _cusum_changepoints, - _change_type, - _severity, _series_median_iqr, + _severity, _tukey_outlier_class, - _compress_runs, detect, ) from engine.enums import ChangeType, Severity diff --git a/tests/test_anomaly_series.py b/tests/test_anomaly_series.py index ffc8e56..00f1749 100644 --- a/tests/test_anomaly_series.py +++ b/tests/test_anomaly_series.py @@ -9,10 +9,10 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.anomaly.series import iter_series +from engine.anomaly.series import MetricRecord, iter_series -def _resp(metric: dict, values: list[list[object]]) -> dict: +def _resp(metric: MetricRecord, values: list[list[object]]) -> dict[str, object]: return { "status": "success", "data": { diff --git a/tests/test_anomaly_stats.py b/tests/test_anomaly_stats.py index f7a9988..bf50d36 100644 --- a/tests/test_anomaly_stats.py +++ b/tests/test_anomaly_stats.py @@ -76,10 +76,10 @@ def test_compute_series_distribution_stats_typical_series(): def test_sample_moment_edge_cases_for_skewness_and_kurtosis(): - from engine.anomaly.stats import _sample_skewness, _sample_excess_kurtosis - import numpy as np + from engine.anomaly.stats import _sample_excess_kurtosis, _sample_skewness + assert _sample_skewness(np.array([1.0, 1.0])) == 0.0 assert _sample_skewness(np.array([1.0, 1.0, 1.0])) == 0.0 assert _sample_excess_kurtosis(np.array([1.0, 1.0, 1.0])) == 0.0 diff --git a/tests/test_api_models.py b/tests/test_api_models.py index ac7b41b..0c689ec 100644 --- a/tests/test_api_models.py +++ b/tests/test_api_models.py @@ -10,16 +10,16 @@ """ import pytest +from pydantic import ValidationError from api.requests import AnalyzeRequest, CorrelateRequest, DeploymentEventRequest, SloRequest -from pydantic import ValidationError def test_deployment_request_requires_tenant(): req = DeploymentEventRequest(tenant_id="t1", service="s", timestamp=1.0, version="v1") assert req.tenant_id == "t1" with pytest.raises(ValidationError): - DeploymentEventRequest(service="s", timestamp=1.0, version="v1") + DeploymentEventRequest.model_validate({"service": "s", "timestamp": 1.0, "version": "v1"}) def test_time_range_validations(): diff --git a/tests/test_api_route_surface_edges.py b/tests/test_api_route_surface_edges.py index cd66330..705d029 100644 --- a/tests/test_api_route_surface_edges.py +++ b/tests/test_api_route_surface_edges.py @@ -17,8 +17,8 @@ import numpy as np import pytest -from api.requests import ChangepointRequest, CorrelateRequest, LogRequest, MetricRequest -from api.requests import TraceRequest +import main as app_main +from api.requests import ChangepointRequest, CorrelateRequest, LogRequest, MetricRequest, TraceRequest from api.responses.base import NpModel, _coerce from api.routes import correlation as correlation_route from api.routes import events as events_route @@ -27,7 +27,6 @@ from api.routes import metrics as metrics_route from api.routes import traces as traces_route from custom_types import json as json_types -import main as app_main class DemoModel(NpModel): diff --git a/tests/test_api_routes_causal.py b/tests/test_api_routes_causal.py index 273370f..d8becd2 100644 --- a/tests/test_api_routes_causal.py +++ b/tests/test_api_routes_causal.py @@ -13,8 +13,8 @@ import pytest -from api.routes import causal as causal_route from api.requests import CorrelateRequest +from api.routes import causal as causal_route class DummyProvider: diff --git a/tests/test_api_routes_correlation.py b/tests/test_api_routes_correlation.py index 505dcb1..b4bba8a 100644 --- a/tests/test_api_routes_correlation.py +++ b/tests/test_api_routes_correlation.py @@ -8,8 +8,8 @@ import pytest -from api.routes import correlation as corr_route from api.requests import CorrelateRequest +from api.routes import correlation as corr_route class DummyState: diff --git a/tests/test_api_routes_events.py b/tests/test_api_routes_events.py index 8651178..7efbdab 100644 --- a/tests/test_api_routes_events.py +++ b/tests/test_api_routes_events.py @@ -11,8 +11,8 @@ import pytest -from api.routes import events as events_route from api.requests import DeploymentEventRequest +from api.routes import events as events_route class DummyRegistry: diff --git a/tests/test_api_routes_slo.py b/tests/test_api_routes_slo.py index 0f9f506..e898ce4 100644 --- a/tests/test_api_routes_slo.py +++ b/tests/test_api_routes_slo.py @@ -13,8 +13,8 @@ import pytest -from api.routes import slo as slo_route from api.requests import SloRequest +from api.routes import slo as slo_route from config import settings diff --git a/tests/test_changepoint.py b/tests/test_changepoint.py index 030cbc0..efa9e08 100644 --- a/tests/test_changepoint.py +++ b/tests/test_changepoint.py @@ -9,8 +9,8 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -import pytest import numpy as np +import pytest from config import settings from engine.changepoint.cusum import _detect_oscillation, detect diff --git a/tests/test_config_database_and_engine_edges.py b/tests/test_config_database_and_engine_edges.py index 22743c0..75dd315 100644 --- a/tests/test_config_database_and_engine_edges.py +++ b/tests/test_config_database_and_engine_edges.py @@ -11,20 +11,19 @@ import importlib import os import sys -from typing import Any, cast from types import SimpleNamespace +from typing import Any, cast from unittest.mock import patch import pytest +import database as database_module from api.requests import AnalyzeRequest from api.routes import causal as causal_route -from engine.causal.granger import GrangerResult from engine.dedup.grouping import group_metric_anomalies from engine.enums import ChangeType, Severity from engine.events.models import DeploymentEvent from engine.fetcher import _extract_metric_names, _scrape_and_fill, fetch_metrics -import database as database_module baseline_compute_module = importlib.import_module("engine.baseline.compute") @@ -95,27 +94,23 @@ def test_config_security_validation_edges(): env = _base_production_env() env["RESOLVER_DATABASE_URL"] = "" - with patch.dict(os.environ, env, clear=False): - with pytest.raises(ValueError, match="DATABASE_URL"): - _reload_config_module() + with patch.dict(os.environ, env, clear=False), pytest.raises(ValueError, match="DATABASE_URL"): + _reload_config_module() env = _base_production_env() env["RESOLVER_CONTEXT_REPLAY_TTL_SECONDS"] = "0" - with patch.dict(os.environ, env, clear=False): - with pytest.raises(ValueError, match="REPLAY_TTL"): - _reload_config_module() + with patch.dict(os.environ, env, clear=False), pytest.raises(ValueError, match="REPLAY_TTL"): + _reload_config_module() env = _base_production_env() env["RESOLVER_CONTEXT_ISSUER"] = "" - with patch.dict(os.environ, env, clear=False): - with pytest.raises(ValueError, match="CONTEXT_ISSUER"): - _reload_config_module() + with patch.dict(os.environ, env, clear=False), pytest.raises(ValueError, match="CONTEXT_ISSUER"): + _reload_config_module() env = _base_production_env() env["RESOLVER_CONTEXT_AUDIENCE"] = "" - with patch.dict(os.environ, env, clear=False): - with pytest.raises(ValueError, match="CONTEXT_AUDIENCE"): - _reload_config_module() + with patch.dict(os.environ, env, clear=False), pytest.raises(ValueError, match="CONTEXT_AUDIENCE"): + _reload_config_module() def test_config_secret_helpers_cover_strong_secret_path(): @@ -253,9 +248,8 @@ def test_database_setup_session_and_connection_paths(monkeypatch): database_module._ensure_postgres_database_exists("sqlite:///tmp.db") database_module._ensure_postgres_database_exists("postgresql://user:pass@db") - with pytest.raises(RuntimeError, match="Database not initialized"): - with database_module.get_db_session(): - pass + with pytest.raises(RuntimeError, match="Database not initialized"), database_module.get_db_session(): + pass with pytest.raises(RuntimeError, match="Database not initialized"): database_module.init_db() @@ -329,7 +323,7 @@ def fake_create_engine(url, **kwargs): database_module.dispose_database() monkeypatch.setattr(database_module, "_ensure_postgres_database_exists", lambda url: ensure_calls.append(url)) monkeypatch.setattr(database_module, "create_engine", lambda url, **kwargs: fake_engine) - monkeypatch.setattr(database_module, "sessionmaker", lambda **kwargs: (lambda: fake_session)) + monkeypatch.setattr(database_module, "sessionmaker", lambda **kwargs: lambda: fake_session) monkeypatch.setattr( database_module.Base.metadata, "create_all", lambda bind: ensure_calls.append(("create_all", bind)) ) @@ -342,9 +336,8 @@ def fake_create_engine(url, **kwargs): assert session is fake_session assert ensure_calls[-2:] == ["commit", "close"] - with pytest.raises(RuntimeError, match="boom"): - with database_module.get_db_session(): - raise RuntimeError("boom") + with pytest.raises(RuntimeError, match="boom"), database_module.get_db_session(): + raise RuntimeError("boom") assert ensure_calls[-2:] == ["rollback", "close"] database_module.init_db() @@ -374,7 +367,6 @@ def dispose(self) -> None: database_module._ENGINE = cast(Any, _DisposableEngine()) database_module._SESSION_FACTORY = cast(Any, object()) - with pytest.raises(RuntimeError, match="Database not initialized"): - with database_module.get_db_session(): - pass + with pytest.raises(RuntimeError, match="Database not initialized"), database_module.get_db_session(): + pass database_module.dispose_database() diff --git a/tests/test_config_security.py b/tests/test_config_security.py index aaffc88..c5cc6ef 100644 --- a/tests/test_config_security.py +++ b/tests/test_config_security.py @@ -13,12 +13,13 @@ import importlib import os import sys +import types from unittest.mock import patch import pytest -def _reload_config_module(): +def _reload_config_module() -> types.ModuleType: for module_name in ("config", "Resolvers.config"): if module_name in sys.modules: del sys.modules[module_name] @@ -39,25 +40,22 @@ def _base_production_env() -> dict[str, str]: def test_rejects_invalid_context_algorithm(): - with patch.dict(os.environ, {"RESOLVER_CONTEXT_ALGORITHMS": "RS256"}, clear=False): - with pytest.raises(ValueError): - _reload_config_module() + with patch.dict(os.environ, {"RESOLVER_CONTEXT_ALGORITHMS": "RS256"}, clear=False), pytest.raises(ValueError): + _reload_config_module() def test_production_rejects_missing_expected_service_token(): env = _base_production_env() env["RESOLVER_EXPECTED_SERVICE_TOKEN"] = "" - with patch.dict(os.environ, env, clear=False): - with pytest.raises(ValueError): - _reload_config_module() + with patch.dict(os.environ, env, clear=False), pytest.raises(ValueError): + _reload_config_module() def test_production_rejects_weak_context_verify_key(): env = _base_production_env() env["RESOLVER_CONTEXT_VERIFY_KEY"] = "changeme" - with patch.dict(os.environ, env, clear=False): - with pytest.raises(ValueError): - _reload_config_module() + with patch.dict(os.environ, env, clear=False), pytest.raises(ValueError): + _reload_config_module() def test_production_accepts_strong_security_config(): diff --git a/tests/test_connectors_provider_and_security_edges.py b/tests/test_connectors_provider_and_security_edges.py index b25d646..5d6b830 100644 --- a/tests/test_connectors_provider_and_security_edges.py +++ b/tests/test_connectors_provider_and_security_edges.py @@ -16,6 +16,7 @@ from fastapi import HTTPException from pydantic import BaseModel +import services.security_service as security_service from connectors.loki import LokiConnector from connectors.mimir import MimirConnector from connectors.tempo import TempoConnector @@ -23,7 +24,6 @@ from datasources.exceptions import DataSourceUnavailable, InvalidQuery, QueryTimeout from datasources.helpers import fetch_json, fetch_text from datasources.provider import DataSourceProvider -import services.security_service as security_service class _DummyConnector(BaseConnector): diff --git a/tests/test_correlation.py b/tests/test_correlation.py index 3d3e83b..a25955f 100644 --- a/tests/test_correlation.py +++ b/tests/test_correlation.py @@ -9,11 +9,11 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.correlation.temporal import CorrelatedEvent, correlate -from engine.correlation.signals import link_logs_to_metrics +from api.responses import LogBurst, MetricAnomaly, ServiceLatency from config import settings -from api.responses import MetricAnomaly, LogBurst, ServiceLatency -from engine.enums import Severity, ChangeType +from engine.correlation.signals import link_logs_to_metrics +from engine.correlation.temporal import CorrelatedEvent, correlate +from engine.enums import ChangeType, Severity def make_anomaly(t): diff --git a/tests/test_datasource_and_store_helpers_more.py b/tests/test_datasource_and_store_helpers_more.py index aa3b6f7..d49abc0 100644 --- a/tests/test_datasource_and_store_helpers_more.py +++ b/tests/test_datasource_and_store_helpers_more.py @@ -9,7 +9,6 @@ from __future__ import annotations import asyncio -from json import JSONDecodeError import pytest diff --git a/tests/test_degradation.py b/tests/test_degradation.py index e9d7865..c79e05d 100644 --- a/tests/test_degradation.py +++ b/tests/test_degradation.py @@ -9,8 +9,8 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.forecast.degradation import _ema, _acceleration, analyze from engine.enums import Severity +from engine.forecast.degradation import _acceleration, _ema, analyze def test_ema_and_acceleration(): @@ -29,6 +29,6 @@ def test_analyze_degrading(): ts = list(range(20)) vals = [i * 2 for i in ts] sig = analyze("m", ts, vals) - assert isinstance(sig, object) + assert sig is not None assert sig.trend == "degrading" assert sig.severity in Severity diff --git a/tests/test_engine_causal.py b/tests/test_engine_causal.py index 4469ac7..8728edd 100644 --- a/tests/test_engine_causal.py +++ b/tests/test_engine_causal.py @@ -9,9 +9,9 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.causal.graph import CausalGraph, InterventionResult from engine.causal.bayesian import score as bayesian_score -from engine.causal.granger import granger_pair_analysis, granger_multiple_pairs, GrangerResult +from engine.causal.granger import GrangerResult, granger_multiple_pairs, granger_pair_analysis +from engine.causal.graph import CausalGraph, InterventionResult def test_bayesian_score_consistency(): diff --git a/tests/test_engine_weights.py b/tests/test_engine_weights.py index 31e1509..68c61b3 100644 --- a/tests/test_engine_weights.py +++ b/tests/test_engine_weights.py @@ -11,8 +11,8 @@ import pytest -from engine.ml.weights import SignalWeights from engine.enums import Signal +from engine.ml.weights import SignalWeights def test_signal_weights_update_normalization(): diff --git a/tests/test_enums.py b/tests/test_enums.py index e434789..0cfe750 100644 --- a/tests/test_enums.py +++ b/tests/test_enums.py @@ -9,7 +9,7 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.enums import Severity, Signal, ChangeType, RcaCategory +from engine.enums import ChangeType, RcaCategory, Severity, Signal def test_severity_from_score_and_weight(): diff --git a/tests/test_forecast.py b/tests/test_forecast.py index 6392bbe..91aa0bf 100644 --- a/tests/test_forecast.py +++ b/tests/test_forecast.py @@ -12,7 +12,7 @@ import pytest from config import settings -from engine.forecast.trajectory import _linear_fit, _r_squared, forecast, TrajectoryForecast +from engine.forecast.trajectory import TrajectoryForecast, _linear_fit, _r_squared, forecast def test_linear_fit_and_r2(): diff --git a/tests/test_fuzzy.py b/tests/test_fuzzy.py index dfda7c9..3923580 100644 --- a/tests/test_fuzzy.py +++ b/tests/test_fuzzy.py @@ -10,22 +10,23 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -import pytest import random -from engine.causal.granger import granger_pair_analysis, granger_multiple_pairs +import pytest + +from api.responses import LogBurst, MetricAnomaly, ServiceLatency from engine.anomaly.detection import detect -from engine.forecast.trajectory import forecast -from engine.forecast.degradation import analyze as degradation -from engine.correlation.temporal import correlate +from engine.causal.granger import granger_multiple_pairs, granger_pair_analysis from engine.causal.graph import CausalGraph -from engine.topology.graph import DependencyGraph +from engine.correlation.temporal import correlate from engine.enums import Severity -from api.responses import MetricAnomaly, LogBurst, ServiceLatency +from engine.forecast.degradation import analyze as degradation +from engine.forecast.trajectory import forecast +from engine.topology.graph import DependencyGraph def random_anomaly(t): - mid = f"m{random.randint(0,5)}" + mid = f"m{random.randint(0, 5)}" return MetricAnomaly( metric_id=mid, metric_name=mid, @@ -54,7 +55,7 @@ def random_logburst(t): def random_latency(): return ServiceLatency( - service=f"s{random.randint(0,3)}", + service=f"s{random.randint(0, 3)}", operation="op", p50_ms=random.random() * 100, p95_ms=random.random() * 200, @@ -119,7 +120,7 @@ def test_fuzzy_correlation_and_causal(seed): g = CausalGraph() for i in range(5): a = f"m{i}" - b = f"m{(i+1)%5}" + b = f"m{(i + 1) % 5}" g.add_edge(a, b, random.random()) _ = g.topological_sort() _ = g.root_causes() diff --git a/tests/test_health_service_and_main_edges.py b/tests/test_health_service_and_main_edges.py index 00b89e1..16a9c30 100644 --- a/tests/test_health_service_and_main_edges.py +++ b/tests/test_health_service_and_main_edges.py @@ -8,21 +8,19 @@ from __future__ import annotations -import asyncio import importlib -import sys import types import httpx import pytest from fastapi.routing import APIRoute -from api.responses import JobStatus -from api.routes import health as health_route -from services import analyze_service import api.responses as response_exports import main as app_main +from api.responses import JobStatus +from api.routes import health as health_route from config import LOGS_BACKEND_LOKI, METRICS_BACKEND_MIMIR, TRACES_BACKEND_TEMPO +from services import analyze_service @pytest.mark.asyncio @@ -78,7 +76,7 @@ def test_response_exports_dynamic_lookup_and_missing_attr(): assert response_exports.JobStatus is JobStatus assert response_exports.AnalyzeJobSummary.__name__ == "AnalyzeJobSummary" with pytest.raises(AttributeError): - getattr(response_exports, "DoesNotExist") + response_exports.DoesNotExist @pytest.mark.asyncio diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 92f4e56..64c1c66 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -9,11 +9,11 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -import pytest import httpx +import pytest -from datasources.helpers import fetch_json, fetch_text from datasources.exceptions import InvalidQuery, QueryTimeout +from datasources.helpers import fetch_json, fetch_text class DummyResponse: diff --git a/tests/test_internal_security.py b/tests/test_internal_security.py index f4bc425..f58cacc 100644 --- a/tests/test_internal_security.py +++ b/tests/test_internal_security.py @@ -12,12 +12,14 @@ import asyncio import json import uuid -from pydantic import BaseModel + import jwt -from starlette.responses import JSONResponse import pytest +from pydantic import BaseModel +from starlette.responses import JSONResponse import services.security_service as security_service +from config import settings from services.security_service import ( InternalAuthMiddleware, InternalContext, @@ -26,7 +28,6 @@ reset_internal_context, set_internal_context, ) -from config import settings def _headers(payload): diff --git a/tests/test_jobs_routes.py b/tests/test_jobs_routes.py index e2fbebf..8e3e17f 100644 --- a/tests/test_jobs_routes.py +++ b/tests/test_jobs_routes.py @@ -8,8 +8,7 @@ from __future__ import annotations -from datetime import datetime, timezone -from types import SimpleNamespace +from datetime import UTC, datetime import pytest from fastapi import HTTPException @@ -35,7 +34,7 @@ def _ctx() -> InternalContext: def _job_view(status: JobStatus = JobStatus.QUEUED) -> JobView: - now = datetime.now(timezone.utc) + now = datetime.now(UTC) return JobView( job_id="job-1", report_id="report-1", diff --git a/tests/test_logs.py b/tests/test_logs.py index a6f5d41..7329550 100644 --- a/tests/test_logs.py +++ b/tests/test_logs.py @@ -9,9 +9,9 @@ http://www.apache.org/licenses/LICENSE-2.0 """ +from engine.enums import Severity from engine.logs.frequency import detect_bursts from engine.logs.patterns import analyze -from engine.enums import Severity def make_loki_response(lines): diff --git a/tests/test_ml_clustering.py b/tests/test_ml_clustering.py index 05ad77e..1f71b31 100644 --- a/tests/test_ml_clustering.py +++ b/tests/test_ml_clustering.py @@ -11,11 +11,10 @@ from __future__ import annotations import numpy as np -import pytest +import engine.ml.clustering as clustering from api.responses import MetricAnomaly from engine.enums import ChangeType, Severity -import engine.ml.clustering as clustering def _anomaly( diff --git a/tests/test_ml_ranking.py b/tests/test_ml_ranking.py index 5e2d056..09ab284 100644 --- a/tests/test_ml_ranking.py +++ b/tests/test_ml_ranking.py @@ -10,10 +10,8 @@ from __future__ import annotations -import pytest - -from engine.enums import RcaCategory, Severity import engine.ml.ranking as ranking +from engine.enums import RcaCategory, Severity from engine.rca.hypothesis import RootCause diff --git a/tests/test_rca_hypothesis.py b/tests/test_rca_hypothesis.py index 803df7c..325acb3 100644 --- a/tests/test_rca_hypothesis.py +++ b/tests/test_rca_hypothesis.py @@ -9,10 +9,10 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.rca.hypothesis import _signals_from_event, _action_for_category, generate, RootCause -from engine.enums import RcaCategory, Severity, ChangeType -from engine.correlation.temporal import CorrelatedEvent from api.responses import MetricAnomaly, ServiceLatency +from engine.correlation.temporal import CorrelatedEvent +from engine.enums import ChangeType, RcaCategory, Severity +from engine.rca.hypothesis import RootCause, _action_for_category, _signals_from_event, generate class DummyEvent: @@ -180,7 +180,7 @@ def test_generate_includes_process_entity_from_metric_labels(): anomaly = MetricAnomaly( metric_id="m", metric_name=( - "process_cpu_time_seconds_total{service_name=cache," "process_executable_name=redis-server,process_pid=274}" + "process_cpu_time_seconds_total{service_name=cache,process_executable_name=redis-server,process_pid=274}" ), timestamp=1, value=100, diff --git a/tests/test_retry.py b/tests/test_retry.py index 3cb8871..751ca5d 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -9,9 +9,10 @@ http://www.apache.org/licenses/LICENSE-2.0 """ +import asyncio + import pytest -import datasources.retry as retry_module from datasources.retry import retry @@ -39,7 +40,7 @@ async def test_retry_async_applies_backoff_sleep(monkeypatch): async def fake_sleep(delay): sleeps.append(delay) - monkeypatch.setattr(retry_module.asyncio, "sleep", fake_sleep) + monkeypatch.setattr(asyncio, "sleep", fake_sleep) @retry(attempts=3, delay=0.25, backoff=2.0, exceptions=(ValueError,)) async def flaky(): diff --git a/tests/test_slo.py b/tests/test_slo.py index 4ead08f..4c7dd36 100644 --- a/tests/test_slo.py +++ b/tests/test_slo.py @@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.slo.burn import evaluate, SloBurnAlert -from engine.slo.budget import remaining_minutes, BudgetStatus +from engine.slo.budget import BudgetStatus, remaining_minutes +from engine.slo.burn import SloBurnAlert, evaluate def test_slo_evaluate_empty(): diff --git a/tests/test_store_baseline.py b/tests/test_store_baseline.py index 06710c7..a1d8aa4 100644 --- a/tests/test_store_baseline.py +++ b/tests/test_store_baseline.py @@ -27,6 +27,7 @@ async def test_baseline_save_load(): base2 = Baseline(mean=2.0, std=0.5, lower=-1.0, upper=3.0) await bstore.save(tid, metric, base2) l2 = await bstore.load(tid, metric) + assert l2 is not None assert l2.mean == 2.0 ts = [0.0, 1.0, 2.0, 3.0, 4.0] vals = [1.0, 2.0, 1.5, 2.5, 1.0] diff --git a/tests/test_store_client.py b/tests/test_store_client.py index f42e044..5e2a3d5 100644 --- a/tests/test_store_client.py +++ b/tests/test_store_client.py @@ -11,7 +11,7 @@ import pytest from store import client as store_client -from store.client import _fallback, redis_get, redis_set, redis_delete, redis_scan +from store.client import _fallback, redis_delete, redis_get, redis_scan, redis_set @pytest.mark.asyncio diff --git a/tests/test_store_client_events_and_weights_edges.py b/tests/test_store_client_events_and_weights_edges.py index 5f73eba..ff487e1 100644 --- a/tests/test_store_client_events_and_weights_edges.py +++ b/tests/test_store_client_events_and_weights_edges.py @@ -14,12 +14,12 @@ import pytest +import store.client as store_client_module from engine.causal.granger import GrangerResult from engine.events.models import DeploymentEvent from store import events as events_store from store import granger as granger_store from store import weights as weights_store -import store.client as store_client_module class _FakePipeline: diff --git a/tests/test_store_registry.py b/tests/test_store_registry.py index ef2aab8..89b0341 100644 --- a/tests/test_store_registry.py +++ b/tests/test_store_registry.py @@ -10,8 +10,8 @@ import pytest -from engine.enums import Signal from engine import registry as sreg +from engine.enums import Signal from store import weights as wstore diff --git a/tests/test_store_weights.py b/tests/test_store_weights.py index 33c7289..553bd05 100644 --- a/tests/test_store_weights.py +++ b/tests/test_store_weights.py @@ -20,6 +20,7 @@ async def test_weights_lifecycle(): data = {"metrics": 0.6, "logs": 0.4} await wstore.save(tid, data, update_count=5) stored = await wstore.load(tid) + assert stored is not None assert stored["weights"] == data assert stored["update_count"] == 5 await wstore.delete(tid) diff --git a/tests/test_topology.py b/tests/test_topology.py index 6d6ca31..830fb35 100644 --- a/tests/test_topology.py +++ b/tests/test_topology.py @@ -8,7 +8,7 @@ http://www.apache.org/licenses/LICENSE-2.0 """ -from engine.topology.graph import DependencyGraph, BlastRadius +from engine.topology.graph import BlastRadius, DependencyGraph def test_dependency_graph():