diff --git a/setup.cfg b/setup.cfg
index 4400342..2900b05 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -76,8 +76,10 @@ prometheus =
 datadog =
     datadog
     retrying==1.3.4
+    datadog_api_client
 dynatrace =
     requests
+    retrying==1.3.4
 bigquery =
     google-api-python-client
     google-cloud-bigquery
diff --git a/slo_generator/backends/datadog.py b/slo_generator/backends/datadog.py
index cc34883..94b0f52 100644
--- a/slo_generator/backends/datadog.py
+++ b/slo_generator/backends/datadog.py
@@ -17,42 +17,63 @@
 """
 import logging
+import os
 import pprint

-import datadog
-
-from slo_generator import utils
-
-LOGGER = logging.getLogger(__name__)
-logging.getLogger("datadog.api").setLevel(logging.ERROR)
+from datadog_api_client.v1 import ApiClient, ApiException, Configuration
+from datadog_api_client.v1.api.authentication_api import AuthenticationApi
+from datadog_api_client.v1.api.metrics_api import MetricsApi
+from datadog_api_client.v1.api.service_level_objectives_api import (
+    ServiceLevelObjectivesApi,
+)
+
+# Configure logging; LOGLEVEL defaults to ERROR when unset
+logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR").upper(), force=True)
+logger = logging.getLogger(__name__)
+
+
+class DatadogClient:
+    """Thin wrapper around the official Datadog API client.
+
+    Holds a configured ApiClient plus the SLO and Metrics API endpoints.
+    """
+
+    def __init__(self, api_key=None, app_key=None, api_host=None, **kwargs):
+        # Retry rate-limited requests with exponential backoff
+        configuration = Configuration(
+            host=api_host,
+            enable_retry=True,
+            retry_backoff_factor=2,
+            max_retries=5,
+            **kwargs,
+        )
+        configuration.api_key["apiKeyAuth"] = api_key
+        configuration.api_key["appKeyAuth"] = app_key
+        self.api_client = ApiClient(configuration)
+        # Fail fast on invalid credentials
+        AuthenticationApi(self.api_client).validate()
+        self.slo_api_client = ServiceLevelObjectivesApi(self.api_client)
+        self.metrics_api_client = MetricsApi(self.api_client)


 class DatadogBackend:
     """Backend for querying metrics from Datadog.
-
     Args:
         client (obj, optional): Existing Datadog client to pass.
         api_key (str): Datadog API key.
         app_key (str): Datadog APP key.
+        api_host (str): Datadog API host (site URL).
         kwargs (dict): Extra arguments to pass to initialize function.
     """

-    def __init__(self, client=None, api_key=None, app_key=None, **kwargs):
+    def __init__(
+        self, client=None, api_key=None, app_key=None, api_host=None, **kwargs
+    ):
         self.client = client
         if not self.client:
-            options = {"api_key": api_key, "app_key": app_key}
-            options.update(kwargs)
-            datadog.initialize(**options)
-            self.client = datadog.api
+            self.client = DatadogClient(
+                api_key=api_key, app_key=app_key, api_host=api_host, **kwargs
+            )

     def good_bad_ratio(self, timestamp, window, slo_config):
         """Query SLI value from good and valid queries.
-
         Args:
             timestamp (int): UNIX timestamp.
             window (int): Window (in seconds).
             slo_config (dict): SLO configuration.
-
         Returns:
             tuple: Good event count, Bad event count.
         """
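For context, a minimal sketch of constructing the reworked backend after the hunk above (the `DD_API_KEY` / `DD_APP_KEY` environment variable names are illustrative, not part of this change):

```python
import os

from slo_generator.backends.datadog import DatadogBackend

# Illustrative only: credential sourcing is up to the caller.
backend = DatadogBackend(
    api_key=os.environ["DD_API_KEY"],
    app_key=os.environ["DD_APP_KEY"],
    api_host="https://api.datadoghq.eu",  # optional, for non-default Datadog sites
)
# DatadogBackend wraps a DatadogClient, which validates credentials eagerly
# via AuthenticationApi.validate() and retries rate-limited requests.
```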
""" @@ -77,9 +98,9 @@ def good_bad_ratio(self, timestamp, window, slo_config): operator_suffix, ) - good_event_query = self.client.Metric.query( - start=start, - end=end, + good_event_query = self.client.metrics_api_client.query_metrics( + _from=int(start), + to=int(end), query=query_good, ) @@ -90,9 +111,9 @@ def good_bad_ratio(self, timestamp, window, slo_config): operator_suffix, ) - event_query = self.client.Metric.query( - start=start, - end=end, + event_query = self.client.metrics_api_client.query_metrics( + _from=int(start), + to=int(end), query=query, ) @@ -101,18 +122,18 @@ def good_bad_ratio(self, timestamp, window, slo_config): if measurement.get("query_valid"): event_count = event_count - good_event_count - LOGGER.debug(f"Good events: {good_event_count} | " f"Bad events: {event_count}") + logging.debug( + f"Good events: {good_event_count} | " f"Bad events: {event_count}" + ) return good_event_count, event_count def query_sli(self, timestamp, window, slo_config): """Query SLI value directly. - Args: timestamp (int): UNIX timestamp. window (int): Window (in seconds). slo_config (dict): SLO configuration. - Returns: float: SLI value. """ @@ -121,59 +142,82 @@ def query_sli(self, timestamp, window, slo_config): end = timestamp query = measurement["query"] query = self._fmt_query(query, window) - response = self.client.Metric.query(start=start, end=end, query=query) - LOGGER.debug(f"Result valid: {pprint.pformat(response)}") + response = self.client.metrics_api_client.query_metrics( + _from=int(start), to=int(end), query=query + ) + logging.debug(f"Result valid: {pprint.pformat(response)}") return DatadogBackend.count(response, average=True) def query_slo(self, timestamp, window, slo_config): """Query SLO value from a given Datadog SLO. - Args: timestamp (int): UNIX timestamp. window (int): Window (in seconds). slo_config (dict): SLO configuration. - Returns: tuple: Good event count, bad event count. 
""" slo_id = slo_config["spec"]["service_level_indicator"]["slo_id"] from_ts = timestamp - window - if utils.is_debug_enabled(): - slo_data = self.client.ServiceLevelObjective.get(id=slo_id) - LOGGER.debug(f"SLO data: {slo_id} | Result: {pprint.pformat(slo_data)}") - data = self.client.ServiceLevelObjective.history( - id=slo_id, - from_ts=from_ts, - to_ts=timestamp, - ) + try: - LOGGER.debug(f"Timeseries data: {slo_id} | Result: {pprint.pformat(data)}") - good_event_count = data["data"]["series"]["numerator"]["sum"] - valid_event_count = data["data"]["series"]["denominator"]["sum"] - bad_event_count = valid_event_count - good_event_count - return (good_event_count, bad_event_count) - except KeyError as exception: # monitor-based SLI - sli_value = data["data"]["overall"]["sli_value"] / 100 - LOGGER.debug(exception) - return sli_value + # Retrieve the SLO history + data = self.client.slo_api_client.get_slo_history( + slo_id, from_ts=int(from_ts), to_ts=int(timestamp) + ) + logging.info(f"SLO history: {data}") + except ApiException as e: + logging.error(f"Error retrieving SLO history: {e}") + return None, None + + # Check if the data is present and properly structured + try: + logging.debug(f"Timeseries data: {slo_id} | Result: {pprint.pformat(data)}") + + # Check if necessary keys exist before accessing them + good_event_count = ( + data.get("data", {}) + .get("series", {}) + .get("numerator", {}) + .get("sum", 0) + ) + valid_event_count = ( + data.get("data", {}) + .get("series", {}) + .get("denominator", {}) + .get("sum", 0) + ) + + if good_event_count is not None and valid_event_count is not None: + bad_event_count = valid_event_count - good_event_count + return good_event_count, bad_event_count + + except KeyError as exception: # Monitor-based SLI case + logging.debug(f"KeyError exception: {exception}") + # Retrieve the SLI value if it's a monitor-based SLI + sli_value = ( + data.get("data", {}).get("overall", {}).get("sli_value", 0) / 100 + ) + return ( + sli_value, + None, + ) # Return None for bad_event_count if it's not a standard SLO + + # If the data is invalid or there's an issue, return None for both counts + return None, None @staticmethod def _fmt_query(query, window, operator=None, operator_suffix=None): """Format Datadog query: - * If the Datadog expression has a `[window]` placeholder, replace it by the current window. Otherwise, append it to the expression. - * If prefix / suffix operators are defined, apply them to the metric. - * If labels are defined, append them to existing labels. - Args: query (str): Original query in YAML config. window (int): Query window (in seconds). operator (str): Operator (e.g: sum, avg, median, ...) operator_suffix (str): Operator suffix (e.g: as_count(), ...) - Returns: str: Formatted query. """ @@ -184,17 +228,15 @@ def _fmt_query(query, window, operator=None, operator_suffix=None): query = query.replace("[window]", f"{window}") if operator_suffix: query = f"{query}.{operator_suffix}" - LOGGER.debug(f"Query: {query}") + logging.debug(f"Query: {query}") return query @staticmethod def count(response, average=False): """Count events in time series. - Args: response (dict): Datadog Metrics API response. average (bool): Take average of result. - Returns: int: Event count. 
""" @@ -202,7 +244,7 @@ def count(response, average=False): values = [] pointlist = response["series"][0]["pointlist"] for point in pointlist: - value = point[1] + value = point["value"][1] if value is None: continue values.append(value) @@ -212,5 +254,5 @@ def count(response, average=False): return sum(values) / len(values) return sum(values) except (IndexError, AttributeError) as exception: - LOGGER.debug(exception) + logging.debug(exception) return 0 # no events in timeseries