From 3307d2f4549ba19d7e368d532542dad5950bb411 Mon Sep 17 00:00:00 2001 From: Benjamin Kaminski Date: Thu, 14 Nov 2024 10:16:31 +0100 Subject: [PATCH 1/5] feat(backend/datadog): use datadog-api-client-python rather than datadog module --- slo_generator/backends/datadog.py | 121 ++++++++++++++++-------------- 1 file changed, 65 insertions(+), 56 deletions(-) diff --git a/slo_generator/backends/datadog.py b/slo_generator/backends/datadog.py index cc348834..cc81d72e 100644 --- a/slo_generator/backends/datadog.py +++ b/slo_generator/backends/datadog.py @@ -16,43 +16,51 @@ Datadog backend implementation. """ -import logging +import logging, os, time import pprint - -import datadog - from slo_generator import utils - -LOGGER = logging.getLogger(__name__) -logging.getLogger("datadog.api").setLevel(logging.ERROR) - +from datadog_api_client.v1 import Configuration, ApiClient, ApiException +from datadog_api_client.v1.api.service_level_objectives_api import ServiceLevelObjectivesApi +from datadog_api_client.v1.api.metrics_api import MetricsApi +from datadog_api_client.v1.api.authentication_api import AuthenticationApi + +# Configure logging +logging.basicConfig( + level=os.environ.get('LOGLEVEL', 'ERROR').upper(), force=True +) +logger = logging.getLogger(__name__) + +class DatadogClient: + def __init__(self, api_key=None, app_key=None, api_host=None, **kwargs): + configuration = Configuration(host=api_host, **kwargs) + configuration.api_key['apiKeyAuth'] = api_key + configuration.api_key['appKeyAuth'] = app_key + self.api_client = ApiClient(configuration) + AuthenticationApi(self.api_client).validate() + self.slo_api_client = ServiceLevelObjectivesApi(self.api_client) + self.metrics_api_client = MetricsApi(self.api_client) class DatadogBackend: """Backend for querying metrics from Datadog. - Args: client (obj, optional): Existing Datadog client to pass. api_key (str): Datadog API key. app_key (str): Datadog APP key. + api_host (str): Datadog site. 
kwargs (dict): Extra arguments to pass to initialize function. """ - def __init__(self, client=None, api_key=None, app_key=None, **kwargs): + def __init__(self, client=None, api_key=None, app_key=None, api_host=None, **kwargs): self.client = client if not self.client: - options = {"api_key": api_key, "app_key": app_key} - options.update(kwargs) - datadog.initialize(**options) - self.client = datadog.api + self.client = DatadogClient(api_key=api_key, app_key=app_key, api_host=api_host, **kwargs) def good_bad_ratio(self, timestamp, window, slo_config): """Query SLI value from good and valid queries. - Args: timestamp (int): UNIX timestamp. window (int): Window (in seconds). slo_config (dict): SLO configuration. - Returns: tuple: Good event count, Bad event count. """ @@ -77,9 +85,9 @@ def good_bad_ratio(self, timestamp, window, slo_config): operator_suffix, ) - good_event_query = self.client.Metric.query( - start=start, - end=end, + good_event_query = self.client.metrics_api_client.query_metrics( + _from=int(start), + to=int(end), query=query_good, ) @@ -90,9 +98,9 @@ def good_bad_ratio(self, timestamp, window, slo_config): operator_suffix, ) - event_query = self.client.Metric.query( - start=start, - end=end, + event_query = self.client.metrics_api_client.query_metrics( + _from=int(start), + to=int(end), query=query, ) @@ -101,18 +109,16 @@ def good_bad_ratio(self, timestamp, window, slo_config): if measurement.get("query_valid"): event_count = event_count - good_event_count - LOGGER.debug(f"Good events: {good_event_count} | " f"Bad events: {event_count}") + logging.debug(f"Good events: {good_event_count} | " f"Bad events: {event_count}") return good_event_count, event_count def query_sli(self, timestamp, window, slo_config): """Query SLI value directly. - Args: timestamp (int): UNIX timestamp. window (int): Window (in seconds). slo_config (dict): SLO configuration. - Returns: float: SLI value. 
""" @@ -121,59 +127,64 @@ def query_sli(self, timestamp, window, slo_config): end = timestamp query = measurement["query"] query = self._fmt_query(query, window) - response = self.client.Metric.query(start=start, end=end, query=query) - LOGGER.debug(f"Result valid: {pprint.pformat(response)}") + response = self.client.metrics_api_client.query_metrics(_from=int(start), to=int(end), query=query) + logging.debug(f"Result valid: {pprint.pformat(response)}") return DatadogBackend.count(response, average=True) def query_slo(self, timestamp, window, slo_config): """Query SLO value from a given Datadog SLO. - Args: timestamp (int): UNIX timestamp. window (int): Window (in seconds). slo_config (dict): SLO configuration. - Returns: tuple: Good event count, bad event count. """ slo_id = slo_config["spec"]["service_level_indicator"]["slo_id"] from_ts = timestamp - window - if utils.is_debug_enabled(): - slo_data = self.client.ServiceLevelObjective.get(id=slo_id) - LOGGER.debug(f"SLO data: {slo_id} | Result: {pprint.pformat(slo_data)}") - data = self.client.ServiceLevelObjective.history( - id=slo_id, - from_ts=from_ts, - to_ts=timestamp, - ) + try: - LOGGER.debug(f"Timeseries data: {slo_id} | Result: {pprint.pformat(data)}") - good_event_count = data["data"]["series"]["numerator"]["sum"] - valid_event_count = data["data"]["series"]["denominator"]["sum"] - bad_event_count = valid_event_count - good_event_count - return (good_event_count, bad_event_count) - except KeyError as exception: # monitor-based SLI - sli_value = data["data"]["overall"]["sli_value"] / 100 - LOGGER.debug(exception) - return sli_value + # Retrieve the SLO history + data = self.client.slo_api_client.get_slo_history(slo_id, from_ts=int(from_ts), to_ts=int(timestamp)) + logging.info(f"SLO history: {data}") + except ApiException as e: + logging.error(f"Error retrieving SLO history: {e}") + return None, None + + # Check if the data is present and properly structured + try: + logging.debug(f"Timeseries data: 
{slo_id} | Result: {pprint.pformat(data)}") + + # Check if necessary keys exist before accessing them + good_event_count = data.get("data", {}).get("series", {}).get("numerator", {}).get("sum", 0) + valid_event_count = data.get("data", {}).get("series", {}).get("denominator", {}).get("sum", 0) + + if good_event_count is not None and valid_event_count is not None: + bad_event_count = valid_event_count - good_event_count + return good_event_count, bad_event_count + + except KeyError as exception: # Monitor-based SLI case + logging.debug(f"KeyError exception: {exception}") + # Retrieve the SLI value if it's a monitor-based SLI + sli_value = data.get("data", {}).get("overall", {}).get("sli_value", 0) / 100 + return sli_value, None # Return None for bad_event_count if it's not a standard SLO + + # If the data is invalid or there's an issue, return None for both counts + return None, None + @staticmethod def _fmt_query(query, window, operator=None, operator_suffix=None): """Format Datadog query: - * If the Datadog expression has a `[window]` placeholder, replace it by the current window. Otherwise, append it to the expression. - * If prefix / suffix operators are defined, apply them to the metric. - * If labels are defined, append them to existing labels. - Args: query (str): Original query in YAML config. window (int): Query window (in seconds). operator (str): Operator (e.g: sum, avg, median, ...) operator_suffix (str): Operator suffix (e.g: as_count(), ...) - Returns: str: Formatted query. """ @@ -184,17 +195,15 @@ def _fmt_query(query, window, operator=None, operator_suffix=None): query = query.replace("[window]", f"{window}") if operator_suffix: query = f"{query}.{operator_suffix}" - LOGGER.debug(f"Query: {query}") + logging.debug(f"Query: {query}") return query @staticmethod def count(response, average=False): """Count events in time series. - Args: response (dict): Datadog Metrics API response. average (bool): Take average of result. - Returns: int: Event count. 
""" @@ -202,7 +211,7 @@ def count(response, average=False): values = [] pointlist = response["series"][0]["pointlist"] for point in pointlist: - value = point[1] + value = point['value'][1] if value is None: continue values.append(value) @@ -212,5 +221,5 @@ def count(response, average=False): return sum(values) / len(values) return sum(values) except (IndexError, AttributeError) as exception: - LOGGER.debug(exception) + logging.debug(exception) return 0 # no events in timeseries From fd300f500b3d32ad8b1636de2255c0f258e82b45 Mon Sep 17 00:00:00 2001 From: Benjamin Kaminski Date: Thu, 14 Nov 2024 10:17:07 +0100 Subject: [PATCH 2/5] Update datadog.py --- slo_generator/backends/datadog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slo_generator/backends/datadog.py b/slo_generator/backends/datadog.py index cc81d72e..7daf3fb1 100644 --- a/slo_generator/backends/datadog.py +++ b/slo_generator/backends/datadog.py @@ -19,7 +19,7 @@ import logging, os, time import pprint from slo_generator import utils -from datadog_api_client.v1 import Configuration, ApiClient, ApiException +from datadog_api_client.v1 import Configuration, ApiClient from datadog_api_client.v1.api.service_level_objectives_api import ServiceLevelObjectivesApi from datadog_api_client.v1.api.metrics_api import MetricsApi from datadog_api_client.v1.api.authentication_api import AuthenticationApi From 962d9416bfa501da8df4328299a6a0115c4081b6 Mon Sep 17 00:00:00 2001 From: Benjamin Kaminski Date: Thu, 14 Nov 2024 10:41:24 +0100 Subject: [PATCH 3/5] update package name --- setup.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 44003420..84537808 100644 --- a/setup.cfg +++ b/setup.cfg @@ -74,8 +74,7 @@ prometheus = prometheus-client prometheus-http-client datadog = - datadog - retrying==1.3.4 + datadog_api_client dynatrace = requests bigquery = From f7d240e8af7c046cc04e97cf64aaecf4b5e1decd Mon Sep 17 00:00:00 2001 From: Benjamin Kaminski Date: 
Thu, 14 Nov 2024 17:05:35 +0100 Subject: [PATCH 4/5] fix(dynatrace): library retrying is required --- setup.cfg | 1 + slo_generator/backends/datadog.py | 77 ++++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/setup.cfg b/setup.cfg index 84537808..29123110 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,6 +77,7 @@ datadog = datadog_api_client dynatrace = requests + retrying==1.3.4 bigquery = google-api-python-client google-cloud-bigquery diff --git a/slo_generator/backends/datadog.py b/slo_generator/backends/datadog.py index 7daf3fb1..94b0f527 100644 --- a/slo_generator/backends/datadog.py +++ b/slo_generator/backends/datadog.py @@ -16,30 +16,39 @@ Datadog backend implementation. """ -import logging, os, time +import logging +import os import pprint -from slo_generator import utils -from datadog_api_client.v1 import Configuration, ApiClient -from datadog_api_client.v1.api.service_level_objectives_api import ServiceLevelObjectivesApi -from datadog_api_client.v1.api.metrics_api import MetricsApi + +from datadog_api_client.v1 import ApiClient, ApiException, Configuration from datadog_api_client.v1.api.authentication_api import AuthenticationApi +from datadog_api_client.v1.api.metrics_api import MetricsApi +from datadog_api_client.v1.api.service_level_objectives_api import ( + ServiceLevelObjectivesApi, +) # Configure logging -logging.basicConfig( - level=os.environ.get('LOGLEVEL', 'ERROR').upper(), force=True -) +logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR").upper(), force=True) logger = logging.getLogger(__name__) + class DatadogClient: def __init__(self, api_key=None, app_key=None, api_host=None, **kwargs): - configuration = Configuration(host=api_host, **kwargs) - configuration.api_key['apiKeyAuth'] = api_key - configuration.api_key['appKeyAuth'] = app_key + configuration = Configuration( + host=api_host, + enable_retry=True, + retry_backoff_factor=2, + max_retries=5, + **kwargs, + ) + 
configuration.api_key["apiKeyAuth"] = api_key + configuration.api_key["appKeyAuth"] = app_key self.api_client = ApiClient(configuration) AuthenticationApi(self.api_client).validate() self.slo_api_client = ServiceLevelObjectivesApi(self.api_client) self.metrics_api_client = MetricsApi(self.api_client) + class DatadogBackend: """Backend for querying metrics from Datadog. Args: @@ -50,10 +59,14 @@ class DatadogBackend: kwargs (dict): Extra arguments to pass to initialize function. """ - def __init__(self, client=None, api_key=None, app_key=None, api_host=None, **kwargs): + def __init__( + self, client=None, api_key=None, app_key=None, api_host=None, **kwargs + ): self.client = client if not self.client: - self.client = DatadogClient(api_key=api_key, app_key=app_key, api_host=api_host, **kwargs) + self.client = DatadogClient( + api_key=api_key, app_key=app_key, api_host=api_host, **kwargs + ) def good_bad_ratio(self, timestamp, window, slo_config): """Query SLI value from good and valid queries. 
@@ -109,7 +122,9 @@ def good_bad_ratio(self, timestamp, window, slo_config): if measurement.get("query_valid"): event_count = event_count - good_event_count - logging.debug(f"Good events: {good_event_count} | " f"Bad events: {event_count}") + logging.debug( + f"Good events: {good_event_count} | " f"Bad events: {event_count}" + ) return good_event_count, event_count @@ -127,7 +142,9 @@ def query_sli(self, timestamp, window, slo_config): end = timestamp query = measurement["query"] query = self._fmt_query(query, window) - response = self.client.metrics_api_client.query_metrics(_from=int(start), to=int(end), query=query) + response = self.client.metrics_api_client.query_metrics( + _from=int(start), to=int(end), query=query + ) logging.debug(f"Result valid: {pprint.pformat(response)}") return DatadogBackend.count(response, average=True) @@ -145,7 +162,9 @@ def query_slo(self, timestamp, window, slo_config): try: # Retrieve the SLO history - data = self.client.slo_api_client.get_slo_history(slo_id, from_ts=int(from_ts), to_ts=int(timestamp)) + data = self.client.slo_api_client.get_slo_history( + slo_id, from_ts=int(from_ts), to_ts=int(timestamp) + ) logging.info(f"SLO history: {data}") except ApiException as e: logging.error(f"Error retrieving SLO history: {e}") @@ -156,8 +175,18 @@ def query_slo(self, timestamp, window, slo_config): logging.debug(f"Timeseries data: {slo_id} | Result: {pprint.pformat(data)}") # Check if necessary keys exist before accessing them - good_event_count = data.get("data", {}).get("series", {}).get("numerator", {}).get("sum", 0) - valid_event_count = data.get("data", {}).get("series", {}).get("denominator", {}).get("sum", 0) + good_event_count = ( + data.get("data", {}) + .get("series", {}) + .get("numerator", {}) + .get("sum", 0) + ) + valid_event_count = ( + data.get("data", {}) + .get("series", {}) + .get("denominator", {}) + .get("sum", 0) + ) if good_event_count is not None and valid_event_count is not None: bad_event_count = 
valid_event_count - good_event_count @@ -166,13 +195,17 @@ def query_slo(self, timestamp, window, slo_config): except KeyError as exception: # Monitor-based SLI case logging.debug(f"KeyError exception: {exception}") # Retrieve the SLI value if it's a monitor-based SLI - sli_value = data.get("data", {}).get("overall", {}).get("sli_value", 0) / 100 - return sli_value, None # Return None for bad_event_count if it's not a standard SLO + sli_value = ( + data.get("data", {}).get("overall", {}).get("sli_value", 0) / 100 + ) + return ( + sli_value, + None, + ) # Return None for bad_event_count if it's not a standard SLO # If the data is invalid or there's an issue, return None for both counts return None, None - @staticmethod def _fmt_query(query, window, operator=None, operator_suffix=None): """Format Datadog query: @@ -211,7 +244,7 @@ def count(response, average=False): values = [] pointlist = response["series"][0]["pointlist"] for point in pointlist: - value = point['value'][1] + value = point["value"][1] if value is None: continue values.append(value) From 1d9e2deafe6e95b3f579c5d30a114de01df808a6 Mon Sep 17 00:00:00 2001 From: Benjamin Kaminski Date: Mon, 18 Nov 2024 16:15:34 +0100 Subject: [PATCH 5/5] Update setup.cfg --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index 29123110..2900b054 100644 --- a/setup.cfg +++ b/setup.cfg @@ -74,6 +74,8 @@ prometheus = prometheus-client prometheus-http-client datadog = + datadog + retrying==1.3.4 datadog_api_client dynatrace = requests