From 9187a9113118317ab5066016ad4f0620b2594e3c Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 10:01:38 -0400 Subject: [PATCH 01/12] Close and dereference main db connection on cancel --- postgres/datadog_checks/postgres/postgres.py | 17 +++++--- postgres/tests/test_unit.py | 43 ++++++++++++++++++++ 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/postgres/datadog_checks/postgres/postgres.py b/postgres/datadog_checks/postgres/postgres.py index 113e4402af133..f781aba423bde 100644 --- a/postgres/datadog_checks/postgres/postgres.py +++ b/postgres/datadog_checks/postgres/postgres.py @@ -302,6 +302,15 @@ def execute_query_raw(self, query, db): rows = cursor.fetchall() return rows + def _close_db(self): + if self._db: + try: + self._db.close() + except Exception: + pass + finally: + self._db = None + @contextlib.contextmanager def db(self): """ @@ -322,12 +331,7 @@ def db(self): self.log.warning( "Connection to the database %s has been interrupted, closing connection", self._config.dbname ) - try: - self._db.close() - except Exception: - pass - finally: - self._db = None + self._close_db() raise except Exception: self.log.exception("Unhandled exception while using database connection %s", self._config.dbname) @@ -491,6 +495,7 @@ def cancel(self): self.data_observability.cancel() if self.data_observability._job_loop_future: self.data_observability._job_loop_future.result() + self._close_db() self._close_db_pool() def _clean_state(self): diff --git a/postgres/tests/test_unit.py b/postgres/tests/test_unit.py index ce7d0505880df..e1ac5091fe4f8 100644 --- a/postgres/tests/test_unit.py +++ b/postgres/tests/test_unit.py @@ -336,3 +336,46 @@ def test_new_connection_closes_conn_when_configure_raises(integration_check, pg_ with pytest.raises(psycopg.Error): check._new_connection(check._config.dbname) conn.close.assert_called_once() + + +def test_close_db_closes_open_connection(integration_check, pg_instance): + check = integration_check(pg_instance) + conn = mock.MagicMock() + conn.closed = False + check._db = conn + + check._close_db() + + conn.close.assert_called_once() + assert check._db is None + + +def test_close_db_handles_already_closed_connection(integration_check, pg_instance): + check = integration_check(pg_instance) + conn = mock.MagicMock() + conn.close.side_effect = Exception("already closed") + check._db = conn + + check._close_db() + + assert check._db is None + + +def test_close_db_noop_when_no_connection(integration_check, pg_instance): + check = integration_check(pg_instance) + check._db = None + + check._close_db() + + assert check._db is None + + +def test_cancel_closes_main_db_connection(integration_check, pg_instance): + check = integration_check(pg_instance) + conn = mock.MagicMock() + check._db = conn + + check.cancel() + + conn.close.assert_called_once() + assert check._db is None From 2ba7f1ef9da46e3664dc5b07b1f3d86981446f85 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 10:15:41 -0400 Subject: [PATCH 02/12] Remove unused db_pool reference --- postgres/datadog_checks/postgres/metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index cd341d12b23e5..f2824cf06fc2b 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -108,7 +108,6 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): ) self._check = check self._config = config - self.db_pool = self._check.db_pool self._collect_pg_settings_enabled = config.collect_settings.enabled self._collect_extensions_enabled = self._collect_pg_settings_enabled self._collect_schemas_enabled = config.collect_schemas.enabled From 7ee78ed6a4c839b9e1b4ad51e8d30b566d8fe0f8 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 10:19:20 -0400 Subject: [PATCH 03/12] Avoid extra reference to db_pool --- postgres/datadog_checks/postgres/statement_samples.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py index e8b45dd1ef891..dc951d50c1b21 100644 --- a/postgres/datadog_checks/postgres/statement_samples.py +++ b/postgres/datadog_checks/postgres/statement_samples.py @@ -152,8 +152,6 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): if not config.query_samples.enabled: collection_interval = config.query_activity.collection_interval - self.db_pool = check.db_pool - super(PostgresStatementSamples, self).__init__( check, rate_limit=1 / collection_interval, @@ -646,7 +644,7 @@ def _can_explain_statement(self, obfuscated_statement): def _get_db_explain_setup_state(self, dbname): # type: (str) -> Tuple[Optional[DBExplainError], Optional[Exception]] try: - self.db_pool.get_connection(dbname) + self._check.db_pool.get_connection(dbname) except psycopg.OperationalError as e: self._log.warning( "cannot collect execution plans due to failed DB connection to dbname=%s: %s", dbname, repr(e) @@ -712,7 +710,7 @@ def _run_explain(self, dbname, statement, obfuscated_statement): start_time = time.time() if self._cancel_event.is_set(): raise Exception("Job loop cancelled. Aborting query.") - with self.db_pool.get_connection(dbname) as conn: + with self._check.db_pool.get_connection(dbname) as conn: # When sending potentially non-ascii data, e.g. UTF8, we need to force # the client encoding to UTF-8 to match Python string encoding if conn.info.encoding.lower() in ["ascii", "sqlascii", "sql_ascii"]: From 2c7ad0cca25e7ce7516a89f562eb59fb909fb8b8 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 10:21:22 -0400 Subject: [PATCH 04/12] Deference cyclical check variable on shutdown --- postgres/datadog_checks/postgres/data_observability.py | 7 +++++++ postgres/datadog_checks/postgres/metadata.py | 7 +++++++ postgres/datadog_checks/postgres/statement_samples.py | 7 +++++++ postgres/datadog_checks/postgres/statements.py | 7 +++++++ 4 files changed, 28 insertions(+) diff --git a/postgres/datadog_checks/postgres/data_observability.py b/postgres/datadog_checks/postgres/data_observability.py index 6f4107b29e5ab..cfecc964e4dea 100644 --- a/postgres/datadog_checks/postgres/data_observability.py +++ b/postgres/datadog_checks/postgres/data_observability.py @@ -36,8 +36,15 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): min_collection_interval=config.min_collection_interval, expected_db_exceptions=(psycopg.errors.DatabaseError,), job_name="data-observability", + shutdown_callback=self._shutdown, ) + def _shutdown(self): + try: + self._check = None + except Exception: + pass + @property def _do_config(self): return self._config.data_observability diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index f2824cf06fc2b..70e2a610c5452 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -105,6 +105,7 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): min_collection_interval=config.min_collection_interval, expected_db_exceptions=(psycopg.errors.DatabaseError,), job_name="database-metadata", + shutdown_callback=self._shutdown, ) self._check = check self._config = config @@ -121,6 +122,12 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): self._tags_no_db = None self.tags = None + def _shutdown(self): + try: + self._check = None + except Exception: + pass + def _dbtags(self, db, *extra_tags): """ Returns the default instance tags with the initial "db" tag replaced with the provided tag diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py index dc951d50c1b21..058995991b65f 100644 --- a/postgres/datadog_checks/postgres/statement_samples.py +++ b/postgres/datadog_checks/postgres/statement_samples.py @@ -161,6 +161,7 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): min_collection_interval=config.min_collection_interval, expected_db_exceptions=(psycopg.errors.DatabaseError,), job_name="query-samples", + shutdown_callback=self._shutdown, ) self._check = check self._config = config @@ -220,6 +221,12 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): self._time_since_last_activity_event = 0 self._pg_stat_activity_cols = None + def _shutdown(self): + try: + self._check = None + except Exception: + pass + def _dbtags(self, db, *extra_tags): """ Returns the default instance tags with the initial "db" tag replaced with the provided tag diff --git a/postgres/datadog_checks/postgres/statements.py b/postgres/datadog_checks/postgres/statements.py index 8934812ecc79f..55b907c18e6d0 100644 --- a/postgres/datadog_checks/postgres/statements.py +++ b/postgres/datadog_checks/postgres/statements.py @@ -169,6 +169,7 @@ def __init__(self, check, config: InstanceConfig): dbms="postgres", rate_limit=1 / float(collection_interval), job_name="query-metrics", + shutdown_callback=self._shutdown, ) self._check = check self._metrics_collection_interval = collection_interval @@ -202,6 +203,12 @@ def __init__(self, check, config: InstanceConfig): ttl=60 * 60 / config.query_metrics.full_statement_text_samples_per_hour_per_query, ) + def _shutdown(self): + try: + self._check = None + except Exception: + pass + def _execute_query(self, query, params=(), binary=False, row_factory=None) -> Tuple[list, list]: if self._cancel_event.is_set(): raise Exception("Job loop cancelled. Aborting query.") From e4706b3d0583c0e47d035a58d7e8d27e5c5d103f Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 10:23:40 -0400 Subject: [PATCH 05/12] Clear functools.partial closures in cancel() to release pool references _dynamic_queries holds partial(db_pool.get_connection, ...) closures and _query_manager.executor holds a partial(self.execute_query_raw, db=self.db) closure. Both root the check instance and pool manager in memory after cancel. --- postgres/datadog_checks/postgres/postgres.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/postgres/datadog_checks/postgres/postgres.py b/postgres/datadog_checks/postgres/postgres.py index f781aba423bde..b62c6ac2cf4bf 100644 --- a/postgres/datadog_checks/postgres/postgres.py +++ b/postgres/datadog_checks/postgres/postgres.py @@ -495,6 +495,8 @@ def cancel(self): self.data_observability.cancel() if self.data_observability._job_loop_future: self.data_observability._job_loop_future.result() + self._dynamic_queries = [] + self._query_manager.executor = None self._close_db() self._close_db_pool() From fcc37f2886001610c555de51e94be32b64bc0a23 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 10:24:48 -0400 Subject: [PATCH 06/12] Null _job_loop_future after join in cancel() to release completed Futures --- postgres/datadog_checks/postgres/postgres.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/postgres/datadog_checks/postgres/postgres.py b/postgres/datadog_checks/postgres/postgres.py index b62c6ac2cf4bf..0d326ddefda41 100644 --- a/postgres/datadog_checks/postgres/postgres.py +++ b/postgres/datadog_checks/postgres/postgres.py @@ -483,18 +483,23 @@ def cancel(self): self.metadata_samples.cancel() if self.statement_metrics._job_loop_future: self.statement_metrics._job_loop_future.result() + self.statement_metrics._job_loop_future = None if self.statement_samples._job_loop_future: self.statement_samples._job_loop_future.result() + self.statement_samples._job_loop_future = None if self.metadata_samples._job_loop_future: self.metadata_samples._job_loop_future.result() + self.metadata_samples._job_loop_future = None elif self._config.data_observability.enabled: self.metadata_samples.cancel() if self.metadata_samples._job_loop_future: self.metadata_samples._job_loop_future.result() + self.metadata_samples._job_loop_future = None if self._config.data_observability.enabled: self.data_observability.cancel() if self.data_observability._job_loop_future: self.data_observability._job_loop_future.result() + self.data_observability._job_loop_future = None self._dynamic_queries = [] self._query_manager.executor = None self._close_db() From 3901ffe755b327e6655fd5b2188b691be3563304 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 10:39:24 -0400 Subject: [PATCH 07/12] =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20dbms?= =?UTF-8?q?=3D"postgres",=20=5Fcompiled=5Fpatterns=5Fcache=20=E2=8C=98K=20?= =?UTF-8?q?to=20generate=20command=20ok=20next=20=20commit=20message=20sug?= =?UTF-8?q?gestion=20=20Null=20=5Fjob=5Floop=5Ffuture=20after=20join=20in?= =?UTF-8?q?=20cancel()=20to=20release=20completed=20Futures=20Completed=20?= =?UTF-8?q?Future=20objects=20are=20lightweight=20but=20keep=20the=20async?= =?UTF-8?q?=20job=20reachable=20in=20the=20reference=20graph.=20Nulling=20?= =?UTF-8?q?them=20immediately=20after=20.result()=20returns=20removes=20an?= =?UTF-8?q?=20unnecessary=20link=20that=20delays=20GC=20of=20the=20check?= =?UTF-8?q?=20instance.=20ok=20next=20=20Let=20me=20check=20the=20current?= =?UTF-8?q?=20state=20of=20each=20=5Fshutdown=20method=20and=20the=20cache?= =?UTF-8?q?s=20each=20job=20holds.=20=20Now=20I'll=20update=20each=20=5Fsh?= =?UTF-8?q?utdown=20to=20clear=20caches.=20=20StatementMetrics=20--=20=5Ff?= =?UTF-8?q?ull=5Fstatement=5Ftext=5Fcache,=20=5Fstate,=20=5Fquery=5Fcalls?= =?UTF-8?q?=5Fcache,=20=5Fbaseline=5Fmetrics:=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20self.=5Fcheck=20=3D=20None=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20self.=5Ffull=5Fstatement=5Ftext=5Fcache=20=3D?= =?UTF-8?q?=20None=20=20=20=20=20=20=20=20=20=20=20=20=20self.=5Fstate=20?= =?UTF-8?q?=3D=20None=20=20=20=20=20=20=20=20=20=20=20=20=20self.=5Fquery?= =?UTF-8?q?=5Fcalls=5Fcache=20=3D=20None=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20self.=5Fbaseline=5Fmetrics=20=3D=20None=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20except=20Exception:=20StatementSamples=20--=20=5Fcolle?= =?UTF-8?q?ction=5Fstrategy=5Fcache,=20=5Fexplain=5Ferrors=5Fcache,=20=5Fe?= =?UTF-8?q?xplained=5Fstatements=5Fratelimiter,=20=5Fseen=5Fsamples=5Frate?= =?UTF-8?q?limiter,=20=5Fraw=5Fstatement=5Ftext=5Fcache:=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20self.=5Fcheck=20=3D=20None=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20self.=5Fcollection=5Fstrategy=5Fcach?= =?UTF-8?q?e=20=3D=20None=20=20=20=20=20=20=20=20=20=20=20=20=20self.=5Fex?= =?UTF-8?q?plain=5Ferrors=5Fcache=20=3D=20None=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20self.=5Fexplained=5Fstatements=5Fratelimiter=20=3D?= =?UTF-8?q?=20None=20=20=20=20=20=20=20=20=20=20=20=20=20self.=5Fseen=5Fsa?= =?UTF-8?q?mples=5Fratelimiter=20=3D=20None=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20self.=5Fraw=5Fstatement=5Ftext=5Fcache=20=3D=20None=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20except=20Exception:=20Metadata=20--=20?= =?UTF-8?q?=5Fschema=5Fcollector=20(holds=20back-ref=20to=20check)=20and?= =?UTF-8?q?=20=5Fcompiled=5Fpatterns=5Fcache:=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20self.=5Fcheck=20=3D=20None=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20self.=5Fschema=5Fcollector=20=3D=20None=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20self.=5Fcompiled=5Fpatterns=5F?= =?UTF-8?q?cache=20=3D=20None=20=20=20=20=20=20=20=20=20except=20Exception?= =?UTF-8?q?:=20DataObservability=20only=20has=20a=20small=20=5Flast=5Fexec?= =?UTF-8?q?ution=20dict=20--=20not=20worth=20clearing.=20Its=20=5Fshutdown?= =?UTF-8?q?=20already=20just=20nulls=20=5Fcheck,=20which=20is=20sufficient?= =?UTF-8?q?.=20=20All=20pre-existing.=20Done.=20Ready=20for=20your=20commi?= =?UTF-8?q?t.=20=20commit=20message=20suggestion=20=20Clear=20caches=20in?= =?UTF-8?q?=20async=20job=20shutdown=20callbacks=20to=20free=20memory=20ea?= =?UTF-8?q?gerly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- postgres/datadog_checks/postgres/metadata.py | 2 ++ postgres/datadog_checks/postgres/statement_samples.py | 5 +++++ postgres/datadog_checks/postgres/statements.py | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index 70e2a610c5452..bef3aa1f96c6c 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -125,6 +125,8 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): def _shutdown(self): try: self._check = None + self._schema_collector = None + self._compiled_patterns_cache = None except Exception: pass diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py index 058995991b65f..9548773bb67eb 100644 --- a/postgres/datadog_checks/postgres/statement_samples.py +++ b/postgres/datadog_checks/postgres/statement_samples.py @@ -224,6 +224,11 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): def _shutdown(self): try: self._check = None + self._collection_strategy_cache = None + self._explain_errors_cache = None + self._explained_statements_ratelimiter = None + self._seen_samples_ratelimiter = None + self._raw_statement_text_cache = None except Exception: pass diff --git a/postgres/datadog_checks/postgres/statements.py b/postgres/datadog_checks/postgres/statements.py index 55b907c18e6d0..de98a4c49001a 100644 --- a/postgres/datadog_checks/postgres/statements.py +++ b/postgres/datadog_checks/postgres/statements.py @@ -206,6 +206,10 @@ def __init__(self, check, config: InstanceConfig): def _shutdown(self): try: self._check = None + self._full_statement_text_cache = None + self._state = None + self._query_calls_cache = None + self._baseline_metrics = None except Exception: pass From fd9935ef8b450528eaa2b3f48971adc066a5d9f3 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 11:00:24 -0400 Subject: [PATCH 08/12] Add changelog --- postgres/changelog.d/23640.fixed | 1 + 1 file changed, 1 insertion(+) create mode 100644 postgres/changelog.d/23640.fixed diff --git a/postgres/changelog.d/23640.fixed b/postgres/changelog.d/23640.fixed new file mode 100644 index 0000000000000..92778f0dd54ad --- /dev/null +++ b/postgres/changelog.d/23640.fixed @@ -0,0 +1 @@ +Close dangling connections and break reference cycles on check cancel to reduce memory retention when checks are restarted or rescheduled \ No newline at end of file From a92571468c6d1bea5b7b766062ccc4240d79d3e4 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 11:35:04 -0400 Subject: [PATCH 09/12] Only invoke shutdown logic is cancel called --- postgres/datadog_checks/postgres/data_observability.py | 2 ++ postgres/datadog_checks/postgres/metadata.py | 2 ++ postgres/datadog_checks/postgres/statement_samples.py | 2 ++ postgres/datadog_checks/postgres/statements.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/postgres/datadog_checks/postgres/data_observability.py b/postgres/datadog_checks/postgres/data_observability.py index cfecc964e4dea..da2994722b395 100644 --- a/postgres/datadog_checks/postgres/data_observability.py +++ b/postgres/datadog_checks/postgres/data_observability.py @@ -40,6 +40,8 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): ) def _shutdown(self): + if not self._cancel_event.is_set(): + return try: self._check = None except Exception: diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index bef3aa1f96c6c..aeb0905ada98f 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -123,6 +123,8 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): self.tags = None def _shutdown(self): + if not self._cancel_event.is_set(): + return try: self._check = None self._schema_collector = None diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py index 9548773bb67eb..0d4422703407c 100644 --- a/postgres/datadog_checks/postgres/statement_samples.py +++ b/postgres/datadog_checks/postgres/statement_samples.py @@ -222,6 +222,8 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): self._pg_stat_activity_cols = None def _shutdown(self): + if not self._cancel_event.is_set(): + return try: self._check = None self._collection_strategy_cache = None diff --git a/postgres/datadog_checks/postgres/statements.py b/postgres/datadog_checks/postgres/statements.py index de98a4c49001a..02bd3f0060687 100644 --- a/postgres/datadog_checks/postgres/statements.py +++ b/postgres/datadog_checks/postgres/statements.py @@ -204,6 +204,8 @@ def __init__(self, check, config: InstanceConfig): ) def _shutdown(self): + if not self._cancel_event.is_set(): + return try: self._check = None self._full_statement_text_cache = None From 99c8cff551adad62232addeccb7980ff36594fa8 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 11:35:15 -0400 Subject: [PATCH 10/12] Update tests --- postgres/tests/test_pg_integration.py | 2 +- postgres/tests/test_statements.py | 8 ++++---- postgres/tests/test_unit.py | 2 +- postgres/tests/utils.py | 8 +------- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/postgres/tests/test_pg_integration.py b/postgres/tests/test_pg_integration.py index 18aa7a510d4a8..6e27a1f974fa4 100644 --- a/postgres/tests/test_pg_integration.py +++ b/postgres/tests/test_pg_integration.py @@ -857,7 +857,7 @@ def test_database_instance_metadata(aggregator, pg_instance, dbm_enabled, report 'database_instance:{}'.format(expected_database_instance), ] check = integration_check(pg_instance) - run_one_check(check) + run_one_check(check, cancel=False) # These tags are a bit dynamic in value, so we get them from the check and ensure they are present expected_tags.append('postgresql_version:{}'.format(check.raw_version)) diff --git a/postgres/tests/test_statements.py b/postgres/tests/test_statements.py index 10c3c363f7f49..f2983eae8befe 100644 --- a/postgres/tests/test_statements.py +++ b/postgres/tests/test_statements.py @@ -1215,7 +1215,7 @@ def test_activity_reported_hostname( check = integration_check(dbm_instance) check._connect() - run_one_check(check) + run_one_check(check, cancel=False) run_one_check(check) dbm_activity = aggregator.get_event_platform_events("dbm-activity") @@ -1480,7 +1480,7 @@ def test_async_job_enabled( dbm_instance['query_metrics'] = {'enabled': statement_metrics_enabled, 'run_sync': False} check = integration_check(dbm_instance) check._connect() - run_one_check(check) + run_one_check(check, cancel=False) if statement_samples_enabled or statement_activity_enabled: assert check.statement_samples._job_loop_future is not None else: @@ -1713,8 +1713,8 @@ def test_async_job_cancel_cancel(aggregator, integration_check, dbm_instance): check = integration_check(dbm_instance) check._connect() run_one_check(check) - assert not check.statement_samples._job_loop_future.running(), "samples thread should be stopped" - assert not check.statement_metrics._job_loop_future.running(), "metrics thread should be stopped" + assert check.statement_samples._job_loop_future is None, "samples future should be cleaned up after cancel" + assert check.statement_metrics._job_loop_future is None, "metrics future should be cleaned up after cancel" # if the thread doesn't start until after the cancel signal is set then the db connection will never # be created in the first place assert check.db_pool.pools.get(dbm_instance['dbname']) is None, "db connection should be gone" diff --git a/postgres/tests/test_unit.py b/postgres/tests/test_unit.py index e1ac5091fe4f8..f446d3eda9ef1 100644 --- a/postgres/tests/test_unit.py +++ b/postgres/tests/test_unit.py @@ -315,7 +315,7 @@ def test_run_explain_uses_parameterized_statement(pg_instance, integration_check conn_cm = mock.MagicMock() conn_cm.__enter__.return_value = mock_conn - with mock.patch.object(check.statement_samples.db_pool, 'get_connection', return_value=conn_cm): + with mock.patch.object(check.db_pool, 'get_connection', return_value=conn_cm): with mock.patch.object(check, 'histogram'): check.statement_samples._run_explain('testdb', statement, statement) diff --git a/postgres/tests/utils.py b/postgres/tests/utils.py index 30d0181ec3ad1..fd948c4781ba0 100644 --- a/postgres/tests/utils.py +++ b/postgres/tests/utils.py @@ -137,17 +137,11 @@ def run_vacuum_thread(pg_instance, vacuum_query, application_name='test'): def run_one_check(check: AgentCheck, cancel=True): """ Run check and immediately cancel. - Waits for all threads to close before continuing. + cancel() joins all threads and nulls futures, so no extra .result() calls needed. """ check.run() if cancel: check.cancel() - if check.statement_samples._job_loop_future is not None: - check.statement_samples._job_loop_future.result() - if check.statement_metrics._job_loop_future is not None: - check.statement_metrics._job_loop_future.result() - if check.metadata_samples._job_loop_future is not None: - check.metadata_samples._job_loop_future.result() def normalize_object(obj): From 171b38875664b5669fc488b8837fd6a1c4bc5397 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 12:31:09 -0400 Subject: [PATCH 11/12] Call shutdown explicitly --- .../postgres/data_observability.py | 8 +------- postgres/datadog_checks/postgres/metadata.py | 12 +++--------- postgres/datadog_checks/postgres/postgres.py | 5 +++++ .../postgres/statement_samples.py | 18 ++++++------------ postgres/datadog_checks/postgres/statements.py | 16 +++++----------- 5 files changed, 20 insertions(+), 39 deletions(-) diff --git a/postgres/datadog_checks/postgres/data_observability.py b/postgres/datadog_checks/postgres/data_observability.py index da2994722b395..5dce7eabb5b01 100644 --- a/postgres/datadog_checks/postgres/data_observability.py +++ b/postgres/datadog_checks/postgres/data_observability.py @@ -36,16 +36,10 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): min_collection_interval=config.min_collection_interval, expected_db_exceptions=(psycopg.errors.DatabaseError,), job_name="data-observability", - shutdown_callback=self._shutdown, ) def _shutdown(self): - if not self._cancel_event.is_set(): - return - try: - self._check = None - except Exception: - pass + self._check = None @property def _do_config(self): diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index aeb0905ada98f..d8a2959754291 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -105,7 +105,6 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): min_collection_interval=config.min_collection_interval, expected_db_exceptions=(psycopg.errors.DatabaseError,), job_name="database-metadata", - shutdown_callback=self._shutdown, ) self._check = check self._config = config @@ -123,14 +122,9 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): self.tags = None def _shutdown(self): - if not self._cancel_event.is_set(): - return - try: - self._check = None - self._schema_collector = None - self._compiled_patterns_cache = None - except Exception: - pass + self._check = None + self._schema_collector = None + self._compiled_patterns_cache = None def _dbtags(self, db, *extra_tags): """ diff --git a/postgres/datadog_checks/postgres/postgres.py b/postgres/datadog_checks/postgres/postgres.py index 0d326ddefda41..abe41ba83360d 100644 --- a/postgres/datadog_checks/postgres/postgres.py +++ b/postgres/datadog_checks/postgres/postgres.py @@ -490,16 +490,21 @@ def cancel(self): if self.metadata_samples._job_loop_future: self.metadata_samples._job_loop_future.result() self.metadata_samples._job_loop_future = None + self.statement_metrics._shutdown() + self.statement_samples._shutdown() + self.metadata_samples._shutdown() elif self._config.data_observability.enabled: self.metadata_samples.cancel() if self.metadata_samples._job_loop_future: self.metadata_samples._job_loop_future.result() self.metadata_samples._job_loop_future = None + self.metadata_samples._shutdown() if self._config.data_observability.enabled: self.data_observability.cancel() if self.data_observability._job_loop_future: self.data_observability._job_loop_future.result() self.data_observability._job_loop_future = None + self.data_observability._shutdown() self._dynamic_queries = [] self._query_manager.executor = None self._close_db() diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py index 0d4422703407c..72a669a2aec24 100644 --- a/postgres/datadog_checks/postgres/statement_samples.py +++ b/postgres/datadog_checks/postgres/statement_samples.py @@ -161,7 +161,6 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): min_collection_interval=config.min_collection_interval, expected_db_exceptions=(psycopg.errors.DatabaseError,), job_name="query-samples", - shutdown_callback=self._shutdown, ) self._check = check self._config = config @@ -222,17 +221,12 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): self._pg_stat_activity_cols = None def _shutdown(self): - if not self._cancel_event.is_set(): - return - try: - self._check = None - self._collection_strategy_cache = None - self._explain_errors_cache = None - self._explained_statements_ratelimiter = None - self._seen_samples_ratelimiter = None - self._raw_statement_text_cache = None - except Exception: - pass + self._check = None + self._collection_strategy_cache = None + self._explain_errors_cache = None + self._explained_statements_ratelimiter = None + self._seen_samples_ratelimiter = None + self._raw_statement_text_cache = None def _dbtags(self, db, *extra_tags): """ diff --git a/postgres/datadog_checks/postgres/statements.py b/postgres/datadog_checks/postgres/statements.py index 02bd3f0060687..b515b717c599d 100644 --- a/postgres/datadog_checks/postgres/statements.py +++ b/postgres/datadog_checks/postgres/statements.py @@ -169,7 +169,6 @@ def __init__(self, check, config: InstanceConfig): dbms="postgres", rate_limit=1 / float(collection_interval), job_name="query-metrics", - shutdown_callback=self._shutdown, ) self._check = check self._metrics_collection_interval = collection_interval @@ -204,16 +203,11 @@ def __init__(self, check, config: InstanceConfig): ) def _shutdown(self): - if not self._cancel_event.is_set(): - return - try: - self._check = None - self._full_statement_text_cache = None - self._state = None - self._query_calls_cache = None - self._baseline_metrics = None - except Exception: - pass + self._check = None + self._full_statement_text_cache = None + self._state = None + self._query_calls_cache = None + self._baseline_metrics = None def _execute_query(self, query, params=(), binary=False, row_factory=None) -> Tuple[list, list]: if self._cancel_event.is_set(): From bef3c81e8c771cb41b2bdc88304132e7ba26a139 Mon Sep 17 00:00:00 2001 From: Eric Weaver Date: Fri, 8 May 2026 16:39:58 -0400 Subject: [PATCH 12/12] Find and break cyclical references on cancel --- postgres/datadog_checks/postgres/postgres.py | 51 +++++++++---------- .../postgres/statement_samples.py | 1 + postgres/tests/test_unit.py | 47 ++++++++++++++++- 3 files changed, 71 insertions(+), 28 deletions(-) diff --git a/postgres/datadog_checks/postgres/postgres.py b/postgres/datadog_checks/postgres/postgres.py index e233aeb4d4f0e..caab366180792 100644 --- a/postgres/datadog_checks/postgres/postgres.py +++ b/postgres/datadog_checks/postgres/postgres.py @@ -17,6 +17,7 @@ from datadog_checks.base.utils.db.core import QueryManager from datadog_checks.base.utils.db.health import HealthEvent, HealthStatus from datadog_checks.base.utils.db.utils import ( + DBMAsyncJob, default_json_event_encoding, tracked_query, ) @@ -472,42 +473,38 @@ def dynamic_queries(self): return self._dynamic_queries + @staticmethod + def _cancel_async_job(job: DBMAsyncJob): + job.cancel() + if job._job_loop_future: + job._job_loop_future.result() + job._job_loop_future = None + job._shutdown() + def cancel(self): """ Cancels and sends cancel signal to all threads. """ if self._config.dbm: - self.statement_samples.cancel() - self.statement_metrics.cancel() - self.metadata_samples.cancel() - if self.statement_metrics._job_loop_future: - self.statement_metrics._job_loop_future.result() - self.statement_metrics._job_loop_future = None - if self.statement_samples._job_loop_future: - self.statement_samples._job_loop_future.result() - self.statement_samples._job_loop_future = None - if self.metadata_samples._job_loop_future: - self.metadata_samples._job_loop_future.result() - self.metadata_samples._job_loop_future = None - self.statement_metrics._shutdown() - self.statement_samples._shutdown() - self.metadata_samples._shutdown() + self._cancel_async_job(self.statement_metrics) + self._cancel_async_job(self.statement_samples) + self._cancel_async_job(self.metadata_samples) elif self._config.data_observability.enabled: - self.metadata_samples.cancel() - if self.metadata_samples._job_loop_future: - self.metadata_samples._job_loop_future.result() - self.metadata_samples._job_loop_future = None - self.metadata_samples._shutdown() + self._cancel_async_job(self.metadata_samples) if self._config.data_observability.enabled: - self.data_observability.cancel() - if self.data_observability._job_loop_future: - self.data_observability._job_loop_future.result() - self.data_observability._job_loop_future = None - self.data_observability._shutdown() - self._dynamic_queries = [] - self._query_manager.executor = None + self._cancel_async_job(self.data_observability) + self._clean_state() + self._query_manager = None + self.health = None + self.check_initializations.clear() + # TODO: move diagnosis cleanup into AgentCheck.cancel() in the base class + self._diagnosis = None self._close_db() self._close_db_pool() + # CheckLoggingAdapter holds self.check until check_id is resolved via + # process(), which only happens after the agent scheduler calls run(). + # If cancel() is called before that, the back-reference is never cleared. + self.log.check = None def _clean_state(self): self.log.debug("Cleaning state") diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py index 72a669a2aec24..b4c731a270cc3 100644 --- a/postgres/datadog_checks/postgres/statement_samples.py +++ b/postgres/datadog_checks/postgres/statement_samples.py @@ -222,6 +222,7 @@ def __init__(self, check: PostgreSql, config: InstanceConfig): def _shutdown(self): self._check = None + self._explain_parameterized_queries = None self._collection_strategy_cache = None self._explain_errors_cache = None self._explained_statements_ratelimiter = None diff --git a/postgres/tests/test_unit.py b/postgres/tests/test_unit.py index f446d3eda9ef1..decd10285d07d 100644 --- a/postgres/tests/test_unit.py +++ b/postgres/tests/test_unit.py @@ -2,6 +2,8 @@ # All rights reserved # Licensed under Simplified BSD License (see LICENSE) import copy +import gc +import weakref import mock import psycopg @@ -9,7 +11,7 @@ from pytest import fail from semver import VersionInfo -from datadog_checks.postgres import util +from datadog_checks.postgres import PostgreSql, util from datadog_checks.postgres.schemas import PostgresSchemaCollector pytestmark = pytest.mark.unit @@ -379,3 +381,46 @@ def test_cancel_closes_main_db_connection(integration_check, pg_instance): conn.close.assert_called_once() assert check._db is None + + +def test_check_gc_after_cancel(pg_instance): + """Verify cancel() breaks all reference cycles so refcount alone reclaims the check. + + If this test fails, the assertion message lists the types still holding a + reference to the check. To fix it: + + 1. Identify the referrer type in the failure message (e.g. ``QueryManager``). + 2. Find which attribute on that object points back to the check (usually + ``self.check`` or ``self._check``). + 3. Null that attribute in ``cancel()`` or add it to the relevant + ``_shutdown()`` method. + 4. If the referrer is a closure or ``functools.partial``, find the + registration site and null or clear the container that holds it. + """ + pg_instance['dbm'] = True + pg_instance['query_samples'] = {'enabled': True, 'run_sync': True, 'collection_interval': 1} + pg_instance['query_metrics'] = {'enabled': True, 'run_sync': True, 'collection_interval': 10} + pg_instance['query_activity'] = {'enabled': True, 'collection_interval': 1} + pg_instance['data_observability'] = {'enabled': True, 'run_sync': True, 'collection_interval': 1} + + check = PostgreSql('postgres', {}, [pg_instance]) + ref = weakref.ref(check) + + check.cancel() + + gc.collect() + gc.disable() + try: + del check + obj = ref() + if obj is not None: + import inspect + + referrers = [ + f"bound method {r.__qualname__}" if inspect.ismethod(r) else type(r).__name__ + for r in gc.get_referrers(obj) + ] + del obj + fail(f"Check still alive after cancel() + del -- pinned by: {referrers}") + finally: + gc.enable()