From 0f95d725c02d26646d9d3ce5346f8f55ca89c5ed Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sat, 30 May 2026 22:02:33 +0000
Subject: [PATCH] perf: optimize list deduplication

Co-authored-by: SatoryKono <13055362+SatoryKono@users.noreply.github.com>
---
Review notes (this section is ignored by `git am`):
- deduplicate_preserving_order() keeps its signature and result: dict keys
  retain first-insertion order, so list(dict.fromkeys(values)) yields the
  first occurrence of each value in input order, as the removed loop did.
- Line breaks in this patch were reconstructed; the hunk layout (3 leading
  and 3 trailing context lines) is inferred from the @@ -35,14 +35,9 @@
  counts. Verify with `git apply --check` before applying.

 .../infrastructure/adapters/common/deduplication.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/bioetl/infrastructure/adapters/common/deduplication.py b/src/bioetl/infrastructure/adapters/common/deduplication.py
index 18e1a8f9b9..e6dd308fb5 100644
--- a/src/bioetl/infrastructure/adapters/common/deduplication.py
+++ b/src/bioetl/infrastructure/adapters/common/deduplication.py
@@ -35,14 +35,9 @@
 
 def deduplicate_preserving_order(values: Iterable[str]) -> list[str]:
     """Return unique values while preserving the original order."""
-    unique_values: list[str] = []
-    seen_values: set[str] = set()
-    for value in values:
-        if value in seen_values:
-            continue
-        seen_values.add(value)
-        unique_values.append(value)
-    return unique_values
+    # Optimization: dict.fromkeys leverages C-level iteration and insertion order
+    # preservation to deduplicate faster than a pure-Python seen-set loop.
+    return list(dict.fromkeys(values))
 
 
 def iter_deduplicated_records(