From 233919b7c4cbbba82862de0944ccc0356acd50a1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:24:33 +0000 Subject: [PATCH 1/4] Initial plan From 6a2cf3f7781d07d391e5242d6373094b0c45d7c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:31:54 +0000 Subject: [PATCH 2/4] Simplify config and README documentation Co-authored-by: khnumdev <13968776+khnumdev@users.noreply.github.com> --- README.md | 69 ++++++++++++++----------------- commands/analyze_entity_fields.py | 8 +--- commands/config.py | 6 +-- 3 files changed, 37 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 8fe66b9..1456cd6 100644 --- a/README.md +++ b/README.md @@ -83,40 +83,35 @@ brew install python@3.12 ## Configuration -- Create a local `config.yaml` in your working directory. It is gitignored and not included in the repo. -- Any CLI flag overrides values from `config.yaml`. -- If neither config nor flags provide a value, the tool falls back to environment variables (for emulator detection) or sensible defaults. +Create an optional `config.yaml` in your working directory to customize behavior. **By default, all commands iterate over all namespaces and all kinds** unless you specify filters. -Example `config.yaml` (full example with comments): +### Minimal Example ```yaml -# Project / environment -project_id: "my-project" # (string) GCP project id. If omitted, ADC or DATASTORE_PROJECT_ID env var will be used. -emulator_host: "localhost:8010" # (string) Datastore emulator host (host:port). If set, the emulator path is used. +# Optional: specify project and emulator +project_id: "my-project" +emulator_host: "localhost:8010" +``` -# Explicit filters (empty -> iterate all) -namespaces: [""] # (list) Namespaces to include. [""] means include default namespace and allow discovery of others. -kinds: [] # (list) Kinds to include. Empty/omit means discover all kinds per namespace. +### Common Options -# Defaults used by some commands (optional) -kind: "" # (string) Default kind used by analyze-fields when CLI --kind is not provided. -namespace: "" # (string) Default namespace used when CLI --namespace is omitted. +```yaml +# Optional filters (omit to process all namespaces and kinds) +namespaces: ["custom-ns"] # List specific namespaces, or omit to process all +kinds: ["MyKind"] # List specific kinds, or omit to process all # Cleanup settings -ttl_field: "expireAt" # (string) Property name that contains the TTL/expiry timestamp. -delete_missing_ttl: true # (bool) If true, entities missing the TTL field will be deleted by cleanup. -batch_size: 500 # (int) Number of keys to delete per batch when running cleanup (tunable). +ttl_field: "expireAt" # Field name containing expiry timestamp +batch_size: 500 # Delete batch size # Analysis settings -group_by_field: null # (string|null) Field name to group analysis by (e.g., batchId). Null means no grouping. -sample_size: 500 # (int) Max entities to sample per-kind/per-group to bound analysis work. Set 0 or null to disable sampling. -enable_parallel: true # (bool) Enable multi-threaded processing for analysis and deletion. Set false to force single-threaded. - -# Logging -log_level: "INFO" # (string) Logging level (DEBUG, INFO, WARNING, ERROR). +sample_size: 500 # Max entities to sample per analysis (0 = no limit) ``` -The keys above map directly to CLI flags (CLI flags override values in `config.yaml`). Omit any option to use sensible defaults. +**Notes:** +- CLI flags always override config values +- If no config is provided, sensible defaults are used +- Environment variables `DATASTORE_PROJECT_ID` and `DATASTORE_EMULATOR_HOST` are also supported ## Quickstart @@ -148,20 +143,21 @@ Use these targets to get a working dev environment quickly. ### Basic CLI examples ```bash -# list kinds (scans stats or samples) -python3 cli.py analyze-kinds --project my-project +# Analyze all kinds in all namespaces (default behavior) +lsu analyze-kinds -# analyze fields for a kind -python3 cli.py analyze-fields --kind MyKind --group-by batchId +# Analyze specific kind across all namespaces +lsu analyze-fields --kind MyKind -# dry-run cleanup sample -python3 cli.py cleanup --ttl-field expireAt --dry-run -``` +# Analyze with grouping +lsu analyze-fields --kind MyKind --group-by batchId -### Configuration +# Dry-run cleanup for all kinds and namespaces +lsu cleanup --dry-run -- Local `config.yaml` is supported; CLI flags override config values. -- Example keys: `project_id`, `emulator_host`, `namespaces`, `kinds`, `kind`, `ttl_field`, `batch_size`, `sample_size`, `enable_parallel`. +# Filter to specific namespace and kind +lsu cleanup --kind MyKind --namespace custom-ns --dry-run +``` ### Emulator & integration testing @@ -198,7 +194,6 @@ The release workflow selects the appropriate token based on the `publish_target` ## Notes -- `sample_size` bounds per-kind/group analysis to avoid scanning entire datasets. Set to 0 or `null` in config to disable sampling. -- `enable_parallel` (default true) enables multi-threaded processing during analysis and deletion; set to false to force single-threaded behavior. - -If you'd like a short walkthrough or to change the default Makefile targets, tell me what you'd prefer and I can adjust the README or Makefile. +- **By default, all commands iterate over all namespaces and all kinds** unless you specify filters via config or CLI flags +- `sample_size` bounds per-kind analysis to avoid scanning entire large datasets (set to 0 to disable) +- Multi-threaded processing is enabled by default for better performance diff --git a/commands/analyze_entity_fields.py b/commands/analyze_entity_fields.py index bb8c653..2cf47a4 100644 --- a/commands/analyze_entity_fields.py +++ b/commands/analyze_entity_fields.py @@ -162,18 +162,14 @@ def analyze_field_contributions( sample_size = getattr(config, "sample_size", 500) enable_parallel = getattr(config, "enable_parallel", True) - # If no namespace provided, or config.namespaces is None/empty, iterate all namespaces + # If no namespace provided, iterate all namespaces if namespace is None: - if hasattr(config, "namespaces") and (not config.namespaces): - ns_list = list_namespaces(client) - else: - ns_list = [namespace] if namespace else list_namespaces(client) + ns_list = list_namespaces(client) results: Dict[str, Dict] = {} for ns in ns_list: results[ns or ""] = _analyze_single_namespace( client, kind=kind, namespace=ns, group_by_field=group_by_field, only_fields=only_fields, sample_size=sample_size ) - return {"by_namespace": results} # Single namespace diff --git a/commands/config.py b/commands/config.py index e7d4ba5..40cbbdc 100644 --- a/commands/config.py +++ b/commands/config.py @@ -69,10 +69,10 @@ def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> config.namespaces = _as_list(merged.get("namespaces")) config.kinds = _as_list(merged.get("kinds")) - # Normalise: treat [""] as empty - if config.namespaces == [""] or config.namespaces is None: + # Normalise: treat [""] as empty (meaning "iterate all") + if config.namespaces == [""]: config.namespaces = [] - if config.kinds == [""] or config.kinds is None: + if config.kinds == [""]: config.kinds = [] # Optional defaults used by some commands From b9e714bdea3e8334be33041609d77d47bebfc57e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:36:33 +0000 Subject: [PATCH 3/4] Add test for default iteration behavior Co-authored-by: khnumdev <13968776+khnumdev@users.noreply.github.com> --- tests/test_config.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_config.py b/tests/test_config.py index cd14f2b..8c5b96a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -17,6 +17,18 @@ def test_load_config_normalizes_namespaces(tmp_path: tempfile.TemporaryDirectory assert cfg.kinds == [] +def test_empty_lists_mean_iterate_all(): + """Empty namespaces and kinds lists should mean 'iterate all'.""" + cfg = AppConfig() + # Default config should have empty lists + assert cfg.namespaces == [] + assert cfg.kinds == [] + + # Empty lists evaluate to False, triggering "iterate all" logic + assert not cfg.namespaces + assert not cfg.kinds + + def test_format_size_small_and_large(): assert format_size(512) == "512.00 B" assert format_size(1024) == "1.00 KB" From 94e87e9780f9f1a8e8ce0dc58243f4e0909e37c4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:38:56 +0000 Subject: [PATCH 4/4] Fix linting issues and modernize type hints Co-authored-by: khnumdev <13968776+khnumdev@users.noreply.github.com> --- commands/analyze_entity_fields.py | 1 - commands/analyze_kinds.py | 9 ++++----- commands/cleanup_expired.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/commands/analyze_entity_fields.py b/commands/analyze_entity_fields.py index 2cf47a4..1945f38 100644 --- a/commands/analyze_entity_fields.py +++ b/commands/analyze_entity_fields.py @@ -56,7 +56,6 @@ def _process_entity(e: datastore.Entity): if enable_parallel: from concurrent.futures import ThreadPoolExecutor, as_completed - results_iter = [] with ThreadPoolExecutor(max_workers=8) as exe: futures = {exe.submit(_process_entity, e): e for e in ents} for fut in tqdm(as_completed(futures), total=len(futures), desc="Analyzing field contributions", unit="entity"): diff --git a/commands/analyze_kinds.py b/commands/analyze_kinds.py index 6ca2b05..6f59cbb 100644 --- a/commands/analyze_kinds.py +++ b/commands/analyze_kinds.py @@ -1,9 +1,8 @@ from __future__ import annotations import logging -from typing import Dict, List, Optional, Tuple +from typing import Dict, List -from google.cloud import datastore from google.cloud.datastore.helpers import entity_to_protobuf from .config import ( @@ -17,7 +16,7 @@ logger = logging.getLogger(__name__) -def get_kind_stats(client, kind: str, namespace: Optional[str] = None) -> Tuple[Optional[int], Optional[int]]: +def get_kind_stats(client, kind: str, namespace: str | None = None) -> tuple[int | None, int | None]: """ Returns (count, bytes) for the given kind/namespace using Datastore statistics. Falls back to None if not found. @@ -39,7 +38,7 @@ def get_kind_stats(client, kind: str, namespace: Optional[str] = None) -> Tuple[ return None, None -def estimate_entity_count_and_size(client, kind: str, namespace: Optional[str], sample_size: int = 100) -> Tuple[int, int]: +def estimate_entity_count_and_size(client, kind: str, namespace: str | None, sample_size: int = 100) -> tuple[int, int]: """ Original keys-only method: exact count, approximate bytes via sampling. """ @@ -65,7 +64,7 @@ def estimate_entity_count_and_size(client, kind: str, namespace: Optional[str], return total_count, int(avg_size * total_count) -def analyze_kinds(config: AppConfig, method: Optional[str] = None) -> List[Dict]: +def analyze_kinds(config: AppConfig, method: str | None = None) -> List[Dict]: """ Analyze kinds using either: - 'stats' (default) => fast built-in Datastore statistics diff --git a/commands/cleanup_expired.py b/commands/cleanup_expired.py index f6744db..2beed17 100644 --- a/commands/cleanup_expired.py +++ b/commands/cleanup_expired.py @@ -2,7 +2,7 @@ import logging from datetime import datetime, timezone -from typing import Dict, List, Optional +from typing import Dict, List from google.cloud import datastore