Merge pull request #8 from khnumdev/copilot/simplify-config-iteration

khnumdev · web-flow · commit c9f2f2a69ca6 · 2026-01-07T14:06:18.000+01:00
Simplify config documentation and clarify default iteration behavior
diff --git a/README.md b/README.md
@@ -83,40 +83,35 @@ brew install python@3.12
 
 ## Configuration
 
-- Create a local `config.yaml` in your working directory. It is gitignored and not included in the repo.
-- Any CLI flag overrides values from `config.yaml`.
-- If neither config nor flags provide a value, the tool falls back to environment variables (for emulator detection) or sensible defaults.
+Create an optional `config.yaml` in your working directory to customize behavior. **By default, all commands iterate over all namespaces and all kinds** unless you specify filters.
 
-Example `config.yaml` (full example with comments):
+### Minimal Example
 
 ```yaml
-# Project / environment
-project_id: "my-project"          # (string) GCP project id. If omitted, ADC or DATASTORE_PROJECT_ID env var will be used.
-emulator_host: "localhost:8010"   # (string) Datastore emulator host (host:port). If set, the emulator path is used.
+# Optional: specify project and emulator
+project_id: "my-project"
+emulator_host: "localhost:8010"
+```
 
-# Explicit filters (empty -> iterate all)
-namespaces: [""]                   # (list) Namespaces to include. [""] means include default namespace and allow discovery of others.
-kinds: []                            # (list) Kinds to include. Empty/omit means discover all kinds per namespace.
+### Common Options
 
-# Defaults used by some commands (optional)
-kind: ""                            # (string) Default kind used by analyze-fields when CLI --kind is not provided.
-namespace: ""                       # (string) Default namespace used when CLI --namespace is omitted.
+```yaml
+# Optional filters (omit to process all namespaces and kinds)
+namespaces: ["custom-ns"]  # List specific namespaces, or omit to process all
+kinds: ["MyKind"]          # List specific kinds, or omit to process all
 
 # Cleanup settings
-ttl_field: "expireAt"               # (string) Property name that contains the TTL/expiry timestamp.
-delete_missing_ttl: true              # (bool) If true, entities missing the TTL field will be deleted by cleanup.
-batch_size: 500                       # (int) Number of keys to delete per batch when running cleanup (tunable).
+ttl_field: "expireAt"      # Field name containing expiry timestamp
+batch_size: 500            # Delete batch size
 
 # Analysis settings
-group_by_field: null                  # (string|null) Field name to group analysis by (e.g., batchId). Null means no grouping.
-sample_size: 500                      # (int) Max entities to sample per-kind/per-group to bound analysis work. Set 0 or null to disable sampling.
-enable_parallel: true                 # (bool) Enable multi-threaded processing for analysis and deletion. Set false to force single-threaded.
-
-# Logging
-log_level: "INFO"                   # (string) Logging level (DEBUG, INFO, WARNING, ERROR).
+sample_size: 500           # Max entities to sample per analysis (0 = no limit)
 ```
 
-The keys above map directly to CLI flags (CLI flags override values in `config.yaml`). Omit any option to use sensible defaults.
+**Notes:**
+- CLI flags always override config values
+- If no config is provided, sensible defaults are used
+- Environment variables `DATASTORE_PROJECT_ID` and `DATASTORE_EMULATOR_HOST` are also supported
 
 ## Quickstart
 
@@ -148,20 +143,21 @@ Use these targets to get a working dev environment quickly.
 
 ### Basic CLI examples
 ```bash
-# list kinds (scans stats or samples)
-python3 cli.py analyze-kinds --project my-project
+# Analyze all kinds in all namespaces (default behavior)
+lsu analyze-kinds
 
-# analyze fields for a kind
-python3 cli.py analyze-fields --kind MyKind --group-by batchId
+# Analyze a specific kind across all namespaces
+lsu analyze-fields --kind MyKind
 
-# dry-run cleanup sample
-python3 cli.py cleanup --ttl-field expireAt --dry-run
-```
+# Analyze with grouping
+lsu analyze-fields --kind MyKind --group-by batchId
 
-### Configuration
+# Dry-run cleanup for all kinds and namespaces
+lsu cleanup --dry-run
 
-- Local `config.yaml` is supported; CLI flags override config values.
-- Example keys: `project_id`, `emulator_host`, `namespaces`, `kinds`, `kind`, `ttl_field`, `batch_size`, `sample_size`, `enable_parallel`.
+# Filter to a specific namespace and kind
+lsu cleanup --kind MyKind --namespace custom-ns --dry-run
+```
 
 ### Emulator & integration testing
 
@@ -198,7 +194,6 @@ The release workflow selects the appropriate token based on the `publish_target`
 
 ## Notes
 
-- `sample_size` bounds per-kind/group analysis to avoid scanning entire datasets. Set to 0 or `null` in config to disable sampling.
-- `enable_parallel` (default true) enables multi-threaded processing during analysis and deletion; set to false to force single-threaded behavior.
-
-If you'd like a short walkthrough or to change the default Makefile targets, tell me what you'd prefer and I can adjust the README or Makefile.
+- **By default, all commands iterate over all namespaces and all kinds** unless you specify filters via config or CLI flags
+- `sample_size` bounds per-kind analysis to avoid scanning entire large datasets (set to 0 to disable sampling)
+- Multi-threaded processing is enabled by default for better performance
diff --git a/commands/analyze_entity_fields.py b/commands/analyze_entity_fields.py
@@ -56,7 +56,6 @@ def _process_entity(e: datastore.Entity):
     if enable_parallel:
         from concurrent.futures import ThreadPoolExecutor, as_completed
 
-        results_iter = []
         with ThreadPoolExecutor(max_workers=8) as exe:
             futures = {exe.submit(_process_entity, e): e for e in ents}
             for fut in tqdm(as_completed(futures), total=len(futures), desc="Analyzing field contributions", unit="entity"):
@@ -162,18 +161,14 @@ def analyze_field_contributions(
     sample_size = getattr(config, "sample_size", 500)
     enable_parallel = getattr(config, "enable_parallel", True)
 
-    # If no namespace provided, or config.namespaces is None/empty, iterate all namespaces
+    # If no namespace provided, iterate all namespaces
     if namespace is None:
-        if hasattr(config, "namespaces") and (not config.namespaces):
-            ns_list = list_namespaces(client)
-        else:
-            ns_list = [namespace] if namespace else list_namespaces(client)
+        ns_list = list_namespaces(client)
         results: Dict[str, Dict] = {}
         for ns in ns_list:
             results[ns or ""] = _analyze_single_namespace(
                 client, kind=kind, namespace=ns, group_by_field=group_by_field, only_fields=only_fields, sample_size=sample_size
             )
-            
         return {"by_namespace": results}
 
     # Single namespace
diff --git a/commands/analyze_kinds.py b/commands/analyze_kinds.py
@@ -1,9 +1,8 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List
 
-from google.cloud import datastore
 from google.cloud.datastore.helpers import entity_to_protobuf
 
 from .config import (
@@ -17,7 +16,7 @@
 logger = logging.getLogger(__name__)
 
 
-def get_kind_stats(client, kind: str, namespace: Optional[str] = None) -> Tuple[Optional[int], Optional[int]]:
+def get_kind_stats(client, kind: str, namespace: str | None = None) -> tuple[int | None, int | None]:
     """
     Returns (count, bytes) for the given kind/namespace using Datastore statistics.
     Falls back to None if not found.
@@ -39,7 +38,7 @@ def get_kind_stats(client, kind: str, namespace: Optional[str] = None) -> Tuple[
     return None, None
 
 
-def estimate_entity_count_and_size(client, kind: str, namespace: Optional[str], sample_size: int = 100) -> Tuple[int, int]:
+def estimate_entity_count_and_size(client, kind: str, namespace: str | None, sample_size: int = 100) -> tuple[int, int]:
     """
     Original keys-only method: exact count, approximate bytes via sampling.
     """
@@ -65,7 +64,7 @@ def estimate_entity_count_and_size(client, kind: str, namespace: Optional[str],
     return total_count, int(avg_size * total_count)
 
 
-def analyze_kinds(config: AppConfig, method: Optional[str] = None) -> List[Dict]:
+def analyze_kinds(config: AppConfig, method: str | None = None) -> List[Dict]:
     """
     Analyze kinds using either:
       - 'stats' (default) => fast built-in Datastore statistics
diff --git a/commands/cleanup_expired.py b/commands/cleanup_expired.py
@@ -2,7 +2,7 @@
 
 import logging
 from datetime import datetime, timezone
-from typing import Dict, List, Optional
+from typing import Dict, List
 
 from google.cloud import datastore
 
diff --git a/commands/config.py b/commands/config.py
@@ -69,10 +69,10 @@ def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) ->
     config.namespaces = _as_list(merged.get("namespaces"))
     config.kinds = _as_list(merged.get("kinds"))
 
-    # Normalise: treat [""] as empty
-    if config.namespaces == [""] or config.namespaces is None:
+    # Normalise: treat [""] as empty (meaning "iterate all")
+    if config.namespaces == [""]:
         config.namespaces = []
-    if config.kinds == [""] or config.kinds is None:
+    if config.kinds == [""]:
         config.kinds = []
 
     # Optional defaults used by some commands
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -17,6 +17,18 @@ def test_load_config_normalizes_namespaces(tmp_path: tempfile.TemporaryDirectory
     assert cfg.kinds == []
 
 
+def test_empty_lists_mean_iterate_all():
+    """Empty namespaces and kinds lists should mean 'iterate all'."""
+    cfg = AppConfig()
+    # Default config should have empty lists
+    assert cfg.namespaces == []
+    assert cfg.kinds == []
+    
+    # Empty lists evaluate to False, triggering "iterate all" logic
+    assert not cfg.namespaces
+    assert not cfg.kinds
+
+
 def test_format_size_small_and_large():
     assert format_size(512) == "512.00 B"
     assert format_size(1024) == "1.00 KB"