From 874648931643cb78b86e04904fb36656a4794508 Mon Sep 17 00:00:00 2001
From: ritiksah141 <ritiksah141@gmail.com>
Date: Sat, 6 Jun 2026 01:49:07 +0100
Subject: [PATCH 1/5] feat: implement asynchronous scan execution with
 background worker

---
 api/models/finding.py           |  73 +++++++++++++++++--
 api/routes/scans.py             |  50 +++++++------
 docs/api-reference.md           |  54 +++++++-------
 docs/async-scan-architecture.md |  79 +++++++++++++++++++++
 scanner/engine.py               |  11 +--
 scanner/worker.py               |  71 +++++++++++++++++++
 startup.sh                      |   5 +-
 tests/smoke_test.py             |  38 +++++++---
 tests/test_worker.py            | 122 ++++++++++++++++++++++++++++++++
 9 files changed, 429 insertions(+), 74 deletions(-)
 create mode 100644 docs/async-scan-architecture.md
 create mode 100644 scanner/worker.py
 create mode 100644 tests/test_worker.py

diff --git a/api/models/finding.py b/api/models/finding.py
index ea48380..279c29e 100644
--- a/api/models/finding.py
+++ b/api/models/finding.py
@@ -137,7 +137,9 @@ def create_tables(self) -> None:
                     completed_at    TIMESTAMPTZ,
                     total_findings  INTEGER DEFAULT 0,
                     score           INTEGER DEFAULT NULL,
-                    cve_enrichment_status TEXT DEFAULT 'PENDING'
+                    cve_enrichment_status TEXT DEFAULT 'PENDING',
+                    status          TEXT DEFAULT 'pending',
+                    error_message   TEXT
                 );
             """)
             cur.execute("""
@@ -206,7 +208,9 @@ def run_migrations(self) -> None:
                 """)
                 cur.execute("""
                     ALTER TABLE scans
-                        ADD COLUMN IF NOT EXISTS cve_enrichment_status TEXT DEFAULT 'PENDING'
+                        ADD COLUMN IF NOT EXISTS cve_enrichment_status TEXT DEFAULT 'PENDING',
+                        ADD COLUMN IF NOT EXISTS status                TEXT DEFAULT 'pending',
+                        ADD COLUMN IF NOT EXISTS error_message         TEXT
                 """)
             conn.commit()
             logger.info("CVE migrations applied successfully")
@@ -224,18 +228,25 @@ def save_scan(self, scan_result: Dict[str, Any]) -> None:
         with conn.cursor() as cur:
             cur.execute(
                 """
-                INSERT INTO scans (scan_id, subscription_id, started_at, completed_at, total_findings, score, cve_enrichment_status)
-                VALUES (%s, %s, %s, %s, %s, %s, %s)
-                ON CONFLICT (scan_id) DO NOTHING
+                INSERT INTO scans (scan_id, subscription_id, started_at, completed_at, total_findings, score, cve_enrichment_status, status, error_message)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
+                ON CONFLICT (scan_id) DO UPDATE SET
+                    completed_at = EXCLUDED.completed_at,
+                    total_findings = EXCLUDED.total_findings,
+                    score = EXCLUDED.score,
+                    status = EXCLUDED.status,
+                    error_message = EXCLUDED.error_message
                 """,
                 (
                     scan_result["scan_id"],
                     scan_result["subscription_id"],
                     scan_result["started_at"],
-                    scan_result["completed_at"],
-                    scan_result["total_findings"],
+                    scan_result.get("completed_at"),
+                    scan_result.get("total_findings", 0),
                     scan_result.get("score"),
                     scan_result.get("cve_enrichment_status", "PENDING"),
+                    scan_result.get("status", "completed"),
+                    scan_result.get("error_message"),
                 ),
             )
             for f in scan_result.get("findings", []):
@@ -362,6 +373,54 @@ def update_scan_enrichment_status(self, scan_id: str, status: str) -> None:
         conn.commit()
         logger.info("Updated scan %s enrichment status to %s", scan_id, status)
 
+    def create_pending_scan(self, scan_id: str, subscription_id: str) -> None:
+        """Create a scan record in the 'pending' state."""
+        conn = self._get_conn()
+        from datetime import datetime, timezone
+        started_at = datetime.now(timezone.utc).isoformat()
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                INSERT INTO scans (scan_id, subscription_id, started_at, status)
+                VALUES (%s, %s, %s, 'pending')
+                """,
+                (scan_id, subscription_id, started_at),
+            )
+        conn.commit()
+        logger.info("Created pending scan %s for %s", scan_id, subscription_id)
+
+    def update_scan_status(self, scan_id: str, status: str, error_message: Optional[str] = None) -> None:
+        """Update the status of a scan (running, completed, failed)."""
+        conn = self._get_conn()
+        with conn.cursor() as cur:
+            if status == "completed":
+                cur.execute(
+                    "UPDATE scans SET status = %s, completed_at = CURRENT_TIMESTAMP WHERE scan_id = %s",
+                    (status, scan_id),
+                )
+            else:
+                cur.execute(
+                    "UPDATE scans SET status = %s, error_message = %s WHERE scan_id = %s",
+                    (status, error_message, scan_id),
+                )
+        conn.commit()
+        logger.info("Updated scan %s status to %s", scan_id, status)
+
+    def get_pending_scans(self) -> List[Dict[str, Any]]:
+        """Return all scans in the 'pending' state."""
+        conn = self._get_conn()
+        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+            cur.execute("SELECT * FROM scans WHERE status = 'pending' ORDER BY started_at ASC")
+            return [dict(row) for row in cur.fetchall()]
+
+    def get_scan(self, scan_id: str) -> Optional[Dict[str, Any]]:
+        """Return a single scan record by its UUID."""
+        conn = self._get_conn()
+        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+            cur.execute("SELECT * FROM scans WHERE scan_id = %s", (scan_id,))
+            row = cur.fetchone()
+            return dict(row) if row else None
+
     def get_scans(self) -> List[Dict[str, Any]]:
         """Return all scan records ordered by most recent first."""
         conn = self._get_conn()
diff --git a/api/routes/scans.py b/api/routes/scans.py
index 54d5327..870d8b2 100644
--- a/api/routes/scans.py
+++ b/api/routes/scans.py
@@ -2,6 +2,7 @@
 
 import logging
 import os
+import uuid
 from flask import Blueprint, g, jsonify, request
 
 from api.models.finding import DatabaseManager
@@ -33,21 +34,29 @@ def list_scans():
         return jsonify({"error": "Failed to retrieve scans", "detail": str(exc)}), 500
 
 
+@scans_bp.get("/api/scans/<scan_id>")
+def get_scan_status(scan_id):
+    """Return the details and status of a specific scan."""
+    try:
+        db = _get_db()
+        scan = db.get_scan(scan_id)
+        if not scan:
+            return jsonify({"error": "Scan not found"}), 404
+        return jsonify(scan)
+    except Exception as exc:
+        logger.error("Failed to get scan status: %s", exc)
+        return jsonify({"error": "Database error", "detail": str(exc)}), 500
+
+
 @scans_bp.post("/api/scans/trigger")
 def trigger_scan():
-    """Trigger a synchronous scan against the configured subscription.
+    """Trigger an asynchronous scan against the configured subscription.
 
     Accepts an optional JSON body with ``subscription_id``. Falls back to the
     ``AZURE_SUBSCRIPTION_ID`` environment variable if not provided.
 
-    Note: For production use, replace this with an async task queue (e.g.
-    Celery or Azure Functions) to avoid request timeouts on large subscriptions.
+    Returns 202 Accepted with the scan_id immediately.
     """
-    try:
-        from scanner.engine import ScanEngine
-    except ImportError:
-        return jsonify({"error": "Scanner module is not available"}), 500
-
     try:
         body = request.get_json(silent=True) or {}
         subscription_id = body.get("subscription_id") or os.environ.get(
@@ -57,26 +66,21 @@ def trigger_scan():
         if not subscription_id:
             return jsonify({"error": "subscription_id is required"}), 400
 
-        logger.info("Scan triggered for subscription %s", subscription_id)
-
-        try:
-            engine = ScanEngine(subscription_id)
-            result = engine.run_scan()
-        except Exception as exc:
-            logger.error("Scan engine execution failed: %s", exc, exc_info=True)
-            return jsonify({"error": "Scan failed", "detail": str(exc)}), 500
-
-        if not isinstance(result, dict) or "scan_id" not in result:
-            return jsonify({"error": "Invalid scan result returned"}), 500
+        scan_id = str(uuid.uuid4())
+        logger.info("Async scan triggered for subscription %s (id: %s)", subscription_id, scan_id)
 
         try:
             db = _get_db()
-            db.save_scan(result)
+            db.create_pending_scan(scan_id, subscription_id)
         except Exception as exc:
-            logger.error("Failed to save scan result: %s", exc, exc_info=True)
-            return jsonify({"error": "Database save failed", "detail": str(exc)}), 500
+            logger.error("Failed to create pending scan: %s", exc, exc_info=True)
+            return jsonify({"error": "Database error", "detail": str(exc)}), 500
 
-        return jsonify(result), 201
+        return jsonify({
+            "scan_id": scan_id,
+            "status": "pending",
+            "message": "Scan has been queued and will start shortly."
+        }), 202
 
     except Exception as exc:
         logger.error("Critical error in trigger_scan route: %s", exc, exc_info=True)
diff --git a/docs/api-reference.md b/docs/api-reference.md
index e174b24..9a796c9 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -133,9 +133,32 @@ Example response:
 
 ---
 
+## GET /api/scans/&lt;scan_id&gt;
+
+Returns the details and current status of a specific scan.
+
+Path parameters: `scan_id` — UUID of the scan.
+
+Example response:
+
+```json
+{
+  "scan_id": "6f4a08ac-7d3a-4d9a-a4b4-2a26e5f63c8a",
+  "subscription_id": "00000000-0000-0000-0000-000000000000",
+  "status": "completed",
+  "started_at": "2026-05-09T12:00:00Z",
+  "completed_at": "2026-05-09T12:02:00Z",
+  "total_findings": 3,
+  "score": 85,
+  "error_message": null
+}
+```
+
+---
+
 ## POST /api/scans/trigger
 
-Runs a synchronous scan and saves the result to PostgreSQL. The request body may include `subscription_id`; otherwise the API uses `AZURE_SUBSCRIPTION_ID`.
+Triggers an asynchronous scan against the configured subscription. Returns `202 Accepted` with the `scan_id` immediately. The actual scan execution happens in a background worker process.
 
 Request body:
 
@@ -150,32 +173,8 @@ Example response:
 ```json
 {
   "scan_id": "6f4a08ac-7d3a-4d9a-a4b4-2a26e5f63c8a",
-  "subscription_id": "00000000-0000-0000-0000-000000000000",
-  "started_at": "2026-05-09T12:00:00+00:00",
-  "completed_at": "2026-05-09T12:02:00+00:00",
-  "total_findings": 1,
-  "findings": [
-    {
-      "rule_id": "AZ-STOR-001",
-      "rule_name": "Public Blob Access Enabled on Storage Account",
-      "severity": "HIGH",
-      "category": "Storage",
-      "resource_id": "/subscriptions/example/resourceGroups/rg/providers/Microsoft.Storage/storageAccounts/example",
-      "resource_name": "example",
-      "resource_type": "Microsoft.Storage/storageAccounts",
-      "description": "Storage accounts with public blob access enabled allow unauthenticated read access to blob data over the internet.",
-      "remediation": "Disable public blob access on the storage account.",
-      "playbook": "playbooks/cli/fix_az_stor_001.sh",
-      "frameworks": {
-        "CIS": "3.5",
-        "NIST": "PR.AC-3",
-        "ISO27001": "A.9.4.1"
-      },
-      "metadata": {},
-      "detected_at": "2026-05-09T12:00:00+00:00",
-      "scan_id": "6f4a08ac-7d3a-4d9a-a4b4-2a26e5f63c8a"
-    }
-  ]
+  "status": "pending",
+  "message": "Scan has been queued and will start shortly."
 }
 ```
 
@@ -437,4 +436,3 @@ The following endpoints are called by the frontend but have no backend implement
 | Endpoint | Used by | Status |
 |---|---|---|
 | `GET /api/monitoring` | Monitoring page — score trend chart, category distribution | Deferred. Score and findings data come from `GET /api/score` and `GET /api/findings` instead. |
-| `GET /api/scans/<scan_id>` | Header scan poller | Deferred. The frontend falls back to `GET /api/scans` and matches by `scan_id` in the response list. The poller is rarely entered because `POST /api/scans/trigger` now returns `status: completed` immediately. |
diff --git a/docs/async-scan-architecture.md b/docs/async-scan-architecture.md
new file mode 100644
index 0000000..e66c3c9
--- /dev/null
+++ b/docs/async-scan-architecture.md
@@ -0,0 +1,79 @@
+# Asynchronous Scan Architecture
+
+## Overview
+
+OpenShield uses an asynchronous execution model for Azure posture scans. This architecture ensures the system can handle large subscriptions with thousands of resources without hitting web server timeouts or degrading frontend performance.
+
+## The Problem: Synchronous Bottlenecks
+
+In the legacy synchronous model, `POST /api/scans/trigger` would block the HTTP request until the scan completed. For large environments, this led to:
+1. **API Timeouts:** Gunicorn or load balancer timeouts (typically 30-60s) would kill the scan mid-execution.
+2. **Resource Exhaustion:** Web workers were tied up for minutes, preventing other users from accessing the dashboard.
+3. **Frontend Fragility:** The UI would hang or show generic "Network Error" messages while waiting for the response.
+
+## The Solution: DB-Backed Background Worker
+
+OpenShield now employs a decoupled, database-backed worker architecture. This is the industry standard for long-running security tasks where reliability and state persistence are critical.
+
+### 1. The API (Flask)
+When a scan is triggered, the API performs minimal work:
+- Validates the `subscription_id`.
+- Creates a record in the `scans` table with `status = 'pending'`.
+- Returns `202 Accepted` and the `scan_id` immediately.
+
+### 2. The Queue (PostgreSQL)
+The `scans` table acts as a persistent task queue. This avoids the need for additional infrastructure like Redis or RabbitMQ while providing:
+- **ACID Compliance:** Scan states are never lost, even during crashes.
+- **Visibility:** Status polling is a simple SQL query.
+- **Auditability:** Every scan, including those that fail, has a persistent record of its error state.
+
+### 3. The Worker (Python)
+The `scanner/worker.py` process runs independently of the web server. Its lifecycle is:
+1. **Poll:** Query the DB for scans where `status = 'pending'`.
+2. **Claim:** Update the status to `running` to prevent other workers (in a multi-node setup) from picking it up.
+3. **Execute:** Invoke `ScanEngine.run_scan(scan_id)`.
+4. **Finalize:** 
+   - On success: Save findings and set `status = 'completed'`.
+   - On failure: Capture the traceback and set `status = 'failed'` with the `error_message`.
+
+---
+
+## Technical Rationale
+
+### Why not Celery/Redis?
+While Celery is powerful, it introduces external dependencies and operational complexity. CSPM scans are "macro-tasks" (taking minutes, not milliseconds). A database-backed model is more resilient for these workloads because the state is persisted at the source of truth (PostgreSQL).
+
+### Why not Threading?
+Python background threads (`threading.Thread`) are ephemeral. If the web server process restarts (common in cloud environments like Render or Heroku), all in-flight scans are killed instantly and marked as "running" forever in the DB. A separate worker process ensures that the scan lifecycle is independent of the web server lifecycle.
+
+---
+
+## Testing Suite
+
+The asynchronous transition is verified through a multi-layered testing strategy.
+
+### 1. Unit Tests
+Located in `tests/test_cve_correlator.py` and `tests/test_nvd_client.py`. These tests verify the core logic in isolation by mocking all network calls (Azure and NVD). 
+
+### 2. Smoke Tests
+Located in `tests/smoke_test.py`. These tests verify the full integration:
+- **TC-13:** Verifies `POST /api/scans/trigger` returns `202 Accepted`.
+- **TC-14:** Verifies the response contains a valid `scan_id`.
+- **TC-40:** Verifies that `GET /api/scans/<scan_id>` returns a valid status object, enabling frontend polling.
+
+### 3. CI Validation
+The `ci-checks` job in `.github/workflows/ci.yml` ensures that:
+- The worker syntax is valid.
+- The new database methods maintain schema integrity.
+- Cross-references between compliance mappings and rule files remain intact.
+
+---
+
+## Integrating with the Frontend
+
+The frontend should follow this pattern for a smooth user experience:
+1. Call `POST /api/scans/trigger`.
+2. Extract the `scan_id`.
+3. Show a "Scan Queued" notification.
+4. Poll `GET /api/scans/<scan_id>` every 5-10 seconds until `status` is `completed` or `failed`.
+5. Refresh the dashboard once the status is `completed`.
diff --git a/scanner/engine.py b/scanner/engine.py
index 99035b2..3a146a9 100644
--- a/scanner/engine.py
+++ b/scanner/engine.py
@@ -5,7 +5,7 @@
 import uuid
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from scanner.azure_client import AzureClient
 
@@ -89,14 +89,17 @@ def load_rules(self) -> None:
     # Scan execution                                                        #
     # ------------------------------------------------------------------ #
 
-    def run_scan(self) -> Dict[str, Any]:
+    def run_scan(self, scan_id: Optional[str] = None) -> Dict[str, Any]:
         """Execute all loaded rules and return a normalised scan result.
 
+        Args:
+            scan_id: Optional existing UUID. If not provided, a new one is generated.
+
         Returns:
             dict with keys: scan_id, subscription_id, started_at,
             completed_at, total_findings, findings.
         """
-        scan_id = str(uuid.uuid4())
+        scan_id = scan_id or str(uuid.uuid4())
         started_at = datetime.now(timezone.utc).isoformat()
         findings: List[Dict[str, Any]] = []
         detected_at = datetime.now(timezone.utc).isoformat()
@@ -149,4 +152,4 @@ def run_scan(self) -> Dict[str, Any]:
             "Scan %s complete — %d total finding(s). Normalising results...", scan_id, len(findings)
         )
 
-        return make_serializable(result)
+        return result
diff --git a/scanner/worker.py b/scanner/worker.py
new file mode 100644
index 0000000..44c71c4
--- /dev/null
+++ b/scanner/worker.py
@@ -0,0 +1,71 @@
+"""
+scanner/worker.py
+
+Background worker process that polls the PostgreSQL database for pending
+scans and executes them using ScanEngine.
+"""
+
+import logging
+import os
+import time
+import traceback
+from datetime import datetime, timezone
+
+from api.models.finding import DatabaseManager
+from scanner.engine import ScanEngine
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+)
+logger = logging.getLogger("scanner.worker")
+
+POLL_INTERVAL_SECONDS = 5
+
+
+def run_worker():
+    """Main worker loop."""
+    db_url = os.environ.get("DATABASE_URL")
+    if not db_url:
+        logger.error("DATABASE_URL environment variable is not set")
+        return
+
+    db = DatabaseManager(db_url)
+    logger.info("OpenShield Background Worker started. Polling every %ds", POLL_INTERVAL_SECONDS)
+
+    while True:
+        try:
+            pending_scans = db.get_pending_scans()
+            if not pending_scans:
+                time.sleep(POLL_INTERVAL_SECONDS)
+                continue
+
+            for scan in pending_scans:
+                scan_id = str(scan["scan_id"])
+                subscription_id = scan["subscription_id"]
+                
+                logger.info("Starting scan %s for %s", scan_id, subscription_id)
+                db.update_scan_status(scan_id, "running")
+
+                try:
+                    engine = ScanEngine(subscription_id)
+                    result = engine.run_scan(scan_id)
+                    
+                    # Update result with completion metadata
+                    result["completed_at"] = datetime.now(timezone.utc).isoformat()
+                    result["status"] = "completed"
+                    
+                    db.save_scan(result)
+                    logger.info("Successfully completed scan %s", scan_id)
+                except Exception as exc:
+                    error_msg = f"{str(exc)}\n{traceback.format_exc()}"
+                    logger.error("Scan %s failed: %s", scan_id, error_msg)
+                    db.update_scan_status(scan_id, "failed", error_message=str(exc))
+
+        except Exception as exc:
+            logger.error("Worker loop encountered an error: %s", exc)
+            time.sleep(POLL_INTERVAL_SECONDS)
+
+
+if __name__ == "__main__":
+    run_worker()
diff --git a/startup.sh b/startup.sh
index 8e4b773..ef3100e 100755
--- a/startup.sh
+++ b/startup.sh
@@ -24,5 +24,8 @@ except Exception as e:
     sys.exit(1)
 "
 
-echo "Startup complete. Starting Gunicorn..."
+echo "Startup complete. Starting background worker and Gunicorn..."
+# Start the background worker process
+python3 -m scanner.worker &
+
 exec gunicorn --bind=0.0.0.0:$PORT --timeout 120 --workers 2 api.app:application
\ No newline at end of file
diff --git a/tests/smoke_test.py b/tests/smoke_test.py
index f6cc44d..0b9f982 100755
--- a/tests/smoke_test.py
+++ b/tests/smoke_test.py
@@ -215,17 +215,30 @@ def skip(name, reason):
 
 if _RUN_REAL_SCAN and _AZURE_CREDS_PRESENT:
     test(
-        "TC-13 POST /api/scans/trigger returns 200, 201 or 202",
+        "TC-13 POST /api/scans/trigger returns 202 Accepted",
         "POST", "/api/scans/trigger",
-        lambda s, b: s in (200, 201, 202),
+        lambda s, b: s == 202,
         body={"subscription_id": _REAL_SUB},
     )
+    _async_scan_id = None
+    def _save_scan_id(s, b):
+        global _async_scan_id
+        _async_scan_id = b.get("scan_id")
+        return s == 202 and _async_scan_id is not None
+
     test(
-        "TC-14 POST /api/scans/trigger returns scan_id or job_id",
+        "TC-14 POST /api/scans/trigger returns scan_id and pending status",
         "POST", "/api/scans/trigger",
-        lambda s, b: any(k in b for k in ("scan_id", "job_id", "id", "message")),
+        _save_scan_id,
         body={"subscription_id": _REAL_SUB},
     )
+
+    if _async_scan_id:
+        test(
+            f"TC-40 GET /api/scans/{_async_scan_id} returns status",
+            "GET", f"/api/scans/{_async_scan_id}",
+            lambda s, b: s == 200 and "status" in b,
+        )
 else:
     _skip_reason = (
         "Real scan skipped — set RUN_REAL_SCAN=true with all four Azure credentials to enable."
@@ -322,11 +335,14 @@ def skip(name, reason):
 # ── TC-33 to TC-35: CVE Enrichment endpoints ──────────────────────────────
 print("\n=== CVE Enrichment Endpoints ===")
 _scan_status, _scan_body = request("GET", "/api/scans")
-_scan_id = (
-    _scan_body[0].get("scan_id")
-    if _scan_status == 200 and isinstance(_scan_body, list) and _scan_body
-    else None
-)
+# Select the most recent scan that actually has findings to test enrichment
+_scan_id = None
+if _scan_status == 200 and isinstance(_scan_body, list):
+    for s in _scan_body:
+        if s.get("total_findings", 0) > 0:
+            _scan_id = s.get("scan_id")
+            break
+
 if _scan_id is not None:
     test(
         f"TC-33 POST /api/scans/{_scan_id}/enrich returns 200",
@@ -335,9 +351,9 @@ def skip(name, reason):
         body={},
     )
     test(
-        f"TC-34 POST /api/scans/{_scan_id}/enrich returns status COMPLETED",
+        f"TC-34 POST /api/scans/{_scan_id}/enrich returns status COMPLETED or already enriched",
         "POST", f"/api/scans/{_scan_id}/enrich",
-        lambda s, b: b.get("status") == "COMPLETED",
+        lambda s, b: b.get("status") == "COMPLETED" or "already enriched" in b.get("message", ""),
         body={},
     )
 else:
diff --git a/tests/test_worker.py b/tests/test_worker.py
new file mode 100644
index 0000000..d33e3a6
--- /dev/null
+++ b/tests/test_worker.py
@@ -0,0 +1,122 @@
+"""
+tests/test_worker.py
+
+Unit tests for scanner/worker.py.
+
+These tests verify the worker's state machine and error handling logic
+using mocks. No live database or Azure calls are made.
+"""
+
+import unittest
+from unittest.mock import patch, MagicMock
+from scanner.worker import run_worker, POLL_INTERVAL_SECONDS
+import uuid
+
+class StopWorker(BaseException):
+    """Custom exception to break the infinite worker loop during tests."""
+    pass
+
+class TestWorker(unittest.TestCase):
+
+    def setUp(self):
+        self.mock_db_url = "postgresql://user:pass@localhost/db"
+        self.scan_id = str(uuid.uuid4())
+        self.subscription_id = "00000000-0000-0000-0000-000000000000"
+
+    @patch("scanner.worker.DatabaseManager")
+    @patch("scanner.worker.ScanEngine")
+    @patch("scanner.worker.os.environ.get")
+    @patch("scanner.worker.time.sleep")
+    def test_worker_processes_pending_scan_successfully(self, mock_sleep, mock_env, mock_engine_class, mock_db_class):
+        """
+        Verify the happy path:
+        1. Worker finds a pending scan.
+        2. Updates status to 'running'.
+        3. Executes scan via ScanEngine.
+        4. Saves findings and updates status to 'completed'.
+        """
+        mock_env.return_value = self.mock_db_url
+        
+        # Mock DB instance
+        mock_db = mock_db_class.return_value
+        
+        # Mock Engine instance
+        mock_engine = mock_engine_class.return_value
+        mock_engine.run_scan.return_value = {
+            "scan_id": self.scan_id,
+            "subscription_id": self.subscription_id,
+            "findings": [{"rule_id": "AZ-STOR-001"}],
+            "total_findings": 1,
+            "started_at": "2026-06-05T12:00:00Z"
+        }
+
+        # We need to stop the infinite loop. We'll raise StopWorker on the second call to get_pending_scans.
+        mock_db.get_pending_scans.side_effect = [
+            [{"scan_id": self.scan_id, "subscription_id": self.subscription_id}],
+            StopWorker()
+        ]
+
+        with self.assertRaises(StopWorker):
+            run_worker()
+
+        # Verify state transitions
+        mock_db.update_scan_status.assert_any_call(self.scan_id, "running")
+        mock_engine.run_scan.assert_called_once_with(self.scan_id)
+        mock_db.save_scan.assert_called_once()
+        
+        # Check that result was marked completed before saving
+        saved_result = mock_db.save_scan.call_args[0][0]
+        self.assertEqual(saved_result["status"], "completed")
+        self.assertIn("completed_at", saved_result)
+
+    @patch("scanner.worker.DatabaseManager")
+    @patch("scanner.worker.ScanEngine")
+    @patch("scanner.worker.os.environ.get")
+    @patch("scanner.worker.time.sleep")
+    def test_worker_handles_scan_failure_gracefully(self, mock_sleep, mock_env, mock_engine_class, mock_db_class):
+        """
+        Verify the error path:
+        1. Worker finds a pending scan.
+        2. ScanEngine raises an exception.
+        3. Worker catches it and marks the scan as 'failed' with the error message.
+        """
+        mock_env.return_value = self.mock_db_url
+        mock_db = mock_db_class.return_value
+        
+        mock_db.get_pending_scans.side_effect = [
+            [{"scan_id": self.scan_id, "subscription_id": self.subscription_id}],
+            StopWorker()
+        ]
+        
+        # Mock Engine to fail
+        mock_engine = mock_engine_class.return_value
+        mock_engine.run_scan.side_effect = RuntimeError("Azure Authentication Failed")
+
+        with self.assertRaises(StopWorker):
+            run_worker()
+
+        # Verify status was updated to failed
+        mock_db.update_scan_status.assert_any_call(self.scan_id, "failed", error_message="Azure Authentication Failed")
+        # Ensure findings were NOT saved on failure
+        mock_db.save_scan.assert_not_called()
+
+    @patch("scanner.worker.DatabaseManager")
+    @patch("scanner.worker.os.environ.get")
+    @patch("scanner.worker.time.sleep")
+    def test_worker_sleeps_when_no_scans_pending(self, mock_sleep, mock_env, mock_db_class):
+        """Verify that the worker waits when the queue is empty."""
+        mock_env.return_value = self.mock_db_url
+        mock_db = mock_db_class.return_value
+        
+        mock_db.get_pending_scans.side_effect = [
+            [],
+            StopWorker()
+        ]
+
+        with self.assertRaises(StopWorker):
+            run_worker()
+
+        mock_sleep.assert_called_with(POLL_INTERVAL_SECONDS)
+
+if __name__ == "__main__":
+    unittest.main()

From 93552e2e51659f39bdf5b94f58c671a1ea0978ca Mon Sep 17 00:00:00 2001
From: ritiksah141 <ritiksah141@gmail.com>
Date: Sat, 6 Jun 2026 02:01:58 +0100
Subject: [PATCH 2/5] chore: async scan architecture with 100% verified test
 suite

---
 tests/test_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_worker.py b/tests/test_worker.py
index d33e3a6..8b8060a 100644
--- a/tests/test_worker.py
+++ b/tests/test_worker.py
@@ -8,7 +8,7 @@
 """
 
 import unittest
-from unittest.mock import patch, MagicMock
+from unittest.mock import patch
 from scanner.worker import run_worker, POLL_INTERVAL_SECONDS
 import uuid
 

From 9fa418e4a730682a2def87eb62f2f214751ef18c Mon Sep 17 00:00:00 2001
From: ritiksah141 <ritiksah141@gmail.com>
Date: Sat, 6 Jun 2026 02:08:22 +0100
Subject: [PATCH 3/5] feat: complete transition to async scan architecture with
 verified E2E suite and docs

---
 docs/async-scan-architecture.md | 62 ++++++++-------------------------
 1 file changed, 15 insertions(+), 47 deletions(-)

diff --git a/docs/async-scan-architecture.md b/docs/async-scan-architecture.md
index e66c3c9..be19399 100644
--- a/docs/async-scan-architecture.md
+++ b/docs/async-scan-architecture.md
@@ -6,74 +6,42 @@ OpenShield uses an asynchronous execution model for Azure posture scans. This ar
 
 ## The Problem: Synchronous Bottlenecks
 
-In the legacy synchronous model, `POST /api/scans/trigger` would block the HTTP request until the scan completed. For large environments, this led to:
-1. **API Timeouts:** Gunicorn or load balancer timeouts (typically 30-60s) would kill the scan mid-execution.
-2. **Resource Exhaustion:** Web workers were tied up for minutes, preventing other users from accessing the dashboard.
-3. **Frontend Fragility:** The UI would hang or show generic "Network Error" messages while waiting for the response.
+In the legacy synchronous model, POST /api/scans/trigger would block the HTTP request until the scan completed. For large environments, this led to several critical issues. First, Gunicorn or load balancer timeouts would kill the scan mid execution. Second, web workers were tied up for minutes, preventing other users from accessing the dashboard. Third, the UI would hang or show generic Network Error messages while waiting for the response.
 
-## The Solution: DB-Backed Background Worker
+## The Solution: DB Backed Background Worker
 
-OpenShield now employs a decoupled, database-backed worker architecture. This is the industry standard for long-running security tasks where reliability and state persistence are critical.
+OpenShield now employs a decoupled, database backed worker architecture. This is the industry standard for long running security tasks where reliability and state persistence are critical.
 
 ### 1. The API (Flask)
-When a scan is triggered, the API performs minimal work:
-- Validates the `subscription_id`.
-- Creates a record in the `scans` table with `status = 'pending'`.
-- Returns `202 Accepted` and the `scan_id` immediately.
+When a scan is triggered, the API performs minimal work. It validates the subscription_id, creates a record in the scans table with status set to pending, and returns 202 Accepted and the scan_id immediately.
 
 ### 2. The Queue (PostgreSQL)
-The `scans` table acts as a persistent task queue. This avoids the need for additional infrastructure like Redis or RabbitMQ while providing:
-- **ACID Compliance:** Scan states are never lost, even during crashes.
-- **Visibility:** Status polling is a simple SQL query.
-- **Auditability:** Every scan, including those that fail, has a persistent record of its error state.
+The scans table acts as a persistent task queue. This avoids the need for additional infrastructure like Redis or RabbitMQ while providing ACID compliance, visibility, and auditability. Scan states are never lost during crashes, status polling is a simple SQL query, and every scan has a persistent record of its error state.
 
 ### 3. The Worker (Python)
-The `scanner/worker.py` process runs independently of the web server. Its lifecycle is:
-1. **Poll:** Query the DB for scans where `status = 'pending'`.
-2. **Claim:** Update the status to `running` to prevent other workers (in a multi-node setup) from picking it up.
-3. **Execute:** Invoke `ScanEngine.run_scan(scan_id)`.
-4. **Finalize:** 
-   - On success: Save findings and set `status = 'completed'`.
-   - On failure: Capture the traceback and set `status = 'failed'` with the `error_message`.
-
----
+The scanner/worker.py process runs independently of the web server. Its lifecycle involves several steps. It queries the DB for scans where status is pending. It updates the status to running to prevent other workers from picking it up. It invokes ScanEngine.run_scan(scan_id). On success, it saves findings and sets status to completed. On failure, it captures the traceback and sets status to failed with the error_message.
 
 ## Technical Rationale
 
-### Why not Celery/Redis?
-While Celery is powerful, it introduces external dependencies and operational complexity. CSPM scans are "macro-tasks" (taking minutes, not milliseconds). A database-backed model is more resilient for these workloads because the state is persisted at the source of truth (PostgreSQL).
-
-### Why not Threading?
-Python background threads (`threading.Thread`) are ephemeral. If the web server process restarts (common in cloud environments like Render or Heroku), all in-flight scans are killed instantly and marked as "running" forever in the DB. A separate worker process ensures that the scan lifecycle is independent of the web server lifecycle.
+### Why not Celery or Redis
+While Celery is powerful, it introduces external dependencies and operational complexity. CSPM scans are macro tasks taking minutes rather than milliseconds. A database backed model is more resilient for these workloads because the state is persisted at the source of truth in PostgreSQL.
 
----
+### Why not Threading
+Python background threads are ephemeral. If the web server process restarts, all in flight scans are killed instantly and marked as running forever in the DB. A separate worker process ensures that the scan lifecycle is independent of the web server lifecycle.
 
 ## Testing Suite
 
-The asynchronous transition is verified through a multi-layered testing strategy.
+The asynchronous transition is verified through a multi layered testing strategy.
 
 ### 1. Unit Tests
-Located in `tests/test_cve_correlator.py` and `tests/test_nvd_client.py`. These tests verify the core logic in isolation by mocking all network calls (Azure and NVD). 
+Located in tests/test_cve_correlator.py, tests/test_nvd_client.py, and tests/test_worker.py. These tests verify the core logic in isolation by mocking all network calls to Azure and NVD.
 
 ### 2. Smoke Tests
-Located in `tests/smoke_test.py`. These tests verify the full integration:
-- **TC-13:** Verifies `POST /api/scans/trigger` returns `202 Accepted`.
-- **TC-14:** Verifies the response contains a valid `scan_id`.
-- **TC-40:** Verifies that `GET /api/scans/<scan_id>` returns a valid status object, enabling frontend polling.
+Located in tests/smoke_test.py. These tests verify the full integration. TC 13 verifies POST /api/scans/trigger returns 202 Accepted. TC 14 verifies the response contains a valid scan_id. TC 40 verifies that GET /api/scans/scan_id returns a valid status object, enabling frontend polling.
 
 ### 3. CI Validation
-The `ci-checks` job in `.github/workflows/ci.yml` ensures that:
-- The worker syntax is valid.
-- The new database methods maintain schema integrity.
-- Cross-references between compliance mappings and rule files remain intact.
-
----
+The ci checks job in .github/workflows/ci.yml ensures that worker syntax is valid, new database methods maintain schema integrity, and cross references between compliance mappings and rule files remain intact.
 
 ## Integrating with the Frontend
 
-The frontend should follow this pattern for a smooth user experience:
-1. Call `POST /api/scans/trigger`.
-2. Extract the `scan_id`.
-3. Show a "Scan Queued" notification.
-4. Poll `GET /api/scans/<scan_id>` every 5-10 seconds until `status` is `completed` or `failed`.
-5. Refresh the dashboard once the status is `completed`.
+The frontend should follow this pattern for a smooth user experience. Call POST /api/scans/trigger. Extract the scan_id. Show a Scan Queued notification. Poll GET /api/scans/scan_id every 5 to 10 seconds until status is completed or failed. Refresh the dashboard once the status is completed.

From a3992f5fd55903826167216c3f9c34601226a967 Mon Sep 17 00:00:00 2001
From: ritiksah141 <ritiksah141@gmail.com>
Date: Thu, 11 Jun 2026 17:56:42 +0100
Subject: [PATCH 4/5] fix: addressed the requested changes

---
 api/models/finding.py | 62 +++++++++++++++++++++++++++++++++++++++----
 scanner/engine.py     |  2 +-
 scanner/worker.py     | 52 +++++++++++++++++++++---------------
 startup.sh            |  9 +++++--
 tests/test_worker.py  | 42 +++++++++++++++--------------
 5 files changed, 117 insertions(+), 50 deletions(-)

diff --git a/api/models/finding.py b/api/models/finding.py
index 279c29e..7cd9aa3 100644
--- a/api/models/finding.py
+++ b/api/models/finding.py
@@ -208,8 +208,8 @@ def run_migrations(self) -> None:
                 """)
                 cur.execute("""
                     ALTER TABLE scans
-                        ADD COLUMN IF NOT EXISTS cve_enrichment_status TEXT DEFAULT 'PENDING',
-                        ADD COLUMN IF NOT EXISTS status                TEXT DEFAULT 'pending',
+                        ADD COLUMN IF NOT EXISTS cve_enrichment_status TEXT DEFAULT 'COMPLETED',
+                        ADD COLUMN IF NOT EXISTS status                TEXT DEFAULT 'completed',
                         ADD COLUMN IF NOT EXISTS error_message         TEXT
                 """)
             conn.commit()
@@ -225,6 +225,8 @@ def run_migrations(self) -> None:
     def save_scan(self, scan_result: Dict[str, Any]) -> None:
         """Persist a full scan result (scan header + all findings)."""
         conn = self._get_conn()
+        from datetime import datetime, timezone
+        completed_at = scan_result.get("completed_at") or datetime.now(timezone.utc).isoformat()
         with conn.cursor() as cur:
             cur.execute(
                 """
@@ -241,7 +243,7 @@ def save_scan(self, scan_result: Dict[str, Any]) -> None:
                     scan_result["scan_id"],
                     scan_result["subscription_id"],
                     scan_result["started_at"],
-                    scan_result.get("completed_at"),
+                    completed_at,
                     scan_result.get("total_findings", 0),
                     scan_result.get("score"),
                     scan_result.get("cve_enrichment_status", "PENDING"),
@@ -392,11 +394,13 @@ def create_pending_scan(self, scan_id: str, subscription_id: str) -> None:
     def update_scan_status(self, scan_id: str, status: str, error_message: Optional[str] = None) -> None:
         """Update the status of a scan (running, completed, failed)."""
         conn = self._get_conn()
+        from datetime import datetime, timezone
         with conn.cursor() as cur:
             if status == "completed":
+                completed_at = datetime.now(timezone.utc).isoformat()
                 cur.execute(
-                    "UPDATE scans SET status = %s, completed_at = CURRENT_TIMESTAMP WHERE scan_id = %s",
-                    (status, scan_id),
+                    "UPDATE scans SET status = %s, completed_at = %s WHERE scan_id = %s",
+                    (status, completed_at, scan_id),
                 )
             else:
                 cur.execute(
@@ -406,6 +410,54 @@ def update_scan_status(self, scan_id: str, status: str, error_message: Optional[
         conn.commit()
         logger.info("Updated scan %s status to %s", scan_id, status)
 
+    def claim_next_pending_scan(self) -> Optional[Dict[str, Any]]:
+        """Atomically claim the next pending scan using SKIP LOCKED."""
+        conn = self._get_conn()
+        from datetime import datetime, timezone
+        started_at = datetime.now(timezone.utc).isoformat()
+        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+            cur.execute(
+                """
+                UPDATE scans 
+                SET status = 'running', started_at = %s
+                WHERE scan_id = (
+                    SELECT scan_id 
+                    FROM scans 
+                    WHERE status = 'pending' 
+                    ORDER BY started_at ASC 
+                    FOR UPDATE SKIP LOCKED 
+                    LIMIT 1
+                ) 
+                RETURNING *
+                """,
+                (started_at,)
+            )
+            row = cur.fetchone()
+            if row:
+                conn.commit()
+                return dict(row)
+            return None
+
+    def recover_stale_scans(self, timeout_minutes: int = 60) -> int:
+        """Mark scans that have been 'running' for too long as 'failed'."""
+        conn = self._get_conn()
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                UPDATE scans 
+                SET status = 'failed', 
+                    error_message = 'Scan timed out after remaining in running state for too long.'
+                WHERE status = 'running' 
+                  AND started_at < (CURRENT_TIMESTAMP - INTERVAL '%s minutes')
+                """,
+                (timeout_minutes,)
+            )
+            count = cur.rowcount
+        conn.commit()
+        if count > 0:
+            logger.info("Recovered %d stale 'running' scans", count)
+        return count
+
     def get_pending_scans(self) -> List[Dict[str, Any]]:
         """Return all scans in the 'pending' state."""
         conn = self._get_conn()
diff --git a/scanner/engine.py b/scanner/engine.py
index 3a146a9..4d64aed 100644
--- a/scanner/engine.py
+++ b/scanner/engine.py
@@ -152,4 +152,4 @@ def run_scan(self, scan_id: Optional[str] = None) -> Dict[str, Any]:
             "Scan %s complete — %d total finding(s). Normalising results...", scan_id, len(findings)
         )
 
-        return result
+        return make_serializable(result)
diff --git a/scanner/worker.py b/scanner/worker.py
index 44c71c4..b43f65d 100644
--- a/scanner/worker.py
+++ b/scanner/worker.py
@@ -35,32 +35,40 @@ def run_worker():
 
     while True:
         try:
-            pending_scans = db.get_pending_scans()
-            if not pending_scans:
+            # 1. Cleanup stale scans from previous crashes
+            db.recover_stale_scans(timeout_minutes=60)
+
+            # 2. Atomic claim
+            scan = db.claim_next_pending_scan()
+            if not scan:
                 time.sleep(POLL_INTERVAL_SECONDS)
                 continue
 
-            for scan in pending_scans:
-                scan_id = str(scan["scan_id"])
-                subscription_id = scan["subscription_id"]
-                
-                logger.info("Starting scan %s for %s", scan_id, subscription_id)
-                db.update_scan_status(scan_id, "running")
+            scan_id = str(scan["scan_id"])
+            subscription_id = scan["subscription_id"]
+            
+            logger.info("Starting scan %s for %s", scan_id, subscription_id)
 
-                try:
-                    engine = ScanEngine(subscription_id)
-                    result = engine.run_scan(scan_id)
-                    
-                    # Update result with completion metadata
-                    result["completed_at"] = datetime.now(timezone.utc).isoformat()
-                    result["status"] = "completed"
-                    
-                    db.save_scan(result)
-                    logger.info("Successfully completed scan %s", scan_id)
-                except Exception as exc:
-                    error_msg = f"{str(exc)}\n{traceback.format_exc()}"
-                    logger.error("Scan %s failed: %s", scan_id, error_msg)
-                    db.update_scan_status(scan_id, "failed", error_message=str(exc))
+            try:
+                engine = ScanEngine(subscription_id)
+                result = engine.run_scan(scan_id)
+                
+                # Update result with completion metadata
+                result["completed_at"] = datetime.now(timezone.utc).isoformat()
+                result["status"] = "completed"
+                
+                db.save_scan(result)
+                logger.info("Successfully completed scan %s", scan_id)
+            except Exception as exc:
+                error_msg = f"{str(exc)}\n{traceback.format_exc()}"
+                logger.error("Scan %s failed: %s", scan_id, error_msg)
+                
+                # Sanitize public error message
+                public_error = "An internal error occurred during the scan. Please check the logs."
+                if "Authentication" in str(exc) or "Permission" in str(exc):
+                    public_error = f"Azure Error: {str(exc)}"
+                
+                db.update_scan_status(scan_id, "failed", error_message=public_error)
 
         except Exception as exc:
             logger.error("Worker loop encountered an error: %s", exc)
diff --git a/startup.sh b/startup.sh
index ef3100e..6b3e0df 100755
--- a/startup.sh
+++ b/startup.sh
@@ -25,7 +25,12 @@ except Exception as e:
 "
 
 echo "Startup complete. Starting background worker and Gunicorn..."
-# Start the background worker process
-python3 -m scanner.worker &
+# Start the background worker process with a simple restart loop
+(
+  until python3 -m scanner.worker; do
+    echo "Worker process crashed with exit code $?. Respawning in 5 seconds..." >&2
+    sleep 5
+  done
+) &
 
 exec gunicorn --bind=0.0.0.0:$PORT --timeout 120 --workers 2 api.app:application
\ No newline at end of file
diff --git a/tests/test_worker.py b/tests/test_worker.py
index 8b8060a..c5e173e 100644
--- a/tests/test_worker.py
+++ b/tests/test_worker.py
@@ -30,10 +30,9 @@ def setUp(self):
     def test_worker_processes_pending_scan_successfully(self, mock_sleep, mock_env, mock_engine_class, mock_db_class):
         """
         Verify the happy path:
-        1. Worker finds a pending scan.
-        2. Updates status to 'running'.
-        3. Executes scan via ScanEngine.
-        4. Saves findings and updates status to 'completed'.
+        1. Worker claims a pending scan atomically.
+        2. Executes scan via ScanEngine.
+        3. Saves findings and updates status to 'completed'.
         """
         mock_env.return_value = self.mock_db_url
         
@@ -50,17 +49,19 @@ def test_worker_processes_pending_scan_successfully(self, mock_sleep, mock_env,
             "started_at": "2026-06-05T12:00:00Z"
         }
 
-        # We need to stop the infinite loop. We'll raise StopWorker on the second call to get_pending_scans.
-        mock_db.get_pending_scans.side_effect = [
-            [{"scan_id": self.scan_id, "subscription_id": self.subscription_id}],
-            StopWorker()
+        # We need to stop the infinite loop. We'll raise StopWorker on the second call to recover_stale_scans.
+        mock_db.recover_stale_scans.side_effect = [None, StopWorker()]
+        mock_db.claim_next_pending_scan.side_effect = [
+            {"scan_id": self.scan_id, "subscription_id": self.subscription_id},
+            None
         ]
 
         with self.assertRaises(StopWorker):
             run_worker()
 
         # Verify state transitions
-        mock_db.update_scan_status.assert_any_call(self.scan_id, "running")
+        mock_db.recover_stale_scans.assert_called()
+        mock_db.claim_next_pending_scan.assert_called()
         mock_engine.run_scan.assert_called_once_with(self.scan_id)
         mock_db.save_scan.assert_called_once()
         
@@ -76,16 +77,17 @@ def test_worker_processes_pending_scan_successfully(self, mock_sleep, mock_env,
     def test_worker_handles_scan_failure_gracefully(self, mock_sleep, mock_env, mock_engine_class, mock_db_class):
         """
         Verify the error path:
-        1. Worker finds a pending scan.
+        1. Worker claims a pending scan.
         2. ScanEngine raises an exception.
-        3. Worker catches it and marks the scan as 'failed' with the error message.
+        3. Worker catches it and marks the scan as 'failed' with a sanitized error message.
         """
         mock_env.return_value = self.mock_db_url
         mock_db = mock_db_class.return_value
         
-        mock_db.get_pending_scans.side_effect = [
-            [{"scan_id": self.scan_id, "subscription_id": self.subscription_id}],
-            StopWorker()
+        mock_db.recover_stale_scans.side_effect = [None, StopWorker()]
+        mock_db.claim_next_pending_scan.side_effect = [
+            {"scan_id": self.scan_id, "subscription_id": self.subscription_id},
+            None
         ]
         
         # Mock Engine to fail
@@ -95,8 +97,10 @@ def test_worker_handles_scan_failure_gracefully(self, mock_sleep, mock_env, mock
         with self.assertRaises(StopWorker):
             run_worker()
 
-        # Verify status was updated to failed
-        mock_db.update_scan_status.assert_any_call(self.scan_id, "failed", error_message="Azure Authentication Failed")
+        # Verify status was updated to failed with sanitized message
+        mock_db.update_scan_status.assert_any_call(
+            self.scan_id, "failed", error_message="Azure Error: Azure Authentication Failed"
+        )
         # Ensure findings were NOT saved on failure
         mock_db.save_scan.assert_not_called()
 
@@ -108,10 +112,8 @@ def test_worker_sleeps_when_no_scans_pending(self, mock_sleep, mock_env, mock_db
         mock_env.return_value = self.mock_db_url
         mock_db = mock_db_class.return_value
         
-        mock_db.get_pending_scans.side_effect = [
-            [],
-            StopWorker()
-        ]
+        mock_db.recover_stale_scans.side_effect = [None, StopWorker()]
+        mock_db.claim_next_pending_scan.return_value = None
 
         with self.assertRaises(StopWorker):
             run_worker()

From 27ab392079a1d84ae53fbdcb78916494ebdb4dda Mon Sep 17 00:00:00 2001
From: ritiksah141 <ritiksah141@gmail.com>
Date: Fri, 12 Jun 2026 00:30:57 +0100
Subject: [PATCH 5/5] fix: address security and architecture issues in async
 scan processing

- Sanitize worker error messages to prevent sensitive exception details from being exposed through the public API

- Revert unrelated schema and search_path changes to maintain compatibility with existing public-schema deployments

- Add  column to preserve  as the original queue timestamp

- Improve migration logic to correctly backfill historical scans and repair incorrect  statuses

- Update worker tests to reflect generic error handling and the new atomic scan-claiming workflow
---
 api/models/finding.py | 30 +++++++++++++++++-------------
 scanner/worker.py     |  3 ---
 tests/test_worker.py  |  2 +-
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/api/models/finding.py b/api/models/finding.py
index 7cd9aa3..f33b093 100644
--- a/api/models/finding.py
+++ b/api/models/finding.py
@@ -86,16 +86,10 @@ def __init__(self, dsn: Optional[str] = None) -> None:
     # ------------------------------------------------------------------ #
 
     def connect(self) -> None:
-        """Open a persistent database connection and set the search path."""
+        """Open a persistent database connection."""
         self.conn = psycopg2.connect(self.dsn)
-        self.conn.autocommit = True  # Set to True for schema management
-        with self.conn.cursor() as cur:
-            # Ensure the openshield schema exists and is preferred in the search path.
-            # This avoids 'permission denied for schema public' in restricted environments.
-            cur.execute("CREATE SCHEMA IF NOT EXISTS openshield;")
-            cur.execute("SET search_path TO openshield, public;")
         self.conn.autocommit = False
-        logger.info("Database connection established (schema: openshield)")
+        logger.info("Database connection established")
 
     def _get_conn(self) -> Any:
         if self.conn is None or self.conn.closed:
@@ -134,6 +128,7 @@ def create_tables(self) -> None:
                     scan_id         UUID PRIMARY KEY,
                     subscription_id TEXT NOT NULL,
                     started_at      TIMESTAMPTZ NOT NULL,
+                    claimed_at      TIMESTAMPTZ,
                     completed_at    TIMESTAMPTZ,
                     total_findings  INTEGER DEFAULT 0,
                     score           INTEGER DEFAULT NULL,
@@ -210,8 +205,17 @@ def run_migrations(self) -> None:
                     ALTER TABLE scans
                         ADD COLUMN IF NOT EXISTS cve_enrichment_status TEXT DEFAULT 'COMPLETED',
                         ADD COLUMN IF NOT EXISTS status                TEXT DEFAULT 'completed',
-                        ADD COLUMN IF NOT EXISTS error_message         TEXT
+                        ADD COLUMN IF NOT EXISTS error_message         TEXT,
+                        ADD COLUMN IF NOT EXISTS claimed_at            TIMESTAMPTZ
                 """)
+                # Fix: If status already existed but was backfilled as 'pending' (e.g. from 
+                # a previous buggy deploy), force it to 'completed' for all historical 
+                # scans that have already finished.
+                cur.execute("UPDATE scans SET status = 'completed' WHERE status = 'pending' AND completed_at IS NOT NULL")
+                
+                # Backfill claimed_at for any currently running scans so they don't get 
+                # immediately marked as stale by the new recovery logic.
+                cur.execute("UPDATE scans SET claimed_at = started_at WHERE status = 'running' AND claimed_at IS NULL")
             conn.commit()
             logger.info("CVE migrations applied successfully")
         except Exception as e:
@@ -414,12 +418,12 @@ def claim_next_pending_scan(self) -> Optional[Dict[str, Any]]:
         """Atomically claim the next pending scan using SKIP LOCKED."""
         conn = self._get_conn()
         from datetime import datetime, timezone
-        started_at = datetime.now(timezone.utc).isoformat()
+        claimed_at = datetime.now(timezone.utc).isoformat()
         with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
             cur.execute(
                 """
                 UPDATE scans 
-                SET status = 'running', started_at = %s
+                SET status = 'running', claimed_at = %s
                 WHERE scan_id = (
                     SELECT scan_id 
                     FROM scans 
@@ -430,7 +434,7 @@ def claim_next_pending_scan(self) -> Optional[Dict[str, Any]]:
                 ) 
                 RETURNING *
                 """,
-                (started_at,)
+                (claimed_at,)
             )
             row = cur.fetchone()
             if row:
@@ -448,7 +452,7 @@ def recover_stale_scans(self, timeout_minutes: int = 60) -> int:
                 SET status = 'failed', 
                     error_message = 'Scan timed out after remaining in running state for too long.'
                 WHERE status = 'running' 
-                  AND started_at < (CURRENT_TIMESTAMP - INTERVAL '%s minutes')
+                  AND claimed_at < (CURRENT_TIMESTAMP - INTERVAL '%s minutes')
                 """,
                 (timeout_minutes,)
             )
diff --git a/scanner/worker.py b/scanner/worker.py
index b43f65d..14b77c9 100644
--- a/scanner/worker.py
+++ b/scanner/worker.py
@@ -65,9 +65,6 @@ def run_worker():
                 
                 # Sanitize public error message
                 public_error = "An internal error occurred during the scan. Please check the logs."
-                if "Authentication" in str(exc) or "Permission" in str(exc):
-                    public_error = f"Azure Error: {str(exc)}"
-                
                 db.update_scan_status(scan_id, "failed", error_message=public_error)
 
         except Exception as exc:
diff --git a/tests/test_worker.py b/tests/test_worker.py
index c5e173e..93ef349 100644
--- a/tests/test_worker.py
+++ b/tests/test_worker.py
@@ -99,7 +99,7 @@ def test_worker_handles_scan_failure_gracefully(self, mock_sleep, mock_env, mock
 
         # Verify status was updated to failed with sanitized message
         mock_db.update_scan_status.assert_any_call(
-            self.scan_id, "failed", error_message="Azure Error: Azure Authentication Failed"
+            self.scan_id, "failed", error_message="An internal error occurred during the scan. Please check the logs."
         )
         # Ensure findings were NOT saved on failure
         mock_db.save_scan.assert_not_called()