From 17e2568f87c15805952e68e00f41b129a4def14a Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Wed, 10 Jun 2026 17:12:45 -0400 Subject: [PATCH 01/20] feat: add configurable dlp control plane --- application/single_app/config.py | 2 +- .../single_app/functions_authentication.py | 36 +- application/single_app/functions_dlp.py | 458 ++++++++++++ application/single_app/functions_dlp_rules.py | 303 ++++++++ application/single_app/functions_documents.py | 660 +++++++++++++----- application/single_app/functions_settings.py | 19 +- application/single_app/requirements.txt | 3 +- application/single_app/route_backend_chats.py | 251 +++++-- .../route_frontend_admin_settings.py | 85 ++- .../static/js/admin/admin_settings.js | 46 +- .../single_app/templates/admin_settings.html | 130 ++++ .../features/DLP_UPLOAD_STAGING.md | 117 ++++ .../features/DLP_WEB_SEARCH_EGRESS_CONTROL.md | 137 ++++ docs/explanation/release_notes.md | 21 +- .../test_dlp_admin_settings_roundtrip.py | 222 ++++++ .../test_dlp_admin_settings_ui.py | 207 ++++++ functional_tests/test_dlp_admin_ui_smoke.py | 136 ++++ functional_tests/test_dlp_control_plane.py | 217 ++++++ functional_tests/test_dlp_regex_rules.py | 241 +++++++ functional_tests/test_dlp_review_events.py | 115 +++ functional_tests/test_dlp_telemetry.py | 210 ++++++ .../test_upload_dlp_ingestion_integration.py | 630 +++++++++++++++++ functional_tests/test_upload_dlp_redaction.py | 257 +++++++ .../test_upload_dlp_workspace_scopes.py | 79 +++ .../test_web_search_current_message_only.py | 9 +- .../test_web_search_dlp_egress.py | 252 +++++++ .../test_web_search_dlp_route_integration.py | 143 ++++ tools/local_dev/render_dlp_admin_preview.py | 91 +++ tools/local_dev/run_dlp_local_stack.md | 111 +++ 29 files changed, 4939 insertions(+), 249 deletions(-) create mode 100644 application/single_app/functions_dlp.py create mode 100644 application/single_app/functions_dlp_rules.py create mode 100644 docs/explanation/features/DLP_UPLOAD_STAGING.md create 
mode 100644 docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md create mode 100644 functional_tests/test_dlp_admin_settings_roundtrip.py create mode 100644 functional_tests/test_dlp_admin_settings_ui.py create mode 100644 functional_tests/test_dlp_admin_ui_smoke.py create mode 100644 functional_tests/test_dlp_control_plane.py create mode 100644 functional_tests/test_dlp_regex_rules.py create mode 100644 functional_tests/test_dlp_review_events.py create mode 100644 functional_tests/test_dlp_telemetry.py create mode 100644 functional_tests/test_upload_dlp_ingestion_integration.py create mode 100644 functional_tests/test_upload_dlp_redaction.py create mode 100644 functional_tests/test_upload_dlp_workspace_scopes.py create mode 100644 functional_tests/test_web_search_dlp_egress.py create mode 100644 functional_tests/test_web_search_dlp_route_integration.py create mode 100644 tools/local_dev/render_dlp_admin_preview.py create mode 100644 tools/local_dev/run_dlp_local_stack.md diff --git a/application/single_app/config.py b/application/single_app/config.py index 89ff14aa..cd58865b 100644 --- a/application/single_app/config.py +++ b/application/single_app/config.py @@ -95,7 +95,7 @@ EXECUTOR_TYPE = 'thread' EXECUTOR_MAX_WORKERS = 30 SESSION_TYPE = 'filesystem' -VERSION = "0.242.072" +VERSION = "0.242.073" SESSION_COOKIE_SAMESITE = os.getenv('SESSION_COOKIE_SAMESITE', 'Lax') SESSION_COOKIE_HTTPONLY = os.getenv('SESSION_COOKIE_HTTPONLY', 'true').lower() != 'false' diff --git a/application/single_app/functions_authentication.py b/application/single_app/functions_authentication.py index 86abdc12..66eaaa52 100644 --- a/application/single_app/functions_authentication.py +++ b/application/single_app/functions_authentication.py @@ -2,9 +2,14 @@ import base64 import json +import re from config import * -from functions_appinsights import log_event +try: + from functions_appinsights import log_event +except Exception: + def log_event(message, extra=None, level=None, 
exceptionTraceback=False): + return None from functions_settings import * from functions_debug import debug_print @@ -331,7 +336,23 @@ def get_valid_access_token_for_plugins(scopes=None): "error_code": error_code, "error_description": error_desc } - + + +def _sanitize_video_indexer_auth_log_value(value): + text = str(value) + text = re.sub( + r'([?&]accessToken=)[^&\s\'"<>]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + return re.sub( + r'([\'"]?accessToken[\'"]?\s*[:=]\s*[\'"]?)[^,\'"\s}&]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + def get_video_indexer_account_token(settings, video_id=None): """ Get Video Indexer access token using managed identity authentication. @@ -435,7 +456,7 @@ def get_video_indexer_managed_identity_token(settings, video_id=None): debug_print(f"[VIDEO INDEXER AUTH] ARM API response status: {resp.status_code}") if resp.status_code != 200: - debug_print(f"[VIDEO INDEXER AUTH] ARM API response text: {resp.text}") + debug_print(f"[VIDEO INDEXER AUTH] ARM API response text: {_sanitize_video_indexer_auth_log_value(resp.text)}") resp.raise_for_status() response_data = resp.json() @@ -443,20 +464,21 @@ def get_video_indexer_managed_identity_token(settings, video_id=None): ai = response_data.get("accessToken") if not ai: - debug_print(f"[VIDEO INDEXER AUTH] ERROR: No accessToken in response: {response_data}") + debug_print(f"[VIDEO INDEXER AUTH] ERROR: No accessToken in response; response keys: {list(response_data.keys())}") raise ValueError("No accessToken found in ARM API response") debug_print(f"[VIDEO INDEXER AUTH] Account token acquired successfully (length: {len(ai)})") debug_print(f"[VIDEO] Account token acquired (len={len(ai)})", flush=True) return ai except requests.exceptions.RequestException as e: - debug_print(f"[VIDEO INDEXER AUTH] ERROR in ARM API request: {str(e)}") + sanitized_error = _sanitize_video_indexer_auth_log_value(e) + debug_print(f"[VIDEO INDEXER AUTH] ERROR in ARM API request: 
{sanitized_error}") if hasattr(e, 'response') and e.response is not None: debug_print(f"[VIDEO INDEXER AUTH] Error response status: {e.response.status_code}") - debug_print(f"[VIDEO INDEXER AUTH] Error response text: {e.response.text}") + debug_print(f"[VIDEO INDEXER AUTH] Error response text: {_sanitize_video_indexer_auth_log_value(e.response.text)}") raise except Exception as e: - debug_print(f"[VIDEO INDEXER AUTH] Unexpected error: {str(e)}") + debug_print(f"[VIDEO INDEXER AUTH] Unexpected error: {_sanitize_video_indexer_auth_log_value(e)}") raise diff --git a/application/single_app/functions_dlp.py b/application/single_app/functions_dlp.py new file mode 100644 index 00000000..6f764731 --- /dev/null +++ b/application/single_app/functions_dlp.py @@ -0,0 +1,458 @@ +# functions_dlp.py + +import hashlib +import logging +from collections import OrderedDict + +from functions_dlp_rules import get_effective_dlp_regex_rules, scan_text_with_dlp_regex_rules + +try: + from functions_appinsights import log_event +except Exception: + def log_event(message, extra=None, level=logging.INFO, exceptionTraceback=False): + logging.log(level, "%s %s", message, extra or {}) + + +WEB_SEARCH_BLOCKED_MESSAGE = "Web search was blocked because the message appears to contain non-public information." +WEB_SEARCH_REDACTED_MESSAGE = "Sensitive details were removed before web search." 
+ +DEFAULT_MAX_SCAN_CHARS = 200000 +DEFAULT_SCANNER_TIMEOUT_SECONDS = 5 +SUPPORTED_WEB_SEARCH_MODES = {"monitor", "redact", "block"} + +def _bool_setting(settings, key, default=False): + return bool((settings or {}).get(key, default)) + + +def _safe_int(value, default): + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _safe_float(value, default): + try: + return float(value) + except (TypeError, ValueError): + return default + + +def _normalize_engine(settings): + """Return the implemented DLP engine for this release.""" + requested = str((settings or {}).get("dlp_default_engine", "regex") or "regex").lower() + if requested != "regex": + return "regex" + return "regex" + + +def _normalize_mode(settings, surface): + if surface == "web_search": + mode = str((settings or {}).get("web_search_dlp_mode", "monitor") or "monitor").lower() + elif surface == "upload": + mode = str((settings or {}).get("upload_dlp_mode", "monitor") or "monitor").lower() + else: + mode = str((settings or {}).get("dlp_mode", "monitor") or "monitor").lower() + + return mode if mode in SUPPORTED_WEB_SEARCH_MODES else "monitor" + + +def _empty_result(text, enabled=False, engine="regex", mode="monitor", decision="allow", scanner_status="ok"): + safe_text = str(text or "") + return { + "enabled": enabled, + "engine": engine, + "mode": mode, + "decision": decision, + "text": safe_text, + "redacted_text": safe_text, + "total_replacements": 0, + "match_counts": {}, + "matches": [], + "metadata": {}, + "scanner_status": scanner_status, + } + + +def _apply_regex_engine(text, settings=None, surface="generic"): + rules, rule_errors = get_effective_dlp_regex_rules(settings or {}) + redacted_text, match_counts, matches, rule_metadata = scan_text_with_dlp_regex_rules( + text, + rules, + surface, + ) + return redacted_text, match_counts, matches, { + "rule_errors": len(rule_errors), + **rule_metadata, + } + + +def _decision_from_counts(match_counts, mode): + if not 
match_counts: + return "allow" + if mode == "block": + return "block" + if mode == "redact": + return "redact" + return "monitor" + + +def normalize_presidio_results(text, recognizer_results, mode="redact", engine="presidio_service"): + """Normalize Presidio-style entity offsets into the shared counts-only result.""" + source_text = str(text or "") + sorted_results = sorted( + [ + item for item in (recognizer_results or []) + if isinstance(item, dict) and item.get("entity_type") and item.get("start") is not None and item.get("end") is not None + ], + key=lambda item: int(item.get("start")), + ) + match_counts = OrderedDict() + redacted_parts = [] + cursor = 0 + + for item in sorted_results: + start = max(0, min(len(source_text), int(item.get("start")))) + end = max(start, min(len(source_text), int(item.get("end")))) + entity_type = str(item.get("entity_type")) + if start < cursor: + continue + redacted_parts.append(source_text[cursor:start]) + redacted_parts.append(f"[REDACTED_{entity_type}]") + cursor = end + match_counts[entity_type] = match_counts.get(entity_type, 0) + 1 + + redacted_parts.append(source_text[cursor:]) + redacted_text = "".join(redacted_parts) + counts = dict(match_counts) + decision = _decision_from_counts(counts, mode) + + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": decision, + "text": redacted_text if counts else source_text, + "redacted_text": redacted_text if counts else source_text, + "total_replacements": sum(counts.values()), + "match_counts": counts, + "matches": [{"entity_type": key, "count": value} for key, value in counts.items()], + "metadata": {"adapter": "presidio"}, + "scanner_status": "ok", + } + + +def evaluate_dlp_text(text, settings=None, context=None, surface="generic"): + """Evaluate text against the configured DLP policy and return a safe result.""" + settings = settings or {} + context = context or {} + original_text = str(text or "") + engine = _normalize_engine(settings) + mode = 
_normalize_mode(settings, surface) + max_scan_chars = _safe_int(settings.get("dlp_max_scan_chars"), DEFAULT_MAX_SCAN_CHARS) + + if not _bool_setting(settings, "enable_dlp_control_plane", False): + return _empty_result(original_text, enabled=False, engine=engine, mode=mode, decision="allow") + + scan_text = original_text[:max_scan_chars] + skipped_chars = max(0, len(original_text) - len(scan_text)) + upload_fail_on_match = surface == "upload" and _bool_setting(settings, "upload_dlp_fail_upload_on_match", False) + + if skipped_chars and surface in {"web_search", "upload"} and (mode in {"redact", "block"} or upload_fail_on_match): + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": "block", + "text": "", + "redacted_text": "", + "total_replacements": 0, + "match_counts": {}, + "matches": [], + "metadata": {"skipped_chars": skipped_chars}, + "scanner_status": "truncated", + } + + try: + redacted_text, match_counts, matches, scanner_metadata = _apply_regex_engine(scan_text, settings, surface) + except Exception as exc: + log_event( + "[DLP] Scanner error", + extra={ + "dlp_surface": surface, + "dlp_engine": engine, + "scanner_status": "error", + "error_type": type(exc).__name__, + }, + level=logging.WARNING, + exceptionTraceback=False, + ) + fail_closed = _bool_setting(settings, "dlp_fail_closed_on_scanner_error", True) + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": "block" if fail_closed else "allow", + "text": "" if fail_closed else original_text, + "redacted_text": "" if fail_closed else original_text, + "total_replacements": 0, + "match_counts": {}, + "matches": [], + "metadata": {"error_hash": hashlib.sha256(str(exc).encode("utf-8")).hexdigest()[:16]}, + "scanner_status": "error", + } + + if skipped_chars and mode == "monitor": + metadata = dict(scanner_metadata) + metadata["skipped_chars"] = skipped_chars + metadata = {key: value for key, value in metadata.items() if value not in ("", None, {}, [])} + 
return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": "allow", + "text": original_text, + "redacted_text": original_text, + "total_replacements": sum(match_counts.values()), + "match_counts": dict(match_counts), + "matches": matches, + "metadata": metadata, + "scanner_status": "truncated", + } + + decision = _decision_from_counts(match_counts, mode) + safe_text = "" if decision == "block" else (redacted_text if match_counts else original_text) + safe_redacted_text = "" if decision == "block" else (redacted_text if match_counts else original_text) + metadata = dict(scanner_metadata) + if skipped_chars: + metadata["skipped_chars"] = skipped_chars + metadata = {key: value for key, value in metadata.items() if value not in ("", None, {}, [])} + + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": decision, + "text": safe_text, + "redacted_text": safe_redacted_text, + "total_replacements": sum(match_counts.values()), + "match_counts": dict(match_counts), + "matches": matches, + "metadata": metadata, + "scanner_status": "truncated" if skipped_chars else "ok", + } + + +def evaluate_web_search_egress(text, settings=None, context=None): + """Evaluate and shape DLP decisions for web-search egress.""" + settings = settings or {} + context = context or {} + + if not _bool_setting(settings, "enable_web_search_dlp", False): + result = _empty_result( + text, + enabled=_bool_setting(settings, "enable_dlp_control_plane", False), + engine=_normalize_engine(settings), + mode=_normalize_mode(settings, "web_search"), + decision="allow", + ) + else: + result = evaluate_dlp_text(text, settings=settings, context=context, surface="web_search") + + decision = result.get("decision", "allow") + web_search_allowed = decision != "block" + if decision == "block": + status_message = WEB_SEARCH_BLOCKED_MESSAGE + web_search_query_text = "" + elif decision == "redact": + status_message = WEB_SEARCH_REDACTED_MESSAGE + web_search_query_text = 
result.get("redacted_text", "") + else: + status_message = "" + web_search_query_text = str(text or "") + + shaped = dict(result) + shaped.update( + { + "web_search_allowed": web_search_allowed, + "web_search_query_text": web_search_query_text, + "status_message": status_message, + } + ) + return shaped + + +def _safe_entity_counts(match_counts): + return { + str(entity_type): int(count) + for entity_type, count in (match_counts or {}).items() + if entity_type and int(count) > 0 + } + + +def _error_hash(result): + metadata = result.get("metadata") if isinstance(result, dict) else {} + raw_error = "" + if isinstance(metadata, dict): + raw_error = str(metadata.get("error") or metadata.get("error_hash") or "") + if not raw_error: + raw_error = "scanner_error" + return hashlib.sha256(raw_error.encode("utf-8")).hexdigest()[:16] + + +def build_dlp_telemetry_properties(result, surface, context=None): + """Build App Insights-safe DLP telemetry properties.""" + result = result or {} + context = context or {} + properties = { + "activity_type": "dlp_decision", + "dlp_surface": str(surface or "unknown"), + "dlp_action": str(result.get("decision") or "allow"), + "dlp_engine": str(result.get("engine") or "unknown"), + "dlp_mode": str(result.get("mode") or "monitor"), + "workspace_scope": str(context.get("workspace_scope") or context.get("document_scope") or "unknown"), + "scanner_status": str(result.get("scanner_status") or "ok"), + "dlp_total_replacements": int(result.get("total_replacements") or 0), + "dlp_entity_counts": _safe_entity_counts(result.get("match_counts")), + } + + for key in ("conversation_id", "chat_type", "document_scope", "document_id"): + if context.get(key): + properties[key] = str(context.get(key)) + + if properties["scanner_status"] != "ok": + properties["scanner_error"] = _error_hash(result) + + return properties + + +def should_emit_dlp_telemetry(result, settings=None): + settings = settings or {} + result = result or {} + if not _bool_setting(settings, 
"dlp_enable_structured_telemetry", True): + return False + action = str(result.get("decision") or "allow") + if action in {"block", "redact"}: + return True + if str(result.get("scanner_status") or "ok") != "ok": + return True + if _safe_int(result.get("total_replacements"), 0) > 0: + return True + if _safe_entity_counts(result.get("match_counts")): + return True + return _bool_setting(settings, "dlp_telemetry_sample_allow_events", False) + + +def build_dlp_review_event_summary(result, surface, context=None): + """Build a counts-only review payload for optional DLP review routing.""" + result = result or {} + context = context or {} + normalized_surface = str(surface or "unknown") + policy_type = "dlp_web_search" if normalized_surface == "web_search" else f"dlp_{normalized_surface}" + + summary = { + "policy_type": policy_type, + "violation_type": "dlp", + "surface": normalized_surface, + "action": str(result.get("decision") or "allow"), + "engine": str(result.get("engine") or "unknown"), + "mode": str(result.get("mode") or "monitor"), + "entity_counts": _safe_entity_counts(result.get("match_counts")), + "total_replacements": int(result.get("total_replacements") or 0), + "scanner_status": str(result.get("scanner_status") or "ok"), + "raw_matches": None, + } + + for key in ("conversation_id", "user_id", "document_id", "chat_type", "document_scope"): + if context.get(key): + summary[key] = str(context.get(key)) + + return summary + + +def evaluate_upload_content(text, settings=None, context=None): + """PR2-facing helper for upload DLP; upload wiring is added later.""" + settings = settings or {} + context = context or {} + + if not _bool_setting(settings, "enable_upload_dlp", False): + result = _empty_result( + text, + enabled=_bool_setting(settings, "enable_dlp_control_plane", False), + engine=str(settings.get("dlp_default_engine", "regex") or "regex"), + mode=_normalize_mode(settings, "upload"), + decision="allow", + ) + else: + result = evaluate_dlp_text(text, 
settings=settings, context=context, surface="upload") + + if ( + _bool_setting(settings, "upload_dlp_fail_upload_on_match", False) + and int(result.get("total_replacements") or 0) > 0 + ): + result = dict(result) + result["decision"] = "block" + result["text"] = "" + result["redacted_text"] = "" + + decision = result.get("decision", "allow") + scanner_status = result.get("scanner_status", "ok") + upload_allowed = decision != "block" and scanner_status != "blocked" + if scanner_status != "ok" and decision == "block": + status = "scanner_failed" + elif decision == "block": + status = "blocked" + elif decision == "redact": + status = "accepted_with_redactions" + elif decision == "monitor": + status = "accepted_with_dlp_monitoring" + else: + status = "accepted" + + if decision == "block": + sanitized_text = "" + elif decision == "redact": + sanitized_text = result.get("redacted_text", "") + else: + sanitized_text = str(text or "") + + shaped = dict(result) + shaped.update( + { + "upload_allowed": upload_allowed, + "sanitized_text": sanitized_text, + "status": status, + "dlp_metadata": build_dlp_metadata_summary(result, surface="upload", context=context), + } + ) + return shaped + + +def build_dlp_metadata_summary(result, surface, context=None): + """Build counts-only DLP metadata safe for document records.""" + result = result or {} + context = context or {} + summary = { + "dlp_surface": str(surface or "unknown"), + "dlp_action": str(result.get("decision") or "allow"), + "dlp_engine": str(result.get("engine") or "unknown"), + "dlp_mode": str(result.get("mode") or "monitor"), + "scanner_status": str(result.get("scanner_status") or "ok"), + "total_replacements": int(result.get("total_replacements") or 0), + "entity_counts": _safe_entity_counts(result.get("match_counts")), + } + for key in ("workspace_scope", "document_id"): + if context.get(key): + summary[key] = str(context.get(key)) + return summary + + +def build_upload_dlp_file_log_summary(result, context=None): + 
"""Build a safe file-processing log summary for upload DLP decisions.""" + result = result or {} + context = context or {} + summary = build_dlp_metadata_summary(result, surface="upload", context=context) + for key in ("document_id", "workspace_scope", "page_number", "text_length"): + if context.get(key) is not None: + summary[key] = context.get(key) + return summary diff --git a/application/single_app/functions_dlp_rules.py b/application/single_app/functions_dlp_rules.py new file mode 100644 index 00000000..3b4b0426 --- /dev/null +++ b/application/single_app/functions_dlp_rules.py @@ -0,0 +1,303 @@ +# functions_dlp_rules.py + +import copy +import hashlib +from collections import OrderedDict + +import regex + + +CONFIDENCE_ORDER = {"low": 1, "medium": 2, "high": 3} +ALLOWED_FLAGS = {"IGNORECASE": regex.IGNORECASE, "MULTILINE": regex.MULTILINE} +ALLOWED_VALIDATORS = {"none", "luhn"} +ALLOWED_SURFACES = {"web_search", "upload"} +MAX_RULES = 50 +MAX_PATTERN_LENGTH = 512 +MAX_REPLACEMENT_LENGTH = 80 +MAX_KEYWORDS = 25 +MAX_KEYWORD_LENGTH = 80 +MAX_WINDOW_CHARS = 256 +REGEX_TIMEOUT_SECONDS = 0.05 + + +DEFAULT_DLP_REGEX_RULES = [ + { + "id": "us_ssn", + "label": "U.S. Social Security Number", + "entity_type": "US_SSN", + "enabled": True, + "pattern": r"(? 
19: + return False + + checksum = 0 + reverse_digits = list(reversed(digits)) + for index, digit in enumerate(reverse_digits): + if index % 2 == 1: + digit *= 2 + if digit > 9: + digit -= 9 + checksum += digit + return checksum % 10 == 0 + + +def _validator_allows(value, validator): + if validator == "luhn": + return _luhn_valid(value) + return True + + +def _safe_rule_id(value, index): + candidate = _as_string(value, f"rule_{index + 1}").lower() + candidate = regex.sub(r"[^a-z0-9_\-]+", "_", candidate).strip("_-") + return candidate or f"rule_{index + 1}" + + +def validate_dlp_regex_rules(rules): + normalized_rules = [] + errors = [] + + if rules is None: + return get_default_dlp_regex_rules(), [] + if not isinstance(rules, list): + return [], ["dlp_regex_rules must be a list."] + if len(rules) > MAX_RULES: + return [], [f"dlp_regex_rules cannot contain more than {MAX_RULES} rules."] + + seen_ids = set() + for index, rule in enumerate(rules): + if not isinstance(rule, dict): + errors.append(f"Rule {index + 1} must be an object.") + continue + + rule_id = _safe_rule_id(rule.get("id"), index) + if rule_id in seen_ids: + errors.append(f"Rule {rule_id} has a duplicate id.") + continue + seen_ids.add(rule_id) + + pattern = _as_string(rule.get("pattern")) + if not pattern: + errors.append(f"Rule {rule_id} requires a regex pattern.") + continue + if len(pattern) > MAX_PATTERN_LENGTH: + errors.append(f"Rule {rule_id} pattern exceeds {MAX_PATTERN_LENGTH} characters.") + continue + + flags = _normalize_flags(rule.get("flags", [])) + try: + regex.compile(pattern, _compile_flags(flags)) + except Exception as exc: + errors.append(f"Rule {rule_id} regex does not compile: {type(exc).__name__}.") + continue + + surfaces = [ + _as_string(surface).lower() + for surface in rule.get("surfaces", ["web_search", "upload"]) + if _as_string(surface).lower() in ALLOWED_SURFACES + ] + if not surfaces: + errors.append(f"Rule {rule_id} must target web_search, upload, or both.") + continue + + 
validator = _as_string(rule.get("validator", "none")).lower() + if validator not in ALLOWED_VALIDATORS: + errors.append(f"Rule {rule_id} uses unsupported validator {validator}.") + continue + + confidence = rule.get("confidence", {}) + if not isinstance(confidence, dict): + confidence = {} + + keywords = [] + for keyword in confidence.get("keywords", []): + keyword_text = _as_string(keyword).lower() + if keyword_text and len(keyword_text) <= MAX_KEYWORD_LENGTH and keyword_text not in keywords: + keywords.append(keyword_text) + if len(keywords) >= MAX_KEYWORDS: + break + + try: + window_chars = int(confidence.get("window_chars", 48)) + except (TypeError, ValueError): + window_chars = 48 + window_chars = max(0, min(window_chars, MAX_WINDOW_CHARS)) + + entity_type = _as_string(rule.get("entity_type"), rule_id.upper()).upper() + replacement = _as_string(rule.get("replacement"), f"[REDACTED_{entity_type}]") + if len(replacement) > MAX_REPLACEMENT_LENGTH: + replacement = replacement[:MAX_REPLACEMENT_LENGTH] + + normalized_rules.append( + { + "id": rule_id, + "label": _as_string(rule.get("label"), entity_type), + "entity_type": entity_type, + "enabled": bool(rule.get("enabled", True)), + "pattern": pattern, + "replacement": replacement, + "surfaces": surfaces, + "flags": flags, + "validator": validator, + "confidence": { + "regex_only": _normalize_confidence(confidence.get("regex_only"), "medium"), + "with_keywords": _normalize_confidence(confidence.get("with_keywords"), "high"), + "keywords": keywords, + "window_chars": window_chars, + "minimum": _normalize_confidence(confidence.get("minimum"), "medium"), + }, + } + ) + + return normalized_rules, errors + + +def get_effective_dlp_regex_rules(settings): + normalized_rules, errors = validate_dlp_regex_rules((settings or {}).get("dlp_regex_rules")) + if errors: + default_rules, _ = validate_dlp_regex_rules(get_default_dlp_regex_rules()) + return default_rules, errors + return normalized_rules, [] + + +def 
_confidence_for_match(source_text, start, end, confidence): + keywords = confidence.get("keywords", []) + window_chars = int(confidence.get("window_chars", 0) or 0) + if not keywords or window_chars <= 0: + return confidence.get("regex_only", "medium") + + left = max(0, start - window_chars) + right = min(len(source_text), end + window_chars) + window = source_text[left:right].lower() + if any(keyword in window for keyword in keywords): + return confidence.get("with_keywords", "high") + return confidence.get("regex_only", "medium") + + +def _confidence_allows(actual, minimum): + return CONFIDENCE_ORDER.get(actual, 0) >= CONFIDENCE_ORDER.get(minimum, 2) + + +def _merge_confidence(existing, candidate): + return max(existing, candidate, key=lambda item: CONFIDENCE_ORDER.get(item, 0)) + + +def scan_text_with_dlp_regex_rules(text, rules, surface): + source_text = str(text or "") + redactions = [] + counts = OrderedDict() + confidence_by_entity = {} + + for rule in rules: + if not rule.get("enabled", True): + continue + if surface not in rule.get("surfaces", []): + continue + + compiled = regex.compile(rule["pattern"], _compile_flags(rule.get("flags", []))) + try: + rule_matches = list(compiled.finditer(source_text, timeout=REGEX_TIMEOUT_SECONDS)) + except TimeoutError: + raise RuntimeError(f"DLP regex rule timed out: {rule['id']}") + + for match in rule_matches: + value = match.group(0) + if not _validator_allows(value, rule.get("validator", "none")): + continue + + confidence = _confidence_for_match(source_text, match.start(), match.end(), rule["confidence"]) + if not _confidence_allows(confidence, rule["confidence"].get("minimum", "medium")): + continue + + entity_type = rule["entity_type"] + counts[entity_type] = counts.get(entity_type, 0) + 1 + confidence_by_entity[entity_type] = _merge_confidence( + confidence_by_entity.get(entity_type, "low"), + confidence, + ) + redactions.append((match.start(), match.end(), rule["replacement"])) + + redactions.sort(key=lambda 
item: item[0]) + redacted_parts = [] + cursor = 0 + for start, end, replacement in redactions: + if start < cursor: + continue + redacted_parts.append(source_text[cursor:start]) + redacted_parts.append(replacement) + cursor = end + redacted_parts.append(source_text[cursor:]) + + matches = [ + {"entity_type": entity_type, "count": count, "confidence": confidence_by_entity.get(entity_type, "medium")} + for entity_type, count in counts.items() + ] + metadata = { + "rule_count": len(rules), + "match_hash": hashlib.sha256("|".join(counts.keys()).encode("utf-8")).hexdigest()[:16] if counts else "", + } + return "".join(redacted_parts), dict(counts), matches, metadata diff --git a/application/single_app/functions_documents.py b/application/single_app/functions_documents.py index 5ee49733..041c5054 100644 --- a/application/single_app/functions_documents.py +++ b/application/single_app/functions_documents.py @@ -1,5 +1,6 @@ # functions_documents.py that has some changes I need to merge into Development +import logging import re import shutil import traceback @@ -7,7 +8,11 @@ from io import BytesIO from flask import make_response from config import * -from functions_appinsights import log_event +try: + from functions_appinsights import log_event +except Exception: + def log_event(message, extra=None, level=None, exceptionTraceback=False): + return None from functions_visio import build_visio_page_markdown, parse_vsdx_pages from functions_content import * from functions_settings import * @@ -16,6 +21,12 @@ from functions_authentication import * from functions_debug import * from functions_keyvault import SecretReturnType, keyvault_model_endpoint_get_helper +from functions_dlp import ( + build_dlp_telemetry_properties, + build_upload_dlp_file_log_summary, + evaluate_upload_content, + should_emit_dlp_telemetry, +) import azure.cognitiveservices.speech as speechsdk def allowed_file(filename, allowed_extensions=None): @@ -198,6 +209,271 @@ def 
_resolve_metadata_extraction_client(settings): DI_MARKDOWN_TABLE_ROW_PATTERN = re.compile(r'(?m)^\s*\|.+\|\s*$') +def _sanitize_video_indexer_log_value(value): + text = str(value) + text = re.sub( + r'([?&]accessToken=)[^&\s\'"<>]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + return re.sub( + r'([\'"]?accessToken[\'"]?\s*[:=]\s*[\'"]?)[^,\'"\s}&]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + + +def _get_upload_workspace_scope(group_id=None, public_workspace_id=None): + if public_workspace_id is not None: + return "public" + if group_id is not None: + return "group" + return "personal" + + +def _build_upload_dlp_context(document_id, page_number=None, group_id=None, public_workspace_id=None, text=None): + workspace_scope = _get_upload_workspace_scope(group_id=group_id, public_workspace_id=public_workspace_id) + context = { + "document_id": document_id, + "workspace_scope": workspace_scope, + } + if page_number is not None: + context["page_number"] = page_number + if text is not None: + context["text_length"] = len(text) + return context + + +def _should_disable_enhanced_citations_for_upload_dlp(settings): + if not settings.get("enable_dlp_control_plane", False): + return False + if not settings.get("enable_upload_dlp", False): + return False + if settings.get("dlp_fail_closed_on_scanner_error", True): + return True + if settings.get("upload_dlp_fail_upload_on_match", False): + return True + return str(settings.get("upload_dlp_mode", "monitor") or "monitor").lower() in {"redact", "block"} + + +UPLOAD_DLP_METADATA_FIELDS = ("title", "authors", "organization", "keywords", "abstract") +UPLOAD_DLP_STATUS_RANK = { + "accepted": 0, + "accepted_with_dlp_monitoring": 1, + "accepted_with_redactions": 2, + "scanner_failed": 3, + "blocked": 4, +} + + +def _metadata_value_to_text(value): + if value is None: + return "" + if isinstance(value, list): + return "\n".join("" if item is None else str(item) for item in value) + return str(value) + + +def 
_metadata_text_to_value(original_value, sanitized_text): + text = "" if sanitized_text is None else str(sanitized_text) + if isinstance(original_value, list): + return [line for line in text.splitlines() if line.strip()] + return text + + +def _merge_upload_dlp_document_summary(existing=None, incoming=None): + existing = existing or {} + incoming = incoming or {} + if incoming.get("dlp_metadata"): + incoming = incoming.get("dlp_metadata") or {} + if existing.get("dlp_metadata"): + existing = existing.get("dlp_metadata") or {} + + existing_status = str(existing.get("status") or existing.get("dlp_status") or "accepted") + incoming_status = str(incoming.get("status") or incoming.get("dlp_status") or "accepted") + aggregate_status = existing_status + if UPLOAD_DLP_STATUS_RANK.get(incoming_status, 0) > UPLOAD_DLP_STATUS_RANK.get(existing_status, 0): + aggregate_status = incoming_status + + aggregate = { + "status": aggregate_status, + "entity_counts": {}, + "total_replacements": int(existing.get("total_replacements") or 0) + int(incoming.get("total_replacements") or 0), + "scanner_status": incoming.get("scanner_status") or existing.get("scanner_status") or "ok", + } + for source in (existing.get("entity_counts") or {}, incoming.get("entity_counts") or {}): + for entity_type, count in source.items(): + aggregate["entity_counts"][str(entity_type)] = aggregate["entity_counts"].get(str(entity_type), 0) + int(count or 0) + for key in ("dlp_surface", "dlp_action", "dlp_engine", "dlp_mode", "scanner_status", "workspace_scope", "document_id"): + value = incoming.get(key) if incoming.get(key) is not None else existing.get(key) + if value is not None: + aggregate[key] = value + return aggregate + + +def _upload_metadata_log_summary(metadata, dlp_summary=None): + metadata = metadata or {} + fields = [field for field in UPLOAD_DLP_METADATA_FIELDS if field in metadata] + field_lengths = { + field: len(_metadata_value_to_text(metadata.get(field))) + for field in fields + } + 
populated_fields = [ + field + for field in fields + if field_lengths.get(field, 0) > 0 + ] + return { + "fields": fields, + "field_count": len(fields), + "field_lengths": field_lengths, + "populated_fields": populated_fields, + "populated_field_count": len(populated_fields), + "dlp_summary": dlp_summary or { + "status": "accepted", + "entity_counts": {}, + "total_replacements": 0, + }, + } + + +def _sanitize_upload_metadata_for_dlp(metadata, user_id, document_id, group_id=None, public_workspace_id=None): + sanitized = dict(metadata or {}) + aggregate = { + "status": "accepted", + "entity_counts": {}, + "total_replacements": 0, + "scanner_status": "ok", + } + + for field_name in UPLOAD_DLP_METADATA_FIELDS: + if field_name not in sanitized: + continue + original_value = sanitized.get(field_name) + metadata_text = _metadata_value_to_text(original_value) + if not metadata_text.strip(): + continue + + result = _evaluate_upload_dlp_text( + metadata_text, + user_id=user_id, + document_id=document_id, + page_number=f"metadata:{field_name}", + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized[field_name] = _metadata_text_to_value( + original_value, + result.get("sanitized_text", metadata_text), + ) + incoming_summary = dict(result.get("dlp_metadata") or {}) + incoming_summary["status"] = result.get("status", incoming_summary.get("status", "accepted")) + aggregate = _merge_upload_dlp_document_summary(aggregate, incoming_summary) + + return sanitized, aggregate + + +def _get_current_document_dlp_metadata(document_id, user_id, group_id=None, public_workspace_id=None): + try: + document_metadata = get_document_metadata( + document_id=document_id, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + except Exception as exc: + log_event( + f"[DLP] Failed to retrieve existing upload DLP metadata for document {document_id}: {exc}", + level=logging.WARNING, + ) + return {} + + if not document_metadata: + return {} + + 
current_metadata = dict(document_metadata.get("dlp_metadata") or {}) + if document_metadata.get("dlp_status") and not current_metadata.get("status"): + current_metadata["status"] = document_metadata.get("dlp_status") + return current_metadata + + +def _record_upload_dlp_result(result, user_id, document_id, group_id=None, public_workspace_id=None, page_number=None): + settings = get_settings() + context = _build_upload_dlp_context( + document_id=document_id, + page_number=page_number, + group_id=group_id, + public_workspace_id=public_workspace_id, + text=result.get("sanitized_text", ""), + ) + safe_summary = build_upload_dlp_file_log_summary(result, context=context) + add_file_task_to_file_processing_log( + document_id=document_id, + user_id=public_workspace_id if public_workspace_id is not None else (group_id if group_id is not None else user_id), + content=f"Upload DLP summary: {safe_summary}" + ) + + if should_emit_dlp_telemetry(result, settings): + log_event( + "[DLP] Upload decision", + extra=build_dlp_telemetry_properties(result, surface="upload", context=context), + ) + + incoming_metadata = dict(result.get("dlp_metadata") or {}) + incoming_metadata["status"] = result.get("status", incoming_metadata.get("status", "accepted")) + existing_metadata = _get_current_document_dlp_metadata( + document_id=document_id, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + merged_metadata = _merge_upload_dlp_document_summary(existing_metadata, incoming_metadata) + + update_args = { + "document_id": document_id, + "user_id": user_id, + "dlp_status": merged_metadata.get("status"), + "dlp_metadata": merged_metadata, + } + if group_id is not None: + update_args["group_id"] = group_id + if public_workspace_id is not None: + update_args["public_workspace_id"] = public_workspace_id + + try: + update_document(**update_args) + except Exception as exc: + log_event( + f"[DLP] Failed to update upload DLP document metadata for document 
{document_id}: {exc}", + level=logging.WARNING, + ) + + +def _evaluate_upload_dlp_text(text, user_id, document_id, page_number=None, group_id=None, public_workspace_id=None): + settings = get_settings() + context = _build_upload_dlp_context( + document_id=document_id, + page_number=page_number, + group_id=group_id, + public_workspace_id=public_workspace_id, + text=text, + ) + result = evaluate_upload_content(text, settings=settings, context=context) + _record_upload_dlp_result( + result, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + page_number=page_number, + ) + if not result.get("upload_allowed", True): + raise ValueError("Upload content blocked by DLP policy.") + return result + + def is_pdf_file_name(file_name): """Return True when the file name points to a PDF document.""" return str(file_name or '').lower().endswith('.pdf') @@ -1191,7 +1467,7 @@ def get_document_metadata(document_id, user_id, group_id=None, public_workspace_ add_file_task_to_file_processing_log( document_id=document_id, user_id=public_workspace_id if is_public_workspace else (group_id if is_group else user_id), - content=f"Document metadata lookup returned {len(document_items)} item(s)." + content=f"Document metadata retrieved for document {document_id}, item_count: {len(document_items)}." 
) return _normalize_document_enhanced_citations(document_items[0]) if document_items else None @@ -1231,10 +1507,30 @@ def save_video_chunk( debug_print(f"[VIDEO CHUNK] Converted start_time {start_time} to {seconds} seconds") - # 1) generate embedding on the transcript text + transcript_dlp_result = _evaluate_upload_dlp_text( + page_text_content, + user_id=user_id, + document_id=document_id, + page_number=seconds, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_transcript_text = transcript_dlp_result.get("sanitized_text", page_text_content) + ocr_dlp_result = _evaluate_upload_dlp_text( + ocr_chunk_text, + user_id=user_id, + document_id=document_id, + page_number=f"{seconds}:ocr", + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_ocr_text = ocr_dlp_result.get("sanitized_text", ocr_chunk_text) + dlp_metadata = transcript_dlp_result.get("dlp_metadata") + + # 1) generate embedding on the sanitized transcript text try: debug_print(f"[VIDEO CHUNK] Generating embedding for transcript text") - result = generate_embedding(page_text_content) + result = generate_embedding(sanitized_transcript_text) # Handle both tuple (new) and single value (backward compatibility) if isinstance(result, tuple): @@ -1279,15 +1575,16 @@ def save_video_chunk( chunk = { "id": chunk_id, "document_id": document_id, - "chunk_text": page_text_content, - "video_ocr_chunk_text": ocr_chunk_text, + "chunk_text": sanitized_transcript_text, + "video_ocr_chunk_text": sanitized_ocr_text, "embedding": embedding, "file_name": file_name, "start_time": start_time, "chunk_sequence": seconds, "upload_date": current_time, "version": version, - "document_tags": meta.get('tags', []) if meta else [] + "document_tags": meta.get('tags', []) if meta else [], + "dlp_metadata": dlp_metadata } if is_public_workspace: @@ -1325,6 +1622,8 @@ def save_video_chunk( print(f"[VideoChunk] UPLOAD ERROR for {chunk_id}: {e}", flush=True) except Exception as e: + if str(e) 
== "Upload content blocked by DLP policy.": + raise debug_print(f"[VIDEO CHUNK] Unexpected error processing chunk: {str(e)}") print(f"[VideoChunk] UNEXPECTED ERROR for {document_id}@{start_time}: {e}", flush=True) @@ -1336,7 +1635,8 @@ def process_video_document( update_callback, group_id, public_workspace_id=None, - auto_extract_metadata=True + auto_extract_metadata=True, + enable_enhanced_citations=False ): """ Processes a video by dividing transcript into 30-second chunks, @@ -1367,7 +1667,7 @@ def to_seconds(ts: str) -> float: debug_print("[VIDEO INDEXER] Video file support is enabled, proceeding with indexing") - if settings.get("enable_enhanced_citations", False): + if enable_enhanced_citations: debug_print("[VIDEO INDEXER] Enhanced citations enabled, uploading to blob storage") update_callback(status="Uploading video for enhanced citations...") try: @@ -1422,9 +1722,9 @@ def to_seconds(ts: str) -> float: token = get_video_indexer_account_token(settings) debug_print(f"[VIDEO INDEXER] Authentication successful, token length: {len(token) if token else 0}") except Exception as e: - debug_print(f"[VIDEO INDEXER] Authentication failed: {str(e)}") - print(f"[VIDEO] AUTH ERROR: {e}", flush=True) - update_callback(status=f"VIDEO: auth failed → {e}") + debug_print(f"[VIDEO INDEXER] Authentication failed: {_sanitize_video_indexer_log_value(e)}") + print("[VIDEO] AUTH ERROR", flush=True) + update_callback(status="VIDEO: auth failed") return 0 # 2) Upload video to Indexer @@ -1443,8 +1743,8 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Using managed identity access token authentication") debug_print(f"[VIDEO INDEXER] Upload URL: {url}") - debug_print(f"[VIDEO INDEXER] Upload params: {params}") - debug_print(f"[VIDEO INDEXER] Starting file upload for: {original_filename}") + debug_print(f"[VIDEO INDEXER] Upload params keys: {list(params.keys())}, accessToken_present={bool(token)}, name_length={len(original_filename or '')}") + debug_print(f"[VIDEO 
INDEXER] Starting file upload for name_length={len(original_filename or '')}") with open(temp_file_path, "rb") as f: resp = requests.post(url, params=params, headers=headers, files={"file": f}) @@ -1452,7 +1752,7 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Upload response status: {resp.status_code}") if resp.status_code != 200: - debug_print(f"[VIDEO INDEXER] Upload response text: {resp.text}") + debug_print(f"[VIDEO INDEXER] Upload response text: {_sanitize_video_indexer_log_value(resp.text)}") resp.raise_for_status() response_data = resp.json() @@ -1460,7 +1760,7 @@ def to_seconds(ts: str) -> float: vid = response_data.get("id") if not vid: - debug_print(f"[VIDEO INDEXER] ERROR: No video ID in response: {response_data}") + debug_print(f"[VIDEO INDEXER] ERROR: No video ID in response; response keys: {list(response_data.keys())}") raise ValueError("no video ID returned") debug_print(f"[VIDEO INDEXER] Upload successful, video ID: {vid}") @@ -1483,17 +1783,17 @@ def to_seconds(ts: str) -> float: print(f"[VIDEO] Failed to update document metadata with video_indexer_id: {e}", flush=True) except requests.exceptions.RequestException as e: - debug_print(f"[VIDEO INDEXER] Upload request failed: {str(e)}") + debug_print(f"[VIDEO INDEXER] Upload request failed: {_sanitize_video_indexer_log_value(e)}") if hasattr(e, 'response') and e.response is not None: debug_print(f"[VIDEO INDEXER] Upload error response status: {e.response.status_code}") - debug_print(f"[VIDEO INDEXER] Upload error response text: {e.response.text}") + debug_print(f"[VIDEO INDEXER] Upload error response text: {_sanitize_video_indexer_log_value(e.response.text)}") print(f"[VIDEO] UPLOAD ERROR: {e}", flush=True) - update_callback(status=f"VIDEO: upload failed → {e}") + update_callback(status="VIDEO: upload failed") return 0 except Exception as e: - debug_print(f"[VIDEO INDEXER] Upload unexpected error: {str(e)}") + debug_print(f"[VIDEO INDEXER] Upload unexpected error: 
{_sanitize_video_indexer_log_value(e)}") print(f"[VIDEO] UPLOAD ERROR: {e}", flush=True) - update_callback(status=f"VIDEO: upload failed → {e}") + update_callback(status="VIDEO: upload failed") return 0 # 3) Poll until ready @@ -1506,8 +1806,8 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Using managed identity access token for polling") debug_print(f"[VIDEO INDEXER] Requesting full insights (no filtering)") - debug_print(f"[VIDEO INDEXER] Index polling URL: {index_url}") - debug_print(f"[VIDEO INDEXER] Starting processing polling for video ID: {vid}") + debug_print(f"[VIDEO INDEXER] Index polling request prepared, video_id_length={len(str(vid or ''))}") + debug_print(f"[VIDEO INDEXER] Starting processing polling for video ID length: {len(str(vid or ''))}") poll_count = 0 max_polls = 180 # 90 minutes maximum (30 second intervals) @@ -1539,10 +1839,10 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Poll response keys: {list(data.keys())}") except requests.exceptions.RequestException as e: - debug_print(f"[VIDEO INDEXER] Poll request failed: {str(e)}") + debug_print(f"[VIDEO INDEXER] Poll request failed: {_sanitize_video_indexer_log_value(e)}") if hasattr(e, 'response') and e.response is not None: debug_print(f"[VIDEO INDEXER] Poll error response status: {e.response.status_code}") - debug_print(f"[VIDEO INDEXER] Poll error response text: {e.response.text}") + debug_print(f"[VIDEO INDEXER] Poll error response text: {_sanitize_video_indexer_log_value(e.response.text)}") if poll_count >= max_polls: update_callback(status="VIDEO: polling timeout") return 0 @@ -1593,91 +1893,16 @@ def to_seconds(ts: str) -> float: video_duration_seconds = to_seconds(video_duration) if video_duration else 0 debug_print(f"[VIDEO INDEXER] Video duration: {video_duration} ({video_duration_seconds} seconds)") - # Log raw insights JSON for complete visibility (debug only) - import json - print(f"\n[VIDEO] ===== RAW INSIGHTS JSON =====", flush=True) - try: 
- insights_json = json.dumps(insights, indent=2, ensure_ascii=False) - # Truncate if too long (show first 10000 chars) - if len(insights_json) > 10000: - print(f"{insights_json[:10000]}\n... (truncated, total length: {len(insights_json)} chars)", flush=True) - else: - print(insights_json, flush=True) - except Exception as e: - print(f"[VIDEO] Could not serialize insights to JSON: {e}", flush=True) - print(f"[VIDEO] ===== END RAW INSIGHTS =====\n", flush=True) - debug_print(f"[VIDEO INDEXER] Insights keys available: {list(insights.keys())}") print(f"[VIDEO] Available insight types: {', '.join(list(insights.keys())[:15])}...", flush=True) - # Debug: Show sample structures for all insight types - print(f"\n[VIDEO] ===== SAMPLE DATA STRUCTURES =====", flush=True) - - transcript_data = insights.get("transcript", []) - if transcript_data: - print(f"[VIDEO] TRANSCRIPT sample: {transcript_data[0]}", flush=True) - - ocr_data = insights.get("ocr", []) - if ocr_data: - print(f"[VIDEO] OCR sample: {ocr_data[0]}", flush=True) - - keywords_data_debug = insights.get("keywords", []) - if keywords_data_debug: - print(f"[VIDEO] KEYWORDS sample: {keywords_data_debug[0]}", flush=True) - - labels_data_debug = insights.get("labels", []) - if labels_data_debug: - debug_print(f"[VIDEO INDEXER] LABELS sample: {labels_data_debug[0]}") - - topics_data_debug = insights.get("topics", []) - if topics_data_debug: - debug_print(f"[VIDEO INDEXER] TOPICS sample: {topics_data_debug[0]}") - - audio_effects_data_debug = insights.get("audioEffects", []) - if audio_effects_data_debug: - debug_print(f"[VIDEO INDEXER] AUDIO_EFFECTS sample: {audio_effects_data_debug[0]}") - - emotions_data_debug = insights.get("emotions", []) - if emotions_data_debug: - debug_print(f"[VIDEO INDEXER] EMOTIONS sample: {emotions_data_debug[0]}") - - sentiments_data_debug = insights.get("sentiments", []) - if sentiments_data_debug: - debug_print(f"[VIDEO INDEXER] SENTIMENTS sample: {sentiments_data_debug[0]}") - - 
scenes_data_debug = insights.get("scenes", []) - if scenes_data_debug: - debug_print(f"[VIDEO INDEXER] SCENES sample: {scenes_data_debug[0]}") - - shots_data_debug = insights.get("shots", []) - if shots_data_debug: - debug_print(f"[VIDEO INDEXER] SHOTS sample: {shots_data_debug[0]}") - - faces_data_debug = insights.get("faces", []) - if faces_data_debug: - debug_print(f"[VIDEO INDEXER] FACES sample: {faces_data_debug[0]}") - - namedLocations_data_debug = insights.get("namedLocations", []) - if namedLocations_data_debug: - debug_print(f"[VIDEO INDEXER] NAMED_LOCATIONS sample: {namedLocations_data_debug[0]}") - - # Check for other potential label sources - brands_data_debug = insights.get("brands", []) - if brands_data_debug: - debug_print(f"[VIDEO INDEXER] BRANDS sample: {brands_data_debug[0]}") - - visualContentModeration_debug = insights.get("visualContentModeration", []) - if visualContentModeration_debug: - debug_print(f"[VIDEO INDEXER] VISUAL_MODERATION sample: {visualContentModeration_debug[0]}") - - # Show total counts for all available insights - print(f"[VIDEO] COUNTS:", flush=True) + print(f"[VIDEO] Insight counts:", flush=True) for key in insights.keys(): value = insights.get(key, []) if isinstance(value, list): print(f" {key}: {len(value)} items", flush=True) - print(f"[VIDEO] ===== END SAMPLE DATA =====\n", flush=True) + print("[VIDEO] Insight count logging complete", flush=True) transcript = insights.get("transcript", []) ocr_blocks = insights.get("ocr", []) @@ -1783,7 +2008,7 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Context built - Speech: {len(speech_context)}, OCR: {len(ocr_context)}, Keywords: {len(keywords_context)}, Labels: {len(labels_context)}, People: {len(named_people_context)}, Locations: {len(named_locations_context)}, Objects: {len(detected_objects_context)}") if len(speech_context) > 0: - debug_print(f"[VIDEO INDEXER] First speech item: {speech_context[0]}") + debug_print("[VIDEO INDEXER] First speech item timing 
metadata available") # Sort all contexts by timestamp speech_context.sort(key=lambda x: to_seconds(x["start"])) @@ -2067,7 +2292,7 @@ def to_seconds(ts: str) -> float: insight_parts.append(f"Objects: {', '.join(chunk_objects)}") chunk_text = ". ".join(insight_parts) if insight_parts else "[No content detected]" - debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} has no speech, using insights as text: {chunk_text[:100]}...") + debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} has no speech, using insight summary length: {len(chunk_text)}") debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} at timestamp {start_ts}") debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} text length: {len(chunk_text)}, OCR text length: {len(ocr_text)}") @@ -2096,6 +2321,8 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} saved successfully") total += 1 except Exception as e: + if str(e) == "Upload content blocked by DLP policy.": + raise debug_print(f"[VIDEO INDEXER] Failed to save chunk {chunk_num + 1}: {str(e)}") debug_print(f"[VIDEO INDEXER] Chunk save traceback: {traceback.format_exc()}") @@ -2512,6 +2739,13 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, if not metadata: raise ValueError(f"No metadata found for document {document_id} (group: {is_group})") + metadata, _metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + metadata, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) version = metadata.get("version") if metadata.get("version") else 1 if version is None: raise ValueError(f"Metadata for document {document_id} missing 'version' field") @@ -2520,15 +2754,6 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, print(f"Error updating document status or retrieving metadata for document {document_id}: {repr(e)}\nTraceback:\n{traceback.format_exc()}") raise - # Generate embedding - try: - #status = 
f"Generating embedding for page {page_number}" - #update_document(document_id=document_id, user_id=user_id, status=status) - embedding, token_usage = generate_embedding(page_text_content) - except Exception as e: - print(f"Error generating embedding for page {page_number} of document {document_id}: {e}") - raise - # Build chunk document try: chunk_id = f"{document_id}_{page_number}" @@ -2572,12 +2797,32 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, else: debug_print(f"[SAVE_CHUNKS] No vision analysis found for document {document_id}") + upload_dlp_result = _evaluate_upload_dlp_text( + enhanced_chunk_text, + user_id=user_id, + document_id=document_id, + page_number=page_number, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_chunk_text = upload_dlp_result.get("sanitized_text", enhanced_chunk_text) + dlp_metadata = upload_dlp_result.get("dlp_metadata") + + # Generate embedding after upload DLP so embeddings never receive blocked or unredacted enforced content. 
+ try: + #status = f"Generating embedding for page {page_number}" + #update_document(document_id=document_id, user_id=user_id, status=status) + embedding, token_usage = generate_embedding(sanitized_chunk_text) + except Exception as e: + print(f"Error generating embedding for page {page_number} of document {document_id}: {e}") + raise + if is_public_workspace: chunk_document = { "id": chunk_id, "document_id": document_id, "chunk_id": str(page_number), - "chunk_text": enhanced_chunk_text, + "chunk_text": sanitized_chunk_text, "embedding": embedding, "file_name": file_name, "chunk_keywords": chunk_keywords, @@ -2590,7 +2835,8 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "chunk_sequence": page_number, # or you can keep an incremental idx "upload_date": current_time, "version": version, - "public_workspace_id": public_workspace_id + "public_workspace_id": public_workspace_id, + "dlp_metadata": dlp_metadata } elif is_group: # Get shared_group_ids from document metadata for group documents @@ -2599,7 +2845,7 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "id": chunk_id, "document_id": document_id, "chunk_id": str(page_number), - "chunk_text": enhanced_chunk_text, + "chunk_text": sanitized_chunk_text, "embedding": embedding, "file_name": file_name, "chunk_keywords": chunk_keywords, @@ -2613,7 +2859,8 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "upload_date": current_time, "version": version, "group_id": group_id, - "shared_group_ids": shared_group_ids + "shared_group_ids": shared_group_ids, + "dlp_metadata": dlp_metadata } else: # Get shared_user_ids from document metadata for personal documents @@ -2623,7 +2870,7 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "id": chunk_id, "document_id": document_id, "chunk_id": str(page_number), - "chunk_text": enhanced_chunk_text, + "chunk_text": sanitized_chunk_text, "embedding": 
embedding, "file_name": file_name, "chunk_keywords": chunk_keywords, @@ -2637,7 +2884,8 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "upload_date": current_time, "version": version, "user_id": user_id, - "shared_user_ids": shared_user_ids + "shared_user_ids": shared_user_ids, + "dlp_metadata": dlp_metadata } except Exception as e: print(f"Error creating chunk document for page {page_number} of document {document_id}: {e}") @@ -2708,19 +2956,21 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w if not metadata: raise ValueError(f"No metadata found for document {document_id}") + metadata, _metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + metadata, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) version = metadata.get("version") if metadata.get("version") else 1 + author = ensure_list(metadata.get('authors')) if metadata else [] + title = metadata.get('title', '') if metadata else '' + document_classification = metadata.get('document_classification', 'None') if metadata else 'None' except Exception as e: log_event(f"[save_chunks_batch] Error retrieving metadata for document {document_id}: {repr(e)}", level=logging.ERROR) raise - # Generate all embeddings in batches - texts = [c['page_text_content'] for c in chunks_data] - try: - embedding_results = generate_embeddings_batch(texts) - except Exception as e: - log_event(f"[save_chunks_batch] Error generating batch embeddings for document {document_id}: {e}", level=logging.ERROR) - raise - # Check for vision analysis once vision_analysis = metadata.get('vision_analysis') vision_text = "" @@ -2742,15 +2992,42 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w vision_text_parts.append(f"\nContextual Analysis: {vision_analysis['analysis']}") vision_text = "\n".join(vision_text_parts) + sanitized_chunks_data = [] + for chunk_info in chunks_data: + 
sanitized_chunk_info = dict(chunk_info) + page_number = chunk_info['page_number'] + page_text_content = chunk_info['page_text_content'] + enhanced_chunk_text = page_text_content + vision_text if vision_text else page_text_content + upload_dlp_result = _evaluate_upload_dlp_text( + enhanced_chunk_text, + user_id=user_id, + document_id=document_id, + page_number=page_number, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_chunk_info['page_text_content'] = upload_dlp_result.get("sanitized_text", enhanced_chunk_text) + sanitized_chunk_info['dlp_metadata'] = upload_dlp_result.get("dlp_metadata") + sanitized_chunks_data.append(sanitized_chunk_info) + + # Generate all embeddings in batches after DLP redaction + texts = [c['page_text_content'] for c in sanitized_chunks_data] + try: + embedding_results = generate_embeddings_batch(texts) + except Exception as e: + log_event(f"[save_chunks_batch] Error generating batch embeddings for document {document_id}: {e}", level=logging.ERROR) + raise + # Build all chunk documents chunk_documents = [] total_token_usage = {'total_tokens': 0, 'prompt_tokens': 0, 'model_deployment_name': None} - for idx, chunk_info in enumerate(chunks_data): + for idx, chunk_info in enumerate(sanitized_chunks_data): embedding, token_usage = embedding_results[idx] page_number = chunk_info['page_number'] file_name = chunk_info['file_name'] page_text_content = chunk_info['page_text_content'] + dlp_metadata = chunk_info.get('dlp_metadata') if token_usage: total_token_usage['total_tokens'] += token_usage.get('total_tokens', 0) @@ -2772,14 +3049,15 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w "chunk_keywords": [], "chunk_summary": "", "page_number": page_number, - "author": [], - "title": "", - "document_classification": "None", + "author": author, + "title": title, + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, 
"upload_date": current_time, "version": version, - "public_workspace_id": public_workspace_id + "public_workspace_id": public_workspace_id, + "dlp_metadata": dlp_metadata } elif is_group: shared_group_ids = metadata.get('shared_group_ids', []) if metadata else [] @@ -2793,15 +3071,16 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w "chunk_keywords": [], "chunk_summary": "", "page_number": page_number, - "author": [], - "title": "", - "document_classification": "None", + "author": author, + "title": title, + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, "upload_date": current_time, "version": version, "group_id": group_id, - "shared_group_ids": shared_group_ids + "shared_group_ids": shared_group_ids, + "dlp_metadata": dlp_metadata } else: shared_user_ids = metadata.get('shared_user_ids', []) if metadata else [] @@ -2815,15 +3094,16 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w "chunk_keywords": [], "chunk_summary": "", "page_number": page_number, - "author": [], - "title": "", - "document_classification": "None", + "author": author, + "title": title, + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, "upload_date": current_time, "version": version, "user_id": user_id, - "shared_user_ids": shared_user_ids + "shared_user_ids": shared_user_ids, + "dlp_metadata": dlp_metadata } chunk_documents.append(chunk_document) @@ -4201,7 +4481,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Retrieved document items for document {document_id}: {document_items}" + content=f"Retrieved document items for document {document_id}, item_count: {len(document_items)}." 
) except Exception as e: add_file_task_to_file_processing_log( @@ -4231,10 +4511,19 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp if "abstract" in document_metadata: meta_data["abstract"] = document_metadata["abstract"] + meta_data, metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + meta_data, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + metadata_summary = _upload_metadata_log_summary(meta_data, metadata_dlp_summary) + add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Extracted metadata for document {document_id}, metadata: {meta_data}" + content=f"Extracted metadata for document {document_id}, summary: {metadata_summary}" ) args = { @@ -4288,12 +4577,13 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp block_reasons.append("Blocklist match") if blocked: + blocked_metadata_summary = _upload_metadata_log_summary(meta_data, metadata_dlp_summary) add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Blocked document metadata: {document_metadata}, reasons: {block_reasons}" + content=f"Blocked document metadata for document {document_id}, summary: {blocked_metadata_summary}, reasons: {block_reasons}" ) - print(f"Blocked document metadata: {document_metadata}\nReasons: {block_reasons}") + print(f"Blocked document metadata for document {document_id}. Reasons: {block_reasons}") return None except Exception as e: @@ -4310,7 +4600,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Processing Hybrid search for document {document_id} using {len(meta_data or {})} metadata fields." 
+ content=f"Processing Hybrid search for document {document_id} using metadata fields: {metadata_summary['populated_fields']}" ) args = { @@ -4442,7 +4732,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"GPT response for document {document_id}: {response_content}" + content=f"GPT response for document {document_id}, response_length: {len(response_content or '')}" ) # --- Step 7: Clean and parse the GPT JSON output --- @@ -4458,7 +4748,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Cleaned JSON from GPT response for document {document_id}: {cleaned_str}" + content=f"Cleaned JSON from GPT response for document {document_id}, json_length: {len(cleaned_str or '')}" ) gpt_output = json.loads(cleaned_str) @@ -4466,12 +4756,24 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Decoded JSON from GPT response for document {document_id}: {gpt_output}" + content=f"Decoded JSON from GPT response for document {document_id}, keys: {list(gpt_output.keys()) if isinstance(gpt_output, dict) else []}" ) # Ensure authors and keywords are always lists gpt_output["authors"] = ensure_list(gpt_output.get("authors", [])) gpt_output["keywords"] = ensure_list(gpt_output.get("keywords", [])) + gpt_output, gpt_metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + gpt_output, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + add_file_task_to_file_processing_log( + document_id=document_id, + user_id=group_id if is_group else user_id, + content=f"Sanitized GPT metadata 
for document {document_id}, summary: {_upload_metadata_log_summary(gpt_output, gpt_metadata_dlp_summary)}" + ) except (json.JSONDecodeError, TypeError) as e: add_file_task_to_file_processing_log( @@ -4514,10 +4816,18 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp if is_effectively_empty(meta_data["abstract"]): meta_data["abstract"] = gpt_output.get("abstract", meta_data["abstract"]) + meta_data, final_metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + meta_data, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Final metadata for document {document_id}: {meta_data}" + content=f"Final metadata for document {document_id}, summary: {_upload_metadata_log_summary(meta_data, final_metadata_dlp_summary)}" ) args = { @@ -6833,9 +7143,25 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename, elif doc_author: update_fields['authors'] = [doc_author] if doc_subject: update_fields['abstract'] = doc_subject if doc_keywords: update_fields['keywords'] = doc_keywords + metadata_update_fields = { + key: value + for key, value in update_fields.items() + if key in UPLOAD_DLP_METADATA_FIELDS + } + if metadata_update_fields: + sanitized_metadata_fields, _ = _sanitize_upload_metadata_for_dlp( + metadata_update_fields, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + update_fields.update(sanitized_metadata_fields) update_callback(**update_fields) except Exception as e: + if str(e) == "Upload content blocked by DLP policy.": + raise print(f"Warning: Failed to extract initial metadata for {original_filename}: {e}") # Continue processing even if metadata fails @@ -7451,12 +7777,13 @@ def process_audio_document( update_callback, group_id=None, public_workspace_id=None, - 
auto_extract_metadata=True + auto_extract_metadata=True, + enable_enhanced_citations=False ) -> int: """Transcribe an audio file via Azure Speech, splitting >10 min into WAV chunks.""" settings = get_settings() - if settings.get("enable_enhanced_citations", False): + if enable_enhanced_citations: update_callback(status="Uploading audio for enhanced citations…") blob_path = upload_to_blob( temp_file_path, @@ -7529,7 +7856,7 @@ def recognized_cb(evt): try: if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech: all_results.append(evt.result.text) - print(f"[Debug] Recognized: {evt.result.text}") + print(f"[Debug] Recognized text length: {len(evt.result.text or '')}") elif evt.result.reason == speechsdk.ResultReason.NoMatch: print(f"[Debug] No speech recognized in segment") except Exception as e: @@ -7618,7 +7945,7 @@ def canceled_cb(evt): # result = speech_recognizer.recognize_once() # if result.reason == speechsdk.ResultReason.RecognizedSpeech: - # print(f"[Debug] Recognized: {result.text}") + # print(f"[Debug] Recognized text length: {len(result.text or '')}") # all_phrases.append(result.text) # elif result.reason == speechsdk.ResultReason.NoMatch: # print(f"[Warning] No speech in {chunk_path}") @@ -8260,6 +8587,11 @@ def process_document_upload_background(document_id, user_id, temp_file_path, ori is_public_workspace = public_workspace_id is not None settings = get_settings() enable_enhanced_citations = settings.get('enable_enhanced_citations', False) # Default to False if missing + disabled_enhanced_citations_for_upload_dlp = ( + enable_enhanced_citations and _should_disable_enhanced_citations_for_upload_dlp(settings) + ) + if disabled_enhanced_citations_for_upload_dlp: + enable_enhanced_citations = False enable_extract_meta_data = settings.get('enable_extract_meta_data', False) # Used by DI flow max_file_size_bytes = settings.get('max_file_size_mb', 16) * 1024 * 1024 @@ -8289,6 +8621,12 @@ def update_doc_callback(**kwargs): update_document(**args) + if 
disabled_enhanced_citations_for_upload_dlp: + update_doc_callback( + enhanced_citations=False, + status="Enhanced citations disabled because upload DLP enforcement is active" + ) + total_chunks_saved = 0 total_embedding_tokens = 0 embedding_model_name = None @@ -8413,7 +8751,8 @@ def update_doc_callback(**kwargs): update_callback=update_doc_callback, group_id=group_id, public_workspace_id=public_workspace_id, - auto_extract_metadata=False + auto_extract_metadata=False, + enable_enhanced_citations=enable_enhanced_citations ) elif file_ext in audio_extensions: total_chunks_saved = process_audio_document( @@ -8424,7 +8763,8 @@ def update_doc_callback(**kwargs): update_callback=update_doc_callback, group_id=group_id, public_workspace_id=public_workspace_id, - auto_extract_metadata=False + auto_extract_metadata=False, + enable_enhanced_citations=enable_enhanced_citations ) elif file_ext in di_supported_extensions or file_ext == '.doc': result = process_di_document( @@ -8499,6 +8839,14 @@ def update_doc_callback(**kwargs): group_id=group_id, public_workspace_id=public_workspace_id ) + if doc_metadata: + doc_metadata, _ = _sanitize_upload_metadata_for_dlp( + doc_metadata, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) # Determine workspace type if public_workspace_id: @@ -9677,4 +10025,4 @@ def propagate_tags_to_chunks(document_id, tags, user_id, group_id=None, public_w except Exception as e: print(f"Error propagating tags to chunks for document {document_id}: {e}") - raise \ No newline at end of file + raise diff --git a/application/single_app/functions_settings.py b/application/single_app/functions_settings.py index 31c78aa9..9108b2ae 100644 --- a/application/single_app/functions_settings.py +++ b/application/single_app/functions_settings.py @@ -7,6 +7,7 @@ from config import * from functions_appinsights import log_event from functions_cosmos_throughput import get_default_cosmos_throughput_settings +from 
functions_dlp_rules import get_default_dlp_regex_rules from functions_document_actions import get_default_document_action_capabilities from functions_icon_utils import normalize_icon_payload from functions_service_health import get_default_service_health @@ -1061,6 +1062,22 @@ def get_settings(use_cosmos=False, include_source=False): 'azure_apim_content_safety_endpoint': '', 'azure_apim_content_safety_subscription_key': '', + # Data Loss Prevention (DLP) Settings + 'enable_dlp_control_plane': False, + 'dlp_default_engine': 'regex', + 'dlp_regex_rules': get_default_dlp_regex_rules(), + 'dlp_max_scan_chars': 200000, + 'dlp_fail_closed_on_scanner_error': True, + 'dlp_audit_level': 'counts_only', + 'dlp_enable_structured_telemetry': True, + 'dlp_telemetry_sample_allow_events': False, + 'dlp_review_destination': 'none', + 'enable_web_search_dlp': False, + 'web_search_dlp_mode': 'monitor', + 'enable_upload_dlp': False, + 'upload_dlp_mode': 'monitor', + 'upload_dlp_fail_upload_on_match': False, + # User Feedback / Conversation Archiving 'enable_user_feedback': True, 'require_member_of_feedback_admin': False, @@ -2433,4 +2450,4 @@ def clear_user_search_history(user_id): level=logging.ERROR, exceptionTraceback=True ) - return False \ No newline at end of file + return False diff --git a/application/single_app/requirements.txt b/application/single_app/requirements.txt index 4d79728a..b0237bfa 100644 --- a/application/single_app/requirements.txt +++ b/application/single_app/requirements.txt @@ -61,4 +61,5 @@ aiohttp==3.14.1 html2text==2025.4.15 matplotlib==3.10.7 azure-cognitiveservices-speech==1.48.2 -playwright==1.58.0 \ No newline at end of file +playwright==1.58.0 +regex==2026.5.9 diff --git a/application/single_app/route_backend_chats.py b/application/single_app/route_backend_chats.py index e299ab66..2518a7ff 100644 --- a/application/single_app/route_backend_chats.py +++ b/application/single_app/route_backend_chats.py @@ -113,6 +113,7 @@ from functions_appinsights import 
log_event from functions_debug import debug_print from functions_governance import ensure_governance_access +from functions_dlp import evaluate_web_search_egress, build_dlp_telemetry_properties, should_emit_dlp_telemetry from functions_notifications import create_chat_response_notification from functions_activity_logging import log_agent_run, log_chat_activity, log_conversation_creation, log_token_usage from flask import current_app @@ -1054,6 +1055,8 @@ def _strip_agent_citation_artifact_refs(agent_citations): FACT_MEMORY_TYPE_FACT = 'fact' FACT_MEMORY_TYPE_INSTRUCTION = 'instruction' FACT_MEMORY_TYPE_LEGACY_DESCRIBER = 'describer' +WEB_SEARCH_DLP_BLOCKED_STATUS = "Web search was blocked because the message appears to contain non-public information." +WEB_SEARCH_DLP_REDACTED_STATUS = "Sensitive details were removed before web search." INLINE_CHART_ID_PATTERN_TEMPLATE = '"chartId":"{}"' TABULAR_INLINE_CHART_MAX_POINTS = 12 TABULAR_INLINE_CHART_MAX_CHARTS = 2 @@ -14368,26 +14371,80 @@ def record_tabular_post_processing_thought(thought_payload): if web_search_enabled: search_thought_label = 'deep_research' if deep_research_enabled else 'web_search' - search_thought_text = "Planning Deep Research web searches" if deep_research_enabled else f"Searching the web for '{web_search_query_text[:50]}'" + search_thought_text = "Planning Deep Research web searches" if deep_research_enabled else f"Searching the web with query length {len(web_search_query_text)}" thought_tracker.add_thought(search_thought_label, search_thought_text) - research_search_result = perform_research_web_searches( + web_search_dlp_result = evaluate_web_search_egress( + web_search_query_text, settings=settings, - conversation_id=conversation_id, - user_id=user_id, - user_message=user_message, - user_message_id=user_message_id, - chat_type=chat_type, - document_scope=document_scope, - active_group_id=active_group_id, - active_public_workspace_id=active_public_workspace_id, - 
web_search_query_text=web_search_query_text, - system_messages_for_augmentation=system_messages_for_augmentation, - agent_citations_list=agent_citations_list, - web_search_citations_list=web_search_citations_list, - deep_research_enabled=deep_research_enabled, - deep_research_planner_client=gpt_client, - deep_research_planner_model=gpt_model, + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + }, ) + if should_emit_dlp_telemetry(web_search_dlp_result, settings): + log_event( + "[DLP] Web search egress decision", + extra=build_dlp_telemetry_properties( + web_search_dlp_result, + surface="web_search", + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + }, + ), + ) + if web_search_dlp_result.get("status_message"): + thought_tracker.add_thought('web_search', web_search_dlp_result["status_message"]) + + if not web_search_dlp_result.get("web_search_allowed", True): + system_messages_for_augmentation.append({ + "role": "system", + "content": WEB_SEARCH_DLP_BLOCKED_STATUS, + }) + research_search_result = {'query_plan': {}, 'web_search_runs': []} + else: + web_search_query_text = web_search_dlp_result.get("web_search_query_text", web_search_query_text) + if deep_research_enabled: + research_search_result = perform_research_web_searches( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + user_message_id=user_message_id, + chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + deep_research_enabled=deep_research_enabled, + 
deep_research_planner_client=gpt_client, + deep_research_planner_model=gpt_model, + ) + else: + perform_web_search( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + user_message_id=user_message_id, + chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + web_search_runs_list=deep_research_web_search_runs, + ) + research_search_result = {'query_plan': {}, 'web_search_runs': deep_research_web_search_runs} deep_research_query_plan = research_search_result.get('query_plan', {}) deep_research_web_search_runs = research_search_result.get('web_search_runs', []) if web_search_citations_list: @@ -17578,25 +17635,79 @@ def record_and_publish_streaming_thought(thought_payload): if deep_research_enabled: yield emit_thought('deep_research', "Planning Deep Research web searches") else: - yield emit_thought('web_search', f"Searching the web for '{web_search_query_text[:50]}'") - research_search_result = perform_research_web_searches( + yield emit_thought('web_search', f"Searching the web with query length {len(web_search_query_text)}") + web_search_dlp_result = evaluate_web_search_egress( + web_search_query_text, settings=settings, - conversation_id=conversation_id, - user_id=user_id, - user_message=user_message, - user_message_id=user_message_id, - chat_type=chat_type, - document_scope=document_scope, - active_group_id=active_group_id, - active_public_workspace_id=active_public_workspace_id, - web_search_query_text=web_search_query_text, - system_messages_for_augmentation=system_messages_for_augmentation, - agent_citations_list=agent_citations_list, - web_search_citations_list=web_search_citations_list, - 
deep_research_enabled=deep_research_enabled, - deep_research_planner_client=gpt_client, - deep_research_planner_model=gpt_model, + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + }, ) + if should_emit_dlp_telemetry(web_search_dlp_result, settings): + log_event( + "[DLP] Web search egress decision", + extra=build_dlp_telemetry_properties( + web_search_dlp_result, + surface="web_search", + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + }, + ), + ) + if web_search_dlp_result.get("status_message"): + yield emit_thought('web_search', web_search_dlp_result["status_message"]) + + if not web_search_dlp_result.get("web_search_allowed", True): + system_messages_for_augmentation.append({ + "role": "system", + "content": WEB_SEARCH_DLP_BLOCKED_STATUS, + }) + research_search_result = {'query_plan': {}, 'web_search_runs': []} + else: + web_search_query_text = web_search_dlp_result.get("web_search_query_text", web_search_query_text) + if deep_research_enabled: + research_search_result = perform_research_web_searches( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + user_message_id=user_message_id, + chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + deep_research_enabled=deep_research_enabled, + deep_research_planner_client=gpt_client, + deep_research_planner_model=gpt_model, + ) + else: + perform_web_search( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + 
user_message_id=user_message_id, + chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + web_search_runs_list=deep_research_web_search_runs, + ) + research_search_result = {'query_plan': {}, 'web_search_runs': deep_research_web_search_runs} deep_research_query_plan = research_search_result.get('query_plan', {}) deep_research_web_search_runs = research_search_result.get('web_search_runs', []) if web_search_citations_list: @@ -20162,7 +20273,7 @@ def build_conversation_history_segments( def _extract_web_search_citations_from_content(content: str) -> List[Dict[str, str]]: if not content: return [] - debug_print(f"[Citation Extraction] Extracting citations from:\n{content}\n") + debug_print(f"[Citation Extraction] Extracting citations from content length: {len(content)}") citations: List[Dict[str, str]] = [] @@ -20204,7 +20315,7 @@ def _extract_web_search_citations_from_content(content: str) -> List[Dict[str, s if not url: continue citations.append({"url": url, "title": url}) - debug_print(f"[Citation Extraction] Extracted {len(citations)} citations. - {citations}\n") + debug_print(f"[Citation Extraction] Extracted {len(citations)} citations.") return citations @@ -20294,7 +20405,7 @@ def to_int(value: Any) -> Optional[int]: if total_tokens is None: debug_print( "[Web Search][Token Usage Extraction] total_tokens missing or invalid. 
" - f"usage={usage}" + f"usage_type={type(usage)}, usage_keys={list(usage.keys())}" ) return {} @@ -20418,16 +20529,15 @@ def perform_web_search( debug_print(f"[WebSearch] Parameters received:") debug_print(f"[WebSearch] conversation_id: {conversation_id}") debug_print(f"[WebSearch] user_id: {user_id}") - debug_print(f"[WebSearch] user_message: {user_message[:100] if user_message else None}...") + debug_print(f"[WebSearch] user_message_length: {len(user_message or '')}") debug_print(f"[WebSearch] user_message_id: {user_message_id}") debug_print(f"[WebSearch] chat_type: {chat_type}") debug_print(f"[WebSearch] document_scope: {document_scope}") debug_print(f"[WebSearch] active_group_id: {active_group_id}") debug_print(f"[WebSearch] active_public_workspace_id: {active_public_workspace_id}") - debug_print( - "[WebSearch] web_search_query_text: " - f"{web_search_query_text[:100] if web_search_query_text else None}..." - ) + dlp_enabled = bool(settings.get("enable_dlp_control_plane") and settings.get("enable_web_search_dlp")) + debug_print(f"[WebSearch] web_search_query_length: {len(web_search_query_text or '')}") + debug_print(f"[WebSearch] dlp_enabled: {dlp_enabled}") initial_seed_url_count = len(web_search_citations_list or []) if isinstance(web_search_citations_list, list) else 0 run_started_at = datetime.utcnow().isoformat() @@ -20437,7 +20547,7 @@ def record_web_search_run(success, status, error=None, result_message_length=0, return final_seed_url_count = len(web_search_citations_list or []) if isinstance(web_search_citations_list, list) else initial_seed_url_count web_search_runs_list.append({ - 'query': str(web_search_query_text or user_message or '').strip()[:300], + 'query': str(web_search_query_text or '').strip()[:300], 'label': str(search_context_label or '').strip()[:100], 'status': status, 'success': bool(success), @@ -20499,24 +20609,16 @@ def record_web_search_run(success, status, error=None, result_message_length=0, debug_print(f"[WebSearch] Agent ID is 
configured: {agent_id}") - query_text = (web_search_query_text or user_message or "").strip() - debug_print(f"[WebSearch] Final query_text after fallback: '{query_text[:100] if query_text else ''}'") + query_text = (web_search_query_text or "").strip() + debug_print(f"[WebSearch] Final approved query_length: {len(query_text)}") if not query_text: - debug_print("[WebSearch] Query text is EMPTY after processing, skipping web search") - log_event( - "[WebSearch] Skipping Foundry web search: empty query", - extra={ - "conversation_id": conversation_id, - "user_id": user_id, - }, - level=logging.WARNING, - ) + debug_print("[WebSearch] Empty approved web-search query; skipping Foundry call") record_web_search_run(True, 'empty_query') - return True # Not an error, just empty query + return True # Not an error, just empty approved query search_request_content = build_research_search_prompt(query_text) - debug_print(f"[WebSearch] Building message history with query: {query_text[:100]}...") + debug_print(f"[WebSearch] Building message history with query_length: {len(query_text)}") message_history = [ ChatMessageContent(role="user", content=search_request_content) ] @@ -20540,37 +20642,39 @@ def record_web_search_run(success, status, error=None, result_message_length=0, ) except FoundryAgentInvocationError as exc: log_event( - f"[WebSearch] Foundry agent invocation failed: {exc}", + "[WebSearch] Foundry agent invocation failed", extra={ "conversation_id": conversation_id, "user_id": user_id, "agent_id": agent_id, + "error_type": type(exc).__name__, }, level=logging.ERROR, - exceptionTraceback=True, + exceptionTraceback=False, ) # Add failure message so the model informs the user system_messages_for_augmentation.append({ "role": "system", - "content": f"Web search failed with error: {exc}. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. 
Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", + "content": "Web search failed. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", }) record_web_search_run(False, 'foundry_invocation_error', error=str(exc)) return False # Search failed except Exception as exc: log_event( - f"[WebSearch] Unexpected error invoking Foundry agent: {exc}", + "[WebSearch] Unexpected error invoking Foundry agent", extra={ "conversation_id": conversation_id, "user_id": user_id, "agent_id": agent_id, + "error_type": type(exc).__name__, }, level=logging.ERROR, - exceptionTraceback=True, + exceptionTraceback=False, ) # Add failure message so the model informs the user system_messages_for_augmentation.append({ "role": "system", - "content": f"Web search failed with an unexpected error: {exc}. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", + "content": "Web search failed with an unexpected error. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. 
Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", }) record_web_search_run(False, 'unexpected_error', error=str(exc)) return False # Search failed @@ -20584,23 +20688,17 @@ def record_web_search_run(success, status, error=None, result_message_length=0, if result.message: debug_print(f"[WebSearch] Result message length: {len(result.message)} chars") - debug_print(f"[WebSearch] Result message preview: {result.message[:500] if len(result.message) > 500 else result.message}") else: debug_print("[WebSearch] Result message is EMPTY or None") if result.citations: debug_print(f"[WebSearch] Result citations count: {len(result.citations)}") - for i, cit in enumerate(result.citations[:3]): - debug_print(f"[WebSearch] Citation {i}: {json.dumps(cit, default=str)[:200]}...") else: debug_print("[WebSearch] Result citations is EMPTY or None") if result.metadata: - try: - metadata_payload = json.dumps(result.metadata, default=str) - except (TypeError, ValueError): - metadata_payload = str(result.metadata) - debug_print(f"[WebSearch] Foundry metadata: {metadata_payload}") + metadata_keys = list(result.metadata.keys()) if isinstance(result.metadata, Mapping) else [] + debug_print(f"[WebSearch] Foundry metadata present with keys: {metadata_keys}") else: debug_print("[WebSearch] Foundry metadata: ") @@ -20638,12 +20736,12 @@ def record_web_search_run(success, status, error=None, result_message_length=0, debug_print(f"[WebSearch] Processing {len(citations)} citations from result.citations") if citations: for i, citation in enumerate(citations): - debug_print(f"[WebSearch] Processing citation {i}: {json.dumps(citation, default=str)[:200]}...") + debug_print(f"[WebSearch] Processing citation {i}") serializable = make_json_serializable(citation) if not isinstance(serializable, dict): serializable = {"value": str(citation)} citation_title = serializable.get("title") or 
serializable.get("url") or "Web search source" - debug_print(f"[WebSearch] Adding agent citation with title: {citation_title}") + debug_print(f"[WebSearch] Adding agent citation {i + 1} of {len(citations)}") agent_citations_list.append({ "tool_name": citation_title, "function_name": "azure_ai_foundry_web_search", @@ -20663,7 +20761,8 @@ def record_web_search_run(success, status, error=None, result_message_length=0, else: debug_print("[WebSearch] No citations in result.citations to process") - debug_print(f"[WebSearch] Starting token usage extraction from Foundry metadata. Metadata: {result.metadata}") + metadata_keys = list((result.metadata or {}).keys()) if isinstance(result.metadata, Mapping) else [] + debug_print(f"[WebSearch] Starting token usage extraction from Foundry metadata keys: {metadata_keys}") token_usage = _extract_token_usage_from_metadata(result.metadata or {}) if token_usage.get("total_tokens"): try: @@ -20687,19 +20786,21 @@ def record_web_search_run(success, status, error=None, result_message_length=0, public_workspace_id=active_public_workspace_id, additional_context={ 'agent_id': agent_id, - 'search_query': query_text, + 'search_query_length': len(query_text), 'token_source': 'foundry_metadata' } ) except Exception as log_error: log_event( - f"[WebSearch] Failed to log web search token usage: {log_error}", + "[WebSearch] Failed to log web search token usage", extra={ "conversation_id": conversation_id, "user_id": user_id, "agent_id": agent_id, + "error_type": type(log_error).__name__, }, level=logging.WARNING, + exceptionTraceback=False, ) debug_print("[WebSearch] ========== FINAL SUMMARY ==========") diff --git a/application/single_app/route_frontend_admin_settings.py b/application/single_app/route_frontend_admin_settings.py index a4a7caf2..1b6657ac 100644 --- a/application/single_app/route_frontend_admin_settings.py +++ b/application/single_app/route_frontend_admin_settings.py @@ -1,6 +1,7 @@ # route_frontend_admin_settings.py import re 
+import secrets from config import * from functions_documents import * @@ -24,6 +25,7 @@ from functions_notifications import broadcast_system_notification from functions_logging import * from functions_document_actions import normalize_document_action_capabilities +from functions_dlp_rules import get_default_dlp_regex_rules, validate_dlp_regex_rules from swagger_wrapper import swagger_route, get_auth_security from datetime import datetime, timedelta, timezone from admin_settings_int_utils import safe_int_with_source @@ -51,6 +53,30 @@ 'agents_page_promoted_popular_tag_label': AGENTS_PAGE_PROMOTED_POPULAR_TAG_LABEL_DEFAULT, } HEX_COLOR_PATTERN = re.compile(r'^#[0-9a-fA-F]{6}$') +ADMIN_SETTINGS_CSRF_SESSION_KEY = "admin_settings_csrf_token" + + +def _new_admin_settings_csrf_token(): + token = secrets.token_urlsafe(32) + session[ADMIN_SETTINGS_CSRF_SESSION_KEY] = token + return token + + +def _get_admin_settings_csrf_token(): + token = session.get(ADMIN_SETTINGS_CSRF_SESSION_KEY) + if not token: + token = _new_admin_settings_csrf_token() + return token + + +def _validate_admin_settings_csrf_token(form_data): + submitted_token = str(form_data.get("admin_settings_csrf_token") or "") + expected_token = str(session.get(ADMIN_SETTINGS_CSRF_SESSION_KEY) or "") + return bool( + submitted_token + and expected_token + and secrets.compare_digest(submitted_token, expected_token) + ) def allowed_file(filename, allowed_extensions): return '.' 
in filename and \ @@ -532,6 +558,8 @@ def admin_settings(): source_review_runtime_capabilities, ) settings_for_template = redact_admin_settings_secrets_for_form(settings_for_template) + dlp_regex_rules_for_template, _ = validate_dlp_regex_rules(settings.get('dlp_regex_rules')) + dlp_regex_rules_json = json.dumps(dlp_regex_rules_for_template, indent=2) return render_template( 'admin_settings.html', @@ -551,7 +579,9 @@ def admin_settings(): chunk_size_settings=settings.get('chunk_size', {}), chunk_size_cap=get_chunk_size_cap(settings), chunk_size_effective=get_chunk_size_config(settings), - source_review_runtime_capabilities=source_review_runtime_capabilities + source_review_runtime_capabilities=source_review_runtime_capabilities, + admin_settings_csrf_token=_get_admin_settings_csrf_token(), + dlp_regex_rules_json=dlp_regex_rules_json # You don't need to pass deployments separately if they are added to settings['..._model']['all'] # gpt_deployments=gpt_deployments, # embedding_deployments=embedding_deployments, @@ -562,6 +592,11 @@ def admin_settings(): form_data = request.form # Use a variable for easier access user_id = get_current_user_id() + if not _validate_admin_settings_csrf_token(form_data): + _new_admin_settings_csrf_token() + flash("Admin settings request could not be verified. 
Please reload the page and try again.", "danger") + return redirect(url_for('admin_settings')) + def admin_secret(field_name, form_field_name=None): submitted_value = form_data.get(form_field_name or field_name, '').strip() return resolve_admin_settings_secret_value(field_name, submitted_value, settings) @@ -739,6 +774,36 @@ def parse_admin_int(raw_value, fallback_value, field_name="unknown", hard_defaul source='admin_settings' ) + dlp_max_scan_chars, _ = safe_int_with_source( + form_data.get('dlp_max_scan_chars'), + settings.get('dlp_max_scan_chars', 200000), + 200000 + ) + dlp_max_scan_chars = max(1000, dlp_max_scan_chars) + dlp_review_destination = form_data.get('dlp_review_destination', 'none') + if dlp_review_destination not in ('none',): + dlp_review_destination = 'none' + web_search_dlp_mode = form_data.get('web_search_dlp_mode', 'monitor') + if web_search_dlp_mode not in ('monitor', 'redact', 'block'): + web_search_dlp_mode = 'monitor' + upload_dlp_mode = form_data.get('upload_dlp_mode', 'monitor') + if upload_dlp_mode not in ('monitor', 'redact', 'block'): + upload_dlp_mode = 'monitor' + + raw_dlp_regex_rules = form_data.get('dlp_regex_rules_json', '').strip() + try: + submitted_dlp_regex_rules = json.loads(raw_dlp_regex_rules) if raw_dlp_regex_rules else get_default_dlp_regex_rules() + except json.JSONDecodeError: + _new_admin_settings_csrf_token() + flash("DLP regex rules must be valid JSON.", "danger") + return redirect(url_for('admin_settings')) + + normalized_dlp_regex_rules, dlp_regex_rule_errors = validate_dlp_regex_rules(submitted_dlp_regex_rules) + if dlp_regex_rule_errors: + _new_admin_settings_csrf_token() + flash(f"DLP regex rules are invalid: {dlp_regex_rule_errors[0]}", "danger") + return redirect(url_for('admin_settings')) + existing_source_review_max_bytes = parse_admin_int( settings.get('source_review_max_bytes_per_page'), 5000000, @@ -1956,6 +2021,20 @@ def is_valid_url(url): 'web_search_consent_accepted': web_search_consent_accepted, 
'enable_web_search_user_notice': form_data.get('enable_web_search_user_notice') == 'on', 'web_search_user_notice_text': form_data.get('web_search_user_notice_text', 'Your current message will be sent to Microsoft Bing for web search. Conversation history is not sent for web search, but any sensitive content you paste into this message may be sent.').strip(), + 'enable_dlp_control_plane': form_data.get('enable_dlp_control_plane') == 'on', + 'dlp_default_engine': 'regex', + 'dlp_regex_rules': normalized_dlp_regex_rules, + 'dlp_max_scan_chars': dlp_max_scan_chars, + 'dlp_fail_closed_on_scanner_error': form_data.get('dlp_fail_closed_on_scanner_error') == 'on', + 'dlp_audit_level': 'counts_only', + 'dlp_enable_structured_telemetry': form_data.get('dlp_enable_structured_telemetry') == 'on', + 'dlp_telemetry_sample_allow_events': form_data.get('dlp_telemetry_sample_allow_events') == 'on', + 'dlp_review_destination': dlp_review_destination, + 'enable_web_search_dlp': form_data.get('enable_web_search_dlp') == 'on', + 'web_search_dlp_mode': web_search_dlp_mode, + 'enable_upload_dlp': form_data.get('enable_upload_dlp') == 'on', + 'upload_dlp_mode': upload_dlp_mode, + 'upload_dlp_fail_upload_on_match': form_data.get('upload_dlp_fail_upload_on_match') == 'on', 'web_search_agent': { 'agent_type': 'aifoundry', 'azure_openai_gpt_endpoint': form_data.get('web_search_foundry_endpoint', '').strip(), @@ -2385,8 +2464,10 @@ def is_valid_url(url): flash("Failed to update admin settings.", "danger") + _new_admin_settings_csrf_token() + # Redirect back to settings page return redirect(url_for('admin_settings')) # Fallback if not GET or POST (shouldn't happen with standard routing) - return redirect(url_for('admin_settings')) \ No newline at end of file + return redirect(url_for('admin_settings')) diff --git a/application/single_app/static/js/admin/admin_settings.js b/application/single_app/static/js/admin/admin_settings.js index 9123e7e6..401621ba 100644 --- 
a/application/single_app/static/js/admin/admin_settings.js +++ b/application/single_app/static/js/admin/admin_settings.js @@ -8317,6 +8317,50 @@ function validateAndMoveToNextStep(currentStep) { } } +function initializeDlpSettings() { + const togglePanel = (toggle, panel) => { + if (!toggle || !panel) { + return; + } + panel.classList.toggle('d-none', !toggle.checked); + toggle.addEventListener('change', function () { + panel.classList.toggle('d-none', !this.checked); + if (typeof markFormAsModified === 'function') { + markFormAsModified(); + } + }); + }; + + const enableDlpControlPlane = document.getElementById('enable_dlp_control_plane'); + const dlpControlPlaneSettings = document.getElementById('dlp_control_plane_settings'); + const enableWebSearchDlp = document.getElementById('enable_web_search_dlp'); + const webSearchDlpSettings = document.getElementById('web_search_dlp_settings'); + const webSearchDlpModeSettings = document.getElementById('web_search_dlp_mode_settings'); + const enableUploadDlp = document.getElementById('enable_upload_dlp'); + const uploadDlpSettings = document.getElementById('upload_dlp_settings'); + const uploadDlpModeSettings = document.getElementById('upload_dlp_mode_settings'); + + togglePanel(enableDlpControlPlane, dlpControlPlaneSettings); + togglePanel(enableWebSearchDlp, webSearchDlpModeSettings); + togglePanel(enableUploadDlp, uploadDlpModeSettings); + + if (webSearchDlpSettings && enableDlpControlPlane) { + webSearchDlpSettings.classList.toggle('d-none', !enableDlpControlPlane.checked); + enableDlpControlPlane.addEventListener('change', function () { + webSearchDlpSettings.classList.toggle('d-none', !this.checked); + }); + } + + if (uploadDlpSettings && enableDlpControlPlane) { + uploadDlpSettings.classList.toggle('d-none', !enableDlpControlPlane.checked); + enableDlpControlPlane.addEventListener('change', function () { + uploadDlpSettings.classList.toggle('d-none', !this.checked); + }); + } +} + 
+document.addEventListener('DOMContentLoaded', initializeDlpSettings); + /** * Navigate to the previous step in the walkthrough */ @@ -8465,4 +8509,4 @@ function openAdminSettingsTab(targetHash) { activateTabFromHash(); } -window.openAdminSettingsTab = openAdminSettingsTab; \ No newline at end of file +window.openAdminSettingsTab = openAdminSettingsTab; diff --git a/application/single_app/templates/admin_settings.html b/application/single_app/templates/admin_settings.html index 4c86667e..55cc0a03 100644 --- a/application/single_app/templates/admin_settings.html +++ b/application/single_app/templates/admin_settings.html @@ -493,6 +493,7 @@

Admin Settings

{% include "_semantic_search_health_warning.html" %}
+ @@ -7359,6 +7360,135 @@
+
+
+ Data Loss Prevention +
+

Configure DLP controls for upload ingestion and web-search egress.

+
+ + +
+ +
+
+
+ + +
Regex scanning is the only implemented engine in this release.
+
+
+ + +
+
+ + +
+
+ +
+
+ + +
+
+
+ + +
+
+
+
+ + +
+
+
+ +
+ + +
+ +
+
Custom Regex Rules
+ + +
+ Rules support surfaces, Luhn validation, and keyword proximity confidence. Defaults include U.S. SSN and Luhn-valid credit card detection. +
+
+ +
+
Web Search DLP
+
+ + +
+
+
+
+ + +
+
+
+
+ +
+
Upload DLP
+
+ + +
+
+
+
+ + +
+
+
+ + +
+
+
+
+
+
+
+ +
User Feedback diff --git a/docs/explanation/features/DLP_UPLOAD_STAGING.md b/docs/explanation/features/DLP_UPLOAD_STAGING.md new file mode 100644 index 00000000..d3e095cb --- /dev/null +++ b/docs/explanation/features/DLP_UPLOAD_STAGING.md @@ -0,0 +1,117 @@ +# DLP Upload Staging + +## Overview + +Version: 0.242.069 + +Dependencies: shared DLP core, configurable regex DLP rules, document processing pipeline, Azure AI Search, Azure OpenAI embeddings. + +SimpleChat now applies DLP to extracted upload text and selected document metadata before embeddings, Azure AI Search indexing, metadata extraction prompts, Cosmos metadata updates, and file-processing logs. The feature reuses the shared DLP core introduced for web-search egress and applies it to `save_chunks()`, `save_chunks_batch()`, `save_video_chunk()`, and metadata extraction/update paths. + +Regex DLP is the implemented engine for this release. The default rules detect U.S. SSNs and Luhn-valid credit card numbers, and administrators can add upload-specific regex rules through the shared `dlp_regex_rules` settings payload. + +## Technical Specifications + +Protected processing points: + +- `save_chunks()` evaluates DLP after metadata and vision text are combined, before `generate_embedding(...)`. +- `save_chunks_batch()` evaluates DLP for each enhanced chunk before `generate_embeddings_batch(...)`. +- `save_video_chunk()` evaluates transcript and OCR text before transcript embedding and AI Search indexing. +- Metadata fields `title`, `authors`, `organization`, `keywords`, and `abstract` are sanitized before metadata extraction prompts, hybrid-search queries, Cosmos updates, Azure AI Search payload metadata, activity logs, and file-processing logs. +- Safe DLP metadata is attached to chunk documents and document records as counts-only summaries. +- Document-level DLP metadata preserves the worst observed status and cumulative entity counts across chunk and metadata scans. 
+- Configured regex rules can target upload only, web search only, or both surfaces. +- Configured rules support keyword proximity confidence shaping, so a regex candidate can require nearby identifiers such as `document`, `employee`, `SSN`, or another admin-defined term before it redacts or blocks. +- File-processing logs replace raw chunk logging with safe DLP and text-length summaries. +- Enhanced citations are automatically disabled when upload DLP can enforce a block or redaction, including `redact` mode, `block` mode, fail-on-match, and fail-closed scanner errors, because this PR does not generate sanitized binary derivatives for raw source files. + +Upload DLP states: + +- `accepted`: no DLP findings. +- `accepted_with_dlp_monitoring`: findings observed in monitor mode. +- `accepted_with_redactions`: redacted text was embedded and indexed. +- `blocked`: DLP policy blocked indexing. +- `scanner_failed`: scanner failure blocked indexing in fail-closed mode. + +## Admin Settings + +Upload controls are available under Admin Settings > Data Loss Prevention: + +- Enable Upload DLP. +- Upload mode: `monitor`, `redact`, or `block`. +- Fail upload on match. +- Custom Regex Rules, shared with web-search DLP. + +Review routing defaults to `none`. Upload review-event writing is not exposed in this release because the DLP review destination is intentionally locked to `none`. + +## Telemetry And Logs + +Upload DLP telemetry uses `log_event(...)` with safe dimensions: + +- `activity_type = dlp_decision` +- `dlp_surface = upload` +- `dlp_action` +- `dlp_engine` +- `dlp_mode` +- `workspace_scope` +- `scanner_status` +- `dlp_total_replacements` +- `dlp_entity_counts` + +File-processing logs may include safe DLP summaries such as action, engine, counts, document id, workspace scope, page number, and text length. They do not include raw chunk text, raw OCR text, raw vision text, or raw matched values. 
+ +Example Azure Monitor alert concepts: + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "upload" +| where tostring(customDimensions.dlp_action) == "block" +| summarize blocked_uploads=count() by bin(timestamp, 15m) +``` + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "upload" +| where toint(customDimensions.dlp_total_replacements) > 10 +| summarize high_redaction_events=count() by bin(timestamp, 15m) +``` + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "upload" +| where tostring(customDimensions.scanner_status) != "ok" +| summarize scanner_failures=count() by bin(timestamp, 15m) +``` + +## Limitations + +This PR redacts extracted text and selected metadata before embeddings, search indexing, prompts, and metadata persistence. It does not claim that raw binary artifacts are format-redacted. When upload DLP can enforce a block or redaction, enhanced citations are disabled instead of storing raw source blobs. A future format-aware derivative generation or quarantine workflow is needed to produce sanitized binary copies. + +Regex DLP is limited to deterministic structured identifiers and administrator-defined exact-format identifiers. It is weaker for names, addresses, contextual PII, international identifiers, and noisy document text. + +## Testing And Validation + +Functional coverage: + +- `functional_tests/test_upload_dlp_redaction.py` +- `functional_tests/test_dlp_regex_rules.py` +- `functional_tests/test_upload_dlp_workspace_scopes.py` +- `functional_tests/test_upload_dlp_ingestion_integration.py` +- `functional_tests/test_dlp_admin_ui_smoke.py` +- `functional_tests/test_dlp_review_events.py` +- `functional_tests/test_dlp_telemetry.py` +- Shared PR1 DLP tests remain green. 
+ +Validated with Docker Python 3.12: + +- `python -m compileall application/single_app` +- The PR-specific functional tests above. + +Additional review-readiness validation: + +- `tools/local_dev/render_dlp_admin_preview.py` renders the shared DLP admin section and verifies upload controls are visible in the expanded preview. +- `tools/local_dev/run_dlp_local_stack.md` documents the local Cosmos emulator smoke flow inherited from the web-search DLP branch. +- Independent remediation review verified metadata sanitization, enhanced-citation enforcement, document-level status aggregation, and removal of the dead upload review toggle. diff --git a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md new file mode 100644 index 00000000..e6d0c16a --- /dev/null +++ b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md @@ -0,0 +1,137 @@ +# DLP Web Search Egress Control + +## Overview + +Version: 0.242.069 + +Dependencies: Flask chat routes, configurable regex DLP rules, and Azure AI Foundry web-search agent configuration. + +SimpleChat now includes an application-level Data Loss Prevention control before web-search grounding. The app evaluates the current user message after `build_web_search_query_text(...)` and before the configured Azure AI Foundry web-search agent is invoked. + +SimpleChat can inspect the `SimpleChat -> Azure AI Foundry` payload. It cannot inspect or intercept the service-side `Azure AI Foundry Agent Service -> Bing` grounding call inside Microsoft's service boundary. Blocking or redaction therefore happens before the app sends the current message to Foundry. + +## Technical Specifications + +The shared DLP core lives in `application/single_app/functions_dlp.py`. Configurable regex rules live in `application/single_app/functions_dlp_rules.py`. + +Implemented behavior: + +- Regex DLP is the only implemented engine in this release. 
+- Regex rules are admin-configurable through the `dlp_regex_rules` settings payload. +- Default rules detect U.S. SSNs and Luhn-valid credit card numbers. +- Rules can target web search, upload, or both. +- Rules can use keyword proximity confidence shaping. A regex match can require nearby terms such as `ssn`, `social security`, `card`, or `billing` before it reaches the configured minimum confidence. +- Generic internal phrase matching is not hardcoded. Administrators can add organization-specific phrases or identifiers as explicit custom rules. +- DLP metadata stores entity types and counts only. Raw matched values are not stored in telemetry or review summaries. +- Structured DLP telemetry uses `log_event(...)` and reaches Application Insights when `APPLICATIONINSIGHTS_CONNECTION_STRING` is configured. +- Scanner errors fail closed by default when `dlp_fail_closed_on_scanner_error` is enabled. +- Text that exceeds the configured scan limit is not partially redacted in `redact` or `block` mode. It is blocked with `scanner_status = truncated`; `monitor` mode records the truncated scanner status while preserving web search. +- The web-search route no longer falls back to the raw current message when the DLP-safe query text is empty. + +Admin settings are added in Admin Settings under Data Loss Prevention: + +- Shared DLP enablement, regex engine selection, configurable regex rules, maximum scan characters, scanner fail-closed behavior, telemetry, and review destination. +- Web-search DLP enablement and mode: `monitor`, `redact`, or `block`. +- Review destination defaults to `none` and is locked to that value in this release. Safety Violations review routing remains a future integration until the review surface is expanded with distinct DLP labeling and access rules. + +## Usage + +1. Open Admin Settings. +2. Enable Data Loss Prevention. +3. Enable Web Search DLP. +4. Review or edit Custom Regex Rules. +5. Choose a mode: + - `monitor`: detect and emit safe telemetry while preserving web search.
Oversized text records `scanner_status = truncated`. + - `redact`: replace detected structured identifiers before web search. Oversized text is blocked instead of partially redacted. + - `block`: skip web search when DLP detects configured sensitive content or when text exceeds the scan limit. + +User-visible status messages: + +- Blocked: `Web search was blocked because the message appears to contain non-public information.` +- Redacted: `Sensitive details were removed before web search.` + +These messages do not include raw values, snippets, recognizer names, scores, or policy identifiers. + +## Configurable Regex Rules + +The MVP DLP engine uses admin-configurable regex rules. Default rules detect U.S. Social Security numbers and Luhn-valid credit card numbers. Generic internal phrase blocking is not hardcoded; administrators can add organization-specific rules when those phrases are meaningful in their environment. + +Each rule can define: + +- entity type and replacement label +- allowed surfaces (`web_search`, `upload`) +- optional `luhn` validation +- keyword proximity confidence shaping +- minimum confidence required before redaction or blocking + +Confidence shaping lets a regex match become stronger when nearby terms are present. For example, an employee identifier rule can require `EID-123456` plus `employee` within 32 characters before it redacts. + +Regex DLP remains deterministic and dependency-light. Richer contextual PII detection for names, addresses, and natural-language identifiers remains future work. + +## Telemetry + +Telemetry dimensions are bounded and safe: + +- `activity_type = dlp_decision` +- `dlp_surface = web_search` +- `dlp_action` +- `dlp_engine` +- `dlp_mode` +- `workspace_scope` +- `scanner_status` +- `dlp_total_replacements` +- `dlp_entity_counts` + +Raw prompts, web-search queries, snippets, raw matched values, and filenames are excluded. 
+ +Example Azure Monitor alert concepts: + +```kusto +customEvents +| where name has "DLP" or tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "web_search" +| where tostring(customDimensions.dlp_action) == "block" +| summarize blocks=count() by bin(timestamp, 15m) +``` + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.scanner_status) != "ok" +| summarize scanner_errors=count() by bin(timestamp, 15m), tostring(customDimensions.dlp_engine) +``` + +## Review And Retention + +The implemented default is `dlp_review_destination = none`, and the admin settings route coerces any other submitted value back to `none`, so DLP findings are not written to the Safety Violations review area in this release. Review summary helpers return distinct `policy_type` values such as `dlp_web_search` and counts-only entity metadata for future integration. + +Telemetry retention follows the configured Application Insights workspace. This PR does not create a dedicated DLP storage container or store raw DLP matches. + +## Limitations + +Regex DLP is intentionally lightweight. It is useful for structured identifiers such as SSNs, Luhn-valid credit card numbers, and administrator-defined exact-format identifiers, but it is weaker for names, addresses, contextual PII, international identifiers, secrets, and noisy prose. + +The app-level control cannot inspect Bing's internal grounding query after Foundry receives the request. It reduces egress risk by preventing or redacting sensitive text before the app sends the web-search message to the Foundry agent.
+ +## Testing And Validation + +Functional coverage: + +- `functional_tests/test_dlp_control_plane.py` +- `functional_tests/test_dlp_regex_rules.py` +- `functional_tests/test_dlp_telemetry.py` +- `functional_tests/test_dlp_admin_settings_ui.py` +- `functional_tests/test_dlp_admin_settings_roundtrip.py` +- `functional_tests/test_dlp_review_events.py` +- `functional_tests/test_web_search_dlp_egress.py` +- `functional_tests/test_web_search_dlp_route_integration.py` + +Validated with Docker Python 3.12: + +- `python -m compileall application/single_app` +- The PR-specific functional tests above. + +Additional review-readiness validation: + +- `tools/local_dev/run_dlp_local_stack.md` documents a local Cosmos emulator smoke flow for the DLP admin UI. +- `tools/local_dev/render_dlp_admin_preview.py` renders collapsed and expanded DLP admin section previews from the real Jinja template without storing sensitive sample values. diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index 02424981..533c6a71 100644 --- a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -1,9 +1,26 @@ -This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.071, and the per-version entries continue immediately after it. +This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.073, and the per-version entries continue immediately after it. For feature-focused and fix-focused drill-downs by version, see [Features by Version](/explanation/features/) and [Fixes by Version](/explanation/fixes/). 
+### **(v0.242.073)** + +#### New Features + +* **Configurable DLP Control Plane** + * Added admin-configurable regex DLP rules with bounded regex execution, optional Luhn validation, and keyword-proximity confidence shaping. + * Added web-search egress enforcement and upload-ingestion enforcement so administrators can monitor, redact, or block configured sensitive content before it leaves SimpleChat or enters embeddings/search indexing. + * Kept default rules intentionally narrow with U.S. SSN and Luhn-valid credit-card detection only. + * (Ref: configurable DLP rules, web-search DLP egress, upload DLP ingestion, Admin Settings DLP controls) + +#### Bug Fixes + +* **Upload DLP Enforcement Edge Cases** + * Treats fail-on-match, fail-closed scanner errors, and truncated scans as enforced upload DLP paths when deciding whether content may be indexed or retained for enhanced citations. + * Sanitizes selected upload metadata before prompts, Search payloads, Cosmos updates, and logs while preserving counts-only DLP telemetry summaries. + * (Ref: upload DLP redaction, scanner failure handling, enhanced-citation safety) + ### **(v0.242.071)** #### New Features @@ -3434,4 +3451,4 @@ We introduced a robust user feedback system, expanded content-safety features fo 7. **App Roles & Enterprise Application** - Provides a robust way to control user access at scale. - - Admins can assign roles to new users or entire Azure AD groups. \ No newline at end of file + - Admins can assign roles to new users or entire Azure AD groups. diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py new file mode 100644 index 00000000..3178d6dc --- /dev/null +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -0,0 +1,222 @@ +# test_dlp_admin_settings_roundtrip.py +#!/usr/bin/env python3 +""" +Functional test for DLP admin settings roundtrip. 
+Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures DLP admin settings are normalized, persisted, and rendered +through the admin settings POST contract without requiring live Azure services. +""" + +import os +import sys +from pathlib import Path + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +ADMIN_ROUTE_FILE = os.path.join(APP_DIR, "route_frontend_admin_settings.py") +ADMIN_TEMPLATE_FILE = os.path.join(APP_DIR, "templates", "admin_settings.html") +ADMIN_TEMPLATE = Path(ADMIN_TEMPLATE_FILE) + + +NORMALIZED_ASSIGNMENTS = [ + "dlp_max_scan_chars = max(1000, dlp_max_scan_chars)", + "if web_search_dlp_mode not in ('monitor', 'redact', 'block'):", + "web_search_dlp_mode = 'monitor'", + "if dlp_review_destination not in ('none',):", + "dlp_review_destination = 'none'", +] + + +PERSISTED_DLP_FIELDS = { + "enable_dlp_control_plane": "form_data.get('enable_dlp_control_plane') == 'on'", + "dlp_default_engine": "'regex'", + "dlp_regex_rules": "normalized_dlp_regex_rules", + "dlp_max_scan_chars": "dlp_max_scan_chars", + "dlp_fail_closed_on_scanner_error": "form_data.get('dlp_fail_closed_on_scanner_error') == 'on'", + "dlp_audit_level": "'counts_only'", + "dlp_enable_structured_telemetry": "form_data.get('dlp_enable_structured_telemetry') == 'on'", + "dlp_telemetry_sample_allow_events": "form_data.get('dlp_telemetry_sample_allow_events') == 'on'", + "dlp_review_destination": "dlp_review_destination", + "enable_web_search_dlp": "form_data.get('enable_web_search_dlp') == 'on'", + "web_search_dlp_mode": "web_search_dlp_mode", + "enable_upload_dlp": "form_data.get('enable_upload_dlp') == 'on'", + "upload_dlp_mode": "upload_dlp_mode", + "upload_dlp_fail_upload_on_match": "form_data.get('upload_dlp_fail_upload_on_match') == 'on'", +} + + +UNSUPPORTED_DLP_FORM_FIELDS = [ + "dlp_presidio_use_service", + "dlp_presidio_endpoint", + "dlp_presidio_score_threshold", + 
"dlp_scanner_timeout_seconds", + "dlp_review_include_redacted_preview", + "web_search_dlp_track_review_events", + "upload_dlp_track_review_events", +] + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def assert_no_retired_structured_redaction_control(source, source_name): + """Retired structured-redaction controls should not appear in admin DLP sources.""" + redaction_prefix = "web_search_dlp_redact" + for line_number, line in enumerate(source.splitlines(), start=1): + normalized = line.lower() + has_retired_prefix = redaction_prefix in normalized + has_structured_identifier_wording = "structured" in normalized and "identifier" in normalized + assert not (has_retired_prefix and has_structured_identifier_wording), ( + f"Retired structured-redaction DLP control remains in {source_name}:{line_number}" + ) + + +def test_dlp_admin_post_normalizes_untrusted_form_values(): + """Admin POST should clamp numeric inputs and fail closed on enum-like fields.""" + print("Testing DLP admin POST normalization...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + for snippet in NORMALIZED_ASSIGNMENTS: + assert snippet in route_source, f"Missing DLP normalization contract: {snippet}" + + assert "safe_int_with_source(" in route_source + + for field_name in UNSUPPORTED_DLP_FORM_FIELDS: + assert f"form_data.get('{field_name}'" not in route_source, ( + f"Admin route still accepts unsupported DLP form field: {field_name}" + ) + assert_no_retired_structured_redaction_control(route_source, ADMIN_ROUTE_FILE) + + +def test_dlp_admin_post_persists_normalized_dlp_payload(): + """Admin POST should persist normalized values, not raw form strings.""" + print("Testing DLP admin POST persistence payload...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + for field_name, expected_value in PERSISTED_DLP_FIELDS.items(): + expected_mapping = f"'{field_name}': {expected_value}" + assert expected_mapping in route_source, 
f"Missing DLP persistence mapping: {expected_mapping}" + + +def test_dlp_admin_template_roundtrips_persisted_values(): + """Admin template should render the same fields that POST persists.""" + print("Testing DLP admin template roundtrip controls...") + template_source = read_file_text(ADMIN_TEMPLATE_FILE) + + for field_name in PERSISTED_DLP_FIELDS: + if field_name == "dlp_regex_rules": + assert 'id="dlp_regex_rules_json"' in template_source + assert 'name="dlp_regex_rules_json"' in template_source + else: + assert ( + f'id="{field_name}"' in template_source or f'name="{field_name}"' in template_source + ), f"Missing DLP admin control: {field_name}" + + assert 'id="dlp_control_plane_settings"' in template_source + assert 'id="web_search_dlp_mode_settings"' in template_source + + for field_name in UNSUPPORTED_DLP_FORM_FIELDS: + assert field_name not in template_source, f"Unsupported DLP control still rendered: {field_name}" + assert_no_retired_structured_redaction_control(template_source, ADMIN_TEMPLATE_FILE) + + +def test_dlp_review_destination_stays_unreachable_until_review_flow_exists(): + """Review records should stay disabled until a reachable review destination is implemented.""" + print("Testing DLP review destination fail-closed behavior...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + template_source = read_file_text(ADMIN_TEMPLATE_FILE) + + assert "if dlp_review_destination not in ('none',):" in route_source + assert "dlp_review_destination = 'none'" in route_source + assert 'value="safety_violations"' not in template_source + + +def test_admin_dlp_controls_only_expose_supported_regex_engine(): + template = ADMIN_TEMPLATE.read_text(encoding="utf-8") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + assert '' in template + assert "Regex scanning is the only implemented engine in this release." 
in template + assert 'name="dlp_regex_rules_json"' in template + assert "web_search_dlp_block_on_internal_phrases" not in template + assert "Detect internal phrases" not in template + assert 'value="presidio_service"' not in template + assert 'value="presidio_embedded"' not in template + assert_no_retired_structured_redaction_control(template, str(ADMIN_TEMPLATE)) + + for field_name in UNSUPPORTED_DLP_FORM_FIELDS: + assert f"'{field_name}':" not in route_source, f"Unsupported DLP field still persisted: {field_name}" + assert_no_retired_structured_redaction_control(route_source, ADMIN_ROUTE_FILE) + + +def test_admin_settings_post_validates_csrf_before_dlp_persistence(): + """Admin settings POST should validate CSRF before persisting security-sensitive DLP fields.""" + print("Testing admin settings CSRF validation ordering...") + source = read_file_text(ADMIN_ROUTE_FILE) + + post_index = source.find("if request.method == 'POST':") + form_index = source.find("form_data = request.form", post_index) + csrf_index = source.find("if not _validate_admin_settings_csrf_token(form_data):", form_index) + persist_index = source.find("'enable_dlp_control_plane': form_data.get('enable_dlp_control_plane') == 'on'", form_index) + + assert post_index != -1 + assert form_index > post_index + assert csrf_index > form_index + assert persist_index > csrf_index + assert "secrets.compare_digest" in source + assert "ADMIN_SETTINGS_CSRF_SESSION_KEY" in source + + +def test_admin_settings_persists_valid_dlp_regex_rules(): + """Admin settings should persist normalized configurable regex rules.""" + print("Testing admin regex rule persistence...") + source = read_file_text(ADMIN_ROUTE_FILE) + + assert "dlp_regex_rules_json" in source + assert "validate_dlp_regex_rules" in source + assert "'dlp_regex_rules': normalized_dlp_regex_rules" in source + + +def test_admin_settings_rejects_invalid_dlp_regex_rules_before_update(): + """Invalid DLP regex rules should be rejected before update_settings.""" 
+ print("Testing invalid admin regex rule rejection ordering...") + source = read_file_text(ADMIN_ROUTE_FILE) + + parse_index = source.find("raw_dlp_regex_rules = form_data.get('dlp_regex_rules_json'") + validate_index = source.find("validate_dlp_regex_rules", parse_index) + update_index = source.find("if update_settings(new_settings):", validate_index) + + assert parse_index != -1 + assert validate_index > parse_index + assert update_index > validate_index + assert "return redirect(url_for('admin_settings'))" in source[validate_index:update_index] + + +if __name__ == "__main__": + tests = [ + test_dlp_admin_post_normalizes_untrusted_form_values, + test_dlp_admin_post_persists_normalized_dlp_payload, + test_dlp_admin_template_roundtrips_persisted_values, + test_dlp_review_destination_stays_unreachable_until_review_flow_exists, + test_admin_dlp_controls_only_expose_supported_regex_engine, + test_admin_settings_post_validates_csrf_before_dlp_persistence, + test_admin_settings_persists_valid_dlp_regex_rules, + test_admin_settings_rejects_invalid_dlp_regex_rules_before_update, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP admin settings roundtrip tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py new file mode 100644 index 00000000..f20d802e --- /dev/null +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -0,0 +1,207 @@ +# test_dlp_admin_settings_ui.py +#!/usr/bin/env python3 +""" +Functional test for DLP admin settings UI. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures shared and web-search DLP defaults exist, admin settings +persist supported controls, the admin template exposes only implemented controls, +and new DLP JavaScript uses Bootstrap d-none instead of JavaScript display toggles. 
+""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +SETTINGS_FILE = os.path.join(ROOT_DIR, "application", "single_app", "functions_settings.py") +ADMIN_ROUTE_FILE = os.path.join(ROOT_DIR, "application", "single_app", "route_frontend_admin_settings.py") +ADMIN_TEMPLATE_FILE = os.path.join(ROOT_DIR, "application", "single_app", "templates", "admin_settings.html") +ADMIN_JS_FILE = os.path.join(ROOT_DIR, "application", "single_app", "static", "js", "admin", "admin_settings.js") + + +REQUIRED_KEYS = [ + "enable_dlp_control_plane", + "dlp_default_engine", + "dlp_regex_rules", + "dlp_max_scan_chars", + "dlp_fail_closed_on_scanner_error", + "dlp_audit_level", + "dlp_enable_structured_telemetry", + "dlp_telemetry_sample_allow_events", + "dlp_review_destination", + "enable_web_search_dlp", + "web_search_dlp_mode", + "enable_upload_dlp", + "upload_dlp_mode", + "upload_dlp_fail_upload_on_match", +] + + +UNSUPPORTED_ADMIN_CONTROL_IDS = [ + "dlp_presidio_use_service", + "dlp_presidio_service_settings", + "dlp_presidio_endpoint", + "dlp_presidio_score_threshold", + "dlp_scanner_timeout_seconds", + "dlp_review_include_redacted_preview", + "web_search_dlp_track_review_events", + "upload_dlp_track_review_events", +] + + +RETIRED_DLP_SETTING_KEYS = [ + "dlp_presidio_use_service", + "dlp_presidio_endpoint", + "dlp_presidio_score_threshold", + "dlp_scanner_timeout_seconds", + "dlp_review_include_redacted_preview", + "web_search_dlp_track_review_events", + "upload_dlp_track_review_events", +] + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def assert_no_retired_structured_redaction_control(source, source_name): + """Retired structured-redaction controls should not appear in admin DLP sources.""" + redaction_prefix = "web_search_dlp_redact" + for line_number, line in enumerate(source.splitlines(), start=1): + normalized = line.lower() + has_retired_prefix = 
redaction_prefix in normalized + has_structured_identifier_wording = "structured" in normalized and "identifier" in normalized + assert not (has_retired_prefix and has_structured_identifier_wording), ( + f"Retired structured-redaction DLP control remains in {source_name}:{line_number}" + ) + + +def test_dlp_defaults_exist_and_are_safe(): + """Defaults should include shared/web-search DLP and keep review disabled.""" + print("Testing DLP defaults...") + source = read_file_text(SETTINGS_FILE) + + for key in REQUIRED_KEYS: + assert f"'{key}'" in source, f"Missing DLP default setting: {key}" + + assert "'dlp_review_destination': 'none'" in source + assert "'enable_web_search_dlp': False" in source + assert "raw_matches" not in source + + for key in RETIRED_DLP_SETTING_KEYS: + assert f"'{key}'" not in source, f"Retired DLP default setting remains: {key}" + assert_no_retired_structured_redaction_control(source, SETTINGS_FILE) + + +def test_admin_route_persists_dlp_settings(): + """Admin settings route should persist all PR1 DLP fields.""" + print("Testing DLP admin route persistence...") + source = read_file_text(ADMIN_ROUTE_FILE) + + for key in REQUIRED_KEYS: + assert key in source, f"Admin route does not persist or normalize {key}" + + +def test_admin_template_exposes_dlp_controls(): + """Admin UI should expose supported shared and web-search DLP controls.""" + print("Testing DLP admin template controls...") + source = read_file_text(ADMIN_TEMPLATE_FILE) + + assert "Data Loss Prevention" in source + assert 'id="dlp_control_plane_settings"' in source + assert 'id="web_search_dlp_settings"' in source + for key in REQUIRED_KEYS: + if key == "dlp_regex_rules": + assert 'id="dlp_regex_rules_json"' in source + assert 'name="dlp_regex_rules_json"' in source + else: + assert f'id="{key}"' in source or f'name="{key}"' in source, f"Missing DLP control: {key}" + + assert 'value="none"' in source + assert 'value="safety_violations"' not in source, ( + "Safety Violations 
destination should stay hidden unless PR1 implements reachable review integration" + ) + assert '' in source + assert "Regex scanning is the only implemented engine in this release." in source + assert "Custom Regex Rules" in source + assert "{{ dlp_regex_rules_json }}" in source + assert "web_search_dlp_block_on_internal_phrases" not in source + assert "Detect internal phrases" not in source + + for unsupported_id in UNSUPPORTED_ADMIN_CONTROL_IDS: + assert unsupported_id not in source, f"Unsupported DLP control is still visible: {unsupported_id}" + + assert 'value="presidio_service"' not in source + assert 'value="presidio_embedded"' not in source + assert_no_retired_structured_redaction_control(source, ADMIN_TEMPLATE_FILE) + + +def test_admin_js_uses_d_none_for_dlp_toggles(): + """New DLP JS should use Bootstrap d-none, not style.display.""" + print("Testing DLP admin JavaScript visibility handling...") + source = read_file_text(ADMIN_JS_FILE) + + assert "initializeDlpSettings" in source + assert "dlp_control_plane_settings" in source + assert "web_search_dlp_settings" in source + assert "classList.toggle('d-none'" in source or 'classList.toggle("d-none"' in source + + dlp_section = source[source.find("initializeDlpSettings"):] + assert ".style.display" not in dlp_section + + for unsupported_id in UNSUPPORTED_ADMIN_CONTROL_IDS: + assert unsupported_id not in dlp_section, f"Unsupported DLP JS hook remains: {unsupported_id}" + assert_no_retired_structured_redaction_control(dlp_section, ADMIN_JS_FILE) + + +def test_admin_settings_form_contains_csrf_token(): + """Admin settings form should submit a per-session CSRF token.""" + print("Testing admin settings CSRF template field...") + template = read_file_text(ADMIN_TEMPLATE_FILE) + + form_index = template.find('id="admin-settings-form"') + token_index = template.find('name="admin_settings_csrf_token"', form_index) + value_index = template.find('value="{{ admin_settings_csrf_token }}"', token_index) + + assert 
form_index != -1 + assert token_index > form_index + assert value_index > token_index + + +def test_admin_template_exposes_regex_rule_editor_without_internal_phrase_toggle(): + """Admin UI should expose configurable regex rules and remove hardcoded internal phrases.""" + print("Testing DLP regex rule admin editor...") + source = read_file_text(ADMIN_TEMPLATE_FILE) + + assert 'id="dlp_regex_rules_json"' in source + assert 'name="dlp_regex_rules_json"' in source + assert "{{ dlp_regex_rules_json }}" in source + assert "Custom Regex Rules" in source + assert "web_search_dlp_block_on_internal_phrases" not in source + assert "Detect internal phrases" not in source + + +if __name__ == "__main__": + tests = [ + test_dlp_defaults_exist_and_are_safe, + test_admin_route_persists_dlp_settings, + test_admin_template_exposes_dlp_controls, + test_admin_js_uses_d_none_for_dlp_toggles, + test_admin_settings_form_contains_csrf_token, + test_admin_template_exposes_regex_rule_editor_without_internal_phrase_toggle, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP admin settings UI tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py new file mode 100644 index 00000000..846566c7 --- /dev/null +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -0,0 +1,136 @@ +# test_dlp_admin_ui_smoke.py +#!/usr/bin/env python3 +""" +Functional test for DLP admin UI smoke. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures the DLP admin settings card can be extracted into collapsed +and expanded previews for local visual review. 
+""" + +import importlib.util +import os +import sys +import tempfile +from pathlib import Path + + +ROOT_DIR = Path(__file__).resolve().parents[1] +ADMIN_TEMPLATE_FILE = ROOT_DIR / "application" / "single_app" / "templates" / "admin_settings.html" +PREVIEW_SCRIPT = ROOT_DIR / "tools" / "local_dev" / "render_dlp_admin_preview.py" + + +REQUIRED_CONTROLS = [ + "enable_dlp_control_plane", + "dlp_default_engine", + "dlp_regex_rules_json", + "dlp_max_scan_chars", + "enable_web_search_dlp", + "web_search_dlp_mode", + "enable_upload_dlp", + "upload_dlp_mode", + "upload_dlp_fail_upload_on_match", +] + + +RETIRED_CONTROLS = [ + "dlp_scanner_timeout_seconds", + "web_search_dlp_redact_structured_identifiers", + "web_search_dlp_block_on_internal_phrases", + "upload_dlp_track_review_events", +] + + +def load_preview_module(): + spec = importlib.util.spec_from_file_location("render_dlp_admin_preview", PREVIEW_SCRIPT) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def test_dlp_admin_preview_extractor_writes_collapsed_and_expanded_files(): + """Preview extraction should work against the real admin settings template.""" + print("Testing DLP admin preview extraction...") + module = load_preview_module() + + with tempfile.TemporaryDirectory() as temp_dir: + collapsed_path, expanded_path = module.render_previews(ADMIN_TEMPLATE_FILE, Path(temp_dir)) + + assert collapsed_path.exists() + assert expanded_path.exists() + assert collapsed_path.name == "admin-dlp-preview.html" + assert expanded_path.name == "admin-dlp-preview-expanded.html" + + collapsed_html = collapsed_path.read_text(encoding="utf-8") + expanded_html = expanded_path.read_text(encoding="utf-8") + + assert "Data Loss Prevention" in collapsed_html + assert "Data Loss Prevention" in expanded_html + assert 'id="dlp_control_plane_settings"' in collapsed_html + assert 'id="dlp_control_plane_settings"' in expanded_html + + +def 
test_expanded_dlp_admin_preview_contains_expected_controls(): + """Expanded preview should expose all DLP controls needed for review.""" + print("Testing expanded DLP admin preview controls...") + module = load_preview_module() + + with tempfile.TemporaryDirectory() as temp_dir: + _, expanded_path = module.render_previews(ADMIN_TEMPLATE_FILE, Path(temp_dir)) + expanded_html = expanded_path.read_text(encoding="utf-8") + + for control_id in REQUIRED_CONTROLS: + assert ( + f'id="{control_id}"' in expanded_html or f'name="{control_id}"' in expanded_html + ), f"Missing expanded DLP control: {control_id}" + + for control_id in RETIRED_CONTROLS: + assert ( + f'id="{control_id}"' not in expanded_html and f'name="{control_id}"' not in expanded_html + ), f"Retired DLP control still rendered: {control_id}" + + assert '
' not in expanded_html + assert '
' not in expanded_html + assert '
' not in expanded_html + + +def test_dlp_admin_preview_does_not_expose_raw_secret_values(): + """Preview files should include controls, not populated secrets or raw detector matches.""" + print("Testing DLP admin preview safety...") + module = load_preview_module() + + with tempfile.TemporaryDirectory() as temp_dir: + collapsed_path, expanded_path = module.render_previews(ADMIN_TEMPLATE_FILE, Path(temp_dir)) + rendered = ( + collapsed_path.read_text(encoding="utf-8") + + expanded_path.read_text(encoding="utf-8") + ) + + forbidden = [ + "123-45-6789", + "4111 1111 1111 1111", + "raw_matches", + ] + for value in forbidden: + assert value not in rendered, f"Preview leaked forbidden value: {value}" + + +if __name__ == "__main__": + tests = [ + test_dlp_admin_preview_extractor_writes_collapsed_and_expanded_files, + test_expanded_dlp_admin_preview_contains_expected_controls, + test_dlp_admin_preview_does_not_expose_raw_secret_values, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP admin UI smoke tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_control_plane.py b/functional_tests/test_dlp_control_plane.py new file mode 100644 index 00000000..8db4cb69 --- /dev/null +++ b/functional_tests/test_dlp_control_plane.py @@ -0,0 +1,217 @@ +# test_dlp_control_plane.py +#!/usr/bin/env python3 +""" +Functional test for DLP control plane core behavior. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures the shared DLP core supports disabled, regex, Luhn-validated +credit-card, counts-only metadata, ReDoS-resistant scanning, and optional +Presidio service normalization without persisting raw matched values. 
+""" + +import os +import sys +import time +from unittest.mock import patch + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_SSN = "123-45-6789" +RAW_CARD = "4111 1111 1111 1111" +INVALID_CARD = "4111 1111 1111 1112" + + +def assert_no_raw_values(payload): + """Assert a nested DLP payload does not include raw sensitive values.""" + serialized = repr(payload) + forbidden_values = [RAW_SSN, RAW_CARD, INVALID_CARD, "Alice Example"] + for value in forbidden_values: + assert value not in serialized, f"Raw value leaked into payload: {value}" + + +def test_disabled_dlp_allows_original_text(): + """Disabled DLP should return the original text and an allow decision.""" + print("Testing disabled DLP behavior...") + from functions_dlp import evaluate_dlp_text + + text = f"Please search for {RAW_SSN}" + result = evaluate_dlp_text( + text, + settings={"enable_dlp_control_plane": False}, + surface="web_search", + ) + + assert result["decision"] == "allow" + assert result["text"] == text + assert result["redacted_text"] == text + assert result["total_replacements"] == 0 + assert result["match_counts"] == {} + assert result["matches"] == [] + + +def test_regex_redacts_ssn_and_counts_only_metadata(): + """Regex mode should redact SSNs and return counts-only metadata.""" + print("Testing SSN redaction and safe metadata...") + from functions_dlp import evaluate_dlp_text + + result = evaluate_dlp_text( + f"Customer SSN is {RAW_SSN}.", + settings={ + "enable_dlp_control_plane": True, + "dlp_default_engine": "regex", + "web_search_dlp_mode": "redact", + }, + surface="web_search", + ) + + assert result["decision"] == "redact" + assert "[REDACTED_US_SSN]" in result["redacted_text"] + assert result["match_counts"] == {"US_SSN": 1} + assert result["total_replacements"] == 1 + assert_no_raw_values(result) + + +def test_credit_card_requires_luhn_validation(): + 
"""Credit-card-like values should redact only when Luhn-valid.""" + print("Testing credit card Luhn validation...") + from functions_dlp import evaluate_dlp_text + + valid_result = evaluate_dlp_text( + f"Use card {RAW_CARD} for the vendor.", + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "redact"}, + surface="web_search", + ) + invalid_result = evaluate_dlp_text( + f"Ignore fake card {INVALID_CARD}.", + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "redact"}, + surface="web_search", + ) + + assert valid_result["match_counts"] == {"CREDIT_CARD": 1} + assert "[REDACTED_CREDIT_CARD]" in valid_result["redacted_text"] + assert invalid_result["decision"] == "allow" + assert invalid_result["redacted_text"].endswith(f"{INVALID_CARD}.") + assert invalid_result["match_counts"] == {} + assert_no_raw_values(valid_result) + + +def test_regex_scan_is_bounded_on_long_non_matching_input(): + """Regex recognizers should avoid catastrophic backtracking.""" + print("Testing regex performance on long non-matching input...") + from functions_dlp import evaluate_dlp_text + + long_text = ("not-sensitive " * 20000) + "done" + started = time.perf_counter() + result = evaluate_dlp_text( + long_text, + settings={ + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_max_scan_chars": 500000, + }, + surface="web_search", + ) + elapsed = time.perf_counter() - started + + assert result["decision"] == "allow" + assert elapsed < 2.0, f"Regex scan took too long: {elapsed:.3f}s" + + +def test_enforced_dlp_blocks_when_text_exceeds_scan_limit(): + """Enforced DLP must not append unscanned text into sanitized output.""" + from functions_dlp import evaluate_dlp_text + + settings = { + "enable_dlp_control_plane": True, + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 20, + "web_search_dlp_mode": "redact", + "enable_web_search_dlp": True, + } + text = "public prefix only " + ("x" * 25) + " SSN 123-45-6789" + + result = 
evaluate_dlp_text(text, settings=settings, surface="web_search") + + assert result["decision"] == "block" + assert result["scanner_status"] == "truncated" + assert result["text"] == "" + assert "123-45-6789" not in repr(result) + + +def test_enforced_truncation_blocks_before_scanner_error_fail_open(): + """Protected enforced surfaces should block truncated text before scanner errors.""" + import functions_dlp + + def fail_scan(text, settings, surface="generic"): + raise RuntimeError("scanner unavailable") + + settings = { + "enable_dlp_control_plane": True, + "dlp_fail_closed_on_scanner_error": False, + "dlp_max_scan_chars": 12, + "web_search_dlp_mode": "redact", + "enable_web_search_dlp": True, + } + text = "safe prefix " + ("x" * 25) + f" tail {RAW_SSN}" + + with patch.object(functions_dlp, "_apply_regex_engine", fail_scan): + result = functions_dlp.evaluate_dlp_text(text, settings=settings, surface="web_search") + + assert result["decision"] == "block" + assert result["scanner_status"] == "truncated" + assert result["text"] == "" + assert result["redacted_text"] == "" + assert result["metadata"]["skipped_chars"] > 0 + assert RAW_SSN not in repr(result) + assert "tail" not in repr(result) + + +def test_presidio_service_shape_normalizes_counts_without_raw_values(): + """Optional Presidio service results should normalize into the shared shape.""" + print("Testing Presidio service adapter normalization...") + from functions_dlp import normalize_presidio_results + + normalized = normalize_presidio_results( + text=f"Alice Example has SSN {RAW_SSN}.", + recognizer_results=[ + {"entity_type": "PERSON", "start": 0, "end": 13, "score": 0.88}, + {"entity_type": "US_SSN", "start": 22, "end": 33, "score": 0.99}, + ], + mode="redact", + engine="presidio_service", + ) + + assert normalized["decision"] == "redact" + assert normalized["match_counts"] == {"PERSON": 1, "US_SSN": 1} + assert "[REDACTED_PERSON]" in normalized["redacted_text"] + assert "[REDACTED_US_SSN]" in 
normalized["redacted_text"] + assert_no_raw_values(normalized) + + +if __name__ == "__main__": + tests = [ + test_disabled_dlp_allows_original_text, + test_regex_redacts_ssn_and_counts_only_metadata, + test_credit_card_requires_luhn_validation, + test_regex_scan_is_bounded_on_long_non_matching_input, + test_enforced_dlp_blocks_when_text_exceeds_scan_limit, + test_enforced_truncation_blocks_before_scanner_error_fail_open, + test_presidio_service_shape_normalizes_counts_without_raw_values, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP control plane tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_regex_rules.py b/functional_tests/test_dlp_regex_rules.py new file mode 100644 index 00000000..6fca6b82 --- /dev/null +++ b/functional_tests/test_dlp_regex_rules.py @@ -0,0 +1,241 @@ +# test_dlp_regex_rules.py +#!/usr/bin/env python3 +""" +Functional test for configurable DLP regex rules. +Version: 0.242.069 +Implemented in: 0.241.017 + +This test ensures DLP regex rules are admin-configurable, validated, +confidence-shaped, timeout-bounded, and safe to report without raw matched values. 
+""" + +import os +import sys + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_SSN = "123-45-6789" +RAW_CARD = "4111 1111 1111 1111" + + +def assert_no_raw_values(result): + payload = repr(result) + assert RAW_SSN not in payload + assert RAW_CARD not in payload + assert "ZX-12345" not in payload + + +def test_default_rules_include_ssn_and_credit_card_only(): + """Default DLP regex rules should be structured identifier defaults only.""" + print("Testing default DLP regex rules...") + from functions_dlp_rules import get_default_dlp_regex_rules + + rules = get_default_dlp_regex_rules() + ids = [rule["id"] for rule in rules] + + assert ids == ["us_ssn", "credit_card"] + assert all(rule["enabled"] is True for rule in rules) + assert "internal_phrase" not in ids + assert "confidential" not in repr(rules).lower() + + +def test_custom_regex_rule_redacts_on_configured_surface(): + """A configured custom rule should redact on an allowed surface.""" + print("Testing custom regex DLP rule...") + from functions_dlp import evaluate_dlp_text + + settings = { + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "ticket_id", + "label": "Ticket ID", + "entity_type": "TICKET_ID", + "enabled": True, + "pattern": r"ZX-\d{5}", + "replacement": "[REDACTED_TICKET_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "medium", + "with_keywords": "high", + "keywords": ["ticket", "case"], + "window_chars": 24, + "minimum": "medium" + } + } + ], + } + + result = evaluate_dlp_text( + "Search for ticket ZX-12345", + settings=settings, + surface="web_search", + ) + + assert result["decision"] == "redact" + assert result["redacted_text"] == "Search for ticket [REDACTED_TICKET_ID]" + assert result["match_counts"] == {"TICKET_ID": 1} + assert result["matches"] == 
[{"entity_type": "TICKET_ID", "count": 1, "confidence": "high"}] + assert_no_raw_values(result) + + +def test_disabled_custom_rule_does_not_match(): + """Disabled rules should not produce matches.""" + print("Testing disabled custom regex DLP rule...") + from functions_dlp import evaluate_dlp_text + + settings = { + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "ticket_id", + "label": "Ticket ID", + "entity_type": "TICKET_ID", + "enabled": False, + "pattern": r"ZX-\d{5}", + "replacement": "[REDACTED_TICKET_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "medium", + "with_keywords": "high", + "keywords": ["ticket"], + "window_chars": 24, + "minimum": "medium" + } + } + ], + } + + result = evaluate_dlp_text("Search for ticket ZX-12345", settings=settings, surface="web_search") + + assert result["decision"] == "allow" + assert result["match_counts"] == {} + assert "ZX-12345" in result["redacted_text"] + + +def test_confidence_requires_nearby_keyword_when_minimum_is_high(): + """Rules can require regex plus nearby keyword evidence for high-confidence matches.""" + print("Testing DLP confidence shaping...") + from functions_dlp import evaluate_dlp_text + + rule = { + "id": "employee_id", + "label": "Employee ID", + "entity_type": "EMPLOYEE_ID", + "enabled": True, + "pattern": r"EID-\d{6}", + "replacement": "[REDACTED_EMPLOYEE_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "low", + "with_keywords": "high", + "keywords": ["employee", "worker", "staff"], + "window_chars": 32, + "minimum": "high" + } + } + settings = { + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [rule], + } + + low_result = evaluate_dlp_text("Search for EID-123456", settings=settings, surface="web_search") + high_result = evaluate_dlp_text("Search employee EID-123456", 
settings=settings, surface="web_search") + + assert low_result["decision"] == "allow" + assert low_result["match_counts"] == {} + assert high_result["decision"] == "redact" + assert high_result["match_counts"] == {"EMPLOYEE_ID": 1} + assert high_result["matches"] == [{"entity_type": "EMPLOYEE_ID", "count": 1, "confidence": "high"}] + + +def test_invalid_regex_rule_is_rejected_before_runtime(): + """Invalid admin regex rules should return validation errors.""" + print("Testing invalid regex rule validation...") + from functions_dlp_rules import validate_dlp_regex_rules + + normalized, errors = validate_dlp_regex_rules( + [ + { + "id": "bad", + "label": "Bad Rule", + "entity_type": "BAD", + "enabled": True, + "pattern": r"(", + "replacement": "[REDACTED_BAD]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "medium", + "with_keywords": "high", + "keywords": [], + "window_chars": 16, + "minimum": "medium" + } + } + ] + ) + + assert normalized == [] + assert errors + assert "bad" in errors[0] + + +def test_internal_phrase_is_not_a_default_blocker(): + """Generic policy words should not be hardcoded blockers.""" + print("Testing internal phrase is not hardcoded...") + from functions_dlp import evaluate_web_search_egress + + result = evaluate_web_search_egress( + "Search for confidentiality agreement examples", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + }, + ) + + assert result["web_search_allowed"] is True + assert result["decision"] == "allow" + assert "confidentiality agreement" in result["web_search_query_text"] + + +if __name__ == "__main__": + tests = [ + test_default_rules_include_ssn_and_credit_card_only, + test_custom_regex_rule_redacts_on_configured_surface, + test_disabled_custom_rule_does_not_match, + test_confidence_requires_nearby_keyword_when_minimum_is_high, + test_invalid_regex_rule_is_rejected_before_runtime, + 
test_internal_phrase_is_not_a_default_blocker, + ] + + failures = [] + for test in tests: + try: + test() + except Exception as exc: + failures.append((test.__name__, exc)) + print(f"Test failed: {test.__name__}: {exc}") + import traceback + traceback.print_exc() + + if failures: + print(f"{len(failures)} of {len(tests)} configurable DLP regex rule tests failed.") + sys.exit(1) + + print(f"All {len(tests)} configurable DLP regex rule tests passed.") + sys.exit(0) diff --git a/functional_tests/test_dlp_review_events.py b/functional_tests/test_dlp_review_events.py new file mode 100644 index 00000000..919e31f8 --- /dev/null +++ b/functional_tests/test_dlp_review_events.py @@ -0,0 +1,115 @@ +# test_dlp_review_events.py +#!/usr/bin/env python3 +""" +Functional test for DLP review event safety. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures DLP review routing defaults to disabled and any optional +review event summary uses distinct DLP policy typing with counts-only payloads. 
+""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +SETTINGS_FILE = os.path.join(APP_DIR, "functions_settings.py") +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def test_review_destination_defaults_to_none(): + """DLP findings should not enter review queues by default.""" + print("Testing default DLP review routing...") + source = read_file_text(SETTINGS_FILE) + assert "'dlp_review_destination': 'none'" in source + assert "'web_search_dlp_track_review_events': False" not in source + assert "'upload_dlp_track_review_events': False" not in source + + +def test_review_summary_has_dlp_type_and_no_raw_values(): + """Review payload summaries should be distinctly typed and counts-only.""" + print("Testing safe DLP review summary...") + from functions_dlp import build_dlp_review_event_summary, evaluate_web_search_egress + + result = evaluate_web_search_egress( + f"Please search for {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "block", + "dlp_review_destination": "safety_violations", + "web_search_dlp_track_review_events": True, + }, + context={"conversation_id": "conversation-1", "chat_type": "user"}, + ) + summary = build_dlp_review_event_summary( + result, + surface="web_search", + context={"conversation_id": "conversation-1", "user_id": "user-1"}, + ) + + assert summary["policy_type"] == "dlp_web_search" + assert summary["violation_type"] == "dlp" + assert summary["action"] == "block" + assert summary["entity_counts"] == {"US_SSN": 1} + assert "raw_matches" not in summary or summary["raw_matches"] is None + assert RAW_VALUE not in repr(summary) + + +def test_upload_review_summary_has_distinct_type_and_no_raw_values(): + """Upload review payloads 
should be distinctly typed and counts-only.""" + print("Testing safe upload DLP review summary...") + from functions_dlp import build_dlp_review_event_summary, evaluate_upload_content + + result = evaluate_upload_content( + f"Document chunk has {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_review_destination": "safety_violations", + "upload_dlp_track_review_events": True, + }, + context={"document_id": "doc-1", "workspace_scope": "group"}, + ) + summary = build_dlp_review_event_summary( + result, + surface="upload", + context={"document_id": "doc-1", "workspace_scope": "group"}, + ) + + assert summary["policy_type"] == "dlp_upload" + assert summary["violation_type"] == "dlp" + assert summary["action"] == "redact" + assert summary["entity_counts"] == {"US_SSN": 1} + assert RAW_VALUE not in repr(summary) + + +if __name__ == "__main__": + tests = [ + test_review_destination_defaults_to_none, + test_review_summary_has_dlp_type_and_no_raw_values, + test_upload_review_summary_has_distinct_type_and_no_raw_values, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP review event tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_telemetry.py b/functional_tests/test_dlp_telemetry.py new file mode 100644 index 00000000..588991c1 --- /dev/null +++ b/functional_tests/test_dlp_telemetry.py @@ -0,0 +1,210 @@ +# test_dlp_telemetry.py +#!/usr/bin/env python3 +""" +Functional test for safe DLP telemetry. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures DLP telemetry properties include bounded decision metadata +without raw matched values, raw prompts, raw web-search queries, raw chunk text, +or raw filenames. 
+""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +FUNCTIONS_DLP_FILE = os.path.join(APP_DIR, "functions_dlp.py") +sys.path.insert(0, APP_DIR) + + +RAW_TEXT = "Search for 123-45-6789 in the confidential roadmap" +RAW_FILENAME = "alice-123-45-6789-roadmap.txt" + + +def test_telemetry_properties_are_counts_only(): + """Telemetry should contain safe bounded DLP properties only.""" + print("Testing DLP telemetry safety...") + from functions_dlp import build_dlp_telemetry_properties, evaluate_dlp_text + + result = evaluate_dlp_text( + RAW_TEXT, + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "redact"}, + surface="web_search", + ) + properties = build_dlp_telemetry_properties( + result, + surface="web_search", + context={ + "conversation_id": "conversation-123", + "chat_type": "user", + "workspace_scope": "personal", + "file_name": RAW_FILENAME, + "raw_text": RAW_TEXT, + }, + ) + + assert properties["activity_type"] == "dlp_decision" + assert properties["dlp_surface"] == "web_search" + assert properties["dlp_action"] == "redact" + assert properties["dlp_engine"] == "regex" + assert properties["dlp_mode"] == "redact" + assert properties["workspace_scope"] == "personal" + assert properties["scanner_status"] == "ok" + assert properties["dlp_total_replacements"] == 1 + assert properties["dlp_entity_counts"] == {"US_SSN": 1} + + serialized = repr(properties) + forbidden = [ + "123-45-6789", + "confidential roadmap", + RAW_TEXT, + RAW_FILENAME, + "[REDACTED_US_SSN]", + ] + for value in forbidden: + assert value not in serialized, f"Unsafe telemetry value leaked: {value}" + + +def test_scanner_error_telemetry_is_safe(): + """Scanner failure telemetry should avoid source text and raw errors.""" + print("Testing scanner error telemetry safety...") + from functions_dlp import build_dlp_telemetry_properties + + result = { + "enabled": True, + 
"engine": "presidio_service", + "mode": "block", + "decision": "block", + "scanner_status": "error", + "text": RAW_TEXT, + "redacted_text": RAW_TEXT, + "total_replacements": 0, + "match_counts": {}, + "matches": [], + "metadata": {"error": "service saw 123-45-6789 before timeout"}, + } + + properties = build_dlp_telemetry_properties( + result, + surface="web_search", + context={"raw_text": RAW_TEXT, "file_name": RAW_FILENAME}, + ) + + assert properties["scanner_status"] == "error" + assert "scanner_error" in properties + serialized = repr(properties) + assert "123-45-6789" not in serialized + assert RAW_TEXT not in serialized + assert RAW_FILENAME not in serialized + + +def test_upload_dlp_telemetry_is_safe(): + """Upload DLP telemetry should include counts and no raw chunk text.""" + print("Testing upload DLP telemetry safety...") + from functions_dlp import build_dlp_telemetry_properties, evaluate_upload_content + + result = evaluate_upload_content( + RAW_TEXT, + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + }, + context={"document_id": "doc-1", "workspace_scope": "public"}, + ) + properties = build_dlp_telemetry_properties( + result, + surface="upload", + context={"document_id": "doc-1", "workspace_scope": "public", "raw_text": RAW_TEXT}, + ) + + assert properties["dlp_surface"] == "upload" + assert properties["dlp_action"] == "redact" + assert properties["workspace_scope"] == "public" + assert properties["dlp_entity_counts"] == {"US_SSN": 1} + assert RAW_TEXT not in repr(properties) + assert "123-45-6789" not in repr(properties) + + +def test_scanner_error_log_avoids_raw_traceback_capture(): + """Scanner exception logging should not send traceback text to telemetry.""" + print("Testing scanner error log traceback safety...") + with open(FUNCTIONS_DLP_FILE, "r", encoding="utf-8") as file_handle: + source = file_handle.read() + + scanner_error_index = source.find('"[DLP] Scanner error"') + 
traceback_index = source.find("exceptionTraceback=False", scanner_error_index) + error_type_index = source.find('"error_type": type(exc).__name__', scanner_error_index) + + assert scanner_error_index != -1 + assert traceback_index > scanner_error_index + assert error_type_index > scanner_error_index + assert "exceptionTraceback=True" not in source[scanner_error_index:traceback_index] + + +def test_monitor_detections_emit_telemetry_by_default(): + """Monitor-mode detections should emit telemetry even when allow sampling is disabled.""" + print("Testing monitor-mode DLP telemetry emission...") + from functions_dlp import evaluate_dlp_text, should_emit_dlp_telemetry + + result = evaluate_dlp_text( + RAW_TEXT, + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "monitor"}, + surface="web_search", + ) + + assert result["decision"] == "monitor" + assert result["match_counts"] == {"US_SSN": 1} + assert result["total_replacements"] == 1 + assert should_emit_dlp_telemetry(result, settings={}) is True + assert should_emit_dlp_telemetry( + result, + settings={"dlp_telemetry_sample_allow_events": False}, + ) is True + + +def test_clean_allow_telemetry_respects_sampling_default(): + """Clean allow events should stay silent unless allow sampling is enabled.""" + print("Testing clean allow DLP telemetry sampling...") + from functions_dlp import evaluate_dlp_text, should_emit_dlp_telemetry + + result = evaluate_dlp_text( + "Search for public weather forecast", + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "monitor"}, + surface="web_search", + ) + + assert result["decision"] == "allow" + assert result["match_counts"] == {} + assert result["total_replacements"] == 0 + assert should_emit_dlp_telemetry(result, settings={}) is False + assert should_emit_dlp_telemetry( + result, + settings={"dlp_telemetry_sample_allow_events": False}, + ) is False + + +if __name__ == "__main__": + tests = [ + test_telemetry_properties_are_counts_only, + 
test_scanner_error_telemetry_is_safe, + test_upload_dlp_telemetry_is_safe, + test_scanner_error_log_avoids_raw_traceback_capture, + test_monitor_detections_emit_telemetry_by_default, + test_clean_allow_telemetry_respects_sampling_default, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP telemetry tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_upload_dlp_ingestion_integration.py b/functional_tests/test_upload_dlp_ingestion_integration.py new file mode 100644 index 00000000..6631bfe6 --- /dev/null +++ b/functional_tests/test_upload_dlp_ingestion_integration.py @@ -0,0 +1,630 @@ +# test_upload_dlp_ingestion_integration.py +#!/usr/bin/env python3 +""" +Functional test for upload DLP ingestion integration. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures upload DLP blocks stop before embeddings/search indexing and +redacted text is the only text passed into embedding/index payload construction. 
+""" + +import ast +import os +import sys +import types +from pathlib import Path +from typing import List + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +FUNCTIONS_DOCUMENTS_FILE = os.path.join(APP_DIR, "functions_documents.py") +FUNCTIONS_AUTHENTICATION_FILE = os.path.join(APP_DIR, "functions_authentication.py") +FUNCTIONS_DOCUMENTS = Path(FUNCTIONS_DOCUMENTS_FILE) +FUNCTIONS_AUTHENTICATION = Path(FUNCTIONS_AUTHENTICATION_FILE) +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def extract_function_source(source_text, function_name): + parsed = ast.parse(source_text, filename=FUNCTIONS_DOCUMENTS_FILE) + for node in ast.walk(parsed): + if isinstance(node, ast.FunctionDef) and node.name == function_name: + return ast.get_source_segment(source_text, node) + raise AssertionError(f"Function {function_name} not found") + + +def import_functions_documents_for_helper_tests(): + """Import functions_documents with lightweight stubs for optional app dependencies.""" + stub_modules = { + "config": types.ModuleType("config"), + "functions_content": types.ModuleType("functions_content"), + "functions_settings": types.ModuleType("functions_settings"), + "functions_search": types.ModuleType("functions_search"), + "functions_logging": types.ModuleType("functions_logging"), + "functions_authentication": types.ModuleType("functions_authentication"), + "functions_debug": types.ModuleType("functions_debug"), + "functions_keyvault": types.ModuleType("functions_keyvault"), + "azure": types.ModuleType("azure"), + "azure.cognitiveservices": types.ModuleType("azure.cognitiveservices"), + "azure.cognitiveservices.speech": types.ModuleType("azure.cognitiveservices.speech"), + } + stub_modules["config"].List = List + stub_modules["functions_settings"].get_settings = lambda: 
{} + stub_modules["functions_logging"].add_file_task_to_file_processing_log = lambda **kwargs: None + stub_modules["functions_logging"].log_event = lambda *args, **kwargs: None + stub_modules["functions_keyvault"].SecretReturnType = types.SimpleNamespace(VALUE="value") + stub_modules["functions_keyvault"].keyvault_model_endpoint_get_helper = lambda endpoint, return_type=None: endpoint + + original_modules = {module_name: sys.modules.get(module_name) for module_name in stub_modules} + try: + sys.modules.pop("functions_documents", None) + for module_name, module_stub in stub_modules.items(): + sys.modules[module_name] = module_stub + import functions_documents + finally: + for module_name, original_module in original_modules.items(): + if original_module is None: + sys.modules.pop(module_name, None) + else: + sys.modules[module_name] = original_module + + return functions_documents + + +def import_functions_authentication_for_helper_tests(): + """Import functions_authentication with lightweight stubs for optional app dependencies.""" + config_stub = types.ModuleType("config") + config_stub.AZURE_ENVIRONMENT = "public" + config_stub.CUSTOM_RESOURCE_MANAGER_URL_VALUE = "" + config_stub.DEFAULT_VIDEO_INDEXER_ARM_API_VERSION = "2024-01-01" + config_stub.OIDC_METADATA_URL = "https://login.example/.well-known/openid-configuration" + config_stub.AUDIENCE = "audience" + config_stub.ISSUER = "issuer" + config_stub.requests = types.SimpleNamespace() + config_stub.requests.exceptions = types.SimpleNamespace(RequestException=Exception) + config_stub.jwt = types.SimpleNamespace() + config_stub.DefaultAzureCredential = lambda: None + + stub_modules = { + "config": config_stub, + "functions_settings": types.ModuleType("functions_settings"), + "functions_debug": types.ModuleType("functions_debug"), + } + stub_modules["functions_debug"].debug_print = lambda *args, **kwargs: None + + original_modules = {module_name: sys.modules.get(module_name) for module_name in stub_modules} + try: 
+ sys.modules.pop("functions_authentication", None) + for module_name, module_stub in stub_modules.items(): + sys.modules[module_name] = module_stub + import functions_authentication + finally: + for module_name, original_module in original_modules.items(): + if original_module is None: + sys.modules.pop(module_name, None) + else: + sys.modules[module_name] = original_module + + return functions_authentication + + +def test_upload_helper_blocks_before_returning_to_ingestion_paths(): + """The shared upload DLP evaluator should raise before callers can embed blocked text.""" + print("Testing upload DLP block gate...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + helper_source = extract_function_source(source, "_evaluate_upload_dlp_text") + + record_index = helper_source.find("_record_upload_dlp_result(") + block_index = helper_source.find('if not result.get("upload_allowed", True):') + raise_index = helper_source.find('raise ValueError("Upload content blocked by DLP policy.")') + return_index = helper_source.find("return result") + + assert record_index != -1, "DLP result should be recorded before block handling" + assert block_index > record_index, "Block gate should run after safe metadata is recorded" + assert raise_index > block_index, "Blocked upload should raise a policy error" + assert return_index > raise_index, "Allowed result should return only after the block gate" + + +def test_single_chunk_uses_sanitized_text_for_embedding_and_indexing(): + """save_chunks should generate embeddings and search documents from sanitized text.""" + print("Testing single chunk sanitized text flow...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + save_chunks_source = extract_function_source(source, "save_chunks") + + dlp_index = save_chunks_source.find("_evaluate_upload_dlp_text(") + sanitized_index = save_chunks_source.find('sanitized_chunk_text = upload_dlp_result.get("sanitized_text", enhanced_chunk_text)') + embedding_index = 
save_chunks_source.find("generate_embedding(sanitized_chunk_text)") + index_payload_index = save_chunks_source.find('"chunk_text": sanitized_chunk_text') + + assert dlp_index != -1 + assert sanitized_index > dlp_index + assert embedding_index > sanitized_index + assert index_payload_index > embedding_index + assert "generate_embedding(page_text_content)" not in save_chunks_source + assert RAW_VALUE not in save_chunks_source + + +def test_batch_chunks_use_sanitized_text_for_batch_embeddings_and_indexing(): + """save_chunks_batch should batch only sanitized chunk text.""" + print("Testing batch chunk sanitized text flow...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + batch_source = extract_function_source(source, "save_chunks_batch") + + dlp_index = batch_source.find("_evaluate_upload_dlp_text(") + metadata_sanitize_index = batch_source.find("metadata, _metadata_dlp_summary = _sanitize_upload_metadata_for_dlp(") + author_index = batch_source.find("author = ensure_list(metadata.get('authors'))") + title_index = batch_source.find("title = metadata.get('title', '')") + sanitized_index = batch_source.find("sanitized_chunk_info['page_text_content']") + texts_index = batch_source.find("texts = [c['page_text_content'] for c in sanitized_chunks_data]") + embedding_index = batch_source.find("generate_embeddings_batch(texts)") + payload_index = batch_source.find('"chunk_text": enhanced_chunk_text') + + assert dlp_index != -1 + assert metadata_sanitize_index != -1 + assert author_index > metadata_sanitize_index + assert title_index > metadata_sanitize_index + assert sanitized_index > dlp_index + assert texts_index > sanitized_index + assert embedding_index > texts_index + assert payload_index > embedding_index + assert '"author": author' in batch_source + assert '"title": title' in batch_source + assert "texts = [c['page_text_content'] for c in chunks_data]" not in batch_source + assert "dlp_metadata" in batch_source + + +def 
test_video_chunks_use_sanitized_transcript_and_ocr_text(): + """save_video_chunk should sanitize transcript and OCR text before embedding/search.""" + print("Testing video chunk sanitized text flow...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "save_video_chunk") + + transcript_dlp_index = video_source.find("transcript_dlp_result = _evaluate_upload_dlp_text(") + transcript_sanitized_index = video_source.find( + 'sanitized_transcript_text = transcript_dlp_result.get("sanitized_text", page_text_content)' + ) + ocr_dlp_index = video_source.find("ocr_dlp_result = _evaluate_upload_dlp_text(") + ocr_sanitized_index = video_source.find( + 'sanitized_ocr_text = ocr_dlp_result.get("sanitized_text", ocr_chunk_text)' + ) + embedding_index = video_source.find("generate_embedding(sanitized_transcript_text)") + transcript_payload_index = video_source.find('"chunk_text": sanitized_transcript_text') + ocr_payload_index = video_source.find('"video_ocr_chunk_text": sanitized_ocr_text') + + assert transcript_dlp_index != -1 + assert transcript_sanitized_index > transcript_dlp_index + assert ocr_dlp_index > transcript_sanitized_index + assert ocr_sanitized_index > ocr_dlp_index + assert embedding_index > ocr_sanitized_index + assert transcript_payload_index > embedding_index + assert ocr_payload_index > embedding_index + + +def test_video_chunks_preserve_public_workspace_scope(): + """Video chunks should use the public workspace metadata/search path when supplied.""" + print("Testing video chunk public workspace scope...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "save_video_chunk") + process_video_source = extract_function_source(source, "process_video_document") + + assert "public_workspace_id=None" in video_source + assert "is_public_workspace = public_workspace_id is not None" in video_source + assert "public_workspace_id=public_workspace_id" in video_source + 
assert 'chunk["public_workspace_id"] = public_workspace_id' in video_source + assert 'CLIENTS["search_client_public"]' in video_source + assert "save_video_chunk(" in process_video_source + assert "public_workspace_id=public_workspace_id" in process_video_source + + +def test_video_dlp_block_errors_abort_processing(): + """Video processing should not swallow upload DLP block decisions.""" + print("Testing video DLP block propagation...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "save_video_chunk") + process_video_source = extract_function_source(source, "process_video_document") + + save_chunk_call_index = process_video_source.find("save_video_chunk(") + catch_index = process_video_source.find("except Exception as e:", save_chunk_call_index) + dlp_guard_index = process_video_source.find('if str(e) == "Upload content blocked by DLP policy.":', catch_index) + raise_index = process_video_source.find("raise", dlp_guard_index) + log_index = process_video_source.find("Failed to save chunk", catch_index) + + assert 'if str(e) == "Upload content blocked by DLP policy.":' in video_source + assert save_chunk_call_index != -1 + assert catch_index > save_chunk_call_index + assert dlp_guard_index > catch_index + assert raise_index > dlp_guard_index + assert log_index > raise_index + + +def test_audio_chunks_preserve_public_workspace_scope(): + """Audio transcript chunks should pass public workspace scope through save_chunks.""" + print("Testing audio chunk public workspace scope...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + audio_source = extract_function_source(source, "process_audio_document") + + save_chunks_index = audio_source.find("save_chunks(") + public_scope_index = audio_source.find("public_workspace_id=public_workspace_id", save_chunks_index) + + assert save_chunks_index != -1 + assert public_scope_index > save_chunks_index + + +def test_media_processing_logs_do_not_emit_raw_detector_text(): + 
"""Media processors should log counts and lengths, not raw transcript/insight bodies.""" + print("Testing media processing log safety...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "process_video_document") + audio_source = extract_function_source(source, "process_audio_document") + + forbidden_video = [ + "RAW INSIGHTS", + "insights_json", + "json.dumps(insights", + "TRANSCRIPT sample", + "OCR sample", + "KEYWORDS sample", + "sample:", + "First speech item: {speech_context[0]}", + "using insights as text: {chunk_text[:100]}", + "chunk_text[:100]", + ] + forbidden_audio = [ + "Recognized: {evt.result.text}", + "Recognized: {result.text}", + ] + + for snippet in forbidden_video: + assert snippet not in video_source, f"Unsafe video log remains: {snippet}" + for snippet in forbidden_audio: + assert snippet not in audio_source, f"Unsafe audio log remains: {snippet}" + + +def test_upload_dlp_metadata_is_counts_only(): + """Upload DLP metadata should store summaries, not raw detector matches.""" + print("Testing upload DLP metadata safety...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + f"employee ssn {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + + metadata = result["dlp_metadata"] + assert metadata["entity_counts"] == {"US_SSN": 1} + assert metadata["total_replacements"] == 1 + assert RAW_VALUE not in repr(metadata) + assert "matches" not in metadata + assert "raw_matches" not in metadata + + +def test_upload_dlp_enforcement_disables_enhanced_citation_blob_upload(): + """Enforced upload DLP should disable raw enhanced-citation blob upload.""" + print("Testing upload DLP enhanced-citation enforcement...") + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + assert 
"_should_disable_enhanced_citations_for_upload_dlp" in source + assert "enable_enhanced_citations = False" in source + assert "upload_dlp_mode" in source + assert 'settings.get("upload_dlp_fail_upload_on_match", False)' in source + assert 'settings.get("dlp_fail_closed_on_scanner_error", True)' in source + + upload_source = extract_function_source(source, "process_document_upload_background") + helper_source = extract_function_source(source, "_should_disable_enhanced_citations_for_upload_dlp") + conditional = "disabled_enhanced_citations_for_upload_dlp = (" + conditional_index = upload_source.find(conditional) + disable_index = upload_source.find("enable_enhanced_citations = False", conditional_index) + status_index = upload_source.find("Enhanced citations disabled because upload DLP enforcement is active") + dispatch_args_index = upload_source.find("args = {") + process_handler_indices = [ + index + for index in ( + upload_source.find("process_txt("), + upload_source.find("process_xml("), + upload_source.find("process_yaml("), + upload_source.find("process_log("), + upload_source.find("process_doc("), + upload_source.find("process_html("), + upload_source.find("process_md("), + upload_source.find("process_json("), + upload_source.find("process_tabular("), + upload_source.find("process_video_document("), + upload_source.find("process_audio_document("), + upload_source.find("process_di_document("), + ) + if index != -1 + ] + + assert conditional_index != -1 + assert 'settings.get("dlp_fail_closed_on_scanner_error", True)' in helper_source + assert 'settings.get("upload_dlp_fail_upload_on_match", False)' in helper_source + assert "return True" in helper_source + assert disable_index > conditional_index + assert status_index > disable_index + assert dispatch_args_index > disable_index + assert '"enable_enhanced_citations": enable_enhanced_citations' in upload_source + assert process_handler_indices + assert disable_index < min(process_handler_indices) + assert 
"enable_enhanced_citations=enable_enhanced_citations" in upload_source + + video_source = extract_function_source(source, "process_video_document") + audio_source = extract_function_source(source, "process_audio_document") + assert "enable_enhanced_citations=False" in video_source + assert "enable_enhanced_citations=False" in audio_source + assert 'if enable_enhanced_citations:' in video_source + assert 'if enable_enhanced_citations:' in audio_source + assert 'settings.get("enable_enhanced_citations", False)' not in video_source + assert 'settings.get("enable_enhanced_citations", False)' not in audio_source + + +def test_upload_metadata_sanitizer_redacts_counts_only_metadata(): + """Upload metadata sanitizer should redact raw values and return counts only.""" + print("Testing upload metadata DLP sanitizer...") + functions_documents = import_functions_documents_for_helper_tests() + + original_get_settings = functions_documents.get_settings + functions_documents.get_settings = lambda: { + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 200000, + } + + try: + metadata = { + "title": "Roadmap 123-45-6789", + "authors": ["Alice 123-45-6789"], + "organization": "Org", + "publication_date": "06/2026", + "keywords": ["SSN 123-45-6789"], + "abstract": "Contains 123-45-6789", + } + + sanitized, summary = functions_documents._sanitize_upload_metadata_for_dlp( + metadata, + user_id="user-1", + document_id="doc-1", + ) + finally: + functions_documents.get_settings = original_get_settings + + assert "123-45-6789" not in repr(sanitized) + assert summary["entity_counts"]["US_SSN"] >= 1 + assert "raw_matches" not in repr(summary) + + +def test_upload_metadata_logs_use_safe_counts_and_lengths(): + """Metadata retrieval and extraction logs should not write raw metadata bodies.""" + print("Testing upload metadata log safety...") + source = 
FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + get_metadata_source = extract_function_source(source, "get_document_metadata") + summary_source = extract_function_source(source, "_upload_metadata_log_summary") + + assert "Document metadata retrieved: {document_items}" not in get_metadata_source + assert "item_count: {len(document_items)}" in get_metadata_source + assert '"field_lengths"' in summary_source + assert "Final metadata for document {document_id}: {meta_data}" not in source + assert "Decoded JSON from GPT response for document {document_id}: {gpt_output}" not in source + + +def test_initial_di_metadata_is_sanitized_before_update_callback(): + """DI file properties should be sanitized before first metadata persistence.""" + print("Testing initial DI metadata sanitization...") + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + di_source = extract_function_source(source, "process_di_document") + + metadata_fields_index = di_source.find("metadata_update_fields = {") + sanitize_index = di_source.find("_sanitize_upload_metadata_for_dlp(") + update_index = di_source.find("update_callback(**update_fields)") + dlp_reraise_index = di_source.find('if str(e) == "Upload content blocked by DLP policy.":') + warning_index = di_source.find("Warning: Failed to extract initial metadata") + + assert metadata_fields_index != -1 + assert sanitize_index > metadata_fields_index + assert update_index > sanitize_index + assert dlp_reraise_index > update_index + assert warning_index > dlp_reraise_index + + +def test_video_indexer_upload_params_do_not_log_access_token(): + """Video Indexer upload logging should not include raw account tokens.""" + print("Testing Video Indexer upload parameter log safety...") + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + video_source = extract_function_source(source, "process_video_document") + + assert '"accessToken": token' in video_source + assert 'debug_print(f"[VIDEO INDEXER] Upload params: {params}")' not in 
video_source + assert 'debug_print(f"[VIDEO INDEXER] Index polling URL: {index_url}")' not in video_source + assert "accessToken_present={bool(token)}" in video_source + assert "name_length={len(original_filename or '')}" in video_source + assert "Upload params keys" in video_source + assert "Index polling request prepared" in video_source + assert "video_id_length={len(str(vid or ''))}" in video_source + + +def test_video_indexer_request_errors_redact_access_token(): + """Video Indexer request exceptions should redact token-bearing URLs before logging.""" + print("Testing Video Indexer request error redaction...") + functions_documents = import_functions_documents_for_helper_tests() + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + video_source = extract_function_source(source, "process_video_document") + + query_error = ( + "403 Client Error: Forbidden for url: " + "https://video.example/Index?accessToken=secret-token&other=value" + ) + dict_error = "{'accessToken': 'secret-token', 'name': 'example.mp4'}" + + redacted_query = functions_documents._sanitize_video_indexer_log_value(query_error) + redacted_dict = functions_documents._sanitize_video_indexer_log_value(dict_error) + + assert "secret-token" not in redacted_query + assert "accessToken=[REDACTED]" in redacted_query + assert "other=value" in redacted_query + assert "secret-token" not in redacted_dict + assert "[REDACTED]" in redacted_dict + assert "Authentication failed: {str(e)}" not in video_source + assert "AUTH ERROR: {e}" not in video_source + assert "auth failed → {e}" not in video_source + assert "Upload request failed: {str(e)}" not in video_source + assert "Poll request failed: {str(e)}" not in video_source + assert "Upload response text: {resp.text}" not in video_source + assert "No video ID in response: {response_data}" not in video_source + assert "_sanitize_video_indexer_log_value(e)" in video_source + assert "_sanitize_video_indexer_log_value(resp.text)" in video_source + assert 
"_sanitize_video_indexer_log_value(e.response.text)" in video_source + + +def test_video_indexer_auth_errors_redact_access_token(): + """Video Indexer auth response logging should redact token-bearing bodies.""" + print("Testing Video Indexer auth response redaction...") + functions_authentication = import_functions_authentication_for_helper_tests() + source = FUNCTIONS_AUTHENTICATION.read_text(encoding="utf-8") + auth_source = extract_function_source(source, "get_video_indexer_managed_identity_token") + + response_body = '{"accessToken":"secret-token","expiresIn":"3600"}' + query_error = ( + "400 Client Error: Bad Request for url: " + "https://management.example/generateAccessToken?accessToken=secret-token" + ) + + redacted_body = functions_authentication._sanitize_video_indexer_auth_log_value(response_body) + redacted_query = functions_authentication._sanitize_video_indexer_auth_log_value(query_error) + + assert "secret-token" not in redacted_body + assert "secret-token" not in redacted_query + assert "[REDACTED]" in redacted_body + assert "accessToken=[REDACTED]" in redacted_query + assert "ARM API response text: {resp.text}" not in auth_source + assert "ERROR: No accessToken in response: {response_data}" not in auth_source + assert "ERROR in ARM API request: {str(e)}" not in auth_source + assert "Error response text: {e.response.text}" not in auth_source + assert "_sanitize_video_indexer_auth_log_value(resp.text)" in auth_source + assert "_sanitize_video_indexer_auth_log_value(e.response.text)" in auth_source + + +def test_upload_dlp_document_status_aggregates_worst_result(): + """Document DLP summary should preserve the worst observed status.""" + print("Testing upload DLP document status aggregation...") + functions_documents = import_functions_documents_for_helper_tests() + + aggregate = functions_documents._merge_upload_dlp_document_summary( + existing={ + "status": "accepted_with_redactions", + "entity_counts": {"US_SSN": 1}, + "total_replacements": 1, + 
}, + incoming={ + "status": "accepted", + "entity_counts": {}, + "total_replacements": 0, + }, + ) + + assert aggregate["status"] == "accepted_with_redactions" + assert aggregate["entity_counts"]["US_SSN"] == 1 + + +def test_upload_dlp_record_merges_with_existing_document_status(): + """Recording a clean field should not downgrade an earlier redacted document status.""" + print("Testing upload DLP record persistence aggregation...") + functions_documents = import_functions_documents_for_helper_tests() + + updates = [] + original_get_settings = functions_documents.get_settings + original_get_document_metadata = functions_documents.get_document_metadata + original_update_document = functions_documents.update_document + functions_documents.get_settings = lambda: { + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + } + functions_documents.get_document_metadata = lambda **kwargs: { + "dlp_status": "accepted_with_redactions", + "dlp_metadata": { + "status": "accepted_with_redactions", + "entity_counts": {"US_SSN": 1}, + "total_replacements": 1, + "scanner_status": "ok", + }, + } + functions_documents.update_document = lambda **kwargs: updates.append(kwargs) + + try: + functions_documents._record_upload_dlp_result( + { + "status": "accepted", + "sanitized_text": "clean", + "dlp_metadata": { + "status": "accepted", + "entity_counts": {}, + "total_replacements": 0, + "scanner_status": "ok", + }, + }, + user_id="user-1", + document_id="doc-1", + ) + finally: + functions_documents.get_settings = original_get_settings + functions_documents.get_document_metadata = original_get_document_metadata + functions_documents.update_document = original_update_document + + assert updates + assert updates[0]["dlp_status"] == "accepted_with_redactions" + assert updates[0]["dlp_metadata"]["entity_counts"]["US_SSN"] == 1 + + +if __name__ == "__main__": + tests = [ + test_upload_helper_blocks_before_returning_to_ingestion_paths, + 
test_single_chunk_uses_sanitized_text_for_embedding_and_indexing, + test_batch_chunks_use_sanitized_text_for_batch_embeddings_and_indexing, + test_video_chunks_use_sanitized_transcript_and_ocr_text, + test_video_chunks_preserve_public_workspace_scope, + test_video_dlp_block_errors_abort_processing, + test_audio_chunks_preserve_public_workspace_scope, + test_media_processing_logs_do_not_emit_raw_detector_text, + test_upload_dlp_metadata_is_counts_only, + test_upload_dlp_enforcement_disables_enhanced_citation_blob_upload, + test_upload_metadata_sanitizer_redacts_counts_only_metadata, + test_upload_metadata_logs_use_safe_counts_and_lengths, + test_initial_di_metadata_is_sanitized_before_update_callback, + test_video_indexer_upload_params_do_not_log_access_token, + test_video_indexer_request_errors_redact_access_token, + test_video_indexer_auth_errors_redact_access_token, + test_upload_dlp_document_status_aggregates_worst_result, + test_upload_dlp_record_merges_with_existing_document_status, + ] + + failures = [] + for test in tests: + try: + test() + except Exception as exc: + failures.append((test.__name__, exc)) + print(f"Test failed: {test.__name__}: {exc}") + import traceback + + traceback.print_exc() + + if failures: + print(f"{len(failures)} of {len(tests)} upload DLP ingestion integration tests failed.") + sys.exit(1) + + print(f"All {len(tests)} upload DLP ingestion integration tests passed.") + sys.exit(0) diff --git a/functional_tests/test_upload_dlp_redaction.py b/functional_tests/test_upload_dlp_redaction.py new file mode 100644 index 00000000..caebb4e4 --- /dev/null +++ b/functional_tests/test_upload_dlp_redaction.py @@ -0,0 +1,257 @@ +# test_upload_dlp_redaction.py +#!/usr/bin/env python3 +""" +Functional test for upload DLP redaction. 
+Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures upload DLP redacts chunk text before embeddings and Azure AI +Search indexing, hardens raw chunk logs, stores counts-only metadata, and emits +safe upload telemetry/review summaries. +""" + +import ast +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +FUNCTIONS_DOCUMENTS_FILE = os.path.join(APP_DIR, "functions_documents.py") +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def extract_function_source(source_text, function_name): + parsed = ast.parse(source_text, filename=FUNCTIONS_DOCUMENTS_FILE) + for node in ast.walk(parsed): + if isinstance(node, ast.FunctionDef) and node.name == function_name: + return ast.get_source_segment(source_text, node) + raise AssertionError(f"Function {function_name} not found") + + +def test_upload_helper_redacts_or_blocks_with_safe_state(): + """Upload helper should shape redact/block states without raw values.""" + print("Testing upload DLP helper behavior...") + from functions_dlp import evaluate_upload_content + + redact_result = evaluate_upload_content( + f"Chunk contains {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + block_result = evaluate_upload_content( + f"Chunk contains {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "block", + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + + assert redact_result["decision"] == "redact" + assert redact_result["upload_allowed"] is True + assert redact_result["status"] == "accepted_with_redactions" + assert "[REDACTED_US_SSN]" in 
redact_result["sanitized_text"] + assert RAW_VALUE not in repr(redact_result) + + assert block_result["decision"] == "block" + assert block_result["upload_allowed"] is False + assert block_result["status"] == "blocked" + assert block_result["sanitized_text"] == "" + assert RAW_VALUE not in repr(block_result) + + +def test_upload_fail_on_match_overrides_redact_mode(): + """Fail-on-match should block even when mode would otherwise redact.""" + print("Testing upload fail-on-match behavior...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + f"Chunk contains {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "upload_dlp_fail_upload_on_match": True, + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + + assert result["decision"] == "block" + assert result["upload_allowed"] is False + assert result["status"] == "blocked" + assert result["sanitized_text"] == "" + assert RAW_VALUE not in repr(result) + + +def test_upload_dlp_uses_custom_regex_rules(): + """Upload DLP should honor admin-configured regex rules and confidence shaping.""" + print("Testing upload DLP custom regex rules...") + from functions_dlp import evaluate_upload_content + + raw_document_id = "DOC-123456" + result = evaluate_upload_content( + f"Customer document {raw_document_id} is ready", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "document_id", + "label": "Document ID", + "entity_type": "DOCUMENT_ID", + "enabled": True, + "pattern": r"DOC-\d{6}", + "replacement": "[REDACTED_DOCUMENT_ID]", + "surfaces": ["upload"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "low", + "with_keywords": "high", + "keywords": ["document", "customer"], + "window_chars": 32, + "minimum": "high", + }, + } + ], + }, + context={"document_id": "doc-1", 
"workspace_scope": "personal"}, + ) + + assert result["decision"] == "redact" + assert result["upload_allowed"] is True + assert result["status"] == "accepted_with_redactions" + assert result["sanitized_text"] == "Customer document [REDACTED_DOCUMENT_ID] is ready" + assert result["match_counts"] == {"DOCUMENT_ID": 1} + assert raw_document_id not in repr(result) + + +def test_upload_dlp_blocks_when_text_exceeds_scan_limit(): + """Enforced upload DLP should block when text exceeds the scan limit.""" + print("Testing upload DLP scan-limit enforcement...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + "safe prefix " + ("x" * 50) + " 123-45-6789", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 12, + }, + context={"document_id": "doc-scan-limit", "workspace_scope": "personal"}, + ) + + assert result["decision"] == "block" + assert result["upload_allowed"] is False + assert result["status"] == "scanner_failed" + assert result["scanner_status"] == "truncated" + assert result["sanitized_text"] == "" + assert "123-45-6789" not in repr(result) + + +def test_upload_fail_on_match_blocks_truncated_monitor_mode(): + """Fail-on-match is an enforcing upload mode and should block truncated scans.""" + print("Testing upload fail-on-match scan-limit enforcement...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + "safe prefix " + ("x" * 50) + " 123-45-6789", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "monitor", + "upload_dlp_fail_upload_on_match": True, + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 12, + }, + context={"document_id": "doc-scan-limit", "workspace_scope": "personal"}, + ) + + assert result["decision"] == "block" + assert result["upload_allowed"] is False + assert result["status"] == "scanner_failed" 
+ assert result["scanner_status"] == "truncated" + assert result["sanitized_text"] == "" + assert "123-45-6789" not in repr(result) + + +def test_save_chunks_redacts_before_embedding_and_logs_safe_summary(): + """save_chunks should evaluate DLP before generate_embedding and avoid raw logs.""" + print("Testing save_chunks DLP ordering and log safety...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + save_chunks_source = extract_function_source(source, "save_chunks") + + assert "from functions_dlp import" in source + assert "evaluate_upload_content" in source + assert "build_upload_dlp_file_log_summary" in source + assert save_chunks_source.find("_evaluate_upload_dlp_text(") < save_chunks_source.find("generate_embedding(") + assert "generate_embedding(page_text_content)" not in save_chunks_source + assert "page_text_content:{page_text_content}" not in save_chunks_source + assert "chunk_text\": sanitized" in save_chunks_source or "chunk_text\": enhanced_chunk_text" in save_chunks_source + assert "dlp_metadata" in save_chunks_source + + +def test_save_chunks_batch_redacts_before_batch_embedding(): + """save_chunks_batch should sanitize each chunk before batch embeddings.""" + print("Testing save_chunks_batch DLP ordering...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + batch_source = extract_function_source(source, "save_chunks_batch") + + assert batch_source.find("_evaluate_upload_dlp_text(") < batch_source.find("generate_embeddings_batch(") + assert "texts = [c['page_text_content'] for c in chunks_data]" not in batch_source + assert "sanitized_chunks_data" in batch_source + assert "dlp_metadata" in batch_source + + +def test_save_video_chunk_redacts_transcript_and_ocr_before_embedding_and_indexing(): + """save_video_chunk should sanitize transcript and OCR before embedding/search.""" + print("Testing save_video_chunk DLP ordering...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, 
"save_video_chunk") + + assert video_source.find("_evaluate_upload_dlp_text(") < video_source.find("generate_embedding(") + assert "generate_embedding(page_text_content)" not in video_source + assert '"chunk_text": sanitized_transcript_text' in video_source + assert '"video_ocr_chunk_text": sanitized_ocr_text' in video_source + + +if __name__ == "__main__": + tests = [ + test_upload_helper_redacts_or_blocks_with_safe_state, + test_upload_fail_on_match_overrides_redact_mode, + test_upload_dlp_uses_custom_regex_rules, + test_upload_dlp_blocks_when_text_exceeds_scan_limit, + test_upload_fail_on_match_blocks_truncated_monitor_mode, + test_save_chunks_redacts_before_embedding_and_logs_safe_summary, + test_save_chunks_batch_redacts_before_batch_embedding, + test_save_video_chunk_redacts_transcript_and_ocr_before_embedding_and_indexing, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} upload DLP redaction tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_upload_dlp_workspace_scopes.py b/functional_tests/test_upload_dlp_workspace_scopes.py new file mode 100644 index 00000000..f9b1109c --- /dev/null +++ b/functional_tests/test_upload_dlp_workspace_scopes.py @@ -0,0 +1,79 @@ +# test_upload_dlp_workspace_scopes.py +#!/usr/bin/env python3 +""" +Functional test for upload DLP workspace scope coverage. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures personal, group, public, and external public upload routes +continue using the shared document processing path protected by upload DLP. 
+""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") + + +ROUTE_FILES = { + "personal": os.path.join(APP_DIR, "route_backend_documents.py"), + "group": os.path.join(APP_DIR, "route_backend_group_documents.py"), + "public": os.path.join(APP_DIR, "route_backend_public_documents.py"), + "external_public": os.path.join(APP_DIR, "route_external_public_documents.py"), +} +FUNCTIONS_DOCUMENTS_FILE = os.path.join(APP_DIR, "functions_documents.py") + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def test_upload_routes_remain_present_for_all_workspace_scopes(): + """All supported upload route files should expose upload endpoints.""" + print("Testing upload route coverage...") + expectations = { + "personal": "/api/documents/upload", + "group": "/api/group_documents/upload", + "public": "/api/public_documents/upload", + "external_public": "/external/public_documents/upload", + } + + for scope, route_file in ROUTE_FILES.items(): + source = read_file_text(route_file) + assert expectations[scope] in source, f"Missing upload route for {scope}" + assert "process_document_upload_background" in source, f"{scope} route does not use shared processing" + + +def test_shared_processing_path_carries_workspace_scope_to_upload_dlp(): + """functions_documents should map upload DLP context for personal/group/public scopes.""" + print("Testing upload DLP workspace context...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + + assert "workspace_scope" in source + assert '"personal"' in source + assert '"group"' in source + assert '"public"' in source + assert "public_workspace_id" in source + assert "group_id" in source + + +if __name__ == "__main__": + tests = [ + test_upload_routes_remain_present_for_all_workspace_scopes, + test_shared_processing_path_carries_workspace_scope_to_upload_dlp, + ] + + 
try: + for test in tests: + test() + print(f"All {len(tests)} upload DLP workspace scope tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_web_search_current_message_only.py b/functional_tests/test_web_search_current_message_only.py index 409ba628..7a934690 100644 --- a/functional_tests/test_web_search_current_message_only.py +++ b/functional_tests/test_web_search_current_message_only.py @@ -1,7 +1,7 @@ # test_web_search_current_message_only.py """ Functional test for current-message-only web search egress. -Version: 0.241.046 +Version: 0.242.069 Implemented in: 0.241.008 This test ensures external web search uses only the current user message, @@ -66,7 +66,10 @@ def test_perform_web_search_uses_explicit_outbound_query_and_empty_metadata(): perform_source = extract_function_source(source, 'perform_web_search') assert 'web_search_query_text,' in perform_source - assert 'query_text = (web_search_query_text or user_message or "").strip()' in perform_source + assert 'web_search_query_text or user_message' not in perform_source + assert 'query_text = (web_search_query_text or "").strip()' in perform_source + assert 'debug_print("[WebSearch] Empty approved web-search query; skipping Foundry call")' in perform_source + assert 'return True # Not an error, just empty approved query' in perform_source assert 'foundry_metadata = {}' in perform_source metadata_block = perform_source.split('foundry_metadata = {}', 1)[1].split( @@ -148,4 +151,4 @@ def test_web_search_adds_foundry_citations_to_source_review_seeds(): test() print(f'\n📊 Results: {len(tests)}/{len(tests)} tests passed') - sys.exit(0) \ No newline at end of file + sys.exit(0) diff --git a/functional_tests/test_web_search_dlp_egress.py b/functional_tests/test_web_search_dlp_egress.py new file mode 100644 index 00000000..005881fb --- /dev/null +++ 
b/functional_tests/test_web_search_dlp_egress.py @@ -0,0 +1,252 @@ +# test_web_search_dlp_egress.py +#!/usr/bin/env python3 +""" +Functional test for web-search DLP egress. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures web-search DLP runs after current-message query construction +and before Foundry web-search execution, blocks sensitive egress, redacts when +configured, and avoids raw query debug logging when DLP is enabled. +""" + +import ast +import os +import sys +from unittest.mock import patch + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +ROUTE_FILE = os.path.join(APP_DIR, "route_backend_chats.py") +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def extract_function_source(source_text, function_name): + parsed = ast.parse(source_text, filename=ROUTE_FILE) + for node in ast.walk(parsed): + if isinstance(node, ast.FunctionDef) and node.name == function_name: + return ast.get_source_segment(source_text, node) + raise AssertionError(f"Function {function_name} not found") + + +def test_route_imports_and_calls_dlp_before_web_search(): + """Both chat paths should evaluate DLP before perform_web_search.""" + print("Testing web-search DLP call ordering...") + source = read_file_text(ROUTE_FILE) + + assert "from functions_dlp import evaluate_web_search_egress" in source + assert source.count("evaluate_web_search_egress(") >= 2 + + non_stream_slice = source[source.find("web_search_query_text = build_web_search_query_text(user_message)"):] + non_stream_slice = non_stream_slice[:non_stream_slice.find("perform_web_search(") + len("perform_web_search(")] + assert "evaluate_web_search_egress(" in non_stream_slice + + streaming_start = source.find("def chat_stream_api") + streaming_source = source[streaming_start:] + streaming_slice 
= streaming_source[ + streaming_source.find("web_search_query_text = build_web_search_query_text(user_message)") : + ] + streaming_slice = streaming_slice[:streaming_slice.find("perform_web_search(") + len("perform_web_search(")] + assert "evaluate_web_search_egress(" in streaming_slice + + +def test_dlp_helper_blocks_or_redacts_web_search_text(): + """DLP helper should block or redact web-search egress text.""" + print("Testing web-search DLP helper behavior...") + from functions_dlp import evaluate_web_search_egress + + block_result = evaluate_web_search_egress( + f"Search the web for employee SSN {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "block", + }, + context={"chat_type": "user"}, + ) + redact_result = evaluate_web_search_egress( + f"Search the web for employee SSN {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + }, + context={"chat_type": "user"}, + ) + + assert block_result["decision"] == "block" + assert block_result["web_search_allowed"] is False + assert block_result["status_message"] == ( + "Web search was blocked because the message appears to contain non-public information." + ) + assert redact_result["decision"] == "redact" + assert redact_result["web_search_allowed"] is True + assert "[REDACTED_US_SSN]" in redact_result["web_search_query_text"] + assert RAW_VALUE not in redact_result["web_search_query_text"] + assert redact_result["status_message"] == "Sensitive details were removed before web search." 
+ + +def test_custom_regex_rule_can_redact_web_search_in_redact_mode(): + """Custom regex rules can redact web search when a high-confidence policy rule matches.""" + print("Testing custom regex rule web-search behavior...") + from functions_dlp import evaluate_web_search_egress + + result = evaluate_web_search_egress( + "Search employee EID-123456", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "employee_id", + "label": "Employee ID", + "entity_type": "EMPLOYEE_ID", + "enabled": True, + "pattern": r"EID-\d{6}", + "replacement": "[REDACTED_EMPLOYEE_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "low", + "with_keywords": "high", + "keywords": ["employee"], + "window_chars": 32, + "minimum": "high" + } + } + ], + }, + context={"chat_type": "user"}, + ) + + assert result["decision"] == "redact" + assert result["web_search_allowed"] is True + assert result["web_search_query_text"] == "Search employee [REDACTED_EMPLOYEE_ID]" + assert "EID-123456" not in repr(result) + + +def test_scanner_error_fails_closed_by_default(): + """Scanner errors must not allow web-search egress by default.""" + import functions_dlp + + def fail_scan(text, settings, surface="generic"): + raise RuntimeError("scanner unavailable") + + with patch.object(functions_dlp, "_apply_regex_engine", fail_scan): + result = functions_dlp.evaluate_dlp_text( + "send 123-45-6789 to web", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_default_engine": "regex", + }, + surface="web_search", + ) + egress_result = functions_dlp.evaluate_web_search_egress( + "send 123-45-6789 to web", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_default_engine": "regex", + }, + context={"chat_type": "user"}, + ) + + 
assert result["scanner_status"] == "error" + assert egress_result["web_search_allowed"] is False + assert egress_result["web_search_query_text"] == "" + assert "123-45-6789" not in repr(egress_result) + assert result["decision"] == "block" + assert result["text"] == "" + + +def test_blocked_status_continues_normal_chat_without_foundry_web_search(): + """Route source should add safe augmentation instead of calling web search when blocked.""" + print("Testing blocked web-search safe augmentation...") + source = read_file_text(ROUTE_FILE) + assert "Web search was blocked because the message appears to contain non-public information." in source + assert "Sensitive details were removed before web search." in source + assert "web_search_allowed" in source + + +def test_perform_web_search_debug_logging_masks_dlp_queries(): + """perform_web_search should avoid raw query/result debug logging when DLP is enabled.""" + print("Testing web-search debug logging safety...") + source = read_file_text(ROUTE_FILE) + perform_source = extract_function_source(source, "perform_web_search") + citation_source = extract_function_source(source, "_extract_web_search_citations_from_content") + + forbidden = [ + "web_search_query_text[:100]", + "user_message[:100]", + "query_text[:100]", + "result.message[:500]", + "json.dumps(cit", + "json.dumps(citation", + "metadata_payload", + "Metadata: {result.metadata}", + "'search_query': query_text", + '"search_query": query_text', + "Adding agent citation with title", + "Foundry agent invocation failed: {exc}", + "Unexpected error invoking Foundry agent: {exc}", + "Web search failed with error: {exc}", + "Web search failed with an unexpected error: {exc}", + "exceptionTraceback=True", + "Failed to log web search token usage: {log_error}", + ] + for snippet in forbidden: + assert snippet not in perform_source, f"Unsafe debug logging remains: {snippet}" + + assert "Extracting citations from:\\n{content}" not in citation_source + assert " - {citations}" 
not in citation_source + assert "dlp" in perform_source.lower() + assert "query_length" in perform_source or "text_length" in perform_source + assert "search_query_length" in perform_source + + +def test_token_usage_extraction_logs_metadata_shape_only(): + """Token usage validation should not log raw provider usage metadata.""" + print("Testing web-search token usage extraction log safety...") + source = read_file_text(ROUTE_FILE) + token_source = extract_function_source(source, "_extract_token_usage_from_metadata") + + assert "usage={usage}" not in token_source + assert "usage_keys={list(usage.keys())}" in token_source + + +if __name__ == "__main__": + tests = [ + test_route_imports_and_calls_dlp_before_web_search, + test_dlp_helper_blocks_or_redacts_web_search_text, + test_custom_regex_rule_can_redact_web_search_in_redact_mode, + test_scanner_error_fails_closed_by_default, + test_blocked_status_continues_normal_chat_without_foundry_web_search, + test_perform_web_search_debug_logging_masks_dlp_queries, + test_token_usage_extraction_logs_metadata_shape_only, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} web-search DLP egress tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_web_search_dlp_route_integration.py b/functional_tests/test_web_search_dlp_route_integration.py new file mode 100644 index 00000000..4c249aeb --- /dev/null +++ b/functional_tests/test_web_search_dlp_route_integration.py @@ -0,0 +1,143 @@ +# test_web_search_dlp_route_integration.py +#!/usr/bin/env python3 +""" +Functional test for web-search DLP route integration. +Version: 0.242.069 +Implemented in: 0.242.069 + +This test ensures chat routes evaluate DLP before Foundry web search, suppress +Foundry calls on block, and send only the redacted query on redact. 
+""" + +import os +import sys +from pathlib import Path + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +ROUTE_FILE = os.path.join(APP_DIR, "route_backend_chats.py") +ROUTE_BACKEND_CHATS = Path(ROUTE_FILE) + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def web_search_dlp_blocks(source_text): + marker = "web_search_dlp_result = evaluate_web_search_egress(" + blocks = [] + start = 0 + while True: + marker_index = source_text.find(marker, start) + if marker_index == -1: + break + perform_index = source_text.find("perform_web_search(", marker_index) + if perform_index == -1: + raise AssertionError("Found DLP evaluation without a later perform_web_search call") + block_start = source_text.rfind("if web_search_enabled:", 0, marker_index) + block_end = source_text.find(")", perform_index) + blocks.append(source_text[block_start:block_end]) + start = perform_index + len("perform_web_search(") + return blocks + + +def extract_top_level_function_source(source_text, function_name): + marker = f"def {function_name}(" + start = source_text.find(marker) + if start == -1: + raise AssertionError(f"Function {function_name} not found") + + next_function = source_text.find("\ndef ", start + len(marker)) + if next_function == -1: + return source_text[start:] + return source_text[start:next_function] + + +def test_dlp_guard_exists_in_both_chat_paths(): + """Both streaming and non-streaming routes should have a DLP-guarded web-search block.""" + print("Testing web-search DLP guarded blocks...") + source = read_file_text(ROUTE_FILE) + blocks = web_search_dlp_blocks(source) + + assert len(blocks) == 2, f"Expected two web-search DLP route blocks, found {len(blocks)}" + + +def test_blocked_dlp_result_suppresses_foundry_call(): + """Blocked DLP decisions should append a safe system message instead of calling Foundry.""" + 
print("Testing blocked DLP route behavior...") + source = read_file_text(ROUTE_FILE) + + for block in web_search_dlp_blocks(source): + dlp_index = block.find("web_search_dlp_result = evaluate_web_search_egress(") + block_decision_index = block.find('if not web_search_dlp_result.get("web_search_allowed", True):') + blocked_status_index = block.find("WEB_SEARCH_DLP_BLOCKED_STATUS") + else_index = block.find("else:", block_decision_index) + perform_index = block.find("perform_web_search(") + + assert dlp_index != -1 + assert block_decision_index > dlp_index + assert blocked_status_index > block_decision_index + assert else_index > blocked_status_index + assert perform_index > else_index, "Foundry web search must stay in the allowed else branch" + + +def test_redacted_query_is_forwarded_to_foundry(): + """Allowed/redacted DLP decisions should replace the query before Foundry invocation.""" + print("Testing redacted query forwarding...") + source = read_file_text(ROUTE_FILE) + + expected_assignment = ( + 'web_search_query_text = web_search_dlp_result.get("web_search_query_text", web_search_query_text)' + ) + for block in web_search_dlp_blocks(source): + assignment_index = block.find(expected_assignment) + perform_index = block.find("perform_web_search(") + query_argument_index = block.find("web_search_query_text=web_search_query_text", perform_index) + + assert assignment_index != -1, "Route must replace raw query with DLP-safe query" + assert perform_index > assignment_index, "Foundry call must occur after DLP-safe query assignment" + assert query_argument_index > perform_index, "Foundry call must receive the DLP-safe query variable" + + +def test_route_emits_counts_only_dlp_telemetry(): + """Route telemetry should use the shared counts-only telemetry builder.""" + print("Testing route DLP telemetry integration...") + source = read_file_text(ROUTE_FILE) + + for block in web_search_dlp_blocks(source): + assert "should_emit_dlp_telemetry(web_search_dlp_result, settings)" 
in block + assert "build_dlp_telemetry_properties(" in block + assert 'surface="web_search"' in block + assert "raw_matches" not in block + + +def test_perform_web_search_does_not_fallback_to_raw_user_message(): + source = ROUTE_BACKEND_CHATS.read_text(encoding="utf-8") + perform_source = extract_top_level_function_source(source, "perform_web_search") + + assert "web_search_query_text or user_message" not in perform_source + assert "query_text = (web_search_query_text or \"\").strip()" in perform_source + + +if __name__ == "__main__": + tests = [ + test_dlp_guard_exists_in_both_chat_paths, + test_blocked_dlp_result_suppresses_foundry_call, + test_redacted_query_is_forwarded_to_foundry, + test_route_emits_counts_only_dlp_telemetry, + test_perform_web_search_does_not_fallback_to_raw_user_message, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} web-search DLP route integration tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/tools/local_dev/render_dlp_admin_preview.py b/tools/local_dev/render_dlp_admin_preview.py new file mode 100644 index 00000000..d70e13ca --- /dev/null +++ b/tools/local_dev/render_dlp_admin_preview.py @@ -0,0 +1,91 @@ +# render_dlp_admin_preview.py +#!/usr/bin/env python3 +""" +Extract the DLP admin settings card from a captured SimpleChat admin page. + +Usage: + python tools/local_dev/render_dlp_admin_preview.py .codex-local/admin-settings.html .codex-local +""" + +import sys +from pathlib import Path + +from bs4 import BeautifulSoup + + +def _wrap_preview(section_html): + return f""" + + + + + + + + + +
+{section_html} +
+ + +""" + + +def _expand_dlp_controls(section): + for checkbox_id in [ + "enable_dlp_control_plane", + "enable_web_search_dlp", + "enable_upload_dlp", + ]: + node = section.select_one(f"#{checkbox_id}") + if node: + node["checked"] = "" + + for visible_id in [ + "dlp_control_plane_settings", + "web_search_dlp_mode_settings", + "upload_dlp_mode_settings", + ]: + node = section.select_one(f"#{visible_id}") + if node and node.has_attr("class"): + node["class"] = [class_name for class_name in node.get("class", []) if class_name != "d-none"] + + +def render_previews(source_path, output_dir): + source_html = source_path.read_text(encoding="utf-8") + soup = BeautifulSoup(source_html, "html.parser") + section = soup.select_one("#dlp-section") + if section is None: + raise ValueError("Could not find #dlp-section in captured admin settings HTML.") + + output_dir.mkdir(parents=True, exist_ok=True) + collapsed_path = output_dir / "admin-dlp-preview.html" + expanded_path = output_dir / "admin-dlp-preview-expanded.html" + + collapsed_path.write_text(_wrap_preview(str(section)), encoding="utf-8") + _expand_dlp_controls(section) + expanded_path.write_text(_wrap_preview(str(section)), encoding="utf-8") + return collapsed_path, expanded_path + + +def main(argv): + if len(argv) != 3: + print("Usage: render_dlp_admin_preview.py ") + return 2 + + source_path = Path(argv[1]) + output_dir = Path(argv[2]) + collapsed_path, expanded_path = render_previews(source_path, output_dir) + print(f"Wrote {collapsed_path}") + print(f"Wrote {expanded_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tools/local_dev/run_dlp_local_stack.md b/tools/local_dev/run_dlp_local_stack.md new file mode 100644 index 00000000..c346afa3 --- /dev/null +++ b/tools/local_dev/run_dlp_local_stack.md @@ -0,0 +1,111 @@ +# DLP Local Stack Smoke Runbook + +## Purpose + +Use this runbook to render the DLP admin settings UI against a disposable local Cosmos DB emulator without using 
Azure-hosted Cosmos. + +## Ports + +- Cosmos gateway: `9081` +- Cosmos health: `9082` +- Cosmos explorer: `1235` +- SimpleChat Flask dev server: `5000` + +Port `8081` is intentionally avoided because local proxy tools may already bind it. + +## Start Cosmos + +```bash +docker run --detach --name simplechat-cosmos-dlp --publish 9081:8081 --publish 9082:8080 --publish 1235:1234 mcr.microsoft.com/cosmosdb/linux/azure-cosmos-emulator:vnext-latest --gateway-endpoint localhost:9081 +``` + +## Verify Cosmos + +```bash +curl.exe -sS http://localhost:9082/status +``` + +The health endpoint should show PostgreSQL and Explorer as healthy. When +`--gateway-endpoint localhost:9081` is used, the container-internal gateway +probe can report unhealthy because it checks the host-advertised port from +inside the container. Use the SDK smoke test below as the authoritative check. + +## SDK Smoke Test + +Run this from the repository root after the Python environment is created: + +```bash +.venv\Scripts\python.exe -c "from azure.cosmos import CosmosClient, PartitionKey; key='<cosmos-emulator-key>'; c=CosmosClient('http://localhost:9081/', credential=key); db=c.create_database_if_not_exists('SimpleChatSmoke'); con=db.create_container_if_not_exists(id='smoke', partition_key=PartitionKey(path='/id')); con.upsert_item({'id':'ok','value':1}); print(con.read_item('ok', partition_key='ok')['value'])" +``` + +Expected output: + +```text +1 +``` + +## Python Environment + +```bash +python -m venv .venv +.venv\Scripts\python.exe -m pip install --upgrade pip +.venv\Scripts\python.exe -m pip install -r application\single_app\requirements.txt +``` + +## Start SimpleChat + +Set these environment variables before launch: + +```dotenv +AZURE_COSMOS_ENDPOINT=http://localhost:9081/ +AZURE_COSMOS_KEY=<cosmos-emulator-key> +AZURE_COSMOS_AUTHENTICATION_TYPE=key +NO_PROXY=localhost,127.0.0.1,::1 +no_proxy=localhost,127.0.0.1,::1 +FLASK_DEBUG=1 +SIMPLECHAT_USE_GUNICORN=0 +SIMPLECHAT_RUN_BACKGROUND_TASKS=0 +DISABLE_FLASK_INSTRUMENTATION=1 
+CLIENT_ID=local-dev-client +TENANT_ID=local-dev-tenant +MICROSOFT_PROVIDER_AUTHENTICATION_SECRET=local-dev-secret +``` + +Then run: + +```bash +cd application\single_app +..\..\.venv\Scripts\python.exe app.py +``` + +Open: + +```text +https://localhost:5000 +``` + +## Capture The DLP Admin Card + +After authenticating as an admin user, save the rendered admin settings page: + +```bash +curl.exe -k -sS -H "Cookie: session=" https://localhost:5000/admin/settings -o .codex-local/admin-settings.html +``` + +Then extract the DLP section for a focused visual review: + +```bash +python tools/local_dev/render_dlp_admin_preview.py .codex-local/admin-settings.html .codex-local +``` + +The script writes: + +- `.codex-local/admin-dlp-preview.html` +- `.codex-local/admin-dlp-preview-expanded.html` + +## Known Local Caveats + +- Browser automation may be blocked by Windows group policy. +- If Docker Desktop stops, the Flask process can keep serving cached pages while Cosmos requests fail. +- If another tool owns port `8081`, use `9081` and pass `--gateway-endpoint localhost:9081`. +- Keep `.codex-local/` untracked; it is for local smoke artifacts only. 
From 0f7e17ba853812759fe2eaae785b7ce6ae720954 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 10:55:34 -0400 Subject: [PATCH 02/20] fix: adapt dlp control plane to current development --- application/single_app/functions_dlp.py | 6 +++--- .../features/DLP_WEB_SEARCH_EGRESS_CONTROL.md | 2 +- functional_tests/test_dlp_admin_ui_smoke.py | 6 +++--- functional_tests/test_dlp_control_plane.py | 16 ++++++++-------- .../test_upload_dlp_ingestion_integration.py | 16 ++++++++-------- tools/local_dev/run_dlp_local_stack.md | 2 +- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/application/single_app/functions_dlp.py b/application/single_app/functions_dlp.py index 6f764731..67beda74 100644 --- a/application/single_app/functions_dlp.py +++ b/application/single_app/functions_dlp.py @@ -97,8 +97,8 @@ def _decision_from_counts(match_counts, mode): return "monitor" -def normalize_presidio_results(text, recognizer_results, mode="redact", engine="presidio_service"): - """Normalize Presidio-style entity offsets into the shared counts-only result.""" +def normalize_external_analyzer_results(text, recognizer_results, mode="redact", engine="external_analyzer"): + """Normalize external analyzer entity offsets into the shared counts-only result.""" source_text = str(text or "") sorted_results = sorted( [ @@ -137,7 +137,7 @@ def normalize_presidio_results(text, recognizer_results, mode="redact", engine=" "total_replacements": sum(counts.values()), "match_counts": counts, "matches": [{"entity_type": key, "count": value} for key, value in counts.items()], - "metadata": {"adapter": "presidio"}, + "metadata": {"adapter": "external_analyzer"}, "scanner_status": "ok", } diff --git a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md index e6d0c16a..1cfa88ad 100644 --- a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md +++ 
b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md @@ -109,7 +109,7 @@ Telemetry retention follows the configured Application Insights workspace. This ## Limitations -Regex DLP is intentionally lightweight. It is useful for structured identifiers such as SSNs, Luhn-valid credit card numbers, and administrator-defined exact-format identifiers, but it is weaker for names, addresses, contextual PII, international identifiers, secrets, and noisy prose. +Regex DLP is intentionally lightweight. It is useful for structured identifiers such as SSNs, Luhn-valid credit card numbers, and administrator-defined exact-format identifiers, but it is weaker for names, addresses, contextual PII, international identifiers, credential strings, and noisy prose. The app-level control cannot inspect Bing's internal grounding query after Foundry receives the request. It reduces egress risk by preventing or redacting sensitive text before the app sends the web-search message to the Foundry agent. diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py index 846566c7..975e0e3b 100644 --- a/functional_tests/test_dlp_admin_ui_smoke.py +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -95,8 +95,8 @@ def test_expanded_dlp_admin_preview_contains_expected_controls(): assert '
' not in expanded_html -def test_dlp_admin_preview_does_not_expose_raw_secret_values(): - """Preview files should include controls, not populated secrets or raw detector matches.""" +def test_dlp_admin_preview_does_not_expose_raw_sensitive_values(): + """Preview files should include controls, not populated credentials or raw detector matches.""" print("Testing DLP admin preview safety...") module = load_preview_module() @@ -120,7 +120,7 @@ def test_dlp_admin_preview_does_not_expose_raw_secret_values(): tests = [ test_dlp_admin_preview_extractor_writes_collapsed_and_expanded_files, test_expanded_dlp_admin_preview_contains_expected_controls, - test_dlp_admin_preview_does_not_expose_raw_secret_values, + test_dlp_admin_preview_does_not_expose_raw_sensitive_values, ] try: diff --git a/functional_tests/test_dlp_control_plane.py b/functional_tests/test_dlp_control_plane.py index 8db4cb69..53386213 100644 --- a/functional_tests/test_dlp_control_plane.py +++ b/functional_tests/test_dlp_control_plane.py @@ -7,7 +7,7 @@ This test ensures the shared DLP core supports disabled, regex, Luhn-validated credit-card, counts-only metadata, ReDoS-resistant scanning, and optional -Presidio service normalization without persisting raw matched values. +external analyzer normalization without persisting raw matched values. 
""" import os @@ -171,19 +171,19 @@ def fail_scan(text, settings, surface="generic"): assert "tail" not in repr(result) -def test_presidio_service_shape_normalizes_counts_without_raw_values(): - """Optional Presidio service results should normalize into the shared shape.""" - print("Testing Presidio service adapter normalization...") - from functions_dlp import normalize_presidio_results +def test_external_analyzer_shape_normalizes_counts_without_raw_values(): + """Optional external analyzer results should normalize into the shared shape.""" + print("Testing external analyzer adapter normalization...") + from functions_dlp import normalize_external_analyzer_results - normalized = normalize_presidio_results( + normalized = normalize_external_analyzer_results( text=f"Alice Example has SSN {RAW_SSN}.", recognizer_results=[ {"entity_type": "PERSON", "start": 0, "end": 13, "score": 0.88}, {"entity_type": "US_SSN", "start": 22, "end": 33, "score": 0.99}, ], mode="redact", - engine="presidio_service", + engine="external_analyzer", ) assert normalized["decision"] == "redact" @@ -201,7 +201,7 @@ def test_presidio_service_shape_normalizes_counts_without_raw_values(): test_regex_scan_is_bounded_on_long_non_matching_input, test_enforced_dlp_blocks_when_text_exceeds_scan_limit, test_enforced_truncation_blocks_before_scanner_error_fail_open, - test_presidio_service_shape_normalizes_counts_without_raw_values, + test_external_analyzer_shape_normalizes_counts_without_raw_values, ] try: diff --git a/functional_tests/test_upload_dlp_ingestion_integration.py b/functional_tests/test_upload_dlp_ingestion_integration.py index 6631bfe6..15b2befd 100644 --- a/functional_tests/test_upload_dlp_ingestion_integration.py +++ b/functional_tests/test_upload_dlp_ingestion_integration.py @@ -465,17 +465,17 @@ def test_video_indexer_request_errors_redact_access_token(): query_error = ( "403 Client Error: Forbidden for url: " - "https://video.example/Index?accessToken=secret-token&other=value" + 
"https://video.example/Index?accessToken=opaque-token&other=value" ) - dict_error = "{'accessToken': 'secret-token', 'name': 'example.mp4'}" + dict_error = "{'accessToken': 'opaque-token', 'name': 'example.mp4'}" redacted_query = functions_documents._sanitize_video_indexer_log_value(query_error) redacted_dict = functions_documents._sanitize_video_indexer_log_value(dict_error) - assert "secret-token" not in redacted_query + assert "opaque-token" not in redacted_query assert "accessToken=[REDACTED]" in redacted_query assert "other=value" in redacted_query - assert "secret-token" not in redacted_dict + assert "opaque-token" not in redacted_dict assert "[REDACTED]" in redacted_dict assert "Authentication failed: {str(e)}" not in video_source assert "AUTH ERROR: {e}" not in video_source @@ -496,17 +496,17 @@ def test_video_indexer_auth_errors_redact_access_token(): source = FUNCTIONS_AUTHENTICATION.read_text(encoding="utf-8") auth_source = extract_function_source(source, "get_video_indexer_managed_identity_token") - response_body = '{"accessToken":"secret-token","expiresIn":"3600"}' + response_body = '{"accessToken":"opaque-token","expiresIn":"3600"}' query_error = ( "400 Client Error: Bad Request for url: " - "https://management.example/generateAccessToken?accessToken=secret-token" + "https://management.example/generateAccessToken?accessToken=opaque-token" ) redacted_body = functions_authentication._sanitize_video_indexer_auth_log_value(response_body) redacted_query = functions_authentication._sanitize_video_indexer_auth_log_value(query_error) - assert "secret-token" not in redacted_body - assert "secret-token" not in redacted_query + assert "opaque-token" not in redacted_body + assert "opaque-token" not in redacted_query assert "[REDACTED]" in redacted_body assert "accessToken=[REDACTED]" in redacted_query assert "ARM API response text: {resp.text}" not in auth_source diff --git a/tools/local_dev/run_dlp_local_stack.md b/tools/local_dev/run_dlp_local_stack.md index 
c346afa3..a75c3b21 100644 --- a/tools/local_dev/run_dlp_local_stack.md +++ b/tools/local_dev/run_dlp_local_stack.md @@ -68,7 +68,7 @@ SIMPLECHAT_RUN_BACKGROUND_TASKS=0 DISABLE_FLASK_INSTRUMENTATION=1 CLIENT_ID=local-dev-client TENANT_ID=local-dev-tenant -MICROSOFT_PROVIDER_AUTHENTICATION_SECRET=local-dev-secret +MICROSOFT_PROVIDER_AUTHENTICATION_SECRET=replace-me ``` Then run: From ec38c96c8fd36bf290cfe89cda4b886b30976ca5 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 11:16:22 -0400 Subject: [PATCH 03/20] fix: close dlp egress bypasses --- application/single_app/functions_documents.py | 3 +- application/single_app/route_backend_chats.py | 52 ++++++- .../test_upload_dlp_ingestion_integration.py | 92 +++++++++++- .../test_web_search_dlp_egress.py | 142 +++++++++++++++++- 4 files changed, 282 insertions(+), 7 deletions(-) diff --git a/application/single_app/functions_documents.py b/application/single_app/functions_documents.py index 041c5054..f72c21ba 100644 --- a/application/single_app/functions_documents.py +++ b/application/single_app/functions_documents.py @@ -3026,7 +3026,7 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w embedding, token_usage = embedding_results[idx] page_number = chunk_info['page_number'] file_name = chunk_info['file_name'] - page_text_content = chunk_info['page_text_content'] + enhanced_chunk_text = chunk_info['page_text_content'] dlp_metadata = chunk_info.get('dlp_metadata') if token_usage: @@ -3036,7 +3036,6 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w total_token_usage['model_deployment_name'] = token_usage.get('model_deployment_name') chunk_id = f"{document_id}_{page_number}" - enhanced_chunk_text = page_text_content + vision_text if vision_text else page_text_content if is_public_workspace: chunk_document = { diff --git a/application/single_app/route_backend_chats.py b/application/single_app/route_backend_chats.py index 2518a7ff..c0b56a69 
100644 --- a/application/single_app/route_backend_chats.py +++ b/application/single_app/route_backend_chats.py @@ -20450,11 +20450,24 @@ def perform_research_web_searches( """Run one or more current-message-only web searches for normal or Deep Research mode.""" web_search_runs = [] query_plan = {} + dlp_context = { + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + } if deep_research_enabled: + planner_user_message = user_message + if ( + settings.get("enable_dlp_control_plane", False) + and settings.get("enable_web_search_dlp", False) + and str(settings.get("web_search_dlp_mode", "monitor") or "monitor").lower() == "redact" + ): + planner_user_message = web_search_query_text query_plan = build_deep_research_query_plan( settings=settings, - user_message=user_message, + user_message=planner_user_message, base_query=web_search_query_text, planner_client=deep_research_planner_client, planner_model=deep_research_planner_model, @@ -20484,6 +20497,43 @@ def perform_research_web_searches( search_label = None if deep_research_enabled: search_label = f"Deep Research query {query_index}/{total_queries}" + query_dlp_result = evaluate_web_search_egress( + query_text, + settings=settings, + context={ + **dlp_context, + "deep_research_query_index": query_index, + "deep_research_query_count": total_queries, + }, + ) + if should_emit_dlp_telemetry(query_dlp_result, settings): + log_event( + "[DLP] Web search egress decision", + extra=build_dlp_telemetry_properties( + query_dlp_result, + surface="web_search", + context={ + **dlp_context, + "deep_research_query_index": query_index, + "deep_research_query_count": total_queries, + }, + ), + ) + if not query_dlp_result.get("web_search_allowed", True): + query_item['query'] = "" + if not any( + message.get("content") == WEB_SEARCH_DLP_BLOCKED_STATUS + for message in system_messages_for_augmentation + if isinstance(message, dict) + ): + 
system_messages_for_augmentation.append({ + "role": "system", + "content": WEB_SEARCH_DLP_BLOCKED_STATUS, + }) + continue + + query_text = query_dlp_result.get("web_search_query_text", query_text) + query_item['query'] = query_text perform_web_search( settings=settings, conversation_id=conversation_id, diff --git a/functional_tests/test_upload_dlp_ingestion_integration.py b/functional_tests/test_upload_dlp_ingestion_integration.py index 15b2befd..6f06833d 100644 --- a/functional_tests/test_upload_dlp_ingestion_integration.py +++ b/functional_tests/test_upload_dlp_ingestion_integration.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for upload DLP ingestion integration. -Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.070 +Implemented in: 0.242.070 This test ensures upload DLP blocks stop before embeddings/search indexing and redacted text is the only text passed into embedding/index payload construction. @@ -13,6 +13,7 @@ import os import sys import types +from datetime import datetime, timezone from pathlib import Path from typing import List @@ -58,6 +59,8 @@ def import_functions_documents_for_helper_tests(): "azure.cognitiveservices.speech": types.ModuleType("azure.cognitiveservices.speech"), } stub_modules["config"].List = List + stub_modules["config"].datetime = datetime + stub_modules["config"].timezone = timezone stub_modules["functions_settings"].get_settings = lambda: {} stub_modules["functions_logging"].add_file_task_to_file_processing_log = lambda **kwargs: None stub_modules["functions_logging"].log_event = lambda *args, **kwargs: None @@ -182,6 +185,90 @@ def test_batch_chunks_use_sanitized_text_for_batch_embeddings_and_indexing(): assert "dlp_metadata" in batch_source +def test_batch_chunk_vision_text_is_not_reappended_after_dlp_redaction(): + """save_chunks_batch should index sanitized chunk text without raw vision text.""" + print("Testing batch chunk vision text DLP redaction before indexing...") + functions_documents = 
import_functions_documents_for_helper_tests() + + uploaded_batches = [] + embedded_texts = [] + + class FakeSearchClient: + def upload_documents(self, documents): + uploaded_batches.append(documents) + + original_get_settings = functions_documents.get_settings + original_get_document_metadata = functions_documents.get_document_metadata + original_update_document = getattr(functions_documents, "update_document", None) + original_clients = getattr(functions_documents, "CLIENTS", None) + original_functions_content = sys.modules.get("functions_content") + + functions_documents.get_settings = lambda: { + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 200000, + } + functions_documents.get_document_metadata = lambda **kwargs: { + "version": 1, + "authors": ["Author"], + "title": "Document", + "document_classification": "None", + "tags": [], + "shared_user_ids": [], + "vision_analysis": { + "model": "vision-model", + "text": f"badge SSN {RAW_VALUE}", + }, + } + functions_documents.update_document = lambda **kwargs: None + functions_documents.CLIENTS = {"search_client_user": FakeSearchClient()} + + def fake_generate_embeddings_batch(texts): + embedded_texts.extend(texts) + return [([0.1, 0.2, 0.3], {"total_tokens": 1, "prompt_tokens": 1}) for _ in texts] + + functions_content_stub = types.ModuleType("functions_content") + functions_content_stub.generate_embeddings_batch = fake_generate_embeddings_batch + sys.modules["functions_content"] = functions_content_stub + + try: + functions_documents.save_chunks_batch( + [ + { + "page_text_content": "Safe page content.", + "page_number": 1, + "file_name": "vision.pdf", + } + ], + user_id="user-1", + document_id="doc-vision", + ) + finally: + functions_documents.get_settings = original_get_settings + functions_documents.get_document_metadata = original_get_document_metadata + if original_functions_content is None: + 
sys.modules.pop("functions_content", None) + else: + sys.modules["functions_content"] = original_functions_content + if original_update_document is None: + delattr(functions_documents, "update_document") + else: + functions_documents.update_document = original_update_document + if original_clients is None: + delattr(functions_documents, "CLIENTS") + else: + functions_documents.CLIENTS = original_clients + + assert uploaded_batches + indexed_chunk_text = uploaded_batches[0][0]["chunk_text"] + assert RAW_VALUE not in indexed_chunk_text + assert RAW_VALUE not in repr(embedded_texts) + assert indexed_chunk_text == embedded_texts[0] + assert "badge SSN [REDACTED_US_SSN]" in indexed_chunk_text + + def test_video_chunks_use_sanitized_transcript_and_ocr_text(): """save_video_chunk should sanitize transcript and OCR text before embedding/search.""" print("Testing video chunk sanitized text flow...") @@ -594,6 +681,7 @@ def test_upload_dlp_record_merges_with_existing_document_status(): test_upload_helper_blocks_before_returning_to_ingestion_paths, test_single_chunk_uses_sanitized_text_for_embedding_and_indexing, test_batch_chunks_use_sanitized_text_for_batch_embeddings_and_indexing, + test_batch_chunk_vision_text_is_not_reappended_after_dlp_redaction, test_video_chunks_use_sanitized_transcript_and_ocr_text, test_video_chunks_preserve_public_workspace_scope, test_video_dlp_block_errors_abort_processing, diff --git a/functional_tests/test_web_search_dlp_egress.py b/functional_tests/test_web_search_dlp_egress.py index 005881fb..2906942e 100644 --- a/functional_tests/test_web_search_dlp_egress.py +++ b/functional_tests/test_web_search_dlp_egress.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for web-search DLP egress. 
-Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.070 +Implemented in: 0.242.070 This test ensures web-search DLP runs after current-message query construction and before Foundry web-search execution, blocks sensitive egress, redacts when @@ -218,6 +218,142 @@ def test_perform_web_search_debug_logging_masks_dlp_queries(): assert "search_query_length" in perform_source +def test_deep_research_planned_queries_are_rechecked_before_web_search(): + """Deep Research planner output should be DLP-checked before Foundry web search.""" + print("Testing Deep Research planned query DLP enforcement...") + source = read_file_text(ROUTE_FILE) + function_source = extract_function_source(source, "perform_research_web_searches") + + recorded = {"planner_user_message": None, "queries": []} + + def fake_build_deep_research_query_plan(**kwargs): + recorded["planner_user_message"] = kwargs.get("user_message") + return { + "queries": [ + { + "query": f"Find records for employee SSN {RAW_VALUE}", + "reason": "planner included sensitive source text", + "source": "planner", + } + ] + } + + def fake_perform_web_search(**kwargs): + recorded["queries"].append(kwargs["web_search_query_text"]) + + from functions_dlp import evaluate_web_search_egress + + namespace = { + "build_deep_research_query_plan": fake_build_deep_research_query_plan, + "perform_web_search": fake_perform_web_search, + "evaluate_web_search_egress": evaluate_web_search_egress, + "should_emit_dlp_telemetry": lambda *args, **kwargs: False, + "log_event": lambda *args, **kwargs: None, + "build_dlp_telemetry_properties": lambda *args, **kwargs: {}, + "WEB_SEARCH_DLP_BLOCKED_STATUS": ( + "Web search was blocked because the message appears to contain non-public information." 
+ ), + } + exec(compile(function_source, ROUTE_FILE, "exec"), namespace) + + namespace["perform_research_web_searches"]( + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_default_engine": "regex", + }, + conversation_id="conv-1", + user_id="user-1", + user_message=f"Search for employee SSN {RAW_VALUE}", + user_message_id="msg-1", + chat_type="user", + document_scope="personal", + active_group_id=None, + active_public_workspace_id=None, + web_search_query_text="Search for employee SSN [REDACTED_US_SSN]", + system_messages_for_augmentation=[], + agent_citations_list=[], + web_search_citations_list=[], + deep_research_enabled=True, + deep_research_planner_client=object(), + deep_research_planner_model="planner", + ) + + assert RAW_VALUE not in recorded["planner_user_message"] + assert recorded["queries"] == ["Find records for employee SSN [REDACTED_US_SSN]"] + assert RAW_VALUE not in repr(recorded) + + +def test_deep_research_blocked_planned_queries_do_not_call_web_search(): + """Deep Research should skip planner queries when the per-query DLP check blocks.""" + print("Testing Deep Research planned query block enforcement...") + source = read_file_text(ROUTE_FILE) + function_source = extract_function_source(source, "perform_research_web_searches") + + recorded_queries = [] + system_messages = [] + + def fake_build_deep_research_query_plan(**kwargs): + return { + "queries": [ + { + "query": f"Find records for employee SSN {RAW_VALUE}", + "reason": "planner included sensitive source text", + "source": "planner", + } + ] + } + + def fake_perform_web_search(**kwargs): + recorded_queries.append(kwargs["web_search_query_text"]) + + from functions_dlp import evaluate_web_search_egress + + namespace = { + "build_deep_research_query_plan": fake_build_deep_research_query_plan, + "perform_web_search": fake_perform_web_search, + "evaluate_web_search_egress": evaluate_web_search_egress, + 
"should_emit_dlp_telemetry": lambda *args, **kwargs: False, + "log_event": lambda *args, **kwargs: None, + "build_dlp_telemetry_properties": lambda *args, **kwargs: {}, + "WEB_SEARCH_DLP_BLOCKED_STATUS": ( + "Web search was blocked because the message appears to contain non-public information." + ), + } + exec(compile(function_source, ROUTE_FILE, "exec"), namespace) + + result = namespace["perform_research_web_searches"]( + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "block", + "dlp_default_engine": "regex", + }, + conversation_id="conv-1", + user_id="user-1", + user_message="Search for employee SSN [REDACTED_US_SSN]", + user_message_id="msg-1", + chat_type="user", + document_scope="personal", + active_group_id=None, + active_public_workspace_id=None, + web_search_query_text="Search for employee SSN [REDACTED_US_SSN]", + system_messages_for_augmentation=system_messages, + agent_citations_list=[], + web_search_citations_list=[], + deep_research_enabled=True, + deep_research_planner_client=object(), + deep_research_planner_model="planner", + ) + + assert recorded_queries == [] + assert result["web_search_runs"] == [] + assert system_messages == [{"role": "system", "content": namespace["WEB_SEARCH_DLP_BLOCKED_STATUS"]}] + assert RAW_VALUE not in repr(result) + assert RAW_VALUE not in repr(system_messages) + + def test_token_usage_extraction_logs_metadata_shape_only(): """Token usage validation should not log raw provider usage metadata.""" print("Testing web-search token usage extraction log safety...") @@ -236,6 +372,8 @@ def test_token_usage_extraction_logs_metadata_shape_only(): test_scanner_error_fails_closed_by_default, test_blocked_status_continues_normal_chat_without_foundry_web_search, test_perform_web_search_debug_logging_masks_dlp_queries, + test_deep_research_planned_queries_are_rechecked_before_web_search, + test_deep_research_blocked_planned_queries_do_not_call_web_search, 
test_token_usage_extraction_logs_metadata_shape_only, ] From 62550e2c430f05340901d78518d176a7e0d804b8 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 11:22:39 -0400 Subject: [PATCH 04/20] docs: align dlp mvp version metadata --- docs/explanation/features/DLP_UPLOAD_STAGING.md | 2 +- docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md | 2 +- docs/explanation/release_notes.md | 5 +++++ functional_tests/test_dlp_admin_settings_roundtrip.py | 4 ++-- functional_tests/test_dlp_admin_settings_ui.py | 4 ++-- functional_tests/test_dlp_admin_ui_smoke.py | 4 ++-- functional_tests/test_dlp_control_plane.py | 4 ++-- functional_tests/test_dlp_regex_rules.py | 4 ++-- functional_tests/test_dlp_review_events.py | 4 ++-- functional_tests/test_dlp_telemetry.py | 4 ++-- functional_tests/test_upload_dlp_ingestion_integration.py | 4 ++-- functional_tests/test_upload_dlp_redaction.py | 4 ++-- functional_tests/test_upload_dlp_workspace_scopes.py | 4 ++-- functional_tests/test_web_search_current_message_only.py | 2 +- functional_tests/test_web_search_dlp_egress.py | 4 ++-- functional_tests/test_web_search_dlp_route_integration.py | 4 ++-- 16 files changed, 32 insertions(+), 27 deletions(-) diff --git a/docs/explanation/features/DLP_UPLOAD_STAGING.md b/docs/explanation/features/DLP_UPLOAD_STAGING.md index d3e095cb..da8ae2d0 100644 --- a/docs/explanation/features/DLP_UPLOAD_STAGING.md +++ b/docs/explanation/features/DLP_UPLOAD_STAGING.md @@ -2,7 +2,7 @@ ## Overview -Version: 0.242.069 +Version: 0.242.073 Dependencies: shared DLP core, configurable regex DLP rules, document processing pipeline, Azure AI Search, Azure OpenAI embeddings. 
diff --git a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md index 1cfa88ad..fc4bfbf7 100644 --- a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md +++ b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md @@ -2,7 +2,7 @@ ## Overview -Version: 0.242.069 +Version: 0.242.073 Dependencies: Flask chat routes, configurable regex DLP rules, and Azure AI Foundry web-search agent configuration. diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index 533c6a71..f2879eb9 100644 --- a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -16,6 +16,11 @@ For feature-focused and fix-focused drill-downs by version, see [Features by Ver #### Bug Fixes +* **DLP Egress Bypass Closure** + * Applies DLP checks to Deep Research planned web-search queries immediately before outbound search. + * Prevents batch upload indexing from reintroducing raw vision text after DLP redaction. + * (Ref: Deep Research web-search DLP, upload DLP indexing redaction) + * **Upload DLP Enforcement Edge Cases** * Treats fail-on-match, fail-closed scanner errors, and truncated scans as enforced upload DLP paths when deciding whether content may be indexed or retained for enhanced citations. * Sanitizes selected upload metadata before prompts, Search payloads, Cosmos updates, and logs while preserving counts-only DLP telemetry summaries. diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index 3178d6dc..1cee064d 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for DLP admin settings roundtrip. 
-Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures DLP admin settings are normalized, persisted, and rendered through the admin settings POST contract without requiring live Azure services. diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py index f20d802e..70c9cb77 100644 --- a/functional_tests/test_dlp_admin_settings_ui.py +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for DLP admin settings UI. -Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures shared and web-search DLP defaults exist, admin settings persist supported controls, the admin template exposes only implemented controls, diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py index 975e0e3b..9a4da3a3 100644 --- a/functional_tests/test_dlp_admin_ui_smoke.py +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for DLP admin UI smoke. -Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures the DLP admin settings card can be extracted into collapsed and expanded previews for local visual review. diff --git a/functional_tests/test_dlp_control_plane.py b/functional_tests/test_dlp_control_plane.py index 53386213..d6ea732d 100644 --- a/functional_tests/test_dlp_control_plane.py +++ b/functional_tests/test_dlp_control_plane.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for DLP control plane core behavior. 
-Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures the shared DLP core supports disabled, regex, Luhn-validated credit-card, counts-only metadata, ReDoS-resistant scanning, and optional diff --git a/functional_tests/test_dlp_regex_rules.py b/functional_tests/test_dlp_regex_rules.py index 6fca6b82..d0207b63 100644 --- a/functional_tests/test_dlp_regex_rules.py +++ b/functional_tests/test_dlp_regex_rules.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for configurable DLP regex rules. -Version: 0.242.069 -Implemented in: 0.241.017 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures DLP regex rules are admin-configurable, validated, confidence-shaped, timeout-bounded, and safe to report without raw matched values. diff --git a/functional_tests/test_dlp_review_events.py b/functional_tests/test_dlp_review_events.py index 919e31f8..ab37cd4e 100644 --- a/functional_tests/test_dlp_review_events.py +++ b/functional_tests/test_dlp_review_events.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for DLP review event safety. -Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures DLP review routing defaults to disabled and any optional review event summary uses distinct DLP policy typing with counts-only payloads. diff --git a/functional_tests/test_dlp_telemetry.py b/functional_tests/test_dlp_telemetry.py index 588991c1..ff3e90d6 100644 --- a/functional_tests/test_dlp_telemetry.py +++ b/functional_tests/test_dlp_telemetry.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for safe DLP telemetry. 
-Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures DLP telemetry properties include bounded decision metadata without raw matched values, raw prompts, raw web-search queries, raw chunk text, diff --git a/functional_tests/test_upload_dlp_ingestion_integration.py b/functional_tests/test_upload_dlp_ingestion_integration.py index 6f06833d..869f8fff 100644 --- a/functional_tests/test_upload_dlp_ingestion_integration.py +++ b/functional_tests/test_upload_dlp_ingestion_integration.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for upload DLP ingestion integration. -Version: 0.242.070 -Implemented in: 0.242.070 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures upload DLP blocks stop before embeddings/search indexing and redacted text is the only text passed into embedding/index payload construction. diff --git a/functional_tests/test_upload_dlp_redaction.py b/functional_tests/test_upload_dlp_redaction.py index caebb4e4..4c24a261 100644 --- a/functional_tests/test_upload_dlp_redaction.py +++ b/functional_tests/test_upload_dlp_redaction.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for upload DLP redaction. -Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures upload DLP redacts chunk text before embeddings and Azure AI Search indexing, hardens raw chunk logs, stores counts-only metadata, and emits diff --git a/functional_tests/test_upload_dlp_workspace_scopes.py b/functional_tests/test_upload_dlp_workspace_scopes.py index f9b1109c..d8e51f13 100644 --- a/functional_tests/test_upload_dlp_workspace_scopes.py +++ b/functional_tests/test_upload_dlp_workspace_scopes.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for upload DLP workspace scope coverage. 
-Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures personal, group, public, and external public upload routes continue using the shared document processing path protected by upload DLP. diff --git a/functional_tests/test_web_search_current_message_only.py b/functional_tests/test_web_search_current_message_only.py index 7a934690..c09ba5d3 100644 --- a/functional_tests/test_web_search_current_message_only.py +++ b/functional_tests/test_web_search_current_message_only.py @@ -1,7 +1,7 @@ # test_web_search_current_message_only.py """ Functional test for current-message-only web search egress. -Version: 0.242.069 +Version: 0.242.073 Implemented in: 0.241.008 This test ensures external web search uses only the current user message, diff --git a/functional_tests/test_web_search_dlp_egress.py b/functional_tests/test_web_search_dlp_egress.py index 2906942e..1fb94248 100644 --- a/functional_tests/test_web_search_dlp_egress.py +++ b/functional_tests/test_web_search_dlp_egress.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for web-search DLP egress. -Version: 0.242.070 -Implemented in: 0.242.070 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures web-search DLP runs after current-message query construction and before Foundry web-search execution, blocks sensitive egress, redacts when diff --git a/functional_tests/test_web_search_dlp_route_integration.py b/functional_tests/test_web_search_dlp_route_integration.py index 4c249aeb..7a491cd9 100644 --- a/functional_tests/test_web_search_dlp_route_integration.py +++ b/functional_tests/test_web_search_dlp_route_integration.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for web-search DLP route integration. 
-Version: 0.242.069 -Implemented in: 0.242.069 +Version: 0.242.073 +Implemented in: 0.242.073 This test ensures chat routes evaluate DLP before Foundry web search, suppress Foundry calls on block, and send only the redacted query on redact. From 9092b0403c8a7e93b5594f456f38ebc8be996d5f Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 13:10:59 -0400 Subject: [PATCH 05/20] fix: keep dlp mvp regex only --- application/single_app/functions_dlp.py | 46 ------------------- .../test_dlp_admin_settings_roundtrip.py | 5 -- .../test_dlp_admin_settings_ui.py | 9 ---- functional_tests/test_dlp_control_plane.py | 27 +---------- functional_tests/test_dlp_telemetry.py | 2 +- 5 files changed, 3 insertions(+), 86 deletions(-) diff --git a/application/single_app/functions_dlp.py b/application/single_app/functions_dlp.py index 67beda74..a0185cf2 100644 --- a/application/single_app/functions_dlp.py +++ b/application/single_app/functions_dlp.py @@ -2,7 +2,6 @@ import hashlib import logging -from collections import OrderedDict from functions_dlp_rules import get_effective_dlp_regex_rules, scan_text_with_dlp_regex_rules @@ -97,51 +96,6 @@ def _decision_from_counts(match_counts, mode): return "monitor" -def normalize_external_analyzer_results(text, recognizer_results, mode="redact", engine="external_analyzer"): - """Normalize external analyzer entity offsets into the shared counts-only result.""" - source_text = str(text or "") - sorted_results = sorted( - [ - item for item in (recognizer_results or []) - if isinstance(item, dict) and item.get("entity_type") and item.get("start") is not None and item.get("end") is not None - ], - key=lambda item: int(item.get("start")), - ) - match_counts = OrderedDict() - redacted_parts = [] - cursor = 0 - - for item in sorted_results: - start = max(0, min(len(source_text), int(item.get("start")))) - end = max(start, min(len(source_text), int(item.get("end")))) - entity_type = str(item.get("entity_type")) - if start < cursor: - 
continue - redacted_parts.append(source_text[cursor:start]) - redacted_parts.append(f"[REDACTED_{entity_type}]") - cursor = end - match_counts[entity_type] = match_counts.get(entity_type, 0) + 1 - - redacted_parts.append(source_text[cursor:]) - redacted_text = "".join(redacted_parts) - counts = dict(match_counts) - decision = _decision_from_counts(counts, mode) - - return { - "enabled": True, - "engine": engine, - "mode": mode, - "decision": decision, - "text": redacted_text if counts else source_text, - "redacted_text": redacted_text if counts else source_text, - "total_replacements": sum(counts.values()), - "match_counts": counts, - "matches": [{"entity_type": key, "count": value} for key, value in counts.items()], - "metadata": {"adapter": "external_analyzer"}, - "scanner_status": "ok", - } - - def evaluate_dlp_text(text, settings=None, context=None, surface="generic"): """Evaluate text against the configured DLP policy and return a safe result.""" settings = settings or {} diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index 1cee064d..921759a7 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -49,9 +49,6 @@ UNSUPPORTED_DLP_FORM_FIELDS = [ - "dlp_presidio_use_service", - "dlp_presidio_endpoint", - "dlp_presidio_score_threshold", "dlp_scanner_timeout_seconds", "dlp_review_include_redacted_preview", "web_search_dlp_track_review_events", @@ -145,8 +142,6 @@ def test_admin_dlp_controls_only_expose_supported_regex_engine(): assert 'name="dlp_regex_rules_json"' in template assert "web_search_dlp_block_on_internal_phrases" not in template assert "Detect internal phrases" not in template - assert 'value="presidio_service"' not in template - assert 'value="presidio_embedded"' not in template assert_no_retired_structured_redaction_control(template, str(ADMIN_TEMPLATE)) for field_name in UNSUPPORTED_DLP_FORM_FIELDS: 
diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py index 70c9cb77..3ca18d09 100644 --- a/functional_tests/test_dlp_admin_settings_ui.py +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -40,10 +40,6 @@ UNSUPPORTED_ADMIN_CONTROL_IDS = [ - "dlp_presidio_use_service", - "dlp_presidio_service_settings", - "dlp_presidio_endpoint", - "dlp_presidio_score_threshold", "dlp_scanner_timeout_seconds", "dlp_review_include_redacted_preview", "web_search_dlp_track_review_events", @@ -52,9 +48,6 @@ RETIRED_DLP_SETTING_KEYS = [ - "dlp_presidio_use_service", - "dlp_presidio_endpoint", - "dlp_presidio_score_threshold", "dlp_scanner_timeout_seconds", "dlp_review_include_redacted_preview", "web_search_dlp_track_review_events", @@ -134,8 +127,6 @@ def test_admin_template_exposes_dlp_controls(): for unsupported_id in UNSUPPORTED_ADMIN_CONTROL_IDS: assert unsupported_id not in source, f"Unsupported DLP control is still visible: {unsupported_id}" - assert 'value="presidio_service"' not in source - assert 'value="presidio_embedded"' not in source assert_no_retired_structured_redaction_control(source, ADMIN_TEMPLATE_FILE) diff --git a/functional_tests/test_dlp_control_plane.py b/functional_tests/test_dlp_control_plane.py index d6ea732d..e55c73a1 100644 --- a/functional_tests/test_dlp_control_plane.py +++ b/functional_tests/test_dlp_control_plane.py @@ -6,8 +6,8 @@ Implemented in: 0.242.073 This test ensures the shared DLP core supports disabled, regex, Luhn-validated -credit-card, counts-only metadata, ReDoS-resistant scanning, and optional -external analyzer normalization without persisting raw matched values. +credit-card, counts-only metadata, and ReDoS-resistant scanning without +persisting raw matched values. 
""" import os @@ -171,28 +171,6 @@ def fail_scan(text, settings, surface="generic"): assert "tail" not in repr(result) -def test_external_analyzer_shape_normalizes_counts_without_raw_values(): - """Optional external analyzer results should normalize into the shared shape.""" - print("Testing external analyzer adapter normalization...") - from functions_dlp import normalize_external_analyzer_results - - normalized = normalize_external_analyzer_results( - text=f"Alice Example has SSN {RAW_SSN}.", - recognizer_results=[ - {"entity_type": "PERSON", "start": 0, "end": 13, "score": 0.88}, - {"entity_type": "US_SSN", "start": 22, "end": 33, "score": 0.99}, - ], - mode="redact", - engine="external_analyzer", - ) - - assert normalized["decision"] == "redact" - assert normalized["match_counts"] == {"PERSON": 1, "US_SSN": 1} - assert "[REDACTED_PERSON]" in normalized["redacted_text"] - assert "[REDACTED_US_SSN]" in normalized["redacted_text"] - assert_no_raw_values(normalized) - - if __name__ == "__main__": tests = [ test_disabled_dlp_allows_original_text, @@ -201,7 +179,6 @@ def test_external_analyzer_shape_normalizes_counts_without_raw_values(): test_regex_scan_is_bounded_on_long_non_matching_input, test_enforced_dlp_blocks_when_text_exceeds_scan_limit, test_enforced_truncation_blocks_before_scanner_error_fail_open, - test_external_analyzer_shape_normalizes_counts_without_raw_values, ] try: diff --git a/functional_tests/test_dlp_telemetry.py b/functional_tests/test_dlp_telemetry.py index ff3e90d6..f6515a7d 100644 --- a/functional_tests/test_dlp_telemetry.py +++ b/functional_tests/test_dlp_telemetry.py @@ -75,7 +75,7 @@ def test_scanner_error_telemetry_is_safe(): result = { "enabled": True, - "engine": "presidio_service", + "engine": "regex", "mode": "block", "decision": "block", "scanner_status": "error", From 34a776b2f3845c910a47f1fac7a43bfcad08c14b Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 13:27:19 -0400 Subject: [PATCH 06/20] fix: use app 
logging for dlp document events --- application/single_app/functions_documents.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/application/single_app/functions_documents.py b/application/single_app/functions_documents.py index f72c21ba..1a908e5b 100644 --- a/application/single_app/functions_documents.py +++ b/application/single_app/functions_documents.py @@ -1723,7 +1723,7 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Authentication successful, token length: {len(token) if token else 0}") except Exception as e: debug_print(f"[VIDEO INDEXER] Authentication failed: {_sanitize_video_indexer_log_value(e)}") - print("[VIDEO] AUTH ERROR", flush=True) + log_event("[VIDEO] AUTH ERROR", level=logging.ERROR) update_callback(status="VIDEO: auth failed") return 0 @@ -1894,15 +1894,19 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Video duration: {video_duration} ({video_duration_seconds} seconds)") debug_print(f"[VIDEO INDEXER] Insights keys available: {list(insights.keys())}") - print(f"[VIDEO] Available insight types: {', '.join(list(insights.keys())[:15])}...", flush=True) + log_event( + f"[VIDEO] Available insight types: {', '.join(list(insights.keys())[:15])}...", + level=logging.INFO, + ) - print(f"[VIDEO] Insight counts:", flush=True) + insight_counts = {} for key in insights.keys(): value = insights.get(key, []) if isinstance(value, list): - print(f" {key}: {len(value)} items", flush=True) + insight_counts[key] = len(value) - print("[VIDEO] Insight count logging complete", flush=True) + log_event("[VIDEO] Insight counts", extra={"insight_counts": insight_counts}, level=logging.INFO) + log_event("[VIDEO] Insight count logging complete", level=logging.INFO) transcript = insights.get("transcript", []) ocr_blocks = insights.get("ocr", []) @@ -2814,7 +2818,10 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, #update_document(document_id=document_id, 
user_id=user_id, status=status) embedding, token_usage = generate_embedding(sanitized_chunk_text) except Exception as e: - print(f"Error generating embedding for page {page_number} of document {document_id}: {e}") + log_event( + f"Error generating embedding for page {page_number} of document {document_id}: {e}", + level=logging.ERROR, + ) raise if is_public_workspace: @@ -4582,7 +4589,10 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp user_id=group_id if is_group else user_id, content=f"Blocked document metadata for document {document_id}, summary: {blocked_metadata_summary}, reasons: {block_reasons}" ) - print(f"Blocked document metadata for document {document_id}. Reasons: {block_reasons}") + log_event( + f"Blocked document metadata for document {document_id}. Reasons: {block_reasons}", + level=logging.WARNING, + ) return None except Exception as e: From a03c9ee0cfd83dff80a7021e33ce470e1240368e Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Wed, 10 Jun 2026 18:52:16 -0400 Subject: [PATCH 07/20] feat: add presidio endpoint dlp engine --- application/single_app/functions_dlp.py | 34 +++- .../single_app/functions_dlp_presidio.py | 133 +++++++++++++ .../test_dlp_presidio_endpoint.py | 177 ++++++++++++++++++ .../test_dlp_presidio_engine_integration.py | 108 +++++++++++ 4 files changed, 447 insertions(+), 5 deletions(-) create mode 100644 application/single_app/functions_dlp_presidio.py create mode 100644 functional_tests/test_dlp_presidio_endpoint.py create mode 100644 functional_tests/test_dlp_presidio_engine_integration.py diff --git a/application/single_app/functions_dlp.py b/application/single_app/functions_dlp.py index a0185cf2..7266dbd2 100644 --- a/application/single_app/functions_dlp.py +++ b/application/single_app/functions_dlp.py @@ -3,6 +3,7 @@ import hashlib import logging +from functions_dlp_presidio import analyze_with_presidio_endpoint from functions_dlp_rules import get_effective_dlp_regex_rules, 
scan_text_with_dlp_regex_rules try: @@ -38,10 +39,10 @@ def _safe_float(value, default): def _normalize_engine(settings): - """Return the implemented DLP engine for this release.""" - requested = str((settings or {}).get("dlp_default_engine", "regex") or "regex").lower() - if requested != "regex": - return "regex" + """Return the configured DLP engine.""" + requested = str((settings or {}).get("dlp_default_engine", "regex") or "regex").strip().lower() + if requested in {"regex", "presidio_endpoint"}: + return requested return "regex" @@ -86,6 +87,22 @@ def _apply_regex_engine(text, settings=None, surface="generic"): } +def _apply_presidio_endpoint_engine(text, settings=None, surface="generic"): + recognizer_results = analyze_with_presidio_endpoint(text, settings or {}) + normalized = normalize_presidio_results( + text, + recognizer_results, + mode=_normalize_mode(settings or {}, surface), + engine="presidio_endpoint", + ) + return ( + normalized["redacted_text"], + normalized["match_counts"], + normalized["matches"], + {"adapter": "presidio_endpoint"}, + ) + + def _decision_from_counts(match_counts, mode): if not match_counts: return "allow" @@ -128,7 +145,14 @@ def evaluate_dlp_text(text, settings=None, context=None, surface="generic"): } try: - redacted_text, match_counts, matches, scanner_metadata = _apply_regex_engine(scan_text, settings, surface) + if engine == "presidio_endpoint": + redacted_text, match_counts, matches, scanner_metadata = _apply_presidio_endpoint_engine( + scan_text, + settings, + surface, + ) + else: + redacted_text, match_counts, matches, scanner_metadata = _apply_regex_engine(scan_text, settings, surface) except Exception as exc: log_event( "[DLP] Scanner error", diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py new file mode 100644 index 00000000..2102cab5 --- /dev/null +++ b/application/single_app/functions_dlp_presidio.py @@ -0,0 +1,133 @@ +# functions_dlp_presidio.py + 
+"""HTTP adapter for Presidio-compatible Analyzer endpoints.""" + +import os +from urllib.parse import urlparse + +import requests + + +DEFAULT_PRESIDIO_TIMEOUT_SECONDS = 5 +DEFAULT_PRESIDIO_LANGUAGE = "en" +DEFAULT_PRESIDIO_SCORE_THRESHOLD = 0.5 +DEFAULT_PRESIDIO_AUTH_HEADER_NAME = "X-DLP-API-Key" + + +class PresidioEndpointConfigurationError(ValueError): + """Raised when the configured Presidio endpoint is not safe to call.""" + + +class PresidioEndpointRequestError(RuntimeError): + """Raised when the Presidio endpoint cannot return a usable analyzer result.""" + + +def validate_presidio_endpoint_url(endpoint_url): + """Validate and normalize a Presidio Analyzer endpoint URL.""" + normalized = str(endpoint_url or "").strip() + if not normalized: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint is required.") + + parsed = urlparse(normalized) + host = (parsed.hostname or "").lower() + if parsed.scheme not in {"http", "https"} or not parsed.netloc: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must be an absolute HTTP(S) URL.") + + local_hosts = {"localhost", "127.0.0.1", "::1"} + if parsed.scheme == "http" and host not in local_hosts: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must use HTTPS unless it is localhost.") + + return normalized + + +def _safe_float(value, default): + try: + return float(value) + except (TypeError, ValueError): + return default + + +def _safe_int(value, default): + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _get_entities(settings): + entities = (settings or {}).get("dlp_presidio_entities", []) + if isinstance(entities, str): + entities = [item.strip().upper() for item in entities.split(",")] + if not isinstance(entities, list): + return [] + return [str(item).strip().upper() for item in entities if str(item).strip()] + + +def _get_auth_headers(settings): + header_name = str((settings or 
{}).get("dlp_presidio_auth_header_name") or DEFAULT_PRESIDIO_AUTH_HEADER_NAME).strip() + secret_env_var = str((settings or {}).get("dlp_presidio_auth_secret_env_var") or "").strip() + if not header_name or not secret_env_var: + return {} + + secret_value = os.getenv(secret_env_var, "") + if not secret_value: + return {} + return {header_name: secret_value} + + +def _normalize_result_item(item): + if not isinstance(item, dict): + return None + if not item.get("entity_type") or item.get("start") is None or item.get("end") is None: + return None + try: + return { + "entity_type": str(item.get("entity_type")), + "start": int(item.get("start")), + "end": int(item.get("end")), + "score": float(item.get("score", 0.0)), + } + except (TypeError, ValueError): + return None + + +def analyze_with_presidio_endpoint(text, settings): + """Call a configured Presidio Analyzer endpoint and return recognizer results.""" + settings = settings or {} + endpoint_url = validate_presidio_endpoint_url(settings.get("dlp_presidio_analyzer_endpoint")) + timeout_seconds = max( + 1, + min(30, _safe_int(settings.get("dlp_presidio_timeout_seconds"), DEFAULT_PRESIDIO_TIMEOUT_SECONDS)), + ) + score_threshold = max( + 0.0, + min(1.0, _safe_float(settings.get("dlp_presidio_score_threshold"), DEFAULT_PRESIDIO_SCORE_THRESHOLD)), + ) + language = str(settings.get("dlp_presidio_language") or DEFAULT_PRESIDIO_LANGUAGE).strip() or DEFAULT_PRESIDIO_LANGUAGE + payload = { + "text": str(text or ""), + "language": language, + "entities": _get_entities(settings), + "score_threshold": score_threshold, + } + headers = { + "Content-Type": "application/json", + **_get_auth_headers(settings), + } + + try: + response = requests.post(endpoint_url, json=payload, headers=headers, timeout=timeout_seconds) + response.raise_for_status() + body = response.json() + except Exception as exc: + raise PresidioEndpointRequestError(f"Presidio analyzer request failed: {type(exc).__name__}") from exc + + if not isinstance(body, 
list): + raise PresidioEndpointRequestError("Presidio analyzer response must be a list.") + + results = [] + for item in body: + normalized = _normalize_result_item(item) + if normalized: + results.append(normalized) + return results diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py new file mode 100644 index 00000000..c34308d2 --- /dev/null +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -0,0 +1,177 @@ +# test_dlp_presidio_endpoint.py +#!/usr/bin/env python3 +""" +Functional test for external Presidio endpoint DLP adapter. +Version: 0.242.044 +Implemented in: 0.242.044 + +This test ensures SimpleChat can call a configured Presidio-compatible analyzer +endpoint without embedding Presidio packages or leaking raw scanned text. +""" + +import os +import sys +from unittest.mock import Mock + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_TEXT = "Contact me a@example.com" + + +def test_validate_presidio_endpoint_allows_https_and_localhost(): + """HTTPS and local HTTP endpoint URLs should be accepted.""" + from functions_dlp_presidio import validate_presidio_endpoint_url + + assert validate_presidio_endpoint_url("https://presidio.internal/analyze") == "https://presidio.internal/analyze" + assert validate_presidio_endpoint_url("http://localhost:5002/analyze") == "http://localhost:5002/analyze" + assert validate_presidio_endpoint_url("http://127.0.0.1:5002/analyze") == "http://127.0.0.1:5002/analyze" + assert validate_presidio_endpoint_url("http://[::1]:5002/analyze") == "http://[::1]:5002/analyze" + + +def test_validate_presidio_endpoint_rejects_insecure_remote_http(): + """Remote HTTP endpoint URLs should be rejected.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + try: + 
validate_presidio_endpoint_url("http://presidio.example.com/analyze") + except PresidioEndpointConfigurationError as exc: + assert "https" in str(exc).lower() + return + + raise AssertionError("Expected insecure remote HTTP endpoint to be rejected.") + + +def test_validate_presidio_endpoint_rejects_relative_url(): + """Endpoint URLs must be absolute HTTP(S) URLs.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + try: + validate_presidio_endpoint_url("/analyze") + except PresidioEndpointConfigurationError as exc: + assert "absolute" in str(exc).lower() + return + + raise AssertionError("Expected relative endpoint URL to be rejected.") + + +def test_analyze_with_presidio_endpoint_posts_safe_payload_and_auth_header(monkeypatch): + """The endpoint adapter should post the Analyzer payload and env-backed auth header.""" + from functions_dlp_presidio import analyze_with_presidio_endpoint + + captured = {} + + def fake_post(url, json=None, headers=None, timeout=None): + captured["url"] = url + captured["json"] = json + captured["headers"] = headers + captured["timeout"] = timeout + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [ + {"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91} + ] + return response + + monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") + settings = { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + "dlp_presidio_entities": ["EMAIL_ADDRESS", "US_SSN"], + "dlp_presidio_score_threshold": 0.7, + "dlp_presidio_language": "en", + "dlp_presidio_timeout_seconds": 3, + } + + results = analyze_with_presidio_endpoint(RAW_TEXT, settings) + + assert results == [{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 
24, "score": 0.91}] + assert captured["url"] == "https://presidio.internal/analyze" + assert captured["json"] == { + "text": RAW_TEXT, + "language": "en", + "entities": ["EMAIL_ADDRESS", "US_SSN"], + "score_threshold": 0.7, + } + assert captured["headers"]["X-DLP-API-Key"] == "unit-test-secret" + assert captured["headers"]["Content-Type"] == "application/json" + assert captured["timeout"] == 3 + + +def test_analyze_with_presidio_endpoint_omits_auth_header_without_env_secret(monkeypatch): + """Raw API keys should come only from the configured environment variable.""" + from functions_dlp_presidio import analyze_with_presidio_endpoint + + captured = {} + + def fake_post(url, json=None, headers=None, timeout=None): + captured["headers"] = headers + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [] + return response + + monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + monkeypatch.delenv("PRESIDIO_DLP_API_KEY", raising=False) + + analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + + assert "X-DLP-API-Key" not in captured["headers"] + + +def test_analyze_with_presidio_endpoint_raises_safe_error_without_raw_text(monkeypatch): + """Endpoint exceptions should not leak raw scanned text in their messages.""" + from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint + + def fake_post(url, json=None, headers=None, timeout=None): + raise RuntimeError(f"upstream included {RAW_TEXT}") + + monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + {"dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze"}, + ) + except PresidioEndpointRequestError as exc: + assert RAW_TEXT not in str(exc) + assert 
"RuntimeError" in str(exc) + return + + raise AssertionError("Expected endpoint request error.") + + +def test_analyze_with_presidio_endpoint_normalizes_response_items(monkeypatch): + """Recognizer responses should be filtered and normalized deterministically.""" + from functions_dlp_presidio import analyze_with_presidio_endpoint + + def fake_post(url, json=None, headers=None, timeout=None): + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [ + {"entity_type": "EMAIL_ADDRESS", "start": "11", "end": "24", "score": "0.91"}, + {"entity_type": "US_SSN", "start": -3, "end": "bad", "score": 0.99}, + {"entity_type": "", "start": 1, "end": 2, "score": 0.4}, + "ignored", + ] + return response + + monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + + results = analyze_with_presidio_endpoint( + RAW_TEXT, + {"dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze"}, + ) + + assert results == [{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91}] diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py new file mode 100644 index 00000000..464c57ff --- /dev/null +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -0,0 +1,108 @@ +# test_dlp_presidio_engine_integration.py +#!/usr/bin/env python3 +""" +Functional test for Presidio endpoint engine integration. +Version: 0.242.044 +Implemented in: 0.242.044 + +This test ensures the external Presidio endpoint engine reuses SimpleChat's +existing DLP decision, redaction, and fail-closed behavior. 
+""" + +import os +import sys + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_TEXT = "Contact me a@example.com" + + +def presidio_settings(mode="redact", fail_closed=True): + """Build deterministic settings for Presidio endpoint engine tests.""" + return { + "enable_dlp_control_plane": True, + "dlp_default_engine": "presidio_endpoint", + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_timeout_seconds": 3, + "dlp_presidio_score_threshold": 0.7, + "dlp_presidio_entities": ["EMAIL_ADDRESS"], + "dlp_fail_closed_on_scanner_error": fail_closed, + "enable_web_search_dlp": True, + "web_search_dlp_mode": mode, + "enable_upload_dlp": True, + "upload_dlp_mode": mode, + } + + +def test_presidio_endpoint_redacts_with_existing_result_shape(monkeypatch): + """Presidio endpoint matches should redact using the shared DLP result shape.""" + import functions_dlp + + monkeypatch.setattr( + functions_dlp, + "analyze_with_presidio_endpoint", + lambda text, settings: [{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.92}], + ) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("redact"), + surface="web_search", + ) + + assert result["engine"] == "presidio_endpoint" + assert result["decision"] == "redact" + assert result["text"] == "Contact me [REDACTED_EMAIL_ADDRESS]" + assert result["redacted_text"] == "Contact me [REDACTED_EMAIL_ADDRESS]" + assert result["match_counts"] == {"EMAIL_ADDRESS": 1} + assert result["scanner_status"] == "ok" + + +def test_presidio_endpoint_blocks_with_existing_result_shape(monkeypatch): + """Block mode should blank text fields while keeping safe counts.""" + import functions_dlp + + monkeypatch.setattr( + functions_dlp, + "analyze_with_presidio_endpoint", + lambda text, settings: [{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, 
"score": 0.92}], + ) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("block"), + surface="upload", + ) + + assert result["engine"] == "presidio_endpoint" + assert result["decision"] == "block" + assert result["text"] == "" + assert result["redacted_text"] == "" + assert result["match_counts"] == {"EMAIL_ADDRESS": 1} + assert result["scanner_status"] == "ok" + + +def test_presidio_endpoint_scanner_error_fails_closed_without_raw_text(monkeypatch): + """Endpoint scanner errors should reuse fail-closed handling and avoid raw text.""" + import functions_dlp + + def fail_scan(text, settings): + raise RuntimeError(f"endpoint failed while scanning {RAW_TEXT}") + + monkeypatch.setattr(functions_dlp, "analyze_with_presidio_endpoint", fail_scan) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("redact", fail_closed=True), + surface="web_search", + ) + + assert result["engine"] == "presidio_endpoint" + assert result["decision"] == "block" + assert result["text"] == "" + assert result["redacted_text"] == "" + assert result["scanner_status"] == "error" + assert RAW_TEXT not in repr(result) From 7fca8a1307a26a6a1389694646bf3f57475919a5 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Wed, 10 Jun 2026 19:08:05 -0400 Subject: [PATCH 08/20] fix: suppress presidio endpoint exception chains --- application/single_app/functions_dlp_presidio.py | 6 +++++- functional_tests/test_dlp_presidio_endpoint.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py index 2102cab5..cb881361 100644 --- a/application/single_app/functions_dlp_presidio.py +++ b/application/single_app/functions_dlp_presidio.py @@ -115,12 +115,16 @@ def analyze_with_presidio_endpoint(text, settings): **_get_auth_headers(settings), } + request_error_type = None try: response = requests.post(endpoint_url, json=payload, 
headers=headers, timeout=timeout_seconds) response.raise_for_status() body = response.json() except Exception as exc: - raise PresidioEndpointRequestError(f"Presidio analyzer request failed: {type(exc).__name__}") from exc + request_error_type = type(exc).__name__ + + if request_error_type: + raise PresidioEndpointRequestError(f"Presidio analyzer request failed: {request_error_type}") from None if not isinstance(body, list): raise PresidioEndpointRequestError("Presidio analyzer response must be a list.") diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index c34308d2..df3ac23a 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -131,7 +131,7 @@ def fake_post(url, json=None, headers=None, timeout=None): def test_analyze_with_presidio_endpoint_raises_safe_error_without_raw_text(monkeypatch): - """Endpoint exceptions should not leak raw scanned text in their messages.""" + """Endpoint exceptions should not retain raw scanned text in messages or exception chains.""" from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint def fake_post(url, json=None, headers=None, timeout=None): @@ -146,7 +146,10 @@ def fake_post(url, json=None, headers=None, timeout=None): ) except PresidioEndpointRequestError as exc: assert RAW_TEXT not in str(exc) + assert RAW_TEXT not in repr(exc) assert "RuntimeError" in str(exc) + assert exc.__cause__ is None + assert exc.__context__ is None return raise AssertionError("Expected endpoint request error.") From 6863881a11f81f8f34a9909219e98700ad1a44d7 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Wed, 10 Jun 2026 20:36:37 -0400 Subject: [PATCH 09/20] feat: persist presidio endpoint dlp settings --- application/single_app/functions_settings.py | 14 +++++ .../route_frontend_admin_settings.py | 61 ++++++++++++++++++- .../test_dlp_admin_settings_roundtrip.py | 56 +++++++++++++++-- 3 
files changed, 125 insertions(+), 6 deletions(-) diff --git a/application/single_app/functions_settings.py b/application/single_app/functions_settings.py index 9108b2ae..1513d300 100644 --- a/application/single_app/functions_settings.py +++ b/application/single_app/functions_settings.py @@ -1072,6 +1072,20 @@ def get_settings(use_cosmos=False, include_source=False): 'dlp_enable_structured_telemetry': True, 'dlp_telemetry_sample_allow_events': False, 'dlp_review_destination': 'none', + 'dlp_presidio_analyzer_endpoint': '', + 'dlp_presidio_auth_header_name': 'X-DLP-API-Key', + 'dlp_presidio_auth_secret_env_var': 'PRESIDIO_DLP_API_KEY', + 'dlp_presidio_timeout_seconds': 5, + 'dlp_presidio_score_threshold': 0.5, + 'dlp_presidio_language': 'en', + 'dlp_presidio_entities': [ + 'CREDIT_CARD', + 'EMAIL_ADDRESS', + 'PHONE_NUMBER', + 'US_SSN', + 'PERSON', + 'LOCATION', + ], 'enable_web_search_dlp': False, 'web_search_dlp_mode': 'monitor', 'enable_upload_dlp': False, diff --git a/application/single_app/route_frontend_admin_settings.py b/application/single_app/route_frontend_admin_settings.py index 1b6657ac..e1520f06 100644 --- a/application/single_app/route_frontend_admin_settings.py +++ b/application/single_app/route_frontend_admin_settings.py @@ -783,6 +783,58 @@ def parse_admin_int(raw_value, fallback_value, field_name="unknown", hard_defaul dlp_review_destination = form_data.get('dlp_review_destination', 'none') if dlp_review_destination not in ('none',): dlp_review_destination = 'none' + dlp_default_engine = form_data.get('dlp_default_engine', settings.get('dlp_default_engine', 'regex')) + if dlp_default_engine not in ('regex', 'presidio_endpoint'): + dlp_default_engine = 'regex' + dlp_presidio_analyzer_endpoint = form_data.get( + 'dlp_presidio_analyzer_endpoint', + settings.get('dlp_presidio_analyzer_endpoint', '') + ).strip() + dlp_presidio_auth_header_name = form_data.get( + 'dlp_presidio_auth_header_name', + settings.get('dlp_presidio_auth_header_name', 
'X-DLP-API-Key') + ).strip() or 'X-DLP-API-Key' + dlp_presidio_auth_secret_env_var = form_data.get( + 'dlp_presidio_auth_secret_env_var', + settings.get('dlp_presidio_auth_secret_env_var', 'PRESIDIO_DLP_API_KEY') + ).strip() or 'PRESIDIO_DLP_API_KEY' + dlp_presidio_timeout_seconds, _ = safe_int_with_source( + form_data.get('dlp_presidio_timeout_seconds'), + settings.get('dlp_presidio_timeout_seconds', 5), + 5 + ) + dlp_presidio_timeout_seconds = max(1, min(30, dlp_presidio_timeout_seconds)) + try: + dlp_presidio_score_threshold = float( + form_data.get( + 'dlp_presidio_score_threshold', + settings.get('dlp_presidio_score_threshold', 0.5) + ) + ) + except (TypeError, ValueError): + dlp_presidio_score_threshold = 0.5 + dlp_presidio_score_threshold = max(0.0, min(1.0, dlp_presidio_score_threshold)) + dlp_presidio_language = form_data.get( + 'dlp_presidio_language', + settings.get('dlp_presidio_language', 'en') + ).strip() or 'en' + existing_dlp_presidio_entities = settings.get('dlp_presidio_entities') or [ + 'CREDIT_CARD', + 'EMAIL_ADDRESS', + 'PHONE_NUMBER', + 'US_SSN', + ] + dlp_presidio_entities_raw = form_data.get( + 'dlp_presidio_entities', + ','.join(existing_dlp_presidio_entities) + ) + dlp_presidio_entities = [ + item.strip().upper() + for item in dlp_presidio_entities_raw.split(',') + if item.strip() + ] + if not dlp_presidio_entities: + dlp_presidio_entities = ['CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'US_SSN'] web_search_dlp_mode = form_data.get('web_search_dlp_mode', 'monitor') if web_search_dlp_mode not in ('monitor', 'redact', 'block'): web_search_dlp_mode = 'monitor' @@ -2022,7 +2074,7 @@ def is_valid_url(url): 'enable_web_search_user_notice': form_data.get('enable_web_search_user_notice') == 'on', 'web_search_user_notice_text': form_data.get('web_search_user_notice_text', 'Your current message will be sent to Microsoft Bing for web search. 
Conversation history is not sent for web search, but any sensitive content you paste into this message may be sent.').strip(), 'enable_dlp_control_plane': form_data.get('enable_dlp_control_plane') == 'on', - 'dlp_default_engine': 'regex', + 'dlp_default_engine': dlp_default_engine, 'dlp_regex_rules': normalized_dlp_regex_rules, 'dlp_max_scan_chars': dlp_max_scan_chars, 'dlp_fail_closed_on_scanner_error': form_data.get('dlp_fail_closed_on_scanner_error') == 'on', @@ -2030,6 +2082,13 @@ def is_valid_url(url): 'dlp_enable_structured_telemetry': form_data.get('dlp_enable_structured_telemetry') == 'on', 'dlp_telemetry_sample_allow_events': form_data.get('dlp_telemetry_sample_allow_events') == 'on', 'dlp_review_destination': dlp_review_destination, + 'dlp_presidio_analyzer_endpoint': dlp_presidio_analyzer_endpoint, + 'dlp_presidio_auth_header_name': dlp_presidio_auth_header_name, + 'dlp_presidio_auth_secret_env_var': dlp_presidio_auth_secret_env_var, + 'dlp_presidio_timeout_seconds': dlp_presidio_timeout_seconds, + 'dlp_presidio_score_threshold': dlp_presidio_score_threshold, + 'dlp_presidio_language': dlp_presidio_language, + 'dlp_presidio_entities': dlp_presidio_entities, 'enable_web_search_dlp': form_data.get('enable_web_search_dlp') == 'on', 'web_search_dlp_mode': web_search_dlp_mode, 'enable_upload_dlp': form_data.get('enable_upload_dlp') == 'on', diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index 921759a7..509034fa 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -18,11 +18,16 @@ APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") ADMIN_ROUTE_FILE = os.path.join(APP_DIR, "route_frontend_admin_settings.py") ADMIN_TEMPLATE_FILE = os.path.join(APP_DIR, "templates", "admin_settings.html") +FUNCTIONS_SETTINGS_FILE = os.path.join(APP_DIR, "functions_settings.py") ADMIN_TEMPLATE = Path(ADMIN_TEMPLATE_FILE) 
NORMALIZED_ASSIGNMENTS = [ "dlp_max_scan_chars = max(1000, dlp_max_scan_chars)", + "if dlp_default_engine not in ('regex', 'presidio_endpoint'):", + "dlp_default_engine = 'regex'", + "dlp_presidio_timeout_seconds = max(1, min(30, dlp_presidio_timeout_seconds))", + "dlp_presidio_score_threshold = max(0.0, min(1.0, dlp_presidio_score_threshold))", "if web_search_dlp_mode not in ('monitor', 'redact', 'block'):", "web_search_dlp_mode = 'monitor'", "if dlp_review_destination not in ('none',):", @@ -32,7 +37,7 @@ PERSISTED_DLP_FIELDS = { "enable_dlp_control_plane": "form_data.get('enable_dlp_control_plane') == 'on'", - "dlp_default_engine": "'regex'", + "dlp_default_engine": "dlp_default_engine", "dlp_regex_rules": "normalized_dlp_regex_rules", "dlp_max_scan_chars": "dlp_max_scan_chars", "dlp_fail_closed_on_scanner_error": "form_data.get('dlp_fail_closed_on_scanner_error') == 'on'", @@ -49,6 +54,8 @@ UNSUPPORTED_DLP_FORM_FIELDS = [ + "dlp_presidio_use_service", + "dlp_presidio_endpoint", "dlp_scanner_timeout_seconds", "dlp_review_include_redacted_preview", "web_search_dlp_track_review_events", @@ -133,12 +140,12 @@ def test_dlp_review_destination_stays_unreachable_until_review_flow_exists(): assert 'value="safety_violations"' not in template_source -def test_admin_dlp_controls_only_expose_supported_regex_engine(): +def test_admin_dlp_controls_expose_supported_dlp_engines(): template = ADMIN_TEMPLATE.read_text(encoding="utf-8") route_source = read_file_text(ADMIN_ROUTE_FILE) - assert '' in template - assert "Regex scanning is the only implemented engine in this release." 
in template + assert 'value="regex"' in template + assert "if dlp_default_engine not in ('regex', 'presidio_endpoint'):" in route_source assert 'name="dlp_regex_rules_json"' in template assert "web_search_dlp_block_on_internal_phrases" not in template assert "Detect internal phrases" not in template @@ -192,16 +199,55 @@ def test_admin_settings_rejects_invalid_dlp_regex_rules_before_update(): assert "return redirect(url_for('admin_settings'))" in source[validate_index:update_index] +def test_presidio_endpoint_settings_are_normalized_without_secret_persistence(): + """Admin POST should persist endpoint metadata but not raw API key values.""" + print("Testing Presidio endpoint metadata persistence...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + assert "'dlp_default_engine': dlp_default_engine" in route_source + assert "'dlp_presidio_analyzer_endpoint': dlp_presidio_analyzer_endpoint" in route_source + assert "'dlp_presidio_auth_header_name': dlp_presidio_auth_header_name" in route_source + assert "'dlp_presidio_auth_secret_env_var': dlp_presidio_auth_secret_env_var" in route_source + assert "'dlp_presidio_timeout_seconds': dlp_presidio_timeout_seconds" in route_source + assert "'dlp_presidio_score_threshold': dlp_presidio_score_threshold" in route_source + assert "'dlp_presidio_language': dlp_presidio_language" in route_source + assert "'dlp_presidio_entities': dlp_presidio_entities" in route_source + assert "for item in dlp_presidio_entities_raw.split(',')" in route_source + assert "item.strip().upper()" in route_source + assert "if not dlp_presidio_entities:" in route_source + assert "dlp_presidio_entities = ['CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'US_SSN']" in route_source + assert "'dlp_presidio_auth_secret'" not in route_source + assert "form_data.get('dlp_presidio_auth_secret'" not in route_source + + +def test_default_settings_include_presidio_endpoint_controls(): + """Default settings should include safe Presidio endpoint defaults.""" + 
print("Testing Presidio endpoint default settings...") + settings_source = read_file_text(FUNCTIONS_SETTINGS_FILE) + + assert "'dlp_default_engine': 'regex'" in settings_source + assert "'dlp_presidio_analyzer_endpoint': ''" in settings_source + assert "'dlp_presidio_auth_header_name': 'X-DLP-API-Key'" in settings_source + assert "'dlp_presidio_auth_secret_env_var': 'PRESIDIO_DLP_API_KEY'" in settings_source + assert "'dlp_presidio_timeout_seconds': 5" in settings_source + assert "'dlp_presidio_score_threshold': 0.5" in settings_source + assert "'dlp_presidio_language': 'en'" in settings_source + assert "'dlp_presidio_entities': [" in settings_source + assert "'dlp_presidio_auth_secret'" not in settings_source + + if __name__ == "__main__": tests = [ test_dlp_admin_post_normalizes_untrusted_form_values, test_dlp_admin_post_persists_normalized_dlp_payload, test_dlp_admin_template_roundtrips_persisted_values, test_dlp_review_destination_stays_unreachable_until_review_flow_exists, - test_admin_dlp_controls_only_expose_supported_regex_engine, + test_admin_dlp_controls_expose_supported_dlp_engines, test_admin_settings_post_validates_csrf_before_dlp_persistence, test_admin_settings_persists_valid_dlp_regex_rules, test_admin_settings_rejects_invalid_dlp_regex_rules_before_update, + test_presidio_endpoint_settings_are_normalized_without_secret_persistence, + test_default_settings_include_presidio_endpoint_controls, ] try: From 8f4803db5c77041c7c26c89e25cf19dbd63aae8f Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Wed, 10 Jun 2026 21:33:58 -0400 Subject: [PATCH 10/20] feat: expose presidio endpoint dlp controls --- .../static/js/admin/admin_settings.js | 15 +++++++ .../single_app/templates/admin_settings.html | 43 ++++++++++++++++++- .../test_dlp_admin_settings_ui.py | 42 +++++++++++++++++- functional_tests/test_dlp_admin_ui_smoke.py | 8 ++++ 4 files changed, 104 insertions(+), 4 deletions(-) diff --git a/application/single_app/static/js/admin/admin_settings.js 
b/application/single_app/static/js/admin/admin_settings.js index 401621ba..23f93dc0 100644 --- a/application/single_app/static/js/admin/admin_settings.js +++ b/application/single_app/static/js/admin/admin_settings.js @@ -8333,6 +8333,8 @@ function initializeDlpSettings() { const enableDlpControlPlane = document.getElementById('enable_dlp_control_plane'); const dlpControlPlaneSettings = document.getElementById('dlp_control_plane_settings'); + const dlpDefaultEngine = document.getElementById('dlp_default_engine'); + const dlpPresidioEndpointSettings = document.getElementById('dlp_presidio_endpoint_settings'); const enableWebSearchDlp = document.getElementById('enable_web_search_dlp'); const webSearchDlpSettings = document.getElementById('web_search_dlp_settings'); const webSearchDlpModeSettings = document.getElementById('web_search_dlp_mode_settings'); @@ -8344,6 +8346,19 @@ function initializeDlpSettings() { togglePanel(enableWebSearchDlp, webSearchDlpModeSettings); togglePanel(enableUploadDlp, uploadDlpModeSettings); + if (dlpDefaultEngine && dlpPresidioEndpointSettings) { + const updateDlpEngineVisibility = () => { + dlpPresidioEndpointSettings.classList.toggle('d-none', dlpDefaultEngine.value !== 'presidio_endpoint'); + }; + dlpDefaultEngine.addEventListener('change', function () { + updateDlpEngineVisibility(); + if (typeof markFormAsModified === 'function') { + markFormAsModified(); + } + }); + updateDlpEngineVisibility(); + } + if (webSearchDlpSettings && enableDlpControlPlane) { webSearchDlpSettings.classList.toggle('d-none', !enableDlpControlPlane.checked); enableDlpControlPlane.addEventListener('change', function () { diff --git a/application/single_app/templates/admin_settings.html b/application/single_app/templates/admin_settings.html index 55cc0a03..42cb2317 100644 --- a/application/single_app/templates/admin_settings.html +++ b/application/single_app/templates/admin_settings.html @@ -7383,9 +7383,10 @@
-
Regex scanning is the only implemented engine in this release.
+
Use regex for lightweight built-in scanning or Presidio for an admin-managed external analyzer.
@@ -7399,6 +7400,44 @@
+
+
Presidio Analyzer Endpoint
+ +
+
+ + +
Use the Presidio Analyzer REST URL that SimpleChat can reach server-side.
+
+
+ + +
+
+ + +
Store the secret value in App Service settings or a Key Vault reference, not in SimpleChat settings.
+
+
+
+
+ + +
+
+ + +
+
+ + +
Comma-separated Presidio entity names such as CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN, PERSON, LOCATION.
+
+
+
+
diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py index 3ca18d09..ba3159ff 100644 --- a/functional_tests/test_dlp_admin_settings_ui.py +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -40,14 +40,28 @@ UNSUPPORTED_ADMIN_CONTROL_IDS = [ + "dlp_presidio_use_service", + "dlp_presidio_service_settings", "dlp_scanner_timeout_seconds", "dlp_review_include_redacted_preview", "web_search_dlp_track_review_events", "upload_dlp_track_review_events", ] +PRESIDIO_ENDPOINT_CONTROL_IDS = [ + "dlp_presidio_endpoint_settings", + "dlp_presidio_analyzer_endpoint", + "dlp_presidio_auth_header_name", + "dlp_presidio_auth_secret_env_var", + "dlp_presidio_timeout_seconds", + "dlp_presidio_score_threshold", + "dlp_presidio_entities", +] + RETIRED_DLP_SETTING_KEYS = [ + "dlp_presidio_use_service", + "dlp_presidio_endpoint", "dlp_scanner_timeout_seconds", "dlp_review_include_redacted_preview", "web_search_dlp_track_review_events", @@ -117,8 +131,11 @@ def test_admin_template_exposes_dlp_controls(): assert 'value="safety_violations"' not in source, ( "Safety Violations destination should stay hidden unless PR1 implements reachable review integration" ) - assert '' in source - assert "Regex scanning is the only implemented engine in this release." 
in source + assert 'value="regex"' in source + assert 'value="presidio_endpoint"' in source + assert "Regex structured identifier scan" in source + assert "External Presidio Analyzer endpoint" in source + assert "Use regex for lightweight built-in scanning" in source assert "Custom Regex Rules" in source assert "{{ dlp_regex_rules_json }}" in source assert "web_search_dlp_block_on_internal_phrases" not in source @@ -130,6 +147,24 @@ def test_admin_template_exposes_dlp_controls(): assert_no_retired_structured_redaction_control(source, ADMIN_TEMPLATE_FILE) +def test_presidio_endpoint_controls_are_rendered_without_secret_value_field(): + """DLP admin UI should configure endpoint metadata but not store raw API keys.""" + print("Testing Presidio endpoint admin controls...") + source = read_file_text(ADMIN_TEMPLATE_FILE) + + for control_id in PRESIDIO_ENDPOINT_CONTROL_IDS: + assert f'id="{control_id}"' in source, f"Missing Presidio endpoint control: {control_id}" + + assert 'name="dlp_presidio_analyzer_endpoint"' in source + assert 'name="dlp_presidio_auth_header_name"' in source + assert 'name="dlp_presidio_auth_secret_env_var"' in source + assert 'name="dlp_presidio_timeout_seconds"' in source + assert 'name="dlp_presidio_score_threshold"' in source + assert 'name="dlp_presidio_entities"' in source + assert 'name="dlp_presidio_auth_secret"' not in source + assert "production endpoints should be private, authenticated, and https" in source.lower() + + def test_admin_js_uses_d_none_for_dlp_toggles(): """New DLP JS should use Bootstrap d-none, not style.display.""" print("Testing DLP admin JavaScript visibility handling...") @@ -138,6 +173,8 @@ def test_admin_js_uses_d_none_for_dlp_toggles(): assert "initializeDlpSettings" in source assert "dlp_control_plane_settings" in source assert "web_search_dlp_settings" in source + assert "dlp_presidio_endpoint_settings" in source + assert "presidio_endpoint" in source assert "classList.toggle('d-none'" in source or 
'classList.toggle("d-none"' in source dlp_section = source[source.find("initializeDlpSettings"):] @@ -180,6 +217,7 @@ def test_admin_template_exposes_regex_rule_editor_without_internal_phrase_toggle test_dlp_defaults_exist_and_are_safe, test_admin_route_persists_dlp_settings, test_admin_template_exposes_dlp_controls, + test_presidio_endpoint_controls_are_rendered_without_secret_value_field, test_admin_js_uses_d_none_for_dlp_toggles, test_admin_settings_form_contains_csrf_token, test_admin_template_exposes_regex_rule_editor_without_internal_phrase_toggle, diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py index 9a4da3a3..812a3e75 100644 --- a/functional_tests/test_dlp_admin_ui_smoke.py +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -24,6 +24,13 @@ REQUIRED_CONTROLS = [ "enable_dlp_control_plane", "dlp_default_engine", + "dlp_presidio_endpoint_settings", + "dlp_presidio_analyzer_endpoint", + "dlp_presidio_auth_header_name", + "dlp_presidio_auth_secret_env_var", + "dlp_presidio_timeout_seconds", + "dlp_presidio_score_threshold", + "dlp_presidio_entities", "dlp_regex_rules_json", "dlp_max_scan_chars", "enable_web_search_dlp", @@ -110,6 +117,7 @@ def test_dlp_admin_preview_does_not_expose_raw_sensitive_values(): forbidden = [ "123-45-6789", "4111 1111 1111 1111", + "dlp_presidio_auth_secret\"", "raw_matches", ] for value in forbidden: From b8738664d77548c55c137a9f3be28f91536d9a70 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Wed, 10 Jun 2026 22:20:33 -0400 Subject: [PATCH 11/20] docs: document external presidio dlp deployment --- .../features/DLP_UPLOAD_STAGING.md | 22 +++++- .../features/DLP_WEB_SEARCH_EGRESS_CONTROL.md | 25 ++++-- docs/how-to/deploy_presidio_dlp.md | 79 +++++++++++++++++++ tools/local_dev/run_dlp_local_stack.md | 23 ++++++ 4 files changed, 141 insertions(+), 8 deletions(-) create mode 100644 docs/how-to/deploy_presidio_dlp.md diff --git a/docs/explanation/features/DLP_UPLOAD_STAGING.md 
b/docs/explanation/features/DLP_UPLOAD_STAGING.md index da8ae2d0..88c23367 100644 --- a/docs/explanation/features/DLP_UPLOAD_STAGING.md +++ b/docs/explanation/features/DLP_UPLOAD_STAGING.md @@ -4,11 +4,11 @@ Version: 0.242.073 -Dependencies: shared DLP core, configurable regex DLP rules, document processing pipeline, Azure AI Search, Azure OpenAI embeddings. +Dependencies: shared DLP core, configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, document processing pipeline, Azure AI Search, Azure OpenAI embeddings. SimpleChat now applies DLP to extracted upload text and selected document metadata before embeddings, Azure AI Search indexing, metadata extraction prompts, Cosmos metadata updates, and file-processing logs. The feature reuses the shared DLP core introduced for web-search egress and applies it to `save_chunks()`, `save_chunks_batch()`, `save_video_chunk()`, and metadata extraction/update paths. -Regex DLP is the implemented engine for this release. The default rules detect U.S. SSNs and Luhn-valid credit card numbers, and administrators can add upload-specific regex rules through the shared `dlp_regex_rules` settings payload. +Regex DLP remains the lightweight default engine. The default rules detect U.S. SSNs and Luhn-valid credit card numbers, and administrators can add upload-specific regex rules through the shared `dlp_regex_rules` settings payload. Administrators can also select `presidio_endpoint` to call an external Presidio Analyzer-compatible endpoint for richer upload text and metadata detection without embedding Presidio in SimpleChat. ## Technical Specifications @@ -22,6 +22,8 @@ Protected processing points: - Document-level DLP metadata preserves the worst observed status and cumulative entity counts across chunk and metadata scans. - Configured regex rules can target upload only, web search only, or both surfaces. 
- Configured rules support keyword proximity confidence shaping, so a regex candidate can require nearby identifiers such as `document`, `employee`, `SSN`, or another admin-defined term before it redacts or blocks. +- The external Presidio Analyzer endpoint path sends extracted text and selected metadata to an administrator-managed analyzer endpoint, receives spans, and normalizes them into the same counts-only DLP result shape used by regex scanning. +- SimpleChat does not embed Presidio packages or run an in-process analyzer. - File-processing logs replace raw chunk logging with safe DLP and text-length summaries. - Enhanced citations are automatically disabled when upload DLP can enforce a block or redaction, including `redact` mode, `block` mode, fail-on-match, and fail-closed scanner errors, because this PR does not generate sanitized binary derivatives for raw source files. @@ -38,12 +40,24 @@ Upload DLP states: Upload controls are available under Admin Settings > Data Loss Prevention: - Enable Upload DLP. +- Choose the default engine: regex structured identifier scan or external Presidio Analyzer endpoint. +- Configure the Presidio Analyzer endpoint, auth header, secret environment variable name, timeout, score threshold, and entities when `presidio_endpoint` is selected. - Upload mode: `monitor`, `redact`, or `block`. - Fail upload on match. - Custom Regex Rules, shared with web-search DLP. Review routing defaults to `none`. Upload review-event writing is not exposed in this release because the DLP review destination is intentionally locked to `none`. +## External Presidio Analyzer Endpoint + +Administrators can select an external Presidio Analyzer-compatible endpoint as the DLP engine by setting the engine to `presidio_endpoint`. 
SimpleChat sends upload text and selected metadata to the endpoint from the server side, receives entity spans, and then performs monitor, redact, or block behavior locally before embeddings, Azure AI Search indexing, metadata extraction prompts, Cosmos metadata updates, and file-processing logs. + +This is Option C for Presidio integration: Presidio runs outside SimpleChat. The SimpleChat application image has no embedded Presidio dependency, model package, or analyzer runtime. Regex DLP remains available as the default and fallback path. + +Production deployments should keep the analyzer private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. + +The analyzer receives raw extracted text before redaction. SimpleChat, proxies, wrappers, analyzer containers, and platform diagnostics must not log raw request bodies, response bodies, chunk text, OCR text, vision text, metadata values, matched values, or analyzer explanations. Stored DLP metadata and telemetry remain counts-only. + ## Telemetry And Logs Upload DLP telemetry uses `log_event(...)` with safe dimensions: @@ -90,7 +104,7 @@ customEvents This PR redacts extracted text and selected metadata before embeddings, search indexing, prompts, and metadata persistence. It does not claim that raw binary artifacts are format-redacted. When upload DLP can enforce a block or redaction, enhanced citations are disabled instead of storing raw source blobs. A future format-aware derivative generation or quarantine workflow is needed to produce sanitized binary copies. -Regex DLP is limited to deterministic structured identifiers and administrator-defined exact-format identifiers. 
It is weaker for names, addresses, contextual PII, international identifiers, and noisy document text. +Regex DLP is limited to deterministic structured identifiers and administrator-defined exact-format identifiers. It is weaker for names, addresses, contextual PII, international identifiers, and noisy document text. Use the external Presidio Analyzer endpoint when richer recognizers are needed and the production analyzer can be kept private, authenticated, and free of raw text logging. ## Testing And Validation @@ -101,6 +115,8 @@ Functional coverage: - `functional_tests/test_upload_dlp_workspace_scopes.py` - `functional_tests/test_upload_dlp_ingestion_integration.py` - `functional_tests/test_dlp_admin_ui_smoke.py` +- `functional_tests/test_dlp_presidio_endpoint.py` +- `functional_tests/test_dlp_presidio_engine_integration.py` - `functional_tests/test_dlp_review_events.py` - `functional_tests/test_dlp_telemetry.py` - Shared PR1 DLP tests remain green. diff --git a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md index fc4bfbf7..ab506fcc 100644 --- a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md +++ b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md @@ -4,7 +4,7 @@ Version: 0.242.073 -Dependencies: Flask chat routes, configurable regex DLP rules, and Azure AI Foundry web-search agent configuration. +Dependencies: Flask chat routes, configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, and Azure AI Foundry web-search agent configuration. SimpleChat now includes an application-level Data Loss Prevention control before web-search grounding. The app evaluates the current user message after `build_web_search_query_text(...)` and before the configured Azure AI Foundry web-search agent is invoked. @@ -16,12 +16,15 @@ The shared DLP core lives in `application/single_app/functions_dlp.py`. 
Configur Implemented behavior: -- Regex DLP is the only implemented engine in this release. +- Regex DLP remains the lightweight default engine. +- Administrators can optionally select `presidio_endpoint` to call an external Presidio Analyzer-compatible endpoint from the server side. - Regex rules are admin-configurable through the `dlp_regex_rules` settings payload. - Default rules detect U.S. SSNs and Luhn-valid credit card numbers. - Rules can target web search, upload, or both. - Rules can use keyword proximity confidence shaping. A regex match can require nearby terms such as `ssn`, `social security`, `card`, or `billing` before it reaches the configured minimum confidence. - Generic internal phrase matching is not hardcoded. Administrators can add organization-specific phrases or identifiers as explicit custom rules. +- The external Presidio Analyzer endpoint path returns spans that SimpleChat normalizes into the same counts-only DLP result shape used by regex scanning. +- SimpleChat does not embed Presidio packages or run an in-process analyzer. - DLP metadata stores entity types and counts only. Raw matched values are not stored in telemetry or review summaries. - Structured DLP telemetry uses `log_event(...)` and reaches Application Insights when `APPLICATIONINSIGHTS_CONNECTION_STRING` is configured. - Scanner errors fail closed by default when `dlp_fail_closed_on_scanner_error` is enabled. @@ -30,7 +33,7 @@ Implemented behavior: Admin settings are added in Admin Settings under Data Loss Prevention: -- Shared DLP enablement, regex engine selection, configurable regex rules, maximum scan characters, scanner fail-closed behavior, telemetry, and review destination. +- Shared DLP enablement, engine selection, configurable regex rules, optional Presidio Analyzer endpoint settings, maximum scan characters, scanner fail-closed behavior, telemetry, and review destination. - Web-search DLP enablement and mode: `monitor`, `redact`, or `block`. 
- Review destination defaults to `none`. Safety Violations review routing is documented as a future integration unless the review surface is expanded with distinct DLP labeling and access rules. @@ -66,7 +69,17 @@ Each rule can define: Confidence shaping lets a regex match become stronger when nearby terms are present. For example, an employee identifier rule can require `EID-123456` plus `employee` within 32 characters before it redacts. -Regex DLP remains deterministic and dependency-light. Richer contextual PII detection for names, addresses, and natural-language identifiers remains future work. +Regex DLP remains deterministic and dependency-light, but it is not equivalent to Presidio. Use the external Presidio Analyzer endpoint when contextual PII detection, such as names, addresses, and natural-language identifiers, is required. + +## External Presidio Analyzer Endpoint + +Administrators can select an external Presidio Analyzer-compatible endpoint as the DLP engine by setting the engine to `presidio_endpoint`. SimpleChat sends the web-search query text to the endpoint from the server side, receives entity spans, and performs redaction or blocking locally using the existing counts-only DLP result shape. + +This is Option C for Presidio integration: the analyzer is external to SimpleChat. SimpleChat keeps no embedded Presidio dependency, model package, or analyzer runtime in the app image. Regex DLP remains available as the default and fallback path. + +Production deployments should keep the analyzer endpoint private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. 
+ +Because the analyzer receives raw text before redaction, SimpleChat, proxies, wrappers, and analyzer infrastructure must not log raw request bodies, response bodies, snippets, matched values, or analyzer explanations. Safe telemetry remains limited to entity types, counts, actions, engines, modes, and scanner status. ## Telemetry @@ -109,7 +122,7 @@ Telemetry retention follows the configured Application Insights workspace. This ## Limitations -Regex DLP is intentionally lightweight. It is useful for structured identifiers such as SSNs, Luhn-valid credit card numbers, and administrator-defined exact-format identifiers, but it is weaker for names, addresses, contextual PII, international identifiers, credential strings, and noisy prose. +Regex DLP is intentionally lightweight. It is useful for structured identifiers such as SSNs, Luhn-valid credit card numbers, and administrator-defined exact-format identifiers, but it is weaker for names, addresses, contextual PII, international identifiers, secrets, and noisy prose. Use the external Presidio Analyzer endpoint when richer recognizers are needed and the production analyzer can be kept private, authenticated, and free of raw text logging. The app-level control cannot inspect Bing's internal grounding query after Foundry receives the request. It reduces egress risk by preventing or redacting sensitive text before the app sends the web-search message to the Foundry agent. 
@@ -122,6 +135,8 @@ Functional coverage: - `functional_tests/test_dlp_telemetry.py` - `functional_tests/test_dlp_admin_settings_ui.py` - `functional_tests/test_dlp_admin_settings_roundtrip.py` +- `functional_tests/test_dlp_presidio_endpoint.py` +- `functional_tests/test_dlp_presidio_engine_integration.py` - `functional_tests/test_dlp_review_events.py` - `functional_tests/test_web_search_dlp_egress.py` - `functional_tests/test_web_search_dlp_route_integration.py` diff --git a/docs/how-to/deploy_presidio_dlp.md b/docs/how-to/deploy_presidio_dlp.md new file mode 100644 index 00000000..f6fe2501 --- /dev/null +++ b/docs/how-to/deploy_presidio_dlp.md @@ -0,0 +1,79 @@ +# Deploy External Presidio DLP + +SimpleChat can use Option C for richer DLP detection: call an external Presidio Analyzer-compatible HTTP endpoint from the server side while keeping Presidio out of the SimpleChat application image. SimpleChat does not embed Presidio packages, models, or recognizers; it sends text to an administrator-managed analyzer endpoint, receives spans, and applies its existing monitor, redact, or block behavior locally. + +## Recommended Production Shape + +Run the Presidio Analyzer-compatible service as sensitive internal infrastructure. The analyzer receives raw text before SimpleChat redacts it, so production deployments need both network and application controls. + +Required controls: + +- Use a private network path between SimpleChat and the analyzer. +- Require an API key header, usually `X-DLP-API-Key`, at a proxy, wrapper, gateway, or service boundary in front of the analyzer. +- Use HTTPS for every non-local endpoint. +- Do not expose a public unauthenticated Presidio Analyzer endpoint. +- Do not log raw request text, response bodies, snippets, or matched values in SimpleChat, the analyzer wrapper, reverse proxies, or platform diagnostics. +- Keep fail-closed scanner behavior enabled for protected upload and web-search paths when policy requires blocking on scanner errors. 
+ +## SimpleChat Settings + +Configure these values in Admin Settings > Data Loss Prevention: + +- Default Engine: `External Presidio Analyzer endpoint` +- Analyzer Endpoint: `https://<analyzer-host>/analyze` +- Auth Header: `X-DLP-API-Key` +- Secret Env Var: `PRESIDIO_DLP_API_KEY` +- Timeout Seconds: `5` +- Score Threshold: `0.5` +- Entities: `CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN` + +SimpleChat stores only the environment variable name in its admin settings, such as `PRESIDIO_DLP_API_KEY`. The API key value itself must live in the SimpleChat App Service application settings or in a Key Vault reference used by that App Service setting. Do not paste raw API key values into SimpleChat admin settings or Cosmos-backed configuration. + +## Local Docker Smoke Test + +Run the stock Presidio Analyzer container locally: + +```bash +docker run --rm -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest +``` + +Configure SimpleChat for a smoke test: + +```text +Default Engine: External Presidio Analyzer endpoint +Analyzer Endpoint: http://localhost:5002/analyze +Auth Header: X-DLP-API-Key +Secret Env Var: PRESIDIO_DLP_API_KEY +Score Threshold: 0.5 +Entities: CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN +``` + +The stock local container does not require an API key. You can leave `PRESIDIO_DLP_API_KEY` unset for this local smoke path, or set it to any placeholder value while testing the SimpleChat configuration surface. Production deployments should add an authenticated proxy, wrapper, or service boundary before enabling the endpoint for protected traffic. + +Test with harmless synthetic content such as `a@example.com`. In `redact` mode, SimpleChat should call the analyzer, receive entity spans, and replace the detected value before web-search egress or upload indexing. In `block` mode, the same finding should prevent the protected action. + +## Separate Azure App Service + +Deploy the Presidio Analyzer-compatible container as a separate Linux Web App for Containers. 
Restrict ingress with private endpoints, virtual network integration, and access restrictions so only the SimpleChat environment can reach it. If the analyzer endpoint is reachable beyond localhost, place an API-key-validating proxy or wrapper in front of it and configure SimpleChat to send the configured auth header. + +Use this shape when you want independent deployment and operational ownership for the analyzer while still running on App Service. Store the API key value as a SimpleChat App Service setting named by the SimpleChat admin setting, for example `PRESIDIO_DLP_API_KEY`, preferably backed by a Key Vault reference. + +## App Service Sidecar + +For deployments using App Service sidecar support, run the analyzer as a sidecar container next to SimpleChat and configure SimpleChat to call the sidecar endpoint over the local or private container network. This keeps Presidio dependencies out of the SimpleChat image while scaling the analyzer with the SimpleChat App Service instance count. + +Even with a sidecar, avoid raw text logging and keep the analyzer endpoint unreachable from the public internet. If the sidecar is fronted by a local wrapper, validate the `X-DLP-API-Key` or equivalent header there. + +## Azure Container Apps + +For independent scaling, deploy the analyzer as an internal Azure Container Apps service. Configure SimpleChat to reach the internal ingress URL over private networking and require the API key header at the Container Apps ingress, gateway, or wrapper service. + +This shape works well when analyzer CPU or model requirements scale differently from SimpleChat. Store the API key value in the SimpleChat App Service setting or Key Vault reference named by SimpleChat's `Secret Env Var` setting, not in the SimpleChat admin configuration. + +## Security Notes + +The analyzer receives raw user text, extracted document text, and selected metadata before SimpleChat applies redaction. 
Treat the endpoint as sensitive infrastructure with the same care as an internal document-processing service. + +Do not log raw request bodies, response bodies, matched values, or analyzer explanations. SimpleChat's DLP telemetry and stored metadata should remain counts-only. If you add a gateway, proxy, or wrapper around Presidio Analyzer, disable body logging and scrub diagnostics before sending them to centralized logs. + +Use `presidio_endpoint` only when the endpoint is private, authenticated, and operated by the same trust boundary that is allowed to process the source text. Keep regex DLP as the lightweight default and fallback path when the external analyzer is not configured. diff --git a/tools/local_dev/run_dlp_local_stack.md b/tools/local_dev/run_dlp_local_stack.md index a75c3b21..8bb45877 100644 --- a/tools/local_dev/run_dlp_local_stack.md +++ b/tools/local_dev/run_dlp_local_stack.md @@ -109,3 +109,26 @@ The script writes: - If Docker Desktop stops, the Flask process can keep serving cached pages while Cosmos requests fail. - If another tool owns port `8081`, use `9081` and pass `--gateway-endpoint localhost:9081`. - Keep `.codex-local/` untracked; it is for local smoke artifacts only. + +## Optional Presidio Analyzer Smoke + +SimpleChat can test the external `presidio_endpoint` engine against the stock local Presidio Analyzer container without adding Presidio dependencies to the SimpleChat app image. + +Run a local Presidio Analyzer container: + +```bash +docker run --rm -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest +``` + +Configure DLP Admin Settings: + +- Default Engine: External Presidio Analyzer endpoint +- Analyzer Endpoint: `http://localhost:5002/analyze` +- Auth Header: `X-DLP-API-Key` +- Secret Env Var: `PRESIDIO_DLP_API_KEY` +- Score Threshold: `0.5` +- Entities: `CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN` + +The stock local container does not require an API key. 
Production deployments should require a private network path plus an API key header at a proxy, wrapper, gateway, or service boundary. The API key value should live in App Service settings or a Key Vault reference; SimpleChat admin settings store only the environment variable name, such as `PRESIDIO_DLP_API_KEY`. + +Then test a web-search or upload input containing harmless synthetic content such as `a@example.com`. In redact mode, SimpleChat should use the spans returned by the Presidio Analyzer and replace the value before egress or indexing. Do not enable raw text logging for SimpleChat, proxies, or analyzer containers while testing. From e88baa23738b0b5a62a48d5be29d184ad32d341f Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Wed, 10 Jun 2026 22:31:01 -0400 Subject: [PATCH 12/20] docs: release note presidio endpoint dlp --- application/single_app/config.py | 2 +- docs/explanation/release_notes.md | 13 ++++++++++++- .../test_dlp_admin_settings_roundtrip.py | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/application/single_app/config.py b/application/single_app/config.py index cd58865b..e917a776 100644 --- a/application/single_app/config.py +++ b/application/single_app/config.py @@ -95,7 +95,7 @@ EXECUTOR_TYPE = 'thread' EXECUTOR_MAX_WORKERS = 30 SESSION_TYPE = 'filesystem' -VERSION = "0.242.073" +VERSION = "0.242.074" SESSION_COOKIE_SAMESITE = os.getenv('SESSION_COOKIE_SAMESITE', 'Lax') SESSION_COOKIE_HTTPONLY = os.getenv('SESSION_COOKIE_HTTPONLY', 'true').lower() != 'false' diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index f2879eb9..ecc8c81c 100644 --- a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -1,9 +1,20 @@ -This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.073, and the per-version entries continue immediately after it. 
+This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.074, and the per-version entries continue immediately after it. For feature-focused and fix-focused drill-downs by version, see [Features by Version](/explanation/features/) and [Fixes by Version](/explanation/fixes/). +### **(v0.242.074)** + +#### New Features + +* **External Presidio DLP Endpoint** + * Added optional support for a Presidio Analyzer-compatible endpoint as an advanced DLP engine without embedding Presidio packages in the SimpleChat app image. + * Added server-side endpoint calls with configurable timeout, score threshold, entity allowlist, and API-key header name sourced from an environment variable. + * Reused existing DLP monitor, redact, block, counts-only telemetry, upload, and web-search enforcement behavior. + * Documented local Docker smoke testing and production private-network plus API-key deployment patterns. 
+ * (Ref: external Presidio DLP endpoint, Admin Settings DLP controls, Presidio deployment how-to) + ### **(v0.242.073)** #### New Features diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index 509034fa..2b35391a 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -55,7 +55,6 @@ UNSUPPORTED_DLP_FORM_FIELDS = [ "dlp_presidio_use_service", - "dlp_presidio_endpoint", "dlp_scanner_timeout_seconds", "dlp_review_include_redacted_preview", "web_search_dlp_track_review_events", @@ -126,6 +125,8 @@ def test_dlp_admin_template_roundtrips_persisted_values(): for field_name in UNSUPPORTED_DLP_FORM_FIELDS: assert field_name not in template_source, f"Unsupported DLP control still rendered: {field_name}" + assert 'id="dlp_presidio_endpoint"' not in template_source + assert 'name="dlp_presidio_endpoint"' not in template_source assert_no_retired_structured_redaction_control(template_source, ADMIN_TEMPLATE_FILE) From ab303bcfe9afdee3c9ffa8d2bd1a689b7117d53b Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 11:43:03 -0400 Subject: [PATCH 13/20] fix: adapt presidio endpoint dlp to current development --- application/single_app/functions_dlp.py | 2 +- application/single_app/templates/admin_settings.html | 6 +++++- functional_tests/test_dlp_admin_settings_roundtrip.py | 2 +- functional_tests/test_dlp_admin_settings_ui.py | 4 +++- functional_tests/test_dlp_admin_ui_smoke.py | 3 ++- functional_tests/test_dlp_control_plane.py | 2 +- functional_tests/test_dlp_presidio_endpoint.py | 4 ++-- functional_tests/test_dlp_presidio_engine_integration.py | 4 ++-- functional_tests/test_dlp_regex_rules.py | 2 +- functional_tests/test_upload_dlp_ingestion_integration.py | 2 +- functional_tests/test_upload_dlp_redaction.py | 2 +- functional_tests/test_web_search_dlp_egress.py | 2 +- 
functional_tests/test_web_search_dlp_route_integration.py | 2 +- 13 files changed, 22 insertions(+), 15 deletions(-) diff --git a/application/single_app/functions_dlp.py b/application/single_app/functions_dlp.py index 7266dbd2..2497fa18 100644 --- a/application/single_app/functions_dlp.py +++ b/application/single_app/functions_dlp.py @@ -89,7 +89,7 @@ def _apply_regex_engine(text, settings=None, surface="generic"): def _apply_presidio_endpoint_engine(text, settings=None, surface="generic"): recognizer_results = analyze_with_presidio_endpoint(text, settings or {}) - normalized = normalize_presidio_results( + normalized = normalize_external_analyzer_results( text, recognizer_results, mode=_normalize_mode(settings or {}, surface), diff --git a/application/single_app/templates/admin_settings.html b/application/single_app/templates/admin_settings.html index 42cb2317..34f7aa98 100644 --- a/application/single_app/templates/admin_settings.html +++ b/application/single_app/templates/admin_settings.html @@ -7430,7 +7430,11 @@
Presidio Analyzer Endpoint
-
+
+ + +
+
Comma-separated Presidio entity names such as CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN, PERSON, LOCATION.
diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index 2b35391a..e80b32d7 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP admin settings roundtrip. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures DLP admin settings are normalized, persisted, and rendered diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py index ba3159ff..c520537e 100644 --- a/functional_tests/test_dlp_admin_settings_ui.py +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP admin settings UI. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures shared and web-search DLP defaults exist, admin settings @@ -55,6 +55,7 @@ "dlp_presidio_auth_secret_env_var", "dlp_presidio_timeout_seconds", "dlp_presidio_score_threshold", + "dlp_presidio_language", "dlp_presidio_entities", ] @@ -160,6 +161,7 @@ def test_presidio_endpoint_controls_are_rendered_without_secret_value_field(): assert 'name="dlp_presidio_auth_secret_env_var"' in source assert 'name="dlp_presidio_timeout_seconds"' in source assert 'name="dlp_presidio_score_threshold"' in source + assert 'name="dlp_presidio_language"' in source assert 'name="dlp_presidio_entities"' in source assert 'name="dlp_presidio_auth_secret"' not in source assert "production endpoints should be private, authenticated, and https" in source.lower() diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py index 812a3e75..a0a7f2d8 100644 --- a/functional_tests/test_dlp_admin_ui_smoke.py +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP admin UI smoke. 
-Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures the DLP admin settings card can be extracted into collapsed @@ -30,6 +30,7 @@ "dlp_presidio_auth_secret_env_var", "dlp_presidio_timeout_seconds", "dlp_presidio_score_threshold", + "dlp_presidio_language", "dlp_presidio_entities", "dlp_regex_rules_json", "dlp_max_scan_chars", diff --git a/functional_tests/test_dlp_control_plane.py b/functional_tests/test_dlp_control_plane.py index e55c73a1..59e14db7 100644 --- a/functional_tests/test_dlp_control_plane.py +++ b/functional_tests/test_dlp_control_plane.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP control plane core behavior. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures the shared DLP core supports disabled, regex, Luhn-validated diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index df3ac23a..460e7f25 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for external Presidio endpoint DLP adapter. -Version: 0.242.044 -Implemented in: 0.242.044 +Version: 0.242.071 +Implemented in: 0.242.071 This test ensures SimpleChat can call a configured Presidio-compatible analyzer endpoint without embedding Presidio packages or leaking raw scanned text. diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py index 464c57ff..93a340f3 100644 --- a/functional_tests/test_dlp_presidio_engine_integration.py +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -2,8 +2,8 @@ #!/usr/bin/env python3 """ Functional test for Presidio endpoint engine integration. 
-Version: 0.242.044 -Implemented in: 0.242.044 +Version: 0.242.071 +Implemented in: 0.242.071 This test ensures the external Presidio endpoint engine reuses SimpleChat's existing DLP decision, redaction, and fail-closed behavior. diff --git a/functional_tests/test_dlp_regex_rules.py b/functional_tests/test_dlp_regex_rules.py index d0207b63..7552a62c 100644 --- a/functional_tests/test_dlp_regex_rules.py +++ b/functional_tests/test_dlp_regex_rules.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for configurable DLP regex rules. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures DLP regex rules are admin-configurable, validated, diff --git a/functional_tests/test_upload_dlp_ingestion_integration.py b/functional_tests/test_upload_dlp_ingestion_integration.py index 869f8fff..bda8d0dc 100644 --- a/functional_tests/test_upload_dlp_ingestion_integration.py +++ b/functional_tests/test_upload_dlp_ingestion_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for upload DLP ingestion integration. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures upload DLP blocks stop before embeddings/search indexing and diff --git a/functional_tests/test_upload_dlp_redaction.py b/functional_tests/test_upload_dlp_redaction.py index 4c24a261..992ec467 100644 --- a/functional_tests/test_upload_dlp_redaction.py +++ b/functional_tests/test_upload_dlp_redaction.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for upload DLP redaction. 
-Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures upload DLP redacts chunk text before embeddings and Azure AI diff --git a/functional_tests/test_web_search_dlp_egress.py b/functional_tests/test_web_search_dlp_egress.py index 1fb94248..82ddacda 100644 --- a/functional_tests/test_web_search_dlp_egress.py +++ b/functional_tests/test_web_search_dlp_egress.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for web-search DLP egress. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures web-search DLP runs after current-message query construction diff --git a/functional_tests/test_web_search_dlp_route_integration.py b/functional_tests/test_web_search_dlp_route_integration.py index 7a491cd9..fa1c673e 100644 --- a/functional_tests/test_web_search_dlp_route_integration.py +++ b/functional_tests/test_web_search_dlp_route_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for web-search DLP route integration. 
-Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures chat routes evaluate DLP before Foundry web search, suppress From 5a26778a7736f7505d8354566e54f21c730e39b0 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 11:59:47 -0400 Subject: [PATCH 14/20] fix: harden presidio endpoint configuration --- .../single_app/functions_dlp_presidio.py | 148 ++++++++++++++- application/single_app/functions_settings.py | 1 + .../route_frontend_admin_settings.py | 55 +++++- .../single_app/templates/admin_settings.html | 15 +- docs/explanation/release_notes.md | 9 + docs/how-to/deploy_presidio_dlp.md | 12 +- .../test_dlp_admin_settings_roundtrip.py | 6 + .../test_dlp_admin_settings_ui.py | 5 +- functional_tests/test_dlp_admin_ui_smoke.py | 1 + .../test_dlp_presidio_endpoint.py | 175 ++++++++++++++++-- .../test_dlp_presidio_engine_integration.py | 3 +- 11 files changed, 394 insertions(+), 36 deletions(-) diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py index cb881361..382a8ec6 100644 --- a/application/single_app/functions_dlp_presidio.py +++ b/application/single_app/functions_dlp_presidio.py @@ -2,8 +2,10 @@ """HTTP adapter for Presidio-compatible Analyzer endpoints.""" +import ipaddress import os -from urllib.parse import urlparse +import re +from urllib.parse import parse_qsl, urlparse import requests @@ -12,6 +14,28 @@ DEFAULT_PRESIDIO_LANGUAGE = "en" DEFAULT_PRESIDIO_SCORE_THRESHOLD = 0.5 DEFAULT_PRESIDIO_AUTH_HEADER_NAME = "X-DLP-API-Key" +DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR = "PRESIDIO_DLP_API_KEY" +PRESIDIO_AUTH_SECRET_ENV_VAR_PREFIX = "DLP_PRESIDIO_" +PRESIDIO_CREDENTIAL_QUERY_NAMES = { + "key", + "api_key", + "apikey", + "secret", + "token", + "password", + "connection", + "sig", +} +PRESIDIO_PRIVATE_HOST_SUFFIXES = ( + ".internal", + ".local", + ".localdomain", + ".lan", + ".home", + ".corp", +) +PRESIDIO_LOCAL_HOSTS = {"localhost"} 
+PRESIDIO_SECRET_ENV_VAR_PATTERN = re.compile(r"^[A-Z][A-Z0-9_]*$") class PresidioEndpointConfigurationError(ValueError): @@ -22,7 +46,83 @@ class PresidioEndpointRequestError(RuntimeError): """Raised when the Presidio endpoint cannot return a usable analyzer result.""" -def validate_presidio_endpoint_url(endpoint_url): +def _normalize_host_identifier(host): + normalized = str(host or "").strip().lower().strip(".") + if normalized.startswith("[") and "]" in normalized: + normalized = normalized[1:normalized.index("]")] + if "://" in normalized: + normalized = (urlparse(normalized).hostname or "").strip().lower().strip(".") + return normalized + + +def normalize_presidio_allowed_private_hosts(value): + """Normalize the admin allowlist for private Presidio endpoint hosts.""" + if isinstance(value, (list, tuple, set)): + raw_items = value + else: + raw_items = re.split(r"[\n,]+", str(value or "")) + + normalized_hosts = [] + seen_hosts = set() + for item in raw_items: + host = _normalize_host_identifier(item) + if not host or host in seen_hosts: + continue + normalized_hosts.append(host) + seen_hosts.add(host) + return ", ".join(normalized_hosts) + + +def _get_allowed_private_hosts(allowed_private_hosts): + normalized_allowlist = normalize_presidio_allowed_private_hosts(allowed_private_hosts) + if not normalized_allowlist: + return set() + return { + item.strip() + for item in normalized_allowlist.split(",") + if item.strip() + } + + +def _is_private_presidio_host(host): + normalized_host = _normalize_host_identifier(host) + if not normalized_host: + return True + if normalized_host in PRESIDIO_LOCAL_HOSTS or normalized_host.endswith(".localhost"): + return True + try: + ip_address = ipaddress.ip_address(normalized_host) + return not ip_address.is_global + except ValueError: + return normalized_host.endswith(PRESIDIO_PRIVATE_HOST_SUFFIXES) + + +def _is_loopback_presidio_host(host): + normalized_host = _normalize_host_identifier(host) + if normalized_host in 
PRESIDIO_LOCAL_HOSTS or normalized_host.endswith(".localhost"): + return True + try: + return ipaddress.ip_address(normalized_host).is_loopback + except ValueError: + return False + + +def normalize_presidio_secret_env_var_name(secret_env_var): + """Return an allowed Presidio secret env var name, or blank when invalid.""" + normalized = str(secret_env_var or "").strip() + if not normalized: + return "" + if normalized == DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR: + return normalized + if ( + normalized.startswith(PRESIDIO_AUTH_SECRET_ENV_VAR_PREFIX) + and PRESIDIO_SECRET_ENV_VAR_PATTERN.fullmatch(normalized) + ): + return normalized + return "" + + +def validate_presidio_endpoint_url(endpoint_url, allowed_private_hosts=None): """Validate and normalize a Presidio Analyzer endpoint URL.""" normalized = str(endpoint_url or "").strip() if not normalized: @@ -32,9 +132,23 @@ def validate_presidio_endpoint_url(endpoint_url): host = (parsed.hostname or "").lower() if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must be an absolute HTTP(S) URL.") + if parsed.username or parsed.password or "@" in parsed.netloc: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint URL must not include userinfo.") + if parsed.fragment: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint URL must not include a fragment.") + for query_name, _ in parse_qsl(parsed.query, keep_blank_values=True): + if query_name.strip().lower() in PRESIDIO_CREDENTIAL_QUERY_NAMES: + raise PresidioEndpointConfigurationError( + "Presidio analyzer endpoint URL must not include credential-like query parameters." 
+ ) - local_hosts = {"localhost", "127.0.0.1", "::1"} - if parsed.scheme == "http" and host not in local_hosts: + host_is_private = _is_private_presidio_host(host) + allowed_hosts = _get_allowed_private_hosts(allowed_private_hosts) + if host_is_private and _normalize_host_identifier(host) not in allowed_hosts: + raise PresidioEndpointConfigurationError( + "Private Presidio analyzer endpoint hosts must be listed in the private host allowlist." + ) + if parsed.scheme == "http" and not _is_loopback_presidio_host(host): raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must use HTTPS unless it is localhost.") return normalized @@ -65,7 +179,9 @@ def _get_entities(settings): def _get_auth_headers(settings): header_name = str((settings or {}).get("dlp_presidio_auth_header_name") or DEFAULT_PRESIDIO_AUTH_HEADER_NAME).strip() - secret_env_var = str((settings or {}).get("dlp_presidio_auth_secret_env_var") or "").strip() + secret_env_var = normalize_presidio_secret_env_var_name( + (settings or {}).get("dlp_presidio_auth_secret_env_var") or "" + ) if not header_name or not secret_env_var: return {} @@ -94,7 +210,10 @@ def _normalize_result_item(item): def analyze_with_presidio_endpoint(text, settings): """Call a configured Presidio Analyzer endpoint and return recognizer results.""" settings = settings or {} - endpoint_url = validate_presidio_endpoint_url(settings.get("dlp_presidio_analyzer_endpoint")) + endpoint_url = validate_presidio_endpoint_url( + settings.get("dlp_presidio_analyzer_endpoint"), + settings.get("dlp_presidio_allowed_private_hosts"), + ) timeout_seconds = max( 1, min(30, _safe_int(settings.get("dlp_presidio_timeout_seconds"), DEFAULT_PRESIDIO_TIMEOUT_SECONDS)), @@ -117,9 +236,20 @@ def analyze_with_presidio_endpoint(text, settings): request_error_type = None try: - response = requests.post(endpoint_url, json=payload, headers=headers, timeout=timeout_seconds) - response.raise_for_status() - body = response.json() + response = 
requests.post( + endpoint_url, + json=payload, + headers=headers, + timeout=timeout_seconds, + allow_redirects=False, + ) + status_code = getattr(response, "status_code", None) + if isinstance(status_code, int) and 300 <= status_code < 400: + request_error_type = "RedirectResponse" + body = None + else: + response.raise_for_status() + body = response.json() except Exception as exc: request_error_type = type(exc).__name__ diff --git a/application/single_app/functions_settings.py b/application/single_app/functions_settings.py index 1513d300..cf81994e 100644 --- a/application/single_app/functions_settings.py +++ b/application/single_app/functions_settings.py @@ -1073,6 +1073,7 @@ def get_settings(use_cosmos=False, include_source=False): 'dlp_telemetry_sample_allow_events': False, 'dlp_review_destination': 'none', 'dlp_presidio_analyzer_endpoint': '', + 'dlp_presidio_allowed_private_hosts': '', 'dlp_presidio_auth_header_name': 'X-DLP-API-Key', 'dlp_presidio_auth_secret_env_var': 'PRESIDIO_DLP_API_KEY', 'dlp_presidio_timeout_seconds': 5, diff --git a/application/single_app/route_frontend_admin_settings.py b/application/single_app/route_frontend_admin_settings.py index e1520f06..40969562 100644 --- a/application/single_app/route_frontend_admin_settings.py +++ b/application/single_app/route_frontend_admin_settings.py @@ -26,6 +26,13 @@ from functions_logging import * from functions_document_actions import normalize_document_action_capabilities from functions_dlp_rules import get_default_dlp_regex_rules, validate_dlp_regex_rules +from functions_dlp_presidio import ( + PresidioEndpointConfigurationError, + DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR, + normalize_presidio_allowed_private_hosts, + normalize_presidio_secret_env_var_name, + validate_presidio_endpoint_url, +) from swagger_wrapper import swagger_route, get_auth_security from datetime import datetime, timedelta, timezone from admin_settings_int_utils import safe_int_with_source @@ -786,18 +793,57 @@ def 
parse_admin_int(raw_value, fallback_value, field_name="unknown", hard_defaul dlp_default_engine = form_data.get('dlp_default_engine', settings.get('dlp_default_engine', 'regex')) if dlp_default_engine not in ('regex', 'presidio_endpoint'): dlp_default_engine = 'regex' - dlp_presidio_analyzer_endpoint = form_data.get( + dlp_presidio_allowed_private_hosts = normalize_presidio_allowed_private_hosts( + form_data.get( + 'dlp_presidio_allowed_private_hosts', + settings.get('dlp_presidio_allowed_private_hosts', '') + ) + ) + submitted_dlp_presidio_analyzer_endpoint = form_data.get( 'dlp_presidio_analyzer_endpoint', settings.get('dlp_presidio_analyzer_endpoint', '') ).strip() + dlp_presidio_analyzer_endpoint = submitted_dlp_presidio_analyzer_endpoint + if dlp_presidio_analyzer_endpoint: + try: + validate_presidio_endpoint_url( + dlp_presidio_analyzer_endpoint, + dlp_presidio_allowed_private_hosts, + ) + except PresidioEndpointConfigurationError as exc: + existing_dlp_presidio_analyzer_endpoint = str( + settings.get('dlp_presidio_analyzer_endpoint', '') + ).strip() + dlp_presidio_analyzer_endpoint = '' + if existing_dlp_presidio_analyzer_endpoint: + try: + validate_presidio_endpoint_url( + existing_dlp_presidio_analyzer_endpoint, + dlp_presidio_allowed_private_hosts, + ) + dlp_presidio_analyzer_endpoint = existing_dlp_presidio_analyzer_endpoint + except PresidioEndpointConfigurationError: + dlp_presidio_analyzer_endpoint = '' + flash(f"Presidio analyzer endpoint was not saved: {exc}", "warning") dlp_presidio_auth_header_name = form_data.get( 'dlp_presidio_auth_header_name', settings.get('dlp_presidio_auth_header_name', 'X-DLP-API-Key') ).strip() or 'X-DLP-API-Key' - dlp_presidio_auth_secret_env_var = form_data.get( + submitted_dlp_presidio_auth_secret_env_var = form_data.get( 'dlp_presidio_auth_secret_env_var', - settings.get('dlp_presidio_auth_secret_env_var', 'PRESIDIO_DLP_API_KEY') - ).strip() or 'PRESIDIO_DLP_API_KEY' + settings.get('dlp_presidio_auth_secret_env_var', 
DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR) + ).strip() + dlp_presidio_auth_secret_env_var = normalize_presidio_secret_env_var_name( + submitted_dlp_presidio_auth_secret_env_var + ) + if submitted_dlp_presidio_auth_secret_env_var and not dlp_presidio_auth_secret_env_var: + dlp_presidio_auth_secret_env_var = normalize_presidio_secret_env_var_name( + settings.get('dlp_presidio_auth_secret_env_var', DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR) + ) + flash( + "Presidio auth secret env var was not saved. Use PRESIDIO_DLP_API_KEY or a DLP_PRESIDIO_ name.", + "warning" + ) dlp_presidio_timeout_seconds, _ = safe_int_with_source( form_data.get('dlp_presidio_timeout_seconds'), settings.get('dlp_presidio_timeout_seconds', 5), @@ -2083,6 +2129,7 @@ def is_valid_url(url): 'dlp_telemetry_sample_allow_events': form_data.get('dlp_telemetry_sample_allow_events') == 'on', 'dlp_review_destination': dlp_review_destination, 'dlp_presidio_analyzer_endpoint': dlp_presidio_analyzer_endpoint, + 'dlp_presidio_allowed_private_hosts': dlp_presidio_allowed_private_hosts, 'dlp_presidio_auth_header_name': dlp_presidio_auth_header_name, 'dlp_presidio_auth_secret_env_var': dlp_presidio_auth_secret_env_var, 'dlp_presidio_timeout_seconds': dlp_presidio_timeout_seconds, diff --git a/application/single_app/templates/admin_settings.html b/application/single_app/templates/admin_settings.html index 34f7aa98..a12e79c4 100644 --- a/application/single_app/templates/admin_settings.html +++ b/application/single_app/templates/admin_settings.html @@ -7403,14 +7403,21 @@
Presidio Analyzer Endpoint
-
Use the Presidio Analyzer REST URL that SimpleChat can reach server-side.
+
Do not include credentials, fragments, or API keys in the URL.
+
+ + +
Comma- or newline-separated hostnames/IPs for private, loopback, or link-local Presidio endpoints.
+
+
+
@@ -7418,10 +7425,8 @@
Presidio Analyzer Endpoint
-
Store the secret value in App Service settings or a Key Vault reference, not in SimpleChat settings.
+
Use PRESIDIO_DLP_API_KEY or a DLP_PRESIDIO_ env var. Store the value in App Service settings or Key Vault.
-
-
diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index ecc8c81c..a43ed7b9 100644 --- a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -6,6 +6,15 @@ For feature-focused and fix-focused drill-downs by version, see [Features by Ver ### **(v0.242.074)** +#### Bug Fixes + +* **Presidio Endpoint Configuration Hardening** + * Rejects Presidio Analyzer endpoint URLs with userinfo, fragments, credential-like query parameters, or private/link-local/loopback hosts that are not explicitly allowlisted. + * Adds an admin-configurable `Allowed Private Hosts` allowlist for private Presidio deployments and preserves only validated endpoint settings. + * Disables redirects for Presidio analyzer calls and treats redirect responses as scanner errors under the existing fail-open/fail-closed policy. + * Restricts Presidio auth secret environment variable names to blank, `PRESIDIO_DLP_API_KEY`, or the `DLP_PRESIDIO_` namespace. + * (Ref: Presidio endpoint URL validation, Admin Settings DLP controls, Presidio deployment how-to) + #### New Features * **External Presidio DLP Endpoint** diff --git a/docs/how-to/deploy_presidio_dlp.md b/docs/how-to/deploy_presidio_dlp.md index f6fe2501..22bfb77f 100644 --- a/docs/how-to/deploy_presidio_dlp.md +++ b/docs/how-to/deploy_presidio_dlp.md @@ -21,13 +21,16 @@ Configure these values in Admin Settings > Data Loss Prevention: - Default Engine: `External Presidio Analyzer endpoint` - Analyzer Endpoint: `https:///analyze` +- Allowed Private Hosts: `` - Auth Header: `X-DLP-API-Key` - Secret Env Var: `PRESIDIO_DLP_API_KEY` - Timeout Seconds: `5` - Score Threshold: `0.5` - Entities: `CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN` -SimpleChat stores only the environment variable name in its admin settings, such as `PRESIDIO_DLP_API_KEY`. The API key value itself must live in the SimpleChat App Service application settings or in a Key Vault reference used by that App Service setting. 
Do not paste raw API key values into SimpleChat admin settings or Cosmos-backed configuration. +SimpleChat stores only the environment variable name in its admin settings, such as `PRESIDIO_DLP_API_KEY`. The API key value itself must live in the SimpleChat App Service application settings or in a Key Vault reference used by that App Service setting. Do not paste raw API key values into SimpleChat admin settings or Cosmos-backed configuration. Secret environment variable names are intentionally limited to blank, `PRESIDIO_DLP_API_KEY`, or names beginning with `DLP_PRESIDIO_`. + +Endpoint URLs must use strict URL hygiene. Do not include usernames, passwords, fragments, or credential-like query parameters such as `key`, `api_key`, `secret`, `token`, `password`, `connection`, or `sig`. Public HTTPS endpoints are accepted after these checks. Private, loopback, link-local, or internal-style hosts must also appear in `Allowed Private Hosts` as comma- or newline-separated hostnames or IP addresses. SimpleChat disables HTTP redirects when calling the analyzer and treats redirect responses as analyzer errors. ## Local Docker Smoke Test @@ -42,6 +45,7 @@ Configure SimpleChat for a smoke test: ```text Default Engine: External Presidio Analyzer endpoint Analyzer Endpoint: http://localhost:5002/analyze +Allowed Private Hosts: localhost Auth Header: X-DLP-API-Key Secret Env Var: PRESIDIO_DLP_API_KEY Score Threshold: 0.5 @@ -54,19 +58,19 @@ Test with harmless synthetic content such as `a@example.com`. In `redact` mode, ## Separate Azure App Service -Deploy the Presidio Analyzer-compatible container as a separate Linux Web App for Containers. Restrict ingress with private endpoints, virtual network integration, and access restrictions so only the SimpleChat environment can reach it. If the analyzer endpoint is reachable beyond localhost, place an API-key-validating proxy or wrapper in front of it and configure SimpleChat to send the configured auth header. 
+Deploy the Presidio Analyzer-compatible container as a separate Linux Web App for Containers. Restrict ingress with private endpoints, virtual network integration, and access restrictions so only the SimpleChat environment can reach it. Add the analyzer hostname or private IP to SimpleChat's `Allowed Private Hosts` setting. If the analyzer endpoint is reachable beyond localhost, place an API-key-validating proxy or wrapper in front of it and configure SimpleChat to send the configured auth header. Use this shape when you want independent deployment and operational ownership for the analyzer while still running on App Service. Store the API key value as a SimpleChat App Service setting named by the SimpleChat admin setting, for example `PRESIDIO_DLP_API_KEY`, preferably backed by a Key Vault reference. ## App Service Sidecar -For deployments using App Service sidecar support, run the analyzer as a sidecar container next to SimpleChat and configure SimpleChat to call the sidecar endpoint over the local or private container network. This keeps Presidio dependencies out of the SimpleChat image while scaling the analyzer with the SimpleChat App Service instance count. +For deployments using App Service sidecar support, run the analyzer as a sidecar container next to SimpleChat and configure SimpleChat to call the sidecar endpoint over the local or private container network. Add the sidecar hostname, loopback host, or private IP to `Allowed Private Hosts`. This keeps Presidio dependencies out of the SimpleChat image while scaling the analyzer with the SimpleChat App Service instance count. Even with a sidecar, avoid raw text logging and keep the analyzer endpoint unreachable from the public internet. If the sidecar is fronted by a local wrapper, validate the `X-DLP-API-Key` or equivalent header there. ## Azure Container Apps -For independent scaling, deploy the analyzer as an internal Azure Container Apps service. 
Configure SimpleChat to reach the internal ingress URL over private networking and require the API key header at the Container Apps ingress, gateway, or wrapper service. +For independent scaling, deploy the analyzer as an internal Azure Container Apps service. Configure SimpleChat to reach the internal ingress URL over private networking, add that internal host to `Allowed Private Hosts`, and require the API key header at the Container Apps ingress, gateway, or wrapper service. This shape works well when analyzer CPU or model requirements scale differently from SimpleChat. Store the API key value in the SimpleChat App Service setting or Key Vault reference named by SimpleChat's `Secret Env Var` setting, not in the SimpleChat admin configuration. diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index e80b32d7..032de152 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -207,6 +207,7 @@ def test_presidio_endpoint_settings_are_normalized_without_secret_persistence(): assert "'dlp_default_engine': dlp_default_engine" in route_source assert "'dlp_presidio_analyzer_endpoint': dlp_presidio_analyzer_endpoint" in route_source + assert "'dlp_presidio_allowed_private_hosts': dlp_presidio_allowed_private_hosts" in route_source assert "'dlp_presidio_auth_header_name': dlp_presidio_auth_header_name" in route_source assert "'dlp_presidio_auth_secret_env_var': dlp_presidio_auth_secret_env_var" in route_source assert "'dlp_presidio_timeout_seconds': dlp_presidio_timeout_seconds" in route_source @@ -217,6 +218,10 @@ def test_presidio_endpoint_settings_are_normalized_without_secret_persistence(): assert "item.strip().upper()" in route_source assert "if not dlp_presidio_entities:" in route_source assert "dlp_presidio_entities = ['CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'US_SSN']" in route_source + assert 
"validate_presidio_endpoint_url(" in route_source + assert "normalize_presidio_allowed_private_hosts(" in route_source + assert "normalize_presidio_secret_env_var_name(" in route_source + assert "settings.get('dlp_presidio_analyzer_endpoint', '')" in route_source assert "'dlp_presidio_auth_secret'" not in route_source assert "form_data.get('dlp_presidio_auth_secret'" not in route_source @@ -228,6 +233,7 @@ def test_default_settings_include_presidio_endpoint_controls(): assert "'dlp_default_engine': 'regex'" in settings_source assert "'dlp_presidio_analyzer_endpoint': ''" in settings_source + assert "'dlp_presidio_allowed_private_hosts': ''" in settings_source assert "'dlp_presidio_auth_header_name': 'X-DLP-API-Key'" in settings_source assert "'dlp_presidio_auth_secret_env_var': 'PRESIDIO_DLP_API_KEY'" in settings_source assert "'dlp_presidio_timeout_seconds': 5" in settings_source diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py index c520537e..719a0b8a 100644 --- a/functional_tests/test_dlp_admin_settings_ui.py +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -51,6 +51,7 @@ PRESIDIO_ENDPOINT_CONTROL_IDS = [ "dlp_presidio_endpoint_settings", "dlp_presidio_analyzer_endpoint", + "dlp_presidio_allowed_private_hosts", "dlp_presidio_auth_header_name", "dlp_presidio_auth_secret_env_var", "dlp_presidio_timeout_seconds", @@ -157,6 +158,7 @@ def test_presidio_endpoint_controls_are_rendered_without_secret_value_field(): assert f'id="{control_id}"' in source, f"Missing Presidio endpoint control: {control_id}" assert 'name="dlp_presidio_analyzer_endpoint"' in source + assert 'name="dlp_presidio_allowed_private_hosts"' in source assert 'name="dlp_presidio_auth_header_name"' in source assert 'name="dlp_presidio_auth_secret_env_var"' in source assert 'name="dlp_presidio_timeout_seconds"' in source @@ -164,7 +166,8 @@ def test_presidio_endpoint_controls_are_rendered_without_secret_value_field(): assert 
'name="dlp_presidio_language"' in source assert 'name="dlp_presidio_entities"' in source assert 'name="dlp_presidio_auth_secret"' not in source - assert "production endpoints should be private, authenticated, and https" in source.lower() + assert "public https endpoints are allowed by default" in source.lower() + assert "private hosts must be listed explicitly" in source.lower() def test_admin_js_uses_d_none_for_dlp_toggles(): diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py index a0a7f2d8..f7a53bed 100644 --- a/functional_tests/test_dlp_admin_ui_smoke.py +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -26,6 +26,7 @@ "dlp_default_engine", "dlp_presidio_endpoint_settings", "dlp_presidio_analyzer_endpoint", + "dlp_presidio_allowed_private_hosts", "dlp_presidio_auth_header_name", "dlp_presidio_auth_secret_env_var", "dlp_presidio_timeout_seconds", diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index 460e7f25..c7ca08d3 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for external Presidio endpoint DLP adapter. 
-Version: 0.242.071 +Version: 0.242.072 Implemented in: 0.242.071 This test ensures SimpleChat can call a configured Presidio-compatible analyzer @@ -22,13 +22,88 @@ def test_validate_presidio_endpoint_allows_https_and_localhost(): - """HTTPS and local HTTP endpoint URLs should be accepted.""" + """Public HTTPS and explicitly allowlisted local HTTP endpoint URLs should be accepted.""" from functions_dlp_presidio import validate_presidio_endpoint_url - assert validate_presidio_endpoint_url("https://presidio.internal/analyze") == "https://presidio.internal/analyze" - assert validate_presidio_endpoint_url("http://localhost:5002/analyze") == "http://localhost:5002/analyze" - assert validate_presidio_endpoint_url("http://127.0.0.1:5002/analyze") == "http://127.0.0.1:5002/analyze" - assert validate_presidio_endpoint_url("http://[::1]:5002/analyze") == "http://[::1]:5002/analyze" + assert validate_presidio_endpoint_url("https://presidio.example.com/analyze") == "https://presidio.example.com/analyze" + assert ( + validate_presidio_endpoint_url("http://localhost:5002/analyze", "localhost") + == "http://localhost:5002/analyze" + ) + assert ( + validate_presidio_endpoint_url("http://127.0.0.1:5002/analyze", "127.0.0.1") + == "http://127.0.0.1:5002/analyze" + ) + assert validate_presidio_endpoint_url("http://[::1]:5002/analyze", "::1") == "http://[::1]:5002/analyze" + + +def test_validate_presidio_endpoint_rejects_private_hosts_without_allowlist(): + """Private, link-local, and loopback endpoints should require an explicit allowlist.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + blocked_urls = [ + "https://127.0.0.1:5002/analyze", + "https://[::1]:5002/analyze", + "https://10.1.2.3/analyze", + "https://172.16.0.10/analyze", + "https://192.168.1.20/analyze", + "https://169.254.169.254/metadata", + ] + + for blocked_url in blocked_urls: + try: + validate_presidio_endpoint_url(blocked_url) + except 
PresidioEndpointConfigurationError as exc: + assert "allowlist" in str(exc).lower() + continue + + raise AssertionError(f"Expected private endpoint to be rejected: {blocked_url}") + + +def test_validate_presidio_endpoint_allows_private_hosts_with_explicit_allowlist(): + """Private endpoint URLs should be accepted only when their host is explicitly allowlisted.""" + from functions_dlp_presidio import validate_presidio_endpoint_url + + allowed_private_hosts = "10.1.2.3\nlocalhost, ::1" + + assert ( + validate_presidio_endpoint_url("https://10.1.2.3/analyze", allowed_private_hosts) + == "https://10.1.2.3/analyze" + ) + assert ( + validate_presidio_endpoint_url("https://localhost:5002/analyze", allowed_private_hosts) + == "https://localhost:5002/analyze" + ) + assert ( + validate_presidio_endpoint_url("https://[::1]:5002/analyze", allowed_private_hosts) + == "https://[::1]:5002/analyze" + ) + + +def test_validate_presidio_endpoint_rejects_url_secret_persistence_vectors(): + """Endpoint URLs should reject userinfo, fragments, and credential-like query names.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + blocked_urls = [ + "https://user:pass@presidio.example.com/analyze", + "https://presidio.example.com/analyze#fragment", + "https://presidio.example.com/analyze?key=abc", + "https://presidio.example.com/analyze?api_key=abc", + "https://presidio.example.com/analyze?apikey=abc", + "https://presidio.example.com/analyze?secret=abc", + "https://presidio.example.com/analyze?token=abc", + "https://presidio.example.com/analyze?password=abc", + "https://presidio.example.com/analyze?connection=abc", + "https://presidio.example.com/analyze?sig=abc", + ] + + for blocked_url in blocked_urls: + try: + validate_presidio_endpoint_url(blocked_url) + except PresidioEndpointConfigurationError: + continue + + raise AssertionError(f"Expected unsafe endpoint URL to be rejected: {blocked_url}") def 
test_validate_presidio_endpoint_rejects_insecure_remote_http(): @@ -63,11 +138,12 @@ def test_analyze_with_presidio_endpoint_posts_safe_payload_and_auth_header(monke captured = {} - def fake_post(url, json=None, headers=None, timeout=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): captured["url"] = url captured["json"] = json captured["headers"] = headers captured["timeout"] = timeout + captured["allow_redirects"] = allow_redirects response = Mock() response.raise_for_status.return_value = None response.json.return_value = [ @@ -79,6 +155,7 @@ def fake_post(url, json=None, headers=None, timeout=None): monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") settings = { "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", "dlp_presidio_auth_header_name": "X-DLP-API-Key", "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", "dlp_presidio_entities": ["EMAIL_ADDRESS", "US_SSN"], @@ -100,6 +177,7 @@ def fake_post(url, json=None, headers=None, timeout=None): assert captured["headers"]["X-DLP-API-Key"] == "unit-test-secret" assert captured["headers"]["Content-Type"] == "application/json" assert captured["timeout"] == 3 + assert captured["allow_redirects"] is False def test_analyze_with_presidio_endpoint_omits_auth_header_without_env_secret(monkeypatch): @@ -108,7 +186,7 @@ def test_analyze_with_presidio_endpoint_omits_auth_header_without_env_secret(mon captured = {} - def fake_post(url, json=None, headers=None, timeout=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): captured["headers"] = headers response = Mock() response.raise_for_status.return_value = None @@ -122,6 +200,7 @@ def fake_post(url, json=None, headers=None, timeout=None): RAW_TEXT, { "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", 
"dlp_presidio_auth_header_name": "X-DLP-API-Key", "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", }, @@ -134,7 +213,7 @@ def test_analyze_with_presidio_endpoint_raises_safe_error_without_raw_text(monke """Endpoint exceptions should not retain raw scanned text in messages or exception chains.""" from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint - def fake_post(url, json=None, headers=None, timeout=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): raise RuntimeError(f"upstream included {RAW_TEXT}") monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) @@ -142,7 +221,10 @@ def fake_post(url, json=None, headers=None, timeout=None): try: analyze_with_presidio_endpoint( RAW_TEXT, - {"dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze"}, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + }, ) except PresidioEndpointRequestError as exc: assert RAW_TEXT not in str(exc) @@ -159,7 +241,7 @@ def test_analyze_with_presidio_endpoint_normalizes_response_items(monkeypatch): """Recognizer responses should be filtered and normalized deterministically.""" from functions_dlp_presidio import analyze_with_presidio_endpoint - def fake_post(url, json=None, headers=None, timeout=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): response = Mock() response.raise_for_status.return_value = None response.json.return_value = [ @@ -174,7 +256,76 @@ def fake_post(url, json=None, headers=None, timeout=None): results = analyze_with_presidio_endpoint( RAW_TEXT, - {"dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze"}, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + }, ) assert results == [{"entity_type": "EMAIL_ADDRESS", "start": 11, 
"end": 24, "score": 0.91}] + + +def test_analyze_with_presidio_endpoint_treats_redirect_as_endpoint_error(monkeypatch): + """Redirect responses should not be followed or parsed as analyzer results.""" + from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint + + captured = {"calls": 0} + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): + captured["calls"] += 1 + captured["allow_redirects"] = allow_redirects + response = Mock() + response.status_code = 302 + response.headers = {"Location": "https://attacker.example/analyze"} + response.raise_for_status.return_value = None + response.json.side_effect = AssertionError("Redirect responses must not be parsed.") + return response + + monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + {"dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze"}, + ) + except PresidioEndpointRequestError as exc: + assert "redirect" in str(exc).lower() + assert captured["allow_redirects"] is False + assert captured["calls"] == 1 + return + + raise AssertionError("Expected redirect response to be handled as an endpoint error.") + + +def test_presidio_auth_secret_env_var_name_validation(monkeypatch): + """Only the dedicated Presidio DLP secret env var namespace should be read.""" + from functions_dlp_presidio import _get_auth_headers, normalize_presidio_secret_env_var_name + + monkeypatch.setenv("AZURE_OPENAI_KEY", "must-not-leak") + monkeypatch.setenv("COSMOS_CONNECTION_STRING", "must-not-leak") + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "presidio-secret") + monkeypatch.setenv("DLP_PRESIDIO_TOKEN", "prefixed-secret") + + assert normalize_presidio_secret_env_var_name("") == "" + assert normalize_presidio_secret_env_var_name("PRESIDIO_DLP_API_KEY") == "PRESIDIO_DLP_API_KEY" + assert normalize_presidio_secret_env_var_name("DLP_PRESIDIO_TOKEN") == "DLP_PRESIDIO_TOKEN" + assert 
normalize_presidio_secret_env_var_name("AZURE_OPENAI_KEY") == "" + assert normalize_presidio_secret_env_var_name("COSMOS_CONNECTION_STRING") == "" + assert _get_auth_headers( + { + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "AZURE_OPENAI_KEY", + } + ) == {} + assert _get_auth_headers( + { + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + } + ) == {"X-DLP-API-Key": "presidio-secret"} + assert _get_auth_headers( + { + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "DLP_PRESIDIO_TOKEN", + } + ) == {"X-DLP-API-Key": "prefixed-secret"} diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py index 93a340f3..f5727012 100644 --- a/functional_tests/test_dlp_presidio_engine_integration.py +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for Presidio endpoint engine integration. 
-Version: 0.242.071 +Version: 0.242.072 Implemented in: 0.242.071 This test ensures the external Presidio endpoint engine reuses SimpleChat's @@ -26,6 +26,7 @@ def presidio_settings(mode="redact", fail_closed=True): "enable_dlp_control_plane": True, "dlp_default_engine": "presidio_endpoint", "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", "dlp_presidio_timeout_seconds": 3, "dlp_presidio_score_threshold": 0.7, "dlp_presidio_entities": ["EMAIL_ADDRESS"], From 5530a968c8345ad238f4e237173960268c4b33b1 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 12:06:02 -0400 Subject: [PATCH 15/20] fix: reject credential-like presidio endpoint query params --- .../single_app/functions_dlp_presidio.py | 27 ++++++++++++++++++- .../test_dlp_presidio_endpoint.py | 3 +++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py index 382a8ec6..ac4678d3 100644 --- a/application/single_app/functions_dlp_presidio.py +++ b/application/single_app/functions_dlp_presidio.py @@ -26,6 +26,14 @@ "connection", "sig", } +PRESIDIO_CREDENTIAL_QUERY_WORDS = { + "key", + "secret", + "token", + "password", + "connection", + "sig", +} PRESIDIO_PRIVATE_HOST_SUFFIXES = ( ".internal", ".local", @@ -122,6 +130,23 @@ def normalize_presidio_secret_env_var_name(secret_env_var): return "" +def _is_credential_like_query_name(query_name): + normalized = str(query_name or "").strip().lower() + if not normalized: + return False + compact_name = re.sub(r"[^a-z0-9]+", "", normalized) + query_tokens = { + token + for token in re.split(r"[^a-z0-9]+", normalized) + if token + } + if normalized in PRESIDIO_CREDENTIAL_QUERY_NAMES or compact_name in PRESIDIO_CREDENTIAL_QUERY_NAMES: + return True + if query_tokens & PRESIDIO_CREDENTIAL_QUERY_WORDS: + return True + return any(credential_word in compact_name for 
credential_word in PRESIDIO_CREDENTIAL_QUERY_WORDS) + + def validate_presidio_endpoint_url(endpoint_url, allowed_private_hosts=None): """Validate and normalize a Presidio Analyzer endpoint URL.""" normalized = str(endpoint_url or "").strip() @@ -137,7 +162,7 @@ def validate_presidio_endpoint_url(endpoint_url, allowed_private_hosts=None): if parsed.fragment: raise PresidioEndpointConfigurationError("Presidio analyzer endpoint URL must not include a fragment.") for query_name, _ in parse_qsl(parsed.query, keep_blank_values=True): - if query_name.strip().lower() in PRESIDIO_CREDENTIAL_QUERY_NAMES: + if _is_credential_like_query_name(query_name): raise PresidioEndpointConfigurationError( "Presidio analyzer endpoint URL must not include credential-like query parameters." ) diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index c7ca08d3..d1c84364 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -95,6 +95,9 @@ def test_validate_presidio_endpoint_rejects_url_secret_persistence_vectors(): "https://presidio.example.com/analyze?password=abc", "https://presidio.example.com/analyze?connection=abc", "https://presidio.example.com/analyze?sig=abc", + "https://presidio.example.com/analyze?client_secret=abc", + "https://presidio.example.com/analyze?access_token=abc", + "https://presidio.example.com/analyze?subscription-key=abc", ] for blocked_url in blocked_urls: From 48ba4191d442cbeba2690ab77a0775a14a10d2ac Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 12:22:57 -0400 Subject: [PATCH 16/20] fix: harden presidio endpoint safety checks --- application/single_app/functions_dlp.py | 80 ++++++++++++++++-- .../single_app/functions_dlp_presidio.py | 68 ++++++++++++++- application/single_app/functions_settings.py | 8 ++ docs/explanation/release_notes.md | 6 ++ .../test_dlp_admin_settings_roundtrip.py | 46 +++++++++++ 
.../test_dlp_presidio_endpoint.py | 82 ++++++++++++++++++- .../test_dlp_presidio_engine_integration.py | 48 ++++++++++- 7 files changed, 326 insertions(+), 12 deletions(-) diff --git a/application/single_app/functions_dlp.py b/application/single_app/functions_dlp.py index 2497fa18..0f4383c8 100644 --- a/application/single_app/functions_dlp.py +++ b/application/single_app/functions_dlp.py @@ -2,6 +2,8 @@ import hashlib import logging +import re +from collections import OrderedDict from functions_dlp_presidio import analyze_with_presidio_endpoint from functions_dlp_rules import get_effective_dlp_regex_rules, scan_text_with_dlp_regex_rules @@ -19,6 +21,8 @@ def log_event(message, extra=None, level=logging.INFO, exceptionTraceback=False) DEFAULT_MAX_SCAN_CHARS = 200000 DEFAULT_SCANNER_TIMEOUT_SECONDS = 5 SUPPORTED_WEB_SEARCH_MODES = {"monitor", "redact", "block"} +UNKNOWN_DLP_ENTITY_TYPE = "UNKNOWN_ENTITY" +SAFE_DLP_ENTITY_TYPE_PATTERN = re.compile(r"^[A-Z0-9_]{1,64}$") def _bool_setting(settings, key, default=False): return bool((settings or {}).get(key, default)) @@ -113,6 +117,66 @@ def _decision_from_counts(match_counts, mode): return "monitor" +def normalize_dlp_entity_type(entity_type): + """Normalize untrusted analyzer entity labels before they reach outputs.""" + normalized = str(entity_type or "").strip().upper() + if SAFE_DLP_ENTITY_TYPE_PATTERN.fullmatch(normalized): + return normalized + return UNKNOWN_DLP_ENTITY_TYPE + + +def _safe_result_start(item): + try: + return int(item.get("start")) + except (TypeError, ValueError): + return 0 + + +def normalize_external_analyzer_results(text, recognizer_results, mode="redact", engine="external_analyzer"): + """Normalize external analyzer entity offsets into the shared counts-only result.""" + source_text = str(text or "") + sorted_results = sorted( + [ + item for item in (recognizer_results or []) + if isinstance(item, dict) and item.get("start") is not None and item.get("end") is not None + ], + 
key=_safe_result_start, + ) + match_counts = OrderedDict() + redacted_parts = [] + cursor = 0 + + for item in sorted_results: + start = max(0, min(len(source_text), int(item.get("start")))) + end = max(start, min(len(source_text), int(item.get("end")))) + entity_type = normalize_dlp_entity_type(item.get("entity_type")) + if start < cursor: + continue + redacted_parts.append(source_text[cursor:start]) + redacted_parts.append(f"[REDACTED_{entity_type}]") + cursor = end + match_counts[entity_type] = match_counts.get(entity_type, 0) + 1 + + redacted_parts.append(source_text[cursor:]) + redacted_text = "".join(redacted_parts) + counts = dict(match_counts) + decision = _decision_from_counts(counts, mode) + + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": decision, + "text": redacted_text if counts else source_text, + "redacted_text": redacted_text if counts else source_text, + "total_replacements": sum(counts.values()), + "match_counts": counts, + "matches": [{"entity_type": key, "count": value} for key, value in counts.items()], + "metadata": {"adapter": "external_analyzer"}, + "scanner_status": "ok", + } + + def evaluate_dlp_text(text, settings=None, context=None, surface="generic"): """Evaluate text against the configured DLP policy and return a safe result.""" settings = settings or {} @@ -261,11 +325,17 @@ def evaluate_web_search_egress(text, settings=None, context=None): def _safe_entity_counts(match_counts): - return { - str(entity_type): int(count) - for entity_type, count in (match_counts or {}).items() - if entity_type and int(count) > 0 - } + counts = OrderedDict() + for entity_type, count in (match_counts or {}).items(): + try: + normalized_count = int(count) + except (TypeError, ValueError): + continue + if normalized_count <= 0: + continue + safe_entity_type = normalize_dlp_entity_type(entity_type) + counts[safe_entity_type] = counts.get(safe_entity_type, 0) + normalized_count + return dict(counts) def _error_hash(result): 
diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py index ac4678d3..1408c856 100644 --- a/application/single_app/functions_dlp_presidio.py +++ b/application/single_app/functions_dlp_presidio.py @@ -5,6 +5,7 @@ import ipaddress import os import re +import socket from urllib.parse import parse_qsl, urlparse import requests @@ -115,6 +116,61 @@ def _is_loopback_presidio_host(host): return False +def _is_ip_literal(host): + try: + ipaddress.ip_address(_normalize_host_identifier(host)) + return True + except ValueError: + return False + + +def _resolve_presidio_host_addresses(host, port): + normalized_host = _normalize_host_identifier(host) + if not normalized_host: + return [] + if _is_ip_literal(normalized_host): + return [ipaddress.ip_address(normalized_host)] + + try: + address_info = socket.getaddrinfo( + normalized_host, + port, + type=socket.SOCK_STREAM, + ) + except socket.gaierror as exc: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint host must resolve in DNS.") from exc + + addresses = [] + seen_addresses = set() + for item in address_info: + sockaddr = item[4] if len(item) > 4 else None + if not sockaddr: + continue + raw_address = str(sockaddr[0]).split("%", 1)[0] + try: + address = ipaddress.ip_address(raw_address) + except ValueError: + continue + if address in seen_addresses: + continue + addresses.append(address) + seen_addresses.add(address) + return addresses + + +def _validate_resolved_presidio_addresses(host, port, allowed_hosts): + normalized_host = _normalize_host_identifier(host) + addresses = _resolve_presidio_host_addresses(normalized_host, port) + if not addresses: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint host must resolve to an IP address.") + if normalized_host in allowed_hosts: + return + if any(not address.is_global for address in addresses): + raise PresidioEndpointConfigurationError( + "Private Presidio analyzer endpoint 
hosts must be listed in the private host allowlist." + ) + + def normalize_presidio_secret_env_var_name(secret_env_var): """Return an allowed Presidio secret env var name, or blank when invalid.""" normalized = str(secret_env_var or "").strip() @@ -155,6 +211,7 @@ def validate_presidio_endpoint_url(endpoint_url, allowed_private_hosts=None): parsed = urlparse(normalized) host = (parsed.hostname or "").lower() + normalized_host = _normalize_host_identifier(host) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must be an absolute HTTP(S) URL.") if parsed.username or parsed.password or "@" in parsed.netloc: @@ -169,12 +226,17 @@ def validate_presidio_endpoint_url(endpoint_url, allowed_private_hosts=None): host_is_private = _is_private_presidio_host(host) allowed_hosts = _get_allowed_private_hosts(allowed_private_hosts) - if host_is_private and _normalize_host_identifier(host) not in allowed_hosts: + if host_is_private and normalized_host not in allowed_hosts: raise PresidioEndpointConfigurationError( "Private Presidio analyzer endpoint hosts must be listed in the private host allowlist." 
) if parsed.scheme == "http" and not _is_loopback_presidio_host(host): raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must use HTTPS unless it is localhost.") + _validate_resolved_presidio_addresses( + host, + parsed.port or (443 if parsed.scheme == "https" else 80), + allowed_hosts, + ) return normalized @@ -219,11 +281,11 @@ def _get_auth_headers(settings): def _normalize_result_item(item): if not isinstance(item, dict): return None - if not item.get("entity_type") or item.get("start") is None or item.get("end") is None: + if "entity_type" not in item or item.get("start") is None or item.get("end") is None: return None try: return { - "entity_type": str(item.get("entity_type")), + "entity_type": str(item.get("entity_type") or ""), "start": int(item.get("start")), "end": int(item.get("end")), "score": float(item.get("score", 0.0)), diff --git a/application/single_app/functions_settings.py b/application/single_app/functions_settings.py index cf81994e..f091a3c7 100644 --- a/application/single_app/functions_settings.py +++ b/application/single_app/functions_settings.py @@ -2303,6 +2303,12 @@ def sanitize_settings_for_user(full_settings: dict) -> dict: return full_settings sensitive_terms = ("key", "secret", "password", "connection", "base64", "storage_account_url") + sensitive_setting_names = { + "dlp_presidio_analyzer_endpoint", + "dlp_presidio_allowed_private_hosts", + "dlp_presidio_auth_header_name", + "dlp_presidio_auth_secret_env_var", + } sanitized = {} for k, v in full_settings.items(): @@ -2310,6 +2316,8 @@ def sanitize_settings_for_user(full_settings: dict) -> dict: continue if k == 'agents_page_promoted_popular_agents': continue + if k in sensitive_setting_names: + continue if any(term in k.lower() for term in sensitive_terms): continue if k in ('model_endpoints', 'personal_model_endpoints') and isinstance(v, list): diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index a43ed7b9..68c849ce 100644 --- 
a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -8,6 +8,12 @@ For feature-focused and fix-focused drill-downs by version, see [Features by Ver #### Bug Fixes +* **Presidio Endpoint Runtime Safety Hardening** + * Rejects Presidio Analyzer hostnames whose DNS answers include loopback, link-local, private, or otherwise non-global addresses unless the exact endpoint host is explicitly allowlisted. + * Normalizes untrusted Presidio entity labels to safe uppercase identifiers before redaction output, match counts, match summaries, and telemetry are built. + * Strips Presidio endpoint and private-host settings from non-admin settings sanitization output. + * (Ref: Presidio endpoint DNS validation, DLP entity label normalization, user settings sanitization) + * **Presidio Endpoint Configuration Hardening** * Rejects Presidio Analyzer endpoint URLs with userinfo, fragments, credential-like query parameters, or private/link-local/loopback hosts that are not explicitly allowlisted. * Adds an admin-configurable `Allowed Private Hosts` allowlist for private Presidio deployments and preserves only validated endpoint settings. 
diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index 032de152..b4b4d3da 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -11,11 +11,13 @@ import os import sys +import ast from pathlib import Path ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) ADMIN_ROUTE_FILE = os.path.join(APP_DIR, "route_frontend_admin_settings.py") ADMIN_TEMPLATE_FILE = os.path.join(APP_DIR, "templates", "admin_settings.html") FUNCTIONS_SETTINGS_FILE = os.path.join(APP_DIR, "functions_settings.py") @@ -67,6 +69,21 @@ def read_file_text(path): return file_handle.read() +def load_sanitize_settings_for_user(): + """Load the sanitizer function without importing optional app dependencies.""" + source = read_file_text(FUNCTIONS_SETTINGS_FILE) + tree = ast.parse(source, filename=FUNCTIONS_SETTINGS_FILE) + function_node = next( + node for node in tree.body + if isinstance(node, ast.FunctionDef) and node.name == "sanitize_settings_for_user" + ) + module = ast.Module(body=[function_node], type_ignores=[]) + ast.fix_missing_locations(module) + namespace = {} + exec(compile(module, FUNCTIONS_SETTINGS_FILE, "exec"), namespace) + return namespace["sanitize_settings_for_user"] + + def assert_no_retired_structured_redaction_control(source, source_name): """Retired structured-redaction controls should not appear in admin DLP sources.""" redaction_prefix = "web_search_dlp_redact" @@ -243,6 +260,34 @@ def test_default_settings_include_presidio_endpoint_controls(): assert "'dlp_presidio_auth_secret'" not in settings_source +def test_user_settings_sanitization_strips_presidio_endpoint_controls(): + """Non-admin settings rendering should not expose Presidio endpoints or private host topology.""" + print("Testing Presidio endpoint user settings 
sanitization...") + sanitize_settings_for_user = load_sanitize_settings_for_user() + + sanitized = sanitize_settings_for_user( + { + "enable_dlp_control_plane": True, + "dlp_default_engine": "presidio_endpoint", + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal, 10.0.0.5", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + "nested": { + "dlp_presidio_analyzer_endpoint": "https://nested-presidio.internal/analyze", + "safe": "visible", + }, + } + ) + + assert "dlp_presidio_analyzer_endpoint" not in sanitized + assert "dlp_presidio_allowed_private_hosts" not in sanitized + assert "dlp_presidio_auth_header_name" not in sanitized + assert "dlp_presidio_auth_secret_env_var" not in sanitized + assert "dlp_presidio_analyzer_endpoint" not in sanitized["nested"] + assert sanitized["nested"]["safe"] == "visible" + + if __name__ == "__main__": tests = [ test_dlp_admin_post_normalizes_untrusted_form_values, @@ -255,6 +300,7 @@ def test_default_settings_include_presidio_endpoint_controls(): test_admin_settings_rejects_invalid_dlp_regex_rules_before_update, test_presidio_endpoint_settings_are_normalized_without_secret_persistence, test_default_settings_include_presidio_endpoint_controls, + test_user_settings_sanitization_strips_presidio_endpoint_controls, ] try: diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index d1c84364..131aae00 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for external Presidio endpoint DLP adapter. 
-Version: 0.242.072 +Version: 0.242.073 Implemented in: 0.242.071 This test ensures SimpleChat can call a configured Presidio-compatible analyzer @@ -10,6 +10,7 @@ """ import os +import socket import sys from unittest.mock import Mock @@ -21,10 +22,32 @@ RAW_TEXT = "Contact me a@example.com" -def test_validate_presidio_endpoint_allows_https_and_localhost(): +def stub_dns_answers(monkeypatch, expected_host, addresses=None): + """Return deterministic DNS answers for endpoint validation tests.""" + host_answers = expected_host if isinstance(expected_host, dict) else {expected_host: addresses} + + def fake_getaddrinfo(host, port, *args, **kwargs): + assert host in host_answers + return [ + (socket.AF_INET, socket.SOCK_STREAM, 6, "", (address, port or 443)) + for address in host_answers[host] + ] + + monkeypatch.setattr(socket, "getaddrinfo", fake_getaddrinfo) + + +def test_validate_presidio_endpoint_allows_https_and_localhost(monkeypatch): """Public HTTPS and explicitly allowlisted local HTTP endpoint URLs should be accepted.""" from functions_dlp_presidio import validate_presidio_endpoint_url + stub_dns_answers( + monkeypatch, + { + "presidio.example.com": ["93.184.216.34"], + "localhost": ["127.0.0.1"], + }, + ) + assert validate_presidio_endpoint_url("https://presidio.example.com/analyze") == "https://presidio.example.com/analyze" assert ( validate_presidio_endpoint_url("http://localhost:5002/analyze", "localhost") @@ -60,6 +83,51 @@ def test_validate_presidio_endpoint_rejects_private_hosts_without_allowlist(): raise AssertionError(f"Expected private endpoint to be rejected: {blocked_url}") +def test_validate_presidio_endpoint_rejects_public_hostname_resolving_to_private_ip(monkeypatch): + """Public-looking hostnames should be rejected when DNS resolves to non-global addresses.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + stub_dns_answers(monkeypatch, "presidio.example.com", ["169.254.169.254"]) + + 
try: + validate_presidio_endpoint_url("https://presidio.example.com/analyze") + except PresidioEndpointConfigurationError as exc: + assert "allowlist" in str(exc).lower() + return + + raise AssertionError("Expected DNS-resolved metadata endpoint address to be rejected.") + + +def test_validate_presidio_endpoint_rejects_any_private_dns_answer(monkeypatch): + """Any non-global DNS answer should fail unless the endpoint host is explicitly allowlisted.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + stub_dns_answers(monkeypatch, "presidio.example.com", ["93.184.216.34", "10.0.0.5"]) + + try: + validate_presidio_endpoint_url("https://presidio.example.com/analyze") + except PresidioEndpointConfigurationError as exc: + assert "allowlist" in str(exc).lower() + return + + raise AssertionError("Expected hostname with mixed public/private DNS answers to be rejected.") + + +def test_validate_presidio_endpoint_allows_private_dns_answer_for_exact_allowlisted_host(monkeypatch): + """A private DNS answer should be accepted only for the exact endpoint host in the allowlist.""" + from functions_dlp_presidio import validate_presidio_endpoint_url + + stub_dns_answers(monkeypatch, "presidio.example.com", ["10.0.0.5"]) + + assert ( + validate_presidio_endpoint_url( + "https://presidio.example.com/analyze", + "presidio.example.com", + ) + == "https://presidio.example.com/analyze" + ) + + def test_validate_presidio_endpoint_allows_private_hosts_with_explicit_allowlist(): """Private endpoint URLs should be accepted only when their host is explicitly allowlisted.""" from functions_dlp_presidio import validate_presidio_endpoint_url @@ -167,6 +235,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): "dlp_presidio_timeout_seconds": 3, } + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) results = analyze_with_presidio_endpoint(RAW_TEXT, settings) assert results == [{"entity_type": 
"EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91}] @@ -198,6 +267,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) monkeypatch.delenv("PRESIDIO_DLP_API_KEY", raising=False) + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) analyze_with_presidio_endpoint( RAW_TEXT, @@ -220,6 +290,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): raise RuntimeError(f"upstream included {RAW_TEXT}") monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) try: analyze_with_presidio_endpoint( @@ -256,6 +327,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): return response monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) results = analyze_with_presidio_endpoint( RAW_TEXT, @@ -265,7 +337,10 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): }, ) - assert results == [{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91}] + assert results == [ + {"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91}, + {"entity_type": "", "start": 1, "end": 2, "score": 0.4}, + ] def test_analyze_with_presidio_endpoint_treats_redirect_as_endpoint_error(monkeypatch): @@ -285,6 +360,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): return response monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + stub_dns_answers(monkeypatch, "presidio.example.com", ["93.184.216.34"]) try: analyze_with_presidio_endpoint( diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py index f5727012..1010702c 100644 --- 
a/functional_tests/test_dlp_presidio_engine_integration.py +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for Presidio endpoint engine integration. -Version: 0.242.072 +Version: 0.242.073 Implemented in: 0.242.071 This test ensures the external Presidio endpoint engine reuses SimpleChat's @@ -107,3 +107,49 @@ def fail_scan(text, settings): assert result["redacted_text"] == "" assert result["scanner_status"] == "error" assert RAW_TEXT not in repr(result) + + +def test_presidio_endpoint_sanitizes_untrusted_entity_labels(monkeypatch): + """Remote entity labels must not be copied into redaction output or count keys.""" + import functions_dlp + + malicious_label = "EMAIL_ADDRESS_a@example.com" + monkeypatch.setattr( + functions_dlp, + "analyze_with_presidio_endpoint", + lambda text, settings: [{"entity_type": malicious_label, "start": 11, "end": 24, "score": 0.92}], + ) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("redact"), + surface="web_search", + ) + telemetry = functions_dlp.build_dlp_telemetry_properties(result, "web_search") + + assert result["redacted_text"] == "Contact me [REDACTED_UNKNOWN_ENTITY]" + assert result["match_counts"] == {"UNKNOWN_ENTITY": 1} + assert result["matches"] == [{"entity_type": "UNKNOWN_ENTITY", "count": 1}] + assert telemetry["dlp_entity_counts"] == {"UNKNOWN_ENTITY": 1} + assert malicious_label not in repr(result) + assert "a@example.com" not in repr(result) + assert malicious_label not in repr(telemetry) + assert "a@example.com" not in repr(telemetry) + + +def test_external_analyzer_normalizes_empty_and_too_long_entity_labels(): + """Invalid external analyzer labels should collapse to a fixed safe entity name.""" + import functions_dlp + + long_label = "A" * 65 + results = [ + {"entity_type": "", "start": 0, "end": 7}, + {"entity_type": long_label, "start": 11, "end": 24}, + ] + + normalized = 
functions_dlp.normalize_external_analyzer_results(RAW_TEXT, results, mode="redact") + + assert normalized["redacted_text"] == "[REDACTED_UNKNOWN_ENTITY] me [REDACTED_UNKNOWN_ENTITY]" + assert normalized["match_counts"] == {"UNKNOWN_ENTITY": 2} + assert normalized["matches"] == [{"entity_type": "UNKNOWN_ENTITY", "count": 2}] + assert long_label not in repr(normalized) From e47a78bfb84f44f5e46cf311d31f1cd649cfcc96 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 12:35:00 -0400 Subject: [PATCH 17/20] fix: prevent presidio endpoint dns rebinding --- .../single_app/functions_dlp_presidio.py | 144 +++++++++++++++++- .../test_dlp_presidio_endpoint.py | 59 +++++-- functional_tests/test_dlp_review_events.py | 2 +- functional_tests/test_dlp_telemetry.py | 2 +- .../test_upload_dlp_workspace_scopes.py | 2 +- 5 files changed, 193 insertions(+), 16 deletions(-) diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py index 1408c856..0375b184 100644 --- a/application/single_app/functions_dlp_presidio.py +++ b/application/single_app/functions_dlp_presidio.py @@ -9,6 +9,11 @@ from urllib.parse import parse_qsl, urlparse import requests +from urllib3 import connection as urllib3_connection +from urllib3 import connectionpool as urllib3_connectionpool +from urllib3 import poolmanager as urllib3_poolmanager +from urllib3.util import connection as urllib3_util_connection +from urllib3.util.timeout import _DEFAULT_TIMEOUT DEFAULT_PRESIDIO_TIMEOUT_SECONDS = 5 @@ -140,6 +145,10 @@ def _resolve_presidio_host_addresses(host, port): except socket.gaierror as exc: raise PresidioEndpointConfigurationError("Presidio analyzer endpoint host must resolve in DNS.") from exc + return _extract_presidio_addresses(address_info) + + +def _extract_presidio_addresses(address_info): addresses = [] seen_addresses = set() for item in address_info: @@ -158,9 +167,8 @@ def _resolve_presidio_host_addresses(host, port): return 
addresses -def _validate_resolved_presidio_addresses(host, port, allowed_hosts): +def _validate_presidio_address_list(host, addresses, allowed_hosts): normalized_host = _normalize_host_identifier(host) - addresses = _resolve_presidio_host_addresses(normalized_host, port) if not addresses: raise PresidioEndpointConfigurationError("Presidio analyzer endpoint host must resolve to an IP address.") if normalized_host in allowed_hosts: @@ -171,6 +179,135 @@ def _validate_resolved_presidio_addresses(host, port, allowed_hosts): ) +def _validate_resolved_presidio_addresses(host, port, allowed_hosts): + normalized_host = _normalize_host_identifier(host) + addresses = _resolve_presidio_host_addresses(normalized_host, port) + _validate_presidio_address_list(normalized_host, addresses, allowed_hosts) + + +def _set_socket_options(sock, socket_options): + for option in socket_options or []: + sock.setsockopt(*option) + + +def _create_presidio_safe_socket_connection(host, port, timeout, source_address, socket_options, allowed_hosts): + connect_host = str(host or "") + if connect_host.startswith("["): + connect_host = connect_host.strip("[]") + connect_host.encode("idna") + + address_info = socket.getaddrinfo( + connect_host, + port, + urllib3_util_connection.allowed_gai_family(), + socket.SOCK_STREAM, + ) + _validate_presidio_address_list(connect_host, _extract_presidio_addresses(address_info), allowed_hosts) + + last_error = None + for family, socktype, proto, _canonname, sockaddr in address_info: + sock = None + try: + sock = socket.socket(family, socktype, proto) + _set_socket_options(sock, socket_options) + if timeout is not _DEFAULT_TIMEOUT: + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect(sockaddr) + last_error = None + return sock + except OSError as exc: + last_error = exc + if sock is not None: + sock.close() + + if last_error is not None: + raise last_error + raise OSError("getaddrinfo returns an empty list") + + +class 
_PresidioSSRFConnectionMixin: + presidio_allowed_private_hosts = frozenset() + + def _new_conn(self): + try: + return _create_presidio_safe_socket_connection( + self._dns_host, + self.port, + self.timeout, + self.source_address, + self.socket_options, + self.presidio_allowed_private_hosts, + ) + except socket.gaierror as exc: + raise urllib3_connection.NameResolutionError(self.host, self, exc) from exc + except urllib3_connection.SocketTimeout as exc: + raise urllib3_connection.ConnectTimeoutError( + self, + f"Connection to {self.host} timed out. (connect timeout={self.timeout})", + ) from exc + except OSError as exc: + raise urllib3_connection.NewConnectionError( + self, + f"Failed to establish a new connection: {exc}", + ) from exc + + +def _build_presidio_pool_classes(allowed_hosts): + class PresidioSSRFHTTPConnection(_PresidioSSRFConnectionMixin, urllib3_connection.HTTPConnection): + presidio_allowed_private_hosts = allowed_hosts + + class PresidioSSRFHTTPSConnection(_PresidioSSRFConnectionMixin, urllib3_connection.HTTPSConnection): + presidio_allowed_private_hosts = allowed_hosts + + class PresidioSSRFHTTPConnectionPool(urllib3_connectionpool.HTTPConnectionPool): + ConnectionCls = PresidioSSRFHTTPConnection + + class PresidioSSRFHTTPSConnectionPool(urllib3_connectionpool.HTTPSConnectionPool): + ConnectionCls = PresidioSSRFHTTPSConnection + + return { + "http": PresidioSSRFHTTPConnectionPool, + "https": PresidioSSRFHTTPSConnectionPool, + } + + +class _PresidioSSRFHTTPAdapter(requests.adapters.HTTPAdapter): + def __init__(self, allowed_hosts, *args, **kwargs): + self._presidio_pool_classes = _build_presidio_pool_classes(frozenset(allowed_hosts)) + super().__init__(*args, **kwargs) + + def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs): + self.poolmanager = urllib3_poolmanager.PoolManager( + num_pools=connections, + maxsize=maxsize, + block=block, + **pool_kwargs, + ) + self.poolmanager.pool_classes_by_scheme = 
self._presidio_pool_classes + + +def _build_presidio_endpoint_session(allowed_private_hosts): + session = requests.Session() + session.trust_env = False + adapter = _PresidioSSRFHTTPAdapter(_get_allowed_private_hosts(allowed_private_hosts)) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + + +def _post_presidio_endpoint(endpoint_url, json, headers, timeout, allow_redirects, allowed_private_hosts): + with _build_presidio_endpoint_session(allowed_private_hosts) as session: + return session.post( + endpoint_url, + json=json, + headers=headers, + timeout=timeout, + allow_redirects=allow_redirects, + ) + + def normalize_presidio_secret_env_var_name(secret_env_var): """Return an allowed Presidio secret env var name, or blank when invalid.""" normalized = str(secret_env_var or "").strip() @@ -323,12 +460,13 @@ def analyze_with_presidio_endpoint(text, settings): request_error_type = None try: - response = requests.post( + response = _post_presidio_endpoint( endpoint_url, json=payload, headers=headers, timeout=timeout_seconds, allow_redirects=False, + allowed_private_hosts=settings.get("dlp_presidio_allowed_private_hosts"), ) status_code = getattr(response, "status_code", None) if isinstance(status_code, int) and 300 <= status_code < 400: diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index 131aae00..bba7a962 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -113,6 +113,45 @@ def test_validate_presidio_endpoint_rejects_any_private_dns_answer(monkeypatch): raise AssertionError("Expected hostname with mixed public/private DNS answers to be rejected.") +def test_analyze_with_presidio_endpoint_blocks_dns_rebinding_before_socket_connect(monkeypatch): + """The request connection path should re-check DNS answers before opening a socket.""" + from functions_dlp_presidio import ( + 
PresidioEndpointConfigurationError, + PresidioEndpointRequestError, + analyze_with_presidio_endpoint, + ) + + dns_calls = [] + socket_attempts = {"count": 0} + + def fake_getaddrinfo(host, port, *args, **kwargs): + assert host == "presidio.example.com" + dns_calls.append(host) + address = "93.184.216.34" if len(dns_calls) == 1 else "169.254.169.254" + return [(socket.AF_INET, socket.SOCK_STREAM, 6, "", (address, port or 443))] + + class BlockingSocket: + def __init__(self, *args, **kwargs): + socket_attempts["count"] += 1 + raise AssertionError("Unsafe rebinding address reached socket creation.") + + monkeypatch.setenv("NO_PROXY", "*") + monkeypatch.setattr(socket, "getaddrinfo", fake_getaddrinfo) + monkeypatch.setattr(socket, "socket", BlockingSocket) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + {"dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze"}, + ) + except (PresidioEndpointConfigurationError, PresidioEndpointRequestError): + assert len(dns_calls) >= 2 + assert socket_attempts["count"] == 0 + return + + raise AssertionError("Expected rebinding request path to be blocked.") + + def test_validate_presidio_endpoint_allows_private_dns_answer_for_exact_allowlisted_host(monkeypatch): """A private DNS answer should be accepted only for the exact endpoint host in the allowlist.""" from functions_dlp_presidio import validate_presidio_endpoint_url @@ -209,7 +248,7 @@ def test_analyze_with_presidio_endpoint_posts_safe_payload_and_auth_header(monke captured = {} - def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): captured["url"] = url captured["json"] = json captured["headers"] = headers @@ -222,7 +261,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): ] return response - monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + 
monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") settings = { "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", @@ -258,14 +297,14 @@ def test_analyze_with_presidio_endpoint_omits_auth_header_without_env_secret(mon captured = {} - def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): captured["headers"] = headers response = Mock() response.raise_for_status.return_value = None response.json.return_value = [] return response - monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) monkeypatch.delenv("PRESIDIO_DLP_API_KEY", raising=False) stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) @@ -286,10 +325,10 @@ def test_analyze_with_presidio_endpoint_raises_safe_error_without_raw_text(monke """Endpoint exceptions should not retain raw scanned text in messages or exception chains.""" from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint - def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): raise RuntimeError(f"upstream included {RAW_TEXT}") - monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) try: @@ -315,7 +354,7 @@ def test_analyze_with_presidio_endpoint_normalizes_response_items(monkeypatch): """Recognizer responses should be filtered and normalized deterministically.""" from functions_dlp_presidio import analyze_with_presidio_endpoint - def fake_post(url, 
json=None, headers=None, timeout=None, allow_redirects=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): response = Mock() response.raise_for_status.return_value = None response.json.return_value = [ @@ -326,7 +365,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): ] return response - monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) results = analyze_with_presidio_endpoint( @@ -349,7 +388,7 @@ def test_analyze_with_presidio_endpoint_treats_redirect_as_endpoint_error(monkey captured = {"calls": 0} - def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): captured["calls"] += 1 captured["allow_redirects"] = allow_redirects response = Mock() @@ -359,7 +398,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None): response.json.side_effect = AssertionError("Redirect responses must not be parsed.") return response - monkeypatch.setattr("functions_dlp_presidio.requests.post", fake_post) + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) stub_dns_answers(monkeypatch, "presidio.example.com", ["93.184.216.34"]) try: diff --git a/functional_tests/test_dlp_review_events.py b/functional_tests/test_dlp_review_events.py index ab37cd4e..15986076 100644 --- a/functional_tests/test_dlp_review_events.py +++ b/functional_tests/test_dlp_review_events.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP review event safety. 
-Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures DLP review routing defaults to disabled and any optional diff --git a/functional_tests/test_dlp_telemetry.py b/functional_tests/test_dlp_telemetry.py index f6515a7d..297a9816 100644 --- a/functional_tests/test_dlp_telemetry.py +++ b/functional_tests/test_dlp_telemetry.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for safe DLP telemetry. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures DLP telemetry properties include bounded decision metadata diff --git a/functional_tests/test_upload_dlp_workspace_scopes.py b/functional_tests/test_upload_dlp_workspace_scopes.py index d8e51f13..e7c7569a 100644 --- a/functional_tests/test_upload_dlp_workspace_scopes.py +++ b/functional_tests/test_upload_dlp_workspace_scopes.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for upload DLP workspace scope coverage. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.073 This test ensures personal, group, public, and external public upload routes From 0b1d29aacf8cfccdcfefba9739c18308f5851f41 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 13:41:31 -0400 Subject: [PATCH 18/20] docs: align presidio dlp version headers --- docs/explanation/features/DLP_UPLOAD_STAGING.md | 2 +- docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md | 2 +- functional_tests/test_dlp_presidio_endpoint.py | 2 +- functional_tests/test_dlp_presidio_engine_integration.py | 2 +- functional_tests/test_web_search_current_message_only.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/explanation/features/DLP_UPLOAD_STAGING.md b/docs/explanation/features/DLP_UPLOAD_STAGING.md index 88c23367..4bee431d 100644 --- a/docs/explanation/features/DLP_UPLOAD_STAGING.md +++ b/docs/explanation/features/DLP_UPLOAD_STAGING.md @@ -2,7 +2,7 @@ ## Overview -Version: 0.242.073 +Version: 0.242.074 Dependencies: shared DLP core, 
configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, document processing pipeline, Azure AI Search, Azure OpenAI embeddings. diff --git a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md index ab506fcc..be4b2e47 100644 --- a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md +++ b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md @@ -2,7 +2,7 @@ ## Overview -Version: 0.242.073 +Version: 0.242.074 Dependencies: Flask chat routes, configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, and Azure AI Foundry web-search agent configuration. diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index bba7a962..14ae8ad1 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for external Presidio endpoint DLP adapter. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.071 This test ensures SimpleChat can call a configured Presidio-compatible analyzer diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py index 1010702c..99c9754f 100644 --- a/functional_tests/test_dlp_presidio_engine_integration.py +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for Presidio endpoint engine integration. 
-Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.242.071 This test ensures the external Presidio endpoint engine reuses SimpleChat's diff --git a/functional_tests/test_web_search_current_message_only.py b/functional_tests/test_web_search_current_message_only.py index c09ba5d3..9d26ac2e 100644 --- a/functional_tests/test_web_search_current_message_only.py +++ b/functional_tests/test_web_search_current_message_only.py @@ -1,7 +1,7 @@ # test_web_search_current_message_only.py """ Functional test for current-message-only web search egress. -Version: 0.242.073 +Version: 0.242.074 Implemented in: 0.241.008 This test ensures external web search uses only the current user message, From 02360d6a5af215fca15bdce0af12d1c168af1686 Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 14:07:23 -0400 Subject: [PATCH 19/20] fix: require auth for nonlocal presidio endpoints --- application/single_app/config.py | 2 +- .../single_app/functions_dlp_presidio.py | 54 +++++++++- .../route_frontend_admin_settings.py | 16 ++- .../single_app/templates/admin_settings.html | 3 +- .../features/DLP_UPLOAD_STAGING.md | 4 +- .../features/DLP_WEB_SEARCH_EGRESS_CONTROL.md | 4 +- docs/explanation/release_notes.md | 10 +- .../test_dlp_admin_settings_roundtrip.py | 3 +- .../test_dlp_admin_settings_ui.py | 7 +- functional_tests/test_dlp_admin_ui_smoke.py | 2 +- functional_tests/test_dlp_control_plane.py | 2 +- .../test_dlp_presidio_endpoint.py | 102 ++++++++++++++++-- .../test_dlp_presidio_engine_integration.py | 2 +- functional_tests/test_dlp_regex_rules.py | 2 +- functional_tests/test_dlp_review_events.py | 2 +- functional_tests/test_dlp_telemetry.py | 2 +- .../test_upload_dlp_ingestion_integration.py | 2 +- functional_tests/test_upload_dlp_redaction.py | 2 +- .../test_upload_dlp_workspace_scopes.py | 2 +- .../test_web_search_current_message_only.py | 2 +- .../test_web_search_dlp_egress.py | 2 +- .../test_web_search_dlp_route_integration.py | 2 +- 22 files changed, 190 
insertions(+), 39 deletions(-) diff --git a/application/single_app/config.py b/application/single_app/config.py index e917a776..c079109b 100644 --- a/application/single_app/config.py +++ b/application/single_app/config.py @@ -95,7 +95,7 @@ EXECUTOR_TYPE = 'thread' EXECUTOR_MAX_WORKERS = 30 SESSION_TYPE = 'filesystem' -VERSION = "0.242.074" +VERSION = "0.242.075" SESSION_COOKIE_SAMESITE = os.getenv('SESSION_COOKIE_SAMESITE', 'Lax') SESSION_COOKIE_HTTPONLY = os.getenv('SESSION_COOKIE_HTTPONLY', 'true').lower() != 'false' diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py index 0375b184..602fe710 100644 --- a/application/single_app/functions_dlp_presidio.py +++ b/application/single_app/functions_dlp_presidio.py @@ -22,6 +22,23 @@ DEFAULT_PRESIDIO_AUTH_HEADER_NAME = "X-DLP-API-Key" DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR = "PRESIDIO_DLP_API_KEY" PRESIDIO_AUTH_SECRET_ENV_VAR_PREFIX = "DLP_PRESIDIO_" +PRESIDIO_AUTH_HEADER_NAME_PATTERN = re.compile(r"^[!#$%&'*+\-.^_`|~0-9A-Za-z]+$") +PRESIDIO_RESERVED_AUTH_HEADERS = { + "connection", + "content-length", + "content-type", + "cookie", + "expect", + "host", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "set-cookie", + "te", + "trailer", + "transfer-encoding", + "upgrade", +} PRESIDIO_CREDENTIAL_QUERY_NAMES = { "key", "api_key", @@ -323,6 +340,18 @@ def normalize_presidio_secret_env_var_name(secret_env_var): return "" +def normalize_presidio_auth_header_name(header_name): + """Return an allowed Presidio auth header name, or blank when invalid.""" + normalized = str(header_name or "").strip() + if not normalized: + return DEFAULT_PRESIDIO_AUTH_HEADER_NAME + if not PRESIDIO_AUTH_HEADER_NAME_PATTERN.fullmatch(normalized): + return "" + if normalized.lower() in PRESIDIO_RESERVED_AUTH_HEADERS: + return "" + return normalized + + def _is_credential_like_query_name(query_name): normalized = str(query_name or "").strip().lower() if not normalized: @@ 
-401,16 +430,29 @@ def _get_entities(settings): return [str(item).strip().upper() for item in entities if str(item).strip()] -def _get_auth_headers(settings): - header_name = str((settings or {}).get("dlp_presidio_auth_header_name") or DEFAULT_PRESIDIO_AUTH_HEADER_NAME).strip() +def _get_auth_headers(settings, require_secret=False): + header_name = normalize_presidio_auth_header_name( + (settings or {}).get("dlp_presidio_auth_header_name") or DEFAULT_PRESIDIO_AUTH_HEADER_NAME + ) + if not header_name: + raise PresidioEndpointConfigurationError("Presidio auth header name is not allowed.") + secret_env_var = normalize_presidio_secret_env_var_name( - (settings or {}).get("dlp_presidio_auth_secret_env_var") or "" + (settings or {}).get("dlp_presidio_auth_secret_env_var") or DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR ) - if not header_name or not secret_env_var: + if not secret_env_var: + if require_secret: + raise PresidioEndpointConfigurationError( + "Presidio analyzer endpoints outside localhost require an auth secret env var." + ) return {} secret_value = os.getenv(secret_env_var, "") if not secret_value: + if require_secret: + raise PresidioEndpointConfigurationError( + "Presidio analyzer endpoints outside localhost require the configured auth secret env var to be set." 
+ ) return {} return {header_name: secret_value} @@ -438,6 +480,8 @@ def analyze_with_presidio_endpoint(text, settings): settings.get("dlp_presidio_analyzer_endpoint"), settings.get("dlp_presidio_allowed_private_hosts"), ) + endpoint_host = urlparse(endpoint_url).hostname or "" + require_auth_secret = not _is_loopback_presidio_host(endpoint_host) timeout_seconds = max( 1, min(30, _safe_int(settings.get("dlp_presidio_timeout_seconds"), DEFAULT_PRESIDIO_TIMEOUT_SECONDS)), @@ -455,7 +499,7 @@ def analyze_with_presidio_endpoint(text, settings): } headers = { "Content-Type": "application/json", - **_get_auth_headers(settings), + **_get_auth_headers(settings, require_secret=require_auth_secret), } request_error_type = None diff --git a/application/single_app/route_frontend_admin_settings.py b/application/single_app/route_frontend_admin_settings.py index 40969562..82384a5a 100644 --- a/application/single_app/route_frontend_admin_settings.py +++ b/application/single_app/route_frontend_admin_settings.py @@ -29,6 +29,7 @@ from functions_dlp_presidio import ( PresidioEndpointConfigurationError, DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR, + normalize_presidio_auth_header_name, normalize_presidio_allowed_private_hosts, normalize_presidio_secret_env_var_name, validate_presidio_endpoint_url, @@ -825,10 +826,21 @@ def parse_admin_int(raw_value, fallback_value, field_name="unknown", hard_defaul except PresidioEndpointConfigurationError: dlp_presidio_analyzer_endpoint = '' flash(f"Presidio analyzer endpoint was not saved: {exc}", "warning") - dlp_presidio_auth_header_name = form_data.get( + submitted_dlp_presidio_auth_header_name = form_data.get( 'dlp_presidio_auth_header_name', settings.get('dlp_presidio_auth_header_name', 'X-DLP-API-Key') - ).strip() or 'X-DLP-API-Key' + ).strip() + dlp_presidio_auth_header_name = normalize_presidio_auth_header_name( + submitted_dlp_presidio_auth_header_name + ) + if not dlp_presidio_auth_header_name: + dlp_presidio_auth_header_name = 
normalize_presidio_auth_header_name( + settings.get('dlp_presidio_auth_header_name', 'X-DLP-API-Key') + ) or 'X-DLP-API-Key' + flash( + "Presidio auth header was not saved. Use a valid custom header such as X-DLP-API-Key.", + "warning" + ) submitted_dlp_presidio_auth_secret_env_var = form_data.get( 'dlp_presidio_auth_secret_env_var', settings.get('dlp_presidio_auth_secret_env_var', DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR) diff --git a/application/single_app/templates/admin_settings.html b/application/single_app/templates/admin_settings.html index a12e79c4..109c185a 100644 --- a/application/single_app/templates/admin_settings.html +++ b/application/single_app/templates/admin_settings.html @@ -7403,7 +7403,7 @@
Presidio Analyzer Endpoint
@@ -7421,6 +7421,7 @@
Presidio Analyzer Endpoint
+
Use a custom auth header such as X-DLP-API-Key; connection and content headers are rejected.
diff --git a/docs/explanation/features/DLP_UPLOAD_STAGING.md b/docs/explanation/features/DLP_UPLOAD_STAGING.md index 4bee431d..078fb822 100644 --- a/docs/explanation/features/DLP_UPLOAD_STAGING.md +++ b/docs/explanation/features/DLP_UPLOAD_STAGING.md @@ -2,7 +2,7 @@ ## Overview -Version: 0.242.074 +Version: 0.242.075 Dependencies: shared DLP core, configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, document processing pipeline, Azure AI Search, Azure OpenAI embeddings. @@ -54,7 +54,7 @@ Administrators can select an external Presidio Analyzer-compatible endpoint as t This is Option C for Presidio integration: Presidio runs outside SimpleChat. The SimpleChat application image has no embedded Presidio dependency, model package, or analyzer runtime. Regex DLP remains available as the default and fallback path. -Production deployments should keep the analyzer private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. +Production deployments should keep the analyzer private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. SimpleChat permits unauthenticated Presidio calls only for localhost development endpoints; any non-loopback endpoint requires the configured secret environment variable to resolve before raw upload text or metadata is sent. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. 
Custom auth header names are validated and reserved HTTP headers such as `Host`, `Content-Type`, and `Connection` are rejected. The analyzer receives raw extracted text before redaction. SimpleChat, proxies, wrappers, analyzer containers, and platform diagnostics must not log raw request bodies, response bodies, chunk text, OCR text, vision text, metadata values, matched values, or analyzer explanations. Stored DLP metadata and telemetry remain counts-only. diff --git a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md index be4b2e47..a6c657f7 100644 --- a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md +++ b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md @@ -2,7 +2,7 @@ ## Overview -Version: 0.242.074 +Version: 0.242.075 Dependencies: Flask chat routes, configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, and Azure AI Foundry web-search agent configuration. @@ -77,7 +77,7 @@ Administrators can select an external Presidio Analyzer-compatible endpoint as t This is Option C for Presidio integration: the analyzer is external to SimpleChat. SimpleChat keeps no embedded Presidio dependency, model package, or analyzer runtime in the app image. Regex DLP remains available as the default and fallback path. -Production deployments should keep the analyzer endpoint private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. +Production deployments should keep the analyzer endpoint private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. 
SimpleChat permits unauthenticated Presidio calls only for localhost development endpoints; any non-loopback endpoint requires the configured secret environment variable to resolve before raw text is sent. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. Custom auth header names are validated and reserved HTTP headers such as `Host`, `Content-Type`, and `Connection` are rejected. Because the analyzer receives raw text before redaction, SimpleChat, proxies, wrappers, and analyzer infrastructure must not log raw request bodies, response bodies, snippets, matched values, or analyzer explanations. Safe telemetry remains limited to entity types, counts, actions, engines, modes, and scanner status. diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index 68c849ce..f46f8511 100644 --- a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -1,13 +1,19 @@ -This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.074, and the per-version entries continue immediately after it. +This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.075, and the per-version entries continue immediately after it. For feature-focused and fix-focused drill-downs by version, see [Features by Version](/explanation/features/) and [Fixes by Version](/explanation/fixes/). -### **(v0.242.074)** +### **(v0.242.075)** #### Bug Fixes +* **Presidio Endpoint Authentication Guardrails** + * Requires non-loopback Presidio Analyzer endpoints to resolve the configured env-backed auth secret before SimpleChat sends raw scan text. 
+ * Keeps unauthenticated Presidio calls limited to localhost development endpoints. + * Validates custom Presidio auth header names and rejects reserved HTTP headers such as `Host`, `Content-Type`, and `Connection`. + * (Ref: Presidio endpoint auth headers, DLP admin settings, Presidio deployment guidance) + * **Presidio Endpoint Runtime Safety Hardening** * Rejects Presidio Analyzer hostnames whose DNS answers include loopback, link-local, private, or otherwise non-global addresses unless the exact endpoint host is explicitly allowlisted. * Normalizes untrusted Presidio entity labels to safe uppercase identifiers before redaction output, match counts, match summaries, and telemetry are built. diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py index b4b4d3da..c43e5b85 100644 --- a/functional_tests/test_dlp_admin_settings_roundtrip.py +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP admin settings roundtrip. 
-Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures DLP admin settings are normalized, persisted, and rendered @@ -237,6 +237,7 @@ def test_presidio_endpoint_settings_are_normalized_without_secret_persistence(): assert "dlp_presidio_entities = ['CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'US_SSN']" in route_source assert "validate_presidio_endpoint_url(" in route_source assert "normalize_presidio_allowed_private_hosts(" in route_source + assert "normalize_presidio_auth_header_name(" in route_source assert "normalize_presidio_secret_env_var_name(" in route_source assert "settings.get('dlp_presidio_analyzer_endpoint', '')" in route_source assert "'dlp_presidio_auth_secret'" not in route_source diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py index 719a0b8a..a1c47337 100644 --- a/functional_tests/test_dlp_admin_settings_ui.py +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP admin settings UI. 
-Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures shared and web-search DLP defaults exist, admin settings @@ -166,8 +166,9 @@ def test_presidio_endpoint_controls_are_rendered_without_secret_value_field(): assert 'name="dlp_presidio_language"' in source assert 'name="dlp_presidio_entities"' in source assert 'name="dlp_presidio_auth_secret"' not in source - assert "public https endpoints are allowed by default" in source.lower() - assert "private hosts must be listed explicitly" in source.lower() + assert "private presidio endpoint with an env-backed api key" in source.lower() + assert "localhost endpoints may run without auth for local testing only" in source.lower() + assert "connection and content headers are rejected" in source.lower() def test_admin_js_uses_d_none_for_dlp_toggles(): diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py index f7a53bed..1f4960b2 100644 --- a/functional_tests/test_dlp_admin_ui_smoke.py +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP admin UI smoke. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures the DLP admin settings card can be extracted into collapsed diff --git a/functional_tests/test_dlp_control_plane.py b/functional_tests/test_dlp_control_plane.py index 59e14db7..17ed9155 100644 --- a/functional_tests/test_dlp_control_plane.py +++ b/functional_tests/test_dlp_control_plane.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP control plane core behavior. 
-Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures the shared DLP core supports disabled, regex, Luhn-validated diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index 14ae8ad1..9d021d7e 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for external Presidio endpoint DLP adapter. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.071 This test ensures SimpleChat can call a configured Presidio-compatible analyzer @@ -136,13 +136,17 @@ def __init__(self, *args, **kwargs): raise AssertionError("Unsafe rebinding address reached socket creation.") monkeypatch.setenv("NO_PROXY", "*") + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") monkeypatch.setattr(socket, "getaddrinfo", fake_getaddrinfo) monkeypatch.setattr(socket, "socket", BlockingSocket) try: analyze_with_presidio_endpoint( RAW_TEXT, - {"dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze"}, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, ) except (PresidioEndpointConfigurationError, PresidioEndpointRequestError): assert len(dns_calls) >= 2 @@ -291,8 +295,8 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, assert captured["allow_redirects"] is False -def test_analyze_with_presidio_endpoint_omits_auth_header_without_env_secret(monkeypatch): - """Raw API keys should come only from the configured environment variable.""" +def test_analyze_with_presidio_endpoint_allows_localhost_without_env_secret(monkeypatch): + """Local development endpoints may omit auth, but only on loopback hosts.""" from functions_dlp_presidio import analyze_with_presidio_endpoint captured = {} @@ -306,13 +310,13 @@ def fake_post(url, json=None, headers=None, 
timeout=None, allow_redirects=None, monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) monkeypatch.delenv("PRESIDIO_DLP_API_KEY", raising=False) - stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) + stub_dns_answers(monkeypatch, "localhost", ["127.0.0.1"]) analyze_with_presidio_endpoint( RAW_TEXT, { - "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", - "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_analyzer_endpoint": "http://localhost:5002/analyze", + "dlp_presidio_allowed_private_hosts": "localhost", "dlp_presidio_auth_header_name": "X-DLP-API-Key", "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", }, @@ -321,6 +325,41 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, assert "X-DLP-API-Key" not in captured["headers"] +def test_analyze_with_presidio_endpoint_requires_auth_secret_for_nonlocal_endpoint(monkeypatch): + """Non-loopback endpoints should not receive raw text without env-backed auth.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, analyze_with_presidio_endpoint + + called = {"post": False} + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): + called["post"] = True + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [] + return response + + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.delenv("PRESIDIO_DLP_API_KEY", raising=False) + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + except 
PresidioEndpointConfigurationError as exc: + assert "auth secret" in str(exc).lower() + assert called["post"] is False + return + + raise AssertionError("Expected missing non-local auth secret to block the request.") + + def test_analyze_with_presidio_endpoint_raises_safe_error_without_raw_text(monkeypatch): """Endpoint exceptions should not retain raw scanned text in messages or exception chains.""" from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint @@ -329,6 +368,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, raise RuntimeError(f"upstream included {RAW_TEXT}") monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) try: @@ -337,6 +377,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, { "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", }, ) except PresidioEndpointRequestError as exc: @@ -366,6 +407,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, return response monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) results = analyze_with_presidio_endpoint( @@ -373,6 +415,7 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, { "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", }, ) @@ -399,12 +442,16 @@ def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, return response 
monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") stub_dns_answers(monkeypatch, "presidio.example.com", ["93.184.216.34"]) try: analyze_with_presidio_endpoint( RAW_TEXT, - {"dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze"}, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, ) except PresidioEndpointRequestError as exc: assert "redirect" in str(exc).lower() @@ -447,3 +494,42 @@ def test_presidio_auth_secret_env_var_name_validation(monkeypatch): "dlp_presidio_auth_secret_env_var": "DLP_PRESIDIO_TOKEN", } ) == {"X-DLP-API-Key": "prefixed-secret"} + + +def test_presidio_auth_header_name_validation(monkeypatch): + """Auth header names should reject reserved HTTP headers and malformed names.""" + from functions_dlp_presidio import ( + PresidioEndpointConfigurationError, + _get_auth_headers, + normalize_presidio_auth_header_name, + ) + + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "presidio-secret") + + assert normalize_presidio_auth_header_name("") == "X-DLP-API-Key" + assert normalize_presidio_auth_header_name("X-DLP-API-Key") == "X-DLP-API-Key" + assert normalize_presidio_auth_header_name("Authorization") == "Authorization" + assert normalize_presidio_auth_header_name("Content-Type") == "" + assert normalize_presidio_auth_header_name("Host") == "" + assert normalize_presidio_auth_header_name("Connection") == "" + assert normalize_presidio_auth_header_name("Bad Header") == "" + assert normalize_presidio_auth_header_name("X-DLP-API-Key\r\nX-Injected") == "" + assert _get_auth_headers( + { + "dlp_presidio_auth_header_name": "Authorization", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + } + ) == {"Authorization": "presidio-secret"} + + try: + _get_auth_headers( + { + "dlp_presidio_auth_header_name": "Content-Type", + 
"dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + } + ) + except PresidioEndpointConfigurationError as exc: + assert "header" in str(exc).lower() + return + + raise AssertionError("Expected reserved auth header name to be rejected.") diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py index 99c9754f..2d379e77 100644 --- a/functional_tests/test_dlp_presidio_engine_integration.py +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for Presidio endpoint engine integration. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.071 This test ensures the external Presidio endpoint engine reuses SimpleChat's diff --git a/functional_tests/test_dlp_regex_rules.py b/functional_tests/test_dlp_regex_rules.py index 7552a62c..e89c8856 100644 --- a/functional_tests/test_dlp_regex_rules.py +++ b/functional_tests/test_dlp_regex_rules.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for configurable DLP regex rules. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures DLP regex rules are admin-configurable, validated, diff --git a/functional_tests/test_dlp_review_events.py b/functional_tests/test_dlp_review_events.py index 15986076..0fba0af3 100644 --- a/functional_tests/test_dlp_review_events.py +++ b/functional_tests/test_dlp_review_events.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for DLP review event safety. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures DLP review routing defaults to disabled and any optional diff --git a/functional_tests/test_dlp_telemetry.py b/functional_tests/test_dlp_telemetry.py index 297a9816..95b2485c 100644 --- a/functional_tests/test_dlp_telemetry.py +++ b/functional_tests/test_dlp_telemetry.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for safe DLP telemetry. 
-Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures DLP telemetry properties include bounded decision metadata diff --git a/functional_tests/test_upload_dlp_ingestion_integration.py b/functional_tests/test_upload_dlp_ingestion_integration.py index bda8d0dc..877c6b85 100644 --- a/functional_tests/test_upload_dlp_ingestion_integration.py +++ b/functional_tests/test_upload_dlp_ingestion_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for upload DLP ingestion integration. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures upload DLP blocks stop before embeddings/search indexing and diff --git a/functional_tests/test_upload_dlp_redaction.py b/functional_tests/test_upload_dlp_redaction.py index 992ec467..7d53cc25 100644 --- a/functional_tests/test_upload_dlp_redaction.py +++ b/functional_tests/test_upload_dlp_redaction.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for upload DLP redaction. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures upload DLP redacts chunk text before embeddings and Azure AI diff --git a/functional_tests/test_upload_dlp_workspace_scopes.py b/functional_tests/test_upload_dlp_workspace_scopes.py index e7c7569a..58db25c0 100644 --- a/functional_tests/test_upload_dlp_workspace_scopes.py +++ b/functional_tests/test_upload_dlp_workspace_scopes.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for upload DLP workspace scope coverage. 
-Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures personal, group, public, and external public upload routes diff --git a/functional_tests/test_web_search_current_message_only.py b/functional_tests/test_web_search_current_message_only.py index 9d26ac2e..557ff8b0 100644 --- a/functional_tests/test_web_search_current_message_only.py +++ b/functional_tests/test_web_search_current_message_only.py @@ -1,7 +1,7 @@ # test_web_search_current_message_only.py """ Functional test for current-message-only web search egress. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.241.008 This test ensures external web search uses only the current user message, diff --git a/functional_tests/test_web_search_dlp_egress.py b/functional_tests/test_web_search_dlp_egress.py index 82ddacda..13f3a0ac 100644 --- a/functional_tests/test_web_search_dlp_egress.py +++ b/functional_tests/test_web_search_dlp_egress.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for web-search DLP egress. -Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures web-search DLP runs after current-message query construction diff --git a/functional_tests/test_web_search_dlp_route_integration.py b/functional_tests/test_web_search_dlp_route_integration.py index fa1c673e..369b46ef 100644 --- a/functional_tests/test_web_search_dlp_route_integration.py +++ b/functional_tests/test_web_search_dlp_route_integration.py @@ -2,7 +2,7 @@ #!/usr/bin/env python3 """ Functional test for web-search DLP route integration. 
-Version: 0.242.074 +Version: 0.242.075 Implemented in: 0.242.073 This test ensures chat routes evaluate DLP before Foundry web search, suppress From 62586906da6c9d4ade57894f6afc91863daca23a Mon Sep 17 00:00:00 2001 From: Zachary Arguelles Date: Thu, 18 Jun 2026 14:16:00 -0400 Subject: [PATCH 20/20] test: add presidio functional test runners --- functional_tests/test_dlp_presidio_endpoint.py | 8 +++++++- functional_tests/test_dlp_presidio_engine_integration.py | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py index 9d021d7e..3f96abb4 100644 --- a/functional_tests/test_dlp_presidio_endpoint.py +++ b/functional_tests/test_dlp_presidio_endpoint.py @@ -3,7 +3,7 @@ """ Functional test for external Presidio endpoint DLP adapter. Version: 0.242.075 -Implemented in: 0.242.071 +Implemented in: 0.242.075 This test ensures SimpleChat can call a configured Presidio-compatible analyzer endpoint without embedding Presidio packages or leaking raw scanned text. @@ -533,3 +533,9 @@ def test_presidio_auth_header_name_validation(monkeypatch): return raise AssertionError("Expected reserved auth header name to be rejected.") + + +if __name__ == "__main__": + import pytest + + sys.exit(pytest.main([__file__])) diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py index 2d379e77..9e690232 100644 --- a/functional_tests/test_dlp_presidio_engine_integration.py +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -3,7 +3,7 @@ """ Functional test for Presidio endpoint engine integration. Version: 0.242.075 -Implemented in: 0.242.071 +Implemented in: 0.242.075 This test ensures the external Presidio endpoint engine reuses SimpleChat's existing DLP decision, redaction, and fail-closed behavior. 
@@ -153,3 +153,9 @@ def test_external_analyzer_normalizes_empty_and_too_long_entity_labels(): assert normalized["match_counts"] == {"UNKNOWN_ENTITY": 2} assert normalized["matches"] == [{"entity_type": "UNKNOWN_ENTITY", "count": 2}] assert long_label not in repr(normalized) + + +if __name__ == "__main__": + import pytest + + sys.exit(pytest.main([__file__]))