diff --git a/application/single_app/config.py b/application/single_app/config.py index 89ff14aa..c079109b 100644 --- a/application/single_app/config.py +++ b/application/single_app/config.py @@ -95,7 +95,7 @@ EXECUTOR_TYPE = 'thread' EXECUTOR_MAX_WORKERS = 30 SESSION_TYPE = 'filesystem' -VERSION = "0.242.072" +VERSION = "0.242.075" SESSION_COOKIE_SAMESITE = os.getenv('SESSION_COOKIE_SAMESITE', 'Lax') SESSION_COOKIE_HTTPONLY = os.getenv('SESSION_COOKIE_HTTPONLY', 'true').lower() != 'false' diff --git a/application/single_app/functions_authentication.py b/application/single_app/functions_authentication.py index 86abdc12..66eaaa52 100644 --- a/application/single_app/functions_authentication.py +++ b/application/single_app/functions_authentication.py @@ -2,9 +2,14 @@ import base64 import json +import re from config import * -from functions_appinsights import log_event +try: + from functions_appinsights import log_event +except Exception: + def log_event(message, extra=None, level=None, exceptionTraceback=False): + return None from functions_settings import * from functions_debug import debug_print @@ -331,7 +336,23 @@ def get_valid_access_token_for_plugins(scopes=None): "error_code": error_code, "error_description": error_desc } - + + +def _sanitize_video_indexer_auth_log_value(value): + text = str(value) + text = re.sub( + r'([?&]accessToken=)[^&\s\'"<>]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + return re.sub( + r'([\'"]?accessToken[\'"]?\s*[:=]\s*[\'"]?)[^,\'"\s}&]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + def get_video_indexer_account_token(settings, video_id=None): """ Get Video Indexer access token using managed identity authentication. 
@@ -435,7 +456,7 @@ def get_video_indexer_managed_identity_token(settings, video_id=None): debug_print(f"[VIDEO INDEXER AUTH] ARM API response status: {resp.status_code}") if resp.status_code != 200: - debug_print(f"[VIDEO INDEXER AUTH] ARM API response text: {resp.text}") + debug_print(f"[VIDEO INDEXER AUTH] ARM API response text: {_sanitize_video_indexer_auth_log_value(resp.text)}") resp.raise_for_status() response_data = resp.json() @@ -443,20 +464,21 @@ def get_video_indexer_managed_identity_token(settings, video_id=None): ai = response_data.get("accessToken") if not ai: - debug_print(f"[VIDEO INDEXER AUTH] ERROR: No accessToken in response: {response_data}") + debug_print(f"[VIDEO INDEXER AUTH] ERROR: No accessToken in response; response keys: {list(response_data.keys())}") raise ValueError("No accessToken found in ARM API response") debug_print(f"[VIDEO INDEXER AUTH] Account token acquired successfully (length: {len(ai)})") debug_print(f"[VIDEO] Account token acquired (len={len(ai)})", flush=True) return ai except requests.exceptions.RequestException as e: - debug_print(f"[VIDEO INDEXER AUTH] ERROR in ARM API request: {str(e)}") + sanitized_error = _sanitize_video_indexer_auth_log_value(e) + debug_print(f"[VIDEO INDEXER AUTH] ERROR in ARM API request: {sanitized_error}") if hasattr(e, 'response') and e.response is not None: debug_print(f"[VIDEO INDEXER AUTH] Error response status: {e.response.status_code}") - debug_print(f"[VIDEO INDEXER AUTH] Error response text: {e.response.text}") + debug_print(f"[VIDEO INDEXER AUTH] Error response text: {_sanitize_video_indexer_auth_log_value(e.response.text)}") raise except Exception as e: - debug_print(f"[VIDEO INDEXER AUTH] Unexpected error: {str(e)}") + debug_print(f"[VIDEO INDEXER AUTH] Unexpected error: {_sanitize_video_indexer_auth_log_value(e)}") raise diff --git a/application/single_app/functions_dlp.py b/application/single_app/functions_dlp.py new file mode 100644 index 00000000..0f4383c8 --- /dev/null +++ 
UNKNOWN_DLP_ENTITY_TYPE = "UNKNOWN_ENTITY"
SAFE_DLP_ENTITY_TYPE_PATTERN = re.compile(r"^[A-Z0-9_]{1,64}$")


def _decision_from_counts(match_counts, mode):
    """Map match counts plus the policy mode to a decision string.

    No matches always yields ``"allow"``; otherwise ``"block"`` and
    ``"redact"`` modes map to themselves and any other mode to ``"monitor"``.
    """
    if not match_counts:
        return "allow"
    if mode == "block":
        return "block"
    if mode == "redact":
        return "redact"
    return "monitor"


def normalize_dlp_entity_type(entity_type):
    """Normalize untrusted analyzer entity labels before they reach outputs.

    The label is stripped and upper-cased; anything that is not 1-64
    characters of ``A-Z``, ``0-9`` or ``_`` collapses to
    ``UNKNOWN_DLP_ENTITY_TYPE``.
    """
    normalized = str(entity_type or "").strip().upper()
    if SAFE_DLP_ENTITY_TYPE_PATTERN.fullmatch(normalized):
        return normalized
    return UNKNOWN_DLP_ENTITY_TYPE


def _safe_result_start(item):
    """Sort key: the item's integer ``start`` offset, or 0 when it is not numeric."""
    try:
        return int(item.get("start"))
    except (TypeError, ValueError):
        return 0


def normalize_external_analyzer_results(text, recognizer_results, mode="redact", engine="external_analyzer"):
    """Normalize external analyzer entity offsets into the shared counts-only result.

    Each usable recognizer result (a dict with non-``None`` ``start`` and
    ``end``) is clamped to the bounds of ``text`` and replaced, in start
    order, by a ``[REDACTED_<ENTITY_TYPE>]`` placeholder.

    A detection that starts inside an already-redacted span gets no
    placeholder of its own and is not counted, but the redacted region is
    extended to that detection's end. Without the extension, the part of an
    overlapping detection that runs past the previous span would be copied
    into the "redacted" text verbatim.

    Args:
        text: Source text; ``None`` is treated as an empty string.
        recognizer_results: Iterable of dicts with ``entity_type``, ``start``
            and ``end``. Non-dict items and items missing an offset are ignored.
        mode: Policy mode used to derive the decision.
        engine: Engine label copied into the result.

    Returns:
        dict with the keys ``enabled``, ``engine``, ``mode``, ``decision``,
        ``text``, ``redacted_text``, ``total_replacements``, ``match_counts``,
        ``matches``, ``metadata`` and ``scanner_status``.

    Raises:
        ValueError, TypeError: if an item's ``start``/``end`` is present but
            cannot be converted with ``int()``.
    """
    source_text = str(text or "")
    text_length = len(source_text)
    sorted_results = sorted(
        (
            item for item in (recognizer_results or [])
            if isinstance(item, dict) and item.get("start") is not None and item.get("end") is not None
        ),
        key=_safe_result_start,
    )
    match_counts = OrderedDict()
    redacted_parts = []
    cursor = 0

    for item in sorted_results:
        start = max(0, min(text_length, int(item.get("start"))))
        end = max(start, min(text_length, int(item.get("end"))))
        if start < cursor:
            # Overlaps the previous placeholder: swallow the tail too so it
            # is never emitted in clear text.
            cursor = max(cursor, end)
            continue
        entity_type = normalize_dlp_entity_type(item.get("entity_type"))
        redacted_parts.append(source_text[cursor:start])
        redacted_parts.append(f"[REDACTED_{entity_type}]")
        cursor = end
        match_counts[entity_type] = match_counts.get(entity_type, 0) + 1

    redacted_parts.append(source_text[cursor:])
    redacted_text = "".join(redacted_parts)
    counts = dict(match_counts)
    decision = _decision_from_counts(counts, mode)

    return {
        "enabled": True,
        "engine": engine,
        "mode": mode,
        "decision": decision,
        "text": redacted_text if counts else source_text,
        "redacted_text": redacted_text if counts else source_text,
        "total_replacements": sum(counts.values()),
        "match_counts": counts,
        "matches": [{"entity_type": key, "count": value} for key, value in counts.items()],
        "metadata": {"adapter": "external_analyzer"},
        "scanner_status": "ok",
    }
decision="allow") + + scan_text = original_text[:max_scan_chars] + skipped_chars = max(0, len(original_text) - len(scan_text)) + upload_fail_on_match = surface == "upload" and _bool_setting(settings, "upload_dlp_fail_upload_on_match", False) + + if skipped_chars and surface in {"web_search", "upload"} and (mode in {"redact", "block"} or upload_fail_on_match): + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": "block", + "text": "", + "redacted_text": "", + "total_replacements": 0, + "match_counts": {}, + "matches": [], + "metadata": {"skipped_chars": skipped_chars}, + "scanner_status": "truncated", + } + + try: + if engine == "presidio_endpoint": + redacted_text, match_counts, matches, scanner_metadata = _apply_presidio_endpoint_engine( + scan_text, + settings, + surface, + ) + else: + redacted_text, match_counts, matches, scanner_metadata = _apply_regex_engine(scan_text, settings, surface) + except Exception as exc: + log_event( + "[DLP] Scanner error", + extra={ + "dlp_surface": surface, + "dlp_engine": engine, + "scanner_status": "error", + "error_type": type(exc).__name__, + }, + level=logging.WARNING, + exceptionTraceback=False, + ) + fail_closed = _bool_setting(settings, "dlp_fail_closed_on_scanner_error", True) + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": "block" if fail_closed else "allow", + "text": "" if fail_closed else original_text, + "redacted_text": "" if fail_closed else original_text, + "total_replacements": 0, + "match_counts": {}, + "matches": [], + "metadata": {"error_hash": hashlib.sha256(str(exc).encode("utf-8")).hexdigest()[:16]}, + "scanner_status": "error", + } + + if skipped_chars and mode == "monitor": + metadata = dict(scanner_metadata) + metadata["skipped_chars"] = skipped_chars + metadata = {key: value for key, value in metadata.items() if value not in ("", None, {}, [])} + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": "allow", + "text": 
original_text, + "redacted_text": original_text, + "total_replacements": sum(match_counts.values()), + "match_counts": dict(match_counts), + "matches": matches, + "metadata": metadata, + "scanner_status": "truncated", + } + + decision = _decision_from_counts(match_counts, mode) + safe_text = "" if decision == "block" else (redacted_text if match_counts else original_text) + safe_redacted_text = "" if decision == "block" else (redacted_text if match_counts else original_text) + metadata = dict(scanner_metadata) + if skipped_chars: + metadata["skipped_chars"] = skipped_chars + metadata = {key: value for key, value in metadata.items() if value not in ("", None, {}, [])} + + return { + "enabled": True, + "engine": engine, + "mode": mode, + "decision": decision, + "text": safe_text, + "redacted_text": safe_redacted_text, + "total_replacements": sum(match_counts.values()), + "match_counts": dict(match_counts), + "matches": matches, + "metadata": metadata, + "scanner_status": "truncated" if skipped_chars else "ok", + } + + +def evaluate_web_search_egress(text, settings=None, context=None): + """Evaluate and shape DLP decisions for web-search egress.""" + settings = settings or {} + context = context or {} + + if not _bool_setting(settings, "enable_web_search_dlp", False): + result = _empty_result( + text, + enabled=_bool_setting(settings, "enable_dlp_control_plane", False), + engine=_normalize_engine(settings), + mode=_normalize_mode(settings, "web_search"), + decision="allow", + ) + else: + result = evaluate_dlp_text(text, settings=settings, context=context, surface="web_search") + + decision = result.get("decision", "allow") + web_search_allowed = decision != "block" + if decision == "block": + status_message = WEB_SEARCH_BLOCKED_MESSAGE + web_search_query_text = "" + elif decision == "redact": + status_message = WEB_SEARCH_REDACTED_MESSAGE + web_search_query_text = result.get("redacted_text", "") + else: + status_message = "" + web_search_query_text = str(text or "") + 
+ shaped = dict(result) + shaped.update( + { + "web_search_allowed": web_search_allowed, + "web_search_query_text": web_search_query_text, + "status_message": status_message, + } + ) + return shaped + + +def _safe_entity_counts(match_counts): + counts = OrderedDict() + for entity_type, count in (match_counts or {}).items(): + try: + normalized_count = int(count) + except (TypeError, ValueError): + continue + if normalized_count <= 0: + continue + safe_entity_type = normalize_dlp_entity_type(entity_type) + counts[safe_entity_type] = counts.get(safe_entity_type, 0) + normalized_count + return dict(counts) + + +def _error_hash(result): + metadata = result.get("metadata") if isinstance(result, dict) else {} + raw_error = "" + if isinstance(metadata, dict): + raw_error = str(metadata.get("error") or metadata.get("error_hash") or "") + if not raw_error: + raw_error = "scanner_error" + return hashlib.sha256(raw_error.encode("utf-8")).hexdigest()[:16] + + +def build_dlp_telemetry_properties(result, surface, context=None): + """Build App Insights-safe DLP telemetry properties.""" + result = result or {} + context = context or {} + properties = { + "activity_type": "dlp_decision", + "dlp_surface": str(surface or "unknown"), + "dlp_action": str(result.get("decision") or "allow"), + "dlp_engine": str(result.get("engine") or "unknown"), + "dlp_mode": str(result.get("mode") or "monitor"), + "workspace_scope": str(context.get("workspace_scope") or context.get("document_scope") or "unknown"), + "scanner_status": str(result.get("scanner_status") or "ok"), + "dlp_total_replacements": int(result.get("total_replacements") or 0), + "dlp_entity_counts": _safe_entity_counts(result.get("match_counts")), + } + + for key in ("conversation_id", "chat_type", "document_scope", "document_id"): + if context.get(key): + properties[key] = str(context.get(key)) + + if properties["scanner_status"] != "ok": + properties["scanner_error"] = _error_hash(result) + + return properties + + +def 
def evaluate_upload_content(text, settings=None, context=None):
    """Evaluate uploaded text against the upload DLP policy and shape the outcome.

    PR2-facing helper for upload DLP; upload wiring is added later.

    When ``enable_upload_dlp`` is off, an "allow" result is produced without
    scanning; otherwise the text goes through ``evaluate_dlp_text`` with the
    ``"upload"`` surface. If ``upload_dlp_fail_upload_on_match`` is set and
    the result reports at least one replacement, the decision is forced to
    ``"block"`` and both text fields are emptied.

    Args:
        text: Extracted upload text; ``None`` is treated as empty.
        settings: Settings mapping; ``None`` means no settings.
        context: Optional context mapping passed through to the scanner and
            to the metadata summary.

    Returns:
        A copy of the DLP result extended with ``upload_allowed`` (bool),
        ``sanitized_text`` (empty on block, redacted text on redact, the
        original text otherwise), ``status`` (``"scanner_failed"``,
        ``"blocked"``, ``"accepted_with_redactions"``,
        ``"accepted_with_dlp_monitoring"`` or ``"accepted"``) and
        ``dlp_metadata`` (a counts-only summary).
    """
    settings = settings or {}
    context = context or {}

    if not _bool_setting(settings, "enable_upload_dlp", False):
        result = _empty_result(
            text,
            enabled=_bool_setting(settings, "enable_dlp_control_plane", False),
            # Report the normalized engine, as evaluate_web_search_egress and
            # evaluate_dlp_text do, so downstream consumers never see an
            # unvalidated or differently cased engine label.
            engine=_normalize_engine(settings),
            mode=_normalize_mode(settings, "upload"),
            decision="allow",
        )
    else:
        result = evaluate_dlp_text(text, settings=settings, context=context, surface="upload")

    if (
        _bool_setting(settings, "upload_dlp_fail_upload_on_match", False)
        and int(result.get("total_replacements") or 0) > 0
    ):
        # Copy before mutating so the scanner's own result dict is untouched.
        result = dict(result)
        result["decision"] = "block"
        result["text"] = ""
        result["redacted_text"] = ""

    decision = result.get("decision", "allow")
    scanner_status = result.get("scanner_status", "ok")
    # NOTE(review): evaluate_dlp_text only emits "ok", "truncated" and "error"
    # as scanner_status, so the "blocked" comparison looks purely defensive.
    upload_allowed = decision != "block" and scanner_status != "blocked"
    if scanner_status != "ok" and decision == "block":
        status = "scanner_failed"
    elif decision == "block":
        status = "blocked"
    elif decision == "redact":
        status = "accepted_with_redactions"
    elif decision == "monitor":
        status = "accepted_with_dlp_monitoring"
    else:
        status = "accepted"

    if decision == "block":
        sanitized_text = ""
    elif decision == "redact":
        sanitized_text = result.get("redacted_text", "")
    else:
        sanitized_text = str(text or "")

    shaped = dict(result)
    shaped.update(
        {
            "upload_allowed": upload_allowed,
            "sanitized_text": sanitized_text,
            "status": status,
            "dlp_metadata": build_dlp_metadata_summary(result, surface="upload", context=context),
        }
    )
    return shaped
context.get(key): + summary[key] = str(context.get(key)) + return summary + + +def build_upload_dlp_file_log_summary(result, context=None): + """Build a safe file-processing log summary for upload DLP decisions.""" + result = result or {} + context = context or {} + summary = build_dlp_metadata_summary(result, surface="upload", context=context) + for key in ("document_id", "workspace_scope", "page_number", "text_length"): + if context.get(key) is not None: + summary[key] = context.get(key) + return summary diff --git a/application/single_app/functions_dlp_presidio.py b/application/single_app/functions_dlp_presidio.py new file mode 100644 index 00000000..602fe710 --- /dev/null +++ b/application/single_app/functions_dlp_presidio.py @@ -0,0 +1,536 @@ +# functions_dlp_presidio.py + +"""HTTP adapter for Presidio-compatible Analyzer endpoints.""" + +import ipaddress +import os +import re +import socket +from urllib.parse import parse_qsl, urlparse + +import requests +from urllib3 import connection as urllib3_connection +from urllib3 import connectionpool as urllib3_connectionpool +from urllib3 import poolmanager as urllib3_poolmanager +from urllib3.util import connection as urllib3_util_connection +from urllib3.util.timeout import _DEFAULT_TIMEOUT + + +DEFAULT_PRESIDIO_TIMEOUT_SECONDS = 5 +DEFAULT_PRESIDIO_LANGUAGE = "en" +DEFAULT_PRESIDIO_SCORE_THRESHOLD = 0.5 +DEFAULT_PRESIDIO_AUTH_HEADER_NAME = "X-DLP-API-Key" +DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR = "PRESIDIO_DLP_API_KEY" +PRESIDIO_AUTH_SECRET_ENV_VAR_PREFIX = "DLP_PRESIDIO_" +PRESIDIO_AUTH_HEADER_NAME_PATTERN = re.compile(r"^[!#$%&'*+\-.^_`|~0-9A-Za-z]+$") +PRESIDIO_RESERVED_AUTH_HEADERS = { + "connection", + "content-length", + "content-type", + "cookie", + "expect", + "host", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "set-cookie", + "te", + "trailer", + "transfer-encoding", + "upgrade", +} +PRESIDIO_CREDENTIAL_QUERY_NAMES = { + "key", + "api_key", + "apikey", + "secret", + "token", + 
"password", + "connection", + "sig", +} +PRESIDIO_CREDENTIAL_QUERY_WORDS = { + "key", + "secret", + "token", + "password", + "connection", + "sig", +} +PRESIDIO_PRIVATE_HOST_SUFFIXES = ( + ".internal", + ".local", + ".localdomain", + ".lan", + ".home", + ".corp", +) +PRESIDIO_LOCAL_HOSTS = {"localhost"} +PRESIDIO_SECRET_ENV_VAR_PATTERN = re.compile(r"^[A-Z][A-Z0-9_]*$") + + +class PresidioEndpointConfigurationError(ValueError): + """Raised when the configured Presidio endpoint is not safe to call.""" + + +class PresidioEndpointRequestError(RuntimeError): + """Raised when the Presidio endpoint cannot return a usable analyzer result.""" + + +def _normalize_host_identifier(host): + normalized = str(host or "").strip().lower().strip(".") + if normalized.startswith("[") and "]" in normalized: + normalized = normalized[1:normalized.index("]")] + if "://" in normalized: + normalized = (urlparse(normalized).hostname or "").strip().lower().strip(".") + return normalized + + +def normalize_presidio_allowed_private_hosts(value): + """Normalize the admin allowlist for private Presidio endpoint hosts.""" + if isinstance(value, (list, tuple, set)): + raw_items = value + else: + raw_items = re.split(r"[\n,]+", str(value or "")) + + normalized_hosts = [] + seen_hosts = set() + for item in raw_items: + host = _normalize_host_identifier(item) + if not host or host in seen_hosts: + continue + normalized_hosts.append(host) + seen_hosts.add(host) + return ", ".join(normalized_hosts) + + +def _get_allowed_private_hosts(allowed_private_hosts): + normalized_allowlist = normalize_presidio_allowed_private_hosts(allowed_private_hosts) + if not normalized_allowlist: + return set() + return { + item.strip() + for item in normalized_allowlist.split(",") + if item.strip() + } + + +def _is_private_presidio_host(host): + normalized_host = _normalize_host_identifier(host) + if not normalized_host: + return True + if normalized_host in PRESIDIO_LOCAL_HOSTS or normalized_host.endswith(".localhost"): 
+ return True + try: + ip_address = ipaddress.ip_address(normalized_host) + return not ip_address.is_global + except ValueError: + return normalized_host.endswith(PRESIDIO_PRIVATE_HOST_SUFFIXES) + + +def _is_loopback_presidio_host(host): + normalized_host = _normalize_host_identifier(host) + if normalized_host in PRESIDIO_LOCAL_HOSTS or normalized_host.endswith(".localhost"): + return True + try: + return ipaddress.ip_address(normalized_host).is_loopback + except ValueError: + return False + + +def _is_ip_literal(host): + try: + ipaddress.ip_address(_normalize_host_identifier(host)) + return True + except ValueError: + return False + + +def _resolve_presidio_host_addresses(host, port): + normalized_host = _normalize_host_identifier(host) + if not normalized_host: + return [] + if _is_ip_literal(normalized_host): + return [ipaddress.ip_address(normalized_host)] + + try: + address_info = socket.getaddrinfo( + normalized_host, + port, + type=socket.SOCK_STREAM, + ) + except socket.gaierror as exc: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint host must resolve in DNS.") from exc + + return _extract_presidio_addresses(address_info) + + +def _extract_presidio_addresses(address_info): + addresses = [] + seen_addresses = set() + for item in address_info: + sockaddr = item[4] if len(item) > 4 else None + if not sockaddr: + continue + raw_address = str(sockaddr[0]).split("%", 1)[0] + try: + address = ipaddress.ip_address(raw_address) + except ValueError: + continue + if address in seen_addresses: + continue + addresses.append(address) + seen_addresses.add(address) + return addresses + + +def _validate_presidio_address_list(host, addresses, allowed_hosts): + normalized_host = _normalize_host_identifier(host) + if not addresses: + raise PresidioEndpointConfigurationError("Presidio analyzer endpoint host must resolve to an IP address.") + if normalized_host in allowed_hosts: + return + if any(not address.is_global for address in addresses): + raise 
PresidioEndpointConfigurationError( + "Private Presidio analyzer endpoint hosts must be listed in the private host allowlist." + ) + + +def _validate_resolved_presidio_addresses(host, port, allowed_hosts): + normalized_host = _normalize_host_identifier(host) + addresses = _resolve_presidio_host_addresses(normalized_host, port) + _validate_presidio_address_list(normalized_host, addresses, allowed_hosts) + + +def _set_socket_options(sock, socket_options): + for option in socket_options or []: + sock.setsockopt(*option) + + +def _create_presidio_safe_socket_connection(host, port, timeout, source_address, socket_options, allowed_hosts): + connect_host = str(host or "") + if connect_host.startswith("["): + connect_host = connect_host.strip("[]") + connect_host.encode("idna") + + address_info = socket.getaddrinfo( + connect_host, + port, + urllib3_util_connection.allowed_gai_family(), + socket.SOCK_STREAM, + ) + _validate_presidio_address_list(connect_host, _extract_presidio_addresses(address_info), allowed_hosts) + + last_error = None + for family, socktype, proto, _canonname, sockaddr in address_info: + sock = None + try: + sock = socket.socket(family, socktype, proto) + _set_socket_options(sock, socket_options) + if timeout is not _DEFAULT_TIMEOUT: + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect(sockaddr) + last_error = None + return sock + except OSError as exc: + last_error = exc + if sock is not None: + sock.close() + + if last_error is not None: + raise last_error + raise OSError("getaddrinfo returns an empty list") + + +class _PresidioSSRFConnectionMixin: + presidio_allowed_private_hosts = frozenset() + + def _new_conn(self): + try: + return _create_presidio_safe_socket_connection( + self._dns_host, + self.port, + self.timeout, + self.source_address, + self.socket_options, + self.presidio_allowed_private_hosts, + ) + except socket.gaierror as exc: + raise urllib3_connection.NameResolutionError(self.host, self, exc) from 
exc + except urllib3_connection.SocketTimeout as exc: + raise urllib3_connection.ConnectTimeoutError( + self, + f"Connection to {self.host} timed out. (connect timeout={self.timeout})", + ) from exc + except OSError as exc: + raise urllib3_connection.NewConnectionError( + self, + f"Failed to establish a new connection: {exc}", + ) from exc + + +def _build_presidio_pool_classes(allowed_hosts): + class PresidioSSRFHTTPConnection(_PresidioSSRFConnectionMixin, urllib3_connection.HTTPConnection): + presidio_allowed_private_hosts = allowed_hosts + + class PresidioSSRFHTTPSConnection(_PresidioSSRFConnectionMixin, urllib3_connection.HTTPSConnection): + presidio_allowed_private_hosts = allowed_hosts + + class PresidioSSRFHTTPConnectionPool(urllib3_connectionpool.HTTPConnectionPool): + ConnectionCls = PresidioSSRFHTTPConnection + + class PresidioSSRFHTTPSConnectionPool(urllib3_connectionpool.HTTPSConnectionPool): + ConnectionCls = PresidioSSRFHTTPSConnection + + return { + "http": PresidioSSRFHTTPConnectionPool, + "https": PresidioSSRFHTTPSConnectionPool, + } + + +class _PresidioSSRFHTTPAdapter(requests.adapters.HTTPAdapter): + def __init__(self, allowed_hosts, *args, **kwargs): + self._presidio_pool_classes = _build_presidio_pool_classes(frozenset(allowed_hosts)) + super().__init__(*args, **kwargs) + + def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs): + self.poolmanager = urllib3_poolmanager.PoolManager( + num_pools=connections, + maxsize=maxsize, + block=block, + **pool_kwargs, + ) + self.poolmanager.pool_classes_by_scheme = self._presidio_pool_classes + + +def _build_presidio_endpoint_session(allowed_private_hosts): + session = requests.Session() + session.trust_env = False + adapter = _PresidioSSRFHTTPAdapter(_get_allowed_private_hosts(allowed_private_hosts)) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + + +def _post_presidio_endpoint(endpoint_url, json, headers, timeout, allow_redirects, 
def validate_presidio_endpoint_url(endpoint_url, allowed_private_hosts=None):
    """Validate and normalize a Presidio Analyzer endpoint URL.

    Checks, in order: the URL is non-empty; it parses as an absolute
    ``http``/``https`` URL; it carries no userinfo, fragment, or
    credential-like query parameter; a private host is on the allowlist;
    plain ``http`` is used only for loopback hosts; and the host's resolved
    addresses pass ``_validate_resolved_presidio_addresses``.

    Args:
        endpoint_url: Configured endpoint URL; ``None`` counts as missing.
        allowed_private_hosts: Admin allowlist (string or collection) of
            private hosts that may be called.

    Returns:
        The stripped URL string.

    Raises:
        PresidioEndpointConfigurationError: for every rejected URL. This
            includes URLs the standard parser itself refuses (malformed IPv6
            literal, non-numeric or out-of-range port); those used to escape
            as a bare ``ValueError``.
    """
    normalized = str(endpoint_url or "").strip()
    if not normalized:
        raise PresidioEndpointConfigurationError("Presidio analyzer endpoint is required.")

    try:
        parsed = urlparse(normalized)
    except ValueError as exc:
        # urlparse rejects e.g. an unbalanced "[" in the netloc.
        raise PresidioEndpointConfigurationError(
            "Presidio analyzer endpoint must be an absolute HTTP(S) URL."
        ) from exc
    host = (parsed.hostname or "").lower()
    normalized_host = _normalize_host_identifier(host)
    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
        raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must be an absolute HTTP(S) URL.")
    if parsed.username or parsed.password or "@" in parsed.netloc:
        raise PresidioEndpointConfigurationError("Presidio analyzer endpoint URL must not include userinfo.")
    if parsed.fragment:
        raise PresidioEndpointConfigurationError("Presidio analyzer endpoint URL must not include a fragment.")
    for query_name, _ in parse_qsl(parsed.query, keep_blank_values=True):
        if _is_credential_like_query_name(query_name):
            raise PresidioEndpointConfigurationError(
                "Presidio analyzer endpoint URL must not include credential-like query parameters."
            )

    host_is_private = _is_private_presidio_host(host)
    allowed_hosts = _get_allowed_private_hosts(allowed_private_hosts)
    if host_is_private and normalized_host not in allowed_hosts:
        raise PresidioEndpointConfigurationError(
            "Private Presidio analyzer endpoint hosts must be listed in the private host allowlist."
        )
    if parsed.scheme == "http" and not _is_loopback_presidio_host(host):
        raise PresidioEndpointConfigurationError("Presidio analyzer endpoint must use HTTPS unless it is localhost.")

    # ``.port`` is parsed lazily and raises ValueError for a non-numeric or
    # out-of-range port; read it at the same point as before so the order of
    # the earlier checks is unchanged.
    try:
        explicit_port = parsed.port
    except ValueError as exc:
        raise PresidioEndpointConfigurationError(
            "Presidio analyzer endpoint URL port is not valid."
        ) from exc
    _validate_resolved_presidio_addresses(
        host,
        explicit_port or (443 if parsed.scheme == "https" else 80),
        allowed_hosts,
    )

    return normalized
+ ) + return {} + return {header_name: secret_value} + + +def _normalize_result_item(item): + if not isinstance(item, dict): + return None + if "entity_type" not in item or item.get("start") is None or item.get("end") is None: + return None + try: + return { + "entity_type": str(item.get("entity_type") or ""), + "start": int(item.get("start")), + "end": int(item.get("end")), + "score": float(item.get("score", 0.0)), + } + except (TypeError, ValueError): + return None + + +def analyze_with_presidio_endpoint(text, settings): + """Call a configured Presidio Analyzer endpoint and return recognizer results.""" + settings = settings or {} + endpoint_url = validate_presidio_endpoint_url( + settings.get("dlp_presidio_analyzer_endpoint"), + settings.get("dlp_presidio_allowed_private_hosts"), + ) + endpoint_host = urlparse(endpoint_url).hostname or "" + require_auth_secret = not _is_loopback_presidio_host(endpoint_host) + timeout_seconds = max( + 1, + min(30, _safe_int(settings.get("dlp_presidio_timeout_seconds"), DEFAULT_PRESIDIO_TIMEOUT_SECONDS)), + ) + score_threshold = max( + 0.0, + min(1.0, _safe_float(settings.get("dlp_presidio_score_threshold"), DEFAULT_PRESIDIO_SCORE_THRESHOLD)), + ) + language = str(settings.get("dlp_presidio_language") or DEFAULT_PRESIDIO_LANGUAGE).strip() or DEFAULT_PRESIDIO_LANGUAGE + payload = { + "text": str(text or ""), + "language": language, + "entities": _get_entities(settings), + "score_threshold": score_threshold, + } + headers = { + "Content-Type": "application/json", + **_get_auth_headers(settings, require_secret=require_auth_secret), + } + + request_error_type = None + try: + response = _post_presidio_endpoint( + endpoint_url, + json=payload, + headers=headers, + timeout=timeout_seconds, + allow_redirects=False, + allowed_private_hosts=settings.get("dlp_presidio_allowed_private_hosts"), + ) + status_code = getattr(response, "status_code", None) + if isinstance(status_code, int) and 300 <= status_code < 400: + request_error_type = 
"RedirectResponse" + body = None + else: + response.raise_for_status() + body = response.json() + except Exception as exc: + request_error_type = type(exc).__name__ + + if request_error_type: + raise PresidioEndpointRequestError(f"Presidio analyzer request failed: {request_error_type}") from None + + if not isinstance(body, list): + raise PresidioEndpointRequestError("Presidio analyzer response must be a list.") + + results = [] + for item in body: + normalized = _normalize_result_item(item) + if normalized: + results.append(normalized) + return results diff --git a/application/single_app/functions_dlp_rules.py b/application/single_app/functions_dlp_rules.py new file mode 100644 index 00000000..3b4b0426 --- /dev/null +++ b/application/single_app/functions_dlp_rules.py @@ -0,0 +1,303 @@ +# functions_dlp_rules.py + +import copy +import hashlib +from collections import OrderedDict + +import regex + + +CONFIDENCE_ORDER = {"low": 1, "medium": 2, "high": 3} +ALLOWED_FLAGS = {"IGNORECASE": regex.IGNORECASE, "MULTILINE": regex.MULTILINE} +ALLOWED_VALIDATORS = {"none", "luhn"} +ALLOWED_SURFACES = {"web_search", "upload"} +MAX_RULES = 50 +MAX_PATTERN_LENGTH = 512 +MAX_REPLACEMENT_LENGTH = 80 +MAX_KEYWORDS = 25 +MAX_KEYWORD_LENGTH = 80 +MAX_WINDOW_CHARS = 256 +REGEX_TIMEOUT_SECONDS = 0.05 + + +DEFAULT_DLP_REGEX_RULES = [ + { + "id": "us_ssn", + "label": "U.S. Social Security Number", + "entity_type": "US_SSN", + "enabled": True, + "pattern": r"(? 
def validate_dlp_regex_rules(rules):
    """Validate and normalize configured DLP regex rules.

    Args:
        rules: ``None`` to request the built-in defaults, or a list of rule
            dicts as stored in settings.

    Returns:
        A ``(normalized_rules, errors)`` tuple. Rules that fail validation are
        skipped and described in ``errors``; the rules that pass are still
        returned in normalized form. A non-list value or a list longer than
        ``MAX_RULES`` yields no rules and a single error.
    """
    normalized_rules = []
    errors = []

    if rules is None:
        return get_default_dlp_regex_rules(), []
    if not isinstance(rules, list):
        return [], ["dlp_regex_rules must be a list."]
    if len(rules) > MAX_RULES:
        return [], [f"dlp_regex_rules cannot contain more than {MAX_RULES} rules."]

    seen_ids = set()
    for index, rule in enumerate(rules):
        if not isinstance(rule, dict):
            errors.append(f"Rule {index + 1} must be an object.")
            continue

        rule_id = _safe_rule_id(rule.get("id"), index)
        if rule_id in seen_ids:
            errors.append(f"Rule {rule_id} has a duplicate id.")
            continue
        seen_ids.add(rule_id)

        pattern = _as_string(rule.get("pattern"))
        if not pattern:
            errors.append(f"Rule {rule_id} requires a regex pattern.")
            continue
        if len(pattern) > MAX_PATTERN_LENGTH:
            errors.append(f"Rule {rule_id} pattern exceeds {MAX_PATTERN_LENGTH} characters.")
            continue

        flags = _normalize_flags(rule.get("flags", []))
        try:
            regex.compile(pattern, _compile_flags(flags))
        except Exception as exc:
            # Only the exception type is reported so pattern text is not echoed.
            errors.append(f"Rule {rule_id} regex does not compile: {type(exc).__name__}.")
            continue

        # An explicit null or scalar "surfaces" value is reported as a
        # validation error below instead of raising TypeError mid-validation.
        raw_surfaces = rule.get("surfaces", ["web_search", "upload"])
        if not isinstance(raw_surfaces, (list, tuple, set, frozenset)):
            raw_surfaces = []
        surfaces = [
            _as_string(surface).lower()
            for surface in raw_surfaces
            if _as_string(surface).lower() in ALLOWED_SURFACES
        ]
        if not surfaces:
            errors.append(f"Rule {rule_id} must target web_search, upload, or both.")
            continue

        validator = _as_string(rule.get("validator", "none")).lower()
        if validator not in ALLOWED_VALIDATORS:
            errors.append(f"Rule {rule_id} uses unsupported validator {validator}.")
            continue

        confidence = rule.get("confidence", {})
        if not isinstance(confidence, dict):
            confidence = {}

        # Non-collection keyword values are ignored: iterating None raises,
        # and iterating a string would register every character as its own
        # keyword and make the confidence boost match almost any text.
        raw_keywords = confidence.get("keywords", [])
        if not isinstance(raw_keywords, (list, tuple, set, frozenset)):
            raw_keywords = []
        keywords = []
        for keyword in raw_keywords:
            keyword_text = _as_string(keyword).lower()
            if keyword_text and len(keyword_text) <= MAX_KEYWORD_LENGTH and keyword_text not in keywords:
                keywords.append(keyword_text)
            if len(keywords) >= MAX_KEYWORDS:
                break

        try:
            window_chars = int(confidence.get("window_chars", 48))
        except (TypeError, ValueError, OverflowError):
            # OverflowError: an infinite float cannot be converted to int.
            window_chars = 48
        window_chars = max(0, min(window_chars, MAX_WINDOW_CHARS))

        entity_type = _as_string(rule.get("entity_type"), rule_id.upper()).upper()
        replacement = _as_string(rule.get("replacement"), f"[REDACTED_{entity_type}]")
        if len(replacement) > MAX_REPLACEMENT_LENGTH:
            replacement = replacement[:MAX_REPLACEMENT_LENGTH]

        normalized_rules.append(
            {
                "id": rule_id,
                "label": _as_string(rule.get("label"), entity_type),
                "entity_type": entity_type,
                "enabled": bool(rule.get("enabled", True)),
                "pattern": pattern,
                "replacement": replacement,
                "surfaces": surfaces,
                "flags": flags,
                "validator": validator,
                "confidence": {
                    "regex_only": _normalize_confidence(confidence.get("regex_only"), "medium"),
                    "with_keywords": _normalize_confidence(confidence.get("with_keywords"), "high"),
                    "keywords": keywords,
                    "window_chars": window_chars,
                    "minimum": _normalize_confidence(confidence.get("minimum"), "medium"),
                },
            }
        )

    return normalized_rules, errors
def scan_text_with_dlp_regex_rules(text, rules, surface):
    """Scan text with the given DLP regex rules and build a redacted copy.

    Args:
        text: Text to scan; ``None`` is treated as an empty string.
        rules: Normalized rule dicts. Disabled rules and rules that do not
            list ``surface`` are skipped.
        surface: Surface name the scan runs for (matched against each rule's
            ``surfaces`` list).

    Returns:
        A ``(redacted_text, counts, matches, metadata)`` tuple where ``counts``
        maps entity type to accepted match count, ``matches`` lists one
        summary dict per entity type, and ``metadata`` holds ``rule_count``
        and a short ``match_hash`` over the matched entity type names.

    Raises:
        RuntimeError: When a rule's regex search exceeds
            ``REGEX_TIMEOUT_SECONDS``.
    """
    source_text = str(text or "")
    redactions = []
    counts = OrderedDict()
    confidence_by_entity = {}

    for rule in rules:
        if not rule.get("enabled", True):
            continue
        if surface not in rule.get("surfaces", []):
            continue

        compiled = regex.compile(rule["pattern"], _compile_flags(rule.get("flags", [])))
        try:
            rule_matches = list(compiled.finditer(source_text, timeout=REGEX_TIMEOUT_SECONDS))
        except TimeoutError as exc:
            raise RuntimeError(f"DLP regex rule timed out: {rule['id']}") from exc

        for match in rule_matches:
            value = match.group(0)
            if not _validator_allows(value, rule.get("validator", "none")):
                continue

            confidence = _confidence_for_match(source_text, match.start(), match.end(), rule["confidence"])
            if not _confidence_allows(confidence, rule["confidence"].get("minimum", "medium")):
                continue

            entity_type = rule["entity_type"]
            counts[entity_type] = counts.get(entity_type, 0) + 1
            confidence_by_entity[entity_type] = _merge_confidence(
                confidence_by_entity.get(entity_type, "low"),
                confidence,
            )
            redactions.append((match.start(), match.end(), rule["replacement"]))

    redactions.sort(key=lambda item: item[0])
    redacted_parts = []
    cursor = 0
    for start, end, replacement in redactions:
        if start < cursor:
            # This match overlaps a span that is already redacted. Keep the
            # earlier replacement but extend the covered span to the later
            # end, so the tail of the overlapping match is not copied into
            # the output in clear text.
            cursor = max(cursor, end)
            continue
        redacted_parts.append(source_text[cursor:start])
        redacted_parts.append(replacement)
        cursor = end
    redacted_parts.append(source_text[cursor:])

    matches = [
        {"entity_type": entity_type, "count": count, "confidence": confidence_by_entity.get(entity_type, "medium")}
        for entity_type, count in counts.items()
    ]
    metadata = {
        "rule_count": len(rules),
        # Hash covers entity type names only, never the matched values.
        "match_hash": hashlib.sha256("|".join(counts.keys()).encode("utf-8")).hexdigest()[:16] if counts else "",
    }
    return "".join(redacted_parts), dict(counts), matches, metadata
_resolve_metadata_extraction_client(settings): DI_MARKDOWN_TABLE_ROW_PATTERN = re.compile(r'(?m)^\s*\|.+\|\s*$') +def _sanitize_video_indexer_log_value(value): + text = str(value) + text = re.sub( + r'([?&]accessToken=)[^&\s\'"<>]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + return re.sub( + r'([\'"]?accessToken[\'"]?\s*[:=]\s*[\'"]?)[^,\'"\s}&]+', + r'\1[REDACTED]', + text, + flags=re.IGNORECASE, + ) + + +def _get_upload_workspace_scope(group_id=None, public_workspace_id=None): + if public_workspace_id is not None: + return "public" + if group_id is not None: + return "group" + return "personal" + + +def _build_upload_dlp_context(document_id, page_number=None, group_id=None, public_workspace_id=None, text=None): + workspace_scope = _get_upload_workspace_scope(group_id=group_id, public_workspace_id=public_workspace_id) + context = { + "document_id": document_id, + "workspace_scope": workspace_scope, + } + if page_number is not None: + context["page_number"] = page_number + if text is not None: + context["text_length"] = len(text) + return context + + +def _should_disable_enhanced_citations_for_upload_dlp(settings): + if not settings.get("enable_dlp_control_plane", False): + return False + if not settings.get("enable_upload_dlp", False): + return False + if settings.get("dlp_fail_closed_on_scanner_error", True): + return True + if settings.get("upload_dlp_fail_upload_on_match", False): + return True + return str(settings.get("upload_dlp_mode", "monitor") or "monitor").lower() in {"redact", "block"} + + +UPLOAD_DLP_METADATA_FIELDS = ("title", "authors", "organization", "keywords", "abstract") +UPLOAD_DLP_STATUS_RANK = { + "accepted": 0, + "accepted_with_dlp_monitoring": 1, + "accepted_with_redactions": 2, + "scanner_failed": 3, + "blocked": 4, +} + + +def _metadata_value_to_text(value): + if value is None: + return "" + if isinstance(value, list): + return "\n".join("" if item is None else str(item) for item in value) + return str(value) + + +def 
def _merge_upload_dlp_document_summary(existing=None, incoming=None):
    """Combine a stored upload DLP summary with a newly produced one.

    Either argument may be a summary dict or a dict wrapping one under
    ``dlp_metadata``. The merged status is whichever of the two ranks higher
    in UPLOAD_DLP_STATUS_RANK (the stored status wins ties), replacement
    totals and per-entity counts are summed, and descriptive fields prefer
    the incoming value over the stored one.
    """
    def _unwrap(summary):
        summary = summary or {}
        if summary.get("dlp_metadata"):
            return summary.get("dlp_metadata") or {}
        return summary

    def _status_of(summary):
        return str(summary.get("status") or summary.get("dlp_status") or "accepted")

    latest = _unwrap(incoming)
    prior = _unwrap(existing)

    prior_status = _status_of(prior)
    latest_status = _status_of(latest)
    latest_outranks = (
        UPLOAD_DLP_STATUS_RANK.get(latest_status, 0) > UPLOAD_DLP_STATUS_RANK.get(prior_status, 0)
    )

    merged_counts = {}
    replacement_total = int(prior.get("total_replacements") or 0) + int(latest.get("total_replacements") or 0)
    merged = {
        "status": latest_status if latest_outranks else prior_status,
        "entity_counts": merged_counts,
        "total_replacements": replacement_total,
        "scanner_status": latest.get("scanner_status") or prior.get("scanner_status") or "ok",
    }

    for counts in (prior.get("entity_counts") or {}, latest.get("entity_counts") or {}):
        for entity_type, count in counts.items():
            label = str(entity_type)
            merged_counts[label] = merged_counts.get(label, 0) + int(count or 0)

    carried_keys = (
        "dlp_surface",
        "dlp_action",
        "dlp_engine",
        "dlp_mode",
        "scanner_status",
        "workspace_scope",
        "document_id",
    )
    for key in carried_keys:
        candidate = latest.get(key)
        if candidate is None:
            candidate = prior.get(key)
        if candidate is not None:
            merged[key] = candidate
    return merged
def _sanitize_upload_metadata_for_dlp(metadata, user_id, document_id, group_id=None, public_workspace_id=None):
    """Run upload DLP over each populated metadata field.

    Every field named in UPLOAD_DLP_METADATA_FIELDS that is present and
    non-blank is evaluated through ``_evaluate_upload_dlp_text`` (tagged with
    a ``metadata:<field>`` page number) and replaced by its sanitized value.

    Returns:
        A ``(sanitized_metadata, aggregate_summary)`` tuple; the input mapping
        itself is not modified.
    """
    cleaned = dict(metadata or {})
    summary = {
        "status": "accepted",
        "entity_counts": {},
        "total_replacements": 0,
        "scanner_status": "ok",
    }
    scope = {"group_id": group_id, "public_workspace_id": public_workspace_id}

    present_fields = [name for name in UPLOAD_DLP_METADATA_FIELDS if name in cleaned]
    for field_name in present_fields:
        raw_value = cleaned.get(field_name)
        field_text = _metadata_value_to_text(raw_value)
        if field_text.strip():
            outcome = _evaluate_upload_dlp_text(
                field_text,
                user_id=user_id,
                document_id=document_id,
                page_number=f"metadata:{field_name}",
                **scope,
            )
            sanitized_text = outcome.get("sanitized_text", field_text)
            cleaned[field_name] = _metadata_text_to_value(raw_value, sanitized_text)

            field_summary = dict(outcome.get("dlp_metadata") or {})
            field_summary["status"] = outcome.get("status", field_summary.get("status", "accepted"))
            summary = _merge_upload_dlp_document_summary(summary, field_summary)

    return cleaned, summary
def _record_upload_dlp_result(result, user_id, document_id, group_id=None, public_workspace_id=None, page_number=None):
    """Log an upload DLP result and fold it into the document's stored DLP metadata.

    Writes a summary line to the file processing log, emits telemetry when
    ``should_emit_dlp_telemetry`` allows it, merges the result with the
    document's current DLP metadata, and updates the document. A failed
    document update is logged as a warning and not re-raised.
    """
    settings = get_settings()
    scope = {"group_id": group_id, "public_workspace_id": public_workspace_id}
    context = _build_upload_dlp_context(
        document_id=document_id,
        page_number=page_number,
        text=result.get("sanitized_text", ""),
        **scope,
    )
    safe_summary = build_upload_dlp_file_log_summary(result, context=context)

    # The processing log is keyed by the owning workspace: public workspace
    # first, then group, then the individual user.
    if public_workspace_id is not None:
        log_owner = public_workspace_id
    elif group_id is not None:
        log_owner = group_id
    else:
        log_owner = user_id
    add_file_task_to_file_processing_log(
        document_id=document_id,
        user_id=log_owner,
        content=f"Upload DLP summary: {safe_summary}"
    )

    if should_emit_dlp_telemetry(result, settings):
        telemetry = build_dlp_telemetry_properties(result, surface="upload", context=context)
        log_event("[DLP] Upload decision", extra=telemetry)

    new_summary = dict(result.get("dlp_metadata") or {})
    new_summary["status"] = result.get("status", new_summary.get("status", "accepted"))
    stored_summary = _get_current_document_dlp_metadata(
        document_id=document_id,
        user_id=user_id,
        **scope,
    )
    combined = _merge_upload_dlp_document_summary(stored_summary, new_summary)

    # Only pass the scope identifiers that were actually supplied.
    supplied_scope = {name: value for name, value in scope.items() if value is not None}
    try:
        update_document(
            document_id=document_id,
            user_id=user_id,
            dlp_status=combined.get("status"),
            dlp_metadata=combined,
            **supplied_scope,
        )
    except Exception as exc:
        log_event(
            f"[DLP] Failed to update upload DLP document metadata for document {document_id}: {exc}",
            level=logging.WARNING,
        )
) return _normalize_document_enhanced_citations(document_items[0]) if document_items else None @@ -1231,10 +1507,30 @@ def save_video_chunk( debug_print(f"[VIDEO CHUNK] Converted start_time {start_time} to {seconds} seconds") - # 1) generate embedding on the transcript text + transcript_dlp_result = _evaluate_upload_dlp_text( + page_text_content, + user_id=user_id, + document_id=document_id, + page_number=seconds, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_transcript_text = transcript_dlp_result.get("sanitized_text", page_text_content) + ocr_dlp_result = _evaluate_upload_dlp_text( + ocr_chunk_text, + user_id=user_id, + document_id=document_id, + page_number=f"{seconds}:ocr", + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_ocr_text = ocr_dlp_result.get("sanitized_text", ocr_chunk_text) + dlp_metadata = transcript_dlp_result.get("dlp_metadata") + + # 1) generate embedding on the sanitized transcript text try: debug_print(f"[VIDEO CHUNK] Generating embedding for transcript text") - result = generate_embedding(page_text_content) + result = generate_embedding(sanitized_transcript_text) # Handle both tuple (new) and single value (backward compatibility) if isinstance(result, tuple): @@ -1279,15 +1575,16 @@ def save_video_chunk( chunk = { "id": chunk_id, "document_id": document_id, - "chunk_text": page_text_content, - "video_ocr_chunk_text": ocr_chunk_text, + "chunk_text": sanitized_transcript_text, + "video_ocr_chunk_text": sanitized_ocr_text, "embedding": embedding, "file_name": file_name, "start_time": start_time, "chunk_sequence": seconds, "upload_date": current_time, "version": version, - "document_tags": meta.get('tags', []) if meta else [] + "document_tags": meta.get('tags', []) if meta else [], + "dlp_metadata": dlp_metadata } if is_public_workspace: @@ -1325,6 +1622,8 @@ def save_video_chunk( print(f"[VideoChunk] UPLOAD ERROR for {chunk_id}: {e}", flush=True) except Exception as e: + if str(e) 
== "Upload content blocked by DLP policy.": + raise debug_print(f"[VIDEO CHUNK] Unexpected error processing chunk: {str(e)}") print(f"[VideoChunk] UNEXPECTED ERROR for {document_id}@{start_time}: {e}", flush=True) @@ -1336,7 +1635,8 @@ def process_video_document( update_callback, group_id, public_workspace_id=None, - auto_extract_metadata=True + auto_extract_metadata=True, + enable_enhanced_citations=False ): """ Processes a video by dividing transcript into 30-second chunks, @@ -1367,7 +1667,7 @@ def to_seconds(ts: str) -> float: debug_print("[VIDEO INDEXER] Video file support is enabled, proceeding with indexing") - if settings.get("enable_enhanced_citations", False): + if enable_enhanced_citations: debug_print("[VIDEO INDEXER] Enhanced citations enabled, uploading to blob storage") update_callback(status="Uploading video for enhanced citations...") try: @@ -1422,9 +1722,9 @@ def to_seconds(ts: str) -> float: token = get_video_indexer_account_token(settings) debug_print(f"[VIDEO INDEXER] Authentication successful, token length: {len(token) if token else 0}") except Exception as e: - debug_print(f"[VIDEO INDEXER] Authentication failed: {str(e)}") - print(f"[VIDEO] AUTH ERROR: {e}", flush=True) - update_callback(status=f"VIDEO: auth failed → {e}") + debug_print(f"[VIDEO INDEXER] Authentication failed: {_sanitize_video_indexer_log_value(e)}") + log_event("[VIDEO] AUTH ERROR", level=logging.ERROR) + update_callback(status="VIDEO: auth failed") return 0 # 2) Upload video to Indexer @@ -1443,8 +1743,8 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Using managed identity access token authentication") debug_print(f"[VIDEO INDEXER] Upload URL: {url}") - debug_print(f"[VIDEO INDEXER] Upload params: {params}") - debug_print(f"[VIDEO INDEXER] Starting file upload for: {original_filename}") + debug_print(f"[VIDEO INDEXER] Upload params keys: {list(params.keys())}, accessToken_present={bool(token)}, name_length={len(original_filename or '')}") + 
debug_print(f"[VIDEO INDEXER] Starting file upload for name_length={len(original_filename or '')}") with open(temp_file_path, "rb") as f: resp = requests.post(url, params=params, headers=headers, files={"file": f}) @@ -1452,7 +1752,7 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Upload response status: {resp.status_code}") if resp.status_code != 200: - debug_print(f"[VIDEO INDEXER] Upload response text: {resp.text}") + debug_print(f"[VIDEO INDEXER] Upload response text: {_sanitize_video_indexer_log_value(resp.text)}") resp.raise_for_status() response_data = resp.json() @@ -1460,7 +1760,7 @@ def to_seconds(ts: str) -> float: vid = response_data.get("id") if not vid: - debug_print(f"[VIDEO INDEXER] ERROR: No video ID in response: {response_data}") + debug_print(f"[VIDEO INDEXER] ERROR: No video ID in response; response keys: {list(response_data.keys())}") raise ValueError("no video ID returned") debug_print(f"[VIDEO INDEXER] Upload successful, video ID: {vid}") @@ -1483,17 +1783,17 @@ def to_seconds(ts: str) -> float: print(f"[VIDEO] Failed to update document metadata with video_indexer_id: {e}", flush=True) except requests.exceptions.RequestException as e: - debug_print(f"[VIDEO INDEXER] Upload request failed: {str(e)}") + debug_print(f"[VIDEO INDEXER] Upload request failed: {_sanitize_video_indexer_log_value(e)}") if hasattr(e, 'response') and e.response is not None: debug_print(f"[VIDEO INDEXER] Upload error response status: {e.response.status_code}") - debug_print(f"[VIDEO INDEXER] Upload error response text: {e.response.text}") + debug_print(f"[VIDEO INDEXER] Upload error response text: {_sanitize_video_indexer_log_value(e.response.text)}") print(f"[VIDEO] UPLOAD ERROR: {e}", flush=True) - update_callback(status=f"VIDEO: upload failed → {e}") + update_callback(status="VIDEO: upload failed") return 0 except Exception as e: - debug_print(f"[VIDEO INDEXER] Upload unexpected error: {str(e)}") + debug_print(f"[VIDEO INDEXER] Upload unexpected 
error: {_sanitize_video_indexer_log_value(e)}") print(f"[VIDEO] UPLOAD ERROR: {e}", flush=True) - update_callback(status=f"VIDEO: upload failed → {e}") + update_callback(status="VIDEO: upload failed") return 0 # 3) Poll until ready @@ -1506,8 +1806,8 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Using managed identity access token for polling") debug_print(f"[VIDEO INDEXER] Requesting full insights (no filtering)") - debug_print(f"[VIDEO INDEXER] Index polling URL: {index_url}") - debug_print(f"[VIDEO INDEXER] Starting processing polling for video ID: {vid}") + debug_print(f"[VIDEO INDEXER] Index polling request prepared, video_id_length={len(str(vid or ''))}") + debug_print(f"[VIDEO INDEXER] Starting processing polling for video ID length: {len(str(vid or ''))}") poll_count = 0 max_polls = 180 # 90 minutes maximum (30 second intervals) @@ -1539,10 +1839,10 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Poll response keys: {list(data.keys())}") except requests.exceptions.RequestException as e: - debug_print(f"[VIDEO INDEXER] Poll request failed: {str(e)}") + debug_print(f"[VIDEO INDEXER] Poll request failed: {_sanitize_video_indexer_log_value(e)}") if hasattr(e, 'response') and e.response is not None: debug_print(f"[VIDEO INDEXER] Poll error response status: {e.response.status_code}") - debug_print(f"[VIDEO INDEXER] Poll error response text: {e.response.text}") + debug_print(f"[VIDEO INDEXER] Poll error response text: {_sanitize_video_indexer_log_value(e.response.text)}") if poll_count >= max_polls: update_callback(status="VIDEO: polling timeout") return 0 @@ -1593,91 +1893,20 @@ def to_seconds(ts: str) -> float: video_duration_seconds = to_seconds(video_duration) if video_duration else 0 debug_print(f"[VIDEO INDEXER] Video duration: {video_duration} ({video_duration_seconds} seconds)") - # Log raw insights JSON for complete visibility (debug only) - import json - print(f"\n[VIDEO] ===== RAW INSIGHTS JSON =====", flush=True) 
- try: - insights_json = json.dumps(insights, indent=2, ensure_ascii=False) - # Truncate if too long (show first 10000 chars) - if len(insights_json) > 10000: - print(f"{insights_json[:10000]}\n... (truncated, total length: {len(insights_json)} chars)", flush=True) - else: - print(insights_json, flush=True) - except Exception as e: - print(f"[VIDEO] Could not serialize insights to JSON: {e}", flush=True) - print(f"[VIDEO] ===== END RAW INSIGHTS =====\n", flush=True) - debug_print(f"[VIDEO INDEXER] Insights keys available: {list(insights.keys())}") - print(f"[VIDEO] Available insight types: {', '.join(list(insights.keys())[:15])}...", flush=True) - - # Debug: Show sample structures for all insight types - print(f"\n[VIDEO] ===== SAMPLE DATA STRUCTURES =====", flush=True) - - transcript_data = insights.get("transcript", []) - if transcript_data: - print(f"[VIDEO] TRANSCRIPT sample: {transcript_data[0]}", flush=True) - - ocr_data = insights.get("ocr", []) - if ocr_data: - print(f"[VIDEO] OCR sample: {ocr_data[0]}", flush=True) - - keywords_data_debug = insights.get("keywords", []) - if keywords_data_debug: - print(f"[VIDEO] KEYWORDS sample: {keywords_data_debug[0]}", flush=True) - - labels_data_debug = insights.get("labels", []) - if labels_data_debug: - debug_print(f"[VIDEO INDEXER] LABELS sample: {labels_data_debug[0]}") - - topics_data_debug = insights.get("topics", []) - if topics_data_debug: - debug_print(f"[VIDEO INDEXER] TOPICS sample: {topics_data_debug[0]}") - - audio_effects_data_debug = insights.get("audioEffects", []) - if audio_effects_data_debug: - debug_print(f"[VIDEO INDEXER] AUDIO_EFFECTS sample: {audio_effects_data_debug[0]}") - - emotions_data_debug = insights.get("emotions", []) - if emotions_data_debug: - debug_print(f"[VIDEO INDEXER] EMOTIONS sample: {emotions_data_debug[0]}") - - sentiments_data_debug = insights.get("sentiments", []) - if sentiments_data_debug: - debug_print(f"[VIDEO INDEXER] SENTIMENTS sample: {sentiments_data_debug[0]}") - - 
scenes_data_debug = insights.get("scenes", []) - if scenes_data_debug: - debug_print(f"[VIDEO INDEXER] SCENES sample: {scenes_data_debug[0]}") - - shots_data_debug = insights.get("shots", []) - if shots_data_debug: - debug_print(f"[VIDEO INDEXER] SHOTS sample: {shots_data_debug[0]}") - - faces_data_debug = insights.get("faces", []) - if faces_data_debug: - debug_print(f"[VIDEO INDEXER] FACES sample: {faces_data_debug[0]}") - - namedLocations_data_debug = insights.get("namedLocations", []) - if namedLocations_data_debug: - debug_print(f"[VIDEO INDEXER] NAMED_LOCATIONS sample: {namedLocations_data_debug[0]}") - - # Check for other potential label sources - brands_data_debug = insights.get("brands", []) - if brands_data_debug: - debug_print(f"[VIDEO INDEXER] BRANDS sample: {brands_data_debug[0]}") - - visualContentModeration_debug = insights.get("visualContentModeration", []) - if visualContentModeration_debug: - debug_print(f"[VIDEO INDEXER] VISUAL_MODERATION sample: {visualContentModeration_debug[0]}") + log_event( + f"[VIDEO] Available insight types: {', '.join(list(insights.keys())[:15])}...", + level=logging.INFO, + ) - # Show total counts for all available insights - print(f"[VIDEO] COUNTS:", flush=True) + insight_counts = {} for key in insights.keys(): value = insights.get(key, []) if isinstance(value, list): - print(f" {key}: {len(value)} items", flush=True) + insight_counts[key] = len(value) - print(f"[VIDEO] ===== END SAMPLE DATA =====\n", flush=True) + log_event("[VIDEO] Insight counts", extra={"insight_counts": insight_counts}, level=logging.INFO) + log_event("[VIDEO] Insight count logging complete", level=logging.INFO) transcript = insights.get("transcript", []) ocr_blocks = insights.get("ocr", []) @@ -1783,7 +2012,7 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Context built - Speech: {len(speech_context)}, OCR: {len(ocr_context)}, Keywords: {len(keywords_context)}, Labels: {len(labels_context)}, People: {len(named_people_context)}, 
Locations: {len(named_locations_context)}, Objects: {len(detected_objects_context)}") if len(speech_context) > 0: - debug_print(f"[VIDEO INDEXER] First speech item: {speech_context[0]}") + debug_print("[VIDEO INDEXER] First speech item timing metadata available") # Sort all contexts by timestamp speech_context.sort(key=lambda x: to_seconds(x["start"])) @@ -2067,7 +2296,7 @@ def to_seconds(ts: str) -> float: insight_parts.append(f"Objects: {', '.join(chunk_objects)}") chunk_text = ". ".join(insight_parts) if insight_parts else "[No content detected]" - debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} has no speech, using insights as text: {chunk_text[:100]}...") + debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} has no speech, using insight summary length: {len(chunk_text)}") debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} at timestamp {start_ts}") debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} text length: {len(chunk_text)}, OCR text length: {len(ocr_text)}") @@ -2096,6 +2325,8 @@ def to_seconds(ts: str) -> float: debug_print(f"[VIDEO INDEXER] Chunk {chunk_num + 1} saved successfully") total += 1 except Exception as e: + if str(e) == "Upload content blocked by DLP policy.": + raise debug_print(f"[VIDEO INDEXER] Failed to save chunk {chunk_num + 1}: {str(e)}") debug_print(f"[VIDEO INDEXER] Chunk save traceback: {traceback.format_exc()}") @@ -2512,6 +2743,13 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, if not metadata: raise ValueError(f"No metadata found for document {document_id} (group: {is_group})") + metadata, _metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + metadata, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) version = metadata.get("version") if metadata.get("version") else 1 if version is None: raise ValueError(f"Metadata for document {document_id} missing 'version' field") @@ -2520,15 +2758,6 @@ def 
save_chunks(page_text_content, page_number, file_name, user_id, document_id, print(f"Error updating document status or retrieving metadata for document {document_id}: {repr(e)}\nTraceback:\n{traceback.format_exc()}") raise - # Generate embedding - try: - #status = f"Generating embedding for page {page_number}" - #update_document(document_id=document_id, user_id=user_id, status=status) - embedding, token_usage = generate_embedding(page_text_content) - except Exception as e: - print(f"Error generating embedding for page {page_number} of document {document_id}: {e}") - raise - # Build chunk document try: chunk_id = f"{document_id}_{page_number}" @@ -2572,12 +2801,35 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, else: debug_print(f"[SAVE_CHUNKS] No vision analysis found for document {document_id}") + upload_dlp_result = _evaluate_upload_dlp_text( + enhanced_chunk_text, + user_id=user_id, + document_id=document_id, + page_number=page_number, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_chunk_text = upload_dlp_result.get("sanitized_text", enhanced_chunk_text) + dlp_metadata = upload_dlp_result.get("dlp_metadata") + + # Generate embedding after upload DLP so embeddings never receive blocked or unredacted enforced content. 
+ try: + #status = f"Generating embedding for page {page_number}" + #update_document(document_id=document_id, user_id=user_id, status=status) + embedding, token_usage = generate_embedding(sanitized_chunk_text) + except Exception as e: + log_event( + f"Error generating embedding for page {page_number} of document {document_id}: {e}", + level=logging.ERROR, + ) + raise + if is_public_workspace: chunk_document = { "id": chunk_id, "document_id": document_id, "chunk_id": str(page_number), - "chunk_text": enhanced_chunk_text, + "chunk_text": sanitized_chunk_text, "embedding": embedding, "file_name": file_name, "chunk_keywords": chunk_keywords, @@ -2590,7 +2842,8 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "chunk_sequence": page_number, # or you can keep an incremental idx "upload_date": current_time, "version": version, - "public_workspace_id": public_workspace_id + "public_workspace_id": public_workspace_id, + "dlp_metadata": dlp_metadata } elif is_group: # Get shared_group_ids from document metadata for group documents @@ -2599,7 +2852,7 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "id": chunk_id, "document_id": document_id, "chunk_id": str(page_number), - "chunk_text": enhanced_chunk_text, + "chunk_text": sanitized_chunk_text, "embedding": embedding, "file_name": file_name, "chunk_keywords": chunk_keywords, @@ -2613,7 +2866,8 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "upload_date": current_time, "version": version, "group_id": group_id, - "shared_group_ids": shared_group_ids + "shared_group_ids": shared_group_ids, + "dlp_metadata": dlp_metadata } else: # Get shared_user_ids from document metadata for personal documents @@ -2623,7 +2877,7 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "id": chunk_id, "document_id": document_id, "chunk_id": str(page_number), - "chunk_text": enhanced_chunk_text, + "chunk_text": 
sanitized_chunk_text, "embedding": embedding, "file_name": file_name, "chunk_keywords": chunk_keywords, @@ -2637,7 +2891,8 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "upload_date": current_time, "version": version, "user_id": user_id, - "shared_user_ids": shared_user_ids + "shared_user_ids": shared_user_ids, + "dlp_metadata": dlp_metadata } except Exception as e: print(f"Error creating chunk document for page {page_number} of document {document_id}: {e}") @@ -2708,19 +2963,21 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w if not metadata: raise ValueError(f"No metadata found for document {document_id}") + metadata, _metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + metadata, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) version = metadata.get("version") if metadata.get("version") else 1 + author = ensure_list(metadata.get('authors')) if metadata else [] + title = metadata.get('title', '') if metadata else '' + document_classification = metadata.get('document_classification', 'None') if metadata else 'None' except Exception as e: log_event(f"[save_chunks_batch] Error retrieving metadata for document {document_id}: {repr(e)}", level=logging.ERROR) raise - # Generate all embeddings in batches - texts = [c['page_text_content'] for c in chunks_data] - try: - embedding_results = generate_embeddings_batch(texts) - except Exception as e: - log_event(f"[save_chunks_batch] Error generating batch embeddings for document {document_id}: {e}", level=logging.ERROR) - raise - # Check for vision analysis once vision_analysis = metadata.get('vision_analysis') vision_text = "" @@ -2742,15 +2999,42 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w vision_text_parts.append(f"\nContextual Analysis: {vision_analysis['analysis']}") vision_text = "\n".join(vision_text_parts) + sanitized_chunks_data = [] 
+ for chunk_info in chunks_data: + sanitized_chunk_info = dict(chunk_info) + page_number = chunk_info['page_number'] + page_text_content = chunk_info['page_text_content'] + enhanced_chunk_text = page_text_content + vision_text if vision_text else page_text_content + upload_dlp_result = _evaluate_upload_dlp_text( + enhanced_chunk_text, + user_id=user_id, + document_id=document_id, + page_number=page_number, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + sanitized_chunk_info['page_text_content'] = upload_dlp_result.get("sanitized_text", enhanced_chunk_text) + sanitized_chunk_info['dlp_metadata'] = upload_dlp_result.get("dlp_metadata") + sanitized_chunks_data.append(sanitized_chunk_info) + + # Generate all embeddings in batches after DLP redaction + texts = [c['page_text_content'] for c in sanitized_chunks_data] + try: + embedding_results = generate_embeddings_batch(texts) + except Exception as e: + log_event(f"[save_chunks_batch] Error generating batch embeddings for document {document_id}: {e}", level=logging.ERROR) + raise + # Build all chunk documents chunk_documents = [] total_token_usage = {'total_tokens': 0, 'prompt_tokens': 0, 'model_deployment_name': None} - for idx, chunk_info in enumerate(chunks_data): + for idx, chunk_info in enumerate(sanitized_chunks_data): embedding, token_usage = embedding_results[idx] page_number = chunk_info['page_number'] file_name = chunk_info['file_name'] - page_text_content = chunk_info['page_text_content'] + enhanced_chunk_text = chunk_info['page_text_content'] + dlp_metadata = chunk_info.get('dlp_metadata') if token_usage: total_token_usage['total_tokens'] += token_usage.get('total_tokens', 0) @@ -2759,7 +3043,6 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w total_token_usage['model_deployment_name'] = token_usage.get('model_deployment_name') chunk_id = f"{document_id}_{page_number}" - enhanced_chunk_text = page_text_content + vision_text if vision_text else 
page_text_content if is_public_workspace: chunk_document = { @@ -2772,14 +3055,15 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w "chunk_keywords": [], "chunk_summary": "", "page_number": page_number, - "author": [], - "title": "", - "document_classification": "None", + "author": author, + "title": title, + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, "upload_date": current_time, "version": version, - "public_workspace_id": public_workspace_id + "public_workspace_id": public_workspace_id, + "dlp_metadata": dlp_metadata } elif is_group: shared_group_ids = metadata.get('shared_group_ids', []) if metadata else [] @@ -2793,15 +3077,16 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w "chunk_keywords": [], "chunk_summary": "", "page_number": page_number, - "author": [], - "title": "", - "document_classification": "None", + "author": author, + "title": title, + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, "upload_date": current_time, "version": version, "group_id": group_id, - "shared_group_ids": shared_group_ids + "shared_group_ids": shared_group_ids, + "dlp_metadata": dlp_metadata } else: shared_user_ids = metadata.get('shared_user_ids', []) if metadata else [] @@ -2815,15 +3100,16 @@ def save_chunks_batch(chunks_data, user_id, document_id, group_id=None, public_w "chunk_keywords": [], "chunk_summary": "", "page_number": page_number, - "author": [], - "title": "", - "document_classification": "None", + "author": author, + "title": title, + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, "upload_date": current_time, "version": version, "user_id": user_id, - "shared_user_ids": shared_user_ids + "shared_user_ids": shared_user_ids, + "dlp_metadata": dlp_metadata } 
chunk_documents.append(chunk_document) @@ -4201,7 +4487,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Retrieved document items for document {document_id}: {document_items}" + content=f"Retrieved document items for document {document_id}, item_count: {len(document_items)}." ) except Exception as e: add_file_task_to_file_processing_log( @@ -4231,10 +4517,19 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp if "abstract" in document_metadata: meta_data["abstract"] = document_metadata["abstract"] + meta_data, metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + meta_data, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + metadata_summary = _upload_metadata_log_summary(meta_data, metadata_dlp_summary) + add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Extracted metadata for document {document_id}, metadata: {meta_data}" + content=f"Extracted metadata for document {document_id}, summary: {metadata_summary}" ) args = { @@ -4288,12 +4583,16 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp block_reasons.append("Blocklist match") if blocked: + blocked_metadata_summary = _upload_metadata_log_summary(meta_data, metadata_dlp_summary) add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Blocked document metadata: {document_metadata}, reasons: {block_reasons}" + content=f"Blocked document metadata for document {document_id}, summary: {blocked_metadata_summary}, reasons: {block_reasons}" + ) + log_event( + f"Blocked document metadata for document {document_id}. 
Reasons: {block_reasons}", + level=logging.WARNING, ) - print(f"Blocked document metadata: {document_metadata}\nReasons: {block_reasons}") return None except Exception as e: @@ -4310,7 +4609,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Processing Hybrid search for document {document_id} using {len(meta_data or {})} metadata fields." + content=f"Processing Hybrid search for document {document_id} using metadata fields: {metadata_summary['populated_fields']}" ) args = { @@ -4442,7 +4741,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"GPT response for document {document_id}: {response_content}" + content=f"GPT response for document {document_id}, response_length: {len(response_content or '')}" ) # --- Step 7: Clean and parse the GPT JSON output --- @@ -4458,7 +4757,7 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Cleaned JSON from GPT response for document {document_id}: {cleaned_str}" + content=f"Cleaned JSON from GPT response for document {document_id}, json_length: {len(cleaned_str or '')}" ) gpt_output = json.loads(cleaned_str) @@ -4466,12 +4765,24 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Decoded JSON from GPT response for document {document_id}: {gpt_output}" + content=f"Decoded JSON from GPT response for document {document_id}, keys: {list(gpt_output.keys()) if isinstance(gpt_output, dict) else []}" ) # Ensure authors and keywords are always 
lists gpt_output["authors"] = ensure_list(gpt_output.get("authors", [])) gpt_output["keywords"] = ensure_list(gpt_output.get("keywords", [])) + gpt_output, gpt_metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + gpt_output, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + add_file_task_to_file_processing_log( + document_id=document_id, + user_id=group_id if is_group else user_id, + content=f"Sanitized GPT metadata for document {document_id}, summary: {_upload_metadata_log_summary(gpt_output, gpt_metadata_dlp_summary)}" + ) except (json.JSONDecodeError, TypeError) as e: add_file_task_to_file_processing_log( @@ -4514,10 +4825,18 @@ def extract_document_metadata(document_id, user_id, group_id=None, public_worksp if is_effectively_empty(meta_data["abstract"]): meta_data["abstract"] = gpt_output.get("abstract", meta_data["abstract"]) + meta_data, final_metadata_dlp_summary = _sanitize_upload_metadata_for_dlp( + meta_data, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + add_file_task_to_file_processing_log( document_id=document_id, user_id=group_id if is_group else user_id, - content=f"Final metadata for document {document_id}: {meta_data}" + content=f"Final metadata for document {document_id}, summary: {_upload_metadata_log_summary(meta_data, final_metadata_dlp_summary)}" ) args = { @@ -6833,9 +7152,25 @@ def process_di_document(document_id, user_id, temp_file_path, original_filename, elif doc_author: update_fields['authors'] = [doc_author] if doc_subject: update_fields['abstract'] = doc_subject if doc_keywords: update_fields['keywords'] = doc_keywords + metadata_update_fields = { + key: value + for key, value in update_fields.items() + if key in UPLOAD_DLP_METADATA_FIELDS + } + if metadata_update_fields: + sanitized_metadata_fields, _ = _sanitize_upload_metadata_for_dlp( + metadata_update_fields, + user_id=user_id, + 
document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + update_fields.update(sanitized_metadata_fields) update_callback(**update_fields) except Exception as e: + if str(e) == "Upload content blocked by DLP policy.": + raise print(f"Warning: Failed to extract initial metadata for {original_filename}: {e}") # Continue processing even if metadata fails @@ -7451,12 +7786,13 @@ def process_audio_document( update_callback, group_id=None, public_workspace_id=None, - auto_extract_metadata=True + auto_extract_metadata=True, + enable_enhanced_citations=False ) -> int: """Transcribe an audio file via Azure Speech, splitting >10 min into WAV chunks.""" settings = get_settings() - if settings.get("enable_enhanced_citations", False): + if enable_enhanced_citations: update_callback(status="Uploading audio for enhanced citations…") blob_path = upload_to_blob( temp_file_path, @@ -7529,7 +7865,7 @@ def recognized_cb(evt): try: if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech: all_results.append(evt.result.text) - print(f"[Debug] Recognized: {evt.result.text}") + print(f"[Debug] Recognized text length: {len(evt.result.text or '')}") elif evt.result.reason == speechsdk.ResultReason.NoMatch: print(f"[Debug] No speech recognized in segment") except Exception as e: @@ -7618,7 +7954,7 @@ def canceled_cb(evt): # result = speech_recognizer.recognize_once() # if result.reason == speechsdk.ResultReason.RecognizedSpeech: - # print(f"[Debug] Recognized: {result.text}") + # print(f"[Debug] Recognized text length: {len(result.text or '')}") # all_phrases.append(result.text) # elif result.reason == speechsdk.ResultReason.NoMatch: # print(f"[Warning] No speech in {chunk_path}") @@ -8260,6 +8596,11 @@ def process_document_upload_background(document_id, user_id, temp_file_path, ori is_public_workspace = public_workspace_id is not None settings = get_settings() enable_enhanced_citations = settings.get('enable_enhanced_citations', False) # Default 
to False if missing + disabled_enhanced_citations_for_upload_dlp = ( + enable_enhanced_citations and _should_disable_enhanced_citations_for_upload_dlp(settings) + ) + if disabled_enhanced_citations_for_upload_dlp: + enable_enhanced_citations = False enable_extract_meta_data = settings.get('enable_extract_meta_data', False) # Used by DI flow max_file_size_bytes = settings.get('max_file_size_mb', 16) * 1024 * 1024 @@ -8289,6 +8630,12 @@ def update_doc_callback(**kwargs): update_document(**args) + if disabled_enhanced_citations_for_upload_dlp: + update_doc_callback( + enhanced_citations=False, + status="Enhanced citations disabled because upload DLP enforcement is active" + ) + total_chunks_saved = 0 total_embedding_tokens = 0 embedding_model_name = None @@ -8413,7 +8760,8 @@ def update_doc_callback(**kwargs): update_callback=update_doc_callback, group_id=group_id, public_workspace_id=public_workspace_id, - auto_extract_metadata=False + auto_extract_metadata=False, + enable_enhanced_citations=enable_enhanced_citations ) elif file_ext in audio_extensions: total_chunks_saved = process_audio_document( @@ -8424,7 +8772,8 @@ def update_doc_callback(**kwargs): update_callback=update_doc_callback, group_id=group_id, public_workspace_id=public_workspace_id, - auto_extract_metadata=False + auto_extract_metadata=False, + enable_enhanced_citations=enable_enhanced_citations ) elif file_ext in di_supported_extensions or file_ext == '.doc': result = process_di_document( @@ -8499,6 +8848,14 @@ def update_doc_callback(**kwargs): group_id=group_id, public_workspace_id=public_workspace_id ) + if doc_metadata: + doc_metadata, _ = _sanitize_upload_metadata_for_dlp( + doc_metadata, + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) # Determine workspace type if public_workspace_id: @@ -9677,4 +10034,4 @@ def propagate_tags_to_chunks(document_id, tags, user_id, group_id=None, public_w except Exception as e: print(f"Error 
propagating tags to chunks for document {document_id}: {e}") - raise \ No newline at end of file + raise diff --git a/application/single_app/functions_settings.py b/application/single_app/functions_settings.py index 31c78aa9..f091a3c7 100644 --- a/application/single_app/functions_settings.py +++ b/application/single_app/functions_settings.py @@ -7,6 +7,7 @@ from config import * from functions_appinsights import log_event from functions_cosmos_throughput import get_default_cosmos_throughput_settings +from functions_dlp_rules import get_default_dlp_regex_rules from functions_document_actions import get_default_document_action_capabilities from functions_icon_utils import normalize_icon_payload from functions_service_health import get_default_service_health @@ -1061,6 +1062,37 @@ def get_settings(use_cosmos=False, include_source=False): 'azure_apim_content_safety_endpoint': '', 'azure_apim_content_safety_subscription_key': '', + # Data Loss Prevention (DLP) Settings + 'enable_dlp_control_plane': False, + 'dlp_default_engine': 'regex', + 'dlp_regex_rules': get_default_dlp_regex_rules(), + 'dlp_max_scan_chars': 200000, + 'dlp_fail_closed_on_scanner_error': True, + 'dlp_audit_level': 'counts_only', + 'dlp_enable_structured_telemetry': True, + 'dlp_telemetry_sample_allow_events': False, + 'dlp_review_destination': 'none', + 'dlp_presidio_analyzer_endpoint': '', + 'dlp_presidio_allowed_private_hosts': '', + 'dlp_presidio_auth_header_name': 'X-DLP-API-Key', + 'dlp_presidio_auth_secret_env_var': 'PRESIDIO_DLP_API_KEY', + 'dlp_presidio_timeout_seconds': 5, + 'dlp_presidio_score_threshold': 0.5, + 'dlp_presidio_language': 'en', + 'dlp_presidio_entities': [ + 'CREDIT_CARD', + 'EMAIL_ADDRESS', + 'PHONE_NUMBER', + 'US_SSN', + 'PERSON', + 'LOCATION', + ], + 'enable_web_search_dlp': False, + 'web_search_dlp_mode': 'monitor', + 'enable_upload_dlp': False, + 'upload_dlp_mode': 'monitor', + 'upload_dlp_fail_upload_on_match': False, + # User Feedback / Conversation Archiving 
'enable_user_feedback': True, 'require_member_of_feedback_admin': False, @@ -2271,6 +2303,12 @@ def sanitize_settings_for_user(full_settings: dict) -> dict: return full_settings sensitive_terms = ("key", "secret", "password", "connection", "base64", "storage_account_url") + sensitive_setting_names = { + "dlp_presidio_analyzer_endpoint", + "dlp_presidio_allowed_private_hosts", + "dlp_presidio_auth_header_name", + "dlp_presidio_auth_secret_env_var", + } sanitized = {} for k, v in full_settings.items(): @@ -2278,6 +2316,8 @@ def sanitize_settings_for_user(full_settings: dict) -> dict: continue if k == 'agents_page_promoted_popular_agents': continue + if k in sensitive_setting_names: + continue if any(term in k.lower() for term in sensitive_terms): continue if k in ('model_endpoints', 'personal_model_endpoints') and isinstance(v, list): @@ -2433,4 +2473,4 @@ def clear_user_search_history(user_id): level=logging.ERROR, exceptionTraceback=True ) - return False \ No newline at end of file + return False diff --git a/application/single_app/requirements.txt b/application/single_app/requirements.txt index 4d79728a..b0237bfa 100644 --- a/application/single_app/requirements.txt +++ b/application/single_app/requirements.txt @@ -61,4 +61,5 @@ aiohttp==3.14.1 html2text==2025.4.15 matplotlib==3.10.7 azure-cognitiveservices-speech==1.48.2 -playwright==1.58.0 \ No newline at end of file +playwright==1.58.0 +regex==2026.5.9 diff --git a/application/single_app/route_backend_chats.py b/application/single_app/route_backend_chats.py index e299ab66..c0b56a69 100644 --- a/application/single_app/route_backend_chats.py +++ b/application/single_app/route_backend_chats.py @@ -113,6 +113,7 @@ from functions_appinsights import log_event from functions_debug import debug_print from functions_governance import ensure_governance_access +from functions_dlp import evaluate_web_search_egress, build_dlp_telemetry_properties, should_emit_dlp_telemetry from functions_notifications import 
create_chat_response_notification from functions_activity_logging import log_agent_run, log_chat_activity, log_conversation_creation, log_token_usage from flask import current_app @@ -1054,6 +1055,8 @@ def _strip_agent_citation_artifact_refs(agent_citations): FACT_MEMORY_TYPE_FACT = 'fact' FACT_MEMORY_TYPE_INSTRUCTION = 'instruction' FACT_MEMORY_TYPE_LEGACY_DESCRIBER = 'describer' +WEB_SEARCH_DLP_BLOCKED_STATUS = "Web search was blocked because the message appears to contain non-public information." +WEB_SEARCH_DLP_REDACTED_STATUS = "Sensitive details were removed before web search." INLINE_CHART_ID_PATTERN_TEMPLATE = '"chartId":"{}"' TABULAR_INLINE_CHART_MAX_POINTS = 12 TABULAR_INLINE_CHART_MAX_CHARTS = 2 @@ -14368,26 +14371,80 @@ def record_tabular_post_processing_thought(thought_payload): if web_search_enabled: search_thought_label = 'deep_research' if deep_research_enabled else 'web_search' - search_thought_text = "Planning Deep Research web searches" if deep_research_enabled else f"Searching the web for '{web_search_query_text[:50]}'" + search_thought_text = "Planning Deep Research web searches" if deep_research_enabled else f"Searching the web with query length {len(web_search_query_text)}" thought_tracker.add_thought(search_thought_label, search_thought_text) - research_search_result = perform_research_web_searches( + web_search_dlp_result = evaluate_web_search_egress( + web_search_query_text, settings=settings, - conversation_id=conversation_id, - user_id=user_id, - user_message=user_message, - user_message_id=user_message_id, - chat_type=chat_type, - document_scope=document_scope, - active_group_id=active_group_id, - active_public_workspace_id=active_public_workspace_id, - web_search_query_text=web_search_query_text, - system_messages_for_augmentation=system_messages_for_augmentation, - agent_citations_list=agent_citations_list, - web_search_citations_list=web_search_citations_list, - deep_research_enabled=deep_research_enabled, - 
deep_research_planner_client=gpt_client, - deep_research_planner_model=gpt_model, + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + }, ) + if should_emit_dlp_telemetry(web_search_dlp_result, settings): + log_event( + "[DLP] Web search egress decision", + extra=build_dlp_telemetry_properties( + web_search_dlp_result, + surface="web_search", + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + }, + ), + ) + if web_search_dlp_result.get("status_message"): + thought_tracker.add_thought('web_search', web_search_dlp_result["status_message"]) + + if not web_search_dlp_result.get("web_search_allowed", True): + system_messages_for_augmentation.append({ + "role": "system", + "content": WEB_SEARCH_DLP_BLOCKED_STATUS, + }) + research_search_result = {'query_plan': {}, 'web_search_runs': []} + else: + web_search_query_text = web_search_dlp_result.get("web_search_query_text", web_search_query_text) + if deep_research_enabled: + research_search_result = perform_research_web_searches( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + user_message_id=user_message_id, + chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + deep_research_enabled=deep_research_enabled, + deep_research_planner_client=gpt_client, + deep_research_planner_model=gpt_model, + ) + else: + perform_web_search( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + user_message_id=user_message_id, + 
chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + web_search_runs_list=deep_research_web_search_runs, + ) + research_search_result = {'query_plan': {}, 'web_search_runs': deep_research_web_search_runs} deep_research_query_plan = research_search_result.get('query_plan', {}) deep_research_web_search_runs = research_search_result.get('web_search_runs', []) if web_search_citations_list: @@ -17578,25 +17635,79 @@ def record_and_publish_streaming_thought(thought_payload): if deep_research_enabled: yield emit_thought('deep_research', "Planning Deep Research web searches") else: - yield emit_thought('web_search', f"Searching the web for '{web_search_query_text[:50]}'") - research_search_result = perform_research_web_searches( + yield emit_thought('web_search', f"Searching the web with query length {len(web_search_query_text)}") + web_search_dlp_result = evaluate_web_search_egress( + web_search_query_text, settings=settings, - conversation_id=conversation_id, - user_id=user_id, - user_message=user_message, - user_message_id=user_message_id, - chat_type=chat_type, - document_scope=document_scope, - active_group_id=active_group_id, - active_public_workspace_id=active_public_workspace_id, - web_search_query_text=web_search_query_text, - system_messages_for_augmentation=system_messages_for_augmentation, - agent_citations_list=agent_citations_list, - web_search_citations_list=web_search_citations_list, - deep_research_enabled=deep_research_enabled, - deep_research_planner_client=gpt_client, - deep_research_planner_model=gpt_model, + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": 
document_scope, + }, ) + if should_emit_dlp_telemetry(web_search_dlp_result, settings): + log_event( + "[DLP] Web search egress decision", + extra=build_dlp_telemetry_properties( + web_search_dlp_result, + surface="web_search", + context={ + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + }, + ), + ) + if web_search_dlp_result.get("status_message"): + yield emit_thought('web_search', web_search_dlp_result["status_message"]) + + if not web_search_dlp_result.get("web_search_allowed", True): + system_messages_for_augmentation.append({ + "role": "system", + "content": WEB_SEARCH_DLP_BLOCKED_STATUS, + }) + research_search_result = {'query_plan': {}, 'web_search_runs': []} + else: + web_search_query_text = web_search_dlp_result.get("web_search_query_text", web_search_query_text) + if deep_research_enabled: + research_search_result = perform_research_web_searches( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + user_message_id=user_message_id, + chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + deep_research_enabled=deep_research_enabled, + deep_research_planner_client=gpt_client, + deep_research_planner_model=gpt_model, + ) + else: + perform_web_search( + settings=settings, + conversation_id=conversation_id, + user_id=user_id, + user_message=user_message, + user_message_id=user_message_id, + chat_type=chat_type, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + web_search_query_text=web_search_query_text, + 
system_messages_for_augmentation=system_messages_for_augmentation, + agent_citations_list=agent_citations_list, + web_search_citations_list=web_search_citations_list, + web_search_runs_list=deep_research_web_search_runs, + ) + research_search_result = {'query_plan': {}, 'web_search_runs': deep_research_web_search_runs} deep_research_query_plan = research_search_result.get('query_plan', {}) deep_research_web_search_runs = research_search_result.get('web_search_runs', []) if web_search_citations_list: @@ -20162,7 +20273,7 @@ def build_conversation_history_segments( def _extract_web_search_citations_from_content(content: str) -> List[Dict[str, str]]: if not content: return [] - debug_print(f"[Citation Extraction] Extracting citations from:\n{content}\n") + debug_print(f"[Citation Extraction] Extracting citations from content length: {len(content)}") citations: List[Dict[str, str]] = [] @@ -20204,7 +20315,7 @@ def _extract_web_search_citations_from_content(content: str) -> List[Dict[str, s if not url: continue citations.append({"url": url, "title": url}) - debug_print(f"[Citation Extraction] Extracted {len(citations)} citations. - {citations}\n") + debug_print(f"[Citation Extraction] Extracted {len(citations)} citations.") return citations @@ -20294,7 +20405,7 @@ def to_int(value: Any) -> Optional[int]: if total_tokens is None: debug_print( "[Web Search][Token Usage Extraction] total_tokens missing or invalid. 
" - f"usage={usage}" + f"usage_type={type(usage)}, usage_keys={list(usage.keys())}" ) return {} @@ -20339,11 +20450,24 @@ def perform_research_web_searches( """Run one or more current-message-only web searches for normal or Deep Research mode.""" web_search_runs = [] query_plan = {} + dlp_context = { + "conversation_id": conversation_id, + "chat_type": chat_type, + "document_scope": document_scope, + "workspace_scope": document_scope, + } if deep_research_enabled: + planner_user_message = user_message + if ( + settings.get("enable_dlp_control_plane", False) + and settings.get("enable_web_search_dlp", False) + and str(settings.get("web_search_dlp_mode", "monitor") or "monitor").lower() == "redact" + ): + planner_user_message = web_search_query_text query_plan = build_deep_research_query_plan( settings=settings, - user_message=user_message, + user_message=planner_user_message, base_query=web_search_query_text, planner_client=deep_research_planner_client, planner_model=deep_research_planner_model, @@ -20373,6 +20497,43 @@ def perform_research_web_searches( search_label = None if deep_research_enabled: search_label = f"Deep Research query {query_index}/{total_queries}" + query_dlp_result = evaluate_web_search_egress( + query_text, + settings=settings, + context={ + **dlp_context, + "deep_research_query_index": query_index, + "deep_research_query_count": total_queries, + }, + ) + if should_emit_dlp_telemetry(query_dlp_result, settings): + log_event( + "[DLP] Web search egress decision", + extra=build_dlp_telemetry_properties( + query_dlp_result, + surface="web_search", + context={ + **dlp_context, + "deep_research_query_index": query_index, + "deep_research_query_count": total_queries, + }, + ), + ) + if not query_dlp_result.get("web_search_allowed", True): + query_item['query'] = "" + if not any( + message.get("content") == WEB_SEARCH_DLP_BLOCKED_STATUS + for message in system_messages_for_augmentation + if isinstance(message, dict) + ): + 
system_messages_for_augmentation.append({ + "role": "system", + "content": WEB_SEARCH_DLP_BLOCKED_STATUS, + }) + continue + + query_text = query_dlp_result.get("web_search_query_text", query_text) + query_item['query'] = query_text perform_web_search( settings=settings, conversation_id=conversation_id, @@ -20418,16 +20579,15 @@ def perform_web_search( debug_print(f"[WebSearch] Parameters received:") debug_print(f"[WebSearch] conversation_id: {conversation_id}") debug_print(f"[WebSearch] user_id: {user_id}") - debug_print(f"[WebSearch] user_message: {user_message[:100] if user_message else None}...") + debug_print(f"[WebSearch] user_message_length: {len(user_message or '')}") debug_print(f"[WebSearch] user_message_id: {user_message_id}") debug_print(f"[WebSearch] chat_type: {chat_type}") debug_print(f"[WebSearch] document_scope: {document_scope}") debug_print(f"[WebSearch] active_group_id: {active_group_id}") debug_print(f"[WebSearch] active_public_workspace_id: {active_public_workspace_id}") - debug_print( - "[WebSearch] web_search_query_text: " - f"{web_search_query_text[:100] if web_search_query_text else None}..." 
- ) + dlp_enabled = bool(settings.get("enable_dlp_control_plane") and settings.get("enable_web_search_dlp")) + debug_print(f"[WebSearch] web_search_query_length: {len(web_search_query_text or '')}") + debug_print(f"[WebSearch] dlp_enabled: {dlp_enabled}") initial_seed_url_count = len(web_search_citations_list or []) if isinstance(web_search_citations_list, list) else 0 run_started_at = datetime.utcnow().isoformat() @@ -20437,7 +20597,7 @@ def record_web_search_run(success, status, error=None, result_message_length=0, return final_seed_url_count = len(web_search_citations_list or []) if isinstance(web_search_citations_list, list) else initial_seed_url_count web_search_runs_list.append({ - 'query': str(web_search_query_text or user_message or '').strip()[:300], + 'query': str(web_search_query_text or '').strip()[:300], 'label': str(search_context_label or '').strip()[:100], 'status': status, 'success': bool(success), @@ -20499,24 +20659,16 @@ def record_web_search_run(success, status, error=None, result_message_length=0, debug_print(f"[WebSearch] Agent ID is configured: {agent_id}") - query_text = (web_search_query_text or user_message or "").strip() - debug_print(f"[WebSearch] Final query_text after fallback: '{query_text[:100] if query_text else ''}'") + query_text = (web_search_query_text or "").strip() + debug_print(f"[WebSearch] Final approved query_length: {len(query_text)}") if not query_text: - debug_print("[WebSearch] Query text is EMPTY after processing, skipping web search") - log_event( - "[WebSearch] Skipping Foundry web search: empty query", - extra={ - "conversation_id": conversation_id, - "user_id": user_id, - }, - level=logging.WARNING, - ) + debug_print("[WebSearch] Empty approved web-search query; skipping Foundry call") record_web_search_run(True, 'empty_query') - return True # Not an error, just empty query + return True # Not an error, just empty approved query search_request_content = build_research_search_prompt(query_text) - 
debug_print(f"[WebSearch] Building message history with query: {query_text[:100]}...") + debug_print(f"[WebSearch] Building message history with query_length: {len(query_text)}") message_history = [ ChatMessageContent(role="user", content=search_request_content) ] @@ -20540,37 +20692,39 @@ def record_web_search_run(success, status, error=None, result_message_length=0, ) except FoundryAgentInvocationError as exc: log_event( - f"[WebSearch] Foundry agent invocation failed: {exc}", + "[WebSearch] Foundry agent invocation failed", extra={ "conversation_id": conversation_id, "user_id": user_id, "agent_id": agent_id, + "error_type": type(exc).__name__, }, level=logging.ERROR, - exceptionTraceback=True, + exceptionTraceback=False, ) # Add failure message so the model informs the user system_messages_for_augmentation.append({ "role": "system", - "content": f"Web search failed with error: {exc}. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", + "content": "Web search failed. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. 
Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", }) record_web_search_run(False, 'foundry_invocation_error', error=str(exc)) return False # Search failed except Exception as exc: log_event( - f"[WebSearch] Unexpected error invoking Foundry agent: {exc}", + "[WebSearch] Unexpected error invoking Foundry agent", extra={ "conversation_id": conversation_id, "user_id": user_id, "agent_id": agent_id, + "error_type": type(exc).__name__, }, level=logging.ERROR, - exceptionTraceback=True, + exceptionTraceback=False, ) # Add failure message so the model informs the user system_messages_for_augmentation.append({ "role": "system", - "content": f"Web search failed with an unexpected error: {exc}. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", + "content": "Web search failed with an unexpected error. Please inform the user that the web search encountered an error and you cannot provide real-time information for this query. 
Do not attempt to answer questions requiring current information from your training data - instead, acknowledge the search failure and suggest the user try again.", }) record_web_search_run(False, 'unexpected_error', error=str(exc)) return False # Search failed @@ -20584,23 +20738,17 @@ def record_web_search_run(success, status, error=None, result_message_length=0, if result.message: debug_print(f"[WebSearch] Result message length: {len(result.message)} chars") - debug_print(f"[WebSearch] Result message preview: {result.message[:500] if len(result.message) > 500 else result.message}") else: debug_print("[WebSearch] Result message is EMPTY or None") if result.citations: debug_print(f"[WebSearch] Result citations count: {len(result.citations)}") - for i, cit in enumerate(result.citations[:3]): - debug_print(f"[WebSearch] Citation {i}: {json.dumps(cit, default=str)[:200]}...") else: debug_print("[WebSearch] Result citations is EMPTY or None") if result.metadata: - try: - metadata_payload = json.dumps(result.metadata, default=str) - except (TypeError, ValueError): - metadata_payload = str(result.metadata) - debug_print(f"[WebSearch] Foundry metadata: {metadata_payload}") + metadata_keys = list(result.metadata.keys()) if isinstance(result.metadata, Mapping) else [] + debug_print(f"[WebSearch] Foundry metadata present with keys: {metadata_keys}") else: debug_print("[WebSearch] Foundry metadata: ") @@ -20638,12 +20786,12 @@ def record_web_search_run(success, status, error=None, result_message_length=0, debug_print(f"[WebSearch] Processing {len(citations)} citations from result.citations") if citations: for i, citation in enumerate(citations): - debug_print(f"[WebSearch] Processing citation {i}: {json.dumps(citation, default=str)[:200]}...") + debug_print(f"[WebSearch] Processing citation {i}") serializable = make_json_serializable(citation) if not isinstance(serializable, dict): serializable = {"value": str(citation)} citation_title = serializable.get("title") or 
serializable.get("url") or "Web search source" - debug_print(f"[WebSearch] Adding agent citation with title: {citation_title}") + debug_print(f"[WebSearch] Adding agent citation {i + 1} of {len(citations)}") agent_citations_list.append({ "tool_name": citation_title, "function_name": "azure_ai_foundry_web_search", @@ -20663,7 +20811,8 @@ def record_web_search_run(success, status, error=None, result_message_length=0, else: debug_print("[WebSearch] No citations in result.citations to process") - debug_print(f"[WebSearch] Starting token usage extraction from Foundry metadata. Metadata: {result.metadata}") + metadata_keys = list((result.metadata or {}).keys()) if isinstance(result.metadata, Mapping) else [] + debug_print(f"[WebSearch] Starting token usage extraction from Foundry metadata keys: {metadata_keys}") token_usage = _extract_token_usage_from_metadata(result.metadata or {}) if token_usage.get("total_tokens"): try: @@ -20687,19 +20836,21 @@ def record_web_search_run(success, status, error=None, result_message_length=0, public_workspace_id=active_public_workspace_id, additional_context={ 'agent_id': agent_id, - 'search_query': query_text, + 'search_query_length': len(query_text), 'token_source': 'foundry_metadata' } ) except Exception as log_error: log_event( - f"[WebSearch] Failed to log web search token usage: {log_error}", + "[WebSearch] Failed to log web search token usage", extra={ "conversation_id": conversation_id, "user_id": user_id, "agent_id": agent_id, + "error_type": type(log_error).__name__, }, level=logging.WARNING, + exceptionTraceback=False, ) debug_print("[WebSearch] ========== FINAL SUMMARY ==========") diff --git a/application/single_app/route_frontend_admin_settings.py b/application/single_app/route_frontend_admin_settings.py index a4a7caf2..82384a5a 100644 --- a/application/single_app/route_frontend_admin_settings.py +++ b/application/single_app/route_frontend_admin_settings.py @@ -1,6 +1,7 @@ # route_frontend_admin_settings.py import re 
+import secrets from config import * from functions_documents import * @@ -24,6 +25,15 @@ from functions_notifications import broadcast_system_notification from functions_logging import * from functions_document_actions import normalize_document_action_capabilities +from functions_dlp_rules import get_default_dlp_regex_rules, validate_dlp_regex_rules +from functions_dlp_presidio import ( + PresidioEndpointConfigurationError, + DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR, + normalize_presidio_auth_header_name, + normalize_presidio_allowed_private_hosts, + normalize_presidio_secret_env_var_name, + validate_presidio_endpoint_url, +) from swagger_wrapper import swagger_route, get_auth_security from datetime import datetime, timedelta, timezone from admin_settings_int_utils import safe_int_with_source @@ -51,6 +61,30 @@ 'agents_page_promoted_popular_tag_label': AGENTS_PAGE_PROMOTED_POPULAR_TAG_LABEL_DEFAULT, } HEX_COLOR_PATTERN = re.compile(r'^#[0-9a-fA-F]{6}$') +ADMIN_SETTINGS_CSRF_SESSION_KEY = "admin_settings_csrf_token" + + +def _new_admin_settings_csrf_token(): + token = secrets.token_urlsafe(32) + session[ADMIN_SETTINGS_CSRF_SESSION_KEY] = token + return token + + +def _get_admin_settings_csrf_token(): + token = session.get(ADMIN_SETTINGS_CSRF_SESSION_KEY) + if not token: + token = _new_admin_settings_csrf_token() + return token + + +def _validate_admin_settings_csrf_token(form_data): + submitted_token = str(form_data.get("admin_settings_csrf_token") or "") + expected_token = str(session.get(ADMIN_SETTINGS_CSRF_SESSION_KEY) or "") + return bool( + submitted_token + and expected_token + and secrets.compare_digest(submitted_token, expected_token) + ) def allowed_file(filename, allowed_extensions): return '.' 
in filename and \ @@ -532,6 +566,8 @@ def admin_settings(): source_review_runtime_capabilities, ) settings_for_template = redact_admin_settings_secrets_for_form(settings_for_template) + dlp_regex_rules_for_template, _ = validate_dlp_regex_rules(settings.get('dlp_regex_rules')) + dlp_regex_rules_json = json.dumps(dlp_regex_rules_for_template, indent=2) return render_template( 'admin_settings.html', @@ -551,7 +587,9 @@ def admin_settings(): chunk_size_settings=settings.get('chunk_size', {}), chunk_size_cap=get_chunk_size_cap(settings), chunk_size_effective=get_chunk_size_config(settings), - source_review_runtime_capabilities=source_review_runtime_capabilities + source_review_runtime_capabilities=source_review_runtime_capabilities, + admin_settings_csrf_token=_get_admin_settings_csrf_token(), + dlp_regex_rules_json=dlp_regex_rules_json # You don't need to pass deployments separately if they are added to settings['..._model']['all'] # gpt_deployments=gpt_deployments, # embedding_deployments=embedding_deployments, @@ -562,6 +600,11 @@ def admin_settings(): form_data = request.form # Use a variable for easier access user_id = get_current_user_id() + if not _validate_admin_settings_csrf_token(form_data): + _new_admin_settings_csrf_token() + flash("Admin settings request could not be verified. 
Please reload the page and try again.", "danger") + return redirect(url_for('admin_settings')) + def admin_secret(field_name, form_field_name=None): submitted_value = form_data.get(form_field_name or field_name, '').strip() return resolve_admin_settings_secret_value(field_name, submitted_value, settings) @@ -739,6 +782,138 @@ def parse_admin_int(raw_value, fallback_value, field_name="unknown", hard_defaul source='admin_settings' ) + dlp_max_scan_chars, _ = safe_int_with_source( + form_data.get('dlp_max_scan_chars'), + settings.get('dlp_max_scan_chars', 200000), + 200000 + ) + dlp_max_scan_chars = max(1000, dlp_max_scan_chars) + dlp_review_destination = form_data.get('dlp_review_destination', 'none') + if dlp_review_destination not in ('none',): + dlp_review_destination = 'none' + dlp_default_engine = form_data.get('dlp_default_engine', settings.get('dlp_default_engine', 'regex')) + if dlp_default_engine not in ('regex', 'presidio_endpoint'): + dlp_default_engine = 'regex' + dlp_presidio_allowed_private_hosts = normalize_presidio_allowed_private_hosts( + form_data.get( + 'dlp_presidio_allowed_private_hosts', + settings.get('dlp_presidio_allowed_private_hosts', '') + ) + ) + submitted_dlp_presidio_analyzer_endpoint = form_data.get( + 'dlp_presidio_analyzer_endpoint', + settings.get('dlp_presidio_analyzer_endpoint', '') + ).strip() + dlp_presidio_analyzer_endpoint = submitted_dlp_presidio_analyzer_endpoint + if dlp_presidio_analyzer_endpoint: + try: + validate_presidio_endpoint_url( + dlp_presidio_analyzer_endpoint, + dlp_presidio_allowed_private_hosts, + ) + except PresidioEndpointConfigurationError as exc: + existing_dlp_presidio_analyzer_endpoint = str( + settings.get('dlp_presidio_analyzer_endpoint', '') + ).strip() + dlp_presidio_analyzer_endpoint = '' + if existing_dlp_presidio_analyzer_endpoint: + try: + validate_presidio_endpoint_url( + existing_dlp_presidio_analyzer_endpoint, + dlp_presidio_allowed_private_hosts, + ) + dlp_presidio_analyzer_endpoint = 
existing_dlp_presidio_analyzer_endpoint + except PresidioEndpointConfigurationError: + dlp_presidio_analyzer_endpoint = '' + flash(f"Presidio analyzer endpoint was not saved: {exc}", "warning") + submitted_dlp_presidio_auth_header_name = form_data.get( + 'dlp_presidio_auth_header_name', + settings.get('dlp_presidio_auth_header_name', 'X-DLP-API-Key') + ).strip() + dlp_presidio_auth_header_name = normalize_presidio_auth_header_name( + submitted_dlp_presidio_auth_header_name + ) + if not dlp_presidio_auth_header_name: + dlp_presidio_auth_header_name = normalize_presidio_auth_header_name( + settings.get('dlp_presidio_auth_header_name', 'X-DLP-API-Key') + ) or 'X-DLP-API-Key' + flash( + "Presidio auth header was not saved. Use a valid custom header such as X-DLP-API-Key.", + "warning" + ) + submitted_dlp_presidio_auth_secret_env_var = form_data.get( + 'dlp_presidio_auth_secret_env_var', + settings.get('dlp_presidio_auth_secret_env_var', DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR) + ).strip() + dlp_presidio_auth_secret_env_var = normalize_presidio_secret_env_var_name( + submitted_dlp_presidio_auth_secret_env_var + ) + if submitted_dlp_presidio_auth_secret_env_var and not dlp_presidio_auth_secret_env_var: + dlp_presidio_auth_secret_env_var = normalize_presidio_secret_env_var_name( + settings.get('dlp_presidio_auth_secret_env_var', DEFAULT_PRESIDIO_AUTH_SECRET_ENV_VAR) + ) + flash( + "Presidio auth secret env var was not saved. 
Use PRESIDIO_DLP_API_KEY or a DLP_PRESIDIO_ name.", + "warning" + ) + dlp_presidio_timeout_seconds, _ = safe_int_with_source( + form_data.get('dlp_presidio_timeout_seconds'), + settings.get('dlp_presidio_timeout_seconds', 5), + 5 + ) + dlp_presidio_timeout_seconds = max(1, min(30, dlp_presidio_timeout_seconds)) + try: + dlp_presidio_score_threshold = float( + form_data.get( + 'dlp_presidio_score_threshold', + settings.get('dlp_presidio_score_threshold', 0.5) + ) + ) + except (TypeError, ValueError): + dlp_presidio_score_threshold = 0.5 + dlp_presidio_score_threshold = max(0.0, min(1.0, dlp_presidio_score_threshold)) + dlp_presidio_language = form_data.get( + 'dlp_presidio_language', + settings.get('dlp_presidio_language', 'en') + ).strip() or 'en' + existing_dlp_presidio_entities = settings.get('dlp_presidio_entities') or [ + 'CREDIT_CARD', + 'EMAIL_ADDRESS', + 'PHONE_NUMBER', + 'US_SSN', + ] + dlp_presidio_entities_raw = form_data.get( + 'dlp_presidio_entities', + ','.join(existing_dlp_presidio_entities) + ) + dlp_presidio_entities = [ + item.strip().upper() + for item in dlp_presidio_entities_raw.split(',') + if item.strip() + ] + if not dlp_presidio_entities: + dlp_presidio_entities = ['CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'US_SSN'] + web_search_dlp_mode = form_data.get('web_search_dlp_mode', 'monitor') + if web_search_dlp_mode not in ('monitor', 'redact', 'block'): + web_search_dlp_mode = 'monitor' + upload_dlp_mode = form_data.get('upload_dlp_mode', 'monitor') + if upload_dlp_mode not in ('monitor', 'redact', 'block'): + upload_dlp_mode = 'monitor' + + raw_dlp_regex_rules = form_data.get('dlp_regex_rules_json', '').strip() + try: + submitted_dlp_regex_rules = json.loads(raw_dlp_regex_rules) if raw_dlp_regex_rules else get_default_dlp_regex_rules() + except json.JSONDecodeError: + _new_admin_settings_csrf_token() + flash("DLP regex rules must be valid JSON.", "danger") + return redirect(url_for('admin_settings')) + + normalized_dlp_regex_rules, 
dlp_regex_rule_errors = validate_dlp_regex_rules(submitted_dlp_regex_rules) + if dlp_regex_rule_errors: + _new_admin_settings_csrf_token() + flash(f"DLP regex rules are invalid: {dlp_regex_rule_errors[0]}", "danger") + return redirect(url_for('admin_settings')) + existing_source_review_max_bytes = parse_admin_int( settings.get('source_review_max_bytes_per_page'), 5000000, @@ -1956,6 +2131,28 @@ def is_valid_url(url): 'web_search_consent_accepted': web_search_consent_accepted, 'enable_web_search_user_notice': form_data.get('enable_web_search_user_notice') == 'on', 'web_search_user_notice_text': form_data.get('web_search_user_notice_text', 'Your current message will be sent to Microsoft Bing for web search. Conversation history is not sent for web search, but any sensitive content you paste into this message may be sent.').strip(), + 'enable_dlp_control_plane': form_data.get('enable_dlp_control_plane') == 'on', + 'dlp_default_engine': dlp_default_engine, + 'dlp_regex_rules': normalized_dlp_regex_rules, + 'dlp_max_scan_chars': dlp_max_scan_chars, + 'dlp_fail_closed_on_scanner_error': form_data.get('dlp_fail_closed_on_scanner_error') == 'on', + 'dlp_audit_level': 'counts_only', + 'dlp_enable_structured_telemetry': form_data.get('dlp_enable_structured_telemetry') == 'on', + 'dlp_telemetry_sample_allow_events': form_data.get('dlp_telemetry_sample_allow_events') == 'on', + 'dlp_review_destination': dlp_review_destination, + 'dlp_presidio_analyzer_endpoint': dlp_presidio_analyzer_endpoint, + 'dlp_presidio_allowed_private_hosts': dlp_presidio_allowed_private_hosts, + 'dlp_presidio_auth_header_name': dlp_presidio_auth_header_name, + 'dlp_presidio_auth_secret_env_var': dlp_presidio_auth_secret_env_var, + 'dlp_presidio_timeout_seconds': dlp_presidio_timeout_seconds, + 'dlp_presidio_score_threshold': dlp_presidio_score_threshold, + 'dlp_presidio_language': dlp_presidio_language, + 'dlp_presidio_entities': dlp_presidio_entities, + 'enable_web_search_dlp': 
form_data.get('enable_web_search_dlp') == 'on', + 'web_search_dlp_mode': web_search_dlp_mode, + 'enable_upload_dlp': form_data.get('enable_upload_dlp') == 'on', + 'upload_dlp_mode': upload_dlp_mode, + 'upload_dlp_fail_upload_on_match': form_data.get('upload_dlp_fail_upload_on_match') == 'on', 'web_search_agent': { 'agent_type': 'aifoundry', 'azure_openai_gpt_endpoint': form_data.get('web_search_foundry_endpoint', '').strip(), @@ -2385,8 +2582,10 @@ def is_valid_url(url): flash("Failed to update admin settings.", "danger") + _new_admin_settings_csrf_token() + # Redirect back to settings page return redirect(url_for('admin_settings')) # Fallback if not GET or POST (shouldn't happen with standard routing) - return redirect(url_for('admin_settings')) \ No newline at end of file + return redirect(url_for('admin_settings')) diff --git a/application/single_app/static/js/admin/admin_settings.js b/application/single_app/static/js/admin/admin_settings.js index 9123e7e6..23f93dc0 100644 --- a/application/single_app/static/js/admin/admin_settings.js +++ b/application/single_app/static/js/admin/admin_settings.js @@ -8317,6 +8317,65 @@ function validateAndMoveToNextStep(currentStep) { } } +function initializeDlpSettings() { + const togglePanel = (toggle, panel) => { + if (!toggle || !panel) { + return; + } + panel.classList.toggle('d-none', !toggle.checked); + toggle.addEventListener('change', function () { + panel.classList.toggle('d-none', !this.checked); + if (typeof markFormAsModified === 'function') { + markFormAsModified(); + } + }); + }; + + const enableDlpControlPlane = document.getElementById('enable_dlp_control_plane'); + const dlpControlPlaneSettings = document.getElementById('dlp_control_plane_settings'); + const dlpDefaultEngine = document.getElementById('dlp_default_engine'); + const dlpPresidioEndpointSettings = document.getElementById('dlp_presidio_endpoint_settings'); + const enableWebSearchDlp = document.getElementById('enable_web_search_dlp'); + const 
webSearchDlpSettings = document.getElementById('web_search_dlp_settings'); + const webSearchDlpModeSettings = document.getElementById('web_search_dlp_mode_settings'); + const enableUploadDlp = document.getElementById('enable_upload_dlp'); + const uploadDlpSettings = document.getElementById('upload_dlp_settings'); + const uploadDlpModeSettings = document.getElementById('upload_dlp_mode_settings'); + + togglePanel(enableDlpControlPlane, dlpControlPlaneSettings); + togglePanel(enableWebSearchDlp, webSearchDlpModeSettings); + togglePanel(enableUploadDlp, uploadDlpModeSettings); + + if (dlpDefaultEngine && dlpPresidioEndpointSettings) { + const updateDlpEngineVisibility = () => { + dlpPresidioEndpointSettings.classList.toggle('d-none', dlpDefaultEngine.value !== 'presidio_endpoint'); + }; + dlpDefaultEngine.addEventListener('change', function () { + updateDlpEngineVisibility(); + if (typeof markFormAsModified === 'function') { + markFormAsModified(); + } + }); + updateDlpEngineVisibility(); + } + + if (webSearchDlpSettings && enableDlpControlPlane) { + webSearchDlpSettings.classList.toggle('d-none', !enableDlpControlPlane.checked); + enableDlpControlPlane.addEventListener('change', function () { + webSearchDlpSettings.classList.toggle('d-none', !this.checked); + }); + } + + if (uploadDlpSettings && enableDlpControlPlane) { + uploadDlpSettings.classList.toggle('d-none', !enableDlpControlPlane.checked); + enableDlpControlPlane.addEventListener('change', function () { + uploadDlpSettings.classList.toggle('d-none', !this.checked); + }); + } +} + +document.addEventListener('DOMContentLoaded', initializeDlpSettings); + /** * Navigate to the previous step in the walkthrough */ @@ -8465,4 +8524,4 @@ function openAdminSettingsTab(targetHash) { activateTabFromHash(); } -window.openAdminSettingsTab = openAdminSettingsTab; \ No newline at end of file +window.openAdminSettingsTab = openAdminSettingsTab; diff --git a/application/single_app/templates/admin_settings.html 
b/application/single_app/templates/admin_settings.html index 4c86667e..109c185a 100644 --- a/application/single_app/templates/admin_settings.html +++ b/application/single_app/templates/admin_settings.html @@ -493,6 +493,7 @@

Admin Settings

{% include "_semantic_search_health_warning.html" %}
+ @@ -7359,6 +7360,184 @@
+
+
+ Data Loss Prevention +
+

Configure DLP controls for upload ingestion and web-search egress.

+
+ + +
+ +
+
+
+ + +
Use regex for lightweight built-in scanning or Presidio for an admin-managed external analyzer.
+
+
+ + +
+
+ + +
+
+ +
+
Presidio Analyzer Endpoint
+ +
+
+ + +
Do not include credentials, fragments, or API keys in the URL.
+
+
+ + +
Comma- or newline-separated hostnames/IPs for private, loopback, or link-local Presidio endpoints.
+
+
+
+
+ + +
Use a custom auth header such as X-DLP-API-Key; connection and content headers are rejected.
+
+
+ + +
Use PRESIDIO_DLP_API_KEY or a DLP_PRESIDIO_ env var. Store the value in App Service settings or Key Vault.
+
+
+ + +
+
+ + +
+
+ + +
+
+ + +
Comma-separated Presidio entity names such as CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN, PERSON, LOCATION.
+
+
+
+ +
+
+ + +
+
+
+ + +
+
+
+
+ + +
+
+
+ +
+ + +
+ +
+
Custom Regex Rules
+ + +
+ Rules support surfaces, Luhn validation, and keyword proximity confidence. Defaults include U.S. SSN and Luhn-valid credit card detection. +
+
+ +
+
Web Search DLP
+
+ + +
+
+
+
+ + +
+
+
+
+ +
+
Upload DLP
+
+ + +
+
+
+
+ + +
+
+
+ + +
+
+
+
+
+
+
+ +
User Feedback diff --git a/docs/explanation/features/DLP_UPLOAD_STAGING.md b/docs/explanation/features/DLP_UPLOAD_STAGING.md new file mode 100644 index 00000000..078fb822 --- /dev/null +++ b/docs/explanation/features/DLP_UPLOAD_STAGING.md @@ -0,0 +1,133 @@ +# DLP Upload Staging + +## Overview + +Version: 0.242.075 + +Dependencies: shared DLP core, configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, document processing pipeline, Azure AI Search, Azure OpenAI embeddings. + +SimpleChat now applies DLP to extracted upload text and selected document metadata before embeddings, Azure AI Search indexing, metadata extraction prompts, Cosmos metadata updates, and file-processing logs. The feature reuses the shared DLP core introduced for web-search egress and applies it to `save_chunks()`, `save_chunks_batch()`, `save_video_chunk()`, and metadata extraction/update paths. + +Regex DLP remains the lightweight default engine. The default rules detect U.S. SSNs and Luhn-valid credit card numbers, and administrators can add upload-specific regex rules through the shared `dlp_regex_rules` settings payload. Administrators can also select `presidio_endpoint` to call an external Presidio Analyzer-compatible endpoint for richer upload text and metadata detection without embedding Presidio in SimpleChat. + +## Technical Specifications + +Protected processing points: + +- `save_chunks()` evaluates DLP after metadata and vision text are combined, before `generate_embedding(...)`. +- `save_chunks_batch()` evaluates DLP for each enhanced chunk before `generate_embeddings_batch(...)`. +- `save_video_chunk()` evaluates transcript and OCR text before transcript embedding and AI Search indexing. +- Metadata fields `title`, `authors`, `organization`, `keywords`, and `abstract` are sanitized before metadata extraction prompts, hybrid-search queries, Cosmos updates, Azure AI Search payload metadata, activity logs, and file-processing logs. 
+- Safe DLP metadata is attached to chunk documents and document records as counts-only summaries. +- Document-level DLP metadata preserves the worst observed status and cumulative entity counts across chunk and metadata scans. +- Configured regex rules can target upload only, web search only, or both surfaces. +- Configured rules support keyword proximity confidence shaping, so a regex candidate can require nearby identifiers such as `document`, `employee`, `SSN`, or another admin-defined term before it redacts or blocks. +- The external Presidio Analyzer endpoint path sends extracted text and selected metadata to an administrator-managed analyzer endpoint, receives spans, and normalizes them into the same counts-only DLP result shape used by regex scanning. +- SimpleChat does not embed Presidio packages or run an in-process analyzer. +- File-processing logs replace raw chunk logging with safe DLP and text-length summaries. +- Enhanced citations are automatically disabled when upload DLP can enforce a block or redaction, including `redact` mode, `block` mode, fail-on-match, and fail-closed scanner errors, because this PR does not generate sanitized binary derivatives for raw source files. + +Upload DLP states: + +- `accepted`: no DLP findings. +- `accepted_with_dlp_monitoring`: findings observed in monitor mode. +- `accepted_with_redactions`: redacted text was embedded and indexed. +- `blocked`: DLP policy blocked indexing. +- `scanner_failed`: scanner failure blocked indexing in fail-closed mode. + +## Admin Settings + +Upload controls are available under Admin Settings > Data Loss Prevention: + +- Enable Upload DLP. +- Choose the default engine: regex structured identifier scan or external Presidio Analyzer endpoint. +- Configure the Presidio Analyzer endpoint, auth header, secret environment variable name, timeout, score threshold, and entities when `presidio_endpoint` is selected. +- Upload mode: `monitor`, `redact`, or `block`. +- Fail upload on match. 
+- Custom Regex Rules, shared with web-search DLP. + +Review routing defaults to `none`. Upload review-event writing is not exposed in this release because the DLP review destination is intentionally locked to `none`. + +## External Presidio Analyzer Endpoint + +Administrators can select an external Presidio Analyzer-compatible endpoint as the DLP engine by setting the engine to `presidio_endpoint`. SimpleChat sends upload text and selected metadata to the endpoint from the server side, receives entity spans, and then performs monitor, redact, or block behavior locally before embeddings, Azure AI Search indexing, metadata extraction prompts, Cosmos metadata updates, and file-processing logs. + +This is Option C for Presidio integration: Presidio runs outside SimpleChat. The SimpleChat application image has no embedded Presidio dependency, model package, or analyzer runtime. Regex DLP remains available as the default and fallback path. + +Production deployments should keep the analyzer private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. SimpleChat permits unauthenticated Presidio calls only for localhost development endpoints; any non-loopback endpoint requires the configured secret environment variable to resolve before raw upload text or metadata is sent. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. Custom auth header names are validated and reserved HTTP headers such as `Host`, `Content-Type`, and `Connection` are rejected. + +The analyzer receives raw extracted text before redaction. SimpleChat, proxies, wrappers, analyzer containers, and platform diagnostics must not log raw request bodies, response bodies, chunk text, OCR text, vision text, metadata values, matched values, or analyzer explanations. 
Stored DLP metadata and telemetry remain counts-only. + +## Telemetry And Logs + +Upload DLP telemetry uses `log_event(...)` with safe dimensions: + +- `activity_type = dlp_decision` +- `dlp_surface = upload` +- `dlp_action` +- `dlp_engine` +- `dlp_mode` +- `workspace_scope` +- `scanner_status` +- `dlp_total_replacements` +- `dlp_entity_counts` + +File-processing logs may include safe DLP summaries such as action, engine, counts, document id, workspace scope, page number, and text length. They do not include raw chunk text, raw OCR text, raw vision text, or raw matched values. + +Example Azure Monitor alert concepts: + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "upload" +| where tostring(customDimensions.dlp_action) == "block" +| summarize blocked_uploads=count() by bin(timestamp, 15m) +``` + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "upload" +| where toint(customDimensions.dlp_total_replacements) > 10 +| summarize high_redaction_events=count() by bin(timestamp, 15m) +``` + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "upload" +| where tostring(customDimensions.scanner_status) != "ok" +| summarize scanner_failures=count() by bin(timestamp, 15m) +``` + +## Limitations + +This PR redacts extracted text and selected metadata before embeddings, search indexing, prompts, and metadata persistence. It does not claim that raw binary artifacts are format-redacted. When upload DLP can enforce a block or redaction, enhanced citations are disabled instead of storing raw source blobs. A future format-aware derivative generation or quarantine workflow is needed to produce sanitized binary copies. 
+ +Regex DLP is limited to deterministic structured identifiers and administrator-defined exact-format identifiers. It is weaker for names, addresses, contextual PII, international identifiers, and noisy document text. Use the external Presidio Analyzer endpoint when richer recognizers are needed and the production analyzer can be kept private, authenticated, and free of raw text logging. + +## Testing And Validation + +Functional coverage: + +- `functional_tests/test_upload_dlp_redaction.py` +- `functional_tests/test_dlp_regex_rules.py` +- `functional_tests/test_upload_dlp_workspace_scopes.py` +- `functional_tests/test_upload_dlp_ingestion_integration.py` +- `functional_tests/test_dlp_admin_ui_smoke.py` +- `functional_tests/test_dlp_presidio_endpoint.py` +- `functional_tests/test_dlp_presidio_engine_integration.py` +- `functional_tests/test_dlp_review_events.py` +- `functional_tests/test_dlp_telemetry.py` +- Shared PR1 DLP tests remain green. + +Validated with Docker Python 3.12: + +- `python -m compileall application/single_app` +- The PR-specific functional tests above. + +Additional review-readiness validation: + +- `tools/local_dev/render_dlp_admin_preview.py` renders the shared DLP admin section and verifies upload controls are visible in the expanded preview. +- `tools/local_dev/run_dlp_local_stack.md` documents the local Cosmos emulator smoke flow inherited from the web-search DLP branch. +- Independent remediation review verified metadata sanitization, enhanced-citation enforcement, document-level status aggregation, and removal of the dead upload review toggle. 
diff --git a/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md new file mode 100644 index 00000000..a6c657f7 --- /dev/null +++ b/docs/explanation/features/DLP_WEB_SEARCH_EGRESS_CONTROL.md @@ -0,0 +1,152 @@ +# DLP Web Search Egress Control + +## Overview + +Version: 0.242.075 + +Dependencies: Flask chat routes, configurable regex DLP rules, optional external Presidio Analyzer-compatible endpoint, and Azure AI Foundry web-search agent configuration. + +SimpleChat now includes an application-level Data Loss Prevention control before web-search grounding. The app evaluates the current user message after `build_web_search_query_text(...)` and before the configured Azure AI Foundry web-search agent is invoked. + +SimpleChat can inspect the `SimpleChat -> Azure AI Foundry` payload. It cannot inspect or intercept the service-side `Azure AI Foundry Agent Service -> Bing` grounding call inside Microsoft's service boundary. Blocking or redaction therefore happens before the app sends the current message to Foundry. + +## Technical Specifications + +The shared DLP core lives in `application/single_app/functions_dlp.py`. Configurable regex rules live in `application/single_app/functions_dlp_rules.py`. + +Implemented behavior: + +- Regex DLP remains the lightweight default engine. +- Administrators can optionally select `presidio_endpoint` to call an external Presidio Analyzer-compatible endpoint from the server side. +- Regex rules are admin-configurable through the `dlp_regex_rules` settings payload. +- Default rules detect U.S. SSNs and Luhn-valid credit card numbers. +- Rules can target web search, upload, or both. +- Rules can use keyword proximity confidence shaping. A regex match can require nearby terms such as `ssn`, `social security`, `card`, or `billing` before it reaches the configured minimum confidence. +- Generic internal phrase matching is not hardcoded. 
Administrators can add organization-specific phrases or identifiers as explicit custom rules. +- The external Presidio Analyzer endpoint path returns spans that SimpleChat normalizes into the same counts-only DLP result shape used by regex scanning. +- SimpleChat does not embed Presidio packages or run an in-process analyzer. +- DLP metadata stores entity types and counts only. Raw matched values are not stored in telemetry or review summaries. +- Structured DLP telemetry uses `log_event(...)` and reaches Application Insights when `APPLICATIONINSIGHTS_CONNECTION_STRING` is configured. +- Scanner errors fail closed by default when `dlp_fail_closed_on_scanner_error` is enabled. +- Text that exceeds the configured scan limit is not partially redacted in `redact` or `block` mode. It is blocked with `scanner_status = truncated`; `monitor` mode records the truncated scanner status while preserving web search. +- The web-search route no longer falls back to the raw current message when the DLP-safe query text is empty. + +Admin settings are added in Admin Settings under Data Loss Prevention: + +- Shared DLP enablement, engine selection, configurable regex rules, optional Presidio Analyzer endpoint settings, maximum scan characters, scanner fail-closed behavior, telemetry, and review destination. +- Web-search DLP enablement and mode: `monitor`, `redact`, or `block`. +- Review destination defaults to `none`. Safety Violations review routing is documented as a future integration unless the review surface is expanded with distinct DLP labeling and access rules. + +## Usage + +1. Open Admin Settings. +2. Enable Data Loss Prevention. +3. Enable Web Search DLP. +4. Review or edit Custom Regex Rules. +5. Choose a mode: + - `monitor`: detect and emit safe telemetry while preserving web search. Oversized text records `scanner_status = truncated`. + - `redact`: replace detected structured identifiers before web search. Oversized text is blocked instead of partially redacted. 
+ - `block`: skip web search when DLP detects configured sensitive content or when text exceeds the scan limit. + +User-visible status messages: + +- Blocked: `Web search was blocked because the message appears to contain non-public information.` +- Redacted: `Sensitive details were removed before web search.` + +These messages do not include raw values, snippets, recognizer names, scores, or policy identifiers. + +## Configurable Regex Rules + +The MVP DLP engine uses admin-configurable regex rules. Default rules detect U.S. Social Security numbers and Luhn-valid credit card numbers. Generic internal phrase blocking is not hardcoded; administrators can add organization-specific rules when those phrases are meaningful in their environment. + +Each rule can define: + +- entity type and replacement label +- allowed surfaces (`web_search`, `upload`) +- optional `luhn` validation +- keyword proximity confidence shaping +- minimum confidence required before redaction or blocking + +Confidence shaping lets a regex match become stronger when nearby terms are present. For example, an employee identifier rule can require `EID-123456` plus `employee` within 32 characters before it redacts. + +Regex DLP remains deterministic and dependency-light, but it is not equivalent to Presidio. Use the external Presidio Analyzer endpoint when contextual PII detection, such as names, addresses, and natural-language identifiers, is required. + +## External Presidio Analyzer Endpoint + +Administrators can select an external Presidio Analyzer-compatible endpoint as the DLP engine by setting the engine to `presidio_endpoint`. SimpleChat sends the web-search query text to the endpoint from the server side, receives entity spans, and performs redaction or blocking locally using the existing counts-only DLP result shape. + +This is Option C for Presidio integration: the analyzer is external to SimpleChat. 
SimpleChat keeps no embedded Presidio dependency, model package, or analyzer runtime in the app image. Regex DLP remains available as the default and fallback path. + +Production deployments should keep the analyzer endpoint private and authenticated. Use a private network path plus an API key header or equivalent service boundary, and never expose a public unauthenticated Presidio Analyzer endpoint. SimpleChat permits unauthenticated Presidio calls only for localhost development endpoints; any non-loopback endpoint requires the configured secret environment variable to resolve before raw text is sent. SimpleChat stores only the configured secret environment variable name, such as `PRESIDIO_DLP_API_KEY`; the API key value belongs in App Service settings or a Key Vault reference. Custom auth header names are validated and reserved HTTP headers such as `Host`, `Content-Type`, and `Connection` are rejected. + +Because the analyzer receives raw text before redaction, SimpleChat, proxies, wrappers, and analyzer infrastructure must not log raw request bodies, response bodies, snippets, matched values, or analyzer explanations. Safe telemetry remains limited to entity types, counts, actions, engines, modes, and scanner status. + +## Telemetry + +Telemetry dimensions are bounded and safe: + +- `activity_type = dlp_decision` +- `dlp_surface = web_search` +- `dlp_action` +- `dlp_engine` +- `dlp_mode` +- `workspace_scope` +- `scanner_status` +- `dlp_total_replacements` +- `dlp_entity_counts` + +Raw prompts, web-search queries, snippets, raw matched values, and filenames are excluded. 
+ +Example Azure Monitor alert concepts: + +```kusto +customEvents +| where name has "DLP" or tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.dlp_surface) == "web_search" +| where tostring(customDimensions.dlp_action) == "block" +| summarize blocks=count() by bin(timestamp, 15m) +``` + +```kusto +customEvents +| where tostring(customDimensions.activity_type) == "dlp_decision" +| where tostring(customDimensions.scanner_status) != "ok" +| summarize scanner_errors=count() by bin(timestamp, 15m), tostring(customDimensions.dlp_engine) +``` + +## Review And Retention + +The implemented default is `dlp_review_destination = none`; DLP findings are not written to the Safety Violations review area by default. Review summary helpers return distinct `policy_type` values such as `dlp_web_search` and counts-only entity metadata for future integration. + +Telemetry retention follows the configured Application Insights workspace. This PR does not create a dedicated DLP storage container or store raw DLP matches. + +## Limitations + +Regex DLP is intentionally lightweight. It is useful for structured identifiers such as SSNs, Luhn-valid credit card numbers, and administrator-defined exact-format identifiers, but it is weaker for names, addresses, contextual PII, international identifiers, secrets, and noisy prose. Use the external Presidio Analyzer endpoint when richer recognizers are needed and the production analyzer can be kept private, authenticated, and free of raw text logging. + +The app-level control cannot inspect Bing's internal grounding query after Foundry receives the request. It reduces egress risk by preventing or redacting sensitive text before the app sends the web-search message to the Foundry agent. 
+ +## Testing And Validation + +Functional coverage: + +- `functional_tests/test_dlp_control_plane.py` +- `functional_tests/test_dlp_regex_rules.py` +- `functional_tests/test_dlp_telemetry.py` +- `functional_tests/test_dlp_admin_settings_ui.py` +- `functional_tests/test_dlp_admin_settings_roundtrip.py` +- `functional_tests/test_dlp_presidio_endpoint.py` +- `functional_tests/test_dlp_presidio_engine_integration.py` +- `functional_tests/test_dlp_review_events.py` +- `functional_tests/test_web_search_dlp_egress.py` +- `functional_tests/test_web_search_dlp_route_integration.py` + +Validated with Docker Python 3.12: + +- `python -m compileall application/single_app` +- The PR-specific functional tests above. + +Additional review-readiness validation: + +- `tools/local_dev/run_dlp_local_stack.md` documents a local Cosmos emulator smoke flow for the DLP admin UI. +- `tools/local_dev/render_dlp_admin_preview.py` renders collapsed and expanded DLP admin section previews from the real Jinja template without storing sensitive sample values. diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index 02424981..f46f8511 100644 --- a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -1,9 +1,63 @@ -This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.071, and the per-version entries continue immediately after it. +This page tracks notable Simple Chat releases and organizes the detailed change log by version. The timeline below provides a quick visual overview of the current release progression through v0.242.075, and the per-version entries continue immediately after it. For feature-focused and fix-focused drill-downs by version, see [Features by Version](/explanation/features/) and [Fixes by Version](/explanation/fixes/). 
+### **(v0.242.075)** + +#### Bug Fixes + +* **Presidio Endpoint Authentication Guardrails** + * Requires non-loopback Presidio Analyzer endpoints to resolve the configured env-backed auth secret before SimpleChat sends raw scan text. + * Keeps unauthenticated Presidio calls limited to localhost development endpoints. + * Validates custom Presidio auth header names and rejects reserved HTTP headers such as `Host`, `Content-Type`, and `Connection`. + * (Ref: Presidio endpoint auth headers, DLP admin settings, Presidio deployment guidance) + +* **Presidio Endpoint Runtime Safety Hardening** + * Rejects Presidio Analyzer hostnames whose DNS answers include loopback, link-local, private, or otherwise non-global addresses unless the exact endpoint host is explicitly allowlisted. + * Normalizes untrusted Presidio entity labels to safe uppercase identifiers before redaction output, match counts, match summaries, and telemetry are built. + * Strips Presidio endpoint and private-host settings from non-admin settings sanitization output. + * (Ref: Presidio endpoint DNS validation, DLP entity label normalization, user settings sanitization) + +* **Presidio Endpoint Configuration Hardening** + * Rejects Presidio Analyzer endpoint URLs with userinfo, fragments, credential-like query parameters, or private/link-local/loopback hosts that are not explicitly allowlisted. + * Adds an admin-configurable `Allowed Private Hosts` allowlist for private Presidio deployments and preserves only validated endpoint settings. + * Disables redirects for Presidio analyzer calls and treats redirect responses as scanner errors under the existing fail-open/fail-closed policy. + * Restricts Presidio auth secret environment variable names to blank, `PRESIDIO_DLP_API_KEY`, or the `DLP_PRESIDIO_` namespace. 
+ * (Ref: Presidio endpoint URL validation, Admin Settings DLP controls, Presidio deployment how-to) + +#### New Features + +* **External Presidio DLP Endpoint** + * Added optional support for a Presidio Analyzer-compatible endpoint as an advanced DLP engine without embedding Presidio packages in the SimpleChat app image. + * Added server-side endpoint calls with configurable timeout, score threshold, entity allowlist, and API-key header name sourced from an environment variable. + * Reused existing DLP monitor, redact, block, counts-only telemetry, upload, and web-search enforcement behavior. + * Documented local Docker smoke testing and production private-network plus API-key deployment patterns. + * (Ref: external Presidio DLP endpoint, Admin Settings DLP controls, Presidio deployment how-to) + +### **(v0.242.073)** + +#### New Features + +* **Configurable DLP Control Plane** + * Added admin-configurable regex DLP rules with bounded regex execution, optional Luhn validation, and keyword-proximity confidence shaping. + * Added web-search egress enforcement and upload-ingestion enforcement so administrators can monitor, redact, or block configured sensitive content before it leaves SimpleChat or enters embeddings/search indexing. + * Kept default rules intentionally narrow with U.S. SSN and Luhn-valid credit-card detection only. + * (Ref: configurable DLP rules, web-search DLP egress, upload DLP ingestion, Admin Settings DLP controls) + +#### Bug Fixes + +* **DLP Egress Bypass Closure** + * Applies DLP checks to Deep Research planned web-search queries immediately before outbound search. + * Prevents batch upload indexing from reintroducing raw vision text after DLP redaction. 
+ * (Ref: Deep Research web-search DLP, upload DLP indexing redaction) + +* **Upload DLP Enforcement Edge Cases** + * Treats fail-on-match, fail-closed scanner errors, and truncated scans as enforced upload DLP paths when deciding whether content may be indexed or retained for enhanced citations. + * Sanitizes selected upload metadata before prompts, Search payloads, Cosmos updates, and logs while preserving counts-only DLP telemetry summaries. + * (Ref: upload DLP redaction, scanner failure handling, enhanced-citation safety) + ### **(v0.242.071)** #### New Features @@ -3434,4 +3488,4 @@ We introduced a robust user feedback system, expanded content-safety features fo 7. **App Roles & Enterprise Application** - Provides a robust way to control user access at scale. - - Admins can assign roles to new users or entire Azure AD groups. \ No newline at end of file + - Admins can assign roles to new users or entire Azure AD groups. diff --git a/docs/how-to/deploy_presidio_dlp.md b/docs/how-to/deploy_presidio_dlp.md new file mode 100644 index 00000000..22bfb77f --- /dev/null +++ b/docs/how-to/deploy_presidio_dlp.md @@ -0,0 +1,83 @@ +# Deploy External Presidio DLP + +SimpleChat can use Option C for richer DLP detection: call an external Presidio Analyzer-compatible HTTP endpoint from the server side while keeping Presidio out of the SimpleChat application image. SimpleChat does not embed Presidio packages, models, or recognizers; it sends text to an administrator-managed analyzer endpoint, receives spans, and applies its existing monitor, redact, or block behavior locally. + +## Recommended Production Shape + +Run the Presidio Analyzer-compatible service as sensitive internal infrastructure. The analyzer receives raw text before SimpleChat redacts it, so production deployments need both network and application controls. + +Required controls: + +- Use a private network path between SimpleChat and the analyzer. 
+- Require an API key header, usually `X-DLP-API-Key`, at a proxy, wrapper, gateway, or service boundary in front of the analyzer. +- Use HTTPS for every non-local endpoint. +- Do not expose a public unauthenticated Presidio Analyzer endpoint. +- Do not log raw request text, response bodies, snippets, or matched values in SimpleChat, the analyzer wrapper, reverse proxies, or platform diagnostics. +- Keep fail-closed scanner behavior enabled for protected upload and web-search paths when policy requires blocking on scanner errors. + +## SimpleChat Settings + +Configure these values in Admin Settings > Data Loss Prevention: + +- Default Engine: `External Presidio Analyzer endpoint` +- Analyzer Endpoint: `https://<private-analyzer-host>/analyze` +- Allowed Private Hosts: `<private-analyzer-host>` +- Auth Header: `X-DLP-API-Key` +- Secret Env Var: `PRESIDIO_DLP_API_KEY` +- Timeout Seconds: `5` +- Score Threshold: `0.5` +- Entities: `CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN` + +SimpleChat stores only the environment variable name in its admin settings, such as `PRESIDIO_DLP_API_KEY`. The API key value itself must live in the SimpleChat App Service application settings or in a Key Vault reference used by that App Service setting. Do not paste raw API key values into SimpleChat admin settings or Cosmos-backed configuration. Secret environment variable names are intentionally limited to blank, `PRESIDIO_DLP_API_KEY`, or names beginning with `DLP_PRESIDIO_`. + +Endpoint URLs must use strict URL hygiene. Do not include usernames, passwords, fragments, or credential-like query parameters such as `key`, `api_key`, `secret`, `token`, `password`, `connection`, or `sig`. Public HTTPS endpoints are accepted after these checks. Private, loopback, link-local, or internal-style hosts must also appear in `Allowed Private Hosts` as comma- or newline-separated hostnames or IP addresses. SimpleChat disables HTTP redirects when calling the analyzer and treats redirect responses as analyzer errors.
+ +## Local Docker Smoke Test + +Run the stock Presidio Analyzer container locally: + +```bash +docker run --rm -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest +``` + +Configure SimpleChat for a smoke test: + +```text +Default Engine: External Presidio Analyzer endpoint +Analyzer Endpoint: http://localhost:5002/analyze +Allowed Private Hosts: localhost +Auth Header: X-DLP-API-Key +Secret Env Var: PRESIDIO_DLP_API_KEY +Score Threshold: 0.5 +Entities: CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN +``` + +The stock local container does not require an API key. You can leave `PRESIDIO_DLP_API_KEY` unset for this local smoke path, or set it to any placeholder value while testing the SimpleChat configuration surface. Production deployments should add an authenticated proxy, wrapper, or service boundary before enabling the endpoint for protected traffic. + +Test with harmless synthetic content such as `a@example.com`. In `redact` mode, SimpleChat should call the analyzer, receive entity spans, and replace the detected value before web-search egress or upload indexing. In `block` mode, the same finding should prevent the protected action. + +## Separate Azure App Service + +Deploy the Presidio Analyzer-compatible container as a separate Linux Web App for Containers. Restrict ingress with private endpoints, virtual network integration, and access restrictions so only the SimpleChat environment can reach it. Add the analyzer hostname or private IP to SimpleChat's `Allowed Private Hosts` setting. If the analyzer endpoint is reachable beyond localhost, place an API-key-validating proxy or wrapper in front of it and configure SimpleChat to send the configured auth header. + +Use this shape when you want independent deployment and operational ownership for the analyzer while still running on App Service. 
Store the API key value as a SimpleChat App Service setting named by the SimpleChat admin setting, for example `PRESIDIO_DLP_API_KEY`, preferably backed by a Key Vault reference. + +## App Service Sidecar + +For deployments using App Service sidecar support, run the analyzer as a sidecar container next to SimpleChat and configure SimpleChat to call the sidecar endpoint over the local or private container network. Add the sidecar hostname, loopback host, or private IP to `Allowed Private Hosts`. This keeps Presidio dependencies out of the SimpleChat image while scaling the analyzer with the SimpleChat App Service instance count. + +Even with a sidecar, avoid raw text logging and keep the analyzer endpoint unreachable from the public internet. If the sidecar is fronted by a local wrapper, validate the `X-DLP-API-Key` or equivalent header there. + +## Azure Container Apps + +For independent scaling, deploy the analyzer as an internal Azure Container Apps service. Configure SimpleChat to reach the internal ingress URL over private networking, add that internal host to `Allowed Private Hosts`, and require the API key header at the Container Apps ingress, gateway, or wrapper service. + +This shape works well when analyzer CPU or model requirements scale differently from SimpleChat. Store the API key value in the SimpleChat App Service setting or Key Vault reference named by SimpleChat's `Secret Env Var` setting, not in the SimpleChat admin configuration. + +## Security Notes + +The analyzer receives raw user text, extracted document text, and selected metadata before SimpleChat applies redaction. Treat the endpoint as sensitive infrastructure with the same care as an internal document-processing service. + +Do not log raw request bodies, response bodies, matched values, or analyzer explanations. SimpleChat's DLP telemetry and stored metadata should remain counts-only. 
If you add a gateway, proxy, or wrapper around Presidio Analyzer, disable body logging and scrub diagnostics before sending them to centralized logs. + +Use `presidio_endpoint` only when the endpoint is private, authenticated, and operated by the same trust boundary that is allowed to process the source text. Keep regex DLP as the lightweight default and fallback path when the external analyzer is not configured. diff --git a/functional_tests/test_dlp_admin_settings_roundtrip.py b/functional_tests/test_dlp_admin_settings_roundtrip.py new file mode 100644 index 00000000..c43e5b85 --- /dev/null +++ b/functional_tests/test_dlp_admin_settings_roundtrip.py @@ -0,0 +1,317 @@ +# test_dlp_admin_settings_roundtrip.py +#!/usr/bin/env python3 +""" +Functional test for DLP admin settings roundtrip. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures DLP admin settings are normalized, persisted, and rendered +through the admin settings POST contract without requiring live Azure services. 
+""" + +import os +import sys +import ast +from pathlib import Path + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) +ADMIN_ROUTE_FILE = os.path.join(APP_DIR, "route_frontend_admin_settings.py") +ADMIN_TEMPLATE_FILE = os.path.join(APP_DIR, "templates", "admin_settings.html") +FUNCTIONS_SETTINGS_FILE = os.path.join(APP_DIR, "functions_settings.py") +ADMIN_TEMPLATE = Path(ADMIN_TEMPLATE_FILE) + + +NORMALIZED_ASSIGNMENTS = [ + "dlp_max_scan_chars = max(1000, dlp_max_scan_chars)", + "if dlp_default_engine not in ('regex', 'presidio_endpoint'):", + "dlp_default_engine = 'regex'", + "dlp_presidio_timeout_seconds = max(1, min(30, dlp_presidio_timeout_seconds))", + "dlp_presidio_score_threshold = max(0.0, min(1.0, dlp_presidio_score_threshold))", + "if web_search_dlp_mode not in ('monitor', 'redact', 'block'):", + "web_search_dlp_mode = 'monitor'", + "if dlp_review_destination not in ('none',):", + "dlp_review_destination = 'none'", +] + + +PERSISTED_DLP_FIELDS = { + "enable_dlp_control_plane": "form_data.get('enable_dlp_control_plane') == 'on'", + "dlp_default_engine": "dlp_default_engine", + "dlp_regex_rules": "normalized_dlp_regex_rules", + "dlp_max_scan_chars": "dlp_max_scan_chars", + "dlp_fail_closed_on_scanner_error": "form_data.get('dlp_fail_closed_on_scanner_error') == 'on'", + "dlp_audit_level": "'counts_only'", + "dlp_enable_structured_telemetry": "form_data.get('dlp_enable_structured_telemetry') == 'on'", + "dlp_telemetry_sample_allow_events": "form_data.get('dlp_telemetry_sample_allow_events') == 'on'", + "dlp_review_destination": "dlp_review_destination", + "enable_web_search_dlp": "form_data.get('enable_web_search_dlp') == 'on'", + "web_search_dlp_mode": "web_search_dlp_mode", + "enable_upload_dlp": "form_data.get('enable_upload_dlp') == 'on'", + "upload_dlp_mode": "upload_dlp_mode", + "upload_dlp_fail_upload_on_match": 
"form_data.get('upload_dlp_fail_upload_on_match') == 'on'", +} + + +UNSUPPORTED_DLP_FORM_FIELDS = [ + "dlp_presidio_use_service", + "dlp_scanner_timeout_seconds", + "dlp_review_include_redacted_preview", + "web_search_dlp_track_review_events", + "upload_dlp_track_review_events", +] + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def load_sanitize_settings_for_user(): + """Load the sanitizer function without importing optional app dependencies.""" + source = read_file_text(FUNCTIONS_SETTINGS_FILE) + tree = ast.parse(source, filename=FUNCTIONS_SETTINGS_FILE) + function_node = next( + node for node in tree.body + if isinstance(node, ast.FunctionDef) and node.name == "sanitize_settings_for_user" + ) + module = ast.Module(body=[function_node], type_ignores=[]) + ast.fix_missing_locations(module) + namespace = {} + exec(compile(module, FUNCTIONS_SETTINGS_FILE, "exec"), namespace) + return namespace["sanitize_settings_for_user"] + + +def assert_no_retired_structured_redaction_control(source, source_name): + """Retired structured-redaction controls should not appear in admin DLP sources.""" + redaction_prefix = "web_search_dlp_redact" + for line_number, line in enumerate(source.splitlines(), start=1): + normalized = line.lower() + has_retired_prefix = redaction_prefix in normalized + has_structured_identifier_wording = "structured" in normalized and "identifier" in normalized + assert not (has_retired_prefix and has_structured_identifier_wording), ( + f"Retired structured-redaction DLP control remains in {source_name}:{line_number}" + ) + + +def test_dlp_admin_post_normalizes_untrusted_form_values(): + """Admin POST should clamp numeric inputs and fail closed on enum-like fields.""" + print("Testing DLP admin POST normalization...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + for snippet in NORMALIZED_ASSIGNMENTS: + assert snippet in route_source, f"Missing DLP normalization contract: 
{snippet}" + + assert "safe_int_with_source(" in route_source + + for field_name in UNSUPPORTED_DLP_FORM_FIELDS: + assert f"form_data.get('{field_name}'" not in route_source, ( + f"Admin route still accepts unsupported DLP form field: {field_name}" + ) + assert_no_retired_structured_redaction_control(route_source, ADMIN_ROUTE_FILE) + + +def test_dlp_admin_post_persists_normalized_dlp_payload(): + """Admin POST should persist normalized values, not raw form strings.""" + print("Testing DLP admin POST persistence payload...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + for field_name, expected_value in PERSISTED_DLP_FIELDS.items(): + expected_mapping = f"'{field_name}': {expected_value}" + assert expected_mapping in route_source, f"Missing DLP persistence mapping: {expected_mapping}" + + +def test_dlp_admin_template_roundtrips_persisted_values(): + """Admin template should render the same fields that POST persists.""" + print("Testing DLP admin template roundtrip controls...") + template_source = read_file_text(ADMIN_TEMPLATE_FILE) + + for field_name in PERSISTED_DLP_FIELDS: + if field_name == "dlp_regex_rules": + assert 'id="dlp_regex_rules_json"' in template_source + assert 'name="dlp_regex_rules_json"' in template_source + else: + assert ( + f'id="{field_name}"' in template_source or f'name="{field_name}"' in template_source + ), f"Missing DLP admin control: {field_name}" + + assert 'id="dlp_control_plane_settings"' in template_source + assert 'id="web_search_dlp_mode_settings"' in template_source + + for field_name in UNSUPPORTED_DLP_FORM_FIELDS: + assert field_name not in template_source, f"Unsupported DLP control still rendered: {field_name}" + assert 'id="dlp_presidio_endpoint"' not in template_source + assert 'name="dlp_presidio_endpoint"' not in template_source + assert_no_retired_structured_redaction_control(template_source, ADMIN_TEMPLATE_FILE) + + +def test_dlp_review_destination_stays_unreachable_until_review_flow_exists(): + """Review records 
should stay disabled until a reachable review destination is implemented.""" + print("Testing DLP review destination fail-closed behavior...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + template_source = read_file_text(ADMIN_TEMPLATE_FILE) + + assert "if dlp_review_destination not in ('none',):" in route_source + assert "dlp_review_destination = 'none'" in route_source + assert 'value="safety_violations"' not in template_source + + +def test_admin_dlp_controls_expose_supported_dlp_engines(): + template = ADMIN_TEMPLATE.read_text(encoding="utf-8") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + assert 'value="regex"' in template + assert "if dlp_default_engine not in ('regex', 'presidio_endpoint'):" in route_source + assert 'name="dlp_regex_rules_json"' in template + assert "web_search_dlp_block_on_internal_phrases" not in template + assert "Detect internal phrases" not in template + assert_no_retired_structured_redaction_control(template, str(ADMIN_TEMPLATE)) + + for field_name in UNSUPPORTED_DLP_FORM_FIELDS: + assert f"'{field_name}':" not in route_source, f"Unsupported DLP field still persisted: {field_name}" + assert_no_retired_structured_redaction_control(route_source, ADMIN_ROUTE_FILE) + + +def test_admin_settings_post_validates_csrf_before_dlp_persistence(): + """Admin settings POST should validate CSRF before persisting security-sensitive DLP fields.""" + print("Testing admin settings CSRF validation ordering...") + source = read_file_text(ADMIN_ROUTE_FILE) + + post_index = source.find("if request.method == 'POST':") + form_index = source.find("form_data = request.form", post_index) + csrf_index = source.find("if not _validate_admin_settings_csrf_token(form_data):", form_index) + persist_index = source.find("'enable_dlp_control_plane': form_data.get('enable_dlp_control_plane') == 'on'", form_index) + + assert post_index != -1 + assert form_index > post_index + assert csrf_index > form_index + assert persist_index > csrf_index + assert 
"secrets.compare_digest" in source + assert "ADMIN_SETTINGS_CSRF_SESSION_KEY" in source + + +def test_admin_settings_persists_valid_dlp_regex_rules(): + """Admin settings should persist normalized configurable regex rules.""" + print("Testing admin regex rule persistence...") + source = read_file_text(ADMIN_ROUTE_FILE) + + assert "dlp_regex_rules_json" in source + assert "validate_dlp_regex_rules" in source + assert "'dlp_regex_rules': normalized_dlp_regex_rules" in source + + +def test_admin_settings_rejects_invalid_dlp_regex_rules_before_update(): + """Invalid DLP regex rules should be rejected before update_settings.""" + print("Testing invalid admin regex rule rejection ordering...") + source = read_file_text(ADMIN_ROUTE_FILE) + + parse_index = source.find("raw_dlp_regex_rules = form_data.get('dlp_regex_rules_json'") + validate_index = source.find("validate_dlp_regex_rules", parse_index) + update_index = source.find("if update_settings(new_settings):", validate_index) + + assert parse_index != -1 + assert validate_index > parse_index + assert update_index > validate_index + assert "return redirect(url_for('admin_settings'))" in source[validate_index:update_index] + + +def test_presidio_endpoint_settings_are_normalized_without_secret_persistence(): + """Admin POST should persist endpoint metadata but not raw API key values.""" + print("Testing Presidio endpoint metadata persistence...") + route_source = read_file_text(ADMIN_ROUTE_FILE) + + assert "'dlp_default_engine': dlp_default_engine" in route_source + assert "'dlp_presidio_analyzer_endpoint': dlp_presidio_analyzer_endpoint" in route_source + assert "'dlp_presidio_allowed_private_hosts': dlp_presidio_allowed_private_hosts" in route_source + assert "'dlp_presidio_auth_header_name': dlp_presidio_auth_header_name" in route_source + assert "'dlp_presidio_auth_secret_env_var': dlp_presidio_auth_secret_env_var" in route_source + assert "'dlp_presidio_timeout_seconds': dlp_presidio_timeout_seconds" in route_source 
+ assert "'dlp_presidio_score_threshold': dlp_presidio_score_threshold" in route_source + assert "'dlp_presidio_language': dlp_presidio_language" in route_source + assert "'dlp_presidio_entities': dlp_presidio_entities" in route_source + assert "for item in dlp_presidio_entities_raw.split(',')" in route_source + assert "item.strip().upper()" in route_source + assert "if not dlp_presidio_entities:" in route_source + assert "dlp_presidio_entities = ['CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'US_SSN']" in route_source + assert "validate_presidio_endpoint_url(" in route_source + assert "normalize_presidio_allowed_private_hosts(" in route_source + assert "normalize_presidio_auth_header_name(" in route_source + assert "normalize_presidio_secret_env_var_name(" in route_source + assert "settings.get('dlp_presidio_analyzer_endpoint', '')" in route_source + assert "'dlp_presidio_auth_secret'" not in route_source + assert "form_data.get('dlp_presidio_auth_secret'" not in route_source + + +def test_default_settings_include_presidio_endpoint_controls(): + """Default settings should include safe Presidio endpoint defaults.""" + print("Testing Presidio endpoint default settings...") + settings_source = read_file_text(FUNCTIONS_SETTINGS_FILE) + + assert "'dlp_default_engine': 'regex'" in settings_source + assert "'dlp_presidio_analyzer_endpoint': ''" in settings_source + assert "'dlp_presidio_allowed_private_hosts': ''" in settings_source + assert "'dlp_presidio_auth_header_name': 'X-DLP-API-Key'" in settings_source + assert "'dlp_presidio_auth_secret_env_var': 'PRESIDIO_DLP_API_KEY'" in settings_source + assert "'dlp_presidio_timeout_seconds': 5" in settings_source + assert "'dlp_presidio_score_threshold': 0.5" in settings_source + assert "'dlp_presidio_language': 'en'" in settings_source + assert "'dlp_presidio_entities': [" in settings_source + assert "'dlp_presidio_auth_secret'" not in settings_source + + +def 
test_user_settings_sanitization_strips_presidio_endpoint_controls(): + """Non-admin settings rendering should not expose Presidio endpoints or private host topology.""" + print("Testing Presidio endpoint user settings sanitization...") + sanitize_settings_for_user = load_sanitize_settings_for_user() + + sanitized = sanitize_settings_for_user( + { + "enable_dlp_control_plane": True, + "dlp_default_engine": "presidio_endpoint", + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal, 10.0.0.5", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + "nested": { + "dlp_presidio_analyzer_endpoint": "https://nested-presidio.internal/analyze", + "safe": "visible", + }, + } + ) + + assert "dlp_presidio_analyzer_endpoint" not in sanitized + assert "dlp_presidio_allowed_private_hosts" not in sanitized + assert "dlp_presidio_auth_header_name" not in sanitized + assert "dlp_presidio_auth_secret_env_var" not in sanitized + assert "dlp_presidio_analyzer_endpoint" not in sanitized["nested"] + assert sanitized["nested"]["safe"] == "visible" + + +if __name__ == "__main__": + tests = [ + test_dlp_admin_post_normalizes_untrusted_form_values, + test_dlp_admin_post_persists_normalized_dlp_payload, + test_dlp_admin_template_roundtrips_persisted_values, + test_dlp_review_destination_stays_unreachable_until_review_flow_exists, + test_admin_dlp_controls_expose_supported_dlp_engines, + test_admin_settings_post_validates_csrf_before_dlp_persistence, + test_admin_settings_persists_valid_dlp_regex_rules, + test_admin_settings_rejects_invalid_dlp_regex_rules_before_update, + test_presidio_endpoint_settings_are_normalized_without_secret_persistence, + test_default_settings_include_presidio_endpoint_controls, + test_user_settings_sanitization_strips_presidio_endpoint_controls, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP admin 
settings roundtrip tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_admin_settings_ui.py b/functional_tests/test_dlp_admin_settings_ui.py new file mode 100644 index 00000000..a1c47337 --- /dev/null +++ b/functional_tests/test_dlp_admin_settings_ui.py @@ -0,0 +1,242 @@ +# test_dlp_admin_settings_ui.py +#!/usr/bin/env python3 +""" +Functional test for DLP admin settings UI. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures shared and web-search DLP defaults exist, admin settings +persist supported controls, the admin template exposes only implemented controls, +and new DLP JavaScript uses Bootstrap d-none instead of JavaScript display toggles. +""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +SETTINGS_FILE = os.path.join(ROOT_DIR, "application", "single_app", "functions_settings.py") +ADMIN_ROUTE_FILE = os.path.join(ROOT_DIR, "application", "single_app", "route_frontend_admin_settings.py") +ADMIN_TEMPLATE_FILE = os.path.join(ROOT_DIR, "application", "single_app", "templates", "admin_settings.html") +ADMIN_JS_FILE = os.path.join(ROOT_DIR, "application", "single_app", "static", "js", "admin", "admin_settings.js") + + +REQUIRED_KEYS = [ + "enable_dlp_control_plane", + "dlp_default_engine", + "dlp_regex_rules", + "dlp_max_scan_chars", + "dlp_fail_closed_on_scanner_error", + "dlp_audit_level", + "dlp_enable_structured_telemetry", + "dlp_telemetry_sample_allow_events", + "dlp_review_destination", + "enable_web_search_dlp", + "web_search_dlp_mode", + "enable_upload_dlp", + "upload_dlp_mode", + "upload_dlp_fail_upload_on_match", +] + + +UNSUPPORTED_ADMIN_CONTROL_IDS = [ + "dlp_presidio_use_service", + "dlp_presidio_service_settings", + "dlp_scanner_timeout_seconds", + "dlp_review_include_redacted_preview", + "web_search_dlp_track_review_events", + 
"upload_dlp_track_review_events", +] + +PRESIDIO_ENDPOINT_CONTROL_IDS = [ + "dlp_presidio_endpoint_settings", + "dlp_presidio_analyzer_endpoint", + "dlp_presidio_allowed_private_hosts", + "dlp_presidio_auth_header_name", + "dlp_presidio_auth_secret_env_var", + "dlp_presidio_timeout_seconds", + "dlp_presidio_score_threshold", + "dlp_presidio_language", + "dlp_presidio_entities", +] + + +RETIRED_DLP_SETTING_KEYS = [ + "dlp_presidio_use_service", + "dlp_presidio_endpoint", + "dlp_scanner_timeout_seconds", + "dlp_review_include_redacted_preview", + "web_search_dlp_track_review_events", + "upload_dlp_track_review_events", +] + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def assert_no_retired_structured_redaction_control(source, source_name): + """Retired structured-redaction controls should not appear in admin DLP sources.""" + redaction_prefix = "web_search_dlp_redact" + for line_number, line in enumerate(source.splitlines(), start=1): + normalized = line.lower() + has_retired_prefix = redaction_prefix in normalized + has_structured_identifier_wording = "structured" in normalized and "identifier" in normalized + assert not (has_retired_prefix and has_structured_identifier_wording), ( + f"Retired structured-redaction DLP control remains in {source_name}:{line_number}" + ) + + +def test_dlp_defaults_exist_and_are_safe(): + """Defaults should include shared/web-search DLP and keep review disabled.""" + print("Testing DLP defaults...") + source = read_file_text(SETTINGS_FILE) + + for key in REQUIRED_KEYS: + assert f"'{key}'" in source, f"Missing DLP default setting: {key}" + + assert "'dlp_review_destination': 'none'" in source + assert "'enable_web_search_dlp': False" in source + assert "raw_matches" not in source + + for key in RETIRED_DLP_SETTING_KEYS: + assert f"'{key}'" not in source, f"Retired DLP default setting remains: {key}" + assert_no_retired_structured_redaction_control(source, 
SETTINGS_FILE) + + +def test_admin_route_persists_dlp_settings(): + """Admin settings route should persist all PR1 DLP fields.""" + print("Testing DLP admin route persistence...") + source = read_file_text(ADMIN_ROUTE_FILE) + + for key in REQUIRED_KEYS: + assert key in source, f"Admin route does not persist or normalize {key}" + + +def test_admin_template_exposes_dlp_controls(): + """Admin UI should expose supported shared and web-search DLP controls.""" + print("Testing DLP admin template controls...") + source = read_file_text(ADMIN_TEMPLATE_FILE) + + assert "Data Loss Prevention" in source + assert 'id="dlp_control_plane_settings"' in source + assert 'id="web_search_dlp_settings"' in source + for key in REQUIRED_KEYS: + if key == "dlp_regex_rules": + assert 'id="dlp_regex_rules_json"' in source + assert 'name="dlp_regex_rules_json"' in source + else: + assert f'id="{key}"' in source or f'name="{key}"' in source, f"Missing DLP control: {key}" + + assert 'value="none"' in source + assert 'value="safety_violations"' not in source, ( + "Safety Violations destination should stay hidden unless PR1 implements reachable review integration" + ) + assert 'value="regex"' in source + assert 'value="presidio_endpoint"' in source + assert "Regex structured identifier scan" in source + assert "External Presidio Analyzer endpoint" in source + assert "Use regex for lightweight built-in scanning" in source + assert "Custom Regex Rules" in source + assert "{{ dlp_regex_rules_json }}" in source + assert "web_search_dlp_block_on_internal_phrases" not in source + assert "Detect internal phrases" not in source + + for unsupported_id in UNSUPPORTED_ADMIN_CONTROL_IDS: + assert unsupported_id not in source, f"Unsupported DLP control is still visible: {unsupported_id}" + + assert_no_retired_structured_redaction_control(source, ADMIN_TEMPLATE_FILE) + + +def test_presidio_endpoint_controls_are_rendered_without_secret_value_field(): + """DLP admin UI should configure endpoint metadata but 
not store raw API keys.""" + print("Testing Presidio endpoint admin controls...") + source = read_file_text(ADMIN_TEMPLATE_FILE) + + for control_id in PRESIDIO_ENDPOINT_CONTROL_IDS: + assert f'id="{control_id}"' in source, f"Missing Presidio endpoint control: {control_id}" + + assert 'name="dlp_presidio_analyzer_endpoint"' in source + assert 'name="dlp_presidio_allowed_private_hosts"' in source + assert 'name="dlp_presidio_auth_header_name"' in source + assert 'name="dlp_presidio_auth_secret_env_var"' in source + assert 'name="dlp_presidio_timeout_seconds"' in source + assert 'name="dlp_presidio_score_threshold"' in source + assert 'name="dlp_presidio_language"' in source + assert 'name="dlp_presidio_entities"' in source + assert 'name="dlp_presidio_auth_secret"' not in source + assert "private presidio endpoint with an env-backed api key" in source.lower() + assert "localhost endpoints may run without auth for local testing only" in source.lower() + assert "connection and content headers are rejected" in source.lower() + + +def test_admin_js_uses_d_none_for_dlp_toggles(): + """New DLP JS should use Bootstrap d-none, not style.display.""" + print("Testing DLP admin JavaScript visibility handling...") + source = read_file_text(ADMIN_JS_FILE) + + assert "initializeDlpSettings" in source + assert "dlp_control_plane_settings" in source + assert "web_search_dlp_settings" in source + assert "dlp_presidio_endpoint_settings" in source + assert "presidio_endpoint" in source + assert "classList.toggle('d-none'" in source or 'classList.toggle("d-none"' in source + + dlp_section = source[source.find("initializeDlpSettings"):] + assert ".style.display" not in dlp_section + + for unsupported_id in UNSUPPORTED_ADMIN_CONTROL_IDS: + assert unsupported_id not in dlp_section, f"Unsupported DLP JS hook remains: {unsupported_id}" + assert_no_retired_structured_redaction_control(dlp_section, ADMIN_JS_FILE) + + +def test_admin_settings_form_contains_csrf_token(): + """Admin settings form 
should submit a per-session CSRF token.""" + print("Testing admin settings CSRF template field...") + template = read_file_text(ADMIN_TEMPLATE_FILE) + + form_index = template.find('id="admin-settings-form"') + token_index = template.find('name="admin_settings_csrf_token"', form_index) + value_index = template.find('value="{{ admin_settings_csrf_token }}"', token_index) + + assert form_index != -1 + assert token_index > form_index + assert value_index > token_index + + +def test_admin_template_exposes_regex_rule_editor_without_internal_phrase_toggle(): + """Admin UI should expose configurable regex rules and remove hardcoded internal phrases.""" + print("Testing DLP regex rule admin editor...") + source = read_file_text(ADMIN_TEMPLATE_FILE) + + assert 'id="dlp_regex_rules_json"' in source + assert 'name="dlp_regex_rules_json"' in source + assert "{{ dlp_regex_rules_json }}" in source + assert "Custom Regex Rules" in source + assert "web_search_dlp_block_on_internal_phrases" not in source + assert "Detect internal phrases" not in source + + +if __name__ == "__main__": + tests = [ + test_dlp_defaults_exist_and_are_safe, + test_admin_route_persists_dlp_settings, + test_admin_template_exposes_dlp_controls, + test_presidio_endpoint_controls_are_rendered_without_secret_value_field, + test_admin_js_uses_d_none_for_dlp_toggles, + test_admin_settings_form_contains_csrf_token, + test_admin_template_exposes_regex_rule_editor_without_internal_phrase_toggle, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP admin settings UI tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_admin_ui_smoke.py b/functional_tests/test_dlp_admin_ui_smoke.py new file mode 100644 index 00000000..1f4960b2 --- /dev/null +++ b/functional_tests/test_dlp_admin_ui_smoke.py @@ -0,0 +1,146 @@ +# test_dlp_admin_ui_smoke.py +#!/usr/bin/env python3 +""" 
+Functional test for DLP admin UI smoke. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures the DLP admin settings card can be extracted into collapsed +and expanded previews for local visual review. +""" + +import importlib.util +import os +import sys +import tempfile +from pathlib import Path + + +ROOT_DIR = Path(__file__).resolve().parents[1] +ADMIN_TEMPLATE_FILE = ROOT_DIR / "application" / "single_app" / "templates" / "admin_settings.html" +PREVIEW_SCRIPT = ROOT_DIR / "tools" / "local_dev" / "render_dlp_admin_preview.py" + + +REQUIRED_CONTROLS = [ + "enable_dlp_control_plane", + "dlp_default_engine", + "dlp_presidio_endpoint_settings", + "dlp_presidio_analyzer_endpoint", + "dlp_presidio_allowed_private_hosts", + "dlp_presidio_auth_header_name", + "dlp_presidio_auth_secret_env_var", + "dlp_presidio_timeout_seconds", + "dlp_presidio_score_threshold", + "dlp_presidio_language", + "dlp_presidio_entities", + "dlp_regex_rules_json", + "dlp_max_scan_chars", + "enable_web_search_dlp", + "web_search_dlp_mode", + "enable_upload_dlp", + "upload_dlp_mode", + "upload_dlp_fail_upload_on_match", +] + + +RETIRED_CONTROLS = [ + "dlp_scanner_timeout_seconds", + "web_search_dlp_redact_structured_identifiers", + "web_search_dlp_block_on_internal_phrases", + "upload_dlp_track_review_events", +] + + +def load_preview_module(): + spec = importlib.util.spec_from_file_location("render_dlp_admin_preview", PREVIEW_SCRIPT) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def test_dlp_admin_preview_extractor_writes_collapsed_and_expanded_files(): + """Preview extraction should work against the real admin settings template.""" + print("Testing DLP admin preview extraction...") + module = load_preview_module() + + with tempfile.TemporaryDirectory() as temp_dir: + collapsed_path, expanded_path = module.render_previews(ADMIN_TEMPLATE_FILE, Path(temp_dir)) + + assert collapsed_path.exists() + assert expanded_path.exists() 
+ assert collapsed_path.name == "admin-dlp-preview.html" + assert expanded_path.name == "admin-dlp-preview-expanded.html" + + collapsed_html = collapsed_path.read_text(encoding="utf-8") + expanded_html = expanded_path.read_text(encoding="utf-8") + + assert "Data Loss Prevention" in collapsed_html + assert "Data Loss Prevention" in expanded_html + assert 'id="dlp_control_plane_settings"' in collapsed_html + assert 'id="dlp_control_plane_settings"' in expanded_html + + +def test_expanded_dlp_admin_preview_contains_expected_controls(): + """Expanded preview should expose all DLP controls needed for review.""" + print("Testing expanded DLP admin preview controls...") + module = load_preview_module() + + with tempfile.TemporaryDirectory() as temp_dir: + _, expanded_path = module.render_previews(ADMIN_TEMPLATE_FILE, Path(temp_dir)) + expanded_html = expanded_path.read_text(encoding="utf-8") + + for control_id in REQUIRED_CONTROLS: + assert ( + f'id="{control_id}"' in expanded_html or f'name="{control_id}"' in expanded_html + ), f"Missing expanded DLP control: {control_id}" + + for control_id in RETIRED_CONTROLS: + assert ( + f'id="{control_id}"' not in expanded_html and f'name="{control_id}"' not in expanded_html + ), f"Retired DLP control still rendered: {control_id}" + + assert '
' not in expanded_html + assert '
' not in expanded_html + assert '
' not in expanded_html + + +def test_dlp_admin_preview_does_not_expose_raw_sensitive_values(): + """Preview files should include controls, not populated credentials or raw detector matches.""" + print("Testing DLP admin preview safety...") + module = load_preview_module() + + with tempfile.TemporaryDirectory() as temp_dir: + collapsed_path, expanded_path = module.render_previews(ADMIN_TEMPLATE_FILE, Path(temp_dir)) + rendered = ( + collapsed_path.read_text(encoding="utf-8") + + expanded_path.read_text(encoding="utf-8") + ) + + forbidden = [ + "123-45-6789", + "4111 1111 1111 1111", + "dlp_presidio_auth_secret\"", + "raw_matches", + ] + for value in forbidden: + assert value not in rendered, f"Preview leaked forbidden value: {value}" + + +if __name__ == "__main__": + tests = [ + test_dlp_admin_preview_extractor_writes_collapsed_and_expanded_files, + test_expanded_dlp_admin_preview_contains_expected_controls, + test_dlp_admin_preview_does_not_expose_raw_sensitive_values, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP admin UI smoke tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_control_plane.py b/functional_tests/test_dlp_control_plane.py new file mode 100644 index 00000000..17ed9155 --- /dev/null +++ b/functional_tests/test_dlp_control_plane.py @@ -0,0 +1,194 @@ +# test_dlp_control_plane.py +#!/usr/bin/env python3 +""" +Functional test for DLP control plane core behavior. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures the shared DLP core supports disabled, regex, Luhn-validated +credit-card, counts-only metadata, and ReDoS-resistant scanning without +persisting raw matched values. 
+""" + +import os +import sys +import time +from unittest.mock import patch + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_SSN = "123-45-6789" +RAW_CARD = "4111 1111 1111 1111" +INVALID_CARD = "4111 1111 1111 1112" + + +def assert_no_raw_values(payload): + """Assert a nested DLP payload does not include raw sensitive values.""" + serialized = repr(payload) + forbidden_values = [RAW_SSN, RAW_CARD, INVALID_CARD, "Alice Example"] + for value in forbidden_values: + assert value not in serialized, f"Raw value leaked into payload: {value}" + + +def test_disabled_dlp_allows_original_text(): + """Disabled DLP should return the original text and an allow decision.""" + print("Testing disabled DLP behavior...") + from functions_dlp import evaluate_dlp_text + + text = f"Please search for {RAW_SSN}" + result = evaluate_dlp_text( + text, + settings={"enable_dlp_control_plane": False}, + surface="web_search", + ) + + assert result["decision"] == "allow" + assert result["text"] == text + assert result["redacted_text"] == text + assert result["total_replacements"] == 0 + assert result["match_counts"] == {} + assert result["matches"] == [] + + +def test_regex_redacts_ssn_and_counts_only_metadata(): + """Regex mode should redact SSNs and return counts-only metadata.""" + print("Testing SSN redaction and safe metadata...") + from functions_dlp import evaluate_dlp_text + + result = evaluate_dlp_text( + f"Customer SSN is {RAW_SSN}.", + settings={ + "enable_dlp_control_plane": True, + "dlp_default_engine": "regex", + "web_search_dlp_mode": "redact", + }, + surface="web_search", + ) + + assert result["decision"] == "redact" + assert "[REDACTED_US_SSN]" in result["redacted_text"] + assert result["match_counts"] == {"US_SSN": 1} + assert result["total_replacements"] == 1 + assert_no_raw_values(result) + + +def test_credit_card_requires_luhn_validation(): + 
"""Credit-card-like values should redact only when Luhn-valid.""" + print("Testing credit card Luhn validation...") + from functions_dlp import evaluate_dlp_text + + valid_result = evaluate_dlp_text( + f"Use card {RAW_CARD} for the vendor.", + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "redact"}, + surface="web_search", + ) + invalid_result = evaluate_dlp_text( + f"Ignore fake card {INVALID_CARD}.", + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "redact"}, + surface="web_search", + ) + + assert valid_result["match_counts"] == {"CREDIT_CARD": 1} + assert "[REDACTED_CREDIT_CARD]" in valid_result["redacted_text"] + assert invalid_result["decision"] == "allow" + assert invalid_result["redacted_text"].endswith(f"{INVALID_CARD}.") + assert invalid_result["match_counts"] == {} + assert_no_raw_values(valid_result) + + +def test_regex_scan_is_bounded_on_long_non_matching_input(): + """Regex recognizers should avoid catastrophic backtracking.""" + print("Testing regex performance on long non-matching input...") + from functions_dlp import evaluate_dlp_text + + long_text = ("not-sensitive " * 20000) + "done" + started = time.perf_counter() + result = evaluate_dlp_text( + long_text, + settings={ + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_max_scan_chars": 500000, + }, + surface="web_search", + ) + elapsed = time.perf_counter() - started + + assert result["decision"] == "allow" + assert elapsed < 2.0, f"Regex scan took too long: {elapsed:.3f}s" + + +def test_enforced_dlp_blocks_when_text_exceeds_scan_limit(): + """Enforced DLP must not append unscanned text into sanitized output.""" + from functions_dlp import evaluate_dlp_text + + settings = { + "enable_dlp_control_plane": True, + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 20, + "web_search_dlp_mode": "redact", + "enable_web_search_dlp": True, + } + text = "public prefix only " + ("x" * 25) + " SSN 123-45-6789" + + result = 
evaluate_dlp_text(text, settings=settings, surface="web_search") + + assert result["decision"] == "block" + assert result["scanner_status"] == "truncated" + assert result["text"] == "" + assert "123-45-6789" not in repr(result) + + +def test_enforced_truncation_blocks_before_scanner_error_fail_open(): + """Protected enforced surfaces should block truncated text before scanner errors.""" + import functions_dlp + + def fail_scan(text, settings, surface="generic"): + raise RuntimeError("scanner unavailable") + + settings = { + "enable_dlp_control_plane": True, + "dlp_fail_closed_on_scanner_error": False, + "dlp_max_scan_chars": 12, + "web_search_dlp_mode": "redact", + "enable_web_search_dlp": True, + } + text = "safe prefix " + ("x" * 25) + f" tail {RAW_SSN}" + + with patch.object(functions_dlp, "_apply_regex_engine", fail_scan): + result = functions_dlp.evaluate_dlp_text(text, settings=settings, surface="web_search") + + assert result["decision"] == "block" + assert result["scanner_status"] == "truncated" + assert result["text"] == "" + assert result["redacted_text"] == "" + assert result["metadata"]["skipped_chars"] > 0 + assert RAW_SSN not in repr(result) + assert "tail" not in repr(result) + + +if __name__ == "__main__": + tests = [ + test_disabled_dlp_allows_original_text, + test_regex_redacts_ssn_and_counts_only_metadata, + test_credit_card_requires_luhn_validation, + test_regex_scan_is_bounded_on_long_non_matching_input, + test_enforced_dlp_blocks_when_text_exceeds_scan_limit, + test_enforced_truncation_blocks_before_scanner_error_fail_open, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP control plane tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_presidio_endpoint.py b/functional_tests/test_dlp_presidio_endpoint.py new file mode 100644 index 00000000..3f96abb4 --- /dev/null +++ 
b/functional_tests/test_dlp_presidio_endpoint.py @@ -0,0 +1,541 @@ +# test_dlp_presidio_endpoint.py +#!/usr/bin/env python3 +""" +Functional test for external Presidio endpoint DLP adapter. +Version: 0.242.075 +Implemented in: 0.242.075 + +This test ensures SimpleChat can call a configured Presidio-compatible analyzer +endpoint without embedding Presidio packages or leaking raw scanned text. +""" + +import os +import socket +import sys +from unittest.mock import Mock + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_TEXT = "Contact me a@example.com" + + +def stub_dns_answers(monkeypatch, expected_host, addresses=None): + """Return deterministic DNS answers for endpoint validation tests.""" + host_answers = expected_host if isinstance(expected_host, dict) else {expected_host: addresses} + + def fake_getaddrinfo(host, port, *args, **kwargs): + assert host in host_answers + return [ + (socket.AF_INET, socket.SOCK_STREAM, 6, "", (address, port or 443)) + for address in host_answers[host] + ] + + monkeypatch.setattr(socket, "getaddrinfo", fake_getaddrinfo) + + +def test_validate_presidio_endpoint_allows_https_and_localhost(monkeypatch): + """Public HTTPS and explicitly allowlisted local HTTP endpoint URLs should be accepted.""" + from functions_dlp_presidio import validate_presidio_endpoint_url + + stub_dns_answers( + monkeypatch, + { + "presidio.example.com": ["93.184.216.34"], + "localhost": ["127.0.0.1"], + }, + ) + + assert validate_presidio_endpoint_url("https://presidio.example.com/analyze") == "https://presidio.example.com/analyze" + assert ( + validate_presidio_endpoint_url("http://localhost:5002/analyze", "localhost") + == "http://localhost:5002/analyze" + ) + assert ( + validate_presidio_endpoint_url("http://127.0.0.1:5002/analyze", "127.0.0.1") + == "http://127.0.0.1:5002/analyze" + ) + assert 
validate_presidio_endpoint_url("http://[::1]:5002/analyze", "::1") == "http://[::1]:5002/analyze" + + +def test_validate_presidio_endpoint_rejects_private_hosts_without_allowlist(): + """Private, link-local, and loopback endpoints should require an explicit allowlist.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + blocked_urls = [ + "https://127.0.0.1:5002/analyze", + "https://[::1]:5002/analyze", + "https://10.1.2.3/analyze", + "https://172.16.0.10/analyze", + "https://192.168.1.20/analyze", + "https://169.254.169.254/metadata", + ] + + for blocked_url in blocked_urls: + try: + validate_presidio_endpoint_url(blocked_url) + except PresidioEndpointConfigurationError as exc: + assert "allowlist" in str(exc).lower() + continue + + raise AssertionError(f"Expected private endpoint to be rejected: {blocked_url}") + + +def test_validate_presidio_endpoint_rejects_public_hostname_resolving_to_private_ip(monkeypatch): + """Public-looking hostnames should be rejected when DNS resolves to non-global addresses.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + stub_dns_answers(monkeypatch, "presidio.example.com", ["169.254.169.254"]) + + try: + validate_presidio_endpoint_url("https://presidio.example.com/analyze") + except PresidioEndpointConfigurationError as exc: + assert "allowlist" in str(exc).lower() + return + + raise AssertionError("Expected DNS-resolved metadata endpoint address to be rejected.") + + +def test_validate_presidio_endpoint_rejects_any_private_dns_answer(monkeypatch): + """Any non-global DNS answer should fail unless the endpoint host is explicitly allowlisted.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + stub_dns_answers(monkeypatch, "presidio.example.com", ["93.184.216.34", "10.0.0.5"]) + + try: + validate_presidio_endpoint_url("https://presidio.example.com/analyze") 
+ except PresidioEndpointConfigurationError as exc: + assert "allowlist" in str(exc).lower() + return + + raise AssertionError("Expected hostname with mixed public/private DNS answers to be rejected.") + + +def test_analyze_with_presidio_endpoint_blocks_dns_rebinding_before_socket_connect(monkeypatch): + """The request connection path should re-check DNS answers before opening a socket.""" + from functions_dlp_presidio import ( + PresidioEndpointConfigurationError, + PresidioEndpointRequestError, + analyze_with_presidio_endpoint, + ) + + dns_calls = [] + socket_attempts = {"count": 0} + + def fake_getaddrinfo(host, port, *args, **kwargs): + assert host == "presidio.example.com" + dns_calls.append(host) + address = "93.184.216.34" if len(dns_calls) == 1 else "169.254.169.254" + return [(socket.AF_INET, socket.SOCK_STREAM, 6, "", (address, port or 443))] + + class BlockingSocket: + def __init__(self, *args, **kwargs): + socket_attempts["count"] += 1 + raise AssertionError("Unsafe rebinding address reached socket creation.") + + monkeypatch.setenv("NO_PROXY", "*") + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") + monkeypatch.setattr(socket, "getaddrinfo", fake_getaddrinfo) + monkeypatch.setattr(socket, "socket", BlockingSocket) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + except (PresidioEndpointConfigurationError, PresidioEndpointRequestError): + assert len(dns_calls) >= 2 + assert socket_attempts["count"] == 0 + return + + raise AssertionError("Expected rebinding request path to be blocked.") + + +def test_validate_presidio_endpoint_allows_private_dns_answer_for_exact_allowlisted_host(monkeypatch): + """A private DNS answer should be accepted only for the exact endpoint host in the allowlist.""" + from functions_dlp_presidio import validate_presidio_endpoint_url + + 
stub_dns_answers(monkeypatch, "presidio.example.com", ["10.0.0.5"]) + + assert ( + validate_presidio_endpoint_url( + "https://presidio.example.com/analyze", + "presidio.example.com", + ) + == "https://presidio.example.com/analyze" + ) + + +def test_validate_presidio_endpoint_allows_private_hosts_with_explicit_allowlist(): + """Private endpoint URLs should be accepted only when their host is explicitly allowlisted.""" + from functions_dlp_presidio import validate_presidio_endpoint_url + + allowed_private_hosts = "10.1.2.3\nlocalhost, ::1" + + assert ( + validate_presidio_endpoint_url("https://10.1.2.3/analyze", allowed_private_hosts) + == "https://10.1.2.3/analyze" + ) + assert ( + validate_presidio_endpoint_url("https://localhost:5002/analyze", allowed_private_hosts) + == "https://localhost:5002/analyze" + ) + assert ( + validate_presidio_endpoint_url("https://[::1]:5002/analyze", allowed_private_hosts) + == "https://[::1]:5002/analyze" + ) + + +def test_validate_presidio_endpoint_rejects_url_secret_persistence_vectors(): + """Endpoint URLs should reject userinfo, fragments, and credential-like query names.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + blocked_urls = [ + "https://user:pass@presidio.example.com/analyze", + "https://presidio.example.com/analyze#fragment", + "https://presidio.example.com/analyze?key=abc", + "https://presidio.example.com/analyze?api_key=abc", + "https://presidio.example.com/analyze?apikey=abc", + "https://presidio.example.com/analyze?secret=abc", + "https://presidio.example.com/analyze?token=abc", + "https://presidio.example.com/analyze?password=abc", + "https://presidio.example.com/analyze?connection=abc", + "https://presidio.example.com/analyze?sig=abc", + "https://presidio.example.com/analyze?client_secret=abc", + "https://presidio.example.com/analyze?access_token=abc", + "https://presidio.example.com/analyze?subscription-key=abc", + ] + + for blocked_url in 
blocked_urls: + try: + validate_presidio_endpoint_url(blocked_url) + except PresidioEndpointConfigurationError: + continue + + raise AssertionError(f"Expected unsafe endpoint URL to be rejected: {blocked_url}") + + +def test_validate_presidio_endpoint_rejects_insecure_remote_http(): + """Remote HTTP endpoint URLs should be rejected.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + try: + validate_presidio_endpoint_url("http://presidio.example.com/analyze") + except PresidioEndpointConfigurationError as exc: + assert "https" in str(exc).lower() + return + + raise AssertionError("Expected insecure remote HTTP endpoint to be rejected.") + + +def test_validate_presidio_endpoint_rejects_relative_url(): + """Endpoint URLs must be absolute HTTP(S) URLs.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, validate_presidio_endpoint_url + + try: + validate_presidio_endpoint_url("/analyze") + except PresidioEndpointConfigurationError as exc: + assert "absolute" in str(exc).lower() + return + + raise AssertionError("Expected relative endpoint URL to be rejected.") + + +def test_analyze_with_presidio_endpoint_posts_safe_payload_and_auth_header(monkeypatch): + """The endpoint adapter should post the Analyzer payload and env-backed auth header.""" + from functions_dlp_presidio import analyze_with_presidio_endpoint + + captured = {} + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): + captured["url"] = url + captured["json"] = json + captured["headers"] = headers + captured["timeout"] = timeout + captured["allow_redirects"] = allow_redirects + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [ + {"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91} + ] + return response + + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + 
monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") + settings = { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + "dlp_presidio_entities": ["EMAIL_ADDRESS", "US_SSN"], + "dlp_presidio_score_threshold": 0.7, + "dlp_presidio_language": "en", + "dlp_presidio_timeout_seconds": 3, + } + + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) + results = analyze_with_presidio_endpoint(RAW_TEXT, settings) + + assert results == [{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91}] + assert captured["url"] == "https://presidio.internal/analyze" + assert captured["json"] == { + "text": RAW_TEXT, + "language": "en", + "entities": ["EMAIL_ADDRESS", "US_SSN"], + "score_threshold": 0.7, + } + assert captured["headers"]["X-DLP-API-Key"] == "unit-test-secret" + assert captured["headers"]["Content-Type"] == "application/json" + assert captured["timeout"] == 3 + assert captured["allow_redirects"] is False + + +def test_analyze_with_presidio_endpoint_allows_localhost_without_env_secret(monkeypatch): + """Local development endpoints may omit auth, but only on loopback hosts.""" + from functions_dlp_presidio import analyze_with_presidio_endpoint + + captured = {} + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): + captured["headers"] = headers + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [] + return response + + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.delenv("PRESIDIO_DLP_API_KEY", raising=False) + stub_dns_answers(monkeypatch, "localhost", ["127.0.0.1"]) + + analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "http://localhost:5002/analyze", + 
"dlp_presidio_allowed_private_hosts": "localhost", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + + assert "X-DLP-API-Key" not in captured["headers"] + + +def test_analyze_with_presidio_endpoint_requires_auth_secret_for_nonlocal_endpoint(monkeypatch): + """Non-loopback endpoints should not receive raw text without env-backed auth.""" + from functions_dlp_presidio import PresidioEndpointConfigurationError, analyze_with_presidio_endpoint + + called = {"post": False} + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): + called["post"] = True + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [] + return response + + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.delenv("PRESIDIO_DLP_API_KEY", raising=False) + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + except PresidioEndpointConfigurationError as exc: + assert "auth secret" in str(exc).lower() + assert called["post"] is False + return + + raise AssertionError("Expected missing non-local auth secret to block the request.") + + +def test_analyze_with_presidio_endpoint_raises_safe_error_without_raw_text(monkeypatch): + """Endpoint exceptions should not retain raw scanned text in messages or exception chains.""" + from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): + raise RuntimeError(f"upstream included 
{RAW_TEXT}") + + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + except PresidioEndpointRequestError as exc: + assert RAW_TEXT not in str(exc) + assert RAW_TEXT not in repr(exc) + assert "RuntimeError" in str(exc) + assert exc.__cause__ is None + assert exc.__context__ is None + return + + raise AssertionError("Expected endpoint request error.") + + +def test_analyze_with_presidio_endpoint_normalizes_response_items(monkeypatch): + """Recognizer responses should be filtered and normalized deterministically.""" + from functions_dlp_presidio import analyze_with_presidio_endpoint + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): + response = Mock() + response.raise_for_status.return_value = None + response.json.return_value = [ + {"entity_type": "EMAIL_ADDRESS", "start": "11", "end": "24", "score": "0.91"}, + {"entity_type": "US_SSN", "start": -3, "end": "bad", "score": 0.99}, + {"entity_type": "", "start": 1, "end": 2, "score": 0.4}, + "ignored", + ] + return response + + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") + stub_dns_answers(monkeypatch, "presidio.internal", ["10.0.0.5"]) + + results = analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + + assert results == [ + {"entity_type": 
"EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.91}, + {"entity_type": "", "start": 1, "end": 2, "score": 0.4}, + ] + + +def test_analyze_with_presidio_endpoint_treats_redirect_as_endpoint_error(monkeypatch): + """Redirect responses should not be followed or parsed as analyzer results.""" + from functions_dlp_presidio import PresidioEndpointRequestError, analyze_with_presidio_endpoint + + captured = {"calls": 0} + + def fake_post(url, json=None, headers=None, timeout=None, allow_redirects=None, allowed_private_hosts=None): + captured["calls"] += 1 + captured["allow_redirects"] = allow_redirects + response = Mock() + response.status_code = 302 + response.headers = {"Location": "https://attacker.example/analyze"} + response.raise_for_status.return_value = None + response.json.side_effect = AssertionError("Redirect responses must not be parsed.") + return response + + monkeypatch.setattr("functions_dlp_presidio._post_presidio_endpoint", fake_post) + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "unit-test-secret") + stub_dns_answers(monkeypatch, "presidio.example.com", ["93.184.216.34"]) + + try: + analyze_with_presidio_endpoint( + RAW_TEXT, + { + "dlp_presidio_analyzer_endpoint": "https://presidio.example.com/analyze", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + }, + ) + except PresidioEndpointRequestError as exc: + assert "redirect" in str(exc).lower() + assert captured["allow_redirects"] is False + assert captured["calls"] == 1 + return + + raise AssertionError("Expected redirect response to be handled as an endpoint error.") + + +def test_presidio_auth_secret_env_var_name_validation(monkeypatch): + """Only the dedicated Presidio DLP secret env var namespace should be read.""" + from functions_dlp_presidio import _get_auth_headers, normalize_presidio_secret_env_var_name + + monkeypatch.setenv("AZURE_OPENAI_KEY", "must-not-leak") + monkeypatch.setenv("COSMOS_CONNECTION_STRING", "must-not-leak") + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", 
"presidio-secret") + monkeypatch.setenv("DLP_PRESIDIO_TOKEN", "prefixed-secret") + + assert normalize_presidio_secret_env_var_name("") == "" + assert normalize_presidio_secret_env_var_name("PRESIDIO_DLP_API_KEY") == "PRESIDIO_DLP_API_KEY" + assert normalize_presidio_secret_env_var_name("DLP_PRESIDIO_TOKEN") == "DLP_PRESIDIO_TOKEN" + assert normalize_presidio_secret_env_var_name("AZURE_OPENAI_KEY") == "" + assert normalize_presidio_secret_env_var_name("COSMOS_CONNECTION_STRING") == "" + assert _get_auth_headers( + { + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "AZURE_OPENAI_KEY", + } + ) == {} + assert _get_auth_headers( + { + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + } + ) == {"X-DLP-API-Key": "presidio-secret"} + assert _get_auth_headers( + { + "dlp_presidio_auth_header_name": "X-DLP-API-Key", + "dlp_presidio_auth_secret_env_var": "DLP_PRESIDIO_TOKEN", + } + ) == {"X-DLP-API-Key": "prefixed-secret"} + + +def test_presidio_auth_header_name_validation(monkeypatch): + """Auth header names should reject reserved HTTP headers and malformed names.""" + from functions_dlp_presidio import ( + PresidioEndpointConfigurationError, + _get_auth_headers, + normalize_presidio_auth_header_name, + ) + + monkeypatch.setenv("PRESIDIO_DLP_API_KEY", "presidio-secret") + + assert normalize_presidio_auth_header_name("") == "X-DLP-API-Key" + assert normalize_presidio_auth_header_name("X-DLP-API-Key") == "X-DLP-API-Key" + assert normalize_presidio_auth_header_name("Authorization") == "Authorization" + assert normalize_presidio_auth_header_name("Content-Type") == "" + assert normalize_presidio_auth_header_name("Host") == "" + assert normalize_presidio_auth_header_name("Connection") == "" + assert normalize_presidio_auth_header_name("Bad Header") == "" + assert normalize_presidio_auth_header_name("X-DLP-API-Key\r\nX-Injected") == "" + assert _get_auth_headers( + { + 
"dlp_presidio_auth_header_name": "Authorization", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + } + ) == {"Authorization": "presidio-secret"} + + try: + _get_auth_headers( + { + "dlp_presidio_auth_header_name": "Content-Type", + "dlp_presidio_auth_secret_env_var": "PRESIDIO_DLP_API_KEY", + } + ) + except PresidioEndpointConfigurationError as exc: + assert "header" in str(exc).lower() + return + + raise AssertionError("Expected reserved auth header name to be rejected.") + + +if __name__ == "__main__": + import pytest + + sys.exit(pytest.main([__file__])) diff --git a/functional_tests/test_dlp_presidio_engine_integration.py b/functional_tests/test_dlp_presidio_engine_integration.py new file mode 100644 index 00000000..9e690232 --- /dev/null +++ b/functional_tests/test_dlp_presidio_engine_integration.py @@ -0,0 +1,161 @@ +# test_dlp_presidio_engine_integration.py +#!/usr/bin/env python3 +""" +Functional test for Presidio endpoint engine integration. +Version: 0.242.075 +Implemented in: 0.242.075 + +This test ensures the external Presidio endpoint engine reuses SimpleChat's +existing DLP decision, redaction, and fail-closed behavior. 
+""" + +import os +import sys + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_TEXT = "Contact me a@example.com" + + +def presidio_settings(mode="redact", fail_closed=True): + """Build deterministic settings for Presidio endpoint engine tests.""" + return { + "enable_dlp_control_plane": True, + "dlp_default_engine": "presidio_endpoint", + "dlp_presidio_analyzer_endpoint": "https://presidio.internal/analyze", + "dlp_presidio_allowed_private_hosts": "presidio.internal", + "dlp_presidio_timeout_seconds": 3, + "dlp_presidio_score_threshold": 0.7, + "dlp_presidio_entities": ["EMAIL_ADDRESS"], + "dlp_fail_closed_on_scanner_error": fail_closed, + "enable_web_search_dlp": True, + "web_search_dlp_mode": mode, + "enable_upload_dlp": True, + "upload_dlp_mode": mode, + } + + +def test_presidio_endpoint_redacts_with_existing_result_shape(monkeypatch): + """Presidio endpoint matches should redact using the shared DLP result shape.""" + import functions_dlp + + monkeypatch.setattr( + functions_dlp, + "analyze_with_presidio_endpoint", + lambda text, settings: [{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.92}], + ) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("redact"), + surface="web_search", + ) + + assert result["engine"] == "presidio_endpoint" + assert result["decision"] == "redact" + assert result["text"] == "Contact me [REDACTED_EMAIL_ADDRESS]" + assert result["redacted_text"] == "Contact me [REDACTED_EMAIL_ADDRESS]" + assert result["match_counts"] == {"EMAIL_ADDRESS": 1} + assert result["scanner_status"] == "ok" + + +def test_presidio_endpoint_blocks_with_existing_result_shape(monkeypatch): + """Block mode should blank text fields while keeping safe counts.""" + import functions_dlp + + monkeypatch.setattr( + functions_dlp, + "analyze_with_presidio_endpoint", + lambda text, settings: 
[{"entity_type": "EMAIL_ADDRESS", "start": 11, "end": 24, "score": 0.92}], + ) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("block"), + surface="upload", + ) + + assert result["engine"] == "presidio_endpoint" + assert result["decision"] == "block" + assert result["text"] == "" + assert result["redacted_text"] == "" + assert result["match_counts"] == {"EMAIL_ADDRESS": 1} + assert result["scanner_status"] == "ok" + + +def test_presidio_endpoint_scanner_error_fails_closed_without_raw_text(monkeypatch): + """Endpoint scanner errors should reuse fail-closed handling and avoid raw text.""" + import functions_dlp + + def fail_scan(text, settings): + raise RuntimeError(f"endpoint failed while scanning {RAW_TEXT}") + + monkeypatch.setattr(functions_dlp, "analyze_with_presidio_endpoint", fail_scan) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("redact", fail_closed=True), + surface="web_search", + ) + + assert result["engine"] == "presidio_endpoint" + assert result["decision"] == "block" + assert result["text"] == "" + assert result["redacted_text"] == "" + assert result["scanner_status"] == "error" + assert RAW_TEXT not in repr(result) + + +def test_presidio_endpoint_sanitizes_untrusted_entity_labels(monkeypatch): + """Remote entity labels must not be copied into redaction output or count keys.""" + import functions_dlp + + malicious_label = "EMAIL_ADDRESS_a@example.com" + monkeypatch.setattr( + functions_dlp, + "analyze_with_presidio_endpoint", + lambda text, settings: [{"entity_type": malicious_label, "start": 11, "end": 24, "score": 0.92}], + ) + + result = functions_dlp.evaluate_dlp_text( + RAW_TEXT, + settings=presidio_settings("redact"), + surface="web_search", + ) + telemetry = functions_dlp.build_dlp_telemetry_properties(result, "web_search") + + assert result["redacted_text"] == "Contact me [REDACTED_UNKNOWN_ENTITY]" + assert result["match_counts"] == {"UNKNOWN_ENTITY": 1} + assert 
result["matches"] == [{"entity_type": "UNKNOWN_ENTITY", "count": 1}] + assert telemetry["dlp_entity_counts"] == {"UNKNOWN_ENTITY": 1} + assert malicious_label not in repr(result) + assert "a@example.com" not in repr(result) + assert malicious_label not in repr(telemetry) + assert "a@example.com" not in repr(telemetry) + + +def test_external_analyzer_normalizes_empty_and_too_long_entity_labels(): + """Invalid external analyzer labels should collapse to a fixed safe entity name.""" + import functions_dlp + + long_label = "A" * 65 + results = [ + {"entity_type": "", "start": 0, "end": 7}, + {"entity_type": long_label, "start": 11, "end": 24}, + ] + + normalized = functions_dlp.normalize_external_analyzer_results(RAW_TEXT, results, mode="redact") + + assert normalized["redacted_text"] == "[REDACTED_UNKNOWN_ENTITY] me [REDACTED_UNKNOWN_ENTITY]" + assert normalized["match_counts"] == {"UNKNOWN_ENTITY": 2} + assert normalized["matches"] == [{"entity_type": "UNKNOWN_ENTITY", "count": 2}] + assert long_label not in repr(normalized) + + +if __name__ == "__main__": + import pytest + + sys.exit(pytest.main([__file__])) diff --git a/functional_tests/test_dlp_regex_rules.py b/functional_tests/test_dlp_regex_rules.py new file mode 100644 index 00000000..e89c8856 --- /dev/null +++ b/functional_tests/test_dlp_regex_rules.py @@ -0,0 +1,241 @@ +# test_dlp_regex_rules.py +#!/usr/bin/env python3 +""" +Functional test for configurable DLP regex rules. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures DLP regex rules are admin-configurable, validated, +confidence-shaped, timeout-bounded, and safe to report without raw matched values. 
+""" + +import os +import sys + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +sys.path.insert(0, APP_DIR) + + +RAW_SSN = "123-45-6789" +RAW_CARD = "4111 1111 1111 1111" + + +def assert_no_raw_values(result): + payload = repr(result) + assert RAW_SSN not in payload + assert RAW_CARD not in payload + assert "ZX-12345" not in payload + + +def test_default_rules_include_ssn_and_credit_card_only(): + """Default DLP regex rules should be structured identifier defaults only.""" + print("Testing default DLP regex rules...") + from functions_dlp_rules import get_default_dlp_regex_rules + + rules = get_default_dlp_regex_rules() + ids = [rule["id"] for rule in rules] + + assert ids == ["us_ssn", "credit_card"] + assert all(rule["enabled"] is True for rule in rules) + assert "internal_phrase" not in ids + assert "confidential" not in repr(rules).lower() + + +def test_custom_regex_rule_redacts_on_configured_surface(): + """A configured custom rule should redact on an allowed surface.""" + print("Testing custom regex DLP rule...") + from functions_dlp import evaluate_dlp_text + + settings = { + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "ticket_id", + "label": "Ticket ID", + "entity_type": "TICKET_ID", + "enabled": True, + "pattern": r"ZX-\d{5}", + "replacement": "[REDACTED_TICKET_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "medium", + "with_keywords": "high", + "keywords": ["ticket", "case"], + "window_chars": 24, + "minimum": "medium" + } + } + ], + } + + result = evaluate_dlp_text( + "Search for ticket ZX-12345", + settings=settings, + surface="web_search", + ) + + assert result["decision"] == "redact" + assert result["redacted_text"] == "Search for ticket [REDACTED_TICKET_ID]" + assert result["match_counts"] == {"TICKET_ID": 1} + assert result["matches"] == 
[{"entity_type": "TICKET_ID", "count": 1, "confidence": "high"}] + assert_no_raw_values(result) + + +def test_disabled_custom_rule_does_not_match(): + """Disabled rules should not produce matches.""" + print("Testing disabled custom regex DLP rule...") + from functions_dlp import evaluate_dlp_text + + settings = { + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "ticket_id", + "label": "Ticket ID", + "entity_type": "TICKET_ID", + "enabled": False, + "pattern": r"ZX-\d{5}", + "replacement": "[REDACTED_TICKET_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "medium", + "with_keywords": "high", + "keywords": ["ticket"], + "window_chars": 24, + "minimum": "medium" + } + } + ], + } + + result = evaluate_dlp_text("Search for ticket ZX-12345", settings=settings, surface="web_search") + + assert result["decision"] == "allow" + assert result["match_counts"] == {} + assert "ZX-12345" in result["redacted_text"] + + +def test_confidence_requires_nearby_keyword_when_minimum_is_high(): + """Rules can require regex plus nearby keyword evidence for high-confidence matches.""" + print("Testing DLP confidence shaping...") + from functions_dlp import evaluate_dlp_text + + rule = { + "id": "employee_id", + "label": "Employee ID", + "entity_type": "EMPLOYEE_ID", + "enabled": True, + "pattern": r"EID-\d{6}", + "replacement": "[REDACTED_EMPLOYEE_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "low", + "with_keywords": "high", + "keywords": ["employee", "worker", "staff"], + "window_chars": 32, + "minimum": "high" + } + } + settings = { + "enable_dlp_control_plane": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [rule], + } + + low_result = evaluate_dlp_text("Search for EID-123456", settings=settings, surface="web_search") + high_result = evaluate_dlp_text("Search employee EID-123456", 
settings=settings, surface="web_search") + + assert low_result["decision"] == "allow" + assert low_result["match_counts"] == {} + assert high_result["decision"] == "redact" + assert high_result["match_counts"] == {"EMPLOYEE_ID": 1} + assert high_result["matches"] == [{"entity_type": "EMPLOYEE_ID", "count": 1, "confidence": "high"}] + + +def test_invalid_regex_rule_is_rejected_before_runtime(): + """Invalid admin regex rules should return validation errors.""" + print("Testing invalid regex rule validation...") + from functions_dlp_rules import validate_dlp_regex_rules + + normalized, errors = validate_dlp_regex_rules( + [ + { + "id": "bad", + "label": "Bad Rule", + "entity_type": "BAD", + "enabled": True, + "pattern": r"(", + "replacement": "[REDACTED_BAD]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "medium", + "with_keywords": "high", + "keywords": [], + "window_chars": 16, + "minimum": "medium" + } + } + ] + ) + + assert normalized == [] + assert errors + assert "bad" in errors[0] + + +def test_internal_phrase_is_not_a_default_blocker(): + """Generic policy words should not be hardcoded blockers.""" + print("Testing internal phrase is not hardcoded...") + from functions_dlp import evaluate_web_search_egress + + result = evaluate_web_search_egress( + "Search for confidentiality agreement examples", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + }, + ) + + assert result["web_search_allowed"] is True + assert result["decision"] == "allow" + assert "confidentiality agreement" in result["web_search_query_text"] + + +if __name__ == "__main__": + tests = [ + test_default_rules_include_ssn_and_credit_card_only, + test_custom_regex_rule_redacts_on_configured_surface, + test_disabled_custom_rule_does_not_match, + test_confidence_requires_nearby_keyword_when_minimum_is_high, + test_invalid_regex_rule_is_rejected_before_runtime, + 
test_internal_phrase_is_not_a_default_blocker, + ] + + failures = [] + for test in tests: + try: + test() + except Exception as exc: + failures.append((test.__name__, exc)) + print(f"Test failed: {test.__name__}: {exc}") + import traceback + traceback.print_exc() + + if failures: + print(f"{len(failures)} of {len(tests)} configurable DLP regex rule tests failed.") + sys.exit(1) + + print(f"All {len(tests)} configurable DLP regex rule tests passed.") + sys.exit(0) diff --git a/functional_tests/test_dlp_review_events.py b/functional_tests/test_dlp_review_events.py new file mode 100644 index 00000000..0fba0af3 --- /dev/null +++ b/functional_tests/test_dlp_review_events.py @@ -0,0 +1,115 @@ +# test_dlp_review_events.py +#!/usr/bin/env python3 +""" +Functional test for DLP review event safety. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures DLP review routing defaults to disabled and any optional +review event summary uses distinct DLP policy typing with counts-only payloads. 
+""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +SETTINGS_FILE = os.path.join(APP_DIR, "functions_settings.py") +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def test_review_destination_defaults_to_none(): + """DLP findings should not enter review queues by default.""" + print("Testing default DLP review routing...") + source = read_file_text(SETTINGS_FILE) + assert "'dlp_review_destination': 'none'" in source + assert "'web_search_dlp_track_review_events': False" not in source + assert "'upload_dlp_track_review_events': False" not in source + + +def test_review_summary_has_dlp_type_and_no_raw_values(): + """Review payload summaries should be distinctly typed and counts-only.""" + print("Testing safe DLP review summary...") + from functions_dlp import build_dlp_review_event_summary, evaluate_web_search_egress + + result = evaluate_web_search_egress( + f"Please search for {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "block", + "dlp_review_destination": "safety_violations", + "web_search_dlp_track_review_events": True, + }, + context={"conversation_id": "conversation-1", "chat_type": "user"}, + ) + summary = build_dlp_review_event_summary( + result, + surface="web_search", + context={"conversation_id": "conversation-1", "user_id": "user-1"}, + ) + + assert summary["policy_type"] == "dlp_web_search" + assert summary["violation_type"] == "dlp" + assert summary["action"] == "block" + assert summary["entity_counts"] == {"US_SSN": 1} + assert "raw_matches" not in summary or summary["raw_matches"] is None + assert RAW_VALUE not in repr(summary) + + +def test_upload_review_summary_has_distinct_type_and_no_raw_values(): + """Upload review payloads 
should be distinctly typed and counts-only.""" + print("Testing safe upload DLP review summary...") + from functions_dlp import build_dlp_review_event_summary, evaluate_upload_content + + result = evaluate_upload_content( + f"Document chunk has {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_review_destination": "safety_violations", + "upload_dlp_track_review_events": True, + }, + context={"document_id": "doc-1", "workspace_scope": "group"}, + ) + summary = build_dlp_review_event_summary( + result, + surface="upload", + context={"document_id": "doc-1", "workspace_scope": "group"}, + ) + + assert summary["policy_type"] == "dlp_upload" + assert summary["violation_type"] == "dlp" + assert summary["action"] == "redact" + assert summary["entity_counts"] == {"US_SSN": 1} + assert RAW_VALUE not in repr(summary) + + +if __name__ == "__main__": + tests = [ + test_review_destination_defaults_to_none, + test_review_summary_has_dlp_type_and_no_raw_values, + test_upload_review_summary_has_distinct_type_and_no_raw_values, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP review event tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_dlp_telemetry.py b/functional_tests/test_dlp_telemetry.py new file mode 100644 index 00000000..95b2485c --- /dev/null +++ b/functional_tests/test_dlp_telemetry.py @@ -0,0 +1,210 @@ +# test_dlp_telemetry.py +#!/usr/bin/env python3 +""" +Functional test for safe DLP telemetry. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures DLP telemetry properties include bounded decision metadata +without raw matched values, raw prompts, raw web-search queries, raw chunk text, +or raw filenames. 
+""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +FUNCTIONS_DLP_FILE = os.path.join(APP_DIR, "functions_dlp.py") +sys.path.insert(0, APP_DIR) + + +RAW_TEXT = "Search for 123-45-6789 in the confidential roadmap" +RAW_FILENAME = "alice-123-45-6789-roadmap.txt" + + +def test_telemetry_properties_are_counts_only(): + """Telemetry should contain safe bounded DLP properties only.""" + print("Testing DLP telemetry safety...") + from functions_dlp import build_dlp_telemetry_properties, evaluate_dlp_text + + result = evaluate_dlp_text( + RAW_TEXT, + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "redact"}, + surface="web_search", + ) + properties = build_dlp_telemetry_properties( + result, + surface="web_search", + context={ + "conversation_id": "conversation-123", + "chat_type": "user", + "workspace_scope": "personal", + "file_name": RAW_FILENAME, + "raw_text": RAW_TEXT, + }, + ) + + assert properties["activity_type"] == "dlp_decision" + assert properties["dlp_surface"] == "web_search" + assert properties["dlp_action"] == "redact" + assert properties["dlp_engine"] == "regex" + assert properties["dlp_mode"] == "redact" + assert properties["workspace_scope"] == "personal" + assert properties["scanner_status"] == "ok" + assert properties["dlp_total_replacements"] == 1 + assert properties["dlp_entity_counts"] == {"US_SSN": 1} + + serialized = repr(properties) + forbidden = [ + "123-45-6789", + "confidential roadmap", + RAW_TEXT, + RAW_FILENAME, + "[REDACTED_US_SSN]", + ] + for value in forbidden: + assert value not in serialized, f"Unsafe telemetry value leaked: {value}" + + +def test_scanner_error_telemetry_is_safe(): + """Scanner failure telemetry should avoid source text and raw errors.""" + print("Testing scanner error telemetry safety...") + from functions_dlp import build_dlp_telemetry_properties + + result = { + "enabled": True, + 
"engine": "regex", + "mode": "block", + "decision": "block", + "scanner_status": "error", + "text": RAW_TEXT, + "redacted_text": RAW_TEXT, + "total_replacements": 0, + "match_counts": {}, + "matches": [], + "metadata": {"error": "service saw 123-45-6789 before timeout"}, + } + + properties = build_dlp_telemetry_properties( + result, + surface="web_search", + context={"raw_text": RAW_TEXT, "file_name": RAW_FILENAME}, + ) + + assert properties["scanner_status"] == "error" + assert "scanner_error" in properties + serialized = repr(properties) + assert "123-45-6789" not in serialized + assert RAW_TEXT not in serialized + assert RAW_FILENAME not in serialized + + +def test_upload_dlp_telemetry_is_safe(): + """Upload DLP telemetry should include counts and no raw chunk text.""" + print("Testing upload DLP telemetry safety...") + from functions_dlp import build_dlp_telemetry_properties, evaluate_upload_content + + result = evaluate_upload_content( + RAW_TEXT, + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + }, + context={"document_id": "doc-1", "workspace_scope": "public"}, + ) + properties = build_dlp_telemetry_properties( + result, + surface="upload", + context={"document_id": "doc-1", "workspace_scope": "public", "raw_text": RAW_TEXT}, + ) + + assert properties["dlp_surface"] == "upload" + assert properties["dlp_action"] == "redact" + assert properties["workspace_scope"] == "public" + assert properties["dlp_entity_counts"] == {"US_SSN": 1} + assert RAW_TEXT not in repr(properties) + assert "123-45-6789" not in repr(properties) + + +def test_scanner_error_log_avoids_raw_traceback_capture(): + """Scanner exception logging should not send traceback text to telemetry.""" + print("Testing scanner error log traceback safety...") + with open(FUNCTIONS_DLP_FILE, "r", encoding="utf-8") as file_handle: + source = file_handle.read() + + scanner_error_index = source.find('"[DLP] Scanner error"') + traceback_index = 
source.find("exceptionTraceback=False", scanner_error_index) + error_type_index = source.find('"error_type": type(exc).__name__', scanner_error_index) + + assert scanner_error_index != -1 + assert traceback_index > scanner_error_index + assert error_type_index > scanner_error_index + assert "exceptionTraceback=True" not in source[scanner_error_index:traceback_index] + + +def test_monitor_detections_emit_telemetry_by_default(): + """Monitor-mode detections should emit telemetry even when allow sampling is disabled.""" + print("Testing monitor-mode DLP telemetry emission...") + from functions_dlp import evaluate_dlp_text, should_emit_dlp_telemetry + + result = evaluate_dlp_text( + RAW_TEXT, + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "monitor"}, + surface="web_search", + ) + + assert result["decision"] == "monitor" + assert result["match_counts"] == {"US_SSN": 1} + assert result["total_replacements"] == 1 + assert should_emit_dlp_telemetry(result, settings={}) is True + assert should_emit_dlp_telemetry( + result, + settings={"dlp_telemetry_sample_allow_events": False}, + ) is True + + +def test_clean_allow_telemetry_respects_sampling_default(): + """Clean allow events should stay silent unless allow sampling is enabled.""" + print("Testing clean allow DLP telemetry sampling...") + from functions_dlp import evaluate_dlp_text, should_emit_dlp_telemetry + + result = evaluate_dlp_text( + "Search for public weather forecast", + settings={"enable_dlp_control_plane": True, "web_search_dlp_mode": "monitor"}, + surface="web_search", + ) + + assert result["decision"] == "allow" + assert result["match_counts"] == {} + assert result["total_replacements"] == 0 + assert should_emit_dlp_telemetry(result, settings={}) is False + assert should_emit_dlp_telemetry( + result, + settings={"dlp_telemetry_sample_allow_events": False}, + ) is False + + +if __name__ == "__main__": + tests = [ + test_telemetry_properties_are_counts_only, + 
test_scanner_error_telemetry_is_safe, + test_upload_dlp_telemetry_is_safe, + test_scanner_error_log_avoids_raw_traceback_capture, + test_monitor_detections_emit_telemetry_by_default, + test_clean_allow_telemetry_respects_sampling_default, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} DLP telemetry tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_upload_dlp_ingestion_integration.py b/functional_tests/test_upload_dlp_ingestion_integration.py new file mode 100644 index 00000000..877c6b85 --- /dev/null +++ b/functional_tests/test_upload_dlp_ingestion_integration.py @@ -0,0 +1,718 @@ +# test_upload_dlp_ingestion_integration.py +#!/usr/bin/env python3 +""" +Functional test for upload DLP ingestion integration. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures upload DLP blocks stop before embeddings/search indexing and +redacted text is the only text passed into embedding/index payload construction. 
+""" + +import ast +import os +import sys +import types +from datetime import datetime, timezone +from pathlib import Path +from typing import List + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +FUNCTIONS_DOCUMENTS_FILE = os.path.join(APP_DIR, "functions_documents.py") +FUNCTIONS_AUTHENTICATION_FILE = os.path.join(APP_DIR, "functions_authentication.py") +FUNCTIONS_DOCUMENTS = Path(FUNCTIONS_DOCUMENTS_FILE) +FUNCTIONS_AUTHENTICATION = Path(FUNCTIONS_AUTHENTICATION_FILE) +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def extract_function_source(source_text, function_name): + parsed = ast.parse(source_text, filename=FUNCTIONS_DOCUMENTS_FILE) + for node in ast.walk(parsed): + if isinstance(node, ast.FunctionDef) and node.name == function_name: + return ast.get_source_segment(source_text, node) + raise AssertionError(f"Function {function_name} not found") + + +def import_functions_documents_for_helper_tests(): + """Import functions_documents with lightweight stubs for optional app dependencies.""" + stub_modules = { + "config": types.ModuleType("config"), + "functions_content": types.ModuleType("functions_content"), + "functions_settings": types.ModuleType("functions_settings"), + "functions_search": types.ModuleType("functions_search"), + "functions_logging": types.ModuleType("functions_logging"), + "functions_authentication": types.ModuleType("functions_authentication"), + "functions_debug": types.ModuleType("functions_debug"), + "functions_keyvault": types.ModuleType("functions_keyvault"), + "azure": types.ModuleType("azure"), + "azure.cognitiveservices": types.ModuleType("azure.cognitiveservices"), + "azure.cognitiveservices.speech": types.ModuleType("azure.cognitiveservices.speech"), + } + stub_modules["config"].List = List + 
stub_modules["config"].datetime = datetime + stub_modules["config"].timezone = timezone + stub_modules["functions_settings"].get_settings = lambda: {} + stub_modules["functions_logging"].add_file_task_to_file_processing_log = lambda **kwargs: None + stub_modules["functions_logging"].log_event = lambda *args, **kwargs: None + stub_modules["functions_keyvault"].SecretReturnType = types.SimpleNamespace(VALUE="value") + stub_modules["functions_keyvault"].keyvault_model_endpoint_get_helper = lambda endpoint, return_type=None: endpoint + + original_modules = {module_name: sys.modules.get(module_name) for module_name in stub_modules} + try: + sys.modules.pop("functions_documents", None) + for module_name, module_stub in stub_modules.items(): + sys.modules[module_name] = module_stub + import functions_documents + finally: + for module_name, original_module in original_modules.items(): + if original_module is None: + sys.modules.pop(module_name, None) + else: + sys.modules[module_name] = original_module + + return functions_documents + + +def import_functions_authentication_for_helper_tests(): + """Import functions_authentication with lightweight stubs for optional app dependencies.""" + config_stub = types.ModuleType("config") + config_stub.AZURE_ENVIRONMENT = "public" + config_stub.CUSTOM_RESOURCE_MANAGER_URL_VALUE = "" + config_stub.DEFAULT_VIDEO_INDEXER_ARM_API_VERSION = "2024-01-01" + config_stub.OIDC_METADATA_URL = "https://login.example/.well-known/openid-configuration" + config_stub.AUDIENCE = "audience" + config_stub.ISSUER = "issuer" + config_stub.requests = types.SimpleNamespace() + config_stub.requests.exceptions = types.SimpleNamespace(RequestException=Exception) + config_stub.jwt = types.SimpleNamespace() + config_stub.DefaultAzureCredential = lambda: None + + stub_modules = { + "config": config_stub, + "functions_settings": types.ModuleType("functions_settings"), + "functions_debug": types.ModuleType("functions_debug"), + } + 
stub_modules["functions_debug"].debug_print = lambda *args, **kwargs: None + + original_modules = {module_name: sys.modules.get(module_name) for module_name in stub_modules} + try: + sys.modules.pop("functions_authentication", None) + for module_name, module_stub in stub_modules.items(): + sys.modules[module_name] = module_stub + import functions_authentication + finally: + for module_name, original_module in original_modules.items(): + if original_module is None: + sys.modules.pop(module_name, None) + else: + sys.modules[module_name] = original_module + + return functions_authentication + + +def test_upload_helper_blocks_before_returning_to_ingestion_paths(): + """The shared upload DLP evaluator should raise before callers can embed blocked text.""" + print("Testing upload DLP block gate...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + helper_source = extract_function_source(source, "_evaluate_upload_dlp_text") + + record_index = helper_source.find("_record_upload_dlp_result(") + block_index = helper_source.find('if not result.get("upload_allowed", True):') + raise_index = helper_source.find('raise ValueError("Upload content blocked by DLP policy.")') + return_index = helper_source.find("return result") + + assert record_index != -1, "DLP result should be recorded before block handling" + assert block_index > record_index, "Block gate should run after safe metadata is recorded" + assert raise_index > block_index, "Blocked upload should raise a policy error" + assert return_index > raise_index, "Allowed result should return only after the block gate" + + +def test_single_chunk_uses_sanitized_text_for_embedding_and_indexing(): + """save_chunks should generate embeddings and search documents from sanitized text.""" + print("Testing single chunk sanitized text flow...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + save_chunks_source = extract_function_source(source, "save_chunks") + + dlp_index = save_chunks_source.find("_evaluate_upload_dlp_text(") 
+ sanitized_index = save_chunks_source.find('sanitized_chunk_text = upload_dlp_result.get("sanitized_text", enhanced_chunk_text)') + embedding_index = save_chunks_source.find("generate_embedding(sanitized_chunk_text)") + index_payload_index = save_chunks_source.find('"chunk_text": sanitized_chunk_text') + + assert dlp_index != -1 + assert sanitized_index > dlp_index + assert embedding_index > sanitized_index + assert index_payload_index > embedding_index + assert "generate_embedding(page_text_content)" not in save_chunks_source + assert RAW_VALUE not in save_chunks_source + + +def test_batch_chunks_use_sanitized_text_for_batch_embeddings_and_indexing(): + """save_chunks_batch should batch only sanitized chunk text.""" + print("Testing batch chunk sanitized text flow...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + batch_source = extract_function_source(source, "save_chunks_batch") + + dlp_index = batch_source.find("_evaluate_upload_dlp_text(") + metadata_sanitize_index = batch_source.find("metadata, _metadata_dlp_summary = _sanitize_upload_metadata_for_dlp(") + author_index = batch_source.find("author = ensure_list(metadata.get('authors'))") + title_index = batch_source.find("title = metadata.get('title', '')") + sanitized_index = batch_source.find("sanitized_chunk_info['page_text_content']") + texts_index = batch_source.find("texts = [c['page_text_content'] for c in sanitized_chunks_data]") + embedding_index = batch_source.find("generate_embeddings_batch(texts)") + payload_index = batch_source.find('"chunk_text": enhanced_chunk_text') + + assert dlp_index != -1 + assert metadata_sanitize_index != -1 + assert author_index > metadata_sanitize_index + assert title_index > metadata_sanitize_index + assert sanitized_index > dlp_index + assert texts_index > sanitized_index + assert embedding_index > texts_index + assert payload_index > embedding_index + assert '"author": author' in batch_source + assert '"title": title' in batch_source + assert "texts = 
[c['page_text_content'] for c in chunks_data]" not in batch_source + assert "dlp_metadata" in batch_source + + +def test_batch_chunk_vision_text_is_not_reappended_after_dlp_redaction(): + """save_chunks_batch should index sanitized chunk text without raw vision text.""" + print("Testing batch chunk vision text DLP redaction before indexing...") + functions_documents = import_functions_documents_for_helper_tests() + + uploaded_batches = [] + embedded_texts = [] + + class FakeSearchClient: + def upload_documents(self, documents): + uploaded_batches.append(documents) + + original_get_settings = functions_documents.get_settings + original_get_document_metadata = functions_documents.get_document_metadata + original_update_document = getattr(functions_documents, "update_document", None) + original_clients = getattr(functions_documents, "CLIENTS", None) + original_functions_content = sys.modules.get("functions_content") + + functions_documents.get_settings = lambda: { + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 200000, + } + functions_documents.get_document_metadata = lambda **kwargs: { + "version": 1, + "authors": ["Author"], + "title": "Document", + "document_classification": "None", + "tags": [], + "shared_user_ids": [], + "vision_analysis": { + "model": "vision-model", + "text": f"badge SSN {RAW_VALUE}", + }, + } + functions_documents.update_document = lambda **kwargs: None + functions_documents.CLIENTS = {"search_client_user": FakeSearchClient()} + + def fake_generate_embeddings_batch(texts): + embedded_texts.extend(texts) + return [([0.1, 0.2, 0.3], {"total_tokens": 1, "prompt_tokens": 1}) for _ in texts] + + functions_content_stub = types.ModuleType("functions_content") + functions_content_stub.generate_embeddings_batch = fake_generate_embeddings_batch + sys.modules["functions_content"] = functions_content_stub + + try: + 
functions_documents.save_chunks_batch( + [ + { + "page_text_content": "Safe page content.", + "page_number": 1, + "file_name": "vision.pdf", + } + ], + user_id="user-1", + document_id="doc-vision", + ) + finally: + functions_documents.get_settings = original_get_settings + functions_documents.get_document_metadata = original_get_document_metadata + if original_functions_content is None: + sys.modules.pop("functions_content", None) + else: + sys.modules["functions_content"] = original_functions_content + if original_update_document is None: + delattr(functions_documents, "update_document") + else: + functions_documents.update_document = original_update_document + if original_clients is None: + delattr(functions_documents, "CLIENTS") + else: + functions_documents.CLIENTS = original_clients + + assert uploaded_batches + indexed_chunk_text = uploaded_batches[0][0]["chunk_text"] + assert RAW_VALUE not in indexed_chunk_text + assert RAW_VALUE not in repr(embedded_texts) + assert indexed_chunk_text == embedded_texts[0] + assert "badge SSN [REDACTED_US_SSN]" in indexed_chunk_text + + +def test_video_chunks_use_sanitized_transcript_and_ocr_text(): + """save_video_chunk should sanitize transcript and OCR text before embedding/search.""" + print("Testing video chunk sanitized text flow...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "save_video_chunk") + + transcript_dlp_index = video_source.find("transcript_dlp_result = _evaluate_upload_dlp_text(") + transcript_sanitized_index = video_source.find( + 'sanitized_transcript_text = transcript_dlp_result.get("sanitized_text", page_text_content)' + ) + ocr_dlp_index = video_source.find("ocr_dlp_result = _evaluate_upload_dlp_text(") + ocr_sanitized_index = video_source.find( + 'sanitized_ocr_text = ocr_dlp_result.get("sanitized_text", ocr_chunk_text)' + ) + embedding_index = video_source.find("generate_embedding(sanitized_transcript_text)") + transcript_payload_index = 
video_source.find('"chunk_text": sanitized_transcript_text') + ocr_payload_index = video_source.find('"video_ocr_chunk_text": sanitized_ocr_text') + + assert transcript_dlp_index != -1 + assert transcript_sanitized_index > transcript_dlp_index + assert ocr_dlp_index > transcript_sanitized_index + assert ocr_sanitized_index > ocr_dlp_index + assert embedding_index > ocr_sanitized_index + assert transcript_payload_index > embedding_index + assert ocr_payload_index > embedding_index + + +def test_video_chunks_preserve_public_workspace_scope(): + """Video chunks should use the public workspace metadata/search path when supplied.""" + print("Testing video chunk public workspace scope...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "save_video_chunk") + process_video_source = extract_function_source(source, "process_video_document") + + assert "public_workspace_id=None" in video_source + assert "is_public_workspace = public_workspace_id is not None" in video_source + assert "public_workspace_id=public_workspace_id" in video_source + assert 'chunk["public_workspace_id"] = public_workspace_id' in video_source + assert 'CLIENTS["search_client_public"]' in video_source + assert "save_video_chunk(" in process_video_source + assert "public_workspace_id=public_workspace_id" in process_video_source + + +def test_video_dlp_block_errors_abort_processing(): + """Video processing should not swallow upload DLP block decisions.""" + print("Testing video DLP block propagation...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "save_video_chunk") + process_video_source = extract_function_source(source, "process_video_document") + + save_chunk_call_index = process_video_source.find("save_video_chunk(") + catch_index = process_video_source.find("except Exception as e:", save_chunk_call_index) + dlp_guard_index = process_video_source.find('if str(e) == "Upload content blocked by 
DLP policy.":', catch_index) + raise_index = process_video_source.find("raise", dlp_guard_index) + log_index = process_video_source.find("Failed to save chunk", catch_index) + + assert 'if str(e) == "Upload content blocked by DLP policy.":' in video_source + assert save_chunk_call_index != -1 + assert catch_index > save_chunk_call_index + assert dlp_guard_index > catch_index + assert raise_index > dlp_guard_index + assert log_index > raise_index + + +def test_audio_chunks_preserve_public_workspace_scope(): + """Audio transcript chunks should pass public workspace scope through save_chunks.""" + print("Testing audio chunk public workspace scope...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + audio_source = extract_function_source(source, "process_audio_document") + + save_chunks_index = audio_source.find("save_chunks(") + public_scope_index = audio_source.find("public_workspace_id=public_workspace_id", save_chunks_index) + + assert save_chunks_index != -1 + assert public_scope_index > save_chunks_index + + +def test_media_processing_logs_do_not_emit_raw_detector_text(): + """Media processors should log counts and lengths, not raw transcript/insight bodies.""" + print("Testing media processing log safety...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "process_video_document") + audio_source = extract_function_source(source, "process_audio_document") + + forbidden_video = [ + "RAW INSIGHTS", + "insights_json", + "json.dumps(insights", + "TRANSCRIPT sample", + "OCR sample", + "KEYWORDS sample", + "sample:", + "First speech item: {speech_context[0]}", + "using insights as text: {chunk_text[:100]}", + "chunk_text[:100]", + ] + forbidden_audio = [ + "Recognized: {evt.result.text}", + "Recognized: {result.text}", + ] + + for snippet in forbidden_video: + assert snippet not in video_source, f"Unsafe video log remains: {snippet}" + for snippet in forbidden_audio: + assert snippet not in audio_source, 
f"Unsafe audio log remains: {snippet}" + + +def test_upload_dlp_metadata_is_counts_only(): + """Upload DLP metadata should store summaries, not raw detector matches.""" + print("Testing upload DLP metadata safety...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + f"employee ssn {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + + metadata = result["dlp_metadata"] + assert metadata["entity_counts"] == {"US_SSN": 1} + assert metadata["total_replacements"] == 1 + assert RAW_VALUE not in repr(metadata) + assert "matches" not in metadata + assert "raw_matches" not in metadata + + +def test_upload_dlp_enforcement_disables_enhanced_citation_blob_upload(): + """Enforced upload DLP should disable raw enhanced-citation blob upload.""" + print("Testing upload DLP enhanced-citation enforcement...") + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + assert "_should_disable_enhanced_citations_for_upload_dlp" in source + assert "enable_enhanced_citations = False" in source + assert "upload_dlp_mode" in source + assert 'settings.get("upload_dlp_fail_upload_on_match", False)' in source + assert 'settings.get("dlp_fail_closed_on_scanner_error", True)' in source + + upload_source = extract_function_source(source, "process_document_upload_background") + helper_source = extract_function_source(source, "_should_disable_enhanced_citations_for_upload_dlp") + conditional = "disabled_enhanced_citations_for_upload_dlp = (" + conditional_index = upload_source.find(conditional) + disable_index = upload_source.find("enable_enhanced_citations = False", conditional_index) + status_index = upload_source.find("Enhanced citations disabled because upload DLP enforcement is active") + dispatch_args_index = upload_source.find("args = {") + process_handler_indices = [ + index + for index in ( + 
upload_source.find("process_txt("), + upload_source.find("process_xml("), + upload_source.find("process_yaml("), + upload_source.find("process_log("), + upload_source.find("process_doc("), + upload_source.find("process_html("), + upload_source.find("process_md("), + upload_source.find("process_json("), + upload_source.find("process_tabular("), + upload_source.find("process_video_document("), + upload_source.find("process_audio_document("), + upload_source.find("process_di_document("), + ) + if index != -1 + ] + + assert conditional_index != -1 + assert 'settings.get("dlp_fail_closed_on_scanner_error", True)' in helper_source + assert 'settings.get("upload_dlp_fail_upload_on_match", False)' in helper_source + assert "return True" in helper_source + assert disable_index > conditional_index + assert status_index > disable_index + assert dispatch_args_index > disable_index + assert '"enable_enhanced_citations": enable_enhanced_citations' in upload_source + assert process_handler_indices + assert disable_index < min(process_handler_indices) + assert "enable_enhanced_citations=enable_enhanced_citations" in upload_source + + video_source = extract_function_source(source, "process_video_document") + audio_source = extract_function_source(source, "process_audio_document") + assert "enable_enhanced_citations=False" in video_source + assert "enable_enhanced_citations=False" in audio_source + assert 'if enable_enhanced_citations:' in video_source + assert 'if enable_enhanced_citations:' in audio_source + assert 'settings.get("enable_enhanced_citations", False)' not in video_source + assert 'settings.get("enable_enhanced_citations", False)' not in audio_source + + +def test_upload_metadata_sanitizer_redacts_counts_only_metadata(): + """Upload metadata sanitizer should redact raw values and return counts only.""" + print("Testing upload metadata DLP sanitizer...") + functions_documents = import_functions_documents_for_helper_tests() + + original_get_settings = 
functions_documents.get_settings + functions_documents.get_settings = lambda: { + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 200000, + } + + try: + metadata = { + "title": "Roadmap 123-45-6789", + "authors": ["Alice 123-45-6789"], + "organization": "Org", + "publication_date": "06/2026", + "keywords": ["SSN 123-45-6789"], + "abstract": "Contains 123-45-6789", + } + + sanitized, summary = functions_documents._sanitize_upload_metadata_for_dlp( + metadata, + user_id="user-1", + document_id="doc-1", + ) + finally: + functions_documents.get_settings = original_get_settings + + assert "123-45-6789" not in repr(sanitized) + assert summary["entity_counts"]["US_SSN"] >= 1 + assert "raw_matches" not in repr(summary) + + +def test_upload_metadata_logs_use_safe_counts_and_lengths(): + """Metadata retrieval and extraction logs should not write raw metadata bodies.""" + print("Testing upload metadata log safety...") + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + get_metadata_source = extract_function_source(source, "get_document_metadata") + summary_source = extract_function_source(source, "_upload_metadata_log_summary") + + assert "Document metadata retrieved: {document_items}" not in get_metadata_source + assert "item_count: {len(document_items)}" in get_metadata_source + assert '"field_lengths"' in summary_source + assert "Final metadata for document {document_id}: {meta_data}" not in source + assert "Decoded JSON from GPT response for document {document_id}: {gpt_output}" not in source + + +def test_initial_di_metadata_is_sanitized_before_update_callback(): + """DI file properties should be sanitized before first metadata persistence.""" + print("Testing initial DI metadata sanitization...") + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + di_source = extract_function_source(source, "process_di_document") + + metadata_fields_index = 
di_source.find("metadata_update_fields = {") + sanitize_index = di_source.find("_sanitize_upload_metadata_for_dlp(") + update_index = di_source.find("update_callback(**update_fields)") + dlp_reraise_index = di_source.find('if str(e) == "Upload content blocked by DLP policy.":') + warning_index = di_source.find("Warning: Failed to extract initial metadata") + + assert metadata_fields_index != -1 + assert sanitize_index > metadata_fields_index + assert update_index > sanitize_index + assert dlp_reraise_index > update_index + assert warning_index > dlp_reraise_index + + +def test_video_indexer_upload_params_do_not_log_access_token(): + """Video Indexer upload logging should not include raw account tokens.""" + print("Testing Video Indexer upload parameter log safety...") + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + video_source = extract_function_source(source, "process_video_document") + + assert '"accessToken": token' in video_source + assert 'debug_print(f"[VIDEO INDEXER] Upload params: {params}")' not in video_source + assert 'debug_print(f"[VIDEO INDEXER] Index polling URL: {index_url}")' not in video_source + assert "accessToken_present={bool(token)}" in video_source + assert "name_length={len(original_filename or '')}" in video_source + assert "Upload params keys" in video_source + assert "Index polling request prepared" in video_source + assert "video_id_length={len(str(vid or ''))}" in video_source + + +def test_video_indexer_request_errors_redact_access_token(): + """Video Indexer request exceptions should redact token-bearing URLs before logging.""" + print("Testing Video Indexer request error redaction...") + functions_documents = import_functions_documents_for_helper_tests() + source = FUNCTIONS_DOCUMENTS.read_text(encoding="utf-8") + video_source = extract_function_source(source, "process_video_document") + + query_error = ( + "403 Client Error: Forbidden for url: " + "https://video.example/Index?accessToken=opaque-token&other=value" + ) 
+ dict_error = "{'accessToken': 'opaque-token', 'name': 'example.mp4'}" + + redacted_query = functions_documents._sanitize_video_indexer_log_value(query_error) + redacted_dict = functions_documents._sanitize_video_indexer_log_value(dict_error) + + assert "opaque-token" not in redacted_query + assert "accessToken=[REDACTED]" in redacted_query + assert "other=value" in redacted_query + assert "opaque-token" not in redacted_dict + assert "[REDACTED]" in redacted_dict + assert "Authentication failed: {str(e)}" not in video_source + assert "AUTH ERROR: {e}" not in video_source + assert "auth failed → {e}" not in video_source + assert "Upload request failed: {str(e)}" not in video_source + assert "Poll request failed: {str(e)}" not in video_source + assert "Upload response text: {resp.text}" not in video_source + assert "No video ID in response: {response_data}" not in video_source + assert "_sanitize_video_indexer_log_value(e)" in video_source + assert "_sanitize_video_indexer_log_value(resp.text)" in video_source + assert "_sanitize_video_indexer_log_value(e.response.text)" in video_source + + +def test_video_indexer_auth_errors_redact_access_token(): + """Video Indexer auth response logging should redact token-bearing bodies.""" + print("Testing Video Indexer auth response redaction...") + functions_authentication = import_functions_authentication_for_helper_tests() + source = FUNCTIONS_AUTHENTICATION.read_text(encoding="utf-8") + auth_source = extract_function_source(source, "get_video_indexer_managed_identity_token") + + response_body = '{"accessToken":"opaque-token","expiresIn":"3600"}' + query_error = ( + "400 Client Error: Bad Request for url: " + "https://management.example/generateAccessToken?accessToken=opaque-token" + ) + + redacted_body = functions_authentication._sanitize_video_indexer_auth_log_value(response_body) + redacted_query = functions_authentication._sanitize_video_indexer_auth_log_value(query_error) + + assert "opaque-token" not in redacted_body + 
assert "opaque-token" not in redacted_query + assert "[REDACTED]" in redacted_body + assert "accessToken=[REDACTED]" in redacted_query + assert "ARM API response text: {resp.text}" not in auth_source + assert "ERROR: No accessToken in response: {response_data}" not in auth_source + assert "ERROR in ARM API request: {str(e)}" not in auth_source + assert "Error response text: {e.response.text}" not in auth_source + assert "_sanitize_video_indexer_auth_log_value(resp.text)" in auth_source + assert "_sanitize_video_indexer_auth_log_value(e.response.text)" in auth_source + + +def test_upload_dlp_document_status_aggregates_worst_result(): + """Document DLP summary should preserve the worst observed status.""" + print("Testing upload DLP document status aggregation...") + functions_documents = import_functions_documents_for_helper_tests() + + aggregate = functions_documents._merge_upload_dlp_document_summary( + existing={ + "status": "accepted_with_redactions", + "entity_counts": {"US_SSN": 1}, + "total_replacements": 1, + }, + incoming={ + "status": "accepted", + "entity_counts": {}, + "total_replacements": 0, + }, + ) + + assert aggregate["status"] == "accepted_with_redactions" + assert aggregate["entity_counts"]["US_SSN"] == 1 + + +def test_upload_dlp_record_merges_with_existing_document_status(): + """Recording a clean field should not downgrade an earlier redacted document status.""" + print("Testing upload DLP record persistence aggregation...") + functions_documents = import_functions_documents_for_helper_tests() + + updates = [] + original_get_settings = functions_documents.get_settings + original_get_document_metadata = functions_documents.get_document_metadata + original_update_document = functions_documents.update_document + functions_documents.get_settings = lambda: { + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + } + functions_documents.get_document_metadata = lambda **kwargs: { + "dlp_status": 
"accepted_with_redactions", + "dlp_metadata": { + "status": "accepted_with_redactions", + "entity_counts": {"US_SSN": 1}, + "total_replacements": 1, + "scanner_status": "ok", + }, + } + functions_documents.update_document = lambda **kwargs: updates.append(kwargs) + + try: + functions_documents._record_upload_dlp_result( + { + "status": "accepted", + "sanitized_text": "clean", + "dlp_metadata": { + "status": "accepted", + "entity_counts": {}, + "total_replacements": 0, + "scanner_status": "ok", + }, + }, + user_id="user-1", + document_id="doc-1", + ) + finally: + functions_documents.get_settings = original_get_settings + functions_documents.get_document_metadata = original_get_document_metadata + functions_documents.update_document = original_update_document + + assert updates + assert updates[0]["dlp_status"] == "accepted_with_redactions" + assert updates[0]["dlp_metadata"]["entity_counts"]["US_SSN"] == 1 + + +if __name__ == "__main__": + tests = [ + test_upload_helper_blocks_before_returning_to_ingestion_paths, + test_single_chunk_uses_sanitized_text_for_embedding_and_indexing, + test_batch_chunks_use_sanitized_text_for_batch_embeddings_and_indexing, + test_batch_chunk_vision_text_is_not_reappended_after_dlp_redaction, + test_video_chunks_use_sanitized_transcript_and_ocr_text, + test_video_chunks_preserve_public_workspace_scope, + test_video_dlp_block_errors_abort_processing, + test_audio_chunks_preserve_public_workspace_scope, + test_media_processing_logs_do_not_emit_raw_detector_text, + test_upload_dlp_metadata_is_counts_only, + test_upload_dlp_enforcement_disables_enhanced_citation_blob_upload, + test_upload_metadata_sanitizer_redacts_counts_only_metadata, + test_upload_metadata_logs_use_safe_counts_and_lengths, + test_initial_di_metadata_is_sanitized_before_update_callback, + test_video_indexer_upload_params_do_not_log_access_token, + test_video_indexer_request_errors_redact_access_token, + test_video_indexer_auth_errors_redact_access_token, + 
test_upload_dlp_document_status_aggregates_worst_result, + test_upload_dlp_record_merges_with_existing_document_status, + ] + + failures = [] + for test in tests: + try: + test() + except Exception as exc: + failures.append((test.__name__, exc)) + print(f"Test failed: {test.__name__}: {exc}") + import traceback + + traceback.print_exc() + + if failures: + print(f"{len(failures)} of {len(tests)} upload DLP ingestion integration tests failed.") + sys.exit(1) + + print(f"All {len(tests)} upload DLP ingestion integration tests passed.") + sys.exit(0) diff --git a/functional_tests/test_upload_dlp_redaction.py b/functional_tests/test_upload_dlp_redaction.py new file mode 100644 index 00000000..7d53cc25 --- /dev/null +++ b/functional_tests/test_upload_dlp_redaction.py @@ -0,0 +1,257 @@ +# test_upload_dlp_redaction.py +#!/usr/bin/env python3 +""" +Functional test for upload DLP redaction. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures upload DLP redacts chunk text before embeddings and Azure AI +Search indexing, hardens raw chunk logs, stores counts-only metadata, and emits +safe upload telemetry/review summaries. 
+""" + +import ast +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +FUNCTIONS_DOCUMENTS_FILE = os.path.join(APP_DIR, "functions_documents.py") +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def extract_function_source(source_text, function_name): + parsed = ast.parse(source_text, filename=FUNCTIONS_DOCUMENTS_FILE) + for node in ast.walk(parsed): + if isinstance(node, ast.FunctionDef) and node.name == function_name: + return ast.get_source_segment(source_text, node) + raise AssertionError(f"Function {function_name} not found") + + +def test_upload_helper_redacts_or_blocks_with_safe_state(): + """Upload helper should shape redact/block states without raw values.""" + print("Testing upload DLP helper behavior...") + from functions_dlp import evaluate_upload_content + + redact_result = evaluate_upload_content( + f"Chunk contains {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + block_result = evaluate_upload_content( + f"Chunk contains {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "block", + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + + assert redact_result["decision"] == "redact" + assert redact_result["upload_allowed"] is True + assert redact_result["status"] == "accepted_with_redactions" + assert "[REDACTED_US_SSN]" in redact_result["sanitized_text"] + assert RAW_VALUE not in repr(redact_result) + + assert block_result["decision"] == "block" + assert block_result["upload_allowed"] is False + assert block_result["status"] == "blocked" + assert block_result["sanitized_text"] == 
"" + assert RAW_VALUE not in repr(block_result) + + +def test_upload_fail_on_match_overrides_redact_mode(): + """Fail-on-match should block even when mode would otherwise redact.""" + print("Testing upload fail-on-match behavior...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + f"Chunk contains {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "upload_dlp_fail_upload_on_match": True, + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + + assert result["decision"] == "block" + assert result["upload_allowed"] is False + assert result["status"] == "blocked" + assert result["sanitized_text"] == "" + assert RAW_VALUE not in repr(result) + + +def test_upload_dlp_uses_custom_regex_rules(): + """Upload DLP should honor admin-configured regex rules and confidence shaping.""" + print("Testing upload DLP custom regex rules...") + from functions_dlp import evaluate_upload_content + + raw_document_id = "DOC-123456" + result = evaluate_upload_content( + f"Customer document {raw_document_id} is ready", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "document_id", + "label": "Document ID", + "entity_type": "DOCUMENT_ID", + "enabled": True, + "pattern": r"DOC-\d{6}", + "replacement": "[REDACTED_DOCUMENT_ID]", + "surfaces": ["upload"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "low", + "with_keywords": "high", + "keywords": ["document", "customer"], + "window_chars": 32, + "minimum": "high", + }, + } + ], + }, + context={"document_id": "doc-1", "workspace_scope": "personal"}, + ) + + assert result["decision"] == "redact" + assert result["upload_allowed"] is True + assert result["status"] == "accepted_with_redactions" + assert result["sanitized_text"] == "Customer document [REDACTED_DOCUMENT_ID] is ready" + assert 
result["match_counts"] == {"DOCUMENT_ID": 1} + assert raw_document_id not in repr(result) + + +def test_upload_dlp_blocks_when_text_exceeds_scan_limit(): + """Enforced upload DLP should block when text exceeds the scan limit.""" + print("Testing upload DLP scan-limit enforcement...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + "safe prefix " + ("x" * 50) + " 123-45-6789", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "redact", + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 12, + }, + context={"document_id": "doc-scan-limit", "workspace_scope": "personal"}, + ) + + assert result["decision"] == "block" + assert result["upload_allowed"] is False + assert result["status"] == "scanner_failed" + assert result["scanner_status"] == "truncated" + assert result["sanitized_text"] == "" + assert "123-45-6789" not in repr(result) + + +def test_upload_fail_on_match_blocks_truncated_monitor_mode(): + """Fail-on-match is an enforcing upload mode and should block truncated scans.""" + print("Testing upload fail-on-match scan-limit enforcement...") + from functions_dlp import evaluate_upload_content + + result = evaluate_upload_content( + "safe prefix " + ("x" * 50) + " 123-45-6789", + settings={ + "enable_dlp_control_plane": True, + "enable_upload_dlp": True, + "upload_dlp_mode": "monitor", + "upload_dlp_fail_upload_on_match": True, + "dlp_default_engine": "regex", + "dlp_max_scan_chars": 12, + }, + context={"document_id": "doc-scan-limit", "workspace_scope": "personal"}, + ) + + assert result["decision"] == "block" + assert result["upload_allowed"] is False + assert result["status"] == "scanner_failed" + assert result["scanner_status"] == "truncated" + assert result["sanitized_text"] == "" + assert "123-45-6789" not in repr(result) + + +def test_save_chunks_redacts_before_embedding_and_logs_safe_summary(): + """save_chunks should evaluate DLP before generate_embedding and 
avoid raw logs.""" + print("Testing save_chunks DLP ordering and log safety...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + save_chunks_source = extract_function_source(source, "save_chunks") + + assert "from functions_dlp import" in source + assert "evaluate_upload_content" in source + assert "build_upload_dlp_file_log_summary" in source + assert save_chunks_source.find("_evaluate_upload_dlp_text(") < save_chunks_source.find("generate_embedding(") + assert "generate_embedding(page_text_content)" not in save_chunks_source + assert "page_text_content:{page_text_content}" not in save_chunks_source + assert "chunk_text\": sanitized" in save_chunks_source or "chunk_text\": enhanced_chunk_text" in save_chunks_source + assert "dlp_metadata" in save_chunks_source + + +def test_save_chunks_batch_redacts_before_batch_embedding(): + """save_chunks_batch should sanitize each chunk before batch embeddings.""" + print("Testing save_chunks_batch DLP ordering...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + batch_source = extract_function_source(source, "save_chunks_batch") + + assert batch_source.find("_evaluate_upload_dlp_text(") < batch_source.find("generate_embeddings_batch(") + assert "texts = [c['page_text_content'] for c in chunks_data]" not in batch_source + assert "sanitized_chunks_data" in batch_source + assert "dlp_metadata" in batch_source + + +def test_save_video_chunk_redacts_transcript_and_ocr_before_embedding_and_indexing(): + """save_video_chunk should sanitize transcript and OCR before embedding/search.""" + print("Testing save_video_chunk DLP ordering...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + video_source = extract_function_source(source, "save_video_chunk") + + assert video_source.find("_evaluate_upload_dlp_text(") < video_source.find("generate_embedding(") + assert "generate_embedding(page_text_content)" not in video_source + assert '"chunk_text": sanitized_transcript_text' in video_source + assert 
'"video_ocr_chunk_text": sanitized_ocr_text' in video_source + + +if __name__ == "__main__": + tests = [ + test_upload_helper_redacts_or_blocks_with_safe_state, + test_upload_fail_on_match_overrides_redact_mode, + test_upload_dlp_uses_custom_regex_rules, + test_upload_dlp_blocks_when_text_exceeds_scan_limit, + test_upload_fail_on_match_blocks_truncated_monitor_mode, + test_save_chunks_redacts_before_embedding_and_logs_safe_summary, + test_save_chunks_batch_redacts_before_batch_embedding, + test_save_video_chunk_redacts_transcript_and_ocr_before_embedding_and_indexing, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} upload DLP redaction tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_upload_dlp_workspace_scopes.py b/functional_tests/test_upload_dlp_workspace_scopes.py new file mode 100644 index 00000000..58db25c0 --- /dev/null +++ b/functional_tests/test_upload_dlp_workspace_scopes.py @@ -0,0 +1,79 @@ +# test_upload_dlp_workspace_scopes.py +#!/usr/bin/env python3 +""" +Functional test for upload DLP workspace scope coverage. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures personal, group, public, and external public upload routes +continue using the shared document processing path protected by upload DLP. 
+""" + +import os +import sys + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") + + +ROUTE_FILES = { + "personal": os.path.join(APP_DIR, "route_backend_documents.py"), + "group": os.path.join(APP_DIR, "route_backend_group_documents.py"), + "public": os.path.join(APP_DIR, "route_backend_public_documents.py"), + "external_public": os.path.join(APP_DIR, "route_external_public_documents.py"), +} +FUNCTIONS_DOCUMENTS_FILE = os.path.join(APP_DIR, "functions_documents.py") + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def test_upload_routes_remain_present_for_all_workspace_scopes(): + """All supported upload route files should expose upload endpoints.""" + print("Testing upload route coverage...") + expectations = { + "personal": "/api/documents/upload", + "group": "/api/group_documents/upload", + "public": "/api/public_documents/upload", + "external_public": "/external/public_documents/upload", + } + + for scope, route_file in ROUTE_FILES.items(): + source = read_file_text(route_file) + assert expectations[scope] in source, f"Missing upload route for {scope}" + assert "process_document_upload_background" in source, f"{scope} route does not use shared processing" + + +def test_shared_processing_path_carries_workspace_scope_to_upload_dlp(): + """functions_documents should map upload DLP context for personal/group/public scopes.""" + print("Testing upload DLP workspace context...") + source = read_file_text(FUNCTIONS_DOCUMENTS_FILE) + + assert "workspace_scope" in source + assert '"personal"' in source + assert '"group"' in source + assert '"public"' in source + assert "public_workspace_id" in source + assert "group_id" in source + + +if __name__ == "__main__": + tests = [ + test_upload_routes_remain_present_for_all_workspace_scopes, + test_shared_processing_path_carries_workspace_scope_to_upload_dlp, + ] + + 
try: + for test in tests: + test() + print(f"All {len(tests)} upload DLP workspace scope tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_web_search_current_message_only.py b/functional_tests/test_web_search_current_message_only.py index 409ba628..557ff8b0 100644 --- a/functional_tests/test_web_search_current_message_only.py +++ b/functional_tests/test_web_search_current_message_only.py @@ -1,7 +1,7 @@ # test_web_search_current_message_only.py """ Functional test for current-message-only web search egress. -Version: 0.241.046 +Version: 0.242.075 Implemented in: 0.241.008 This test ensures external web search uses only the current user message, @@ -66,7 +66,10 @@ def test_perform_web_search_uses_explicit_outbound_query_and_empty_metadata(): perform_source = extract_function_source(source, 'perform_web_search') assert 'web_search_query_text,' in perform_source - assert 'query_text = (web_search_query_text or user_message or "").strip()' in perform_source + assert 'web_search_query_text or user_message' not in perform_source + assert 'query_text = (web_search_query_text or "").strip()' in perform_source + assert 'debug_print("[WebSearch] Empty approved web-search query; skipping Foundry call")' in perform_source + assert 'return True # Not an error, just empty approved query' in perform_source assert 'foundry_metadata = {}' in perform_source metadata_block = perform_source.split('foundry_metadata = {}', 1)[1].split( @@ -148,4 +151,4 @@ def test_web_search_adds_foundry_citations_to_source_review_seeds(): test() print(f'\nšŸ“Š Results: {len(tests)}/{len(tests)} tests passed') - sys.exit(0) \ No newline at end of file + sys.exit(0) diff --git a/functional_tests/test_web_search_dlp_egress.py b/functional_tests/test_web_search_dlp_egress.py new file mode 100644 index 00000000..13f3a0ac --- /dev/null +++ 
b/functional_tests/test_web_search_dlp_egress.py @@ -0,0 +1,390 @@ +# test_web_search_dlp_egress.py +#!/usr/bin/env python3 +""" +Functional test for web-search DLP egress. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures web-search DLP runs after current-message query construction +and before Foundry web-search execution, blocks sensitive egress, redacts when +configured, and avoids raw query debug logging when DLP is enabled. +""" + +import ast +import os +import sys +from unittest.mock import patch + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +ROUTE_FILE = os.path.join(APP_DIR, "route_backend_chats.py") +sys.path.insert(0, APP_DIR) + + +RAW_VALUE = "123-45-6789" + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def extract_function_source(source_text, function_name): + parsed = ast.parse(source_text, filename=ROUTE_FILE) + for node in ast.walk(parsed): + if isinstance(node, ast.FunctionDef) and node.name == function_name: + return ast.get_source_segment(source_text, node) + raise AssertionError(f"Function {function_name} not found") + + +def test_route_imports_and_calls_dlp_before_web_search(): + """Both chat paths should evaluate DLP before perform_web_search.""" + print("Testing web-search DLP call ordering...") + source = read_file_text(ROUTE_FILE) + + assert "from functions_dlp import evaluate_web_search_egress" in source + assert source.count("evaluate_web_search_egress(") >= 2 + + non_stream_slice = source[source.find("web_search_query_text = build_web_search_query_text(user_message)"):] + non_stream_slice = non_stream_slice[:non_stream_slice.find("perform_web_search(") + len("perform_web_search(")] + assert "evaluate_web_search_egress(" in non_stream_slice + + streaming_start = source.find("def chat_stream_api") + streaming_source = source[streaming_start:] + streaming_slice 
= streaming_source[ + streaming_source.find("web_search_query_text = build_web_search_query_text(user_message)") : + ] + streaming_slice = streaming_slice[:streaming_slice.find("perform_web_search(") + len("perform_web_search(")] + assert "evaluate_web_search_egress(" in streaming_slice + + +def test_dlp_helper_blocks_or_redacts_web_search_text(): + """DLP helper should block or redact web-search egress text.""" + print("Testing web-search DLP helper behavior...") + from functions_dlp import evaluate_web_search_egress + + block_result = evaluate_web_search_egress( + f"Search the web for employee SSN {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "block", + }, + context={"chat_type": "user"}, + ) + redact_result = evaluate_web_search_egress( + f"Search the web for employee SSN {RAW_VALUE}", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + }, + context={"chat_type": "user"}, + ) + + assert block_result["decision"] == "block" + assert block_result["web_search_allowed"] is False + assert block_result["status_message"] == ( + "Web search was blocked because the message appears to contain non-public information." + ) + assert redact_result["decision"] == "redact" + assert redact_result["web_search_allowed"] is True + assert "[REDACTED_US_SSN]" in redact_result["web_search_query_text"] + assert RAW_VALUE not in redact_result["web_search_query_text"] + assert redact_result["status_message"] == "Sensitive details were removed before web search." 
+ + +def test_custom_regex_rule_can_redact_web_search_in_redact_mode(): + """Custom regex rules can redact web search when a high-confidence policy rule matches.""" + print("Testing custom regex rule web-search behavior...") + from functions_dlp import evaluate_web_search_egress + + result = evaluate_web_search_egress( + "Search employee EID-123456", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_regex_rules": [ + { + "id": "employee_id", + "label": "Employee ID", + "entity_type": "EMPLOYEE_ID", + "enabled": True, + "pattern": r"EID-\d{6}", + "replacement": "[REDACTED_EMPLOYEE_ID]", + "surfaces": ["web_search"], + "flags": [], + "validator": "none", + "confidence": { + "regex_only": "low", + "with_keywords": "high", + "keywords": ["employee"], + "window_chars": 32, + "minimum": "high" + } + } + ], + }, + context={"chat_type": "user"}, + ) + + assert result["decision"] == "redact" + assert result["web_search_allowed"] is True + assert result["web_search_query_text"] == "Search employee [REDACTED_EMPLOYEE_ID]" + assert "EID-123456" not in repr(result) + + +def test_scanner_error_fails_closed_by_default(): + """Scanner errors must not allow web-search egress by default.""" + import functions_dlp + + def fail_scan(text, settings, surface="generic"): + raise RuntimeError("scanner unavailable") + + with patch.object(functions_dlp, "_apply_regex_engine", fail_scan): + result = functions_dlp.evaluate_dlp_text( + "send 123-45-6789 to web", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_default_engine": "regex", + }, + surface="web_search", + ) + egress_result = functions_dlp.evaluate_web_search_egress( + "send 123-45-6789 to web", + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_default_engine": "regex", + }, + context={"chat_type": "user"}, + ) + + 
assert result["scanner_status"] == "error" + assert egress_result["web_search_allowed"] is False + assert egress_result["web_search_query_text"] == "" + assert "123-45-6789" not in repr(egress_result) + assert result["decision"] == "block" + assert result["text"] == "" + + +def test_blocked_status_continues_normal_chat_without_foundry_web_search(): + """Route source should add safe augmentation instead of calling web search when blocked.""" + print("Testing blocked web-search safe augmentation...") + source = read_file_text(ROUTE_FILE) + assert "Web search was blocked because the message appears to contain non-public information." in source + assert "Sensitive details were removed before web search." in source + assert "web_search_allowed" in source + + +def test_perform_web_search_debug_logging_masks_dlp_queries(): + """perform_web_search should avoid raw query/result debug logging when DLP is enabled.""" + print("Testing web-search debug logging safety...") + source = read_file_text(ROUTE_FILE) + perform_source = extract_function_source(source, "perform_web_search") + citation_source = extract_function_source(source, "_extract_web_search_citations_from_content") + + forbidden = [ + "web_search_query_text[:100]", + "user_message[:100]", + "query_text[:100]", + "result.message[:500]", + "json.dumps(cit", + "json.dumps(citation", + "metadata_payload", + "Metadata: {result.metadata}", + "'search_query': query_text", + '"search_query": query_text', + "Adding agent citation with title", + "Foundry agent invocation failed: {exc}", + "Unexpected error invoking Foundry agent: {exc}", + "Web search failed with error: {exc}", + "Web search failed with an unexpected error: {exc}", + "exceptionTraceback=True", + "Failed to log web search token usage: {log_error}", + ] + for snippet in forbidden: + assert snippet not in perform_source, f"Unsafe debug logging remains: {snippet}" + + assert "Extracting citations from:\\n{content}" not in citation_source + assert " - {citations}" 
not in citation_source + assert "dlp" in perform_source.lower() + assert "query_length" in perform_source or "text_length" in perform_source + assert "search_query_length" in perform_source + + +def test_deep_research_planned_queries_are_rechecked_before_web_search(): + """Deep Research planner output should be DLP-checked before Foundry web search.""" + print("Testing Deep Research planned query DLP enforcement...") + source = read_file_text(ROUTE_FILE) + function_source = extract_function_source(source, "perform_research_web_searches") + + recorded = {"planner_user_message": None, "queries": []} + + def fake_build_deep_research_query_plan(**kwargs): + recorded["planner_user_message"] = kwargs.get("user_message") + return { + "queries": [ + { + "query": f"Find records for employee SSN {RAW_VALUE}", + "reason": "planner included sensitive source text", + "source": "planner", + } + ] + } + + def fake_perform_web_search(**kwargs): + recorded["queries"].append(kwargs["web_search_query_text"]) + + from functions_dlp import evaluate_web_search_egress + + namespace = { + "build_deep_research_query_plan": fake_build_deep_research_query_plan, + "perform_web_search": fake_perform_web_search, + "evaluate_web_search_egress": evaluate_web_search_egress, + "should_emit_dlp_telemetry": lambda *args, **kwargs: False, + "log_event": lambda *args, **kwargs: None, + "build_dlp_telemetry_properties": lambda *args, **kwargs: {}, + "WEB_SEARCH_DLP_BLOCKED_STATUS": ( + "Web search was blocked because the message appears to contain non-public information." 
+ ), + } + exec(compile(function_source, ROUTE_FILE, "exec"), namespace) + + namespace["perform_research_web_searches"]( + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "redact", + "dlp_default_engine": "regex", + }, + conversation_id="conv-1", + user_id="user-1", + user_message=f"Search for employee SSN {RAW_VALUE}", + user_message_id="msg-1", + chat_type="user", + document_scope="personal", + active_group_id=None, + active_public_workspace_id=None, + web_search_query_text="Search for employee SSN [REDACTED_US_SSN]", + system_messages_for_augmentation=[], + agent_citations_list=[], + web_search_citations_list=[], + deep_research_enabled=True, + deep_research_planner_client=object(), + deep_research_planner_model="planner", + ) + + assert RAW_VALUE not in recorded["planner_user_message"] + assert recorded["queries"] == ["Find records for employee SSN [REDACTED_US_SSN]"] + assert RAW_VALUE not in repr(recorded) + + +def test_deep_research_blocked_planned_queries_do_not_call_web_search(): + """Deep Research should skip planner queries when the per-query DLP check blocks.""" + print("Testing Deep Research planned query block enforcement...") + source = read_file_text(ROUTE_FILE) + function_source = extract_function_source(source, "perform_research_web_searches") + + recorded_queries = [] + system_messages = [] + + def fake_build_deep_research_query_plan(**kwargs): + return { + "queries": [ + { + "query": f"Find records for employee SSN {RAW_VALUE}", + "reason": "planner included sensitive source text", + "source": "planner", + } + ] + } + + def fake_perform_web_search(**kwargs): + recorded_queries.append(kwargs["web_search_query_text"]) + + from functions_dlp import evaluate_web_search_egress + + namespace = { + "build_deep_research_query_plan": fake_build_deep_research_query_plan, + "perform_web_search": fake_perform_web_search, + "evaluate_web_search_egress": evaluate_web_search_egress, + 
"should_emit_dlp_telemetry": lambda *args, **kwargs: False, + "log_event": lambda *args, **kwargs: None, + "build_dlp_telemetry_properties": lambda *args, **kwargs: {}, + "WEB_SEARCH_DLP_BLOCKED_STATUS": ( + "Web search was blocked because the message appears to contain non-public information." + ), + } + exec(compile(function_source, ROUTE_FILE, "exec"), namespace) + + result = namespace["perform_research_web_searches"]( + settings={ + "enable_dlp_control_plane": True, + "enable_web_search_dlp": True, + "web_search_dlp_mode": "block", + "dlp_default_engine": "regex", + }, + conversation_id="conv-1", + user_id="user-1", + user_message="Search for employee SSN [REDACTED_US_SSN]", + user_message_id="msg-1", + chat_type="user", + document_scope="personal", + active_group_id=None, + active_public_workspace_id=None, + web_search_query_text="Search for employee SSN [REDACTED_US_SSN]", + system_messages_for_augmentation=system_messages, + agent_citations_list=[], + web_search_citations_list=[], + deep_research_enabled=True, + deep_research_planner_client=object(), + deep_research_planner_model="planner", + ) + + assert recorded_queries == [] + assert result["web_search_runs"] == [] + assert system_messages == [{"role": "system", "content": namespace["WEB_SEARCH_DLP_BLOCKED_STATUS"]}] + assert RAW_VALUE not in repr(result) + assert RAW_VALUE not in repr(system_messages) + + +def test_token_usage_extraction_logs_metadata_shape_only(): + """Token usage validation should not log raw provider usage metadata.""" + print("Testing web-search token usage extraction log safety...") + source = read_file_text(ROUTE_FILE) + token_source = extract_function_source(source, "_extract_token_usage_from_metadata") + + assert "usage={usage}" not in token_source + assert "usage_keys={list(usage.keys())}" in token_source + + +if __name__ == "__main__": + tests = [ + test_route_imports_and_calls_dlp_before_web_search, + test_dlp_helper_blocks_or_redacts_web_search_text, + 
test_custom_regex_rule_can_redact_web_search_in_redact_mode, + test_scanner_error_fails_closed_by_default, + test_blocked_status_continues_normal_chat_without_foundry_web_search, + test_perform_web_search_debug_logging_masks_dlp_queries, + test_deep_research_planned_queries_are_rechecked_before_web_search, + test_deep_research_blocked_planned_queries_do_not_call_web_search, + test_token_usage_extraction_logs_metadata_shape_only, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} web-search DLP egress tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/functional_tests/test_web_search_dlp_route_integration.py b/functional_tests/test_web_search_dlp_route_integration.py new file mode 100644 index 00000000..369b46ef --- /dev/null +++ b/functional_tests/test_web_search_dlp_route_integration.py @@ -0,0 +1,143 @@ +# test_web_search_dlp_route_integration.py +#!/usr/bin/env python3 +""" +Functional test for web-search DLP route integration. +Version: 0.242.075 +Implemented in: 0.242.073 + +This test ensures chat routes evaluate DLP before Foundry web search, suppress +Foundry calls on block, and send only the redacted query on redact. 
+""" + +import os +import sys +from pathlib import Path + + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +APP_DIR = os.path.join(ROOT_DIR, "application", "single_app") +ROUTE_FILE = os.path.join(APP_DIR, "route_backend_chats.py") +ROUTE_BACKEND_CHATS = Path(ROUTE_FILE) + + +def read_file_text(path): + with open(path, "r", encoding="utf-8") as file_handle: + return file_handle.read() + + +def web_search_dlp_blocks(source_text): + marker = "web_search_dlp_result = evaluate_web_search_egress(" + blocks = [] + start = 0 + while True: + marker_index = source_text.find(marker, start) + if marker_index == -1: + break + perform_index = source_text.find("perform_web_search(", marker_index) + if perform_index == -1: + raise AssertionError("Found DLP evaluation without a later perform_web_search call") + block_start = source_text.rfind("if web_search_enabled:", 0, marker_index) + block_end = source_text.find(")", perform_index) + blocks.append(source_text[block_start:block_end]) + start = perform_index + len("perform_web_search(") + return blocks + + +def extract_top_level_function_source(source_text, function_name): + marker = f"def {function_name}(" + start = source_text.find(marker) + if start == -1: + raise AssertionError(f"Function {function_name} not found") + + next_function = source_text.find("\ndef ", start + len(marker)) + if next_function == -1: + return source_text[start:] + return source_text[start:next_function] + + +def test_dlp_guard_exists_in_both_chat_paths(): + """Both streaming and non-streaming routes should have a DLP-guarded web-search block.""" + print("Testing web-search DLP guarded blocks...") + source = read_file_text(ROUTE_FILE) + blocks = web_search_dlp_blocks(source) + + assert len(blocks) == 2, f"Expected two web-search DLP route blocks, found {len(blocks)}" + + +def test_blocked_dlp_result_suppresses_foundry_call(): + """Blocked DLP decisions should append a safe system message instead of calling Foundry.""" + 
print("Testing blocked DLP route behavior...") + source = read_file_text(ROUTE_FILE) + + for block in web_search_dlp_blocks(source): + dlp_index = block.find("web_search_dlp_result = evaluate_web_search_egress(") + block_decision_index = block.find('if not web_search_dlp_result.get("web_search_allowed", True):') + blocked_status_index = block.find("WEB_SEARCH_DLP_BLOCKED_STATUS") + else_index = block.find("else:", block_decision_index) + perform_index = block.find("perform_web_search(") + + assert dlp_index != -1 + assert block_decision_index > dlp_index + assert blocked_status_index > block_decision_index + assert else_index > blocked_status_index + assert perform_index > else_index, "Foundry web search must stay in the allowed else branch" + + +def test_redacted_query_is_forwarded_to_foundry(): + """Allowed/redacted DLP decisions should replace the query before Foundry invocation.""" + print("Testing redacted query forwarding...") + source = read_file_text(ROUTE_FILE) + + expected_assignment = ( + 'web_search_query_text = web_search_dlp_result.get("web_search_query_text", web_search_query_text)' + ) + for block in web_search_dlp_blocks(source): + assignment_index = block.find(expected_assignment) + perform_index = block.find("perform_web_search(") + query_argument_index = block.find("web_search_query_text=web_search_query_text", perform_index) + + assert assignment_index != -1, "Route must replace raw query with DLP-safe query" + assert perform_index > assignment_index, "Foundry call must occur after DLP-safe query assignment" + assert query_argument_index > perform_index, "Foundry call must receive the DLP-safe query variable" + + +def test_route_emits_counts_only_dlp_telemetry(): + """Route telemetry should use the shared counts-only telemetry builder.""" + print("Testing route DLP telemetry integration...") + source = read_file_text(ROUTE_FILE) + + for block in web_search_dlp_blocks(source): + assert "should_emit_dlp_telemetry(web_search_dlp_result, settings)" 
in block + assert "build_dlp_telemetry_properties(" in block + assert 'surface="web_search"' in block + assert "raw_matches" not in block + + +def test_perform_web_search_does_not_fallback_to_raw_user_message(): + source = ROUTE_BACKEND_CHATS.read_text(encoding="utf-8") + perform_source = extract_top_level_function_source(source, "perform_web_search") + + assert "web_search_query_text or user_message" not in perform_source + assert "query_text = (web_search_query_text or \"\").strip()" in perform_source + + +if __name__ == "__main__": + tests = [ + test_dlp_guard_exists_in_both_chat_paths, + test_blocked_dlp_result_suppresses_foundry_call, + test_redacted_query_is_forwarded_to_foundry, + test_route_emits_counts_only_dlp_telemetry, + test_perform_web_search_does_not_fallback_to_raw_user_message, + ] + + try: + for test in tests: + test() + print(f"All {len(tests)} web-search DLP route integration tests passed.") + sys.exit(0) + except Exception as exc: + print(f"Test failed: {exc}") + import traceback + + traceback.print_exc() + sys.exit(1) diff --git a/tools/local_dev/render_dlp_admin_preview.py b/tools/local_dev/render_dlp_admin_preview.py new file mode 100644 index 00000000..d70e13ca --- /dev/null +++ b/tools/local_dev/render_dlp_admin_preview.py @@ -0,0 +1,91 @@ +# render_dlp_admin_preview.py +#!/usr/bin/env python3 +""" +Extract the DLP admin settings card from a captured SimpleChat admin page. + +Usage: + python tools/local_dev/render_dlp_admin_preview.py .codex-local/admin-settings.html .codex-local +""" + +import sys +from pathlib import Path + +from bs4 import BeautifulSoup + + +def _wrap_preview(section_html): + return f""" + + + + + + + + + +
+{section_html} +
+ + +""" + + +def _expand_dlp_controls(section): + for checkbox_id in [ + "enable_dlp_control_plane", + "enable_web_search_dlp", + "enable_upload_dlp", + ]: + node = section.select_one(f"#{checkbox_id}") + if node: + node["checked"] = "" + + for visible_id in [ + "dlp_control_plane_settings", + "web_search_dlp_mode_settings", + "upload_dlp_mode_settings", + ]: + node = section.select_one(f"#{visible_id}") + if node and node.has_attr("class"): + node["class"] = [class_name for class_name in node.get("class", []) if class_name != "d-none"] + + +def render_previews(source_path, output_dir): + source_html = source_path.read_text(encoding="utf-8") + soup = BeautifulSoup(source_html, "html.parser") + section = soup.select_one("#dlp-section") + if section is None: + raise ValueError("Could not find #dlp-section in captured admin settings HTML.") + + output_dir.mkdir(parents=True, exist_ok=True) + collapsed_path = output_dir / "admin-dlp-preview.html" + expanded_path = output_dir / "admin-dlp-preview-expanded.html" + + collapsed_path.write_text(_wrap_preview(str(section)), encoding="utf-8") + _expand_dlp_controls(section) + expanded_path.write_text(_wrap_preview(str(section)), encoding="utf-8") + return collapsed_path, expanded_path + + +def main(argv): + if len(argv) != 3: + print("Usage: render_dlp_admin_preview.py ") + return 2 + + source_path = Path(argv[1]) + output_dir = Path(argv[2]) + collapsed_path, expanded_path = render_previews(source_path, output_dir) + print(f"Wrote {collapsed_path}") + print(f"Wrote {expanded_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tools/local_dev/run_dlp_local_stack.md b/tools/local_dev/run_dlp_local_stack.md new file mode 100644 index 00000000..8bb45877 --- /dev/null +++ b/tools/local_dev/run_dlp_local_stack.md @@ -0,0 +1,134 @@ +# DLP Local Stack Smoke Runbook + +## Purpose + +Use this runbook to render the DLP admin settings UI against a disposable local Cosmos DB emulator without using 
Azure-hosted Cosmos. + +## Ports + +- Cosmos gateway: `9081` +- Cosmos health: `9082` +- Cosmos explorer: `1235` +- SimpleChat Flask dev server: `5000` + +Port `8081` is intentionally avoided because local proxy tools may already bind it. + +## Start Cosmos + +```bash +docker run --detach --name simplechat-cosmos-dlp --publish 9081:8081 --publish 9082:8080 --publish 1235:1234 mcr.microsoft.com/cosmosdb/linux/azure-cosmos-emulator:vnext-latest --gateway-endpoint localhost:9081 +``` + +## Verify Cosmos + +```bash +curl.exe -sS http://localhost:9082/status +``` + +The health endpoint should show PostgreSQL and Explorer as healthy. When +`--gateway-endpoint localhost:9081` is used, the container-internal gateway +probe can report unhealthy because it checks the host-advertised port from +inside the container. Use the SDK smoke test below as the authoritative check. + +## SDK Smoke Test + +Run this from the repository root after the Python environment is created: + +```bash +.venv\Scripts\python.exe -c "from azure.cosmos import CosmosClient, PartitionKey; key=''; c=CosmosClient('http://localhost:9081/', credential=key); db=c.create_database_if_not_exists('SimpleChatSmoke'); con=db.create_container_if_not_exists(id='smoke', partition_key=PartitionKey(path='/id')); con.upsert_item({'id':'ok','value':1}); print(con.read_item('ok', partition_key='ok')['value'])" +``` + +Expected output: + +```text +1 +``` + +## Python Environment + +```bash +python -m venv .venv +.venv\Scripts\python.exe -m pip install --upgrade pip +.venv\Scripts\python.exe -m pip install -r application\single_app\requirements.txt +``` + +## Start SimpleChat + +Set these environment variables before launch: + +```dotenv +AZURE_COSMOS_ENDPOINT=http://localhost:9081/ +AZURE_COSMOS_KEY= +AZURE_COSMOS_AUTHENTICATION_TYPE=key +NO_PROXY=localhost,127.0.0.1,::1 +no_proxy=localhost,127.0.0.1,::1 +FLASK_DEBUG=1 +SIMPLECHAT_USE_GUNICORN=0 +SIMPLECHAT_RUN_BACKGROUND_TASKS=0 +DISABLE_FLASK_INSTRUMENTATION=1 
+CLIENT_ID=local-dev-client +TENANT_ID=local-dev-tenant +MICROSOFT_PROVIDER_AUTHENTICATION_SECRET=replace-me +``` + +Then run: + +```bash +cd application\single_app +..\..\.venv\Scripts\python.exe app.py +``` + +Open: + +```text +https://localhost:5000 +``` + +## Capture The DLP Admin Card + +After authenticating as an admin user, save the rendered admin settings page: + +```bash +curl.exe -k -sS -H "Cookie: session=" https://localhost:5000/admin/settings -o .codex-local/admin-settings.html +``` + +Then extract the DLP section for a focused visual review: + +```bash +python tools/local_dev/render_dlp_admin_preview.py .codex-local/admin-settings.html .codex-local +``` + +The script writes: + +- `.codex-local/admin-dlp-preview.html` +- `.codex-local/admin-dlp-preview-expanded.html` + +## Known Local Caveats + +- Browser automation may be blocked by Windows group policy. +- If Docker Desktop stops, the Flask process can keep serving cached pages while Cosmos requests fail. +- If another tool owns port `8081`, use `9081` and pass `--gateway-endpoint localhost:9081`. +- Keep `.codex-local/` untracked; it is for local smoke artifacts only. + +## Optional Presidio Analyzer Smoke + +SimpleChat can test the external `presidio_endpoint` engine against the stock local Presidio Analyzer container without adding Presidio dependencies to the SimpleChat app image. + +Run a local Presidio Analyzer container: + +```bash +docker run --rm -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest +``` + +Configure DLP Admin Settings: + +- Default Engine: External Presidio Analyzer endpoint +- Analyzer Endpoint: `http://localhost:5002/analyze` +- Auth Header: `X-DLP-API-Key` +- Secret Env Var: `PRESIDIO_DLP_API_KEY` +- Score Threshold: `0.5` +- Entities: `CREDIT_CARD, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN` + +The stock local container does not require an API key. 
Production deployments should require a private network path plus an API key header at a proxy, wrapper, gateway, or service boundary. The API key value should live in App Service settings or a Key Vault reference; SimpleChat admin settings store only the environment variable name, such as `PRESIDIO_DLP_API_KEY`. + +Then test a web-search or upload input containing harmless synthetic content such as `a@example.com`. In redact mode, SimpleChat should use the spans returned by the Presidio Analyzer to replace the value before egress or indexing. Do not enable raw text logging for SimpleChat, proxies, or analyzer containers while testing.