# RAG_Support_Assistant/.env.example (master branch, brownjuly2003-code/RAG_Support_Assistant on GitHub)
# GraceKelly is the default local orchestrator. Start it first (e.g. the checkout at D:\GraceKelly)
# so that it is listening on this URL.
GRACEKELLY_BASE_URL=http://127.0.0.1:8011
GRACEKELLY_API_KEY=
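# NOTE(review): presumably the name of the env var that holds the GraceKelly API key
# (same pattern as RAG_EMBEDDING_REMOTE_API_KEY_ENV) - confirm against the config loader.
GRACEKELLY_API_KEY_ENV=GRACEKELLY_API_KEY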
GRACEKELLY_HEALTH_CHECK_TIMEOUT_SEC=2.0
GRACEKELLY_REQUEST_TIMEOUT_SEC=30.0
FAILOVER_CHAIN_ENABLED=true
FAILOVER_FALLBACK_CACHE_SECONDS=300
# Provider registry and routing profile. `gracekelly-primary` is the default path.
# Use `local-first` only for explicit local-only Ollama mode.
PROVIDER_REGISTRY_PATH=config/providers.yml
LLM_PROVIDER_PROFILE=gracekelly-primary
# Optional Ollama settings for explicit `local-first` mode or GraceKelly fallback.
# In Docker Compose use http://ollama:11434
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_MODEL_NAME=qwen2.5:7b
# Benchmarks stay mock-only unless you explicitly allow paid provider calls.
LLM_BENCHMARK_ALLOW_PAID_APIS=false
# Fail fast when paid-provider spend for the current UTC day reaches this limit.
DAILY_COST_LIMIT_USD=5.0
# Paid-provider credentials. Placeholder values such as `changeme` are treated as missing.
MISTRAL_API_KEY=changeme
# Model routing: fast model for simple questions, strong model for complex ones
MODEL_ROUTING_ENABLED=false
OLLAMA_FAST_MODEL_NAME=llama3.2:3b
# Ingestion auto-categorizer model. Override when the default is not pulled locally.
INGESTION_CATEGORIZER_MODEL=llama3.2:3b
# Default token pricing used when a model is not listed in LLM_MODEL_PRICES.
LLM_INPUT_PRICE_PER_1M_TOKENS=0.0
LLM_OUTPUT_PRICE_PER_1M_TOKENS=0.0
# Optional per-model pricing override for analytics cost calculation.
# Example: {"mistral-small-latest":{"input":0.20,"output":0.60},"ollama-local":{"input":0.0,"output":0.0}}
LLM_MODEL_PRICES=
# Embedding model name used to vectorize documents and queries (local backend)
RAG_EMBEDDING_MODEL=BAAI/bge-m3
# Embedding backend: local (SentenceTransformer on RAG_DEVICE) | remote (OpenAI/Mistral API).
# Remote frees ingest/search from loading the heavy local model (e.g. unblocks Windows).
RAG_EMBEDDING_BACKEND=local
RAG_EMBEDDING_REMOTE_URL=https://api.mistral.ai/v1/embeddings
RAG_EMBEDDING_REMOTE_MODEL=mistral-embed
# Name of the env var holding the remote API key (the key itself is never stored here)
RAG_EMBEDDING_REMOTE_API_KEY_ENV=MISTRAL_API_KEY
RAG_EMBEDDING_REMOTE_BATCH=32
RAG_EMBEDDING_REMOTE_TIMEOUT_SEC=60
# Cross-encoder reranker model used to reorder retrieved documents
# (multilingual, pairs with BGE-M3; ms-marco is English-only and degrades RU retrieval)
RAG_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
# Inference device for embedder + reranker: auto (cuda->mps->cpu) | cpu | cuda | cuda:0 | mps
RAG_DEVICE=auto
# Enable hybrid retrieval that combines BM25 with vector search
RAG_HYBRID_SEARCH=true
# Retrieval strategy: vector, hybrid, or graph (graph falls back to hybrid until configured)
RAG_RETRIEVAL_STRATEGY=hybrid
# Number of candidates to fetch before reranking
RAG_RETRIEVAL_TOP_K=20
# Number of top documents to keep after reranking
RAG_RERANK_TOP_K=5
# Reciprocal Rank Fusion smoothing constant
RRF_K=60
# Prefix length used to deduplicate RRF document keys
RRF_DOC_KEY_CHARS=200
# Minimum quality score required for route=auto in the graph
QUALITY_THRESHOLD=80
# Default chunking parameters used when no explicit chunk config is passed
CHUNK_SIZE=800
CHUNK_OVERLAP=200
# Default page size for admin list endpoints
API_DEFAULT_PAGE_SIZE=50
# Max LLM tool-call loops per turn in the agent (task-107)
AGENT_MAX_TOOL_LOOPS=5
# Confidence threshold below which conversations are escalated to human (task-106)
ESCALATION_THRESHOLD=0.7
# Semantic chunking - splits by semantic similarity instead of fixed size.
# Improves faithfulness by roughly 80%. Requires the embedding model to be loaded.
RAG_SEMANTIC_CHUNKING=true
# Enable HyDE (Hypothetical Document Embeddings) for improved retrieval
RAG_HYDE=false
# Enable Parent-Child chunking (search child chunks, return parent context)
RAG_PARENT_CHILD=false
# Graph-retrieval lane activation gate (off|on|auto). The lane is Phase 2 of
# docs/plans/2026-06-05-graph-retrieval-activation.md; "auto" needs BOTH the
# chunk threshold and the connectivity gate. Decision logged at every ingest.
RAG_GRAPH_RETRIEVAL=off
RAG_GRAPH_MIN_CHUNKS=20000
RAG_GRAPH_MIN_CROSSDOC_SHARE=0.15
# Measured by scripts/graph_probe.py (2026-06-06, 200 chunks / 184 docs): 0.296.
RAG_GRAPH_CROSSDOC_SHARE=0.296
# Optional wall-clock budget (seconds) for one ConversationSession.ask() outside the
# HTTP path; 0 = off. When exceeded, ask() returns a graceful degraded result.
RAG_ASK_BUDGET_SEC=0
# Maximum number of Self-RAG retry iterations
RAG_SELF_RAG_MAX_ITER=2
# Minimum quality score required to avoid escalation or retry
RAG_SELF_RAG_MIN_QUALITY=70
# Streaming /api/ask/stream runs one cheap Self-RAG self-eval for quality parity
# with non-streaming. Set false to roll back to the legacy synthetic-score path.
STREAMING_QUALITY_EVAL=true
# Enable fact verification after answer generation
FACT_VERIFICATION_ENABLED=true
# Minimum factuality score; reserved for future routing/alerting logic
FACT_VERIFICATION_MIN_SCORE=70
# Max retrieved docs used as evidence when verifying answer facts
FACT_VERIFY_CONTEXT_MAX_DOCS=5
# Chars per doc used as fact-verification evidence (aligned with parent-expansion)
FACT_VERIFY_CONTEXT_CHARS_PER_DOC=3600
# Trace duration threshold for adding a case to the review queue
SLOW_TRACE_THRESHOLD_MS=10000
# Master switch for automated review-queue collection and admin endpoints
REVIEW_QUEUE_ENABLED=true
# Master switch for per-trace lightweight online evaluators
ONLINE_EVALUATORS_ENABLED=true
# Per-trace online-evaluator wall-clock budget (seconds); runs that exceed it are dropped.
# When persistence keeps failing (e.g. no Postgres), only the first failure in this
# process is logged at WARNING; identical repeats are logged at DEBUG.
ONLINE_EVALUATORS_TIMEOUT_SEC=1.0
# Regression gate: fail the candidate if curated regressions exceed this count
REGRESSION_GATE_MAX_REGRESSIONS=2
# Regression gate: minimum candidate pass rate required on curated cases
REGRESSION_GATE_MIN_PASS_RATE=0.85
# Vector database backend to use for document storage
RAG_VECTOR_BACKEND=chroma
# Chroma collection prefix; full name = {prefix}_{tenant_id}
VECTORDB_COLLECTION_PREFIX=rag_docs
# Backend used to store escalations for human support
SUPPORT_SINK_BACKEND=local
# Bitrix24 webhook URL for sending escalations when Bitrix backend is enabled
BITRIX_WEBHOOK_URL=
# Telegram bot token (from @BotFather). Leave empty to disable.
TELEGRAM_BOT_TOKEN=
# Langfuse - LLM observability (optional, leave empty to disable)
LANGFUSE_PUBLIC_KEY=
LANGFUSE_SECRET_KEY=
LANGFUSE_HOST=https://cloud.langfuse.com
# Fail fast on startup if Ollama is unavailable. Set true only for explicit local-first mode.
REQUIRE_OLLAMA=false
# Circuit breaker for Ollama - fast-fail when explicit local/fallback Ollama is unhealthy
CIRCUIT_BREAKER_ENABLED=true
CIRCUIT_BREAKER_FAILURE_THRESHOLD=5
CIRCUIT_BREAKER_RESET_TIMEOUT_SEC=30
# Retry for Ollama transient network errors
OLLAMA_RETRY_MAX_ATTEMPTS=3
OLLAMA_RETRY_BASE_DELAY_SEC=0.5
OLLAMA_RETRY_MAX_DELAY_SEC=5.0
OLLAMA_RETRY_JITTER=true
# Timeout for a single Ollama HTTP call (seconds)
OLLAMA_REQUEST_TIMEOUT_SEC=60
# Wall-time limit for one /api/ask request (seconds). Exceeded requests return 504.
REQUEST_TIMEOUT_SEC=30
# Wall-clock budget for the SSE token loop in /api/ask/stream (separate from REQUEST_TIMEOUT_SEC)
STREAMING_TIMEOUT_SEC=120
# Timeout for persisting one conversation message to Postgres before the write is dropped
DB_PERSIST_TIMEOUT_SEC=2.0
# Maximum number of /api/ask pipelines running at once
MAX_CONCURRENT_PIPELINES=8
# How long to wait for a pipeline slot before returning 503 (seconds)
PIPELINE_ACQUIRE_TIMEOUT_SEC=0.5
# Session idle timeout in seconds (default 2 hours)
SESSION_TTL_SECONDS=7200
# Retention for SQLite traces in days. 0 disables automatic purge.
TRACE_RETENTION_DAYS=90
TRACE_PURGE_INTERVAL_SEC=86400
# Retention for audit_log in days. 0 disables automatic purge.
AUDIT_RETENTION_DAYS=180
AUDIT_PURGE_INTERVAL_SEC=86400
# Delay between SIGTERM readiness flip and actual cleanup/shutdown.
# Gives the k8s load balancer time to stop routing new traffic to the pod.
SHUTDOWN_READY_DELAY_SEC=5
# --- PostgreSQL ---
# Replace "changeme" with a strong password before first deploy, in both
# POSTGRES_PASSWORD and the password part of DATABASE_URL.
POSTGRES_PASSWORD=changeme
DATABASE_URL=postgresql://rag:changeme@localhost:5432/rag_assistant

# --- Encryption ---
# Key for AES-256 application-layer encryption of sensitive columns (task-113).
# REQUIRED. Generate: python -c 'import secrets; print(secrets.token_urlsafe(32))'
# Must be rotated on suspected compromise (see docs/operations/backup-restore.md §2.3).
DB_ENCRYPTION_KEY=changeme-generate-with-secrets-token_urlsafe

# --- Redis ---
REDIS_URL=redis://localhost:6379/0
# LLM response cache for repeated support questions. Disabled by default for safe rollout.
LLM_CACHE_ENABLED=false
# TTL for cached /api/ask responses (seconds).
LLM_CACHE_TTL_SECONDS=3600

# API key for protecting /api/ask and /api/upload. Leave empty to disable auth.
API_KEY=
# Environment: development | staging | production
# In production CORS_ORIGINS="*" is forbidden (validate() fails on startup).
RAG_ENV=development
# CORS allowed origins. Comma-separated. Use "*" for dev, specific origins for production.
# Example: CORS_ORIGINS=https://app.example.com,https://admin.example.com
CORS_ORIGINS=*
# Preflight cache TTL (sec). Reduces OPTIONS load.
CORS_MAX_AGE_SEC=600
# Maximum request body size in bytes for non-upload endpoints. Default 1 MiB.
# /api/upload uses a separate MAX_UPLOAD_BYTES limit.
MAX_REQUEST_BODY_BYTES=1048576
# Maximum uploaded file size in bytes. Default 50 MiB.
MAX_UPLOAD_BYTES=52428800
# Prometheus metrics are exposed at GET /metrics (no auth)
# Alerting thresholds (scripts/check_alerts.py)
ALERT_WEBHOOK_URL=
ALERT_ESCALATION_PCT=35
ALERT_QUALITY_MIN=65
ALERT_LOW_QUALITY_PCT=30
ALERT_P95_LATENCY_SEC=12
ALERT_THUMBS_DOWN_PCT=20
ALERT_THUMBS_DOWN_MIN_N=50