# RAGStack/ragstack.config.yaml (SonicStrain/RAGStack, main branch)
---
# Primary LLM settings. `provider` selects one of the backends listed in the
# inline comment on that key.
llm:
  provider: anthropic           # anthropic | openai | gemini | ollama
  model: claude-sonnet-4-6
  max_tokens: 1024
  # Provider-specific settings. Presumably read only when provider: ollama;
  # TODO confirm against the config loader.
  ollama:
    base_url: http://localhost:11434

layers:

  # ── Layer 0: Prompt Optimizer ──────────────────────────────────────────────
  # Strips filler words and verbose phrasing from the user's query BEFORE it
  # enters the rest of the pipeline. Saves tokens on every single LLM call.
  #
  # With backend: rules (the current value) the `llm` sub-block below is not
  # in effect; per its inline comment it applies only when backend: llm.
  optimizer:
    enabled: true
    backend: rules              # rules | llm | passthrough
    report_savings: true        # print token savings to console
    llm:                        # used when backend: llm
      model: claude-haiku-4-5-20251001   # use cheapest model for optimization
      max_tokens: 512

  # ── Layer 1: Semantic Cache ────────────────────────────────────────────────
  # Returns cached answers for semantically similar queries.
  # The optimized query is used as the cache key, so minor wording variations
  # that collapse to the same optimized form share cache entries.
  cache:
    enabled: true
    backend: memory             # memory | redis | qdrant
    # Presumably the minimum similarity score for a cache hit; the metric
    # (e.g. cosine) is not visible in this file. TODO confirm.
    similarity_threshold: 0.92
    # 3600 s = 1 hour. Presumably the lifetime of a cached entry; confirm.
    ttl_seconds: 3600
    # Backend-specific connection settings. Presumably only the sub-block
    # matching `backend` is read; with backend: memory neither would apply.
    redis:
      url: redis://localhost:6379
      index_name: ragstack_cache
    qdrant:
      url: http://localhost:6333
      collection: ragstack_cache

  # ── Layer 2: Query Rewriter ────────────────────────────────────────────────
  # Enriches the query for better retrieval (expand abbreviations, synonyms).
  # Operates on the OPTIMIZED query so rewriting never re-introduces filler.
  #
  # NOTE(review): `model` sits directly under `rewriter`, while the optimizer
  # layer nests its model under an `llm` sub-block. Presumably `model` here
  # applies to backend: llm and `hyde.model` to backend: hyde; confirm against
  # the config loader.
  rewriter:
    enabled: true
    backend: llm                # llm | hyde | passthrough
    model: claude-haiku-4-5-20251001
    # Only `expand` appears in this file; other accepted values are not
    # documented here.
    strategy: expand
    hyde:
      model: claude-haiku-4-5-20251001

  # ── Layer 3: Retriever ─────────────────────────────────────────────────────
  # Selects the document store to retrieve from. `top_k` is presumably the
  # number of documents handed to the next layer; TODO confirm.
  retriever:
    enabled: true
    backend: memory             # graphify | chroma | pinecone | weaviate | memory
    top_k: 8
    # Backend-specific settings follow, one sub-block per backend name.
    # Presumably only the sub-block matching `backend` is read; confirm.
    graphify:
      graph_path: ./graphify-out/graph.json
      report_path: ./graphify-out/GRAPH_REPORT.md
      # Presumably the maximum traversal depth in the graph; confirm.
      hop_limit: 3
      edge_types:
        - calls
        - depends_on
        - semantically_similar_to
        - rationale_for
    chroma:
      host: localhost
      port: 8000
      collection: ragstack_docs
    pinecone:
      index_name: ragstack-index
      environment: us-east-1
    weaviate:
      url: http://localhost:8080
      class_name: Document
    # Inline documents for backend: memory (the current value). Each entry is
    # a `text` passage plus the `source` path it is attributed to.
    # `>-` folds the wrapped lines into a single line and strips the final
    # line break, so `text` holds only the passage with no trailing newline.
    memory:
      docs:
        - text: >-
            The authentication flow begins when a user submits credentials.
            The AuthController receives the request and delegates to AuthService.
            AuthService calls TokenValidator.validate() which checks the JWT signature
            and expiry. On success, a session token is issued and stored in Redis.
          source: auth/auth_service.py
        - text: >-
            TokenValidator is called by AuthService.authenticate() and by the
            middleware layer on every protected route. It imports jwt and uses
            the RS256 public key from config. Validation failures raise
            AuthenticationError which the error handler converts to HTTP 401.
          source: auth/token_validator.py
        - text: >-
            The API gateway routes all /api/* requests through the auth middleware
            before forwarding to downstream services. The middleware extracts the
            Bearer token from the Authorization header and passes it to TokenValidator.
            Rate limiting is applied after successful authentication.
          source: gateway/middleware.py
        - text: >-
            Session tokens are 256-bit random values stored in Redis with a 24-hour
            TTL. The SessionStore class handles creation, lookup, and invalidation.
            Tokens are hashed (SHA-256) before storage so raw tokens never touch disk.
          source: auth/session_store.py

  # ── Layer 4: Compressor / Reranker ────────────────────────────────────────
  # Post-processes the retrieved documents before they reach the LLM.
  # `top_k: 3` is smaller than the retriever's `top_k: 8`, so this layer
  # presumably narrows the retrieved set. NOTE(review): whether `top_k` is
  # honored when backend: passthrough is not visible here; confirm.
  compressor:
    enabled: true
    backend: passthrough        # reranker | llmlingua | passthrough
    top_k: 3
    # Backend-specific settings. Presumably only the sub-block matching
    # `backend` is read; with backend: passthrough neither would apply.
    reranker:
      model: cross-encoder/ms-marco-MiniLM-L-6-v2
      # Presumably documents scoring below this value are dropped; confirm.
      score_threshold: 0.3
    llmlingua:
      # Presumably the target compression ratio; confirm whether 0.4 means
      # "keep 40%" or "remove 40%".
      ratio: 0.4
      model: microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank

  # ── Layer 5: Prompt Cache ──────────────────────────────────────────────────
  # Marks the static system prefix as cacheable so Anthropic/OpenAI can
  # serve it from cache on repeat calls, reducing cost and latency.
  prompt_cache:
    enabled: true
    backend: anthropic          # anthropic | openai | none
    # Literal block scalar (`|`): the line breaks between the three lines and
    # the single final newline are part of the value, so editing whitespace
    # here changes the prefix text that is sent.
    # NOTE(review): provider-side prompt caching generally requires a minimum
    # prefix length. This prefix is only three short lines; confirm that it,
    # together with whatever else the pipeline places in the cached segment,
    # is long enough to actually be cached.
    cached_prefix: |
      You are a precise technical assistant. Answer only from the provided
      context. If the context does not contain enough information, say so.
      Cite the source file or node name for every claim you make.

# Embedding model settings.
# NOTE(review): there is no provider key here, unlike `llm.provider`;
# presumably the loader infers the provider from the model name. Confirm.
embeddings:
  model: text-embedding-3-small
  # Presumably the vector size. It would need to match any vector store
  # collection/index that stores these embeddings; TODO confirm.
  dimensions: 1536