Skip to content

Commit 73a7002

Browse files
committed
Refactored the ingestion admin and docling servers
1 parent b726eb6 commit 73a7002

104 files changed

Lines changed: 3741 additions & 12732 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.env.example

Lines changed: 51 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1,62 +1,62 @@
1-
# ============================================================================
2-
# OpenAPI HTML Server Environment Configuration
3-
# ============================================================================
4-
# Copy this file to .env and update with your values
5-
6-
# ============================================================================
7-
# OpenAI / LLM API Configuration
8-
# ============================================================================
9-
# For OpenAI
10-
OPENAI_API_KEY=sk-123
11-
OPENAI_BASE_URL=http://wingman.akhbar.home/v1
1+
# ------------------------------------------#LLM_MODEL=gpt-4-turbo
122

13-
# For Ollama (local)
14-
# OPENAI_API_KEY=sk-321
15-
# OPENAI_BASE_URL=http://localhost:11434/v1
3+
# General
4+
# ------------------------------------------
5+
DOCUMENTS_PATH=./docs
166

17-
# For other OpenAI-compatible APIs
18-
# OPENAI_API_KEY=your-key
19-
# OPENAI_BASE_URL=http://your-api-endpoint/v1
7+
# Callback base URL for async parsing
8+
# This should be the externally accessible URL of this admin server
9+
CALLBACK_BASE_URL=http://localhost:8005
2010

21-
# ============================================================================
22-
# LLM Model Configuration
23-
# ============================================================================
24-
LLM_MODEL=gpt-4-turbo
25-
# Alternatives: gpt-3.5-turbo, gpt-4, llama2, mistral, etc.
11+
DOCLING_SERVER=http://docling-server:8001
12+
MONGO_URI=mongodb://mongodb:27017/
13+
POSTGRES_URL=postgresql://postgres:password@pgvector:5432/ingestion
14+
# Async parsing configuration
15+
USE_ASYNC_PARSING=true
2616

27-
# ============================================================================
28-
# Embeddings Configuration
29-
# ============================================================================
30-
EMBEDDING_MODEL=bge-m3
31-
# Alternatives: text-embedding-ada-002, bge-large, etc.
17+
# Worker configuration
18+
# Number of concurrent workers for processing documents
19+
POOL_MIN_SIZE=10
20+
POOL_MAX_SIZE=20
21+
NUM_WORKERS=12
3222

33-
# ============================================================================
34-
# Qdrant Vector Database Configuration
35-
# ============================================================================
36-
QDRANT_HOST=wingman.akhbar.home
37-
QDRANT_PORT=6333
3823
COLLECTION_NAME=fetchcraft_chatbot
3924

40-
# ============================================================================
41-
# Documents Configuration
42-
# ============================================================================
43-
DOCUMENTS_PATH=/app/Documents
44-
45-
# ============================================================================
46-
# Chunking Configuration
47-
# ============================================================================
48-
CHUNK_SIZE=8192
49-
CHUNK_OVERLAP=200
25+
# ------------------------------------------
26+
# Embeddings
27+
# ------------------------------------------
28+
EMBEDDING_MODEL=bge-m3
29+
EMBEDDING_API_KEY=sk-123
30+
EMBEDDING_BASE_URL=http://localhost:8000/v1
5031

51-
# ============================================================================
52-
# Hybrid Search Configuration
53-
# ============================================================================
32+
# ------------------------------------------
33+
# LLM - OpenAI
34+
# ------------------------------------------
35+
OPENAI_API_KEY=sk-123
36+
OPENAI_BASE_URL=http://localhost:8000/v1
37+
LLM_MODEL=gpt-5
38+
39+
# ------------------------------------------
40+
# PostgreSQL
41+
# ------------------------------------------
42+
POSTGRES_USER=postgres
43+
POSTGRES_PASSWORD=password
44+
POSTGRES_DB=fetchcraft
45+
46+
# ------------------------------------------
47+
# Qdrant Vector Store
48+
# ------------------------------------------
49+
QDRANT_HOST=localhost
50+
QDRANT_PORT=6333
5451
ENABLE_HYBRID=true
5552
FUSION_METHOD=rrf
56-
# Alternatives: dbsf (distribution-based score fusion)
5753

58-
# ============================================================================
59-
# Server Configuration
60-
# ============================================================================
61-
HOST=0.0.0.0
62-
PORT=8001
54+
# ------------------------------------------
55+
# Vector Index
56+
# ------------------------------------------
57+
INDEX_ID=docs-index
58+
CHUNK_SIZE=16384
59+
CHILD_CHUNKS=4096,1024
60+
CHUNK_OVERLAP=200
61+
62+
HF_HOME=$HOME/.cache/huggingface

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# Use Python 3.12 slim image
22
FROM python:3.12-slim
33

4-
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
4+
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 htop vim wget -y
55

66
# Set working directory
77
WORKDIR /app

Dockerfile-gpu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
1-
FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
1+
FROM pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
22
#FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime
33

4-
#RUN apt-get update && apt install python3 install ffmpeg libsm6 libxext6 nvtop -y
4+
RUN apt-get update && apt install ffmpeg libsm6 libxext6 nvtop wget htop vim -y
55

66
# Set working directory
77
WORKDIR /app

0 commit comments

Comments
 (0)