# ===========================================================================
# MyWebIntelligence - Docker Compose Environment Configuration
# ===========================================================================
#
# QUICK START:
# 1. Copy this file: cp .env.example .env
# 2. Configure: python scripts/install-docker-compose.py
# 3. Start services: docker compose up -d --build
# 4. Initialize DB: docker compose exec mwi python mywi.py db setup
#
# OR manually edit .env to add your API keys and settings.
#
# WARNING: Do NOT commit .env to version control (contains sensitive data).
#
# ===========================================================================
# ---------------------------------------------------------------------------
# Build-time toggles (Docker build args)
# ---------------------------------------------------------------------------
# These control what gets installed during the Docker image build.
#
# MYWI_WITH_ML:
# 0 = Basic installation (no ML dependencies)
# 1 = Include ML extras (PyTorch, FAISS, Transformers, Sentence-Transformers)
# Required for: embeddings, semantic search, NLI classification
#
# MYWI_WITH_PLAYWRIGHT_BROWSERS:
# 0 = Skip Playwright browsers (can install later)
# 1 = Pre-install Chromium/Firefox in image (larger image, slower build)
# Required for: dynamic media extraction from JavaScript-heavy sites
MYWI_WITH_ML=0
MYWI_WITH_PLAYWRIGHT_BROWSERS=0
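# Example: to enable ML features (embeddings, semantic search, NLI), flip the
# toggle and rebuild the image with the same command as the Quick Start above:
#   MYWI_WITH_ML=1
#   docker compose up -d --build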
# ---------------------------------------------------------------------------
# Runtime settings (host + container)
# ---------------------------------------------------------------------------
# Timezone for container (affects logs and timestamps)
# Examples: UTC, Europe/Paris, America/New_York, Asia/Tokyo
TZ=UTC
# Host data directory (where database and exports are stored on YOUR computer)
# - Default: ./data (inside repository)
# - macOS/Linux: /Users/you/mywi_data or /home/you/mywi_data
# - Windows: C:/Users/You/mywi_data (use forward slashes)
HOST_DATA_DIR=./data
# Container data directory (internal path, DO NOT CHANGE unless you know what you're doing)
# This is where the app expects to find data inside the Docker container.
MYWI_DATA_DIR=/app/data
# ---------------------------------------------------------------------------
# Application settings (read by settings-example.py via MWI_* env vars)
# ---------------------------------------------------------------------------
# Fill in your API keys and configuration below.
# To configure interactively: python scripts/install-docker-compose.py
#
# IMPORTANT: Never commit .env with real API keys to version control!
# ---------------------------------------------------------------------------
# OpenRouter - AI relevance filtering (optional)
# ---------------------------------------------------------------------------
# Use LLM to automatically filter irrelevant pages during crawling.
# Get API key at: https://openrouter.ai/
# Pricing: pay-per-use; inexpensive with small models like DeepSeek
#
# Enable/disable: true or false
MWI_OPENROUTER_ENABLED=false
# Your API key (get from https://openrouter.ai/keys)
MWI_OPENROUTER_API_KEY=
# Model to use (see options: https://openrouter.ai/models)
# Recommended: deepseek/deepseek-chat-v3.1 (economical)
# Alternatives: openai/gpt-4o-mini, anthropic/claude-3-haiku, google/gemini-1.5-flash
MWI_OPENROUTER_MODEL=deepseek/deepseek-chat-v3.1
# Request timeout in seconds
MWI_OPENROUTER_TIMEOUT=15
# Minimum text length to process (characters)
MWI_OPENROUTER_MIN_CHARS=140
# Maximum text length to send to LLM (characters)
MWI_OPENROUTER_MAX_CHARS=12000
# Maximum LLM calls per crawl/readable run
MWI_OPENROUTER_MAX_CALLS=500
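# Example of an enabled configuration (illustrative only; the key is a
# placeholder, never commit a real one):
#   MWI_OPENROUTER_ENABLED=true
#   MWI_OPENROUTER_API_KEY=<your-openrouter-key>
#   MWI_OPENROUTER_MODEL=deepseek/deepseek-chat-v3.1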
# ---------------------------------------------------------------------------
# SEO Rank - Traffic metrics enrichment (optional)
# ---------------------------------------------------------------------------
# Enrich pages with SEO metrics, traffic estimates, backlinks, social stats.
# Requires API access from your provider.
#
# API endpoint URL
MWI_SEORANK_API_BASE_URL=https://seo-rank.my-addr.com/api2/sr+fb
# Your API key
MWI_SEORANK_API_KEY=
# ---------------------------------------------------------------------------
# SerpAPI - Google search bootstrap (optional)
# ---------------------------------------------------------------------------
# Automatically gather URLs from Google search results to bootstrap research.
# Get API key at: https://serpapi.com/
# Free tier: 100 searches/month
#
# Your API key
MWI_SERPAPI_API_KEY=
# ---------------------------------------------------------------------------
# Embeddings - Paragraph vectors (required for semantic search)
# ---------------------------------------------------------------------------
# Choose a provider to generate embeddings from text paragraphs.
# Required if MYWI_WITH_ML=1
#
# Provider options:
# - fake: Testing only (deterministic random vectors)
# - openai: OpenAI API (text-embedding-3-small, text-embedding-ada-002)
# - mistral: Mistral AI (mistral-embed) [RECOMMENDED - fast and affordable]
# - gemini: Google Gemini (embedding-001)
# - huggingface: HuggingFace Inference API (various models)
# - ollama: Local Ollama (nomic-embed-text, runs on your machine)
# - http: Custom HTTP endpoint
MWI_EMBED_PROVIDER=mistral
# Model name (depends on provider)
MWI_EMBED_MODEL=mistral-embed
# Custom HTTP endpoint (only for provider=http)
MWI_EMBED_API_URL=
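# Example for a custom endpoint (provider=http; URL and model name below are
# placeholders for your own service):
#   MWI_EMBED_PROVIDER=http
#   MWI_EMBED_API_URL=https://your-embedding-server.example.com/embed
#   MWI_EMBED_MODEL=your-model-name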
# API keys for each provider (fill in only the one you're using)
# OpenAI: https://platform.openai.com/api-keys
MWI_OPENAI_API_KEY=
# Mistral: https://console.mistral.ai/
MWI_MISTRAL_API_KEY=
# Google Gemini: https://makersuite.google.com/app/apikey
MWI_GEMINI_API_KEY=
# HuggingFace: https://huggingface.co/settings/tokens
MWI_HF_API_KEY=
# Ollama local API URL (if using Ollama)
MWI_OLLAMA_BASE_URL=http://localhost:11434
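# Note: inside the container, "localhost" refers to the container itself. If
# Ollama runs on the Docker host, host.docker.internal usually works on Docker
# Desktop (macOS/Windows); on Linux you may need an extra_hosts entry in the
# compose file. For example:
#   MWI_OLLAMA_BASE_URL=http://host.docker.internal:11434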
# ---------------------------------------------------------------------------
# Semantic Search & NLI - Relation classification (optional, requires ML)
# ---------------------------------------------------------------------------
# NLI (Natural Language Inference) classifies semantic relations between texts.
# Used for entailment/neutral/contradiction detection in pseudolinks.
# Required if MYWI_WITH_ML=1 and using semantic similarity.
#
# NLI model (HuggingFace model name)
# Recommended: MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 (multilingual)
# Alternative: typeform/distilbert-base-uncased-mnli (English, faster)
MWI_NLI_MODEL_NAME=MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7
# Backend preference: auto, transformers, crossencoder, fallback
MWI_NLI_BACKEND=fallback
# Number of CPU threads for PyTorch (1 = safe default)
MWI_NLI_TORCH_THREADS=1
# Fallback model if primary fails
MWI_NLI_FALLBACK_MODEL_NAME=typeform/distilbert-base-uncased-mnli
# Similarity backend for approximate nearest neighbor (ANN) search
# Options: faiss (fast, recommended) or bruteforce (slower, no extra deps)
# Note: FAISS is installed via the ML extras, so it requires an image built
# with MYWI_WITH_ML=1; otherwise use bruteforce.
MWI_SIMILARITY_BACKEND=faiss
# Number of nearest neighbors to retrieve per paragraph
MWI_SIMILARITY_TOP_K=50
# Classification thresholds (0.0 to 1.0)
MWI_NLI_ENTAILMENT_THRESHOLD=0.8
MWI_NLI_CONTRADICTION_THRESHOLD=0.8