-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstart_backend.sh
More file actions
executable file
·604 lines (532 loc) · 19.4 KB
/
start_backend.sh
File metadata and controls
executable file
·604 lines (532 loc) · 19.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
#!/bin/bash
# Backend startup script for YA-PapersWithCode
# This script sets up the Python environment, downloads data, initializes the database, and starts the API server

set -e # Exit on error

echo "=== YA-PapersWithCode Backend Setup ==="
echo

# Interactive mode selection: the choice drives which setup steps below run
# (database download/init, AI model setup) via the $MODE variable.
echo "Select deployment mode:"
echo "1. Local mode (Local AI model + PapersWithCode database)"
echo "2. Model only (Only deploy AI model)"
echo "3. API mode (Check model API + Deploy PapersWithCode database)"
echo

# -r stops read from interpreting backslashes in the user's input
read -r -p "Enter your choice (1-3): " MODE_CHOICE

case "$MODE_CHOICE" in
    1)
        MODE="local"
        echo "Selected: Local mode"
        ;;
    2)
        MODE="model_only"
        echo "Selected: Model only"
        ;;
    3)
        MODE="api_mode"
        echo "Selected: API mode"
        ;;
    *)
        # Anything else (including an empty reply) falls back to local mode
        echo "Invalid choice. Using default local mode."
        MODE="local"
        ;;
esac
echo
# ANSI color codes used by the logging helpers below
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset / no color

# Internal: print a colorized "[TAG] message" line to stdout.
# $1 - color escape, $2 - tag text, $3 - message
_log_tagged() {
    echo -e "${1}[${2}]${NC} ${3}"
}

# Public logging helpers — informational, warning, and error messages.
log_info() {
    _log_tagged "$GREEN" "INFO" "$1"
}

log_warn() {
    _log_tagged "$YELLOW" "WARN" "$1"
}

log_error() {
    _log_tagged "$RED" "ERROR" "$1"
}
# Get script directory (resolves to the directory containing this script)
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
BACKEND_DIR="$SCRIPT_DIR/data/ya-paperswithcode-database"

# Check for .env configuration
ENV_TEMPLATE="$SCRIPT_DIR/.env.template"
ENV_FILE="$SCRIPT_DIR/.env"

if [ -f "$ENV_TEMPLATE" ] && [ ! -f "$ENV_FILE" ]; then
    log_info "Creating .env configuration file..."
    cp "$ENV_TEMPLATE" "$ENV_FILE"

    # Record the selected deployment mode in .env. $MODE is one of three
    # fixed, sed-safe strings, so a single substitution replaces the three
    # previously duplicated per-mode branches.
    sed -i.bak "s/DEPLOYMENT_MODE=.*/DEPLOYMENT_MODE=$MODE/" "$ENV_FILE"
    rm -f "$ENV_FILE.bak"

    log_info "Configuration file created at $ENV_FILE"
    log_info "Please review and update the configuration as needed."
elif [ -f "$ENV_FILE" ]; then
    log_info "Using existing .env configuration file"
else
    log_warn "No .env template found. Using default configuration."
fi

# Change to backend directory
cd "$BACKEND_DIR" || { log_error "Backend directory not found: $BACKEND_DIR"; exit 1; }
log_info "Working directory: $BACKEND_DIR"

# Load .env file variables into environment
if [ -f "$SCRIPT_DIR/.env" ]; then
    log_info "Loading environment variables from .env file..."
    # Export all variables from .env (skipping comments and empty lines).
    # `|| [ -n "$key" ]` keeps the final line even when the file lacks a
    # trailing newline. With IFS='=', everything after the first '=' lands
    # in $value, so values containing '=' survive intact.
    while IFS='=' read -r key value || [ -n "$key" ]; do
        if [[ ! "$key" =~ ^#.*$ ]] && [[ -n "$key" ]]; then
            # Strip surrounding quotes if present
            value=$(echo "$value" | sed -e 's/^"//' -e 's/"$//' -e "s/^'//" -e "s/'$//")
            export "$key=$value"
            # Never echo credential values to the console/log output
            if [[ "$key" =~ (KEY|TOKEN|SECRET|PASSWORD) ]]; then
                log_info "  $key=********"
            else
                log_info "  $key=$value"
            fi
        fi
    done < "$SCRIPT_DIR/.env"
fi
# Check if uv is installed; install it if missing
if ! command -v uv &> /dev/null; then
    log_warn "uv is not installed. Installing uv..."

    # Install uv via the official standalone installer, then extend PATH for
    # the current session. Recent installers place the binary in
    # ~/.local/bin; older ones used ~/.cargo/bin — add both so either works.
    install_uv_with_curl() {
        log_info "Installing uv using curl..."
        curl -LsSf https://astral.sh/uv/install.sh | sh
        export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
    }

    # Detect OS and install uv
    if [[ "$OSTYPE" == "darwin"* ]]; then
        # macOS: prefer Homebrew when available
        if command -v brew &> /dev/null; then
            log_info "Installing uv using Homebrew..."
            brew install uv
        else
            install_uv_with_curl
        fi
    elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
        # Linux
        install_uv_with_curl
    else
        log_error "Unsupported OS: $OSTYPE"
        log_info "Please install uv manually: https://github.com/astral-sh/uv"
        exit 1
    fi

    # Verify installation actually made uv reachable
    if ! command -v uv &> /dev/null; then
        log_error "Failed to install uv. Please install it manually."
        exit 1
    fi
    log_info "uv installed successfully!"
fi
# --- Python environment setup ---

# Create the virtual environment only on first run
if [ ! -d ".venv" ]; then
    log_info "Creating Python virtual environment..."
    uv venv
fi

# Activate the virtual environment for the rest of this script
log_info "Activating virtual environment..."
# shellcheck disable=SC1091
source .venv/bin/activate

# Core web/API dependencies — a failure here is fatal (set -e)
log_info "Installing Python dependencies..."
CORE_DEPS=(fastapi uvicorn pydantic python-multipart aiofiles python-dotenv)
uv pip install "${CORE_DEPS[@]}"

# AI dependencies are best-effort: missing packages only limit AI features
log_info "Installing AI model dependencies..."
AI_DEPS=(requests sentence-transformers torch transformers huggingface_hub)
if ! uv pip install "${AI_DEPS[@]}"; then
    log_warn "Some AI dependencies could not be installed. AI features may be limited."
fi

# Bail out early if any required backend file is absent
for file in api_server.py init_database.py schema.sql; do
    if [ ! -f "$file" ]; then
        log_error "Required file $file is missing!"
        log_info "Please ensure all backend files are present."
        exit 1
    fi
done
# Skip database setup for model_only mode
if [ "$MODE" != "model_only" ]; then
    # Check if the JSON data dumps already exist
    DATA_FILES=("papers-with-abstracts.json" "datasets.json" "methods.json" "evaluation-tables.json" "links-between-papers-and-code.json")
    NEED_DOWNLOAD=false
    for file in "${DATA_FILES[@]}"; do
        if [ ! -f "$file" ]; then
            NEED_DOWNLOAD=true
            break
        fi
    done

    # Download data if needed
    if [ "$NEED_DOWNLOAD" = true ]; then
        log_info "Data files not found. Downloading PapersWithCode data..."

        # Download compressed files
        BASE_URL="https://production-media.paperswithcode.com/about/"
        for file in "${DATA_FILES[@]}"; do
            if [ ! -f "$file" ]; then
                GZ_FILE="${file}.gz"
                if [ ! -f "$GZ_FILE" ]; then
                    log_info "Downloading $GZ_FILE..."
                    # -f makes curl fail on HTTP errors (e.g. 404) instead of
                    # saving an HTML error page under the .gz name. Remove any
                    # partial file so the next run retries cleanly.
                    if ! curl -fL -o "$GZ_FILE" "${BASE_URL}${GZ_FILE}"; then
                        log_warn "Download of $GZ_FILE failed; skipping this file."
                        rm -f "$GZ_FILE"
                        continue
                    fi
                fi
                log_info "Extracting $GZ_FILE..."
                # -k keeps the .gz so re-runs don't re-download; warn (not
                # silently ignore) when extraction fails
                gunzip -k "$GZ_FILE" || log_warn "Could not extract $GZ_FILE"
            fi
        done
    else
        log_info "Data files already exist."
    fi
else
    log_info "Skipping database data download (model_only mode)"
fi
# Database initialization with duplicate checking (skip for model_only mode)

# Remove duplicate rows from the core tables, keeping the first occurrence
# (lowest rowid) within each key group. Safe to run repeatedly; prints a
# summary of anything removed. Replaces two previously duplicated ~50-line
# inline Python snippets.
dedupe_database() {
    python - <<'PYEOF' || log_warn "Could not check for duplicates"
import sqlite3

conn = sqlite3.connect('paperswithcode.db')
cursor = conn.cursor()

# (table name, dedup key) - rows sharing a key are duplicates.
tables = [
    ('papers', 'id'),
    ('datasets', 'id'),
    ('methods', 'id'),
    ('repositories', 'paper_id, repo_url'),
]
for table, key in tables:
    cursor.execute(
        'DELETE FROM {t} WHERE rowid NOT IN '
        '(SELECT MIN(rowid) FROM {t} GROUP BY {k})'.format(t=table, k=key)
    )
    if cursor.rowcount > 0:
        print(f'Removed {cursor.rowcount} duplicate {table}')

conn.commit()
conn.close()
PYEOF
}

# Print the row count of table $1 in paperswithcode.db; prints 0 on any
# failure so callers can always treat the output as a number.
count_rows() {
    python - "$1" <<'PYEOF' 2>/dev/null || echo 0
import sqlite3
import sys

try:
    conn = sqlite3.connect('paperswithcode.db')
    cursor = conn.cursor()
    cursor.execute(f'SELECT COUNT(*) FROM {sys.argv[1]}')
    print(cursor.fetchone()[0])
    conn.close()
except Exception:
    print(0)
PYEOF
}

if [ "$MODE" != "model_only" ] && [ ! -f "paperswithcode.db" ]; then
    log_info "Database not found. Creating new database..."
    python init_database.py

    # Remove duplicates after initial load
    log_info "Removing duplicate entries..."
    dedupe_database
    log_info "Database initialized successfully!"
elif [ "$MODE" != "model_only" ]; then
    log_info "Database already exists."

    # Check if database has data
    log_info "Checking database contents..."
    PAPER_COUNT=$(count_rows papers)
    DATASET_COUNT=$(count_rows datasets)
    # Guard against empty output so the numeric tests below cannot error out
    PAPER_COUNT=${PAPER_COUNT:-0}
    DATASET_COUNT=${DATASET_COUNT:-0}

    if [ "$PAPER_COUNT" -eq "0" ] || [ "$DATASET_COUNT" -eq "0" ]; then
        log_warn "Database exists but is empty. Loading data..."
        python init_database.py

        # After loading, remove duplicates
        log_info "Removing duplicate entries..."
        dedupe_database
        log_info "Data loading complete!"
    else
        log_info "Database already contains data:"
        log_info " Papers: $PAPER_COUNT"
        log_info " Datasets: $DATASET_COUNT"
        log_info "Skipping data load - database is already populated."
    fi
else
    log_info "Skipping database setup (model_only mode)"
fi
# Display database statistics (skip for model_only mode)
if [ "$MODE" != "model_only" ]; then
    log_info "Database Statistics:"
    # Single pass over the four core tables; the quoted heredoc delimiter
    # prevents any shell expansion inside the Python source.
    python - <<'PYEOF' 2>/dev/null || log_warn "Could not retrieve database statistics"
import sqlite3

conn = sqlite3.connect('paperswithcode.db')
cursor = conn.cursor()
for label, table in (('Papers', 'papers'),
                     ('Datasets', 'datasets'),
                     ('Methods', 'methods'),
                     ('Repositories', 'repositories')):
    cursor.execute(f'SELECT COUNT(*) FROM {table}')
    print(f' • {label}: {cursor.fetchone()[0]:,}')
conn.close()
PYEOF
fi
# Setup AI models for Agent Search (skip for api_mode)

# Resolve the model directory: MODEL_PATH from .env when set, otherwise the
# default "checkpoints". Prints the result on stdout. Uses cut -f2- so a
# value that itself contains '=' is not truncated at the first '='.
resolve_model_path() {
    local path="checkpoints"
    local env_path
    if [ -f "$SCRIPT_DIR/.env" ]; then
        env_path=$(grep -E "^MODEL_PATH=" "$SCRIPT_DIR/.env" | cut -d'=' -f2- | tr -d "'" | tr -d '"')
        [ -n "$env_path" ] && path="$env_path"
    fi
    echo "$path"
}

# Return 0 when crawler model weights exist under "$1" in any supported
# layout (single .bin, single .safetensors, or sharded safetensors).
crawler_weights_present() {
    [ -f "$1/pasa-7b-crawler/pytorch_model.bin" ] \
        || [ -f "$1/pasa-7b-crawler/model.safetensors" ] \
        || ls "$1/pasa-7b-crawler"/model-*.safetensors >/dev/null 2>&1
}

# Return 0 when selector model weights exist under "$1".
selector_weights_present() {
    [ -f "$1/pasa-7b-selector/pytorch_model.bin" ] \
        || [ -f "$1/pasa-7b-selector/model.safetensors" ]
}

if [ "$MODE" != "api_mode" ]; then
    log_info "Setting up AI models for Agent Search..."

    MODEL_PATH=$(resolve_model_path)
    if [ "$MODEL_PATH" != "checkpoints" ]; then
        log_info "Using MODEL_PATH from .env: $MODEL_PATH"
    fi

    if [ -f "download_models.py" ]; then
        if [ ! -d "$MODEL_PATH" ]; then
            log_info "Model directory '$MODEL_PATH' not found. Downloading and configuring models..."
            python download_models.py || {
                log_warn "Model setup encountered issues, but continuing with basic search capabilities."
            }
        else
            log_info "Model directory '$MODEL_PATH' exists."
            # Check if model files actually exist
            MODEL_EXISTS=false
            if crawler_weights_present "$MODEL_PATH" && selector_weights_present "$MODEL_PATH"; then
                MODEL_EXISTS=true
            fi

            if [ "$MODEL_EXISTS" = false ]; then
                log_warn "Model files not found in '$MODEL_PATH'. Downloading models..."
                python download_models.py || {
                    log_warn "Model download failed, but continuing with basic search capabilities."
                }
            else
                # Weights are present; re-run setup only if the agent-search
                # config predates the fallback_enabled option.
                if [ -f "agent-search/config.json" ] && ! grep -q "fallback_enabled" "agent-search/config.json"; then
                    log_info "Updating model configuration..."
                    python download_models.py || {
                        log_warn "Model configuration update failed, but continuing."
                    }
                fi
            fi
        fi
    else
        log_warn "download_models.py not found. AI Agent Search will use basic mode only."
    fi
else
    log_info "Skipping AI model setup (api_mode - will use external API)"
fi
# Display model status (skip for api_mode)
if [ "$MODE" != "api_mode" ]; then
    # Resolve MODEL_PATH the same way as the setup step above.
    MODEL_PATH="checkpoints" # Default value
    if [ -f "$SCRIPT_DIR/.env" ]; then
        # -f2- keeps MODEL_PATH values that themselves contain '=' intact
        # (plain -f2 would truncate at the first '=').
        ENV_MODEL_PATH=$(grep -E "^MODEL_PATH=" "$SCRIPT_DIR/.env" | cut -d'=' -f2- | tr -d "'" | tr -d '"')
        if [ -n "$ENV_MODEL_PATH" ]; then
            MODEL_PATH="$ENV_MODEL_PATH"
        fi
    fi

    if [ -d "$MODEL_PATH/pasa-7b-crawler" ] && [ -d "$MODEL_PATH/pasa-7b-selector" ]; then
        log_info "Model directories found in '$MODEL_PATH':"
        # Crawler weights may be a single .bin/.safetensors or sharded files
        if [ -f "$MODEL_PATH/pasa-7b-crawler/pytorch_model.bin" ] || [ -f "$MODEL_PATH/pasa-7b-crawler/model.safetensors" ] || ls "$MODEL_PATH/pasa-7b-crawler"/model-*.safetensors >/dev/null 2>&1; then
            log_info " ✓ PASA-7B Crawler model found"
        else
            log_warn " ✗ PASA-7B Crawler model not found (using mock configuration)"
        fi
        if [ -f "$MODEL_PATH/pasa-7b-selector/pytorch_model.bin" ] || [ -f "$MODEL_PATH/pasa-7b-selector/model.safetensors" ]; then
            log_info " ✓ PASA-7B Selector model found"
        else
            log_warn " ✗ PASA-7B Selector model not found (using mock configuration)"
        fi
    else
        log_warn "Model directories not found in '$MODEL_PATH'"
    fi
fi
# Pre-build embeddings if needed (for local mode and api_mode)
if [ "$MODE" != "model_only" ]; then
    # Cached artifacts produced by build_embeddings.py
    EMBEDDINGS_DIR="embeddings"
    DATASETS_EMBEDDINGS="$EMBEDDINGS_DIR/datasets_embeddings.pkl"
    DATASETS_FAISS="$EMBEDDINGS_DIR/datasets_embeddings.faiss"

    # Happy path first: both cache files present means nothing to build
    if [ -f "$DATASETS_EMBEDDINGS" ] && [ -f "$DATASETS_FAISS" ]; then
        log_info "Pre-built embeddings found. Using cached embeddings for faster startup."
    else
        log_info "Building semantic search embeddings (this only needs to be done once)..."
        log_info "This may take a few minutes on first run..."

        if [ -f "build_embeddings.py" ]; then
            python build_embeddings.py --datasets || {
                log_warn "Failed to build embeddings. Semantic search may be slower."
            }
        else
            log_warn "build_embeddings.py not found. Embeddings will be built on-demand (may use more memory)."
        fi
    fi
fi
# Get backend port from environment or use default
BACKEND_PORT="${BACKEND_PORT:-8000}"

# Check if another instance is already running. lsof is queried once and the
# PID list reused, instead of the previous check-then-query pattern that ran
# lsof twice (a race window) with an unquoted command substitution.
EXISTING_PIDS=$(lsof -Pi :"$BACKEND_PORT" -sTCP:LISTEN -t 2>/dev/null || true)
if [ -n "$EXISTING_PIDS" ]; then
    log_warn "Port $BACKEND_PORT is already in use. Stopping existing process..."
    # shellcheck disable=SC2086 -- the PID list is intentionally word-split
    kill $EXISTING_PIDS 2>/dev/null || true
    sleep 2
fi

# Start the API server
log_info "Starting API server on http://localhost:$BACKEND_PORT"
log_info "Press Ctrl+C to stop the server"
echo
echo "=== API Server is starting ==="
echo "API Documentation: http://localhost:$BACKEND_PORT/docs"
echo "Mode: $MODE"
echo "Available Features:"

# Database features
if [ "$MODE" != "model_only" ]; then
    echo " • Basic SQL/JSON search: ✓ Enabled"
    echo " • Standard filtering: ✓ Enabled"
    echo " • Full-text search: ✓ Enabled"
else
    echo " • Database features: ✗ Disabled (model_only mode)"
fi

# AI capabilities
if [ "$MODE" = "api_mode" ]; then
    echo " • AI search: ✓ External API mode"
elif [ "$MODE" != "model_only" ]; then
    # Check for AI capabilities in local/combined mode
    if python -c "import sentence_transformers" 2>/dev/null; then
        echo " • Semantic search: ✓ Enabled"
    else
        echo " • Semantic search: ✗ Disabled (install sentence-transformers)"
    fi

    # Resolve MODEL_PATH for the final status check (same logic as the setup
    # steps earlier; -f2- keeps values containing '=' intact)
    MODEL_PATH="checkpoints" # Default value
    if [ -f "$SCRIPT_DIR/.env" ]; then
        ENV_MODEL_PATH=$(grep -E "^MODEL_PATH=" "$SCRIPT_DIR/.env" | cut -d'=' -f2- | tr -d "'" | tr -d '"')
        if [ -n "$ENV_MODEL_PATH" ]; then
            MODEL_PATH="$ENV_MODEL_PATH"
        fi
    fi

    if [ -f "$MODEL_PATH/pasa-7b-crawler/config.json" ] && [ -f "$MODEL_PATH/pasa-7b-selector/config.json" ]; then
        if [ -f "$MODEL_PATH/pasa-7b-crawler/pytorch_model.bin" ] || [ -f "$MODEL_PATH/pasa-7b-crawler/model.safetensors" ] || ls "$MODEL_PATH/pasa-7b-crawler"/model-*.safetensors >/dev/null 2>&1; then
            echo " • AI-powered query expansion: ✓ Enabled"
            echo " • Multi-layer paper expansion: ✓ Enabled"
        else
            echo " • AI-powered search: ✗ Using fallback mode"
        fi
    else
        echo " • AI-powered search: ✗ Not configured"
    fi
else
    # model_only mode
    if python -c "import sentence_transformers" 2>/dev/null; then
        echo " • AI model services: ✓ Available"
    else
        echo " • AI model services: ✗ Missing dependencies"
    fi
fi
echo

# Run the API server with mode configuration
export YA_DEPLOYMENT_MODE="$MODE"
python api_server.py