-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathMakefile
More file actions
64 lines (52 loc) · 1.94 KB
/
Makefile
File metadata and controls
64 lines (52 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# DadAI Data Pipeline
# Run from project root: make all
#
# Prerequisites:
# - Activate venv: source .venv/bin/activate
# - Set up .env with Reddit API credentials

# `all` is already the first real target; state the default goal explicitly
# so reordering rules later cannot silently change what bare `make` does.
.DEFAULT_GOAL := all

# If a recipe fails partway through, delete its (possibly truncated) target
# file so a corrupt output can never look "up to date" on the next run.
.DELETE_ON_ERROR:

# Every target in this file is a command, not a file it produces; declare
# them phony so a stray file with the same name can't shadow a step.
.PHONY: all collect format clean check sample prepare train test chat help
# Full pipeline: collect → format → clean → validate.
# The steps are inherently sequential (each script consumes the previous
# step's output), but plain prerequisites run in arbitrary order under
# `make -j`. Running each step via $(MAKE) on its own recipe line forces
# the order regardless of parallelism flags; $(MAKE) (not bare `make`)
# propagates the jobserver and -n correctly.
all:
	$(MAKE) collect
	$(MAKE) format
	$(MAKE) clean
	$(MAKE) check
# Step 1: Collect posts from Reddit.
# Requires the Reddit API credentials from .env (see header notes).
collect:
	python scripts/collect_reddit_data.py

# Step 2: Format into Mistral [INST] prompt/completion pairs.
# NOTE(review): declares no prerequisite — presumably expects step 1's
# output files to already exist; run `make collect` (or `make all`) first.
format:
	python scripts/format_reddit_data.py

# Step 3: Clean and filter low-quality entries.
# N.B. despite the conventional Make name, `clean` scrubs the *dataset*;
# it does not remove build artifacts.
clean:
	python scripts/clean_dataset.py

# Step 4: Validate the cleaned dataset's format.
check:
	python scripts/check_dataset_format.py
# Show random samples for manual quality inspection
# (a QA aid; not part of the `all` pipeline).
sample:
	python scripts/show_random_sample.py

# Step 5: Prepare data for MLX training (split into train/valid/test).
# NOTE(review): presumably reads the cleaned dataset from steps 1-4 —
# run `make all` first; this rule declares no prerequisite on it.
prepare:
	python scripts/prepare_training_data.py
# Step 6: Run LoRA fine-tuning.
# Depends on `prepare` so the train/valid/test split exists before
# mlx_lm.lora reads it; hyperparameters live in training_config.yaml.
train: prepare
	mlx_lm.lora --config training_config.yaml

# Step 7: Evaluate on the held-out test set (25 batches).
# NOTE(review): looks like this needs the adapters produced by `make train`
# at adapters/dadai-lora — confirm before running standalone.
test:
	mlx_lm.lora --model models/mistral-7b-instruct-v0.3-4bit --adapter-path adapters/dadai-lora --data data/mlx_training --test --test-batches 25
# Interactive chat with the fine-tuned model (wraps scripts/inference.py).
chat:
	python scripts/inference.py
# Show a summary of available targets.
# Hand-maintained listing: keep it in sync with .PHONY when targets change.
# Fix: the listing omitted `make help` itself; added as the last entry.
help:
	@echo "DadAI Data Pipeline"
	@echo ""
	@echo " make all - Run full pipeline (collect → format → clean → check)"
	@echo " make collect - Step 1: Collect Reddit posts"
	@echo " make format - Step 2: Format into training pairs"
	@echo " make clean - Step 3: Clean and filter"
	@echo " make check - Step 4: Validate dataset"
	@echo " make sample - Show random samples"
	@echo ""
	@echo "Training:"
	@echo " make prepare - Step 5: Prepare data for MLX training"
	@echo " make train - Step 6: Run LoRA fine-tuning (includes prepare)"
	@echo " make test - Step 7: Evaluate model on test set"
	@echo " make chat - Interactive chat with fine-tuned model"
	@echo " make help - Show this help"