Skip to content

Commit 78fbaf2

Browse files
committed
updates
1 parent db8bd9a commit 78fbaf2

File tree

13 files changed

+947
-115
lines changed

13 files changed

+947
-115
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ var/
4949
*.egg
5050

5151
# Data and models
52+
data/
5253
data/raw/*
5354
data/processed/*
5455
data/models/*

configs/student/real_1k.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
model:
2+
model_name: "prajjwal1/bert-tiny" # or your own TinyBERT
3+
num_labels: 2
4+
use_fast_tokenizer: true
5+
6+
data:
7+
dataset_path: "data/real_1k"
8+
max_len: 128
9+
10+
training:
11+
# ── bookkeeping ────────────────────────────────────────────────────────────
12+
output_dir: "runs/student/real_1k"
13+
overwrite_output_dir: true
14+
run_name: "student_real_1k"
15+
16+
report_to: "wandb"
17+
wandb_project: "senti_synth_student"
18+
19+
alpha: 0.5 # hard‑vs‑soft mix
20+
temperature: 2.0
21+
22+
# ── batch size & epochs ────────────────────────────────────────────────────
23+
per_device_train_batch_size: 16 # fits comfortably on 24 GB VRAM
24+
per_device_eval_batch_size: 16
25+
gradient_accumulation_steps: 1
26+
num_train_epochs: 50 # high cap; early stopping below ends training much sooner
27+
28+
# ── precision & speed ──────────────────────────────────────────────────────
29+
fp16: false # disabled in favour of bf16 below
30+
bf16: true # bf16 mixed precision; keep fp16 off to avoid mixing precision modes
31+
# torch_dtype: "auto" # (optional) lets HF pick fastest dtype
32+
33+
# ── optimiser & scheduler ─────────────────────────────────────────────────
34+
learning_rate: 0.00003 # 3e-5, a common fine-tuning LR for small BERT-style models
35+
warmup_ratio: 0.1
36+
37+
# ── misc performance knobs ────────────────────────────────────────────────
38+
dataloader_num_workers: 8
39+
gradient_checkpointing: true # trades compute for memory; minor benefit for a tiny encoder
40+
max_grad_norm: 1.0
41+
42+
# ── logging, saving, early stop ───────────────────────────────────────────
43+
logging_steps: 20
44+
eval_steps: 100
45+
save_steps: 100
46+
save_total_limit: 3
47+
load_best_model_at_end: true
48+
metric_for_best_model: "eval_f1"
49+
greater_is_better: true
50+
51+
use_early_stopping: true
52+
early_stopping_patience: 2
53+
early_stopping_threshold: 0.0005
54+
55+
do_test_eval: true

configs/student/student_mix.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
model:
2+
model_name: "prajjwal1/bert-tiny"
3+
num_labels: 2
4+
5+
data:
6+
dataset_path: "data/mix_20k_real_1k_beta_1.0"
7+
max_len: 128
8+
9+
training:
10+
# ── bookkeeping ────────────────────────────────────────────────────────────
11+
output_dir: "runs/student/mix_20k_real_1k_alpha0.5_x_beta1.0"
12+
overwrite_output_dir: true
13+
run_name: "student_mix_20k_real_1k_alpha_0.5_beta_1.0_temp_2.0"
14+
15+
report_to: "wandb"
16+
wandb_project: "senti_synth_student_ablation_alpha_x_beta"
17+
18+
alpha: 0.5 # hard‑vs‑soft mix
19+
temperature: 2.0
20+
21+
# ── batch size & epochs ────────────────────────────────────────────────────
22+
per_device_train_batch_size: 32 # fits comfortably on 24 GB VRAM
23+
per_device_eval_batch_size: 64
24+
gradient_accumulation_steps: 1
25+
num_train_epochs: 10 # upper bound; early stopping below usually ends training earlier
26+
27+
# ── precision & speed ──────────────────────────────────────────────────────
28+
fp16: false # disabled in favour of bf16 below
29+
bf16: true # bf16 mixed precision; keep fp16 off to avoid mixing precision modes
30+
# torch_dtype: "auto" # (optional) lets HF pick fastest dtype
31+
32+
# ── optimiser & scheduler ─────────────────────────────────────────────────
33+
learning_rate: 0.00003 # 3e-5, a common fine-tuning LR for small BERT-style models
34+
warmup_ratio: 0.1
35+
36+
# ── misc performance knobs ────────────────────────────────────────────────
37+
dataloader_num_workers: 16
38+
gradient_checkpointing: true # trades compute for memory; minor benefit for a tiny encoder
39+
max_grad_norm: 1.0
40+
41+
# ── logging, saving, early stop ───────────────────────────────────────────
42+
logging_steps: 100
43+
eval_steps: 500
44+
save_steps: 500
45+
save_total_limit: 3
46+
load_best_model_at_end: true
47+
metric_for_best_model: "eval_f1"
48+
greater_is_better: true
49+
50+
use_early_stopping: true
51+
early_stopping_patience: 2
52+
early_stopping_threshold: 0.005
53+
54+
do_test_eval: true

configs/synthetic/conf.yaml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,24 @@ model:
44

55
teacher:
66
ckpt_dir: "runs/teacher/deberta_v3_base/"
7-
min_confidence: 0.85 # accept sample only if score ≥ this value
7+
min_confidence: 0.9 # accept sample only if score ≥ this value
8+
beta: 0.5 # ↓ weight for synthetic rows
9+
temperature: 2.0 # ↓ soften teacher logits
810

911
data:
10-
output_dir: "data/synthetic_sst2"
12+
output_dir: "data/synthetic_sst2_beta_0.5"
1113

1214
# Target number of *accepted* samples (total across classes)
1315
n_samples_total: 20000
1416

1517
# Fractions must sum to 1.0
18+
1619
split_ratio:
1720
train: 0.9
1821
val: 0.05
1922
test: 0.05
2023

2124
generation:
22-
batch_size: 128 # try 128 if max_new_tokens stays at 64
23-
temperature: 0.8 # a tad warmer improves diversity, less rejects
25+
batch_size: 256 # raised from 128; fits while max_new_tokens stays at 64
26+
temperature: 0.7 # lowered from 0.8 for cleaner samples and fewer rejections
2427
num_return_sequences: 2 # doubles raw throughput with the same kernels

configs/teacher/sst2_hf.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ training:
1414
# ---------- bookkeeping ----------
1515
output_dir: "runs/teacher/deberta_v3_base"
1616
overwrite_output_dir: true
17-
run_name: "teacher_sst2_deberta_v3_base_h100"
17+
run_name: "teacher_sst2_deberta_v3_base_lr_0.00005"
1818

1919
report_to: "wandb"
2020
wandb_project: "senti_synth_teacher"
@@ -31,7 +31,7 @@ training:
3131
# If you prefer automatic selection, drop bf16/fp16 and add `torch_dtype: "auto"`
3232

3333
# ---------- optimiser & scheduler ----------
34-
learning_rate: 0.0001 # linear‑scale LR (16→64 batch ⇒ ×4 LR)
34+
learning_rate: 0.00005 # halved from 0.0001; the linear-scaled LR (16→64 batch ⇒ ×4) appears too aggressive
3535
warmup_ratio: 0.05 # keep warm‑up tokens roughly constant after batch change
3636

3737
# ---------- misc performance knobs ----------

0 commit comments

Comments
 (0)