Skip to content

Commit 78fbaf2

Browse files
committed
updates
1 parent db8bd9a commit 78fbaf2

File tree

13 files changed

+947
-115
lines changed

13 files changed

+947
-115
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ var/
4949
*.egg
5050

5151
# Data and models
52+
data/
5253
data/raw/*
5354
data/processed/*
5455
data/models/*

configs/student/real_1k.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
model:
2+
model_name: "prajjwal1/bert-tiny" # or your own TinyBERT
3+
num_labels: 2
4+
use_fast_tokenizer: true
5+
6+
data:
7+
dataset_path: "data/real_1k"
8+
max_len: 128
9+
10+
training:
11+
# ── bookkeeping ────────────────────────────────────────────────────────────
12+
output_dir: "runs/student/real_1k"
13+
overwrite_output_dir: true
14+
run_name: "student_real_1k"
15+
16+
report_to: "wandb"
17+
wandb_project: "senti_synth_student"
18+
19+
alpha: 0.5 # hard‑vs‑soft mix
20+
temperature: 2.0
21+
22+
# ── batch size & epochs ────────────────────────────────────────────────────
23+
per_device_train_batch_size: 16 # fits comfortably on 24 GB VRAM
24+
per_device_eval_batch_size: 16
25+
gradient_accumulation_steps: 1
26+
num_train_epochs: 50 # high cap; early stopping below ends training much sooner
27+
28+
# ── precision & speed ──────────────────────────────────────────────────────
29+
fp16: false # disabled in favour of bf16 below
30+
bf16: true # bf16 mixed precision; keep fp16 off to avoid mixing precision modes
31+
# torch_dtype: "auto" # (optional) lets HF pick fastest dtype
32+
33+
# ── optimiser & scheduler ─────────────────────────────────────────────────
34+
learning_rate: 0.00003 # 3e-5, a common fine-tuning LR for small BERT-style models
35+
warmup_ratio: 0.1
36+
37+
# ── misc performance knobs ────────────────────────────────────────────────
38+
dataloader_num_workers: 8
39+
gradient_checkpointing: true # trades compute for memory; minor benefit for a tiny encoder
40+
max_grad_norm: 1.0
41+
42+
# ── logging, saving, early stop ───────────────────────────────────────────
43+
logging_steps: 20
44+
eval_steps: 100
45+
save_steps: 100
46+
save_total_limit: 3
47+
load_best_model_at_end: true
48+
metric_for_best_model: "eval_f1"
49+
greater_is_better: true
50+
51+
use_early_stopping: true
52+
early_stopping_patience: 2
53+
early_stopping_threshold: 0.0005
54+
55+
do_test_eval: true

configs/student/student_mix.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
model:
2+
model_name: "prajjwal1/bert-tiny"
3+
num_labels: 2
4+
5+
data:
6+
dataset_path: "data/mix_20k_real_1k_beta_1.0"
7+
max_len: 128
8+
9+
training:
10+
# ── bookkeeping ────────────────────────────────────────────────────────────
11+
output_dir: "runs/student/mix_20k_real_1k_alpha0.5_x_beta1.0"
12+
overwrite_output_dir: true
13+
run_name: "student_mix_20k_real_1k_alpha_0.5_beta_1.0_temp_2.0"
14+
15+
report_to: "wandb"
16+
wandb_project: "senti_synth_student_ablation_alpha_x_beta"
17+
18+
alpha: 0.5 # hard‑vs‑soft mix
19+
temperature: 2.0
20+
21+
# ── batch size & epochs ────────────────────────────────────────────────────
22+
per_device_train_batch_size: 32 # fits comfortably on 24 GB VRAM
23+
per_device_eval_batch_size: 64
24+
gradient_accumulation_steps: 1
25+
num_train_epochs: 10 # upper bound; early stopping below usually ends training earlier
26+
27+
# ── precision & speed ──────────────────────────────────────────────────────
28+
fp16: false # disabled in favour of bf16 below
29+
bf16: true # bf16 mixed precision; keep fp16 off to avoid mixing precision modes
30+
# torch_dtype: "auto" # (optional) lets HF pick fastest dtype
31+
32+
# ── optimiser & scheduler ─────────────────────────────────────────────────
33+
learning_rate: 0.00003 # 3e-5, a common fine-tuning LR for small BERT-style models
34+
warmup_ratio: 0.1
35+
36+
# ── misc performance knobs ────────────────────────────────────────────────
37+
dataloader_num_workers: 16
38+
gradient_checkpointing: true # trades compute for memory; minor benefit for a tiny encoder
39+
max_grad_norm: 1.0
40+
41+
# ── logging, saving, early stop ───────────────────────────────────────────
42+
logging_steps: 100
43+
eval_steps: 500
44+
save_steps: 500
45+
save_total_limit: 3
46+
load_best_model_at_end: true
47+
metric_for_best_model: "eval_f1"
48+
greater_is_better: true
49+
50+
use_early_stopping: true
51+
early_stopping_patience: 2
52+
early_stopping_threshold: 0.005
53+
54+
do_test_eval: true

configs/synthetic/conf.yaml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,24 @@ model:
44

55
teacher:
66
ckpt_dir: "runs/teacher/deberta_v3_base/"
7-
min_confidence: 0.85 # accept sample only if score ≥ this value
7+
min_confidence: 0.9 # accept sample only if score ≥ this value
8+
beta: 0.5 # ↓ weight for synthetic rows
9+
temperature: 2.0 # ↓ soften teacher logits
810

911
data:
10-
output_dir: "data/synthetic_sst2"
12+
output_dir: "data/synthetic_sst2_beta_0.5"
1113

1214
# Target number of *accepted* samples (total across classes)
1315
n_samples_total: 20000
1416

1517
# Fractions must sum to 1.0
18+
1619
split_ratio:
1720
train: 0.9
1821
val: 0.05
1922
test: 0.05
2023

2124
generation:
22-
batch_size: 128 # try 128 if max_new_tokens stays at 64
23-
temperature: 0.8 # a tad warmer improves diversity, less rejects
25+
batch_size: 256 # raised from 128; fits while max_new_tokens stays at 64
26+
temperature: 0.7 # lowered from 0.8 for cleaner samples and fewer rejections
2427
num_return_sequences: 2 # doubles raw throughput with the same kernels

configs/teacher/sst2_hf.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ training:
1414
# ---------- bookkeeping ----------
1515
output_dir: "runs/teacher/deberta_v3_base"
1616
overwrite_output_dir: true
17-
run_name: "teacher_sst2_deberta_v3_base_h100"
17+
run_name: "teacher_sst2_deberta_v3_base_lr_0.00005"
1818

1919
report_to: "wandb"
2020
wandb_project: "senti_synth_teacher"
@@ -31,7 +31,7 @@ training:
3131
# If you prefer automatic selection, drop bf16/fp16 and add `torch_dtype: "auto"`
3232

3333
# ---------- optimiser & scheduler ----------
34-
learning_rate: 0.0001 # linear‑scale LR (16→64 batch ⇒ ×4 LR)
34+
learning_rate: 0.00005 # halved from 0.0001; the linear-scaled LR (16→64 batch ⇒ ×4) appears too aggressive
3535
warmup_ratio: 0.05 # keep warm‑up tokens roughly constant after batch change
3636

3737
# ---------- misc performance knobs ----------

0 commit comments

Comments
 (0)