File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -20,11 +20,11 @@ vocab_size: 33
2020expansion_ratio : 2.6667 # 8/3
2121soft_logit_cap : 32.0
2222attention_soft_cap : 64.0
23- add_att_soft_cap : true
23+ add_att_soft_cap : false
2424p_attention : false
2525tie_embeddings : false
2626unet : true
27- token_dropout : true
27+ token_dropout : false
2828
2929# Data Configuration
3030input_bin : " data/omgprot50/omgprot50_train_*.bin"
@@ -38,7 +38,7 @@ mask_rate_steps: 2500
3838
3939# Training Hyperparameters
4040batch_size : 524288 # 8*64*1024 tokens
41- grad_accum : 1
41+ grad_accum : 8
4242num_steps : 50000
4343cooldown_steps : 5000
4444max_length : 1024
@@ -52,7 +52,7 @@ lr_embed: 0.06
5252lr_head : 0.008
5353lr_scalar : 0.04
5454grad_clip : 0.0
55- auto_grad_clip : true
55+ auto_grad_clip : false
5656auto_grad_clip_percentile : 10
5757
5858# Muon optimizer (for hidden layers)
You can’t perform that action at this time.
0 commit comments