# StackWise Configuration - Standard Tokenizer Version
# Hierarchical configuration with validation and defaults
# Compatible with standard tokenizers (GPT-2, BERT, etc.)

model:
  # Model dimensions - Standard sizes for compatibility
  vocab_size: 50257  # GPT-2 vocabulary size
  d_model: 768
  n_heads: 12
  n_kv_heads: 12
  d_ff: 3072

  # Architecture configuration
  architecture:
    n_stacks: 2          # Number of stacks (groups of blocks)
    blocks_per_stack: 6  # Number of blocks per stack (12 total blocks)

  # Attention configuration
  attention_preset: "gpt_style"  # bert_style | gpt_style | efficient_gqa | mla_attention | kernel_attention | mlgka | custom

  # Custom attention configuration (only used when attention_preset: "custom")
  attention_custom:
    attention_type: "mha"     # mha | mla (GQA is determined by n_kv_heads)
    attention_mode: "causal"  # bidirectional | causal
    # MLA-specific parameters
    mla_rq: 64
    mla_rkv: 32
    # Kernel attention parameters
    kernel_type: "linear"  # linear | gaussian | laplacian | uniform
    kernel_dim: 32
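
  # Illustrative sketch only (not taken from this repo's docs): to run MLA through
  # the custom preset, one would presumably switch the preset and attention type,
  # reusing the keys above, e.g.:
  #   attention_preset: "custom"
  #   attention_custom:
  #     attention_type: "mla"
  #     attention_mode: "causal"
  #     mla_rq: 64
  #     mla_rkv: 32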

  # Normalization and MLP
  dropout: 0.1
  tie_embeddings: true
  freeze_up_proj: false

  # Positional encoding
  use_rope: true
  rope_theta: 10000.0

  # Mask-diffusion parameters
  mask_fraction_min: 0.15
  mask_fraction_max: 0.90
  special_mask_id: 4

# Tokenizer and embedding configuration
tokenizer_embedding:
  family: "gpt2"
  embedding_option: "embed_tokens"
  freeze_embeddings: false
  adapter_hidden_dim: null

training:
  # Training parameters
  batch_size: 8
  seq_len: 512
  max_steps: 100

  # Optimizer configuration
  optimizer:
    optimizer_type: "AdamW"  # AdamW | SGD | custom
    lr: 5.0e-5
    weight_decay: 0.01
    betas: [0.9, 0.95]
    eps: 1.0e-8
    # SGD-specific (ignored for AdamW)
    momentum: 0.9
    dampening: 0.0
    nesterov: false
    # Custom parameters for advanced users
    custom_params: {}
    # Grouped parameters (optional; see the commented sketch below)
    use_groups: false
    groups: []
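
    # Illustrative sketch only: the entry schema for "groups" is an assumption,
    # not documented in this file. If use_groups were set to true, each group
    # might pair a parameter-name pattern with its own hyperparameters, e.g.:
    # groups:
    #   - name: "embeddings"    # hypothetical fields
    #     pattern: "embed"
    #     lr: 1.0e-5
    #     weight_decay: 0.0
    #   - name: "attention"
    #     pattern: "attn"
    #     lr: 5.0e-5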

  # Device and memory
  device: "cuda"
  gradient_checkpointing: false

  # Caching configuration
  cache_dir: "./cache"
  cache_mode: "stack"  # stack | rack

  # Saving configuration
  save_stacks: true  # Always save individual stacks (default enabled)
  save_rack: false   # Optionally save the entire rack (default disabled)

  # Mask-diffusion training
  min_mask_fraction: 0.15
  max_mask_fraction: 0.90
  mask_schedule_type: "linear"  # linear | exponential | cosine
  mask_token_id: null           # Will be set from the tokenizer
  epochs_per_stack: 1

  # Training strategy: HOW to train
  strategy: "progressive"  # "progressive" | "end_to_end"
  # progressive: Build and train stacks one by one
  # end_to_end: Train the entire model at once

  # End-to-end training scope: WHAT to train (only used when strategy: "end_to_end")
  end_to_end_scope: "stackwise"  # "stackwise" | "rackwise"
  # stackwise: Train each stack independently by caching previous-stack activations
  # rackwise: Train the entire rack together (see the commented example below)
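
  # Illustrative only: switching to full end-to-end training of the whole rack
  # would presumably reuse the two keys above with different values:
  #   strategy: "end_to_end"
  #   end_to_end_scope: "rackwise"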

  # Progressive training configuration
  progressive:
    enabled: true
    max_stacks: 12                # Maximum number of stacks
    target_stacks: 6              # Number of stacks to build progressively
    building_mode: "append"       # "append" | "prepend"
    trunk_strategy: "frozen"      # "frozen" | "qlora"
    new_stack_precision: "full"   # "full" | "nf_fp8" | "fp16"
    cache_activations: true       # Cache activations for trunk training
    time_interpretation: "depth"  # "depth" | "input"
    training_objective: "clm"     # "mlm" | "clm" | "custom"
    # Note: QLoRA configuration lives in the qlora section below

  # Fine-tuning
  fine_tune_mode: "clm"  # clm | mlm | diffusion

  # Run identification and organization
  run_id: "standard_run"  # Unique identifier for this training run

  # QLoRA and quantization settings
  qlora:
    enabled: true
    rank: 16      # Rank of the low-rank adaptation matrices
    alpha: 32     # Scaling factor (typically 2x rank)
    dropout: 0.1  # Dropout rate for QLoRA adapters
    lr: 1.0e-5    # Learning rate for QLoRA parameters
    # Progressive QLoRA settings
    progressive_enabled: false
    progressive_rank: 8    # Base rank for progressive strategies
    progressive_alpha: 16  # Base alpha for progressive strategies
    # QLoRA strategy and patterns
    strategy: "simplified"  # simplified | progressive | variable
    # simplified: Same QLoRA config for all stacks
    # progressive: Gradually change QLoRA parameters across stacks
    # variable: Custom QLoRA config per stack (defined in the configs dict; see the sketch below)
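    # Illustrative sketch only: the per-stack "configs" layout below is an
    # assumption about how strategy: "variable" might be expressed, not a
    # schema documented in this file.
    # configs:
    #   stack_0: {rank: 8,  alpha: 16, dropout: 0.05}
    #   stack_1: {rank: 16, alpha: 32, dropout: 0.10}
    #   stack_2: {rank: 32, alpha: 64, dropout: 0.10}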

    # Mixed-precision training (auto-enabled with QLoRA)
    mixed_precision: true  # Frozen trunk in fp16 + adapters in fp32
    rank_pattern: "constant"  # constant | increasing | decreasing
    # constant: Same rank for all stacks
    # increasing: Rank increases with stack depth (early: low rank, later: high rank)
    # decreasing: Rank decreases with stack depth (early: high rank, later: low rank)
    alpha_pattern: "constant"  # constant | increasing | decreasing
    # constant: Same alpha for all stacks
    # increasing: Alpha increases with stack depth
    # decreasing: Alpha decreases with stack depth
    current_block_lr: 5.0e-5
    quantization_enabled: true
    quantization_type: "fp16"  # fp4 | fp8 | fp16 | fp32

  # Time-step-based masking
  time_step_masking: true  # Enable time-step-based masking
  num_time_steps: 6        # Number of discrete time steps
  time_step_mask_fractions: [0.15, 0.25, 0.35, 0.45, 0.55, 0.65]  # One fraction per time step

  # Logging and checkpointing
  log_interval: 10
  save_interval: 100
  checkpoint_dir: "./checkpoints"

data:
  # Dataset parameters
  dataset_path: null
  use_dummy_data: true
  num_samples: 1000

  # Data preprocessing
  tokenizer_path: null
  max_length: 512
  padding: "right"

  # Data loading
  num_workers: 0
  pin_memory: true
  shuffle: true
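
  # Illustrative only: the path below is a placeholder, not a file shipped with
  # this repo. To train on a real dataset one would presumably set, e.g.:
  #   dataset_path: "./data/train.txt"
  #   use_dummy_data: false
  #   tokenizer_path: null  # keep null to use the gpt2 tokenizer family above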