# StackWise Configuration - Standard Tokenizer Version
# Hierarchical configuration with validation and defaults
# Compatible with standard tokenizers (GPT-2, BERT, etc.)

model:
  # Model dimensions - Standard sizes for compatibility
  vocab_size: 50257  # GPT-2 vocabulary size
  d_model: 768
  n_heads: 12
  n_kv_heads: 12
  d_ff: 3072

  # Architecture configuration
  architecture:
    n_stacks: 2          # Number of stacks (groups of blocks)
    blocks_per_stack: 6  # Number of blocks per stack (12 total blocks)

  # Attention configuration
  attention_preset: "gpt_style"  # bert_style | gpt_style | efficient_gqa | mla_attention | kernel_attention | mlgka | custom

  # Custom attention configuration (only used when attention_preset: "custom")
  attention_custom:
    attention_type: "mha"     # mha | mla (GQA is determined by n_kv_heads)
    attention_mode: "causal"  # bidirectional | causal
    # MLA-specific parameters
    mla_rq: 64
    mla_rkv: 32
    # Kernel attention parameters
    kernel_type: "linear"  # linear | gaussian | laplacian | uniform
    kernel_dim: 32
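
  # Illustrative sketch only (not taken from this repo's docs): to run MLA through
  # the custom preset, one would presumably switch the preset and attention type,
  # reusing the keys above, e.g.:
  #   attention_preset: "custom"
  #   attention_custom:
  #     attention_type: "mla"
  #     attention_mode: "causal"
  #     mla_rq: 64
  #     mla_rkv: 32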

  # Normalization and MLP
  dropout: 0.1
  tie_embeddings: true
  freeze_up_proj: false

  # Positional encoding
  use_rope: true
  rope_theta: 10000.0

  # Mask-diffusion parameters
  mask_fraction_min: 0.15
  mask_fraction_max: 0.90
  special_mask_id: 4

# Tokenizer and embedding configuration
tokenizer_embedding:
  family: "gpt2"
  embedding_option: "embed_tokens"
  freeze_embeddings: false
  adapter_hidden_dim: null

training:
  # Training parameters
  batch_size: 8
  seq_len: 512
  max_steps: 100

  # Optimizer configuration
  optimizer:
    optimizer_type: "AdamW"  # AdamW | SGD | custom
    lr: 5.0e-5
    weight_decay: 0.01
    betas: [0.9, 0.95]
    eps: 1.0e-8
    # SGD-specific (ignored for AdamW)
    momentum: 0.9
    dampening: 0.0
    nesterov: false
    # Custom parameters for advanced users
    custom_params: {}
    # Grouped parameters (optional; see the commented sketch below)
    use_groups: false
    groups: []
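
    # Illustrative sketch only: the entry schema for "groups" is an assumption,
    # not documented in this file. If use_groups were set to true, each group
    # might pair a parameter-name pattern with its own hyperparameters, e.g.:
    # groups:
    #   - name: "embeddings"    # hypothetical fields
    #     pattern: "embed"
    #     lr: 1.0e-5
    #     weight_decay: 0.0
    #   - name: "attention"
    #     pattern: "attn"
    #     lr: 5.0e-5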

  # Device and memory
  device: "cuda"
  gradient_checkpointing: false

  # Caching configuration
  cache_dir: "./cache"
  cache_mode: "stack"  # stack | rack

  # Saving configuration
  save_stacks: true  # Always save individual stacks (default enabled)
  save_rack: false   # Optionally save the entire rack (default disabled)

  # Mask-diffusion training
  min_mask_fraction: 0.15
  max_mask_fraction: 0.90
  mask_schedule_type: "linear"  # linear | exponential | cosine
  mask_token_id: null           # Will be set from the tokenizer
  epochs_per_stack: 1

  # Training strategy: HOW to train
  strategy: "progressive"  # "progressive" | "end_to_end"
  # progressive: Build and train stacks one by one
  # end_to_end: Train the entire model at once

  # End-to-end training scope: WHAT to train (only used when strategy: "end_to_end")
  end_to_end_scope: "stackwise"  # "stackwise" | "rackwise"
  # stackwise: Train each stack independently by caching previous-stack activations
  # rackwise: Train the entire rack together (see the commented example below)
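
  # Illustrative only: switching to full end-to-end training of the whole rack
  # would presumably reuse the two keys above with different values:
  #   strategy: "end_to_end"
  #   end_to_end_scope: "rackwise"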

  # Progressive training configuration
  progressive:
    enabled: true
    max_stacks: 12                # Maximum number of stacks
    target_stacks: 6              # Number of stacks to build progressively
    building_mode: "append"       # "append" | "prepend"
    trunk_strategy: "frozen"      # "frozen" | "qlora"
    new_stack_precision: "full"   # "full" | "nf_fp8" | "fp16"
    cache_activations: true       # Cache activations for trunk training
    time_interpretation: "depth"  # "depth" | "input"
    training_objective: "clm"     # "mlm" | "clm" | "custom"
    # Note: QLoRA configuration lives in the qlora section below

  # Fine-tuning
  fine_tune_mode: "clm"  # clm | mlm | diffusion

  # Run identification and organization
  run_id: "standard_run"  # Unique identifier for this training run

  # QLoRA and quantization settings
  qlora:
    enabled: true
    rank: 16      # Rank of the low-rank adaptation matrices
    alpha: 32     # Scaling factor (typically 2x rank)
    dropout: 0.1  # Dropout rate for QLoRA adapters
    lr: 1.0e-5    # Learning rate for QLoRA parameters
    # Progressive QLoRA settings
    progressive_enabled: false
    progressive_rank: 8    # Base rank for progressive strategies
    progressive_alpha: 16  # Base alpha for progressive strategies
    # QLoRA strategy and patterns
    strategy: "simplified"  # simplified | progressive | variable
    # simplified: Same QLoRA config for all stacks
    # progressive: Gradually change QLoRA parameters across stacks
    # variable: Custom QLoRA config per stack (defined in the configs dict; see the sketch below)
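    # Illustrative sketch only: the per-stack "configs" layout below is an
    # assumption about how strategy: "variable" might be expressed, not a
    # schema documented in this file.
    # configs:
    #   stack_0: {rank: 8,  alpha: 16, dropout: 0.05}
    #   stack_1: {rank: 16, alpha: 32, dropout: 0.10}
    #   stack_2: {rank: 32, alpha: 64, dropout: 0.10}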

    # Mixed-precision training (auto-enabled with QLoRA)
    mixed_precision: true  # Frozen trunk in fp16 + adapters in fp32
    rank_pattern: "constant"  # constant | increasing | decreasing
    # constant: Same rank for all stacks
    # increasing: Rank increases with stack depth (early: low rank, later: high rank)
    # decreasing: Rank decreases with stack depth (early: high rank, later: low rank)
    alpha_pattern: "constant"  # constant | increasing | decreasing
    # constant: Same alpha for all stacks
    # increasing: Alpha increases with stack depth
    # decreasing: Alpha decreases with stack depth
    current_block_lr: 5.0e-5
    quantization_enabled: true
    quantization_type: "fp16"  # fp4 | fp8 | fp16 | fp32

  # Time-step-based masking
  time_step_masking: true  # Enable time-step-based masking
  num_time_steps: 6        # Number of discrete time steps
  time_step_mask_fractions: [0.15, 0.25, 0.35, 0.45, 0.55, 0.65]  # One fraction per time step

  # Logging and checkpointing
  log_interval: 10
  save_interval: 100
  checkpoint_dir: "./checkpoints"

data:
  # Dataset parameters
  dataset_path: null
  use_dummy_data: true
  num_samples: 1000

  # Data preprocessing
  tokenizer_path: null
  max_length: 512
  padding: "right"

  # Data loading
  num_workers: 0
  pin_memory: true
  shuffle: true
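
  # Illustrative only: the path below is a placeholder, not a file shipped with
  # this repo. To train on a real dataset one would presumably set, e.g.:
  #   dataset_path: "./data/train.txt"
  #   use_dummy_data: false
  #   tokenizer_path: null  # keep null to use the gpt2 tokenizer family above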