Skip to content
Open

1 #1

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"files.exclude": {
"**/.git/": true,
"**/*.egg-info/": true,
"**/tmp": true,
"**/__pycache__/": true
}
}
Empty file removed README.md
Empty file.
16 changes: 16 additions & 0 deletions TARGET/DO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
有TP,PP,DP并行框架的基础,一些代码层面的基础知识不必重复
想要理解清楚整个技术栈从代码到硬件的性能瓶颈



必须精通
L0 训练循环与状态机:forward/backward/optimizer/AMP/checkpoint/grad accum
L1 autograd(自动求导):graph、hook、saved tensor、inplace、view、backward 调度
L2 dispatcher/ATen:op如何选后端、tensor metadata、stride/layout/contiguous
L3 CUDA runtime(运行时)/stream/event/allocator/async 语义
L4 distributed(分布式):DDP/FSDP/ZeRO/NCCL/bucket/overlap/topology
L5 graph-level execution(图级执行):CUDA Graph、静态化、地址稳定、生命周期管理
必须能看懂
L6 compiler path(编译链):torch.compile/Inductor/Triton/PTX,至少知道代码怎么落到 kernel
L7 kernel execution(核执行):grid/block/warp/occupancy/divergence/coalescing
L8 memory hierarchy(存储层级):HBM/L2/shared memory/registers 对性能的影响
L9 interconnect(互连):PCIe/NVLink/IB 对 TP/DP/PP 的约束
175 changes: 0 additions & 175 deletions TARGET/index.md

This file was deleted.

28 changes: 26 additions & 2 deletions configs/gpt2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@ distributed:
pp_size: 1
dp_size: 1
pp_engine: "1f1b"
use_cpu: false

model:
name: "gpt2"
tokenizer: "tokenizer_gpt2_2000"
# eager: 手写 attention,适合教学与追踪每一步张量流
# sdpa: 使用 PyTorch 官方 scaled_dot_product_attention 后端
attention_backend: "sdpa"
# activation checkpointing 会在反向时重算 block 前向,减少激活显存占用。
activation_checkpointing: false
dropout: 0.1
vocab_size: 2000
block_size: 32
embed_dim: 4
Expand All @@ -24,15 +29,23 @@ training:
num_samples: 400000
max_tokens: null
use_fused_adam: false
# 开启后在 CUDA 上使用 autocast;dtype 由运行时自动选择。
use_autocast: false
# 当前只建议在单卡 CUDA dense 路径打开 compile。
use_torch_compile: false
torch_compile_mode: "reduce-overhead"

dataset:
# random: 用随机 token 做并行与吞吐基线
# hf: 走真实 HuggingFace dataset,验证训练闭环
loader: "random"
name: "roneneldan/TinyStories"
subset_name: null
num_workers: 0
num_proc: 4

checkpoint:
save_dir: "tmp/gpt2-cpu"
save_dir: "tmp/gpt2"
save_frequency: 300
load_path: ""

Expand All @@ -41,6 +54,17 @@ logging:
project_name: "multigpt"
run_name: "multigpt"

profiling:
enabled: false
output_dir: "tmp/profiler"
wait: 1
warmup: 1
active: 2
repeat: 1
record_shapes: true
profile_memory: true
with_stack: false

environment:
OMP_NUM_THREADS: "1"
TOKENIZERS_PARALLELISM: "false"
68 changes: 68 additions & 0 deletions configs/qwen3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
distributed:
tp_size: 2
cp_size: 1
pp_size: 1
dp_size: 1
pp_engine: "1f1b"

model:
name: "qwen3"
tokenizer: "tokenizer_gpt2_2000"
# 这里保留最小 Qwen3-like 结构:RMSNorm + RoPE + GQA + SwiGLU
attention_backend: "sdpa"
activation_checkpointing: false
dropout: 0.0
vocab_size: 2000
block_size: 32
embed_dim: 8
num_heads: 4
num_key_value_heads: 2
intermediate_size: 32
num_layers: 4
rms_norm_eps: 0.000001
rope_theta: 1000000.0

training:
seed: 42
learning_rate: 0.0003
total_train_steps: 20000
micro_batch_size: 4
gradient_accumulation_steps: 2
num_samples: 400000
max_tokens: null
use_fused_adam: false
use_autocast: false
use_torch_compile: false
torch_compile_mode: "reduce-overhead"

dataset:
loader: "random"
name: "roneneldan/TinyStories"
subset_name: null
num_workers: 0
num_proc: 4

checkpoint:
save_dir: "tmp/qwen3"
save_frequency: 300
load_path: ""

logging:
use_wandb: false
project_name: "multigpt"
run_name: "multigpt-qwen3"

profiling:
enabled: false
output_dir: "tmp/profiler"
wait: 1
warmup: 1
active: 2
repeat: 1
record_shapes: true
profile_memory: true
with_stack: false

environment:
OMP_NUM_THREADS: "1"
TOKENIZERS_PARALLELISM: "false"
Loading