Skip to content
Open

1 #1

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"files.exclude": {
"**/.git/": true,
"**/*.egg-info/": true,
"**/tmp": true,
"**/__pycache__/": true
}
}
Empty file removed README.md
Empty file.
16 changes: 16 additions & 0 deletions TARGET/DO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
有TP,PP,DP并行框架的基础,一些代码层面的基础知识不必重复
想要理解清楚整个技术栈从代码到硬件的性能瓶颈



必须精通
L0 训练循环与状态机:forward/backward/optimizer/AMP/checkpoint/grad accum
L1 autograd(自动求导):graph、hook、saved tensor、inplace、view、backward 调度
L2 dispatcher/ATen:op如何选后端、tensor metadata、stride/layout/contiguous
L3 CUDA runtime(运行时)/stream/event/allocator/async 语义
L4 distributed(分布式):DDP/FSDP/ZeRO/NCCL/bucket/overlap/topology
L5 graph-level execution(图级执行):CUDA Graph、静态化、地址稳定、生命周期管理
必须能看懂
L6 compiler path(编译链):torch.compile/Inductor/Triton/PTX,至少知道代码怎么落到 kernel
L7 kernel execution(核执行):grid/block/warp/occupancy/divergence/coalescing
L8 memory hierarchy(存储层级):HBM/L2/shared memory/registers 对性能的影响
L9 interconnect(互连):PCIe/NVLink/IB 对 TP/DP/PP 的约束
175 changes: 0 additions & 175 deletions TARGET/index.md

This file was deleted.

28 changes: 26 additions & 2 deletions configs/gpt2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@ distributed:
pp_size: 1
dp_size: 1
pp_engine: "1f1b"
use_cpu: false

model:
name: "gpt2"
tokenizer: "tokenizer_gpt2_2000"
# eager: 手写 attention,适合教学与追踪每一步张量流
# sdpa: 使用 PyTorch 官方 scaled_dot_product_attention 后端
attention_backend: "sdpa"
# activation checkpointing 会在反向时重算 block 前向,减少激活显存占用。
activation_checkpointing: false
dropout: 0.1
vocab_size: 2000
block_size: 32
embed_dim: 4
Expand All @@ -24,15 +29,23 @@ training:
num_samples: 400000
max_tokens: null
use_fused_adam: false
# 开启后在 CUDA 上使用 autocast;dtype 由运行时自动选择。
use_autocast: false
# 当前只建议在单卡 CUDA dense 路径打开 compile。
use_torch_compile: false
torch_compile_mode: "reduce-overhead"

dataset:
# random: 用随机 token 做并行与吞吐基线
# hf: 走真实 HuggingFace dataset,验证训练闭环
loader: "random"
name: "roneneldan/TinyStories"
subset_name: null
num_workers: 0
num_proc: 4

checkpoint:
save_dir: "tmp/gpt2-cpu"
save_dir: "tmp/gpt2"
save_frequency: 300
load_path: ""

Expand All @@ -41,6 +54,17 @@ logging:
project_name: "multigpt"
run_name: "multigpt"

profiling:
enabled: false
output_dir: "tmp/profiler"
wait: 1
warmup: 1
active: 2
repeat: 1
record_shapes: true
profile_memory: true
with_stack: false

environment:
OMP_NUM_THREADS: "1"
TOKENIZERS_PARALLELISM: "false"
68 changes: 68 additions & 0 deletions configs/qwen3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
distributed:
tp_size: 2
cp_size: 1
pp_size: 1
dp_size: 1
pp_engine: "1f1b"

model:
name: "qwen3"
tokenizer: "tokenizer_gpt2_2000"
# 这里保留最小 Qwen3-like 结构:RMSNorm + RoPE + GQA + SwiGLU
attention_backend: "sdpa"
activation_checkpointing: false
dropout: 0.0
vocab_size: 2000
block_size: 32
embed_dim: 8
num_heads: 4
num_key_value_heads: 2
intermediate_size: 32
num_layers: 4
rms_norm_eps: 0.000001
rope_theta: 1000000.0

training:
seed: 42
learning_rate: 0.0003
total_train_steps: 20000
micro_batch_size: 4
gradient_accumulation_steps: 2
num_samples: 400000
max_tokens: null
use_fused_adam: false
use_autocast: false
use_torch_compile: false
torch_compile_mode: "reduce-overhead"

dataset:
loader: "random"
name: "roneneldan/TinyStories"
subset_name: null
num_workers: 0
num_proc: 4

checkpoint:
save_dir: "tmp/qwen3"
save_frequency: 300
load_path: ""

logging:
use_wandb: false
project_name: "multigpt"
run_name: "multigpt-qwen3"

profiling:
enabled: false
output_dir: "tmp/profiler"
wait: 1
warmup: 1
active: 2
repeat: 1
record_shapes: true
profile_memory: true
with_stack: false

environment:
OMP_NUM_THREADS: "1"
TOKENIZERS_PARALLELISM: "false"
Loading