-
Notifications
You must be signed in to change notification settings - Fork 34
Training Benchmark #464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Training Benchmark #464
Changes from all commits
cc302b3
b672787
d508183
b697027
750b6bb
1bf91b3
8162b3f
f9563bd
21d8cd5
63c233f
21c4e73
013d82e
982ae56
abc4c84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| # Base image | ||
| FROM docker.io/rocm/megatron-lm:v25.10 | ||
|
|
||
| # Specify the commit of Primus-Turbo when building (docker build --build-arg PRIMUS_TURBO_COMMIT=xxx .) | ||
| ARG PRIMUS_TURBO_COMMIT | ||
| ARG AINIC_BUNDLE_PATH | ||
|
|
||
| # Install basic dependencies | ||
| RUN apt-get update | ||
|
|
||
| # Clone and install the Primus-Turbo | ||
| WORKDIR /opt | ||
| RUN mkdir -p /opt && cd /opt && \ | ||
| git clone https://github.com/AMD-AGI/Primus-Turbo.git && \ | ||
| cd Primus-Turbo && \ | ||
| git checkout ${PRIMUS_TURBO_COMMIT} && \ | ||
| git submodule update --init --recursive && \ | ||
| pip3 install -r requirements.txt && \ | ||
| GPU_ARCHS="gfx942;gfx950" pip3 install --no-build-isolation . | ||
|
|
||
| RUN apt-get install --reinstall binutils -y && apt-get install numactl -y | ||
|
|
||
| WORKDIR /opt | ||
| ENV WORKDIR=/opt | ||
| ENV ROCM_PATH=/opt/rocm | ||
|
|
||
| RUN apt-get update && \ | ||
| apt-get install jq dpkg-dev kmod xz-utils \ | ||
| libfmt-dev libboost-all-dev \ | ||
| libibverbs-dev ibverbs-utils infiniband-diags -y | ||
|
|
||
| # =============================== Build AINIC Driver =============================== | ||
| # WARNING: Please ensure the following environment variables are correctly set: | ||
| # WARNING: 1. PATH: /usr/sbin must be included. | ||
| # WARNING: 2. LD_LIBRARY_PATH: /usr/lib must be included. | ||
| # WARNING: If these paths are missing, tools and libraries may not function correctly. | ||
| # INFO: Installation completed successfully | ||
|
|
||
| COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.1-a-42.tar.gz /opt/ | ||
| RUN cd ${WORKDIR} && \ | ||
| echo "Building ainic bundle... current directory: ${WORKDIR}" && \ | ||
| tar zxf ainic_bundle_1.117.1-a-42.tar.gz && \ | ||
| cd ainic_bundle_1.117.1-a-42 && \ | ||
| tar zxf host_sw_pkg.tar.gz && \ | ||
| cd host_sw_pkg && \ | ||
| ./install.sh --domain=user -y 2>&1 | tee log_install.txt && \ | ||
| cd /opt | ||
|
|
||
| # =============================== Test AINIC Driver =============================== | ||
| # ibv_devices | ||
| # rdma link | ||
| # ethtool -i enp9s0 | ||
| # ibv_devinfo -vv | grep GID | ||
|
|
||
| # =============================== Build UCX =============================== | ||
| RUN cd ${WORKDIR} && wget https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz && \ | ||
| mkdir -p ucx-1.18.0 && \ | ||
| tar -zxf ucx-1.18.0.tar.gz -C ucx-1.18.0 --strip-components=1 && \ | ||
| cd ucx-1.18.0 && mkdir build && cd build && \ | ||
| ../configure --prefix=${WORKDIR}/ucx-1.18.0/install --with-rocm=${ROCM_PATH} 2>&1 | tee log_ucx_configure.txt && \ | ||
| make -j 16 2>&1 | tee log_ucx_build.txt && \ | ||
| make install && \ | ||
| cd ${WORKDIR} | ||
|
|
||
| ENV UCX_INSTALL_DIR=${WORKDIR}/ucx-1.18.0/install | ||
|
|
||
| # =============================== Build MPI =============================== | ||
| RUN cd ${WORKDIR} && \ | ||
| wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz && \ | ||
| mkdir -p ompi-4.1.6 && \ | ||
| tar -zxf openmpi-4.1.6.tar.gz -C ompi-4.1.6 --strip-components=1 && \ | ||
| cd ompi-4.1.6 && mkdir build && cd build && \ | ||
| ../configure --prefix=${WORKDIR}/ompi-4.1.6/install --with-ucx=${UCX_INSTALL_DIR} \ | ||
| --disable-oshmem --disable-mpi-fortran 2>&1 | tee log_mpi_configure.txt && \ | ||
| make -j 16 2>&1 | tee log_mpi_build.txt && \ | ||
| make install && \ | ||
| cd ${WORKDIR} | ||
|
|
||
| ENV MPI_PATH=${WORKDIR}/ompi-4.1.6/install | ||
|
|
||
| # =============================== Build RCCL =============================== | ||
| RUN cd ${WORKDIR} && \ | ||
| git clone https://github.com/ROCm/rccl.git && \ | ||
| cd rccl && git checkout drop/2025-08 && \ | ||
| ./install.sh -l --prefix build/ --disable-mscclpp \ | ||
| --disable-msccl-kernel --amdgpu_targets="gfx950" 2>&1 | tee log_rccl_install.txt && \ | ||
| cd ${WORKDIR} | ||
|
|
||
| ENV RCCL_HOME=${WORKDIR}/rccl | ||
|
|
||
| # =============================== Build AMD ANP =============================== | ||
|
|
||
| RUN cd ${WORKDIR} && git clone https://github.com/rocm/amd-anp.git && \ | ||
| cd amd-anp && git checkout tags/v1.1.0-5 && \ | ||
| sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \ | ||
| make -j 16 RCCL_BUILD=${RCCL_HOME}/build/release \ | ||
| MPI_INCLUDE=${MPI_PATH}/include/ \ | ||
| MPI_LIB_PATH=${MPI_PATH}/lib/ \ | ||
| ROCM_PATH=${ROCM_PATH} 2>&1 | tee log_amd_anp_build.txt | ||
|
|
||
| # Set the default working directory | ||
| WORKDIR /opt | ||
|
|
||
| # check the installed Primus-Turbo package | ||
| RUN python3 -m pip show primus-turbo || true |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,114 @@ | ||||||||||||||||||||
| # Base image | ||||||||||||||||||||
| FROM docker.io/rocm/primus:v25.11 | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # Specify the commit of Primus-Turbo when building (docker build --build-arg PRIMUS_TURBO_COMMIT=xxx .) | ||||||||||||||||||||
| ARG PRIMUS_TURBO_COMMIT | ||||||||||||||||||||
| ARG AINIC_BUNDLE_PATH | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # Install basic dependencies | ||||||||||||||||||||
| RUN apt-get update | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # Clone and install the Primus-Turbo | ||||||||||||||||||||
| WORKDIR /opt | ||||||||||||||||||||
| RUN mkdir -p /opt && cd /opt && \ | ||||||||||||||||||||
| git clone https://github.com/AMD-AGI/Primus-Turbo.git && \ | ||||||||||||||||||||
| cd Primus-Turbo && \ | ||||||||||||||||||||
| git checkout ${PRIMUS_TURBO_COMMIT} && \ | ||||||||||||||||||||
| git submodule update --init --recursive && \ | ||||||||||||||||||||
| pip3 install -r requirements.txt && \ | ||||||||||||||||||||
| GPU_ARCHS="gfx942;gfx950" pip3 install --no-build-isolation . | ||||||||||||||||||||
|
|
||||||||||||||||||||
| RUN apt-get install --reinstall binutils -y && apt-get install numactl -y | ||||||||||||||||||||
|
|
||||||||||||||||||||
| WORKDIR /opt | ||||||||||||||||||||
| ENV WORKDIR=/opt | ||||||||||||||||||||
| ENV ROCM_PATH=/opt/rocm | ||||||||||||||||||||
|
|
||||||||||||||||||||
| RUN apt-get update && \ | ||||||||||||||||||||
| apt-get install jq dpkg-dev kmod xz-utils \ | ||||||||||||||||||||
| libfmt-dev libboost-all-dev \ | ||||||||||||||||||||
| libibverbs-dev ibverbs-utils infiniband-diags -y | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # =============================== Build AINIC Driver =============================== | ||||||||||||||||||||
| # WARNING: Please ensure the following environment variables are correctly set: | ||||||||||||||||||||
| # WARNING: 1. PATH: /usr/sbin must be included. | ||||||||||||||||||||
| # WARNING: 2. LD_LIBRARY_PATH: /usr/lib must be included. | ||||||||||||||||||||
| # WARNING: If these paths are missing, tools and libraries may not function correctly. | ||||||||||||||||||||
| # INFO: Installation completed successfully | ||||||||||||||||||||
|
|
||||||||||||||||||||
| COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-38.tar.gz /opt/ | ||||||||||||||||||||
| RUN cd ${WORKDIR} && \ | ||||||||||||||||||||
| echo "Building ainic bundle... current directory: ${WORKDIR}" && \ | ||||||||||||||||||||
| tar zxf ainic_bundle_1.117.5-a-38.tar.gz && \ | ||||||||||||||||||||
| cd ainic_bundle_1.117.5-a-38 && \ | ||||||||||||||||||||
| tar zxf host_sw_pkg.tar.gz && \ | ||||||||||||||||||||
| cd host_sw_pkg && \ | ||||||||||||||||||||
| ./install.sh --domain=user -y 2>&1 | tee log_install.txt && \ | ||||||||||||||||||||
| cd /opt | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # =============================== Test AINIC Driver =============================== | ||||||||||||||||||||
| # ibv_devices | ||||||||||||||||||||
| # rdma link | ||||||||||||||||||||
| # ethtool -i enp9s0 | ||||||||||||||||||||
| # ibv_devinfo -vv | grep GID | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # =============================== Build UCX =============================== | ||||||||||||||||||||
| ENV USE_UCX_VERSION="1.15.0" | ||||||||||||||||||||
| RUN cd ${WORKDIR} && wget https://github.com/openucx/ucx/releases/download/v${USE_UCX_VERSION}/ucx-${USE_UCX_VERSION}.tar.gz && \ | ||||||||||||||||||||
| mkdir -p ucx-${USE_UCX_VERSION} && \ | ||||||||||||||||||||
| tar -zxf ucx-${USE_UCX_VERSION}.tar.gz -C ucx-${USE_UCX_VERSION} --strip-components=1 && \ | ||||||||||||||||||||
| cd ucx-${USE_UCX_VERSION} && mkdir build && cd build && \ | ||||||||||||||||||||
| ../configure --prefix=${WORKDIR}/ucx-${USE_UCX_VERSION}/install --with-rocm=${ROCM_PATH} 2>&1 | tee log_ucx_configure.txt && \ | ||||||||||||||||||||
| make -j 16 2>&1 | tee log_ucx_build.txt && \ | ||||||||||||||||||||
| make install && \ | ||||||||||||||||||||
| cd ${WORKDIR} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| ENV UCX_INSTALL_DIR=${WORKDIR}/ucx-${USE_UCX_VERSION}/install | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # =============================== Build MPI =============================== | ||||||||||||||||||||
| RUN cd ${WORKDIR} && \ | ||||||||||||||||||||
| wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz && \ | ||||||||||||||||||||
| mkdir -p ompi-4.1.6 && \ | ||||||||||||||||||||
| tar -zxf openmpi-4.1.6.tar.gz -C ompi-4.1.6 --strip-components=1 && \ | ||||||||||||||||||||
| cd ompi-4.1.6 && mkdir build && cd build && \ | ||||||||||||||||||||
| ../configure --prefix=${WORKDIR}/ompi-4.1.6/install --with-ucx=${UCX_INSTALL_DIR} \ | ||||||||||||||||||||
| --disable-oshmem --disable-mpi-fortran 2>&1 | tee log_mpi_configure.txt && \ | ||||||||||||||||||||
| make -j 16 2>&1 | tee log_mpi_build.txt && \ | ||||||||||||||||||||
| make install && \ | ||||||||||||||||||||
| cd ${WORKDIR} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| ENV MPI_PATH=${WORKDIR}/ompi-4.1.6/install | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # =============================== Build RCCL =============================== | ||||||||||||||||||||
| # cd rccl && git checkout rocm-7.1.0 && \ | ||||||||||||||||||||
|
||||||||||||||||||||
| # cd rccl && git checkout rocm-7.1.0 && \ |
Copilot
AI
Jan 3, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The RCCL checkout uses a branch name 'release/rocm-rel-7.1' instead of a specific tag or commit. This could lead to non-reproducible builds as the branch may be updated. Consider using a specific tag or commit hash for reproducibility.
| cd rccl && git checkout release/rocm-rel-7.1 && \ | |
| cd rccl && git checkout rocm-7.1.0 && \ |
Copilot
AI
Dec 30, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This commented-out line appears redundant since line 102 performs the same operation. Consider removing this commented line to reduce clutter.
| # sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \ |
Copilot
AI
Dec 30, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These commented-out patch-related lines suggest incomplete implementation. Either implement the patch application if needed or remove these lines with an explanation of why the patch is no longer required.
| # sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \ | |
| WORKDIR /opt | |
| # COPY ${AINIC_BUNDLE_PATH}/amd-anp-v1.3.0.patch /opt/ | |
| # git apply /opt/amd-anp-v1.3.0.patch && \ | |
| # Note: amd-anp v1.3.0 is built directly from the upstream tag; any required | |
| # customization is applied via the sed-based CFLAGS modification below, so no | |
| # additional patch file is applied here. | |
| WORKDIR /opt |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| work_group: ${PRIMUS_TEAM:amd} | ||
| user_name: ${PRIMUS_USER:root} | ||
| exp_name: ${PRIMUS_EXP_NAME:llama3_70B-pretrain} | ||
| workspace: ${PRIMUS_WORKSPACE:./output} | ||
|
|
||
| modules: | ||
| pre_trainer: | ||
| framework: megatron | ||
| config: pre_trainer.yaml | ||
| model: llama3.1_70B.yaml | ||
| overrides: | ||
| # log | ||
| wandb_project: "Primus_DeepSeek_Pretrain" | ||
| stderr_sink_level: DEBUG | ||
|
|
||
| log_avg_skip_iterations: 2 | ||
| log_avg_reset_interval: 50 | ||
|
|
||
| train_iters: 50 | ||
| micro_batch_size: 1 | ||
| global_batch_size: 32 | ||
|
|
||
| seq_length: 8192 | ||
| max_position_embeddings: 8192 | ||
|
|
||
| lr: 1.0e-5 | ||
| min_lr: 0.0 | ||
| lr_warmup_iters: 2 | ||
| lr_decay_iters: null | ||
| lr_decay_style: cosine | ||
| weight_decay: 0.1 | ||
| adam_beta1: 0.9 | ||
| adam_beta2: 0.95 | ||
| eod_mask_loss: true | ||
| init_method_std: 0.008 | ||
| norm_epsilon: 1.0e-6 | ||
|
|
||
| # parallel | ||
| tensor_model_parallel_size: 4 | ||
| pipeline_model_parallel_size: 8 | ||
| sequence_parallel: 1 | ||
|
|
||
| # data | ||
| mock_data: true | ||
| train_data_path: null | ||
| valid_data_path: null | ||
| test_data_path: null | ||
|
|
||
| # ckpt | ||
| finetune: false | ||
| auto_continue_train: false | ||
| load: null | ||
| no_load_optim: null | ||
| no_load_rng: null | ||
| save: null | ||
| save_interval: 20000 | ||
| no_save_optim: null | ||
| no_save_rng: null | ||
| disable_last_saving: true | ||
| ckpt_format: torch_dist | ||
|
|
||
| use_distributed_optimizer: true | ||
| overlap_grad_reduce: true | ||
| overlap_param_gather: true | ||
| gradient_accumulation_fusion: true | ||
|
|
||
| # recompute | ||
| recompute_granularity: full # full, selective | ||
| recompute_method: block # uniform, block | ||
| recompute_num_layers: 0 # int | ||
|
|
||
| # Cross entropy flags | ||
| # cross_entropy_fusion_impl: "te" | ||
| # cross_entropy_loss_fusion: true |
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -158,12 +158,12 @@ if [ "$USING_AINIC" == "1" ]; then | |||||||||
| LOG_INFO_RANK0 "ANP_HOME_DIR: $ANP_HOME_DIR" | ||||||||||
| LOG_INFO_RANK0 "MPI_HOME_DIR: $MPI_HOME_DIR" | ||||||||||
|
|
||||||||||
| # unset NCCL_IB_GID_INDEX | ||||||||||
| export NCCL_IB_TC=${NCCL_IB_TC:-104} | ||||||||||
| export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # 192 | ||||||||||
|
||||||||||
| export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # 192 | |
| export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # previous default: 192 |
Copilot
AI
Jan 5, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The inline comment '# 192' is unclear - it should explain why 192 is referenced when the default is 184, or be removed if it's no longer relevant.
| export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # 192 | |
| export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} |
Copilot
AI
Jan 5, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The entire LD_PRELOAD configuration logic is commented out without explanation. This appears to be critical library loading code for AINIC functionality. Either provide documentation explaining why it's disabled or remove the dead code if it's permanently deprecated.
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -31,7 +31,7 @@ def get_batch_func(data_iterator, vp_stage=None): | |||||
| if not is_first_or_last_pipeline_stage(vp_stage): | ||||||
| return None, None, None, None, None | ||||||
|
|
||||||
| assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}" | ||||||
| # assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}" | ||||||
|
||||||
| # assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}" | |
| assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The environment variable name 'USE_UCX_VERSION' is inconsistent with naming conventions. Consider using 'UCX_VERSION' to match the pattern of other version variables in the file.