Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
421 changes: 229 additions & 192 deletions .github/workflows/ci.yaml

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions .github/workflows/docker/Dockerfile_v25.10_ainic_temp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Base image: ROCm Megatron-LM release v25.10.
FROM docker.io/rocm/megatron-lm:v25.10

# Build arguments (neither has a default; supply both with --build-arg):
#   PRIMUS_TURBO_COMMIT - Primus-Turbo revision that is checked out before it is
#                         installed, e.g.
#                         docker build --build-arg PRIMUS_TURBO_COMMIT=xxx .
#   AINIC_BUNDLE_PATH   - path, relative to the build context, of the directory that
#                         holds the AINIC bundle tarball copied into the image.
ARG PRIMUS_TURBO_COMMIT
ARG AINIC_BUNDLE_PATH

# Clone and install Primus-Turbo at the requested revision.
# WORKDIR creates and enters /opt, so no explicit mkdir/cd is required.
WORKDIR /opt
# An empty PRIMUS_TURBO_COMMIT falls back to HEAD, i.e. the default branch as cloned.
# GPU_ARCHS selects the GPU targets the extension is compiled for.
RUN git clone https://github.com/AMD-AGI/Primus-Turbo.git && \
    cd Primus-Turbo && \
    git checkout "${PRIMUS_TURBO_COMMIT:-HEAD}" && \
    git submodule update --init --recursive && \
    pip3 install --no-cache-dir -r requirements.txt && \
    GPU_ARCHS="gfx942;gfx950" pip3 install --no-cache-dir --no-build-isolation .

# Refresh the package index in the same layer as the installs that depend on it.
# A standalone `apt-get update` layer can be served from the build cache and leave
# later installs working against a stale index.
RUN apt-get update && \
    apt-get install -y --reinstall binutils && \
    apt-get install -y numactl

# Shared locations referenced by the build steps that follow.
WORKDIR /opt
ENV WORKDIR=/opt \
    ROCM_PATH=/opt/rocm

# Build tooling plus the verbs/InfiniBand packages, one per line in alphabetical order.
RUN apt-get update && \
    apt-get install -y \
        dpkg-dev \
        ibverbs-utils \
        infiniband-diags \
        jq \
        kmod \
        libboost-all-dev \
        libfmt-dev \
        libibverbs-dev \
        xz-utils

# =============================== Build AINIC Driver ===============================
# WARNING: Please ensure the following environment variables are correctly set:
# WARNING: 1. PATH: /usr/sbin must be included.
# WARNING: 2. LD_LIBRARY_PATH: /usr/lib must be included.
# WARNING: If these paths are missing, tools and libraries may not function correctly.
# INFO: Installation completed successfully

# Copy the AINIC bundle from the build context into /opt.
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.1-a-42.tar.gz /opt/
# Unpack the bundle, unpack the host software package inside it, and run its
# installer with --domain=user (presumably the user-space components only; confirm
# against the bundle documentation).
# The installer output is teed into log_install.txt. The pipeline runs under
# `bash -o pipefail` so that an installer failure fails the build instead of being
# hidden behind tee's zero exit status.
RUN cd ${WORKDIR} && \
    echo "Building ainic bundle... current directory: ${WORKDIR}" && \
    tar zxf ainic_bundle_1.117.1-a-42.tar.gz && \
    cd ainic_bundle_1.117.1-a-42 && \
    tar zxf host_sw_pkg.tar.gz && \
    cd host_sw_pkg && \
    bash -o pipefail -c './install.sh --domain=user -y 2>&1 | tee log_install.txt'

# =============================== Test AINIC Driver ===============================
# ibv_devices
# rdma link
# ethtool -i enp9s0
# ibv_devinfo -vv | grep GID

# =============================== Build UCX ===============================
# Build UCX 1.18.0 from the release tarball with ROCm support and install it under
# ${WORKDIR}/ucx-1.18.0/install.
# - The tarball is deleted in the same layer once extracted, so it does not stay in
#   the image.
# - configure and make are teed into log files inside the build directory; each
#   pipeline runs under `bash -o pipefail` so a failing step stops the build right
#   there rather than being hidden behind tee's zero exit status.
RUN cd ${WORKDIR} && \
    wget https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz && \
    mkdir -p ucx-1.18.0 && \
    tar -zxf ucx-1.18.0.tar.gz -C ucx-1.18.0 --strip-components=1 && \
    rm -f ucx-1.18.0.tar.gz && \
    cd ucx-1.18.0 && mkdir build && cd build && \
    bash -o pipefail -c '../configure --prefix="${WORKDIR}/ucx-1.18.0/install" --with-rocm="${ROCM_PATH}" 2>&1 | tee log_ucx_configure.txt' && \
    bash -o pipefail -c 'make -j 16 2>&1 | tee log_ucx_build.txt' && \
    make install

# Install prefix consumed by the Open MPI build (--with-ucx).
ENV UCX_INSTALL_DIR=${WORKDIR}/ucx-1.18.0/install

# =============================== Build MPI ===============================
# Build Open MPI 4.1.6 against the UCX install above, without OpenSHMEM and Fortran
# bindings, and install it under ${WORKDIR}/ompi-4.1.6/install.
# - The tarball is deleted in the same layer once extracted.
# - configure and make are teed into log files; each pipeline runs under
#   `bash -o pipefail` so a failing step stops the build instead of being hidden
#   behind tee's zero exit status.
RUN cd ${WORKDIR} && \
    wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz && \
    mkdir -p ompi-4.1.6 && \
    tar -zxf openmpi-4.1.6.tar.gz -C ompi-4.1.6 --strip-components=1 && \
    rm -f openmpi-4.1.6.tar.gz && \
    cd ompi-4.1.6 && mkdir build && cd build && \
    bash -o pipefail -c '../configure --prefix="${WORKDIR}/ompi-4.1.6/install" --with-ucx="${UCX_INSTALL_DIR}" \
        --disable-oshmem --disable-mpi-fortran 2>&1 | tee log_mpi_configure.txt' && \
    bash -o pipefail -c 'make -j 16 2>&1 | tee log_mpi_build.txt' && \
    make install

# Install prefix consumed by the AMD ANP build (MPI_INCLUDE / MPI_LIB_PATH).
ENV MPI_PATH=${WORKDIR}/ompi-4.1.6/install

# =============================== Build RCCL ===============================
# Clone RCCL, check out drop/2025-08 and run its install script for gfx950 with a
# local prefix of build/ and the MSCCL++ / MSCCL-kernel components disabled.
# The script output is teed into log_rccl_install.txt. Nothing after this step would
# notice a failed build, so the pipeline runs under `bash -o pipefail` to propagate
# the script's exit status instead of tee's.
RUN cd ${WORKDIR} && \
    git clone https://github.com/ROCm/rccl.git && \
    cd rccl && git checkout drop/2025-08 && \
    bash -o pipefail -c './install.sh -l --prefix build/ --disable-mscclpp \
        --disable-msccl-kernel --amdgpu_targets="gfx950" 2>&1 | tee log_rccl_install.txt'

# RCCL checkout root; the AMD ANP build reads ${RCCL_HOME}/build/release.
ENV RCCL_HOME=${WORKDIR}/rccl

# =============================== Build AMD ANP ===============================
# Clone amd-anp at tag v1.1.0-5 and build it against the RCCL and Open MPI trees
# built above.
# - The sed call appends `CFLAGS += --offload-arch=gfx950` after line 5 of the
#   Makefile; `head -10` prints the result into the build log for inspection.
# - make output is teed into log_amd_anp_build.txt. The pipeline runs under
#   `bash -o pipefail` so a compile error fails the image build instead of being
#   hidden behind tee's zero exit status.
RUN cd ${WORKDIR} && git clone https://github.com/rocm/amd-anp.git && \
    cd amd-anp && git checkout tags/v1.1.0-5 && \
    sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \
    bash -o pipefail -c 'make -j 16 RCCL_BUILD="${RCCL_HOME}/build/release" \
        MPI_INCLUDE="${MPI_PATH}/include/" \
        MPI_LIB_PATH="${MPI_PATH}/lib/" \
        ROCM_PATH="${ROCM_PATH}" 2>&1 | tee log_amd_anp_build.txt'

# Leave /opt as the working directory of the finished image.
WORKDIR /opt

# Print the Primus-Turbo package metadata as a sanity check. A missing package is
# tolerated and does not fail the build.
RUN python3 -m pip show primus-turbo || :
114 changes: 114 additions & 0 deletions .github/workflows/docker/Dockerfile_v25.11_ainic
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Base image: ROCm Primus release v25.11.
FROM docker.io/rocm/primus:v25.11

# Build arguments (neither has a default; supply both with --build-arg):
#   PRIMUS_TURBO_COMMIT - Primus-Turbo revision that is checked out before it is
#                         installed, e.g.
#                         docker build --build-arg PRIMUS_TURBO_COMMIT=xxx .
#   AINIC_BUNDLE_PATH   - path, relative to the build context, of the directory that
#                         holds the AINIC bundle tarball copied into the image.
ARG PRIMUS_TURBO_COMMIT
ARG AINIC_BUNDLE_PATH

# Clone and install Primus-Turbo at the requested revision.
# WORKDIR creates and enters /opt, so no explicit mkdir/cd is required.
WORKDIR /opt
# An empty PRIMUS_TURBO_COMMIT falls back to HEAD, i.e. the default branch as cloned.
# GPU_ARCHS selects the GPU targets the extension is compiled for.
RUN git clone https://github.com/AMD-AGI/Primus-Turbo.git && \
    cd Primus-Turbo && \
    git checkout "${PRIMUS_TURBO_COMMIT:-HEAD}" && \
    git submodule update --init --recursive && \
    pip3 install --no-cache-dir -r requirements.txt && \
    GPU_ARCHS="gfx942;gfx950" pip3 install --no-cache-dir --no-build-isolation .

# Refresh the package index in the same layer as the installs that depend on it.
# A standalone `apt-get update` layer can be served from the build cache and leave
# later installs working against a stale index.
RUN apt-get update && \
    apt-get install -y --reinstall binutils && \
    apt-get install -y numactl

# Shared locations referenced by the build steps that follow.
WORKDIR /opt
ENV WORKDIR=/opt \
    ROCM_PATH=/opt/rocm

# Build tooling plus the verbs/InfiniBand packages, one per line in alphabetical order.
RUN apt-get update && \
    apt-get install -y \
        dpkg-dev \
        ibverbs-utils \
        infiniband-diags \
        jq \
        kmod \
        libboost-all-dev \
        libfmt-dev \
        libibverbs-dev \
        xz-utils

# =============================== Build AINIC Driver ===============================
# WARNING: Please ensure the following environment variables are correctly set:
# WARNING: 1. PATH: /usr/sbin must be included.
# WARNING: 2. LD_LIBRARY_PATH: /usr/lib must be included.
# WARNING: If these paths are missing, tools and libraries may not function correctly.
# INFO: Installation completed successfully

# Copy the AINIC bundle from the build context into /opt.
COPY ${AINIC_BUNDLE_PATH}/ainic_bundle_1.117.5-a-38.tar.gz /opt/
# Unpack the bundle, unpack the host software package inside it, and run its
# installer with --domain=user (presumably the user-space components only; confirm
# against the bundle documentation).
# The installer output is teed into log_install.txt. The pipeline runs under
# `bash -o pipefail` so that an installer failure fails the build instead of being
# hidden behind tee's zero exit status.
RUN cd ${WORKDIR} && \
    echo "Building ainic bundle... current directory: ${WORKDIR}" && \
    tar zxf ainic_bundle_1.117.5-a-38.tar.gz && \
    cd ainic_bundle_1.117.5-a-38 && \
    tar zxf host_sw_pkg.tar.gz && \
    cd host_sw_pkg && \
    bash -o pipefail -c './install.sh --domain=user -y 2>&1 | tee log_install.txt'

# =============================== Test AINIC Driver ===============================
# ibv_devices
# rdma link
# ethtool -i enp9s0
# ibv_devinfo -vv | grep GID

# =============================== Build UCX ===============================
# Build the UCX release named by USE_UCX_VERSION with ROCm support and install it
# under ${WORKDIR}/ucx-${USE_UCX_VERSION}/install.
# - The tarball is deleted in the same layer once extracted, so it does not stay in
#   the image.
# - configure and make are teed into log files inside the build directory; each
#   pipeline runs under `bash -o pipefail` so a failing step stops the build right
#   there rather than being hidden behind tee's zero exit status.
ENV USE_UCX_VERSION="1.15.0"
RUN cd ${WORKDIR} && \
    wget https://github.com/openucx/ucx/releases/download/v${USE_UCX_VERSION}/ucx-${USE_UCX_VERSION}.tar.gz && \
    mkdir -p ucx-${USE_UCX_VERSION} && \
    tar -zxf ucx-${USE_UCX_VERSION}.tar.gz -C ucx-${USE_UCX_VERSION} --strip-components=1 && \
    rm -f ucx-${USE_UCX_VERSION}.tar.gz && \
    cd ucx-${USE_UCX_VERSION} && mkdir build && cd build && \
    bash -o pipefail -c '../configure --prefix="${WORKDIR}/ucx-${USE_UCX_VERSION}/install" --with-rocm="${ROCM_PATH}" 2>&1 | tee log_ucx_configure.txt' && \
    bash -o pipefail -c 'make -j 16 2>&1 | tee log_ucx_build.txt' && \
    make install

# Install prefix consumed by the Open MPI build (--with-ucx).
ENV UCX_INSTALL_DIR=${WORKDIR}/ucx-${USE_UCX_VERSION}/install
Comment on lines +56 to +66
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The environment variable name 'USE_UCX_VERSION' is inconsistent with naming conventions. Consider using 'UCX_VERSION' to match the pattern of other version variables in the file.

Suggested change
ENV USE_UCX_VERSION="1.15.0"
RUN cd ${WORKDIR} && wget https://github.com/openucx/ucx/releases/download/v${USE_UCX_VERSION}/ucx-${USE_UCX_VERSION}.tar.gz && \
mkdir -p ucx-${USE_UCX_VERSION} && \
tar -zxf ucx-${USE_UCX_VERSION}.tar.gz -C ucx-${USE_UCX_VERSION} --strip-components=1 && \
cd ucx-${USE_UCX_VERSION} && mkdir build && cd build && \
../configure --prefix=${WORKDIR}/ucx-${USE_UCX_VERSION}/install --with-rocm=${ROCM_PATH} 2>&1 | tee log_ucx_configure.txt && \
make -j 16 2>&1 | tee log_ucx_build.txt && \
make install && \
cd ${WORKDIR}
ENV UCX_INSTALL_DIR=${WORKDIR}/ucx-${USE_UCX_VERSION}/install
ENV UCX_VERSION="1.15.0"
RUN cd ${WORKDIR} && wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz && \
mkdir -p ucx-${UCX_VERSION} && \
tar -zxf ucx-${UCX_VERSION}.tar.gz -C ucx-${UCX_VERSION} --strip-components=1 && \
cd ucx-${UCX_VERSION} && mkdir build && cd build && \
../configure --prefix=${WORKDIR}/ucx-${UCX_VERSION}/install --with-rocm=${ROCM_PATH} 2>&1 | tee log_ucx_configure.txt && \
make -j 16 2>&1 | tee log_ucx_build.txt && \
make install && \
cd ${WORKDIR}
ENV UCX_INSTALL_DIR=${WORKDIR}/ucx-${UCX_VERSION}/install

Copilot uses AI. Check for mistakes.

# =============================== Build MPI ===============================
# Build Open MPI 4.1.6 against the UCX install above, without OpenSHMEM and Fortran
# bindings, and install it under ${WORKDIR}/ompi-4.1.6/install.
# - The tarball is deleted in the same layer once extracted.
# - configure and make are teed into log files; each pipeline runs under
#   `bash -o pipefail` so a failing step stops the build instead of being hidden
#   behind tee's zero exit status.
RUN cd ${WORKDIR} && \
    wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz && \
    mkdir -p ompi-4.1.6 && \
    tar -zxf openmpi-4.1.6.tar.gz -C ompi-4.1.6 --strip-components=1 && \
    rm -f openmpi-4.1.6.tar.gz && \
    cd ompi-4.1.6 && mkdir build && cd build && \
    bash -o pipefail -c '../configure --prefix="${WORKDIR}/ompi-4.1.6/install" --with-ucx="${UCX_INSTALL_DIR}" \
        --disable-oshmem --disable-mpi-fortran 2>&1 | tee log_mpi_configure.txt' && \
    bash -o pipefail -c 'make -j 16 2>&1 | tee log_mpi_build.txt' && \
    make install

# Install prefix consumed by the AMD ANP build (MPI_INCLUDE / MPI_LIB_PATH).
ENV MPI_PATH=${WORKDIR}/ompi-4.1.6/install

# =============================== Build RCCL ===============================
# cd rccl && git checkout rocm-7.1.0 && \
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commented-out line appears to be leftover code. If the git checkout command is intentionally disabled, add a comment explaining why; otherwise, remove it to avoid confusion.

Suggested change
# cd rccl && git checkout rocm-7.1.0 && \

Copilot uses AI. Check for mistakes.
RUN cd ${WORKDIR} && \
git clone https://github.com/ROCm/rccl.git && \
cd rccl && git checkout release/rocm-rel-7.1 && \
Copy link

Copilot AI Jan 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The RCCL checkout uses a branch name 'release/rocm-rel-7.1' instead of a specific tag or commit. This could lead to non-reproducible builds as the branch may be updated. Consider using a specific tag or commit hash for reproducibility.

Suggested change
cd rccl && git checkout release/rocm-rel-7.1 && \
cd rccl && git checkout rocm-7.1.0 && \

Copilot uses AI. Check for mistakes.
./install.sh -l --prefix build/ --disable-msccl-kernel \
--amdgpu_targets="gfx950" 2>&1 | tee log_rccl_install.txt && \
cd ${WORKDIR}

ENV RCCL_HOME=${WORKDIR}/rccl

# =============================== Build AMD ANP ===============================

# sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \

Comment on lines +95 to +96
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commented-out line appears redundant since line 102 performs the same operation. Consider removing this commented line to reduce clutter.

Suggested change
# sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \

Copilot uses AI. Check for mistakes.
WORKDIR /opt
# COPY ${AINIC_BUNDLE_PATH}/amd-anp-v1.3.0.patch /opt/
# git apply /opt/amd-anp-v1.3.0.patch && \
Comment on lines +95 to +99
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These commented-out patch-related lines suggest incomplete implementation. Either implement the patch application if needed or remove these lines with an explanation of why the patch is no longer required.

Suggested change
# sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \
WORKDIR /opt
# COPY ${AINIC_BUNDLE_PATH}/amd-anp-v1.3.0.patch /opt/
# git apply /opt/amd-anp-v1.3.0.patch && \
# Note: amd-anp v1.3.0 is built directly from the upstream tag; any required
# customization is applied via the sed-based CFLAGS modification below, so no
# additional patch file is applied here.
WORKDIR /opt

Copilot uses AI. Check for mistakes.
# Clone amd-anp at tag v1.3.0, build it against the RCCL and Open MPI trees built
# above, then run its install target.
# - The sed call appends `CFLAGS += --offload-arch=gfx950` after line 5 of the
#   Makefile; `head -10` prints the result into the build log for inspection.
# - Both the build and the install output go to log_amd_anp_build.txt (the install
#   step appends with `tee -a`). Each pipeline runs under `bash -o pipefail` so a
#   failure in either step fails the image build instead of being hidden behind
#   tee's zero exit status.
RUN cd ${WORKDIR} && git clone https://github.com/rocm/amd-anp.git && \
    cd amd-anp && git checkout tags/v1.3.0 && \
    sed -i '5a CFLAGS += --offload-arch=gfx950' ./Makefile && head -10 ./Makefile && \
    bash -o pipefail -c 'make -j 16 RCCL_HOME="${RCCL_HOME}" \
        MPI_INCLUDE="${MPI_PATH}/include/" \
        MPI_LIB_PATH="${MPI_PATH}/lib/" \
        ROCM_PATH="${ROCM_PATH}" 2>&1 | tee log_amd_anp_build.txt' && \
    bash -o pipefail -c 'make RCCL_HOME="${RCCL_HOME}" ROCM_PATH="${ROCM_PATH}" install \
        2>&1 | tee -a log_amd_anp_build.txt'

# Leave /opt as the working directory of the finished image.
WORKDIR /opt

# Print the Primus-Turbo package metadata as a sanity check. A missing package is
# tolerated and does not fail the build.
RUN python3 -m pip show primus-turbo || :
Empty file added examples/__init__.py
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Experiment configuration for a Megatron pre-training run of the llama3.1_70B model
# on mock data.
# The ${NAME:value} entries look like environment-variable placeholders with a
# fallback value; confirm against the config loader.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:llama3_70B-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

# Module definitions; pre_trainer is the only module configured here.
modules:
pre_trainer:
framework: megatron
config: pre_trainer.yaml
model: llama3.1_70B.yaml
# Values below override settings from the config/model files named above.
overrides:
# log
# NOTE(review): the project name says DeepSeek while this file configures a
# Llama 3.1 70B run; confirm this is intentional and not a copy-paste leftover.
wandb_project: "Primus_DeepSeek_Pretrain"
stderr_sink_level: DEBUG

log_avg_skip_iterations: 2
log_avg_reset_interval: 50

# Short run: 50 iterations in total.
train_iters: 50
micro_batch_size: 1
global_batch_size: 32

seq_length: 8192
max_position_embeddings: 8192

# Optimizer and learning-rate schedule.
lr: 1.0e-5
min_lr: 0.0
lr_warmup_iters: 2
lr_decay_iters: null
lr_decay_style: cosine
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
eod_mask_loss: true
init_method_std: 0.008
norm_epsilon: 1.0e-6

# parallel
# TP 4 x PP 8 = 32 model-parallel ranks; presumably the job needs at least 32 GPUs
# (TODO confirm).
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 8
sequence_parallel: 1

# data
# Mock data is enabled, so no dataset paths are set.
mock_data: true
train_data_path: null
valid_data_path: null
test_data_path: null

# ckpt
# No load/save paths are set and disable_last_saving is true.
finetune: false
auto_continue_train: false
load: null
no_load_optim: null
no_load_rng: null
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
disable_last_saving: true
ckpt_format: torch_dist

use_distributed_optimizer: true
overlap_grad_reduce: true
overlap_param_gather: true
gradient_accumulation_fusion: true

# recompute
# NOTE(review): granularity "full" with method "block" but recompute_num_layers is
# 0; verify whether that leaves recomputation effectively off.
recompute_granularity: full # full, selective
recompute_method: block # uniform, block
recompute_num_layers: 0 # int

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
# cross_entropy_loss_fusion: true
14 changes: 14 additions & 0 deletions examples/run_local_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,20 @@ if [[ "${CLEAN_DOCKER_CONTAINER:-0}" == "1" ]]; then
fi
fi

# Optional registry login, performed only when DOCKER_LOGIN_KEY is non-empty.
# DOCKER_LOGIN_USER must then be set as well; otherwise the script exits with 1.
# The key is passed to `docker login` on stdin (--password-stdin) so the secret is
# not part of the command line. A failed login also exits with 1.
if [[ -n "${DOCKER_LOGIN_KEY:-}" ]]; then
    if [[ -z "${DOCKER_LOGIN_USER:-}" ]]; then
        echo "DOCKER_LOGIN_KEY is set but DOCKER_LOGIN_USER is empty. Exiting."
        exit 1
    fi
    echo "Logging in to Docker registry as ${DOCKER_LOGIN_USER}..."
    # Testing the command directly (instead of inspecting $? afterwards) keeps the
    # error message reachable even if the script runs with `set -e`.
    if ! printf '%s' "${DOCKER_LOGIN_KEY}" | docker login -u "${DOCKER_LOGIN_USER}" --password-stdin; then
        echo "Docker login failed! Exiting."
        exit 1
    fi
fi


if [[ "${SKIP_TRAIN:-0}" == "1" ]]; then
echo "Node-${NODE_RANK}: Skipping training container launch."
exit 0
Expand Down
19 changes: 13 additions & 6 deletions examples/run_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,12 @@ if [ "$USING_AINIC" == "1" ]; then
LOG_INFO_RANK0 "ANP_HOME_DIR: $ANP_HOME_DIR"
LOG_INFO_RANK0 "MPI_HOME_DIR: $MPI_HOME_DIR"

# unset NCCL_IB_GID_INDEX
export NCCL_IB_TC=${NCCL_IB_TC:-104}
export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # 192
Copy link

Copilot AI Dec 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inline comment "# 192" suggests this was the previous value, but appears as trailing documentation. Consider removing this comment or clarifying why 192 is noted if it provides important context for future reference.

Suggested change
export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # 192
export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # previous default: 192

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inline comment '# 192' is unclear - it should explain why 192 is referenced when the default is 184, or be removed if it's no longer relevant.

Suggested change
export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184} # 192
export NCCL_IB_FIFO_TC=${NCCL_IB_FIFO_TC:-184}

Copilot uses AI. Check for mistakes.

export NCCL_IB_GID_INDEX=1
# export NCCL_IB_ROCE_VERSION_NUM=2
export NCCL_IB_ROCE_VERSION_NUM=2
export NCCL_MAX_P2P_CHANNELS=56
export NCCL_IB_TC=104
export NCCL_IB_FIFO_TC=192
export NET_OPTIONAL_RECV_COMPLETION=1
export NCCL_IB_USE_INLINE=1
export RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING=0
Expand All @@ -178,7 +178,14 @@ if [ "$USING_AINIC" == "1" ]; then

# v25.09
export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/libibverbs:${RCCL_HOME_DIR}/build/release:${ANP_HOME_DIR}/build:${MPI_HOME_DIR}/install/lib:$LD_LIBRARY_PATH
export LD_PRELOAD=${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0
# if [ -f "${ANP_HOME_DIR}/build/librccl-net.so" ]; then
# export LD_PRELOAD="${ANP_HOME_DIR}/build/librccl-net.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0"
# elif [ -f "${ANP_HOME_DIR}/build/librccl-anp.so" ]; then
# export LD_PRELOAD="${ANP_HOME_DIR}/build/librccl-anp.so:${RCCL_HOME_DIR}/build/release/librccl.so.1.0"
# else
# echo "ERROR: Neither librccl-net.so nor librccl-anp.so was found in ${ANP_HOME_DIR}/build"
# exit 1
# fi
Comment on lines +181 to +188
Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The entire LD_PRELOAD configuration logic is commented out without explanation. This appears to be critical library loading code for AINIC functionality. Either provide documentation explaining why it's disabled or remove the dead code if it's permanently deprecated.

Copilot uses AI. Check for mistakes.
else
export NCCL_IB_GID_INDEX=3
fi
Expand Down Expand Up @@ -477,7 +484,7 @@ fi
# Prepend the Primus checkout (PRIMUS_PATH) and the interpreter's pure-Python
# site-packages directory to PYTHONPATH, in that order, keeping whatever
# PYTHONPATH already held after them.
setup_pythonpath() {
    local site_packages
    # Ask the active interpreter for its 'purelib' directory.
    site_packages=$(python -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")
    # Single export: the earlier assignment was overwritten immediately and carried a
    # stray "$" path entry, so only the final ordering is kept.
    export PYTHONPATH="${PRIMUS_PATH}:${site_packages}:${PYTHONPATH}"
}

setup_pythonpath
Expand Down
Empty file added examples/scripts/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion primus/modules/trainer/megatron/pre_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_batch_func(data_iterator, vp_stage=None):
if not is_first_or_last_pipeline_stage(vp_stage):
return None, None, None, None, None

assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}"
# assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}"
Copy link

Copilot AI Jan 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The assertion is commented out without explanation. If this check is no longer needed for benchmarking scenarios, add a comment explaining why (e.g., '# Skip assertion for mock data benchmarks'). Otherwise, restore the assertion to maintain data validation.

Suggested change
# assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}"
assert data_iterator is not None, f"data_iterator is None vp_stage: {vp_stage}"

Copilot uses AI. Check for mistakes.
# get batches based on the TP rank you are on
batch = get_batch_on_this_tp_rank(data_iterator)

Expand Down
Loading