11 changes: 11 additions & 0 deletions examples/run_local_pretrain.sh
@@ -94,6 +94,11 @@ ENV_ARGS+=("--env" "HF_TOKEN")
 ENV_ARGS+=("--env" "WANDB_API_KEY")
 ENV_ARGS+=("--env" "ENABLE_NUMA_BINDING")
 ENV_ARGS+=("--env" "HSA_KERNARG_POOL_SIZE")
+# MLflow environment variables
+ENV_ARGS+=("--env" "DATABRICKS_TOKEN")
+ENV_ARGS+=("--env" "DATABRICKS_HOST")
+ENV_ARGS+=("--env" "MLFLOW_TRACKING_URI")
+ENV_ARGS+=("--env" "MLFLOW_REGISTRY_URI")
 echo "ENV_ARGS: ${ENV_ARGS[*]}"
 
 HOSTNAME=$(hostname)
@@ -159,6 +164,12 @@ docker_podman_proxy run --rm \
     --env GPUS_PER_NODE \
     --env DATA_PATH \
     --env TRAIN_LOG \
+    --env PRIMUS_WORKSPACE \
+    --env PRIMUS_EXP_NAME \
+    --env TIMESTAMP \
+    --env LOG_DIR \
+    --env PRIMUS_TEAM \
+    --env PRIMUS_USER \
     --env HSA_NO_SCRATCH_RECLAIM \
     --env NVTE_CK_USES_BWD_V3 \
     --env GPU_MAX_HW_QUEUES \
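Note: the name-only `--env VAR` form forwards each variable from the host environment into the container, so the new MLflow settings take effect only when the caller exports them first. A minimal sketch of such a launch, assuming a Databricks-backed tracking server (the host, token, and URI values below are placeholders, not part of this PR):

# Hypothetical setup -- substitute your own Databricks workspace URL and token.
export DATABRICKS_HOST="https://<your-workspace>.cloud.databricks.com"
export DATABRICKS_TOKEN="<personal-access-token>"
export MLFLOW_TRACKING_URI="databricks"   # standard MLflow convention for Databricks-hosted tracking
export MLFLOW_REGISTRY_URI="databricks"   # likewise for the model registry
bash examples/run_local_pretrain.sh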
21 changes: 19 additions & 2 deletions examples/run_pretrain.sh
@@ -123,11 +123,28 @@ fi
 # export AITER_JIT_DIR="${TMP_BUILD_DIR}/${CACHE_TAG}_aiter_cache"
 
 
-TRAIN_LOG=${TRAIN_LOG:-"output/log_mp_pretrain_$(basename "$EXP" .yaml).txt"}
+# Extract model name from EXP config file path (e.g., deepseek_v2_lite-pretrain.yaml -> deepseek_v2_lite-pretrain)
+MODEL_NAME=$(basename "${EXP}" .yaml)
+
+# Only generate new timestamp/paths if not already set by run_slurm_pretrain.sh.
+# This ensures single-node runs get a fresh timestamp, while multi-node runs share the same directory.
+if [ -z "${PRIMUS_EXP_NAME}" ]; then
+    TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+    export PRIMUS_WORKSPACE=${PRIMUS_WORKSPACE:-"./output"}
+    export PRIMUS_EXP_NAME="${MODEL_NAME}_${TIMESTAMP}"
+    export LOG_DIR="${PRIMUS_WORKSPACE}/${PRIMUS_EXP_NAME}"
+fi
+# Clear work_group and user_name to simplify path: workspace/exp_name
+export PRIMUS_TEAM=""
+export PRIMUS_USER=""
+
+mkdir -p "$LOG_DIR"
+TRAIN_LOG="${LOG_DIR}/log_mp_pretrain.txt"
 
 LOG_INFO_RANK0 "==========Training info=========="
 LOG_INFO_RANK0 "EXP: $EXP"
-LOG_INFO_RANK0 "EXP: $BACKEND"
+LOG_INFO_RANK0 "BACKEND: $BACKEND"
+LOG_INFO_RANK0 "OUTPUT_DIR: ${LOG_DIR}"
 LOG_INFO_RANK0 "TRAIN_LOG: $TRAIN_LOG"
 LOG_INFO_RANK0 "PRIMUS_PATH: $PRIMUS_PATH"
 LOG_INFO_RANK0 "DATA_PATH: $DATA_PATH"
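These changes route all run artifacts into one per-experiment directory: a standalone run derives its own name, while a Slurm launch inherits PRIMUS_EXP_NAME and reuses the directory created by run_slurm_pretrain.sh. A worked example of the resulting paths for a single-node run (config name taken from the diff's comment; the config path and timestamp are illustrative):

# PRIMUS_EXP_NAME unset -> run_pretrain.sh generates a fresh timestamp:
#   PRIMUS_WORKSPACE=./output
#   PRIMUS_EXP_NAME=deepseek_v2_lite-pretrain_20250101_120000
#   LOG_DIR=./output/deepseek_v2_lite-pretrain_20250101_120000
#   TRAIN_LOG=./output/deepseek_v2_lite-pretrain_20250101_120000/log_mp_pretrain.txt
EXP=path/to/deepseek_v2_lite-pretrain.yaml bash examples/run_pretrain.sh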
19 changes: 18 additions & 1 deletion examples/run_slurm_pretrain.sh
@@ -34,7 +34,22 @@ export NNODES=${NNODES:-1}
 
 SCRIPT_DIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
 
-export LOG_DIR=${LOG_DIR:-"./output"}
+# -------------------- Unique Output Directory Per Run --------------------
+# Extract model name from EXP config file path (e.g., deepseek_v2_lite-pretrain.yaml -> deepseek_v2_lite-pretrain)
+MODEL_NAME=$(basename "${EXP:-unknown}" .yaml)
+# Export TIMESTAMP so all nodes use the same value (prevents multi-node race condition)
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+export TIMESTAMP
+
+# Set PRIMUS environment variables for output paths
+BASE_LOG_DIR=${LOG_DIR:-"./output"}
+export PRIMUS_WORKSPACE="${BASE_LOG_DIR}"
+export PRIMUS_EXP_NAME="${MODEL_NAME}_${TIMESTAMP}"
+export LOG_DIR="${PRIMUS_WORKSPACE}/${PRIMUS_EXP_NAME}"
+# Clear work_group and user_name to simplify path: workspace/exp_name
+export PRIMUS_TEAM=""
+export PRIMUS_USER=""
+
 LOG_FILE="${LOG_DIR}/log_slurm_pretrain.txt"
 mkdir -p "$LOG_DIR"
 
@@ -52,6 +67,8 @@ srun -N "${NNODES}" \
         echo \"SLURM_GPUS_ON_NODE: \${SLURM_GPUS_ON_NODE}\"
         echo \"\"
     fi
+    # Log TIMESTAMP on each node to verify consistency across nodes
+    echo \"[Node \$SLURM_NODEID] TIMESTAMP=\${TIMESTAMP}\"
     export MASTER_ADDR=\${node_array[0]}
     export MASTER_PORT=\${MASTER_PORT}
     export NNODES=\${SLURM_NNODES}
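The race this guards against: if each node evaluated `date +%Y%m%d_%H%M%S` itself, tasks starting across a second boundary would derive different PRIMUS_EXP_NAME values and split one run across two directories. Computing the timestamp once on the submission host and exporting it lets srun propagate a single value to every task. A quick standalone check of that propagation (illustrative only; relies on srun's default behavior of forwarding the submission environment):

export TIMESTAMP=$(date +%Y%m%d_%H%M%S)
# Every node should print the identical value:
srun -N "${NNODES}" bash -c 'echo "[Node ${SLURM_NODEID}] TIMESTAMP=${TIMESTAMP}"'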
55 changes: 55 additions & 0 deletions primus/backends/megatron/training/global_vars.py
@@ -8,8 +8,11 @@
 
 from primus.modules.module_utils import debug_rank_0
 
+from .mlflow_artifacts import upload_artifacts_to_mlflow
+
 _GLOBAL_ARGS = None
 _GLOBAL_MLFLOW_WRITER = None
+_GLOBAL_EXP_ROOT_PATH = None
 
 
 def set_args(args):
@@ -23,6 +26,17 @@ def get_args():
     return _GLOBAL_ARGS
 
 
+def set_exp_root_path(exp_root_path):
+    """Set the experiment root path for artifact logging."""
+    global _GLOBAL_EXP_ROOT_PATH
+    _GLOBAL_EXP_ROOT_PATH = exp_root_path
+
+
+def get_exp_root_path():
+    """Return experiment root path. Can be None."""
+    return _GLOBAL_EXP_ROOT_PATH
+
+
 def get_mlflow_writer():
     """Return mlflow writer. It can be None so no need
     to check if it is initialized."""
@@ -62,14 +76,51 @@ def _set_mlflow_writer(args):
     _GLOBAL_MLFLOW_WRITER = mlflow
 
 
+def upload_mlflow_artifacts(
+    upload_traces: bool = True,
+    upload_logs: bool = True,
+):
+    """
+    Upload trace files and log files to MLflow as artifacts.
+
+    This should be called before ending the MLflow run to ensure all
+    artifacts are uploaded. Only the rank that initialized MLflow
+    (typically rank world_size - 1) should call this.
+
+    Args:
+        upload_traces: Whether to upload profiler trace files
+        upload_logs: Whether to upload training log files
+
+    Returns:
+        Dictionary with counts of uploaded files, or None if MLflow is not enabled
+    """
+    mlflow_writer = get_mlflow_writer()
+    if mlflow_writer is None:
+        return None
+
+    args = get_args()
+    exp_root_path = get_exp_root_path()
+    tensorboard_dir = getattr(args, "tensorboard_dir", None)
+
+    return upload_artifacts_to_mlflow(
+        mlflow_writer=mlflow_writer,
+        tensorboard_dir=tensorboard_dir,
+        exp_root_path=exp_root_path,
+        upload_traces=upload_traces,
+        upload_logs=upload_logs,
+    )
+
+
 def unset_global_variables():
     """Unset global vars."""
 
     global _GLOBAL_ARGS
     global _GLOBAL_MLFLOW_WRITER
+    global _GLOBAL_EXP_ROOT_PATH
 
     _GLOBAL_ARGS = None
     _GLOBAL_MLFLOW_WRITER = None
+    _GLOBAL_EXP_ROOT_PATH = None
 
 
 def _ensure_var_is_initialized(var, name):
@@ -84,4 +135,8 @@ def _ensure_var_is_not_initialized(var, name):
 
 def destroy_global_vars():
     global _GLOBAL_ARGS
+    global _GLOBAL_MLFLOW_WRITER
+    global _GLOBAL_EXP_ROOT_PATH
     _GLOBAL_ARGS = None
+    _GLOBAL_MLFLOW_WRITER = None
+    _GLOBAL_EXP_ROOT_PATH = None
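Taken together, the new globals let the trainer record where a run writes its artifacts and push them to MLflow at shutdown. A sketch of the intended call order, assuming MLflow was initialized on the owning rank (the path and the shutdown hook below are illustrative, not part of this diff):

from primus.backends.megatron.training import global_vars

# During setup, record the per-run output directory created by the launch scripts:
global_vars.set_exp_root_path("./output/deepseek_v2_lite-pretrain_20250101_120000")

# At the end of training, before closing the MLflow run:
counts = global_vars.upload_mlflow_artifacts(upload_traces=True, upload_logs=True)
if counts is not None:  # None means MLflow is not enabled on this rank
    print(f"uploaded artifact counts: {counts}")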