74 changes: 74 additions & 0 deletions cookbook/megatron/npu/tp_lora_npu.py
@@ -0,0 +1,74 @@
import os

from peft import LoraConfig

import twinkle
from twinkle import DeviceMesh, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import MegatronModel
from twinkle.preprocessor import SelfCognitionProcessor

# Configuration for the verified NPU LoRA smoke test.
MODEL_ID = os.environ.get('TWINKLE_LOCAL_MODEL_DIR', 'ms://Qwen/Qwen3-4B')
DATASET_PATH = os.environ.get(
'TWINKLE_LOCAL_DATASET_PATH',
'ms://swift/self-cognition',
)
MAX_STEPS = int(os.environ.get('TWINKLE_MAX_STEPS', '10'))
TRAIN_SAMPLES = int(os.environ.get('TWINKLE_TRAIN_SAMPLE_LIMIT', '160'))
BATCH_SIZE = int(os.environ.get('TWINKLE_BATCH_SIZE', '16'))

# 8 cards: dp=2, tp=2, pp=2
device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()


def build_dataloader() -> DataLoader:
dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, data_slice=range(TRAIN_SAMPLES)))
dataset.set_template('Template', model_id=MODEL_ID)
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
dataset.encode()
return DataLoader(dataset=dataset, batch_size=BATCH_SIZE)


def train():
dataloader = build_dataloader()

model = MegatronModel(model_id=MODEL_ID)
lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
model.add_adapter_to_model('default', lora_config)
model.set_optimizer(optimizer_cls='default', lr=1e-4)
Copilot AI commented on Mar 27, 2026:

This hardcodes lr=1e-4 while the MoE smoke script makes LR configurable via env var. For consistency across cookbook examples (and easier reproduction/tuning), consider reading LR from an environment variable (similar to the other script) or defining a module-level LR constant used here.
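A minimal sketch of that change, assuming the same TWINKLE_LR variable name the MoE script already reads (the 1e-4 default preserves the current behavior):

```python
# Hypothetical tweak: read LR from an env var, mirroring tp_moe_lora_npu.py.
LR = float(os.environ.get('TWINKLE_LR', '1e-4'))

# ...and at the call site:
model.set_optimizer(optimizer_cls='default', lr=LR)
```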


# Keep the scheduler compatible with the shortened smoke run.
lr_decay_steps = max(MAX_STEPS, 2)
model.set_lr_scheduler(
scheduler_cls='default',
lr_warmup_steps=1,
lr_decay_steps=lr_decay_steps,
)

logger.info(get_device_placement())
logger.info(model.get_train_configs())
logger.info(
'LoRA NPU smoke config: '
f'model_id={MODEL_ID}, dataset={DATASET_PATH}, batch_size={BATCH_SIZE}, '
f'train_samples={TRAIN_SAMPLES}, max_steps={MAX_STEPS}'
)
logger.info(f'dataloader_steps={len(dataloader)}')

for step, batch in enumerate(dataloader):
model.forward_backward(inputs=batch)
model.clip_grad_and_step()
metric = model.calculate_metric(is_training=True)
logger.info(f'step={step} metric={metric}')
if step + 1 >= MAX_STEPS:
break

model.save('last-checkpoint')


if __name__ == '__main__':
train()
4 changes: 4 additions & 0 deletions cookbook/megatron/npu/tp_lora_npu.sh
@@ -0,0 +1,4 @@
MEGATRON_LM_PATH=${MEGATRON_LM_PATH:-/path/to/Megatron-LM}
ASCEND_RT_VISIBLE_DEVICES=${ASCEND_RT_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7} \
PYTHONPATH="${MEGATRON_LM_PATH}:${PYTHONPATH:-}" \
torchrun --nproc_per_node=8 cookbook/megatron/npu/tp_lora_npu.py
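Every tunable in tp_lora_npu.py is read from an environment variable, so the script can be exercised without edits; a usage sketch with arbitrary override values:

```bash
# Faster local smoke run: fewer steps, smaller batch, local model directory.
TWINKLE_MAX_STEPS=2 \
TWINKLE_BATCH_SIZE=4 \
TWINKLE_LOCAL_MODEL_DIR=/path/to/Qwen3-4B \
bash cookbook/megatron/npu/tp_lora_npu.sh
```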
95 changes: 95 additions & 0 deletions cookbook/megatron/npu/tp_moe_lora_npu.py
@@ -0,0 +1,95 @@
import os

from peft import LoraConfig

import twinkle
from twinkle import DeviceMesh, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import MegatronModel
from twinkle.preprocessor import SelfCognitionProcessor

# Configuration for the verified NPU MoE LoRA smoke test.
# Expert LoRA currently only supports ETP=1, so we keep TP at 1 here.
MODEL_ID = os.environ.get(
'TWINKLE_LOCAL_MODEL_DIR',
'ms://Qwen/Qwen3-30B-A3B-Instruct-2507',
)
DATASET_PATH = os.environ.get(
'TWINKLE_LOCAL_DATASET_PATH',
'ms://swift/self-cognition',
)
MAX_STEPS = int(os.environ.get('TWINKLE_MAX_STEPS', '10'))
TRAIN_SAMPLES = int(os.environ.get('TWINKLE_TRAIN_SAMPLE_LIMIT', '80'))
BATCH_SIZE = int(os.environ.get('TWINKLE_BATCH_SIZE', '8'))
DP_SIZE = int(os.environ.get('TWINKLE_DP_SIZE', '8'))
TP_SIZE = int(os.environ.get('TWINKLE_TP_SIZE', '1'))
EP_SIZE = int(os.environ.get('TWINKLE_EP_SIZE', '2'))
PP_SIZE = int(os.environ.get('TWINKLE_PP_SIZE', '1'))
CP_SIZE = int(os.environ.get('TWINKLE_CP_SIZE', '1'))
LR = float(os.environ.get('TWINKLE_LR', '1e-4'))

# 8 cards: dp=8, tp=1, ep=2, pp=1, cp=1
device_mesh = DeviceMesh.from_sizes(
dp_size=DP_SIZE,
tp_size=TP_SIZE,
pp_size=PP_SIZE,
cp_size=CP_SIZE,
ep_size=EP_SIZE,
)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()


def build_dataloader() -> DataLoader:
dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, data_slice=range(TRAIN_SAMPLES)))
dataset.set_template('Template', model_id=MODEL_ID)
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
dataset.encode()
return DataLoader(dataset=dataset, batch_size=BATCH_SIZE)


def _to_loss_value(outputs) -> float:
loss = outputs['loss'] if isinstance(outputs, dict) else outputs.loss
return float(loss.detach().cpu()) if hasattr(loss, 'detach') else float(loss)


def train():
dataloader = build_dataloader()

model = MegatronModel(model_id=MODEL_ID)
lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
model.add_adapter_to_model('default', lora_config)
model.set_optimizer(optimizer_cls='default', lr=LR)

# Keep the scheduler compatible with the shortened smoke run.
lr_decay_steps = max(MAX_STEPS, 2)
model.set_lr_scheduler(
scheduler_cls='default',
lr_warmup_steps=1,
lr_decay_steps=lr_decay_steps,
)

logger.info(get_device_placement())
logger.info(model.get_train_configs())
logger.info(
'MoE LoRA NPU smoke config: '
f'model_id={MODEL_ID}, dataset={DATASET_PATH}, batch_size={BATCH_SIZE}, '
f'train_samples={TRAIN_SAMPLES}, max_steps={MAX_STEPS}, '
f'dp={DP_SIZE}, tp={TP_SIZE}, ep={EP_SIZE}, pp={PP_SIZE}, cp={CP_SIZE}'
)
logger.info(f'dataloader_steps={len(dataloader)}')

for step, batch in enumerate(dataloader):
outputs = model.forward_backward(inputs=batch)
model.clip_grad_and_step()
logger.info(f'step={step} loss={_to_loss_value(outputs)}')
if step + 1 >= MAX_STEPS:
break

model.save('last-checkpoint')


if __name__ == '__main__':
train()
4 changes: 4 additions & 0 deletions cookbook/megatron/npu/tp_moe_lora_npu.sh
@@ -0,0 +1,4 @@
MEGATRON_LM_PATH=${MEGATRON_LM_PATH:-/path/to/Megatron-LM}
ASCEND_RT_VISIBLE_DEVICES=${ASCEND_RT_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7} \
PYTHONPATH="${MEGATRON_LM_PATH}:${PYTHONPATH:-}" \
torchrun --nproc_per_node=8 cookbook/megatron/npu/tp_moe_lora_npu.py
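The MoE script additionally exposes the parallel layout via env vars; a sketch assuming the usual Megatron convention that dp × tp × pp × cp equals the world size (ep divides dp, so raising it leaves the product unchanged):

```bash
# Hypothetical layout override: expert parallelism of 4 across the 8 cards.
TWINKLE_DP_SIZE=8 \
TWINKLE_EP_SIZE=4 \
bash cookbook/megatron/npu/tp_moe_lora_npu.sh
```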