Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docker/patch/v0.5.9/megatron.patch
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,29 @@ index 1fd5dcfae..c9aeef1f0 100644
ctx.v_dim,
nheads,
batch_size,
diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py
index 5d7b69cd3..2e0a26815 100644
--- a/megatron/core/models/common/embeddings/rotary_pos_embedding.py
+++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py
@@ -348,6 +348,7 @@ class MultimodalRotaryEmbedding(nn.Module):

# shape (seq_length, bs, 1, 2 * dim)
emb = emb[..., None, :].transpose(0, 1).contiguous()
+ packed_seq = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
if packed_seq_params is not None and packed_seq_params.local_cp_size is not None:
if packed_seq_params.local_cp_size > 1:
# Set CP group to dynamic CP group for CP slicing
@@ -357,7 +358,9 @@ class MultimodalRotaryEmbedding(nn.Module):
cp_group = None
else:
cp_group = self.cp_group
- if cp_group is not None and cp_group.size() > 1:
+ # For THD (packed sequence) format, skip CP slicing here — it is handled
+ # per-sequence inside _apply_rotary_pos_emb_thd instead (same as RotaryEmbedding).
+ if cp_group is not None and cp_group.size() > 1 and not packed_seq:
# slice rotary_pos_emb along sequence dimension and select the parition of the current
# CP rank
emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group)
diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index 13d74aa52..060898a7a 100644
--- a/megatron/core/models/common/language_module/language_module.py
Expand Down
Loading
Loading