Commit b6f47b1

Maf 18836 gpt mistal pp rope (#41)
# What does this PR do?

Adds the Moreh pipeline and RoPE cache features.

1. To use the pipeline, the following call must be made at the Python level (see the placement sketch after the `modeling_gpt2_moreh.py` diff below):

```
hidden_states = torch.moreh.pipeline_assign(hidden_states)
```

This PR adds that call to the model code.

2. Switches Mistral's RoPE to a cached version. The existing `MistralRotaryEmbedding` recomputed the cos/sin tensors on the target hardware (GPU, NPU) on every forward pass. Since these values only require the dtype and shape of the hidden states, they can be precomputed once as an optimization (a standalone sketch of this precompute follows the JSON example below). The real motivation is that MAF's pipeline execution does not work unless the cached version is used.

Reference code:

- Old HuggingFace llama code: https://github.com/huggingface/transformers/blob/9c804f7ec42c94289ce52eaa84eed32f770311d7/src/transformers/models/deprecated/open_llama/modeling_open_llama.py#L109
- MAF's RoPE code (precomputes on the CPU): https://github.com/moreh-dev/framework/blob/df54f28ce96ff43dce4c0b40a0aeb7bff7fd6b0c/IR/driver/pytorch/torch/moreh_ops/rotary_embedding.py#L35

An example of how this is called from MAF: moreh-dev/framework#8819

The sh file is not uploaded here, but it is not much different; just run it with the model name set to `gpt2-small-moreh`.

JSON example:

```
{
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 7168,
  "max_position_embeddings": 131072,
  "model_type": "mistral-moreh",
  "num_attention_heads": 16,
  "num_hidden_layers": 7,
  "num_key_value_heads": 8,
  "pad_token_id": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 32000,
  "moreh_config": {
    "pipeline_layers": [3],
    "rope_cache": true
  }
}
```
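As context for item 2, here is a minimal, self-contained sketch of the precompute idea in plain PyTorch on the CPU. It is independent of the classes touched by this PR; `dim`, `base`, and the position cap are illustrative values standing in for the model's `head_dim`, `rope_theta`, and `max_position_embeddings`.

```python
import torch

# Sketch of the RoPE cache idea: cos/sin depend only on the head dim, the rope
# base, and the position index, so they can be computed once on the CPU instead
# of on the target hardware (GPU, NPU) at every forward pass.
dim, base, max_positions = 128, 10000.0, 4096  # illustrative values

inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
t = torch.arange(max_positions, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)             # [max_positions, dim/2]
emb = torch.cat((freqs, freqs), dim=-1)      # [max_positions, dim]
cos_cached, sin_cached = emb.cos(), emb.sin()

# At forward time the cache is only sliced and cast, never recomputed.
seq_len = 16
cos = cos_cached[:seq_len].to(torch.float16).unsqueeze(0)  # [1, seq_len, dim]
sin = sin_cached[:seq_len].to(torch.float16).unsqueeze(0)
print(cos.shape, sin.shape)
```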
1 parent 51d68ae commit b6f47b1

4 files changed

Lines changed: 58 additions & 4 deletions


src/transformers/models/gpt2/configuration_gpt2_moreh.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -159,6 +159,7 @@ def __init__(
         eos_token_id=50256,
         scale_attn_by_inverse_layer_idx=False,
         reorder_and_upcast_attn=False,
+        moreh_config=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -186,6 +187,8 @@ def __init__(
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
 
+        self.moreh_config = moreh_config
+
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
 
```

src/transformers/models/gpt2/modeling_gpt2_moreh.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -1016,6 +1016,13 @@ def __init__(self, config):
         # Initialize weights and apply final processing
         self.post_init()
 
+        # Moreh Config
+        self.moreh_pipeline_layers = []
+        moreh_config = getattr(config, "moreh_config", None)
+        if moreh_config is not None and "pipeline_layers" in moreh_config:
+            self.moreh_pipeline_layers = moreh_config["pipeline_layers"]
+
+
     @add_start_docstrings(PARALLELIZE_DOCSTRING)
     def parallelize(self, device_map=None):
         # Check validity of device_map
@@ -1257,6 +1264,8 @@ def forward(
                 for k, v in self.device_map.items():
                     if i == v[-1] and "cuda:" + str(k) != self.last_device:
                         hidden_states = hidden_states.to("cuda:" + str(k + 1))
+            if i in self.moreh_pipeline_layers:
+                hidden_states = torch.moreh.pipeline_assign(hidden_states)
 
         hidden_states = self.ln_f(hidden_states)
 
@@ -1293,7 +1302,6 @@ class GPT2LMHeadModelMoreh(GPT2PreTrainedModel):
 
     def __init__(self, config):
        super().__init__(config)
-        print("GPT2LMHeadModelMoreh ##################################")
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
 
```
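`torch.moreh.pipeline_assign` only exists when running under MAF, so it cannot be exercised with stock PyTorch. The toy loop below, with a pass-through stand-in for `pipeline_assign`, is only meant to show where the call sits relative to the layer loop in the change above; the stub and the `nn.Linear` blocks are illustrative and not part of this PR.

```python
import torch
import torch.nn as nn


def pipeline_assign_stub(hidden_states: torch.Tensor) -> torch.Tensor:
    # Stand-in for torch.moreh.pipeline_assign: under MAF this marks the tensor
    # as a pipeline-stage boundary; here it simply passes the tensor through.
    return hidden_states


blocks = nn.ModuleList([nn.Linear(8, 8) for _ in range(6)])
moreh_pipeline_layers = [3]            # mirrors "pipeline_layers": [3] in the example config

hidden_states = torch.randn(2, 4, 8)
for i, block in enumerate(blocks):
    hidden_states = block(hidden_states)
    if i in moreh_pipeline_layers:     # same placement as in the modified layer loop
        hidden_states = pipeline_assign_stub(hidden_states)
print(hidden_states.shape)
```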

src/transformers/models/mistral/configuration_mistral_moreh.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -116,6 +116,7 @@ def __init__(
         rope_theta=10000.0,
         sliding_window=4096,
         attention_dropout=0.0,
+        moreh_config=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -138,6 +139,8 @@ def __init__(
         self.rope_theta = rope_theta
         self.attention_dropout = attention_dropout
 
+        self.moreh_config = moreh_config
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
```
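For reference, a sketch of constructing the config with the new argument. The import path is assumed from this file's location in the fork, and `MistralMorehConfig` is the class name used in the modeling file's type hints; adjust if the class is exported elsewhere.

```python
# Sketch only: import path assumed from src/transformers/models/mistral/configuration_mistral_moreh.py.
from transformers.models.mistral.configuration_mistral_moreh import MistralMorehConfig

config = MistralMorehConfig(
    hidden_size=2048,
    intermediate_size=7168,
    num_hidden_layers=7,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=131072,
    moreh_config={"pipeline_layers": [3], "rope_cache": True},
)
print(config.moreh_config)
```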

src/transformers/models/mistral/modeling_mistral_moreh.py

Lines changed: 43 additions & 3 deletions
```diff
@@ -94,7 +94,7 @@ def forward(self, hidden_states):
 
 
 class MistralRotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, use_rope_cache=False):
         super().__init__()
 
         self.dim = dim
@@ -103,9 +103,36 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
+        self.use_rope_cache = use_rope_cache
+        if self.use_rope_cache:
+            self._set_cos_sin_cache(max_position_embeddings, dtype=torch.float32)
+
+    def _set_cos_sin_cache(self, seq_len, dtype):
+        self.max_seq_len_cached = seq_len
+
+        t = torch.arange(seq_len, dtype=torch.float32, device="cpu")
+        freqs = torch.outer(t, self.inv_freq.cpu())  # [seq_len, dim/2]
+        emb = torch.cat((freqs, freqs), dim=-1)  # [seq_len, dim]
+
+        cos = emb.cos()
+        sin = emb.sin()
+
+        cos = cos.to(device='cuda', dtype=dtype)
+        sin = sin.to(device='cuda', dtype=dtype)
+
+        self.register_buffer("cos_cached", cos, persistent=False)
+        self.register_buffer("sin_cached", sin, persistent=False)
+
     @torch.no_grad()
     # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward
     def forward(self, x, position_ids):
+        if self.use_rope_cache:
+            seq_len = position_ids.shape[-1]
+            assert seq_len <= self.max_position_embeddings, "Sequence length exceeds maximum position embeddings"
+            cos = self.cos_cached[:seq_len].to(dtype=x.dtype, device=x.device).unsqueeze(0)
+            sin = self.sin_cached[:seq_len].to(dtype=x.dtype, device=x.device).unsqueeze(0)
+            return cos, sin
+
         # x: [bs, num_attention_heads, seq_len, head_size]
         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
         position_ids_expanded = position_ids[:, None, :].float()
@@ -221,10 +248,16 @@ def __init__(self, config: MistralMorehConfig, layer_idx: Optional[int] = None):
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
 
+        use_rope_cache = False
+        moreh_config = getattr(config, "moreh_config", None)
+        if moreh_config is not None and "rope_cache" in moreh_config:
+            use_rope_cache = moreh_config["rope_cache"]
+
         self.rotary_emb = MistralRotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
+            use_rope_cache=use_rope_cache,
         )
 
     def forward(
@@ -885,6 +918,12 @@ def __init__(self, config: MistralMorehConfig):
         # Initialize weights and apply final processing
         self.post_init()
 
+        # Moreh Config
+        self.moreh_pipeline_layers = []
+        moreh_config = getattr(config, "moreh_config", None)
+        if moreh_config is not None and "pipeline_layers" in moreh_config:
+            self.moreh_pipeline_layers = moreh_config["pipeline_layers"]
+
     def get_input_embeddings(self):
         return self.embed_tokens
 
@@ -957,7 +996,7 @@ def forward(
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
 
-        for decoder_layer in self.layers:
+        for layer_idx, decoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
@@ -984,6 +1023,8 @@ def forward(
                 )
 
             hidden_states = layer_outputs[0]
+            if layer_idx in self.moreh_pipeline_layers:
+                hidden_states = torch.moreh.pipeline_assign(hidden_states)
 
             if use_cache:
                 next_decoder_cache = layer_outputs[2 if output_attentions else 1]
@@ -1123,7 +1164,6 @@ class MistralForCausalLMMoreh(MistralPreTrainedModel):
 
     def __init__(self, config):
         super().__init__(config)
-        print("MistralForCausalLMMoreh #########################################")
         self.model = MistralModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
```
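To illustrate why the cache slice in `forward` reproduces the existing on-the-fly branch, here is a small standalone check in plain PyTorch (not the fork's classes; sizes are illustrative). It compares the precomputed cos/sin against the `inv_freq_expanded @ position_ids_expanded` computation for a contiguous prefill, i.e. `position_ids = arange(seq_len)`.

```python
import torch

dim, base, seq_len, max_positions = 128, 10000.0, 32, 4096
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

# Cached path: precompute for all positions once, then slice per forward.
t = torch.arange(max_positions, dtype=torch.float32)
freqs_full = torch.outer(t, inv_freq)
emb_full = torch.cat((freqs_full, freqs_full), dim=-1)
cos_cached = emb_full.cos()[:seq_len].unsqueeze(0)
sin_cached = emb_full.sin()[:seq_len].unsqueeze(0)

# On-the-fly path, as in the non-cached branch of MistralRotaryEmbedding.forward.
position_ids = torch.arange(seq_len)[None, :]  # contiguous prefill positions
inv_freq_expanded = inv_freq[None, :, None].expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
freqs = (inv_freq_expanded.float() @ position_ids_expanded).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)

# The two paths agree for positions 0..seq_len-1, which is the case the cache
# slice covers.
print(torch.allclose(cos_cached, emb.cos()), torch.allclose(sin_cached, emb.sin()))
```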
