@@ -15,10 +15,34 @@ limitations under the License.
 
 #include "mposition.h"
 
+#include <absl/strings/match.h>
+
 #include "framework/model/model_args.h"
 #include "framework/request/sequence.h"
+
 namespace xllm {
 
+namespace {
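+// Run-length encode the per-token modality labels into half-open
+// (type, start, end) ranges, e.g. {"text", 0, 4}, {"image", 4, 8}, ...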
+std::vector<std::tuple<std::string, int, int>> groupByTokenType(
+    const std::vector<std::string>& token_types) {
+  std::vector<std::tuple<std::string, int, int>> groups;
+  if (token_types.empty()) return groups;
+
+  std::string current_key = token_types[0];
+  int start = 0;
+
+  for (int i = 1; i < static_cast<int>(token_types.size()); ++i) {
+    if (token_types[i] != current_key) {
+      groups.emplace_back(current_key, start, i);
+      current_key = token_types[i];
+      start = i;
+    }
+  }
+  groups.emplace_back(current_key, start, static_cast<int>(token_types.size()));
+  return groups;
+}
+}  // namespace
+
 torch::Tensor MPositionHelper::get_positions() {
   // if (seq_.is_chunked_prefill_stage()) {
   if (seq_.kv_state().kv_cache_tokens_num() < seq_.num_prompt_tokens()) {
@@ -35,16 +59,128 @@ torch::Tensor MPositionHelper::get_positions() {
     torch::Tensor second_per_grid_ts;
     if (auto res = mm_data.get<torch::Tensor>("second_per_grid_ts"))
       second_per_grid_ts = res.value();
-    auto res =
-        get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
+    std::tuple<torch::Tensor, int> res;
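+    // GLM-4V lays out its M-RoPE positions differently, so dispatch on
+    // the model type here.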
+    if (!absl::StartsWith(args_.model_type(), "glm4v")) {
+      res = get_positions_p(image_grid_thw, video_grid_thw, second_per_grid_ts);
+    } else {
+      res = get_positions_glm(image_grid_thw, video_grid_thw);
+    }
     seq_.set_mrope_position_delta(std::get<1>(res));
-
     return std::get<0>(res);
   } else {
     return get_positions_d();
   }
 }
 
+std::tuple<torch::Tensor, int> MPositionHelper::get_positions_glm(
+    torch::Tensor image_grid_thw,
+    torch::Tensor video_grid_thw) {
+  auto input_tokens = seq_.tokens();
+  auto spatial_merge_size = args_.mm_spatial_merge_size();
+  auto image_token_id = args_.image_token_id();
+  auto video_start_token_id = args_.video_start_token_id();
+  auto video_end_token_id = args_.video_end_token_id();
+
+  auto dtype = torch::kInt32;
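+
+  // Label each prompt token as "text", "image", or "video": image
+  // placeholder tokens between <video_start> and <video_end> belong to a
+  // video frame rather than a standalone image.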
+  std::vector<std::string> input_token_type;
+  bool in_video = false;
+  int num_tokens = static_cast<int>(input_tokens.size());
+
+  for (int index = 0; index < num_tokens; ++index) {
+    auto token = input_tokens[index];
+    if (token == video_start_token_id) {
+      in_video = true;
+    } else if (token == video_end_token_id) {
+      in_video = false;
+    }
+
+    if (token == image_token_id && !in_video) {
+      input_token_type.push_back("image");
+    } else if (token == image_token_id && in_video) {
+      input_token_type.push_back("video");
+    } else {
+      input_token_type.push_back("text");
+    }
+  }
+  auto input_type_group = groupByTokenType(input_token_type);
+  int image_index = 0;
+  int video_index = 0;
+  int video_group_index = 0;
+
+  std::vector<torch::Tensor> llm_pos_ids_list;
+  int video_frame_num = 1;
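+
+  // Walk the modality groups and build the 3 x N (temporal, height, width)
+  // M-RoPE position rows; each group starts one past the largest position
+  // emitted so far.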
+  for (const auto& group : input_type_group) {
+    const auto& modality_type = std::get<0>(group);
+    int start_idx = std::get<1>(group);
+    int end_idx = std::get<2>(group);
+    int st_idx = 0;
+    if (!llm_pos_ids_list.empty()) {
+      st_idx = llm_pos_ids_list.back().max().item<int>() + 1;
+    }
+
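+    // Image: enumerate the merged (t, h, w) vision grid as three rows.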
+    if (modality_type == "image") {
+      auto grid = image_grid_thw[image_index];
+      int t = grid[0].item<int>();
+      int h = grid[1].item<int>() / spatial_merge_size;
+      int w = grid[2].item<int>() / spatial_merge_size;
+
+      auto t_arange =
+          torch::arange(t, dtype).view({-1, 1}).expand({-1, h * w}).flatten();
+      auto h_arange =
+          torch::arange(h, dtype).view({1, -1, 1}).expand({t, -1, w}).flatten();
+      auto w_arange =
+          torch::arange(w, dtype).view({1, 1, -1}).expand({t, h, -1}).flatten();
+
+      auto pos = torch::stack({t_arange, h_arange, w_arange}) + st_idx;
+      llm_pos_ids_list.push_back(pos);
+      video_frame_num = 1;
+      image_index++;
+    } else if (modality_type == "video") {
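+      // Video: emit one h*w block per frame index, with the temporal row
+      // fixed to that frame index.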
+      int t = video_frame_num;
+      int h = video_grid_thw[video_index][1].item<int>() / spatial_merge_size;
+      int w = video_grid_thw[video_index][2].item<int>() / spatial_merge_size;
+
+      for (int t_idx = 0; t_idx < t; ++t_idx) {
+        auto t_tensor = torch::full({1, h * w}, t_idx, dtype).flatten();
+        auto h_tensor = torch::arange(h, dtype)
+                            .view({1, -1, 1})
+                            .expand({1, -1, w})
+                            .flatten();
+        auto w_tensor = torch::arange(w, dtype)
+                            .view({1, 1, -1})
+                            .expand({1, h, -1})
+                            .flatten();
+
+        auto pos = torch::stack({t_tensor, h_tensor, w_tensor}) + st_idx;
+        llm_pos_ids_list.push_back(pos);
+      }
+
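+      // grid_thw[0] holds the frame count: once every frame of the current
+      // video has been consumed, move on to the next one.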
+      video_group_index++;
+      if (video_group_index >= video_grid_thw[video_index][0].item<int>()) {
+        video_index++;
+        video_group_index = 0;
+      }
+      video_frame_num++;
+    } else {  // text
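+      // Text: all three rows share the same sequential positions.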
+      int text_len = end_idx - start_idx;
+      auto arange =
+          torch::arange(text_len, dtype).view({1, -1}).expand({3, -1}) + st_idx;
+      llm_pos_ids_list.push_back(arange);
+      video_frame_num = 1;
+    }
+  }
+
+  torch::Tensor llm_positions =
+      torch::cat(llm_pos_ids_list, /*dim=*/1).reshape({3, -1});
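+  // Delta between one past the largest position used and the prompt
+  // length; decode-time positions continue from this offset.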
+  int mrope_position_delta = llm_positions.max().item<int>() + 1 -
+                             static_cast<int>(input_tokens.size());
+
+  return std::make_pair(llm_positions, mrope_position_delta);
+}
+
 std::tuple<torch::Tensor, int> MPositionHelper::get_positions_p(
     torch::Tensor image_grid_thw,
     torch::Tensor video_grid_thw,