From ddbe7dfe02751c4e0515673ef67dc756f168c0bc Mon Sep 17 00:00:00 2001
From: taoliang <taoliang@local_server>
Date: Wed, 20 May 2026 19:47:10 +0800
Subject: [PATCH 1/2] fix(volcengine): route vision embedding models to
 multimodal endpoint

Volcengine multimodal embedding models (e.g. doubao-embedding-vision-251215)
require POST /api/v3/embeddings/multimodal and reject the standard
/api/v3/embeddings path with a 400 InvalidParameter error:

  "the requested model doubao-embedding-vision-251215 does not support
   this api"

Detect by model-name keywords (case-insensitive): the name must contain
"embedding" AND at least one of "vision" or "multimodal". Both
UpstreamModelName and OriginModelName are checked so model_mapping aliases
still work. Pure-text embedding models continue to use the standard
endpoint.
---
 relay/channel/volcengine/adaptor.go | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/relay/channel/volcengine/adaptor.go b/relay/channel/volcengine/adaptor.go
index ba9f223bd2f..80b7bf42092 100644
--- a/relay/channel/volcengine/adaptor.go
+++ b/relay/channel/volcengine/adaptor.go
@@ -263,6 +263,15 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) {
 			}
 			return fmt.Sprintf("%s/api/v3/chat/completions", baseUrl), nil
 		case constant.RelayModeEmbeddings:
+			// Volcengine multimodal embedding models (e.g. doubao-embedding-vision-*)
+			// require the dedicated /api/v3/embeddings/multimodal endpoint and reject
+			// the standard /embeddings path with a 400 InvalidParameter error.
+			// Heuristic: route by model name keywords. Both the request model and the
+			// upstream model name are checked so model_mapping aliases still work.
+			if isVolcengineMultimodalEmbedding(info.UpstreamModelName) ||
+				isVolcengineMultimodalEmbedding(info.OriginModelName) {
+				return fmt.Sprintf("%s/api/v3/embeddings/multimodal", baseUrl), nil
+			}
 			return fmt.Sprintf("%s/api/v3/embeddings", baseUrl), nil
 		//豆包的图生图也走generations接口: https://www.volcengine.com/docs/82379/1824121
 		case constant.RelayModeImagesGenerations, constant.RelayModeImagesEdits:
@@ -400,3 +409,22 @@ func (a *Adaptor) GetModelList() []string {
 func (a *Adaptor) GetChannelName() string {
 	return ChannelName
 }
+
+// isVolcengineMultimodalEmbedding reports whether modelName looks like a
+// Volcengine multimodal (image+text) embedding model, which must be sent to
+// `/api/v3/embeddings/multimodal` instead of the standard `/embeddings` path.
+//
+// The check is a case-insensitive keyword heuristic: the name must contain
+// "embedding" and at least one of "vision" (e.g. doubao-embedding-vision-251215)
+// or "multimodal". An empty name returns false. Case-insensitivity lets
+// user-supplied model_mapping aliases match even with mixed case.
+func isVolcengineMultimodalEmbedding(modelName string) bool {
+	if modelName == "" {
+		return false
+	}
+	lower := strings.ToLower(modelName)
+	if !strings.Contains(lower, "embedding") {
+		return false
+	}
+	return strings.Contains(lower, "vision") || strings.Contains(lower, "multimodal")
+}

From bbbd8d8e630cefeff79c955c08c387f015434e5f Mon Sep 17 00:00:00 2001
From: taoliang <taoliang@local_server>
Date: Wed, 20 May 2026 20:09:28 +0800
Subject: [PATCH 2/2] fix(volcengine): use multimodal input shape when testing
 vision embedding channels

The previous fix, which routes vision embedding models to the multimodal
endpoint, exposed a second bug in the channel test path:

  controller/channel-test.go always emits Input=["hello world"] for any
  embedding model, but Volcengine's /api/v3/embeddings/multimodal endpoint
  rejects flat string arrays with:
    "we could not parse the JSON body of your request"

So the dashboard "Test channel" button still failed for vision embedding
models even after routing was corrected.

Detect vision/multimodal embedding models by name keyword (case-insensitive)
and emit the typed-parts shape that matches the multimodal endpoint:

  {"input": [{"type": "text", "text": "hello world"}]}

Plain text embedding models (Doubao-embedding, doubao-embedding-text-*,
m3e, bge-*, OpenAI text-embedding-3-*, etc.) keep using the legacy string
array shape.
---
 controller/channel-test.go | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/controller/channel-test.go b/controller/channel-test.go
index b225585ed7a..a16aa495947 100644
--- a/controller/channel-test.go
+++ b/controller/channel-test.go
@@ -758,6 +758,24 @@ func buildTestRequest(model string, endpointType string, channel *model.Channel,
 	if strings.Contains(strings.ToLower(model), "embedding") ||
 		strings.HasPrefix(model, "m3e") ||
 		strings.Contains(model, "bge-") {
+		// Volcengine multimodal embedding endpoint (/api/v3/embeddings/multimodal)
+		// requires input to be an array of typed parts (e.g. [{type:"text",text:"..."}]),
+		// not a plain string array. Sending the standard `["hello world"]` shape
+		// against vision/multimodal models triggers a 400 from the upstream
+		// "we could not parse the JSON body of your request" — so emit the
+		// multimodal-compatible shape when the model name signals it.
+		lower := strings.ToLower(model)
+		if strings.Contains(lower, "vision") || strings.Contains(lower, "multimodal") {
+			return &dto.EmbeddingRequest{
+				Model: model,
+				Input: []any{
+					map[string]any{
+						"type": "text",
+						"text": "hello world",
+					},
+				},
+			}
+		}
 		// 返回 EmbeddingRequest
 		return &dto.EmbeddingRequest{
 			Model: model,