
Commit d59119a

feat(custom-model): add base64 image content support for OpenAI and Anthropic adapters
(cherry picked from commit 8fde0de)
1 parent: 003992d

2 files changed: 271 additions & 1 deletion

packages/core/src/core/customModelAdapter.test.ts

Lines changed: 223 additions & 1 deletion
@@ -5,9 +5,231 @@
  */
 
 import { describe, it, expect, vi, beforeEach } from 'vitest';
-import { callOpenAICompatibleModelStream, callAnthropicModelStream } from './customModelAdapter.js';
+import { callOpenAICompatibleModelStream, callAnthropicModelStream, callOpenAICompatibleModel, callAnthropicModel } from './customModelAdapter.js';
 import { MESSAGE_ROLES } from '../config/messageRoles.js';
 
+describe('customModelAdapter - Image Content Support', () => {
+  beforeEach(() => {
+    vi.resetAllMocks();
+  });
+
+  describe('OpenAI image format conversion', () => {
+    it('should convert Gemini inlineData to OpenAI image_url format', async () => {
+      let capturedBody: any;
+      const mockResponse = {
+        ok: true,
+        json: async () => ({
+          choices: [{ message: { content: 'I see an image' }, finish_reason: 'stop' }],
+          usage: { prompt_tokens: 100, completion_tokens: 10 },
+        }),
+      };
+
+      global.fetch = vi.fn().mockImplementation(async (_url, options) => {
+        capturedBody = JSON.parse(options.body);
+        return mockResponse;
+      });
+
+      const modelConfig = {
+        provider: 'openai' as const,
+        modelId: 'gpt-4-vision',
+        baseUrl: 'https://api.openai.com/v1',
+        apiKey: 'sk-test',
+        displayName: 'GPT-4 Vision',
+      };
+
+      const request = {
+        contents: [
+          {
+            role: MESSAGE_ROLES.USER,
+            parts: [
+              { text: 'What is in this image?' },
+              { inlineData: { mimeType: 'image/png', data: 'iVBORw0KGgoAAAANSUhEUg==' } },
+            ],
+          },
+        ],
+      };
+
+      await callOpenAICompatibleModel(modelConfig as any, request);
+
+      // Verify the request body was converted correctly
+      expect(capturedBody.messages).toHaveLength(1);
+      expect(capturedBody.messages[0].role).toBe('user');
+      expect(Array.isArray(capturedBody.messages[0].content)).toBe(true);
+      expect(capturedBody.messages[0].content).toHaveLength(2);
+
+      // Check text part
+      expect(capturedBody.messages[0].content[0]).toEqual({
+        type: 'text',
+        text: 'What is in this image?',
+      });
+
+      // Check image part - OpenAI format
+      expect(capturedBody.messages[0].content[1]).toEqual({
+        type: 'image_url',
+        image_url: {
+          url: 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==',
+        },
+      });
+    });
+
+    it('should handle multiple images in a single message', async () => {
+      let capturedBody: any;
+      const mockResponse = {
+        ok: true,
+        json: async () => ({
+          choices: [{ message: { content: 'I see two images' }, finish_reason: 'stop' }],
+          usage: { prompt_tokens: 200, completion_tokens: 15 },
+        }),
+      };
+
+      global.fetch = vi.fn().mockImplementation(async (_url, options) => {
+        capturedBody = JSON.parse(options.body);
+        return mockResponse;
+      });
+
+      const modelConfig = {
+        provider: 'openai' as const,
+        modelId: 'gpt-4-vision',
+        baseUrl: 'https://api.openai.com/v1',
+        apiKey: 'sk-test',
+        displayName: 'GPT-4 Vision',
+      };
+
+      const request = {
+        contents: [
+          {
+            role: MESSAGE_ROLES.USER,
+            parts: [
+              { text: 'Compare these images' },
+              { inlineData: { mimeType: 'image/jpeg', data: 'base64data1' } },
+              { inlineData: { mimeType: 'image/png', data: 'base64data2' } },
+            ],
+          },
+        ],
+      };
+
+      await callOpenAICompatibleModel(modelConfig as any, request);
+
+      expect(capturedBody.messages[0].content).toHaveLength(3);
+      expect(capturedBody.messages[0].content[1].image_url.url).toBe('data:image/jpeg;base64,base64data1');
+      expect(capturedBody.messages[0].content[2].image_url.url).toBe('data:image/png;base64,base64data2');
+    });
+  });
+
+  describe('Anthropic image format conversion', () => {
+    it('should convert Gemini inlineData to Anthropic image format', async () => {
+      let capturedBody: any;
+      const mockResponse = {
+        ok: true,
+        json: async () => ({
+          content: [{ type: 'text', text: 'I see an image' }],
+          stop_reason: 'end_turn',
+          usage: { input_tokens: 100, output_tokens: 10 },
+        }),
+      };
+
+      global.fetch = vi.fn().mockImplementation(async (_url, options) => {
+        capturedBody = JSON.parse(options.body);
+        return mockResponse;
+      });
+
+      const modelConfig = {
+        provider: 'anthropic' as const,
+        modelId: 'claude-3-sonnet',
+        baseUrl: 'https://api.anthropic.com',
+        apiKey: 'sk-ant-test',
+        displayName: 'Claude 3 Sonnet',
+      };
+
+      const request = {
+        contents: [
+          {
+            role: MESSAGE_ROLES.USER,
+            parts: [
+              { text: 'What is in this image?' },
+              { inlineData: { mimeType: 'image/png', data: 'iVBORw0KGgoAAAANSUhEUg==' } },
+            ],
+          },
+        ],
+      };
+
+      await callAnthropicModel(modelConfig as any, request);
+
+      // Verify the request body was converted correctly
+      expect(capturedBody.messages).toHaveLength(1);
+      expect(capturedBody.messages[0].role).toBe('user');
+      expect(Array.isArray(capturedBody.messages[0].content)).toBe(true);
+      expect(capturedBody.messages[0].content).toHaveLength(2);
+
+      // Check text part
+      expect(capturedBody.messages[0].content[0]).toEqual({
+        type: 'text',
+        text: 'What is in this image?',
+      });
+
+      // Check image part - Anthropic format
+      expect(capturedBody.messages[0].content[1]).toEqual({
+        type: 'image',
+        source: {
+          type: 'base64',
+          media_type: 'image/png',
+          data: 'iVBORw0KGgoAAAANSUhEUg==',
+        },
+      });
+    });
+
+    it('should handle multiple images in a single message', async () => {
+      let capturedBody: any;
+      const mockResponse = {
+        ok: true,
+        json: async () => ({
+          content: [{ type: 'text', text: 'I see two images' }],
+          stop_reason: 'end_turn',
+          usage: { input_tokens: 200, output_tokens: 15 },
+        }),
+      };
+
+      global.fetch = vi.fn().mockImplementation(async (_url, options) => {
+        capturedBody = JSON.parse(options.body);
+        return mockResponse;
+      });
+
+      const modelConfig = {
+        provider: 'anthropic' as const,
+        modelId: 'claude-3-sonnet',
+        baseUrl: 'https://api.anthropic.com',
+        apiKey: 'sk-ant-test',
+        displayName: 'Claude 3 Sonnet',
+      };
+
+      const request = {
+        contents: [
+          {
+            role: MESSAGE_ROLES.USER,
+            parts: [
+              { text: 'Compare these images' },
+              { inlineData: { mimeType: 'image/jpeg', data: 'base64data1' } },
+              { inlineData: { mimeType: 'image/webp', data: 'base64data2' } },
+            ],
+          },
+        ],
+      };
+
+      await callAnthropicModel(modelConfig as any, request);
+
+      expect(capturedBody.messages[0].content).toHaveLength(3);
+      expect(capturedBody.messages[0].content[1]).toEqual({
+        type: 'image',
+        source: { type: 'base64', media_type: 'image/jpeg', data: 'base64data1' },
+      });
+      expect(capturedBody.messages[0].content[2]).toEqual({
+        type: 'image',
+        source: { type: 'base64', media_type: 'image/webp', data: 'base64data2' },
+      });
+    });
+  });
+});
+
 describe('customModelAdapter - Streaming Tool Calls', () => {
   describe('OpenAI streaming', () => {
     it('should aggregate tool call deltas and yield complete tool call only at stream end', async () => {

packages/core/src/core/customModelAdapter.ts

Lines changed: 48 additions & 0 deletions
@@ -74,6 +74,27 @@ function parseJSONSafe(jsonStr: string): any {
  * OpenAI format conversion utilities
  */
 const OpenAIConverter = {
+  /**
+   * Convert a single part to the OpenAI content format.
+   * Supports text and inlineData (images).
+   */
+  partToOpenAIContent(part: any): any | null {
+    if (part.text) {
+      return { type: 'text', text: part.text };
+    }
+    if (part.inlineData) {
+      // Convert the Gemini inlineData format to the OpenAI image_url format
+      const { mimeType, data } = part.inlineData;
+      return {
+        type: 'image_url',
+        image_url: {
+          url: `data:${mimeType};base64,${data}`,
+        },
+      };
+    }
+    return null;
+  },
+
   contentsToMessages(contents: any[]): any[] {
     return contents.map((content: any) => {
       const parts = content.parts || [];
@@ -108,6 +129,22 @@ const OpenAIConverter = {
        }));
      }
 
+      // Check whether any part carries image content
+      const hasImageContent = parts.some((p: any) => p.inlineData);
+
+      if (hasImageContent) {
+        // Use the array content format to support mixed content (text + images)
+        const contentParts = parts
+          .map((part: any) => OpenAIConverter.partToOpenAIContent(part))
+          .filter(Boolean);
+
+        return {
+          role: content.role === MESSAGE_ROLES.MODEL ? 'assistant' : 'user',
+          content: contentParts,
+        };
+      }
+
+      // Plain text content: use the simple string format
       return {
         role: content.role === MESSAGE_ROLES.MODEL ? 'assistant' : 'user',
         content: parts.map((part: any) => part.text || '').join('\n'),
@@ -173,6 +210,17 @@ const AnthropicConverter = {
        if (part.text) {
          anthropicParts.push({ type: 'text', text: part.text });
        }
+        if (part.inlineData) {
+          // Convert the Gemini inlineData format to the Anthropic image format
+          anthropicParts.push({
+            type: 'image',
+            source: {
+              type: 'base64',
+              media_type: part.inlineData.mimeType,
+              data: part.inlineData.data,
+            },
+          });
+        }
        if (part.functionCall) {
          anthropicParts.push({
            type: 'tool_use',