diff --git a/packages/bytebot-agent/src/agent/agent.constants.ts b/packages/bytebot-agent/src/agent/agent.constants.ts
index 5b3d4e0d3..0500ef891 100644
--- a/packages/bytebot-agent/src/agent/agent.constants.ts
+++ b/packages/bytebot-agent/src/agent/agent.constants.ts
@@ -43,9 +43,11 @@ CORE WORKING PRINCIPLES
 1. **Observe First** - *Always* invoke \`computer_screenshot\` before your first action **and** whenever the UI may have changed. Screenshot before every action when filling out forms. Never act blindly. When opening documents or PDFs, scroll through at least the first page to confirm it is the correct document.
 2. **Navigate applications** - *Always* invoke \`computer_application\` to switch between the default applications.
 3. **Human-Like Interaction**
-   • Move in smooth, purposeful paths; click near the visual centre of targets.
-   • Double-click desktop icons to open them.
+   • Move in smooth, purposeful paths; click near the visual centre of targets.
+   • Double-click desktop icons to open them.
    • Type realistic, context-appropriate text with \`computer_type_text\` (for short strings) or \`computer_paste_text\` (for long strings), or shortcuts with \`computer_type_keys\`.
+   • **Cursor Visibility**: The mouse cursor is visible in screenshots as a black arrow pointer with a white outline. Use it to verify your current position before clicking.
+   • **Positioning Issues**: If you are having trouble clicking a target, use \`computer_cursor_position\` to check your current coordinates, then calculate the needed adjustment. Don't repeatedly click the same coordinates if it isn't working - verify and adjust first.
 4. **Valid Keys Only** - Use **exactly** the identifiers listed in **VALID KEYS** below when supplying \`keys\` to \`computer_type_keys\` or \`computer_press_keys\`. All identifiers come from nut-tree's \`Key\` enum; they are case-sensitive and contain *no spaces*.
 5. **Verify Every Step** - After each action:
diff --git a/packages/bytebot-agent/src/agent/agent.processor.ts b/packages/bytebot-agent/src/agent/agent.processor.ts
index c48912fae..f9cb31beb 100644
--- a/packages/bytebot-agent/src/agent/agent.processor.ts
+++ b/packages/bytebot-agent/src/agent/agent.processor.ts
@@ -182,7 +182,36 @@ export class AgentProcessor {
       `Sending ${messages.length} messages to LLM for processing`,
     );
 
-    const model = task.model as unknown as BytebotAgentModel;
+    // Resolve the model - task.model can be either a string (model name) or an object
+    let model: BytebotAgentModel;
+
+    if (typeof task.model === 'string') {
+      // Old format: task.model is a string (model name)
+      const modelName = task.model;
+
+      // Determine provider from model name
+      if (
+        modelName.startsWith('gpt-') ||
+        modelName.startsWith('o1-') ||
+        modelName.startsWith('o3-')
+      ) {
+        model = { provider: 'openai', name: modelName, title: modelName, contextWindow: 128000 };
+      } else if (modelName.startsWith('claude-')) {
+        model = { provider: 'anthropic', name: modelName, title: modelName, contextWindow: 200000 };
+      } else if (modelName.startsWith('gemini-')) {
+        model = { provider: 'google', name: modelName, title: modelName, contextWindow: 128000 };
+      } else {
+        // Try to get model from OpenAI's available models list
+        const availableModels = await this.openaiService.getAvailableModels();
+        const foundModel = availableModels.find((m) => m.name === modelName);
+        if (foundModel) {
+          model = foundModel;
+        } else {
+          // Default to openai if no match found
+          model = { provider: 'openai', name: modelName, title: modelName, contextWindow: 128000 };
+        }
+      }
+    } else {
+      // New format: task.model is already a BytebotAgentModel object
+      model = task.model as unknown as BytebotAgentModel;
+    }
+
     let agentResponse: BytebotAgentResponse;
     const service = this.services[model.provider];
diff --git a/packages/bytebot-agent/src/openai/openai.constants.ts b/packages/bytebot-agent/src/openai/openai.constants.ts
index 2df2b7cd5..f97aff6e0 100644
--- a/packages/bytebot-agent/src/openai/openai.constants.ts
+++ b/packages/bytebot-agent/src/openai/openai.constants.ts
@@ -1,17 +1,31 @@
 import { BytebotAgentModel } from 'src/agent/agent.types';
 
+// Only include models that support vision (image inputs).
+// This is required for computer-use agents that send screenshots.
 export const OPENAI_MODELS: BytebotAgentModel[] = [
   {
     provider: 'openai',
-    name: 'o3-2025-04-16',
-    title: 'o3',
-    contextWindow: 200000,
+    name: 'gpt-4o',
+    title: 'GPT-4o',
+    contextWindow: 128000,
   },
   {
     provider: 'openai',
-    name: 'gpt-4.1-2025-04-14',
-    title: 'GPT-4.1',
-    contextWindow: 1047576,
+    name: 'gpt-4o-mini',
+    title: 'GPT-4o Mini',
+    contextWindow: 128000,
+  },
+  {
+    provider: 'openai',
+    name: 'gpt-4-turbo',
+    title: 'GPT-4 Turbo',
+    contextWindow: 128000,
+  },
+  {
+    provider: 'openai',
+    name: 'gpt-4',
+    title: 'GPT-4',
+    contextWindow: 8192,
   },
 ];
diff --git a/packages/bytebot-agent/src/openai/openai.service.ts b/packages/bytebot-agent/src/openai/openai.service.ts
index f78e7b1b0..d58416f2e 100644
--- a/packages/bytebot-agent/src/openai/openai.service.ts
+++ b/packages/bytebot-agent/src/openai/openai.service.ts
@@ -12,19 +12,23 @@ import {
   isComputerToolUseContentBlock,
   isImageContentBlock,
 } from '@bytebot/shared';
-import { DEFAULT_MODEL } from './openai.constants';
+import { DEFAULT_MODEL, OPENAI_MODELS } from './openai.constants';
 import { Message, Role } from '@prisma/client';
 import { openaiTools } from './openai.tools';
 import {
   BytebotAgentService,
   BytebotAgentInterrupt,
   BytebotAgentResponse,
+  BytebotAgentModel,
 } from '../agent/agent.types';
 
 @Injectable()
 export class OpenAIService implements BytebotAgentService {
   private readonly openai: OpenAI;
   private readonly logger = new Logger(OpenAIService.name);
+  private cachedModels: BytebotAgentModel[] | null = null;
+  private modelsCacheTime: number = 0;
+  private readonly CACHE_DURATION = 3600000; // 1 hour in milliseconds
 
   constructor(private readonly configService: ConfigService) {
     const apiKey = this.configService.get<string>('OPENAI_API_KEY');
@@ -40,6 +44,105 @@ export class OpenAIService implements BytebotAgentService {
     });
   }
 
+  /**
+   * Fetch available models from the OpenAI API and cache them.
+   */
+  async getAvailableModels(): Promise<BytebotAgentModel[]> {
+    // Return cached models if still valid
+    const now = Date.now();
+    if (
+      this.cachedModels &&
+      now - this.modelsCacheTime < this.CACHE_DURATION
+    ) {
+      return this.cachedModels;
+    }
+
+    try {
+      const apiKey = this.configService.get<string>('OPENAI_API_KEY');
+      if (!apiKey) {
+        this.logger.warn('OPENAI_API_KEY not set, returning hardcoded models');
+        return OPENAI_MODELS;
+      }
+
+      // Fetch models from the OpenAI API
+      const modelsList = await this.openai.models.list();
+      const models = modelsList.data;
+
+      // Filter for relevant chat models that support vision (images/screenshots).
+      // o1 and o3 models are excluded because they don't support image inputs.
+      const availableModels: BytebotAgentModel[] = models
+        .filter(
+          (model) =>
+            model.id.startsWith('gpt-') &&
+            !model.id.startsWith('gpt-3.5') && // Exclude GPT-3.5 (no vision support)
+            !model.id.includes('instruct'), // Exclude instruct models
+        )
+        .map((model) => ({
+          provider: 'openai' as const,
+          name: model.id,
+          title: this.formatModelTitle(model.id),
+          contextWindow: this.getContextWindow(model.id),
+        }))
+        .sort((a, b) => {
+          // Sort by priority: gpt-4o variants first, then gpt-4 variants
+          const priority = (name: string) => {
+            if (name.includes('gpt-4o')) return 0;
+            if (name.includes('gpt-4.1')) return 1;
+            if (name.includes('gpt-4')) return 2;
+            if (name.includes('gpt-5')) return 3;
+            return 4;
+          };
+          return priority(a.name) - priority(b.name);
+        });
+
+      if (availableModels.length > 0) {
+        this.cachedModels = availableModels;
+        this.modelsCacheTime = now;
+        this.logger.log(
+          `Fetched ${availableModels.length} models from OpenAI API`,
+        );
+        return availableModels;
+      } else {
+        this.logger.warn(
+          'No suitable models found from OpenAI API, using hardcoded list',
+        );
+        return OPENAI_MODELS;
+      }
+    } catch (error) {
+      this.logger.error(`Failed to fetch models from OpenAI: ${error.message}`);
+      return OPENAI_MODELS;
+    }
+  }
+
+  /**
+   * Format a model ID into a human-readable title,
+   * e.g. "gpt-4o-mini" -> "GPT-4o Mini".
+   */
+  private formatModelTitle(modelId: string): string {
+    return modelId
+      .split('-')
+      .map((part) => {
+        if (part === 'gpt') return 'GPT';
+        if (part.match(/^\d/)) return part; // Keep numeric parts like "4o" as-is
+        return part.charAt(0).toUpperCase() + part.slice(1);
+      })
+      .join(' ')
+      .replace(/^GPT /, 'GPT-'); // Re-join the family prefix: "GPT 4o Mini" -> "GPT-4o Mini"
+  }
+
+  /**
+   * Get the estimated context window for a model.
+   */
+  private getContextWindow(modelId: string): number {
+    if (modelId.includes('gpt-4o')) return 128000;
+    if (modelId.includes('gpt-4.1')) return 1047576; // per the previous hardcoded list
+    if (modelId.includes('gpt-4-turbo')) return 128000;
+    if (modelId.includes('gpt-4')) return 8192;
+    if (modelId.includes('o1')) return 128000;
+    if (modelId.includes('o3')) return 200000;
+    if (modelId.includes('gpt-3.5')) return 16385;
+    return 4096; // Default fallback
+  }
+
   async generateMessage(
     systemPrompt: string,
     messages: Message[],
diff --git a/packages/bytebot-agent/src/tasks/tasks.controller.ts b/packages/bytebot-agent/src/tasks/tasks.controller.ts
index 982c4a4f1..801d8df4c 100644
--- a/packages/bytebot-agent/src/tasks/tasks.controller.ts
+++ b/packages/bytebot-agent/src/tasks/tasks.controller.ts
@@ -19,6 +19,7 @@ import { ANTHROPIC_MODELS } from '../anthropic/anthropic.constants';
 import { OPENAI_MODELS } from '../openai/openai.constants';
 import { GOOGLE_MODELS } from '../google/google.constants';
 import { BytebotAgentModel } from 'src/agent/agent.types';
+import { OpenAIService } from '../openai/openai.service';
 
 const geminiApiKey = process.env.GEMINI_API_KEY;
 const anthropicApiKey = process.env.ANTHROPIC_API_KEY;
@@ -26,17 +27,12 @@ const openaiApiKey = process.env.OPENAI_API_KEY;
 
 const proxyUrl = process.env.BYTEBOT_LLM_PROXY_URL;
 
-const models = [
-  ...(anthropicApiKey ? ANTHROPIC_MODELS : []),
-  ...(openaiApiKey ? OPENAI_MODELS : []),
-  ...(geminiApiKey ? GOOGLE_MODELS : []),
-];
-
 @Controller('tasks')
 export class TasksController {
   constructor(
     private readonly tasksService: TasksService,
     private readonly messagesService: MessagesService,
+    private readonly openaiService: OpenAIService,
   ) {}
 
   @Post()
@@ -107,6 +103,31 @@ export class TasksController {
       );
     }
   }
+
+    // Fetch models dynamically from providers
+    const models: BytebotAgentModel[] = [];
+
+    // Add Anthropic models if API key is present
+    if (anthropicApiKey) {
+      models.push(...ANTHROPIC_MODELS);
+    }
+
+    // Fetch OpenAI models dynamically if API key is present
+    if (openaiApiKey) {
+      try {
+        const openaiModels = await this.openaiService.getAvailableModels();
+        models.push(...openaiModels);
+      } catch (error) {
+        // Fallback to hardcoded models if fetch fails
+        models.push(...OPENAI_MODELS);
+      }
+    }
+
+    // Add Google models if API key is present
+    if (geminiApiKey) {
+      models.push(...GOOGLE_MODELS);
+    }
+
+    return models;
   }
diff --git a/packages/bytebot-agent/src/tasks/tasks.module.ts b/packages/bytebot-agent/src/tasks/tasks.module.ts
index fdad46c9e..32bd00542 100644
--- a/packages/bytebot-agent/src/tasks/tasks.module.ts
+++ b/packages/bytebot-agent/src/tasks/tasks.module.ts
@@ -4,9 +4,10 @@ import { TasksService } from './tasks.service';
 import { TasksGateway } from './tasks.gateway';
 import { PrismaModule } from '../prisma/prisma.module';
 import { MessagesModule } from '../messages/messages.module';
+import { OpenAIModule } from '../openai/openai.module';
 
 @Module({
-  imports: [PrismaModule, MessagesModule],
+  imports: [PrismaModule, MessagesModule, OpenAIModule],
   controllers: [TasksController],
   providers: [TasksService, TasksGateway],
   exports: [TasksService, TasksGateway],
diff --git a/packages/bytebot-ui/src/components/ui/select.tsx b/packages/bytebot-ui/src/components/ui/select.tsx
index dbaf05ce9..7e96e3901 100644
--- a/packages/bytebot-ui/src/components/ui/select.tsx
+++ b/packages/bytebot-ui/src/components/ui/select.tsx
@@ -51,9 +51,9 @@ const SelectContent = React.forwardRef<
       >
         {children}
diff --git a/packages/bytebot-ui/src/components/vnc/VncViewer.tsx b/packages/bytebot-ui/src/components/vnc/VncViewer.tsx
index 8f55bdbbf..6f480a3c6 100644
--- a/packages/bytebot-ui/src/components/vnc/VncViewer.tsx
+++ b/packages/bytebot-ui/src/components/vnc/VncViewer.tsx
@@ -39,6 +39,7 @@ export function VncViewer({ viewOnly = true }: VncViewerProps) {
           url={wsUrl}
           scaleViewport
           viewOnly={viewOnly}
+          showDotCursor={true}
           style={{ width: "100%", height: "100%" }}
         />
       )}
diff --git a/packages/bytebotd/Dockerfile b/packages/bytebotd/Dockerfile
index f8a893bc8..7311e7074 100644
--- a/packages/bytebotd/Dockerfile
+++ b/packages/bytebotd/Dockerfile
@@ -117,17 +117,18 @@ RUN ARCH=$(dpkg --print-architecture) && \
         echo "1Password is not available for $ARCH architecture."; \
     fi
 
-# Install Visual Studio Code
+# Install Visual Studio Code (via direct .deb download; the apt repo install was unreliable)
 RUN ARCH=$(dpkg --print-architecture) && \
     if [ "$ARCH" = "amd64" ]; then \
         apt-get update && apt-get install -y wget gpg apt-transport-https software-properties-common && \
-        wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /usr/share/keyrings/ms_vscode.gpg && \
-        echo "deb [arch=amd64 signed-by=/usr/share/keyrings/ms_vscode.gpg] https://packages.microsoft.com/repos/code stable main" > /etc/apt/sources.list.d/vscode.list && \
-        apt-get update && apt-get install -y code && \
+        # Download the .deb directly (more reliable than the repo)
+        wget -qO /tmp/code_amd64.deb "https://code.visualstudio.com/sha/download?build=stable&os=linux-deb-x64" && \
+        apt-get install -y /tmp/code_amd64.deb && \
+        rm -f /tmp/code_amd64.deb && \
        apt-get clean && rm -rf /var/lib/apt/lists/* ; \
     elif [ "$ARCH" = "arm64" ]; then \
         apt-get update && apt-get install -y wget gpg && \
-        wget -qO /tmp/code_arm64.deb https://update.code.visualstudio.com/latest/linux-deb-arm64/stable && \
+        wget -qO /tmp/code_arm64.deb "https://code.visualstudio.com/sha/download?build=stable&os=linux-deb-arm64" && \
         apt-get install -y /tmp/code_arm64.deb && \
         rm -f /tmp/code_arm64.deb && \
         apt-get clean && rm -rf /var/lib/apt/lists/* ; \
diff --git a/packages/bytebotd/root/etc/supervisor/conf.d/supervisord.conf b/packages/bytebotd/root/etc/supervisor/conf.d/supervisord.conf
index 6c39ad2ec..bdb9c47ea 100644
--- a/packages/bytebotd/root/etc/supervisor/conf.d/supervisord.conf
+++ b/packages/bytebotd/root/etc/supervisor/conf.d/supervisord.conf
@@ -55,7 +55,7 @@ redirect_stderr=true
 depends_on=xvfb
 
 [program:x11vnc]
-command=x11vnc -display :0 -N -forever -shared -rfbport 5900
+command=x11vnc -display :0 -N -forever -shared -rfbport 5900 -nopw -cursor arrow -cursorpos
 user=user
 autostart=true
 autorestart=true
diff --git a/packages/bytebotd/src/nut/cursor-overlay.ts b/packages/bytebotd/src/nut/cursor-overlay.ts
new file mode 100644
index 000000000..289d25c02
--- /dev/null
+++ b/packages/bytebotd/src/nut/cursor-overlay.ts
@@ -0,0 +1,68 @@
+import * as sharp from 'sharp';
+
+/**
+ * Creates a cursor image as a Buffer.
+ * The cursor is a simple arrow pointer shape.
+ */
+export async function createCursorImage(
+  size: number = 24,
+  color: string = '#000000',
+  outlineColor: string = '#FFFFFF',
+): Promise<Buffer> {
+  // Create a simple arrow cursor SVG: a polygon in the classic
+  // pointer shape, filled with `color` and outlined with `outlineColor`
+  const svg = `
+    <svg xmlns="http://www.w3.org/2000/svg" width="${size}" height="${size}" viewBox="0 0 24 24">
+      <polygon
+        points="2,2 2,19 7,14.5 10,21 13,19.5 10,13 17,13"
+        fill="${color}"
+        stroke="${outlineColor}"
+        stroke-width="1.5"
+      />
+    </svg>
+  `;
+
+  return sharp(Buffer.from(svg)).png().toBuffer();
+}
+
+/**
+ * Overlays a cursor image onto a screenshot at the specified position.
+ *
+ * @param screenshotBuffer The screenshot image buffer
+ * @param cursorX The x coordinate of the cursor
+ * @param cursorY The y coordinate of the cursor
+ * @param cursorSize The size of the cursor (default 24)
+ * @returns A Buffer containing the screenshot with the cursor overlay
+ */
+export async function overlayCursorOnScreenshot(
+  screenshotBuffer: Buffer,
+  cursorX: number,
+  cursorY: number,
+  cursorSize: number = 24,
+): Promise<Buffer> {
+  // Create the cursor image
+  const cursorBuffer = await createCursorImage(cursorSize);
+
+  // Get screenshot metadata to ensure the cursor stays within bounds
+  const metadata = await sharp(screenshotBuffer).metadata();
+  const width = metadata.width || 1920;
+  const height = metadata.height || 1080;
+
+  // Clamp the position so the overlay fits entirely inside the
+  // screenshot (sharp rejects composites that extend past the edges)
+  const safeX = Math.max(0, Math.min(cursorX, width - cursorSize));
+  const safeY = Math.max(0, Math.min(cursorY, height - cursorSize));
+
+  // Composite the cursor onto the screenshot
+  return sharp(screenshotBuffer)
+    .composite([
+      {
+        input: cursorBuffer,
+        left: Math.round(safeX),
+        top: Math.round(safeY),
+      },
+    ])
+    .png()
+    .toBuffer();
+}
diff --git a/packages/bytebotd/src/nut/nut.service.ts b/packages/bytebotd/src/nut/nut.service.ts
index 70f988402..62ba88859 100644
--- a/packages/bytebotd/src/nut/nut.service.ts
+++ b/packages/bytebotd/src/nut/nut.service.ts
@@ -11,6 +11,7 @@ import {
 } from '@nut-tree-fork/nut-js';
 import { spawn } from 'child_process';
 import * as path from 'path';
+import { overlayCursorOnScreenshot } from './cursor-overlay';
 
 /**
  * Enum representing key codes supported by nut-js.
@@ -463,21 +464,59 @@ export class NutService {
   }
 
   /**
-   * Takes a screenshot of the screen.
+   * Takes a screenshot of the screen with a cursor overlay.
    *
+   * @param includeCursor Whether to include the mouse cursor in the screenshot (default: true)
    * @returns A Promise that resolves with a Buffer containing the image.
    */
-  async screendump(): Promise<Buffer> {
+  async screendump(includeCursor: boolean = true): Promise<Buffer> {
     const filename = `screenshot-${Date.now()}.png`;
     const filepath = path.join(this.screenshotDir, filename);
 
     this.logger.log(`Taking screenshot to ${filepath}`);
 
     try {
+      // Get cursor position before taking the screenshot
+      let cursorPosition: { x: number; y: number } | null = null;
+      if (includeCursor) {
+        try {
+          cursorPosition = await mouse.getPosition();
+          this.logger.log(
+            `Cursor position: (${cursorPosition.x}, ${cursorPosition.y})`,
+          );
+        } catch (cursorError) {
+          this.logger.warn(
+            `Failed to get cursor position: ${cursorError.message}`,
+          );
+        }
+      }
+
       // Take screenshot
       await screen.capture(filename, FileType.PNG, this.screenshotDir);
 
-      // Read the file back and return as buffer
-      return await import('fs').then((fs) => fs.promises.readFile(filepath));
+      // Read the file back
+      const screenshotBuffer = await import('fs').then((fs) =>
+        fs.promises.readFile(filepath),
+      );
+
+      // Overlay the cursor if its position was captured
+      if (includeCursor && cursorPosition) {
+        try {
+          const withCursor = await overlayCursorOnScreenshot(
+            screenshotBuffer,
+            cursorPosition.x,
+            cursorPosition.y,
+          );
+          this.logger.log('Cursor overlay applied to screenshot');
+          return withCursor;
+        } catch (overlayError) {
+          this.logger.warn(
+            `Failed to overlay cursor: ${overlayError.message}. Returning screenshot without cursor.`,
+          );
+          return screenshotBuffer;
+        }
+      }
+
+      return screenshotBuffer;
     } catch (error) {
      this.logger.error(`Error taking screenshot: ${error.message}`);
      throw error;
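
A minimal usage sketch for the new overlay helper, assuming the sharp dependency is installed; the screenshot path, output path, and cursor coordinates are illustrative, not part of the diff above:

import { promises as fs } from 'fs';
import { overlayCursorOnScreenshot } from './cursor-overlay';

async function main() {
  // Illustrative input: any PNG screenshot on disk
  const screenshot = await fs.readFile('/tmp/sample-screenshot.png');

  // Composite the arrow cursor at (400, 300), mirroring what
  // NutService.screendump() now does after capturing the screen
  const withCursor = await overlayCursorOnScreenshot(screenshot, 400, 300);

  // Write the result out for visual inspection
  await fs.writeFile('/tmp/sample-with-cursor.png', withCursor);
}

main().catch(console.error);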