Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions packages/bytebot-agent/src/agent/agent.constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ CORE WORKING PRINCIPLES
1. **Observe First** - *Always* invoke \`computer_screenshot\` before your first action **and** whenever the UI may have changed. Screenshot before every action when filling out forms. Never act blindly. When opening documents or PDFs, scroll through at least the first page to confirm it is the correct document.
2. **Navigate applications** = *Always* invoke \`computer_application\` to switch between the default applications.
3. **Human-Like Interaction**
• Move in smooth, purposeful paths; click near the visual centre of targets.
• Double-click desktop icons to open them.
• Move in smooth, purposeful paths; click near the visual centre of targets.
• Double-click desktop icons to open them.
• Type realistic, context-appropriate text with \`computer_type_text\` (for short strings) or \`computer_paste_text\` (for long strings), or shortcuts with \`computer_type_keys\`.
• **Cursor Visibility**: The mouse cursor is visible in screenshots as a black arrow pointer with white outline. Use this to verify your current position before clicking.
• **Positioning Issues**: If you're having trouble clicking a target, use \`computer_cursor_position\` to check your current coordinates, then calculate the needed adjustment. Don't repeatedly click the same coordinates if it's not working - verify and adjust.
4. **Valid Keys Only** -
Use **exactly** the identifiers listed in **VALID KEYS** below when supplying \`keys\` to \`computer_type_keys\` or \`computer_press_keys\`. All identifiers come from nut-tree's \`Key\` enum; they are case-sensitive and contain *no spaces*.
5. **Verify Every Step** - After each action:
Expand Down
31 changes: 30 additions & 1 deletion packages/bytebot-agent/src/agent/agent.processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,36 @@ export class AgentProcessor {
`Sending ${messages.length} messages to LLM for processing`,
);

const model = task.model as unknown as BytebotAgentModel;
// Resolve the model - task.model can be either a string (model name) or an object
let model: BytebotAgentModel;

if (typeof task.model === 'string') {
// Old format: task.model is a string (model name)
const modelName = task.model;

// Determine provider from model name
if (modelName.startsWith('gpt-') || modelName.startsWith('o1-') || modelName.startsWith('o3-')) {
model = { provider: 'openai', name: modelName, title: modelName, contextWindow: 128000 };
} else if (modelName.startsWith('claude-')) {
model = { provider: 'anthropic', name: modelName, title: modelName, contextWindow: 200000 };
} else if (modelName.startsWith('gemini-')) {
model = { provider: 'google', name: modelName, title: modelName, contextWindow: 128000 };
} else {
// Try to get model from OpenAI's available models list
const availableModels = await this.openaiService.getAvailableModels();
const foundModel = availableModels.find(m => m.name === modelName);
if (foundModel) {
model = foundModel;
} else {
// Default to openai if no match found
model = { provider: 'openai', name: modelName, title: modelName, contextWindow: 128000 };
}
}
} else {
// New format: task.model is already a BytebotAgentModel object
model = task.model as unknown as BytebotAgentModel;
}

let agentResponse: BytebotAgentResponse;

const service = this.services[model.provider];
Expand Down
26 changes: 20 additions & 6 deletions packages/bytebot-agent/src/openai/openai.constants.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,31 @@
import { BytebotAgentModel } from 'src/agent/agent.types';

// Only include models that support vision (image inputs)
// This is required for computer-use agents that send screenshots
export const OPENAI_MODELS: BytebotAgentModel[] = [
{
provider: 'openai',
name: 'o3-2025-04-16',
title: 'o3',
contextWindow: 200000,
name: 'gpt-4o',
title: 'GPT-4o',
contextWindow: 128000,
},
{
provider: 'openai',
name: 'gpt-4.1-2025-04-14',
title: 'GPT-4.1',
contextWindow: 1047576,
name: 'gpt-4o-mini',
title: 'GPT-4o Mini',
contextWindow: 128000,
},
{
provider: 'openai',
name: 'gpt-4-turbo',
title: 'GPT-4 Turbo',
contextWindow: 128000,
},
{
provider: 'openai',
name: 'gpt-4',
title: 'GPT-4',
contextWindow: 8192,
},
];

Expand Down
105 changes: 104 additions & 1 deletion packages/bytebot-agent/src/openai/openai.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,23 @@ import {
isComputerToolUseContentBlock,
isImageContentBlock,
} from '@bytebot/shared';
import { DEFAULT_MODEL } from './openai.constants';
import { DEFAULT_MODEL, OPENAI_MODELS } from './openai.constants';
import { Message, Role } from '@prisma/client';
import { openaiTools } from './openai.tools';
import {
BytebotAgentService,
BytebotAgentInterrupt,
BytebotAgentResponse,
BytebotAgentModel,
} from '../agent/agent.types';

@Injectable()
export class OpenAIService implements BytebotAgentService {
private readonly openai: OpenAI;
private readonly logger = new Logger(OpenAIService.name);
private cachedModels: BytebotAgentModel[] | null = null;
private modelsCacheTime: number = 0;
private readonly CACHE_DURATION = 3600000; // 1 hour in milliseconds

constructor(private readonly configService: ConfigService) {
const apiKey = this.configService.get<string>('OPENAI_API_KEY');
Expand All @@ -40,6 +44,105 @@ export class OpenAIService implements BytebotAgentService {
});
}

/**
* Fetch available models from OpenAI API and cache them
*/
async getAvailableModels(): Promise<BytebotAgentModel[]> {
// Return cached models if still valid
const now = Date.now();
if (
this.cachedModels &&
now - this.modelsCacheTime < this.CACHE_DURATION
) {
return this.cachedModels;
}

try {
const apiKey = this.configService.get<string>('OPENAI_API_KEY');
if (!apiKey) {
this.logger.warn('OPENAI_API_KEY not set, returning hardcoded models');
return OPENAI_MODELS;
}

// Fetch models from OpenAI API
const modelsList = await this.openai.models.list();
const models = modelsList.data;

// Filter for relevant chat models that support vision (images/screenshots)
// Note: O1/O3 models lack image-input support; they are excluded implicitly because the filter below only accepts IDs starting with 'gpt-'
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment states "Exclude O1 and O3 models as they don't support image inputs", but the filtering logic below only checks for models starting with 'gpt-'. O1 and O3 models (which would have IDs like 'o1-...' or 'o3-...') are already implicitly excluded by the first filter condition model.id.startsWith('gpt-'). The comment should be clarified to explain that O1/O3 models are excluded because they don't start with 'gpt-', or the comment should be removed if it's redundant.

Suggested change
// Exclude O1 and O3 models as they don't support image inputs
// Only include models whose IDs start with 'gpt-' (O1 and O3 models are excluded by this filter)

Copilot uses AI. Check for mistakes.
const availableModels: BytebotAgentModel[] = models
.filter(
(model) =>
model.id.startsWith('gpt-') &&
!model.id.startsWith('gpt-3.5') && // Exclude GPT-3.5 (no vision support)
!model.id.includes('instruct'), // Exclude instruct models
Comment on lines +75 to +78
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The filter assumes all models starting with 'gpt-' (except gpt-3.5) support vision. However, not all GPT-4 models may support vision (e.g., older base 'gpt-4' vs 'gpt-4-vision-preview' or 'gpt-4-turbo'). Consider using a more explicit allowlist of known vision-capable model patterns (e.g., contains 'gpt-4o', 'gpt-4-turbo', 'gpt-4-vision', 'gpt-4v') or checking model capabilities via the OpenAI API if available. This would prevent non-vision models from being included and causing runtime errors when screenshots are sent.

Suggested change
(model) =>
model.id.startsWith('gpt-') &&
!model.id.startsWith('gpt-3.5') && // Exclude GPT-3.5 (no vision support)
!model.id.includes('instruct'), // Exclude instruct models
(model) => {
// Only include known vision-capable models
const id = model.id;
// Add to this list as new vision-capable models are released
return (
(
id.includes('gpt-4o') ||
id.includes('gpt-4-turbo') ||
id.includes('gpt-4-vision') ||
id.includes('gpt-4v')
) &&
!id.includes('instruct')
);
},

Copilot uses AI. Check for mistakes.
)
.map((model) => ({
provider: 'openai' as const,
name: model.id,
title: this.formatModelTitle(model.id),
contextWindow: this.getContextWindow(model.id),
}))
.sort((a, b) => {
// Sort by priority: gpt-4o variants first, then gpt-4 variants
const priority = (name: string) => {
if (name.includes('gpt-4o')) return 0;
if (name.includes('gpt-4.1')) return 1;
if (name.includes('gpt-4')) return 2;
if (name.includes('gpt-5')) return 3;
Comment on lines +90 to +92
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sort priority includes 'gpt-4.1' at priority 1 and 'gpt-5' at priority 3, but according to the filter logic (line 76), only models starting with 'gpt-' and excluding 'gpt-3.5' are included. Since GPT-4.1 and GPT-5 are hypothetical future models that may not exist yet, consider whether these priority cases are necessary. If they are intended for future-proofing, a comment explaining this would be helpful.

Copilot uses AI. Check for mistakes.
return 4;
};
return priority(a.name) - priority(b.name);
});

if (availableModels.length > 0) {
this.cachedModels = availableModels;
this.modelsCacheTime = now;
this.logger.log(
`Fetched ${availableModels.length} models from OpenAI API`,
);
return availableModels;
} else {
this.logger.warn(
'No suitable models found from OpenAI API, using hardcoded list',
);
return OPENAI_MODELS;
}
} catch (error) {
this.logger.error(`Failed to fetch models from OpenAI: ${error.message}`);
return OPENAI_MODELS;
}
}

/**
* Format model ID into a human-readable title
*/
private formatModelTitle(modelId: string): string {
// Convert model IDs like "gpt-4o-mini" to "GPT 4o Mini" (every hyphen becomes a space)
return modelId
.split('-')
.map((part) => {
if (part === 'gpt') return 'GPT';
if (part.match(/^\d/)) return part; // Keep numbers as-is
return part.charAt(0).toUpperCase() + part.slice(1);
})
.join('-')
.replace(/-/g, ' ');
Comment on lines +121 to +130
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The formatModelTitle method transforms model IDs like "gpt-4o-mini" to "GPT 4o Mini". However, this transformation logic may not handle all edge cases correctly. For example, "gpt-4o-2024-05-13" would become "GPT 4o 2024 05 13" with spaces between date components. Consider adding test cases or documentation for expected behavior with dated model IDs.

Suggested change
// Convert model IDs like "gpt-4o-mini" to "GPT-4o Mini"
return modelId
.split('-')
.map((part) => {
if (part === 'gpt') return 'GPT';
if (part.match(/^\d/)) return part; // Keep numbers as-is
return part.charAt(0).toUpperCase() + part.slice(1);
})
.join('-')
.replace(/-/g, ' ');
// Convert model IDs like "gpt-4o-mini" to "GPT 4o Mini"
// If the model ID ends with a date (e.g., "2024-05-13"), keep the date together as a single part
const parts = modelId.split('-');
// Check if the last three parts form a date (YYYY-MM-DD)
const len = parts.length;
let formattedParts: string[];
if (
len >= 3 &&
/^\d{4}$/.test(parts[len - 3]) &&
/^\d{2}$/.test(parts[len - 2]) &&
/^\d{2}$/.test(parts[len - 1])
) {
// Group the last three parts as a date
const datePart = `${parts[len - 3]}-${parts[len - 2]}-${parts[len - 1]}`;
formattedParts = parts.slice(0, len - 3).concat([datePart]);
} else {
formattedParts = parts;
}
return formattedParts
.map((part) => {
if (part === 'gpt') return 'GPT';
if (part.match(/^\d/)) return part; // Keep numbers as-is
// If part is a date (YYYY-MM-DD), keep as-is
if (/^\d{4}-\d{2}-\d{2}$/.test(part)) return part;
return part.charAt(0).toUpperCase() + part.slice(1);
})
.join(' ');

Copilot uses AI. Check for mistakes.
}

/**
* Get estimated context window for a model
*/
private getContextWindow(modelId: string): number {
if (modelId.includes('gpt-4o')) return 128000;
if (modelId.includes('gpt-4-turbo')) return 128000;
if (modelId.includes('gpt-4')) return 8192;
if (modelId.includes('o1')) return 128000;
if (modelId.includes('o3')) return 200000;
Comment on lines +140 to +141
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The getContextWindow method includes logic for 'o1' and 'o3' models (lines 140-141), but these models are filtered out in getAvailableModels (line 76) because they don't start with 'gpt-'. Since these models are intentionally excluded from the available models list, this dead code should be removed to avoid confusion.

Suggested change
if (modelId.includes('o1')) return 128000;
if (modelId.includes('o3')) return 200000;

Copilot uses AI. Check for mistakes.
if (modelId.includes('gpt-3.5')) return 16385;
return 4096; // Default fallback
}

async generateMessage(
systemPrompt: string,
messages: Message[],
Expand Down
33 changes: 27 additions & 6 deletions packages/bytebot-agent/src/tasks/tasks.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,20 @@ import { ANTHROPIC_MODELS } from '../anthropic/anthropic.constants';
import { OPENAI_MODELS } from '../openai/openai.constants';
import { GOOGLE_MODELS } from '../google/google.constants';
import { BytebotAgentModel } from 'src/agent/agent.types';
import { OpenAIService } from '../openai/openai.service';

const geminiApiKey = process.env.GEMINI_API_KEY;
const anthropicApiKey = process.env.ANTHROPIC_API_KEY;
const openaiApiKey = process.env.OPENAI_API_KEY;

const proxyUrl = process.env.BYTEBOT_LLM_PROXY_URL;

const models = [
...(anthropicApiKey ? ANTHROPIC_MODELS : []),
...(openaiApiKey ? OPENAI_MODELS : []),
...(geminiApiKey ? GOOGLE_MODELS : []),
];

@Controller('tasks')
export class TasksController {
constructor(
private readonly tasksService: TasksService,
private readonly messagesService: MessagesService,
private readonly openaiService: OpenAIService,
) {}

@Post()
Expand Down Expand Up @@ -107,6 +103,31 @@ export class TasksController {
);
}
}

// Fetch models dynamically from providers
const models: BytebotAgentModel[] = [];

// Add Anthropic models if API key is present
if (anthropicApiKey) {
models.push(...ANTHROPIC_MODELS);
}

// Fetch OpenAI models dynamically if API key is present
if (openaiApiKey) {
try {
const openaiModels = await this.openaiService.getAvailableModels();
models.push(...openaiModels);
} catch (error) {
// Fallback to hardcoded models if fetch fails
models.push(...OPENAI_MODELS);
}
}

// Add Google models if API key is present
if (geminiApiKey) {
models.push(...GOOGLE_MODELS);
}

return models;
}

Expand Down
3 changes: 2 additions & 1 deletion packages/bytebot-agent/src/tasks/tasks.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import { TasksService } from './tasks.service';
import { TasksGateway } from './tasks.gateway';
import { PrismaModule } from '../prisma/prisma.module';
import { MessagesModule } from '../messages/messages.module';
import { OpenAIModule } from '../openai/openai.module';

@Module({
imports: [PrismaModule, MessagesModule],
imports: [PrismaModule, MessagesModule, OpenAIModule],
controllers: [TasksController],
providers: [TasksService, TasksGateway],
exports: [TasksService, TasksGateway],
Expand Down
4 changes: 2 additions & 2 deletions packages/bytebot-ui/src/components/ui/select.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ const SelectContent = React.forwardRef<
>
<SelectPrimitive.Viewport
className={cn(
"p-1",
"p-1 max-h-[300px] overflow-y-auto",
position === "popper" &&
"h-[var(--radix-select-trigger-height)] w-full min-w-[var(--radix-select-trigger-width)]"
"w-full min-w-[var(--radix-select-trigger-width)]"
)}
>
{children}
Expand Down
1 change: 1 addition & 0 deletions packages/bytebot-ui/src/components/vnc/VncViewer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ export function VncViewer({ viewOnly = true }: VncViewerProps) {
url={wsUrl}
scaleViewport
viewOnly={viewOnly}
showDotCursor={true}
style={{ width: "100%", height: "100%" }}
/>
)}
Expand Down
11 changes: 6 additions & 5 deletions packages/bytebotd/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,18 @@ RUN ARCH=$(dpkg --print-architecture) && \
echo "1Password is not available for $ARCH architecture."; \
fi

# Install Visual Studio Code
# Install Visual Studio Code (with fallback to .deb download if repo fails)
RUN ARCH=$(dpkg --print-architecture) && \
if [ "$ARCH" = "amd64" ]; then \
apt-get update && apt-get install -y wget gpg apt-transport-https software-properties-common && \
wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /usr/share/keyrings/ms_vscode.gpg && \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/ms_vscode.gpg] https://packages.microsoft.com/repos/code stable main" > /etc/apt/sources.list.d/vscode.list && \
apt-get update && apt-get install -y code && \
# Try downloading .deb directly (more reliable than repo)
wget -qO /tmp/code_amd64.deb "https://code.visualstudio.com/sha/download?build=stable&os=linux-deb-x64" && \
apt-get install -y /tmp/code_amd64.deb && \
rm -f /tmp/code_amd64.deb && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; \
elif [ "$ARCH" = "arm64" ]; then \
apt-get update && apt-get install -y wget gpg && \
wget -qO /tmp/code_arm64.deb https://update.code.visualstudio.com/latest/linux-deb-arm64/stable && \
wget -qO /tmp/code_arm64.deb "https://code.visualstudio.com/sha/download?build=stable&os=linux-deb-arm64" && \
apt-get install -y /tmp/code_arm64.deb && \
rm -f /tmp/code_arm64.deb && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ redirect_stderr=true
depends_on=xvfb

[program:x11vnc]
command=x11vnc -display :0 -N -forever -shared -rfbport 5900
command=x11vnc -display :0 -N -forever -shared -rfbport 5900 -nopw -cursor arrow -cursorpos
user=user
autostart=true
autorestart=true
Expand Down
Loading