Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions packages/bytebot-agent/src/agent/agent.constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ CORE WORKING PRINCIPLES
1. **Observe First** - *Always* invoke \`computer_screenshot\` before your first action **and** whenever the UI may have changed. Screenshot before every action when filling out forms. Never act blindly. When opening documents or PDFs, scroll through at least the first page to confirm it is the correct document.
2. **Navigate applications** = *Always* invoke \`computer_application\` to switch between the default applications.
3. **Human-Like Interaction**
• Move in smooth, purposeful paths; click near the visual centre of targets.
• Double-click desktop icons to open them.
• Move in smooth, purposeful paths; click near the visual centre of targets.
• Double-click desktop icons to open them.
• Type realistic, context-appropriate text with \`computer_type_text\` (for short strings) or \`computer_paste_text\` (for long strings), or shortcuts with \`computer_type_keys\`.
• **Cursor Visibility**: The mouse cursor is visible in screenshots as a black arrow pointer with white outline. Use this to verify your current position before clicking.
• **Positioning Issues**: If you're having trouble clicking a target, use \`computer_cursor_position\` to check your current coordinates, then calculate the needed adjustment. Don't repeatedly click the same coordinates if it's not working - verify and adjust.
4. **Valid Keys Only** -
Use **exactly** the identifiers listed in **VALID KEYS** below when supplying \`keys\` to \`computer_type_keys\` or \`computer_press_keys\`. All identifiers come from nut-tree's \`Key\` enum; they are case-sensitive and contain *no spaces*.
5. **Verify Every Step** - After each action:
Expand Down
31 changes: 30 additions & 1 deletion packages/bytebot-agent/src/agent/agent.processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,36 @@ export class AgentProcessor {
`Sending ${messages.length} messages to LLM for processing`,
);

const model = task.model as unknown as BytebotAgentModel;
// Resolve the model - task.model can be either a string (model name) or an object
let model: BytebotAgentModel;

if (typeof task.model === 'string') {
// Old format: task.model is a string (model name)
const modelName = task.model;

// Determine provider from model name
if (modelName.startsWith('gpt-') || modelName.startsWith('o1-') || modelName.startsWith('o3-')) {
model = { provider: 'openai', name: modelName, title: modelName, contextWindow: 128000 };
} else if (modelName.startsWith('claude-')) {
model = { provider: 'anthropic', name: modelName, title: modelName, contextWindow: 200000 };
} else if (modelName.startsWith('gemini-')) {
model = { provider: 'google', name: modelName, title: modelName, contextWindow: 128000 };
} else {
// Try to get model from OpenAI's available models list
const availableModels = await this.openaiService.getAvailableModels();
const foundModel = availableModels.find(m => m.name === modelName);
if (foundModel) {
model = foundModel;
} else {
// Default to openai if no match found
model = { provider: 'openai', name: modelName, title: modelName, contextWindow: 128000 };
}
}
} else {
// New format: task.model is already a BytebotAgentModel object
model = task.model as unknown as BytebotAgentModel;
}

let agentResponse: BytebotAgentResponse;

const service = this.services[model.provider];
Expand Down
26 changes: 20 additions & 6 deletions packages/bytebot-agent/src/openai/openai.constants.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,31 @@
import { BytebotAgentModel } from 'src/agent/agent.types';

// Only include models that support vision (image inputs)
// This is required for computer-use agents that send screenshots
export const OPENAI_MODELS: BytebotAgentModel[] = [
{
provider: 'openai',
name: 'o3-2025-04-16',
title: 'o3',
contextWindow: 200000,
name: 'gpt-4o',
title: 'GPT-4o',
contextWindow: 128000,
},
{
provider: 'openai',
name: 'gpt-4.1-2025-04-14',
title: 'GPT-4.1',
contextWindow: 1047576,
name: 'gpt-4o-mini',
title: 'GPT-4o Mini',
contextWindow: 128000,
},
{
provider: 'openai',
name: 'gpt-4-turbo',
title: 'GPT-4 Turbo',
contextWindow: 128000,
},
{
provider: 'openai',
name: 'gpt-4',
title: 'GPT-4',
contextWindow: 8192,
},
];

Expand Down
105 changes: 104 additions & 1 deletion packages/bytebot-agent/src/openai/openai.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,23 @@ import {
isComputerToolUseContentBlock,
isImageContentBlock,
} from '@bytebot/shared';
import { DEFAULT_MODEL } from './openai.constants';
import { DEFAULT_MODEL, OPENAI_MODELS } from './openai.constants';
import { Message, Role } from '@prisma/client';
import { openaiTools } from './openai.tools';
import {
BytebotAgentService,
BytebotAgentInterrupt,
BytebotAgentResponse,
BytebotAgentModel,
} from '../agent/agent.types';

@Injectable()
export class OpenAIService implements BytebotAgentService {
private readonly openai: OpenAI;
private readonly logger = new Logger(OpenAIService.name);
private cachedModels: BytebotAgentModel[] | null = null;
private modelsCacheTime: number = 0;
private readonly CACHE_DURATION = 3600000; // 1 hour in milliseconds

constructor(private readonly configService: ConfigService) {
const apiKey = this.configService.get<string>('OPENAI_API_KEY');
Expand All @@ -40,6 +44,105 @@ export class OpenAIService implements BytebotAgentService {
});
}

/**
* Fetch available models from OpenAI API and cache them
*/
async getAvailableModels(): Promise<BytebotAgentModel[]> {
// Return cached models if still valid
const now = Date.now();
if (
this.cachedModels &&
now - this.modelsCacheTime < this.CACHE_DURATION
) {
return this.cachedModels;
}

try {
const apiKey = this.configService.get<string>('OPENAI_API_KEY');
if (!apiKey) {
this.logger.warn('OPENAI_API_KEY not set, returning hardcoded models');
return OPENAI_MODELS;
}

// Fetch models from OpenAI API
const modelsList = await this.openai.models.list();
const models = modelsList.data;

// Filter for relevant chat models that support vision (images/screenshots)
// Note: O1/O3 models lack image-input support; they are excluded implicitly because the filter below only accepts IDs starting with 'gpt-'
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment states "Exclude O1 and O3 models as they don't support image inputs", but the filtering logic below only checks for models starting with 'gpt-'. O1 and O3 models (which would have IDs like 'o1-...' or 'o3-...') are already implicitly excluded by the first filter condition model.id.startsWith('gpt-'). The comment should be clarified to explain that O1/O3 models are excluded because they don't start with 'gpt-', or the comment should be removed if it's redundant.

Suggested change
// Exclude O1 and O3 models as they don't support image inputs
// Only include models whose IDs start with 'gpt-' (O1 and O3 models are excluded by this filter)

Copilot uses AI. Check for mistakes.
const availableModels: BytebotAgentModel[] = models
.filter(
(model) =>
model.id.startsWith('gpt-') &&
!model.id.startsWith('gpt-3.5') && // Exclude GPT-3.5 (no vision support)
!model.id.includes('instruct'), // Exclude instruct models
Comment on lines +75 to +78
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The filter assumes all models starting with 'gpt-' (except gpt-3.5) support vision. However, not all GPT-4 models may support vision (e.g., older base 'gpt-4' vs 'gpt-4-vision-preview' or 'gpt-4-turbo'). Consider using a more explicit allowlist of known vision-capable model patterns (e.g., contains 'gpt-4o', 'gpt-4-turbo', 'gpt-4-vision', 'gpt-4v') or checking model capabilities via the OpenAI API if available. This would prevent non-vision models from being included and causing runtime errors when screenshots are sent.

Suggested change
(model) =>
model.id.startsWith('gpt-') &&
!model.id.startsWith('gpt-3.5') && // Exclude GPT-3.5 (no vision support)
!model.id.includes('instruct'), // Exclude instruct models
(model) => {
// Only include known vision-capable models
const id = model.id;
// Add to this list as new vision-capable models are released
return (
(
id.includes('gpt-4o') ||
id.includes('gpt-4-turbo') ||
id.includes('gpt-4-vision') ||
id.includes('gpt-4v')
) &&
!id.includes('instruct')
);
},

Copilot uses AI. Check for mistakes.
)
.map((model) => ({
provider: 'openai' as const,
name: model.id,
title: this.formatModelTitle(model.id),
contextWindow: this.getContextWindow(model.id),
}))
.sort((a, b) => {
// Sort by priority: gpt-4o variants first, then gpt-4 variants
const priority = (name: string) => {
if (name.includes('gpt-4o')) return 0;
if (name.includes('gpt-4.1')) return 1;
if (name.includes('gpt-4')) return 2;
if (name.includes('gpt-5')) return 3;
Comment on lines +90 to +92
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sort priority includes 'gpt-4.1' at priority 1 and 'gpt-5' at priority 3, but according to the filter logic (line 76), only models starting with 'gpt-' and excluding 'gpt-3.5' are included. Since GPT-4.1 and GPT-5 are hypothetical future models that may not exist yet, consider whether these priority cases are necessary. If they are intended for future-proofing, a comment explaining this would be helpful.

Copilot uses AI. Check for mistakes.
return 4;
};
return priority(a.name) - priority(b.name);
});

if (availableModels.length > 0) {
this.cachedModels = availableModels;
this.modelsCacheTime = now;
this.logger.log(
`Fetched ${availableModels.length} models from OpenAI API`,
);
return availableModels;
} else {
this.logger.warn(
'No suitable models found from OpenAI API, using hardcoded list',
);
return OPENAI_MODELS;
}
} catch (error) {
this.logger.error(`Failed to fetch models from OpenAI: ${error.message}`);
return OPENAI_MODELS;
}
}

/**
* Format model ID into a human-readable title
*/
private formatModelTitle(modelId: string): string {
// Convert model IDs like "gpt-4o-mini" to "GPT 4o Mini" (every hyphen becomes a space)
return modelId
.split('-')
.map((part) => {
if (part === 'gpt') return 'GPT';
if (part.match(/^\d/)) return part; // Keep numbers as-is
return part.charAt(0).toUpperCase() + part.slice(1);
})
.join('-')
.replace(/-/g, ' ');
Comment on lines +121 to +130
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The formatModelTitle method transforms model IDs like "gpt-4o-mini" to "GPT 4o Mini". However, this transformation logic may not handle all edge cases correctly. For example, "gpt-4o-2024-05-13" would become "GPT 4o 2024 05 13" with spaces between date components. Consider adding test cases or documentation for expected behavior with dated model IDs.

Suggested change
// Convert model IDs like "gpt-4o-mini" to "GPT-4o Mini"
return modelId
.split('-')
.map((part) => {
if (part === 'gpt') return 'GPT';
if (part.match(/^\d/)) return part; // Keep numbers as-is
return part.charAt(0).toUpperCase() + part.slice(1);
})
.join('-')
.replace(/-/g, ' ');
// Convert model IDs like "gpt-4o-mini" to "GPT 4o Mini"
// If the model ID ends with a date (e.g., "2024-05-13"), keep the date together as a single part
const parts = modelId.split('-');
// Check if the last three parts form a date (YYYY-MM-DD)
const len = parts.length;
let formattedParts: string[];
if (
len >= 3 &&
/^\d{4}$/.test(parts[len - 3]) &&
/^\d{2}$/.test(parts[len - 2]) &&
/^\d{2}$/.test(parts[len - 1])
) {
// Group the last three parts as a date
const datePart = `${parts[len - 3]}-${parts[len - 2]}-${parts[len - 1]}`;
formattedParts = parts.slice(0, len - 3).concat([datePart]);
} else {
formattedParts = parts;
}
return formattedParts
.map((part) => {
if (part === 'gpt') return 'GPT';
if (part.match(/^\d/)) return part; // Keep numbers as-is
// If part is a date (YYYY-MM-DD), keep as-is
if (/^\d{4}-\d{2}-\d{2}$/.test(part)) return part;
return part.charAt(0).toUpperCase() + part.slice(1);
})
.join(' ');

Copilot uses AI. Check for mistakes.
}

/**
* Get estimated context window for a model
*/
private getContextWindow(modelId: string): number {
if (modelId.includes('gpt-4o')) return 128000;
if (modelId.includes('gpt-4-turbo')) return 128000;
if (modelId.includes('gpt-4')) return 8192;
if (modelId.includes('o1')) return 128000;
if (modelId.includes('o3')) return 200000;
Comment on lines +140 to +141
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The getContextWindow method includes logic for 'o1' and 'o3' models (lines 140-141), but these models are filtered out in getAvailableModels (line 76) because they don't start with 'gpt-'. Since these models are intentionally excluded from the available models list, this dead code should be removed to avoid confusion.

Suggested change
if (modelId.includes('o1')) return 128000;
if (modelId.includes('o3')) return 200000;

Copilot uses AI. Check for mistakes.
if (modelId.includes('gpt-3.5')) return 16385;
return 4096; // Default fallback
}

async generateMessage(
systemPrompt: string,
messages: Message[],
Expand Down
33 changes: 27 additions & 6 deletions packages/bytebot-agent/src/tasks/tasks.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,20 @@ import { ANTHROPIC_MODELS } from '../anthropic/anthropic.constants';
import { OPENAI_MODELS } from '../openai/openai.constants';
import { GOOGLE_MODELS } from '../google/google.constants';
import { BytebotAgentModel } from 'src/agent/agent.types';
import { OpenAIService } from '../openai/openai.service';

const geminiApiKey = process.env.GEMINI_API_KEY;
const anthropicApiKey = process.env.ANTHROPIC_API_KEY;
const openaiApiKey = process.env.OPENAI_API_KEY;

const proxyUrl = process.env.BYTEBOT_LLM_PROXY_URL;

const models = [
...(anthropicApiKey ? ANTHROPIC_MODELS : []),
...(openaiApiKey ? OPENAI_MODELS : []),
...(geminiApiKey ? GOOGLE_MODELS : []),
];

@Controller('tasks')
export class TasksController {
constructor(
private readonly tasksService: TasksService,
private readonly messagesService: MessagesService,
private readonly openaiService: OpenAIService,
) {}

@Post()
Expand Down Expand Up @@ -107,6 +103,31 @@ export class TasksController {
);
}
}

// Fetch models dynamically from providers
const models: BytebotAgentModel[] = [];

// Add Anthropic models if API key is present
if (anthropicApiKey) {
models.push(...ANTHROPIC_MODELS);
}

// Fetch OpenAI models dynamically if API key is present
if (openaiApiKey) {
try {
const openaiModels = await this.openaiService.getAvailableModels();
models.push(...openaiModels);
} catch (error) {
// Fallback to hardcoded models if fetch fails
models.push(...OPENAI_MODELS);
}
}

// Add Google models if API key is present
if (geminiApiKey) {
models.push(...GOOGLE_MODELS);
}

return models;
}

Expand Down
3 changes: 2 additions & 1 deletion packages/bytebot-agent/src/tasks/tasks.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import { TasksService } from './tasks.service';
import { TasksGateway } from './tasks.gateway';
import { PrismaModule } from '../prisma/prisma.module';
import { MessagesModule } from '../messages/messages.module';
import { OpenAIModule } from '../openai/openai.module';

@Module({
imports: [PrismaModule, MessagesModule],
imports: [PrismaModule, MessagesModule, OpenAIModule],
controllers: [TasksController],
providers: [TasksService, TasksGateway],
exports: [TasksService, TasksGateway],
Expand Down
4 changes: 2 additions & 2 deletions packages/bytebot-ui/src/components/ui/select.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ const SelectContent = React.forwardRef<
>
<SelectPrimitive.Viewport
className={cn(
"p-1",
"p-1 max-h-[300px] overflow-y-auto",
position === "popper" &&
"h-[var(--radix-select-trigger-height)] w-full min-w-[var(--radix-select-trigger-width)]"
"w-full min-w-[var(--radix-select-trigger-width)]"
)}
>
{children}
Expand Down
1 change: 1 addition & 0 deletions packages/bytebot-ui/src/components/vnc/VncViewer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ export function VncViewer({ viewOnly = true }: VncViewerProps) {
url={wsUrl}
scaleViewport
viewOnly={viewOnly}
showDotCursor={true}
style={{ width: "100%", height: "100%" }}
/>
)}
Expand Down
11 changes: 6 additions & 5 deletions packages/bytebotd/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,18 @@ RUN ARCH=$(dpkg --print-architecture) && \
echo "1Password is not available for $ARCH architecture."; \
fi

# Install Visual Studio Code
# Install Visual Studio Code (with fallback to .deb download if repo fails)
RUN ARCH=$(dpkg --print-architecture) && \
if [ "$ARCH" = "amd64" ]; then \
apt-get update && apt-get install -y wget gpg apt-transport-https software-properties-common && \
wget -qO- https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /usr/share/keyrings/ms_vscode.gpg && \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/ms_vscode.gpg] https://packages.microsoft.com/repos/code stable main" > /etc/apt/sources.list.d/vscode.list && \
apt-get update && apt-get install -y code && \
# Try downloading .deb directly (more reliable than repo)
wget -qO /tmp/code_amd64.deb "https://code.visualstudio.com/sha/download?build=stable&os=linux-deb-x64" && \
apt-get install -y /tmp/code_amd64.deb && \
rm -f /tmp/code_amd64.deb && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; \
elif [ "$ARCH" = "arm64" ]; then \
apt-get update && apt-get install -y wget gpg && \
wget -qO /tmp/code_arm64.deb https://update.code.visualstudio.com/latest/linux-deb-arm64/stable && \
wget -qO /tmp/code_arm64.deb "https://code.visualstudio.com/sha/download?build=stable&os=linux-deb-arm64" && \
apt-get install -y /tmp/code_arm64.deb && \
rm -f /tmp/code_arm64.deb && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ redirect_stderr=true
depends_on=xvfb

[program:x11vnc]
command=x11vnc -display :0 -N -forever -shared -rfbport 5900
command=x11vnc -display :0 -N -forever -shared -rfbport 5900 -nopw -cursor arrow -cursorpos
user=user
autostart=true
autorestart=true
Expand Down
Loading