diff --git a/.gitignore b/.gitignore index 121beaf..59df641 100644 --- a/.gitignore +++ b/.gitignore @@ -1,47 +1,78 @@ -# Miscellaneous -*.class +# Dependencies +.pub/ +.buildlog +.packages + +# Generated plugin files +lib/generated_plugin_registrant.dart + +# Android +**/android/** +!**/android/**/generated_plugin_registrant.dart + +# iOS +**/ios/** +!**/ios/**/Runner/GeneratedPluginRegistrant* + +# macOS +**/macos/** +!**/macos/**/generated_plugin_registrant.dart + +# Windows +**/windows/** +!**/windows/**/generated_plugin_registrant.dart + +# Linux +**/linux/** +!**/linux/**/generated_plugin_registrant.dart + +# Build outputs +build/ + +# Logs *.log -*.pyc -*.swp -.DS_Store -.atom/ -.build/ -.buildlog/ -.history -.svn/ -.swiftpm/ -migrate_working_dir/ - -# IntelliJ related -*.iml -*.ipr -*.iws -.idea/ -# The .vscode folder contains launch configuration and tasks you configure in -# VS Code which you may wish to be included in version control, so this line -# is commented out by default. 
-#.vscode/ - -# Flutter/Dart/Pub related -**/doc/api/ -**/ios/Flutter/.last_build_id -.dart_tool/ -.flutter-plugins -.flutter-plugins-dependencies -.pub-cache/ -.pub/ -/build/ +# Environment variables +.env +.env.local +*.env.* -# Symbolication related -app.*.symbols +# IDE files +.vscode/ +.idea/ +*.swp +*.swo +*.tmp + +# OS generated files +.DS_Store +Thumbs.db -# Obfuscation related -app.*.map.json +# Coverage +coverage/ +htmlcov/ +.coverage -# Android Studio will place build artifacts here -/android/app/debug -/android/app/profile -/android/app/release -.vscode/branch-timer.json -pubspec.lock +# Compressed files +*.zip +*.gz +*.tar +*.tgz +*.bz2 +*.xz +*.7z +*.rar +*.zst +*.lz4 +*.lzh +*.cab +*.arj +*.rpm +*.deb +*.Z +*.lz +*.lzo +*.tar.gz +*.tar.bz2 +*.tar.xz +*.tar.zst \ No newline at end of file diff --git a/HOLO3_INTEGRATION.md b/HOLO3_INTEGRATION.md new file mode 100644 index 0000000..cf69308 --- /dev/null +++ b/HOLO3_INTEGRATION.md @@ -0,0 +1,196 @@ +# Holo3 Vision Model Integration for NextDesk + +This document describes the integration of the HCompany Holo3 vision model with NextDesk via direct API access. + +## Overview + +Since OpenRouter does not support the Holo3 model, NextDesk now connects directly to HCompany's API for vision tasks when using Holo3. 
+ +## Architecture + +``` +┌─────────────────┐ +│ NextDesk App │ +│ │ +│ VisionService │──────┐ +│ │ │ +└─────────────────┘ │ + │ + ┌───────────────┴───────────────┐ + │ │ + ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ +│ Holo3Vision │ │ OpenRouter │ +│ Service │ │ Service │ +│ │ │ │ +│ Direct HCompany │ │ Other Models │ +│ API Call │ │ (Gemini, GPT) │ +└────────┬────────┘ └─────────────────┘ + │ + ▼ +┌─────────────────┐ +│ HCompany API │ +│ api.hcompany.ai│ +│ /v1/chat/ │ +│ completions │ +└─────────────────┘ +``` + +## Configuration + +### Environment Variables + +Set these environment variables before running NextDesk: + +```bash +# Required for the Holo3 vision model +export HCOMPANY_API_KEY="your-hcompany-api-key" + +# Optional: for other models via OpenRouter +export OPENROUTER_API_KEY="your-openrouter-api-key" + +# Optional: vision model (default: hcompany/holo3-35b-a3b). NOTE: AppConfig reads this with String.fromEnvironment, so it must be supplied at build time (--dart-define=VISION_MODEL=...); a shell export alone may not take effect. +export VISION_MODEL="hcompany/holo3-35b-a3b" + +# Optional: chat model (default: google/gemini-3-flash-preview). NOTE: same as above, supply it at build time with --dart-define=CHAT_MODEL=... +export CHAT_MODEL="google/gemini-3-flash-preview" +``` + +### Getting an HCompany API Key + +1. Visit [https://hub.hcompany.ai/](https://hub.hcompany.ai/) +2. Create an account or sign in +3. Navigate to the API Keys section +4. Generate a new API key +5. 
Copy the key and store it securely + +## Files Modified/Created + +### New Files +- `lib/services/holo3_vision_service.dart` - Dedicated service for HCompany Holo3 API calls + +### Modified Files +- `lib/config/app_config.dart` - Added HCOMPANY_API_KEY configuration +- `lib/services/config_service.dart` - Added HCompany API key management +- `lib/services/vision_service.dart` - Updated to route Holo3 requests to the direct HCompany API +- `lib/screens/settings_screen.dart` - Added UI for HCompany API key configuration + +## Usage + +### Automatic Model Selection + +The `VisionService` automatically selects the appropriate backend: + +```dart +// When the vision model name contains 'hcompany' or 'holo3', the direct HCompany API is used +final result = await VisionService.detectElementPosition( + imageBytes, + "Find the submit button", + configService, +); +``` + +### Model Options + +**Vision Models:** +- `hcompany/holo3-35b-a3b` - Holo3 (uses direct HCompany API) +- `google/gemini-3-pro-preview` - Gemini 3 Pro (uses OpenRouter) +- `openai/gpt-4o` - GPT-4o (uses OpenRouter) +- `anthropic/claude-3.5-sonnet` - Claude 3.5 Sonnet (uses OpenRouter) + +**Chat Models:** +- `google/gemini-3-flash-preview` - Default +- `google/gemini-2.5-pro` +- `openai/gpt-4o-mini` +- `anthropic/claude-3.5-haiku` + +## API Endpoints + +### HCompany Holo3 API +- **Endpoint URL:** `https://api.hcompany.ai/v1/chat/completions` +- **Model:** `holo3-35b-a3b` +- **Authentication:** Bearer token via the `Authorization` header +- **Format:** OpenAI-compatible Chat Completions API + +### Request Format + +```json +{ + "model": "holo3-35b-a3b", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Analyze the provided screenshot..." 
+ }, + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,BASE64_ENCODED_PNG" + } + } + ] + } + ] +} +``` + +### Response Format + +```json +{ + "choices": [ + { + "message": { + "content": "{\"x\": 100, \"y\": 200, \"confidence\": 0.95, ...}" + } + } + ] +} +``` + +## Error Handling + +The integration handles the following error cases: + +1. **Missing API Key:** Returns an error if the HCompany API key is not configured +2. **API Failures:** Reports the HTTP status code and response body for any non-200 response +3. **Invalid JSON:** Returns an error result when the model's reply content is not valid JSON +4. **Network Errors:** Catches exceptions raised while sending the request (e.g. connection failures); no explicit request timeout is configured + +All errors return a `DetectionResult` with `status: "error"` and a descriptive `errorMessage`. + +## Testing + +To test the Holo3 integration: + +1. Configure your HCompany API key in Settings or via the `HCOMPANY_API_KEY` environment variable +2. Select `hcompany/holo3-35b-a3b` as the vision model +3. Run a task that requires element detection +4. Monitor logs for API calls to `api.hcompany.ai` + +## Migration from OpenRouter + +If you were previously using a different vision model: + +1. Update the `VISION_MODEL` environment variable to `hcompany/holo3-35b-a3b` (or select the model in Settings) +2. Set `HCOMPANY_API_KEY` with your HCompany credentials +3. The app will automatically use the direct HCompany API + +No code changes are required; the routing is handled automatically by `VisionService`. 
+ +## Security Best Practices + +- Never commit API keys to version control +- Use environment variables in production +- Rotate API keys periodically +- Monitor API usage through HCompany dashboard +- Implement rate limiting if needed + +## References + +- [HCompany Hub](https://hub.hcompany.ai/) +- [HCompany Quickstart](https://hub.hcompany.ai/quickstart) +- [OpenRouter API Reference](https://openrouter.ai/docs/api/reference/overview) diff --git a/lib/config/app_config.dart b/lib/config/app_config.dart index 0bcca0a..aaf2d2b 100644 --- a/lib/config/app_config.dart +++ b/lib/config/app_config.dart @@ -8,6 +8,25 @@ class AppConfig { static const String openRouterApiKey = String.fromEnvironment("OPENROUTER_API_KEY"); + /// HCompany API Key (for Holo3 model) + /// Get your API key from: https://hub.hcompany.ai/ + static const String hCompanyApiKey = + String.fromEnvironment("HCOMPANY_API_KEY"); + + /// Vision Model Provider + /// Options: 'google/gemini-3-pro-preview', 'hcompany/holo3-35b-a3b' + static const String visionModel = String.fromEnvironment( + "VISION_MODEL", + defaultValue: 'hcompany/holo3-35b-a3b', + ); + + /// Chat/Agent Model + /// Default chat model for automation tasks + static const String chatModel = String.fromEnvironment( + "CHAT_MODEL", + defaultValue: 'google/gemini-3-flash-preview', + ); + /// Maximum iterations for ReAct agent static const int maxIterations = 20; diff --git a/lib/screens/settings_screen.dart b/lib/screens/settings_screen.dart index 0b474c9..f00ca0a 100644 --- a/lib/screens/settings_screen.dart +++ b/lib/screens/settings_screen.dart @@ -14,10 +14,12 @@ class SettingsScreen extends StatefulWidget { class _SettingsScreenState extends State { late TextEditingController _openRouterKeyController; + late TextEditingController _hCompanyKeyController; late TextEditingController _maxIterationsController; late TextEditingController _waitSecondsController; bool _obscureOpenRouterKey = true; + bool _obscureHCompanyKey = true; bool 
_hasUnsavedChanges = false; bool _isSaving = false; @@ -27,6 +29,8 @@ class _SettingsScreenState extends State { final config = context.read(); _openRouterKeyController = TextEditingController(text: config.customOpenRouterKey); + _hCompanyKeyController = + TextEditingController(text: config.customHCompanyKey); _maxIterationsController = TextEditingController(text: config.maxIterations.toString()); _waitSecondsController = @@ -34,6 +38,7 @@ class _SettingsScreenState extends State { // Add listeners to track changes _openRouterKeyController.addListener(_markAsChanged); + _hCompanyKeyController.addListener(_markAsChanged); _maxIterationsController.addListener(_markAsChanged); _waitSecondsController.addListener(_markAsChanged); } @@ -47,6 +52,7 @@ class _SettingsScreenState extends State { @override void dispose() { _openRouterKeyController.dispose(); + _hCompanyKeyController.dispose(); _maxIterationsController.dispose(); _waitSecondsController.dispose(); super.dispose(); @@ -123,6 +129,26 @@ class _SettingsScreenState extends State { helpUrl: 'https://openrouter.ai/keys', ), + const SizedBox(height: AppTheme.spaceMd), + + _buildApiKeyCard( + config: config, + title: 'HCompany API Key (for Holo3)', + envKey: config.envHCompanyKey, + hasEnvKey: config.hasEnvHCompanyKey, + useEnv: config.useEnvHCompany, + controller: _hCompanyKeyController, + obscureText: _obscureHCompanyKey, + onUseEnvChanged: (value) async { + await config.setUseEnvHCompany(value); + _markAsChanged(); + }, + onKeyChanged: (_) async {}, // No immediate save + onToggleVisibility: () => setState( + () => _obscureHCompanyKey = !_obscureHCompanyKey), + helpUrl: 'https://hub.hcompany.ai/', + ), + const SizedBox(height: AppTheme.spaceLg), // Performance Section @@ -136,6 +162,17 @@ class _SettingsScreenState extends State { const SizedBox(height: AppTheme.spaceLg), + // AI Models Section + _buildSectionHeader( + icon: Icons.smart_toy_rounded, + title: 'AI Models', + subtitle: 'Configure vision and chat 
models', + ), + const SizedBox(height: AppTheme.spaceMd), + _buildModelsCard(config), + + const SizedBox(height: AppTheme.spaceLg), + // Info Section _buildInfoCard(), ], @@ -403,6 +440,120 @@ class _SettingsScreenState extends State { ); } + Widget _buildModelsCard(ConfigService config) { + return Container( + padding: const EdgeInsets.all(AppTheme.spaceMd), + decoration: BoxDecoration( + color: AppTheme.surfaceDark, + borderRadius: BorderRadius.circular(AppTheme.radiusMd), + border: Border.all(color: AppTheme.borderMedium), + ), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + _buildModelSelector( + label: 'Vision Model', + currentValue: config.visionModel, + options: const [ + {'value': 'hcompany/holo3-35b-a3b', 'label': 'Holo3 (hcompany)'}, + {'value': 'google/gemini-3-pro-preview', 'label': 'Gemini 3 Pro'}, + {'value': 'openai/gpt-4o', 'label': 'GPT-4o'}, + {'value': 'anthropic/claude-3.5-sonnet', 'label': 'Claude 3.5 Sonnet'}, + ], + onChanged: (value) { + config.setVisionModel(value); + _markAsChanged(); + }, + description: 'Model used for analyzing screenshots and detecting UI elements', + ), + const SizedBox(height: AppTheme.spaceMd), + _buildModelSelector( + label: 'Chat/Agent Model', + currentValue: config.chatModel, + options: const [ + {'value': 'google/gemini-3-flash-preview', 'label': 'Gemini 3 Flash'}, + {'value': 'google/gemini-2.5-pro', 'label': 'Gemini 2.5 Pro'}, + {'value': 'openai/gpt-4o-mini', 'label': 'GPT-4o Mini'}, + {'value': 'anthropic/claude-3.5-haiku', 'label': 'Claude 3.5 Haiku'}, + ], + onChanged: (value) { + config.setChatModel(value); + _markAsChanged(); + }, + description: 'Model used for automation reasoning and task execution', + ), + ], + ), + ); + } + + Widget _buildModelSelector({ + required String label, + required String currentValue, + required List> options, + required Function(String) onChanged, + required String description, + }) { + return Column( + crossAxisAlignment: 
CrossAxisAlignment.start, + children: [ + Text( + label, + style: const TextStyle( + color: AppTheme.textSecondary, + fontSize: 12, + fontWeight: FontWeight.w500, + ), + ), + const SizedBox(height: AppTheme.spaceXs), + Container( + padding: const EdgeInsets.symmetric(horizontal: AppTheme.spaceSm), + decoration: BoxDecoration( + color: AppTheme.surfaceMedium, + borderRadius: BorderRadius.circular(AppTheme.radiusSm), + border: Border.all(color: AppTheme.borderSubtle), + ), + child: DropdownButtonHideUnderline( + child: DropdownButton( + value: options.any((o) => o['value'] == currentValue) + ? currentValue + : options.first['value'], + isExpanded: true, + dropdownColor: AppTheme.surfaceDark, + style: const TextStyle( + color: AppTheme.textPrimary, + fontSize: 13, + ), + icon: const Icon( + Icons.arrow_drop_down_rounded, + color: AppTheme.textTertiary, + ), + items: options.map((option) { + return DropdownMenuItem( + value: option['value'], + child: Text(option['label'] ?? ''), + ); + }).toList(), + onChanged: (value) { + if (value != null) { + onChanged(value); + } + }, + ), + ), + ), + const SizedBox(height: AppTheme.spaceXs), + Text( + description, + style: const TextStyle( + color: AppTheme.textTertiary, + fontSize: 11, + ), + ), + ], + ); + } + Widget _buildNumberField({ required String label, required TextEditingController controller, diff --git a/lib/services/config_service.dart b/lib/services/config_service.dart index d253938..6d6db92 100644 --- a/lib/services/config_service.dart +++ b/lib/services/config_service.dart @@ -7,6 +7,10 @@ import 'package:shared_preferences/shared_preferences.dart'; class ConfigService extends ChangeNotifier { static const String _keyUseEnvOpenRouter = 'use_env_openrouter'; static const String _keyCustomOpenRouterKey = 'custom_openrouter_key'; + static const String _keyUseEnvHCompany = 'use_env_hcompany'; + static const String _keyCustomHCompanyKey = 'custom_hcompany_key'; + static const String _keyVisionModel = 'vision_model'; + 
static const String _keyChatModel = 'chat_model'; static const String _keyMaxIterations = 'max_iterations'; static const String _keyScreenshotQuality = 'screenshot_quality'; static const String _keyDefaultWaitSeconds = 'default_wait_seconds'; @@ -16,10 +20,15 @@ class ConfigService extends ChangeNotifier { // Default values from environment variables String? _envOpenRouterKey; + String? _envHCompanyKey; // User preferences bool _useEnvOpenRouter = true; String _customOpenRouterKey = ''; + bool _useEnvHCompany = true; + String _customHCompanyKey = ''; + String _visionModel = 'hcompany/holo3-35b-a3b'; + String _chatModel = 'google/gemini-3-flash-preview'; int _maxIterations = 20; double _screenshotQuality = 0.8; int _defaultWaitSeconds = 2; @@ -31,6 +40,7 @@ class ConfigService extends ChangeNotifier { /// Load environment variables void _loadEnvVariables() { _envOpenRouterKey = Platform.environment['OPENROUTER_API_KEY']; + _envHCompanyKey = Platform.environment['HCOMPANY_API_KEY']; } /// Initialize the service and load saved preferences @@ -49,6 +59,12 @@ class ConfigService extends ChangeNotifier { _useEnvOpenRouter = _prefs!.getBool(_keyUseEnvOpenRouter) ?? true; _customOpenRouterKey = _prefs!.getString(_keyCustomOpenRouterKey) ?? ''; + _useEnvHCompany = _prefs!.getBool(_keyUseEnvHCompany) ?? true; + _customHCompanyKey = _prefs!.getString(_keyCustomHCompanyKey) ?? ''; + _visionModel = + _prefs!.getString(_keyVisionModel) ?? 'hcompany/holo3-35b-a3b'; + _chatModel = + _prefs!.getString(_keyChatModel) ?? 'google/gemini-3-flash-preview'; _maxIterations = _prefs!.getInt(_keyMaxIterations) ?? 20; _screenshotQuality = _prefs!.getDouble(_keyScreenshotQuality) ?? 0.8; _defaultWaitSeconds = _prefs!.getInt(_keyDefaultWaitSeconds) ?? 
2; @@ -60,10 +76,19 @@ class ConfigService extends ChangeNotifier { bool get hasEnvOpenRouterKey => _envOpenRouterKey != null && _envOpenRouterKey!.isNotEmpty; + bool get hasEnvHCompanyKey => + _envHCompanyKey != null && _envHCompanyKey!.isNotEmpty; + String? get envOpenRouterKey => _envOpenRouterKey; + String? get envHCompanyKey => _envHCompanyKey; + bool get useEnvOpenRouter => _useEnvOpenRouter; String get customOpenRouterKey => _customOpenRouterKey; + bool get useEnvHCompany => _useEnvHCompany; + String get customHCompanyKey => _customHCompanyKey; + String get visionModel => _visionModel; + String get chatModel => _chatModel; int get maxIterations => _maxIterations; double get screenshotQuality => _screenshotQuality; int get defaultWaitSeconds => _defaultWaitSeconds; @@ -76,9 +101,20 @@ class ConfigService extends ChangeNotifier { return _customOpenRouterKey; } + /// Get the active HCompany API key (env or custom) + String get hCompanyApiKey { + if (_useEnvHCompany && hasEnvHCompanyKey) { + return _envHCompanyKey!; + } + return _customHCompanyKey; + } + /// Check if OpenRouter is properly configured bool get isOpenRouterConfigured => openRouterApiKey.isNotEmpty; + /// Check if HCompany is properly configured + bool get isHCompanyConfigured => hCompanyApiKey.isNotEmpty; + // Setters with persistence Future setUseEnvOpenRouter(bool value) async { @@ -115,6 +151,10 @@ class ConfigService extends ChangeNotifier { Future resetToDefaults() async { _useEnvOpenRouter = true; _customOpenRouterKey = ''; + _useEnvHCompany = true; + _customHCompanyKey = ''; + _visionModel = 'hcompany/holo3-35b-a3b'; + _chatModel = 'google/gemini-3-flash-preview'; _maxIterations = 20; _screenshotQuality = 0.8; _defaultWaitSeconds = 2; @@ -122,4 +162,32 @@ class ConfigService extends ChangeNotifier { await _prefs?.clear(); notifyListeners(); } + + // Model setters with persistence + + Future setVisionModel(String value) async { + _visionModel = value; + await 
_prefs?.setString(_keyVisionModel, value); + notifyListeners(); + } + + Future setChatModel(String value) async { + _chatModel = value; + await _prefs?.setString(_keyChatModel, value); + notifyListeners(); + } + + // HCompany API key setters + + Future setUseEnvHCompany(bool value) async { + _useEnvHCompany = value; + await _prefs?.setBool(_keyUseEnvHCompany, value); + notifyListeners(); + } + + Future setCustomHCompanyKey(String value) async { + _customHCompanyKey = value; + await _prefs?.setString(_keyCustomHCompanyKey, value); + notifyListeners(); + } } diff --git a/lib/services/holo3_vision_service.dart b/lib/services/holo3_vision_service.dart new file mode 100644 index 0000000..b685af9 --- /dev/null +++ b/lib/services/holo3_vision_service.dart @@ -0,0 +1,183 @@ +import 'dart:convert'; +import 'dart:typed_data'; +import 'package:http/http.dart' as http; +import '../models/detection_result.dart'; +import '../config/app_config.dart'; +import 'config_service.dart'; + +/// Element Position Detection Service using HCompany Holo3 API +class Holo3VisionService { + static const String _hCompanyApiUrl = + "https://api.hcompany.ai/v1/chat/completions"; + + /// Detects the pixel coordinates of UI elements in a screenshot. + /// Uses HCompany Holo3 API directly. + static Future detectElementPosition( + Uint8List imageBytes, String elementDescription, ConfigService? config) async { + final hCompanyKey = config?.hCompanyApiKey ?? AppConfig.hCompanyApiKey; + + if (hCompanyKey.isEmpty) { + return DetectionResult( + status: "error", + errorMessage: + "HCompany API key is not configured. Please set HCOMPANY_API_KEY environment variable or configure in settings.", + x: null, + y: null, + confidence: 0.0, + imageSize: null, + ); + } + + return _detectWithHolo3(imageBytes, elementDescription, hCompanyKey); + } + + /// Detects the pixel coordinates of UI elements using HCompany Holo3 API. 
+ static Future _detectWithHolo3( + Uint8List imageBytes, String elementDescription, String apiKey) async { + try { + final base64Image = base64Encode(imageBytes); + + // Prepare the prompt + final prompt = '''Analyze the provided screenshot. +Find the center pixel coordinates of the element described as: "$elementDescription". + +Provide a brief description of what you see in the screenshot. +If the exact element is not found but a similar alternative exists, explicitly mention both in the description. + +Return your response in this exact JSON format: +{ + "x": , + "y": , + "confidence": , + "screenshot_description": , + "image_size": { + "width": , + "height": + } +}'''; + + final requestBody = { + "model": "holo3-35b-a3b", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt, + }, + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,$base64Image", + } + } + ] + } + ] + }; + + // Make API request to HCompany + final response = await http.post( + Uri.parse(_hCompanyApiUrl), + headers: { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer $apiKey', + }, + body: jsonEncode(requestBody), + ); + + if (response.statusCode != 200) { + return DetectionResult( + status: "error", + errorMessage: + "HCompany API request failed with status: ${response.statusCode}\nResponse: ${response.body}", + x: null, + y: null, + confidence: 0.0, + imageSize: null, + ); + } + + // Parse response + final responseData = jsonDecode(response.body); + final choices = responseData['choices'] as List?; + + if (choices == null || choices.isEmpty) { + return DetectionResult( + status: "error", + errorMessage: "No response from HCompany Holo3 API", + x: null, + y: null, + confidence: 0.0, + imageSize: null, + ); + } + + final message = choices[0]['message']; + final content = message['content'] as String; + + // Parse JSON response + try { + final parsedResult = jsonDecode(content) as Map; + final xCoord = 
parsedResult['x']; + final yCoord = parsedResult['y']; + final confidence = parsedResult['confidence']; + final screenshotDesc = parsedResult['screenshot_description'] as String?; + final imageSize = parsedResult['image_size']; + + if (xCoord != null && yCoord != null) { + return DetectionResult( + status: "success", + x: xCoord is int ? xCoord : int.parse(xCoord.toString()), + y: yCoord is int ? yCoord : int.parse(yCoord.toString()), + screenshotDescription: screenshotDesc ?? elementDescription, + confidence: confidence is double + ? confidence + : (confidence != null + ? double.parse(confidence.toString()) + : 0.9), + imageSize: imageSize != null + ? { + 'width': imageSize['width'] as int, + 'height': imageSize['height'] as int, + } + : null, + ); + } else { + return DetectionResult( + status: "error", + errorMessage: + "Element not found by Holo3 API or coordinates are null", + x: null, + y: null, + screenshotDescription: screenshotDesc, + confidence: 0.0, + imageSize: imageSize != null + ? 
{ + 'width': imageSize['width'] as int, + 'height': imageSize['height'] as int, + } + : null, + ); + } + } catch (jsonError) { + return DetectionResult( + status: "error", + errorMessage: "Holo3 API returned invalid JSON: $content", + x: null, + y: null, + confidence: 0.0, + ); + } + } catch (e) { + return DetectionResult( + status: "error", + errorMessage: "Failed to detect element position using Holo3: ${e.toString()}", + x: null, + y: null, + confidence: 0.0, + ); + } + } +} diff --git a/lib/services/vision_service.dart b/lib/services/vision_service.dart index edebb9f..286c80d 100644 --- a/lib/services/vision_service.dart +++ b/lib/services/vision_service.dart @@ -4,25 +4,33 @@ import 'package:http/http.dart' as http; import '../models/detection_result.dart'; import '../config/app_config.dart'; import 'config_service.dart'; +import 'holo3_vision_service.dart'; -/// Element Position Detection Service using OpenRouter Vision API +/// Element Position Detection Service using Vision API +/// Supports both OpenRouter and HCompany Holo3 backends class VisionService { - static const String _openRouterApiUrl = - "https://openrouter.ai/api/v1/chat/completions"; - /// Detects the pixel coordinates of UI elements in a screenshot. - /// Uses OpenRouter. + /// Automatically selects the appropriate backend based on configured vision model. static Future detectElementPosition(Uint8List imageBytes, String elementDescription, ConfigService? config) async { + final visionModel = config?.visionModel ?? AppConfig.visionModel; + + // Use HCompany Holo3 directly for holo3 models + if (visionModel.contains('hcompany') || visionModel.contains('holo3')) { + return Holo3VisionService.detectElementPosition( + imageBytes, elementDescription, config); + } + + // Fallback to OpenRouter for other models final openRouterKey = config?.openRouterApiKey ?? 
AppConfig.openRouterApiKey; - - return _detectWithOpenRouter(imageBytes, elementDescription, openRouterKey); + return _detectWithOpenRouter( + imageBytes, elementDescription, openRouterKey, visionModel); } /// Detects the pixel coordinates of UI elements using OpenRouter Vision API. static Future _detectWithOpenRouter( - Uint8List imageBytes, String elementDescription, String apiKey) async { + Uint8List imageBytes, String elementDescription, String apiKey, String visionModel) async { try { final base64Image = base64Encode(imageBytes); @@ -46,7 +54,7 @@ Return your response in this exact JSON format: }'''; final requestBody = { - "model": "google/gemini-3-pro-preview", + "model": visionModel, "messages": [ { "role": "user", @@ -68,7 +76,7 @@ Return your response in this exact JSON format: // Make API request final response = await http.post( - Uri.parse(_openRouterApiUrl), + Uri.parse("https://openrouter.ai/api/v1/chat/completions"), headers: { 'Content-Type': 'application/json', 'Authorization': 'Bearer $apiKey',