agi-inc · JacobFV · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026
diff --git a/MULTIMODAL_UPDATES.md b/MULTIMODAL_UPDATES.md
@@ -0,0 +1,128 @@
+# Multimodal Support - AGI CLI Updates
+
+This update adds comprehensive multimodal support to the AGI CLI.
+
+## New Features
+
+### Voice Mode (`--voice`)
+- Audio input from microphone
+- Automatic turn detection
+- Text-to-speech output
+- Requires: `OPENAI_API_KEY` environment variable
+
+### Camera Mode (`--camera`)
+- Webcam video feed
+- 30-second rolling buffer
+- Agent can see you
+
+### Screen Mode (`--screen`)
+- Screen recording
+- 30-second rolling buffer
+- Agent can see your screen
+
+### MCP Support (`--mcp`)
+- Load MCP servers from config
+- Default config: `~/.agi/mcp.json`
+- Custom config: `--mcp-config /path/to/mcp.json`
+
+## Usage Examples
+
+### Voice Mode
+```bash
+agi --voice "What's the current time?"
+```
+
+### Voice + Screen
+```bash
+agi --voice --screen "What's on my screen?"
+```
+
+### Full Multimodal
+```bash
+agi --voice --camera --screen "Can you see me and my screen?"
+```
+
+### MCP Servers
+```bash
+# Set up MCP config
+mkdir -p ~/.agi
+cat > ~/.agi/mcp.json << 'EOF'
+{
+  "filesystem": {
+    "command": "npx",
+    "args": ["-y", "@modelcontextprotocol/server-filesystem", "/Users/you/Documents"]
+  }
+}
+EOF
+
+# Use MCP
+agi --mcp "List my documents"
+```
+
+### Everything Combined
+```bash
+agi --voice --camera --screen --mcp "Help me with my work"
+```
+
+## Configuration
+
+### Environment Variables
+- `AGI_API_KEY`: Your AGI API key (required)
+- `OPENAI_API_KEY`: OpenAI key for voice features (required for --voice)
+
+### MCP Config Format
+```json
+{
+  "server-name": {
+    "command": "executable",
+    "args": ["arg1", "arg2"],
+    "env": {
+      "ENV_VAR": "value"
+    }
+  }
+}
+```
+
+## CLI Options
+
+| Option | Description |
+|--------|-------------|
+| `--voice` | Enable voice input/output (requires `OPENAI_API_KEY`) |
+| `--camera` | Enable camera video |
+| `--screen` | Enable screen recording |
+| `--mcp` | Load MCP servers from config |
+| `--mcp-config PATH` | Path to MCP config file (default: `~/.agi/mcp.json`) |
+| `-m, --model` | Model to use (default: claude-sonnet) |
+| `-v, --verbose` | Show agent thinking |
+| `--no-confirm` | Auto-approve confirmations |
+
+## Implementation
+
+Changes made:
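+- Updated `src/cli.ts` to add the multimodal options (`--voice`, `--camera`, `--screen`, `--mcp`, `--mcp-config`) and an `--api-url` option
+- Updated `src/app/App.tsx` and `src/hooks/useAgent.ts` to pass the multimodal config and API URL through to the driver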
+- Added UI components for multimodal events
+- Updated examples in help text
+
+## Testing
+
+```bash
+# Install dependencies
+npm install
+
+# Build
+npm run build
+
+# Test voice mode
+agi --voice "Hello"
+
+# Test full multimodal
+agi --voice --camera --screen --mcp "What do you see?"
+```
+
+## Related PRs
+
+- agi-api (driver): https://github.com/agi-inc/agents/pull/344
+- agi-python: https://github.com/agi-inc/agi-python/pull/8
+- agi-node: https://github.com/agi-inc/agi-node/pull/11
+- agi-csharp: https://github.com/agi-inc/agi-csharp/pull/8
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@agi_inc/cli",
-  "version": "0.5.14",
+  "version": "0.5.15",
   "description": "Terminal-based agent interaction for AGI desktop automation",
   "main": "./dist/index.js",
   "bin": {
@@ -48,7 +48,7 @@
     "LICENSE"
   ],
   "dependencies": {
-    "@agi_inc/agi-js": "^0.4.2",
+    "@agi_inc/agi-js": "^0.5.0",
     "ink": "^5.0.1",
     "ink-spinner": "^5.0.0",
     "ink-text-input": "^6.0.0",

diff --git a/src/app/App.tsx b/src/app/App.tsx
@@ -91,8 +91,14 @@ export const App: React.FC<AppProps> = ({ args }) => {
     clearEvents,
   } = useAgent({
     model,
+    apiUrl: args.apiUrl,
     verbose,
     noConfirm,
+    voice: args.voice,
+    camera: args.camera,
+    screen: args.screen,
+    mcp: args.mcp,
+    mcpConfig: args.mcpConfig,
     onFinished: (result) => {
       // Record in history
       if (currentGoal) {

diff --git a/src/cli.ts b/src/cli.ts
@@ -13,6 +13,13 @@ export interface CliArgs {
   model: string;
   verbose: boolean;
   noConfirm: boolean;
+  apiUrl?: string;
+  // Multimodal features
+  voice: boolean;
+  camera: boolean;
+  screen: boolean;
+  mcp: boolean;
+  mcpConfig: string;
 }
 
 export type ParseResult =
@@ -60,13 +67,45 @@ export async function parseArgs(): Promise<ParseResult> {
       type: 'boolean',
       default: false,
     })
+    .option('voice', {
+      describe: 'Enable voice input/output (requires OPENAI_API_KEY)',
+      type: 'boolean',
+      default: false,
+    })
+    .option('camera', {
+      describe: 'Enable camera video feed',
+      type: 'boolean',
+      default: false,
+    })
+    .option('screen', {
+      describe: 'Enable screen recording',
+      type: 'boolean',
+      default: false,
+    })
+    .option('mcp', {
+      describe: 'Load MCP servers from config',
+      type: 'boolean',
+      default: false,
+    })
+    .option('mcp-config', {
+      describe: 'Path to MCP config file',
+      type: 'string',
+      default: '~/.agi/mcp.json',
+    })
+    .option('api-url', {
+      describe: 'AGI API endpoint URL (defaults to production)',
+      type: 'string',
+    })
     .help()
     .alias('help', 'h')
     .version()
     .alias('version', 'V')
     .example('$0 "Open calculator and compute 2+2"', 'Basic task')
     .example('$0 "Find flights from SFO to JFK" --model claude-opus', 'Use a specific model')
     .example('$0 "Install Node.js" --verbose', 'Verbose output')
+    .example('$0 --voice "What\'s on my screen?"', 'Voice mode')
+    .example('$0 --voice --camera --screen "Show me what you see"', 'Full multimodal')
+    .example('$0 --mcp "List my documents"', 'With MCP servers')
     .example('$0', 'Start interactive mode')
     .example('$0 login', 'Authenticate with your API key')
     .example('$0 update', 'Update to the latest version')
@@ -85,6 +124,12 @@ export async function parseArgs(): Promise<ParseResult> {
       model: argv.model as string,
       verbose: argv.verbose as boolean,
       noConfirm: argv['no-confirm'] as boolean,
+      apiUrl: argv['api-url'] as string | undefined,
+      voice: argv.voice as boolean,
+      camera: argv.camera as boolean,
+      screen: argv.screen as boolean,
+      mcp: argv.mcp as boolean,
+      mcpConfig: argv['mcp-config'] as string,
     },
   };
 }
diff --git a/src/commands/slash.ts b/src/commands/slash.ts
@@ -5,8 +5,8 @@
  * and skill-based commands from .agi/skills/.
  */
 
-import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
-import { basename, join } from 'node:path';
+import { readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { basename } from 'node:path';
 import os from 'node:os';
 import { isBinaryAvailable } from '@agi_inc/agi-js';
 import { loadHistory, formatHistory, clearHistory } from '../history.js';
@@ -293,7 +293,6 @@ const builtinCommands: SlashCommand[] = [
       const lines = ['', '  System health:', ''];
       for (const check of checks) {
         const icon = check.ok ? '\u2714' : '\u2718';
-        const color = check.ok ? '' : ''; // Colors handled by Ink, here just text
         lines.push(`  ${icon} ${check.label.padEnd(22)} ${check.detail}`);
       }
 

diff --git a/src/hooks/useAgent.ts b/src/hooks/useAgent.ts
@@ -10,6 +10,11 @@ interface UseAgentOptions {
   apiUrl?: string;
   verbose: boolean;
   noConfirm: boolean;
+  voice: boolean;
+  camera: boolean;
+  screen: boolean;
+  mcp: boolean;
+  mcpConfig: string;
   onFinished?: (result: DriverResult) => void;
 }
 
@@ -38,7 +43,7 @@ interface UseAgentReturn {
 }
 
 export function useAgent(options: UseAgentOptions): UseAgentReturn {
-  const { model, agentName, apiUrl, verbose, noConfirm, onFinished } = options;
+  const { model, agentName, apiUrl, verbose, noConfirm, voice, camera, screen, mcp, mcpConfig, onFinished } = options;
 
   const [state, setState] = useState<DriverState>('idle');
   const [step, setStep] = useState(0);
@@ -89,7 +94,17 @@ export function useAgent(options: UseAgentOptions): UseAgentReturn {
         return prev;
       });
 
-      const driver = new AgentDriver({ model, mode: 'local', agentName, apiUrl });
+      const driver = new AgentDriver({
+        model,
+        mode: 'local',
+        agentName,
+        apiUrl,
+        voice,
+        camera,
+        screen,
+        mcp,
+        mcpConfig,
+      });
       driverRef.current = driver;
       taskStart.current = Date.now();
 
@@ -169,7 +184,7 @@ export function useAgent(options: UseAgentOptions): UseAgentReturn {
         addEvent({ type: 'error', message: String(error) });
       }
     },
-    [model, agentName, apiUrl, verbose, noConfirm, addEvent]
+    [model, agentName, apiUrl, verbose, noConfirm, voice, camera, screen, mcp, mcpConfig, addEvent]
   );
 
   const stop = useCallback(async () => {