diff --git a/MULTIMODAL_UPDATES.md b/MULTIMODAL_UPDATES.md new file mode 100644 index 0000000..edebed1 --- /dev/null +++ b/MULTIMODAL_UPDATES.md @@ -0,0 +1,128 @@ +# Multimodal Support - AGI CLI Updates + +This update adds comprehensive multimodal support to the AGI CLI. + +## New Features + +### Voice Mode (`--voice`) +- Audio input from microphone +- Automatic turn detection +- Text-to-speech output +- Requires: `OPENAI_API_KEY` environment variable + +### Camera Mode (`--camera`) +- Webcam video feed +- 30-second rolling buffer +- Agent can see you + +### Screen Mode (`--screen`) +- Screen recording +- 30-second rolling buffer +- Agent can see your screen + +### MCP Support (`--mcp`) +- Load MCP servers from config +- Default config: `~/.agi/mcp.json` +- Custom config: `--mcp-config /path/to/mcp.json` + +## Usage Examples + +### Voice Mode +```bash +agi --voice "What's the current time?" +``` + +### Voice + Screen +```bash +agi --voice --screen "What's on my screen?" +``` + +### Full Multimodal +```bash +agi --voice --camera --screen "Can you see me and my screen?" 
+``` + +### MCP Servers +```bash +# Set up MCP config +mkdir -p ~/.agi +cat > ~/.agi/mcp.json << 'EOF' +{ + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/Users/you/Documents"] + } +} +EOF + +# Use MCP +agi --mcp "List my documents" +``` + +### Everything Combined +```bash +agi --voice --camera --screen --mcp "Help me with my work" +``` + +## Configuration + +### Environment Variables +- `AGI_API_KEY`: Your AGI API key (required) +- `OPENAI_API_KEY`: OpenAI key for voice features (required for `--voice`) + +### MCP Config Format +```json +{ + "server-name": { + "command": "executable", + "args": ["arg1", "arg2"], + "env": { + "ENV_VAR": "value" + } + } +} +``` + +## CLI Options + +| Option | Description | +|--------|-------------| +| `--voice` | Enable voice input/output (requires `OPENAI_API_KEY`) | +| `--camera` | Enable camera video feed | +| `--screen` | Enable screen recording | +| `--mcp` | Load MCP servers from config | +| `--mcp-config PATH` | Custom MCP config path (default: `~/.agi/mcp.json`) | +| `-m, --model` | Model to use (default: claude-sonnet) | +| `-v, --verbose` | Show agent thinking | +| `--no-confirm` | Auto-approve confirmations | + +## Implementation + +Changes made: +- Updated `src/cli.ts` to add multimodal options +- Updated `src/hooks/useAgent.ts` to pass multimodal config to driver +- Added UI components for multimodal events +- Updated examples in help text + +## Testing + +```bash +# Install dependencies +npm install + +# Build +npm run build + +# Test voice mode +agi --voice "Hello" + +# Test full multimodal +agi --voice --camera --screen --mcp "What do you see?" 
+``` + +## Related PRs + +- agi-api (driver): https://github.com/agi-inc/agents/pull/344 +- agi-python: https://github.com/agi-inc/agi-python/pull/8 +- agi-node: https://github.com/agi-inc/agi-node/pull/11 +- agi-csharp: https://github.com/agi-inc/agi-csharp/pull/8 diff --git a/package.json b/package.json index 32c169e..79e0b29 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@agi_inc/cli", - "version": "0.5.14", + "version": "0.5.15", "description": "Terminal-based agent interaction for AGI desktop automation", "main": "./dist/index.js", "bin": { @@ -48,7 +48,7 @@ "LICENSE" ], "dependencies": { - "@agi_inc/agi-js": "^0.4.2", + "@agi_inc/agi-js": "^0.5.0", "ink": "^5.0.1", "ink-spinner": "^5.0.0", "ink-text-input": "^6.0.0", diff --git a/src/app/App.tsx b/src/app/App.tsx index e9adf78..3398aea 100644 --- a/src/app/App.tsx +++ b/src/app/App.tsx @@ -91,8 +91,14 @@ export const App: React.FC = ({ args }) => { clearEvents, } = useAgent({ model, + apiUrl: args.apiUrl, verbose, noConfirm, + voice: args.voice, + camera: args.camera, + screen: args.screen, + mcp: args.mcp, + mcpConfig: args.mcpConfig, onFinished: (result) => { // Record in history if (currentGoal) { diff --git a/src/cli.ts b/src/cli.ts index 5ed8ee7..272290c 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -13,6 +13,13 @@ export interface CliArgs { model: string; verbose: boolean; noConfirm: boolean; + apiUrl?: string; + // Multimodal features + voice: boolean; + camera: boolean; + screen: boolean; + mcp: boolean; + mcpConfig: string; } export type ParseResult = @@ -60,6 +67,35 @@ export async function parseArgs(): Promise { type: 'boolean', default: false, }) + .option('voice', { + describe: 'Enable voice input/output (requires OPENAI_API_KEY)', + type: 'boolean', + default: false, + }) + .option('camera', { + describe: 'Enable camera video feed', + type: 'boolean', + default: false, + }) + .option('screen', { + describe: 'Enable screen recording', + type: 'boolean', + default: false, + 
}) + .option('mcp', { + describe: 'Load MCP servers from config', + type: 'boolean', + default: false, + }) + .option('mcp-config', { + describe: 'Path to MCP config file', + type: 'string', + default: '~/.agi/mcp.json', + }) + .option('api-url', { + describe: 'AGI API endpoint URL (defaults to production)', + type: 'string', + }) .help() .alias('help', 'h') .version() @@ -67,6 +103,9 @@ export async function parseArgs(): Promise { .example('$0 "Open calculator and compute 2+2"', 'Basic task') .example('$0 "Find flights from SFO to JFK" --model claude-opus', 'Use a specific model') .example('$0 "Install Node.js" --verbose', 'Verbose output') + .example('$0 --voice "What\'s on my screen?"', 'Voice mode') + .example('$0 --voice --camera --screen "Show me what you see"', 'Full multimodal') + .example('$0 --mcp "List my documents"', 'With MCP servers') .example('$0', 'Start interactive mode') .example('$0 login', 'Authenticate with your API key') .example('$0 update', 'Update to the latest version') @@ -85,6 +124,12 @@ export async function parseArgs(): Promise { model: argv.model as string, verbose: argv.verbose as boolean, noConfirm: argv['no-confirm'] as boolean, + apiUrl: argv['api-url'] as string | undefined, + voice: argv.voice as boolean, + camera: argv.camera as boolean, + screen: argv.screen as boolean, + mcp: argv.mcp as boolean, + mcpConfig: argv['mcp-config'] as string, }, }; } diff --git a/src/commands/slash.ts b/src/commands/slash.ts index 23a050a..8343559 100644 --- a/src/commands/slash.ts +++ b/src/commands/slash.ts @@ -5,8 +5,8 @@ * and skill-based commands from .agi/skills/. 
*/ -import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs'; -import { basename, join } from 'node:path'; +import { readFileSync, writeFileSync, existsSync } from 'node:fs'; +import { basename } from 'node:path'; import os from 'node:os'; import { isBinaryAvailable } from '@agi_inc/agi-js'; import { loadHistory, formatHistory, clearHistory } from '../history.js'; @@ -293,7 +293,6 @@ const builtinCommands: SlashCommand[] = [ const lines = ['', ' System health:', '']; for (const check of checks) { const icon = check.ok ? '\u2714' : '\u2718'; - const color = check.ok ? '' : ''; // Colors handled by Ink, here just text lines.push(` ${icon} ${check.label.padEnd(22)} ${check.detail}`); } diff --git a/src/hooks/useAgent.ts b/src/hooks/useAgent.ts index 39bce35..2c2b62a 100644 --- a/src/hooks/useAgent.ts +++ b/src/hooks/useAgent.ts @@ -10,6 +10,11 @@ interface UseAgentOptions { apiUrl?: string; verbose: boolean; noConfirm: boolean; + voice: boolean; + camera: boolean; + screen: boolean; + mcp: boolean; + mcpConfig: string; onFinished?: (result: DriverResult) => void; } @@ -38,7 +43,7 @@ interface UseAgentReturn { } export function useAgent(options: UseAgentOptions): UseAgentReturn { - const { model, agentName, apiUrl, verbose, noConfirm, onFinished } = options; + const { model, agentName, apiUrl, verbose, noConfirm, voice, camera, screen, mcp, mcpConfig, onFinished } = options; const [state, setState] = useState('idle'); const [step, setStep] = useState(0); @@ -89,7 +94,17 @@ export function useAgent(options: UseAgentOptions): UseAgentReturn { return prev; }); - const driver = new AgentDriver({ model, mode: 'local', agentName, apiUrl }); + const driver = new AgentDriver({ + model, + mode: 'local', + agentName, + apiUrl, + voice, + camera, + screen, + mcp, + mcpConfig, + }); driverRef.current = driver; taskStart.current = Date.now(); @@ -169,7 +184,7 @@ export function useAgent(options: UseAgentOptions): UseAgentReturn { addEvent({ type: 'error', 
message: String(error) }); } }, - [model, agentName, apiUrl, verbose, noConfirm, addEvent] + [model, agentName, apiUrl, verbose, noConfirm, voice, camera, screen, mcp, mcpConfig, addEvent] ); const stop = useCallback(async () => {