Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions MULTIMODAL_UPDATES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Multimodal Support - AGI CLI Updates

This update adds multimodal support to the AGI CLI: voice input/output, camera and screen capture, and loading of MCP servers from a config file.

## New Features

### Voice Mode (`--voice`)
- Audio input from microphone
- Automatic turn detection
- Text-to-speech output
- Requires: `OPENAI_API_KEY` environment variable

### Camera Mode (`--camera`)
- Webcam video feed
- 30-second rolling buffer
- Agent can see you

### Screen Mode (`--screen`)
- Screen recording
- 30-second rolling buffer
- Agent can see your screen

### MCP Support (`--mcp`)
- Load MCP servers from config
- Default config: `~/.agi/mcp.json`
- Custom config: `--mcp-config /path/to/mcp.json`

## Usage Examples

### Voice Mode
```bash
agi --voice "What's the current time?"
```

### Voice + Screen
```bash
agi --voice --screen "What's on my screen?"
```

### Full Multimodal
```bash
agi --voice --camera --screen "Can you see me and my screen?"
```

### MCP Servers
```bash
# Set up MCP config
mkdir -p ~/.agi
cat > ~/.agi/mcp.json << 'EOF'
{
"filesystem": {
"command": "npx",
"args": ["-y", "@modelcontextprotocol/server-filesystem", "/Users/you/Documents"]
}
}
EOF

# Use MCP
agi --mcp "List my documents"
```

### Everything Combined
```bash
agi --voice --camera --screen --mcp "Help me with my work"
```

## Configuration

### Environment Variables
- `AGI_API_KEY`: Your AGI API key (required)
- `OPENAI_API_KEY`: OpenAI API key for voice features (required only when using `--voice`)

### MCP Config Format
```json
{
"server-name": {
"command": "executable",
"args": ["arg1", "arg2"],
"env": {
"ENV_VAR": "value"
}
}
}
```

## CLI Options

| Option | Description |
|--------|-------------|
| `--voice` | Enable voice input/output |
| `--camera` | Enable camera video |
| `--screen` | Enable screen recording |
| `--mcp` | Load MCP servers from config |
| `--mcp-config PATH` | Path to the MCP config file (default: `~/.agi/mcp.json`) |
| `-m, --model` | Model to use (default: claude-sonnet) |
| `-v, --verbose` | Show agent thinking |
| `--no-confirm` | Auto-approve confirmations |
| `--api-url URL` | AGI API endpoint URL (defaults to production) |

## Implementation

Changes made:
- Updated `src/cli.ts` to add multimodal options
- Updated `src/app/App.tsx` to forward the multimodal CLI flags to the `useAgent` hook
- Updated `src/hooks/useAgent.ts` to pass multimodal config to driver
- Added UI components for multimodal events
- Updated examples in help text

## Testing

```bash
# Install dependencies
npm install

# Build
npm run build

# Test voice mode
agi --voice "Hello"

# Test full multimodal
agi --voice --camera --screen --mcp "What do you see?"
```

## Related PRs

- agi-api (driver): https://github.com/agi-inc/agents/pull/344
- agi-python: https://github.com/agi-inc/agi-python/pull/8
- agi-node: https://github.com/agi-inc/agi-node/pull/11
- agi-csharp: https://github.com/agi-inc/agi-csharp/pull/8
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@agi_inc/cli",
"version": "0.5.14",
"version": "0.5.15",
"description": "Terminal-based agent interaction for AGI desktop automation",
"main": "./dist/index.js",
"bin": {
Expand Down Expand Up @@ -48,7 +48,7 @@
"LICENSE"
],
"dependencies": {
"@agi_inc/agi-js": "^0.4.2",
"@agi_inc/agi-js": "^0.5.0",
"ink": "^5.0.1",
"ink-spinner": "^5.0.0",
"ink-text-input": "^6.0.0",
Expand Down
6 changes: 6 additions & 0 deletions src/app/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,14 @@ export const App: React.FC<AppProps> = ({ args }) => {
clearEvents,
} = useAgent({
model,
apiUrl: args.apiUrl,
verbose,
noConfirm,
voice: args.voice,
camera: args.camera,
screen: args.screen,
mcp: args.mcp,
mcpConfig: args.mcpConfig,
onFinished: (result) => {
// Record in history
if (currentGoal) {
Expand Down
45 changes: 45 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ export interface CliArgs {
model: string;
verbose: boolean;
noConfirm: boolean;
apiUrl?: string;
// Multimodal features
voice: boolean;
camera: boolean;
screen: boolean;
mcp: boolean;
mcpConfig: string;
}

export type ParseResult =
Expand Down Expand Up @@ -60,13 +67,45 @@ export async function parseArgs(): Promise<ParseResult> {
type: 'boolean',
default: false,
})
.option('voice', {
describe: 'Enable voice input/output (requires OPENAI_API_KEY)',
type: 'boolean',
default: false,
})
.option('camera', {
describe: 'Enable camera video feed',
type: 'boolean',
default: false,
})
.option('screen', {
describe: 'Enable screen recording',
type: 'boolean',
default: false,
})
.option('mcp', {
describe: 'Load MCP servers from config',
type: 'boolean',
default: false,
})
.option('mcp-config', {
describe: 'Path to MCP config file',
type: 'string',
default: '~/.agi/mcp.json',
})
.option('api-url', {
describe: 'AGI API endpoint URL (defaults to production)',
type: 'string',
})
.help()
.alias('help', 'h')
.version()
.alias('version', 'V')
.example('$0 "Open calculator and compute 2+2"', 'Basic task')
.example('$0 "Find flights from SFO to JFK" --model claude-opus', 'Use a specific model')
.example('$0 "Install Node.js" --verbose', 'Verbose output')
.example('$0 --voice "What\'s on my screen?"', 'Voice mode')
.example('$0 --voice --camera --screen "Show me what you see"', 'Full multimodal')
.example('$0 --mcp "List my documents"', 'With MCP servers')
.example('$0', 'Start interactive mode')
.example('$0 login', 'Authenticate with your API key')
.example('$0 update', 'Update to the latest version')
Expand All @@ -85,6 +124,12 @@ export async function parseArgs(): Promise<ParseResult> {
model: argv.model as string,
verbose: argv.verbose as boolean,
noConfirm: argv['no-confirm'] as boolean,
apiUrl: argv['api-url'] as string | undefined,
voice: argv.voice as boolean,
camera: argv.camera as boolean,
screen: argv.screen as boolean,
mcp: argv.mcp as boolean,
mcpConfig: argv['mcp-config'] as string,
},
};
}
5 changes: 2 additions & 3 deletions src/commands/slash.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
* and skill-based commands from .agi/skills/.
*/

import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
import { basename, join } from 'node:path';
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { basename } from 'node:path';
import os from 'node:os';
import { isBinaryAvailable } from '@agi_inc/agi-js';
import { loadHistory, formatHistory, clearHistory } from '../history.js';
Expand Down Expand Up @@ -293,7 +293,6 @@ const builtinCommands: SlashCommand[] = [
const lines = ['', ' System health:', ''];
for (const check of checks) {
const icon = check.ok ? '\u2714' : '\u2718';
const color = check.ok ? '' : ''; // Colors handled by Ink, here just text
lines.push(` ${icon} ${check.label.padEnd(22)} ${check.detail}`);
}

Expand Down
21 changes: 18 additions & 3 deletions src/hooks/useAgent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ interface UseAgentOptions {
apiUrl?: string;
verbose: boolean;
noConfirm: boolean;
voice: boolean;
camera: boolean;
screen: boolean;
mcp: boolean;
mcpConfig: string;
onFinished?: (result: DriverResult) => void;
}

Expand Down Expand Up @@ -38,7 +43,7 @@ interface UseAgentReturn {
}

export function useAgent(options: UseAgentOptions): UseAgentReturn {
const { model, agentName, apiUrl, verbose, noConfirm, onFinished } = options;
const { model, agentName, apiUrl, verbose, noConfirm, voice, camera, screen, mcp, mcpConfig, onFinished } = options;

const [state, setState] = useState<DriverState>('idle');
const [step, setStep] = useState(0);
Expand Down Expand Up @@ -89,7 +94,17 @@ export function useAgent(options: UseAgentOptions): UseAgentReturn {
return prev;
});

const driver = new AgentDriver({ model, mode: 'local', agentName, apiUrl });
const driver = new AgentDriver({
model,
mode: 'local',
agentName,
apiUrl,
voice,
camera,
screen,
mcp,
mcpConfig,
});
driverRef.current = driver;
taskStart.current = Date.now();

Expand Down Expand Up @@ -169,7 +184,7 @@ export function useAgent(options: UseAgentOptions): UseAgentReturn {
addEvent({ type: 'error', message: String(error) });
}
},
[model, agentName, apiUrl, verbose, noConfirm, addEvent]
[model, agentName, apiUrl, verbose, noConfirm, voice, camera, screen, mcp, mcpConfig, addEvent]
);

const stop = useCallback(async () => {
Expand Down
Loading