const LLM_BASE_URL = process.env.AEGIS_LLM_BASE_URL || '';
const VLM_API_TYPE = process.env.AEGIS_VLM_API_TYPE || 'openai-compatible';
const VLM_MODEL = process.env.AEGIS_VLM_MODEL || '';

// ─── OpenAI SDK Clients ──────────────────────────────────────────────────────
const OpenAI = require('openai');

/**
 * Normalize a base URL for `/v1` appending: drop any trailing slashes and a
 * trailing `/v1` segment, so callers can append `/v1` exactly once.
 *
 * Fix over the previous version: the old regex only matched an explicit
 * `/v1` or `/v1/` suffix, so a URL with a bare trailing slash
 * (e.g. `https://host/`) produced a malformed `https://host//v1`.
 *
 * @param {string} u - Raw base URL from the environment.
 * @returns {string} URL without trailing slashes or a `/v1` suffix.
 */
const strip = (u) => u.replace(/\/+$/, '').replace(/\/v1$/, '');

// Resolve LLM base URL — priority: cloud provider → direct llama-server → gateway
const llmBaseUrl = LLM_BASE_URL
  ? `${strip(LLM_BASE_URL)}/v1`
  : LLM_URL
    ? `${strip(LLM_URL)}/v1`
    : `${GATEWAY_URL}/v1`;

// Shared LLM client; used for both chat calls and the startup healthcheck.
const llmClient = new OpenAI({
  apiKey: LLM_API_KEY || 'not-needed', // Local servers don't require auth
  baseURL: llmBaseUrl,
});

// VLM client — always local llama-server; null when no VLM is configured,
// so callers must check before use.
const vlmClient = VLM_URL
  ? new OpenAI({
      apiKey: 'not-needed',
      baseURL: `${strip(VLM_URL)}/v1`,
    })
  : null;
100122// ─── Skill Protocol: JSON lines on stdout, human text on stderr ──────────────
101123
102124/**
@@ -136,110 +158,70 @@ const results = {
136158} ;
137159
138160async function llmCall ( messages , opts = { } ) {
139- const body = { messages, stream : true } ;
140- if ( opts . model || LLM_MODEL ) body . model = opts . model || LLM_MODEL ;
141- if ( opts . maxTokens ) body . max_tokens = opts . maxTokens ;
142- if ( opts . temperature !== undefined ) body . temperature = opts . temperature ;
143- if ( opts . tools ) body . tools = opts . tools ;
144-
145- // Resolve LLM endpoint — priority:
146- // 1. Cloud provider base URL (e.g. https://api.openai.com/v1) when set via UI
147- // 2. Direct llama-server URL (port 5411) for builtin local models
148- // 3. Gateway (port 5407) as final fallback
149- const strip = ( u ) => u . replace ( / \/ v 1 \/ ? $ / , '' ) ;
150- let url ;
151- if ( opts . vlm ) {
152- const vlmBase = VLM_URL ? strip ( VLM_URL ) : '' ;
153- url = `${ vlmBase } /v1/chat/completions` ;
154- } else if ( LLM_BASE_URL ) {
155- url = `${ strip ( LLM_BASE_URL ) } /chat/completions` ;
156- } else if ( LLM_URL ) {
157- url = `${ strip ( LLM_URL ) } /v1/chat/completions` ;
158- } else {
159- url = `${ GATEWAY_URL } /v1/chat/completions` ;
161+ // Select the appropriate OpenAI client (LLM or VLM)
162+ const client = opts . vlm ? vlmClient : llmClient ;
163+ if ( ! client ) {
164+ throw new Error ( opts . vlm ? 'VLM client not configured' : 'LLM client not configured' ) ;
160165 }
161166
162- // Build headers — include API key if available (for direct cloud provider access)
163- const headers = { 'Content-Type' : 'application/json' } ;
164- if ( LLM_API_KEY && ! opts . vlm ) headers [ 'Authorization' ] = `Bearer ${ LLM_API_KEY } ` ;
167+ const model = opts . model || ( opts . vlm ? VLM_MODEL : LLM_MODEL ) || undefined ;
168+
169+ // Build request params
170+ const params = {
171+ messages,
172+ stream : true ,
173+ ...( model && { model } ) ,
174+ ...( opts . temperature !== undefined && { temperature : opts . temperature } ) ,
175+ ...( opts . maxTokens && { max_completion_tokens : opts . maxTokens } ) ,
176+ ...( opts . tools && { tools : opts . tools } ) ,
177+ } ;
165178
166- // Use an AbortController with idle timeout that resets on each SSE chunk.
167- // This way long inferences that stream tokens succeed, but requests
168- // stuck with no output for IDLE_TIMEOUT_MS still abort.
179+ // Use an AbortController with idle timeout that resets on each streamed chunk.
169180 const controller = new AbortController ( ) ;
170181 const idleMs = opts . timeout || IDLE_TIMEOUT_MS ;
171182 let idleTimer = setTimeout ( ( ) => controller . abort ( ) , idleMs ) ;
172183 const resetIdle = ( ) => { clearTimeout ( idleTimer ) ; idleTimer = setTimeout ( ( ) => controller . abort ( ) , idleMs ) ; } ;
173184
174185 try {
175- const response = await fetch ( url , {
176- method : 'POST' ,
177- headers,
178- body : JSON . stringify ( body ) ,
186+ const stream = await client . chat . completions . create ( params , {
179187 signal : controller . signal ,
180188 } ) ;
181189
182- if ( ! response . ok ) {
183- const errBody = await response . text ( ) . catch ( ( ) => '' ) ;
184- throw new Error ( `HTTP ${ response . status } : ${ errBody . slice ( 0 , 200 ) } ` ) ;
185- }
186-
187- // Parse SSE stream
188190 let content = '' ;
189191 let reasoningContent = '' ;
190192 let toolCalls = null ;
191193 let model = '' ;
192194 let usage = { } ;
193- let finishReason = '' ;
194195 let tokenCount = 0 ;
195196
196- const reader = response . body ;
197- const decoder = new TextDecoder ( ) ;
198- let buffer = '' ;
199-
200- for await ( const chunk of reader ) {
197+ for await ( const chunk of stream ) {
201198 resetIdle ( ) ;
202- buffer += decoder . decode ( chunk , { stream : true } ) ;
203-
204- // Process complete SSE lines
205- const lines = buffer . split ( '\n' ) ;
206- buffer = lines . pop ( ) ; // Keep incomplete line in buffer
207-
208- for ( const line of lines ) {
209- if ( ! line . startsWith ( 'data: ' ) ) continue ;
210- const payload = line . slice ( 6 ) . trim ( ) ;
211- if ( payload === '[DONE]' ) continue ;
212-
213- try {
214- const evt = JSON . parse ( payload ) ;
215- if ( evt . model ) model = evt . model ;
216-
217- const delta = evt . choices ?. [ 0 ] ?. delta ;
218- if ( delta ?. content ) content += delta . content ;
219- if ( delta ?. reasoning_content ) reasoningContent += delta . reasoning_content ;
220- if ( delta ?. content || delta ?. reasoning_content ) {
221- tokenCount ++ ;
222- // Log progress every 100 tokens so the console isn't silent
223- if ( tokenCount % 100 === 0 ) {
224- log ( ` … ${ tokenCount } tokens received` ) ;
225- }
226- }
227- if ( delta ?. tool_calls ) {
228- // Accumulate streamed tool calls
229- if ( ! toolCalls ) toolCalls = [ ] ;
230- for ( const tc of delta . tool_calls ) {
231- const idx = tc . index ?? 0 ;
232- if ( ! toolCalls [ idx ] ) {
233- toolCalls [ idx ] = { id : tc . id , type : tc . type || 'function' , function : { name : '' , arguments : '' } } ;
234- }
235- if ( tc . function ?. name ) toolCalls [ idx ] . function . name += tc . function . name ;
236- if ( tc . function ?. arguments ) toolCalls [ idx ] . function . arguments += tc . function . arguments ;
237- }
199+
200+ if ( chunk . model ) model = chunk . model ;
201+
202+ const delta = chunk . choices ?. [ 0 ] ?. delta ;
203+ if ( delta ?. content ) content += delta . content ;
204+ if ( delta ?. reasoning_content ) reasoningContent += delta . reasoning_content ;
205+ if ( delta ?. content || delta ?. reasoning_content ) {
206+ tokenCount ++ ;
207+ if ( tokenCount % 100 === 0 ) {
208+ log ( ` … ${ tokenCount } tokens received` ) ;
209+ }
210+ }
211+
212+ if ( delta ?. tool_calls ) {
213+ if ( ! toolCalls ) toolCalls = [ ] ;
214+ for ( const tc of delta . tool_calls ) {
215+ const idx = tc . index ?? 0 ;
216+ if ( ! toolCalls [ idx ] ) {
217+ toolCalls [ idx ] = { id : tc . id , type : tc . type || 'function' , function : { name : '' , arguments : '' } } ;
238218 }
239- if ( evt . choices ?. [ 0 ] ?. finish_reason ) finishReason = evt . choices [ 0 ] . finish_reason ;
240- if ( evt . usage ) usage = evt . usage ;
241- } catch { /* skip malformed SSE */ }
219+ if ( tc . function ?. name ) toolCalls [ idx ] . function . name += tc . function . name ;
220+ if ( tc . function ?. arguments ) toolCalls [ idx ] . function . arguments += tc . function . arguments ;
221+ }
242222 }
223+
224+ if ( chunk . usage ) usage = chunk . usage ;
243225 }
244226
245227 // If the model only produced reasoning_content (thinking) with no content,
@@ -264,6 +246,7 @@ async function llmCall(messages, opts = {}) {
264246 } finally {
265247 clearTimeout ( idleTimer ) ;
266248 }
249+
267250}
268251
269252function stripThink ( text ) {
@@ -1787,29 +1770,18 @@ async function main() {
17871770 log ( ` Mode: ${ IS_SKILL_MODE ? 'Aegis Skill' : 'Standalone' } (streaming, ${ IDLE_TIMEOUT_MS / 1000 } s idle timeout)` ) ;
17881771 log ( ` Time: ${ new Date ( ) . toLocaleString ( ) } ` ) ;
17891772
1790- // Healthcheck — ping the actual LLM endpoint directly
1791- const healthUrl = LLM_BASE_URL
1792- ? `${ LLM_BASE_URL . replace ( / \/ v 1 \/ ? $ / , '' ) } /v1/chat/completions`
1793- : LLM_URL
1794- ? `${ LLM_URL . replace ( / \/ v 1 \/ ? $ / , '' ) } /v1/chat/completions`
1795- : `${ GATEWAY_URL } /v1/chat/completions` ;
1796- const healthHeaders = { 'Content-Type' : 'application/json' } ;
1797- if ( LLM_API_KEY ) healthHeaders [ 'Authorization' ] = `Bearer ${ LLM_API_KEY } ` ;
1798-
1773+ // Healthcheck — ping the LLM endpoint via SDK
17991774 try {
1800- const ping = await fetch ( healthUrl , {
1801- method : 'POST' ,
1802- headers : healthHeaders ,
1803- body : JSON . stringify ( { messages : [ { role : 'user' , content : 'ping' } ] , stream : false , max_tokens : 1 } ) ,
1804- signal : AbortSignal . timeout ( 15000 ) ,
1775+ const ping = await llmClient . chat . completions . create ( {
1776+ ...( LLM_MODEL && { model : LLM_MODEL } ) ,
1777+ messages : [ { role : 'user' , content : 'ping' } ] ,
1778+ max_completion_tokens : 1 ,
18051779 } ) ;
1806- if ( ! ping . ok ) throw new Error ( `HTTP ${ ping . status } ` ) ;
1807- const data = await ping . json ( ) ;
1808- results . model . name = data . model || 'unknown' ;
1780+ results . model . name = ping . model || 'unknown' ;
18091781 log ( ` Model: ${ results . model . name } ` ) ;
18101782 } catch ( err ) {
18111783 log ( `\n ❌ Cannot reach LLM endpoint: ${ err . message } ` ) ;
1812- log ( ` Endpoint : ${ healthUrl } ` ) ;
1784+ log ( ` Base URL : ${ llmBaseUrl } ` ) ;
18131785 log ( ' Check that the LLM server is running.\n' ) ;
18141786 emit ( { event : 'error' , message : `Cannot reach LLM endpoint: ${ err . message } ` } ) ;
18151787 process . exit ( 1 ) ;
0 commit comments