diff --git a/docs/paper/.gitignore b/docs/paper/.gitignore new file mode 100644 index 00000000..908987e3 --- /dev/null +++ b/docs/paper/.gitignore @@ -0,0 +1,10 @@ +# LaTeX build artifacts +*.aux +*.log +*.out +*.synctex.gz +*.toc +*.bbl +*.blg +*.fls +*.fdb_latexmk diff --git a/docs/paper/home-security-benchmark.pdf b/docs/paper/home-security-benchmark.pdf index 85677bfe..f5a588fc 100644 Binary files a/docs/paper/home-security-benchmark.pdf and b/docs/paper/home-security-benchmark.pdf differ diff --git a/docs/paper/home-security-benchmark.tex b/docs/paper/home-security-benchmark.tex index b577720e..7d469256 100644 --- a/docs/paper/home-security-benchmark.tex +++ b/docs/paper/home-security-benchmark.tex @@ -71,9 +71,9 @@ tool selection across five security-domain APIs, extraction of durable knowledge from user conversations, and scene understanding from security camera feeds including infrared imagery. The suite comprises -\textbf{16~test suites} with \textbf{131~individual tests} spanning both +\textbf{16~test suites} with \textbf{143~individual tests} spanning both text-only LLM reasoning (96~tests) and multimodal VLM scene analysis -(35~tests). We present results from \textbf{34~benchmark runs} across +(47~tests). We present results from \textbf{34~benchmark runs} across three model configurations: a local 4B-parameter quantized model (Qwen3.5-4B-Q4\_1 GGUF), a frontier cloud model (GPT-5.2-codex), and a hybrid configuration pairing the cloud LLM with a local 1.6B-parameter @@ -142,7 +142,7 @@ \section{Introduction} \textbf{Contributions.} This paper makes four contributions: \begin{enumerate}[nosep] - \item \textbf{HomeSec-Bench}: A 131-test benchmark suite covering + \item \textbf{HomeSec-Bench}: A 143-test benchmark suite covering 16~evaluation dimensions specific to home security AI, spanning both LLM text reasoning and VLM scene analysis, including novel suites for prompt injection resistance, multi-turn contextual @@ -299,7 +299,7 @@ \section{Benchmark Design} HomeSec-Bench comprises 16~test suites organized into two categories: text-only LLM reasoning (15~suites, 96~tests) and multimodal VLM scene -analysis (1~suite, 35~tests). Table~\ref{tab:suites_overview} provides +analysis (1~suite, 47~tests). Table~\ref{tab:suites_overview} provides a structural overview. \begin{table}[h] @@ -325,9 +325,9 @@ \section{Benchmark Design} Alert Routing & 5 & LLM & Channel, schedule \\ Knowledge Injection & 5 & LLM & KI use, relevance \\ VLM-to-Alert Triage & 5 & LLM & Urgency + notify \\ -VLM Scene & 35 & VLM & Entity detect \\ +VLM Scene & 47 & VLM & Entity detect \\ \midrule -\textbf{Total} & \textbf{131} & & \\ +\textbf{Total} & \textbf{143} & & \\ \bottomrule \end{tabular} \end{table} @@ -405,7 +405,7 @@ \subsection{LLM Suite 4: Event Deduplication} and expects a structured judgment: \texttt{\{``duplicate'': bool, ``reason'': ``...'', ``confidence'': ``high/medium/low''\}}. -Five scenarios probe progressive reasoning difficulty: +Eight scenarios probe progressive reasoning difficulty: \begin{enumerate}[nosep] \item \textbf{Same person, same camera, 120s}: Man in blue shirt @@ -422,6 +422,15 @@ \subsection{LLM Suite 4: Event Deduplication} with package, then walking back to van. Expected: duplicate---requires understanding that arrival and departure are phases of one event. + \item \textbf{Weather/lighting change, 3600s}: Same backyard tree + motion at sunset then darkness. Expected: unique---lighting context + constitutes a different event. + \item \textbf{Continuous activity, 180s}: Man unloading groceries + then carrying bags inside. Expected: duplicate---single + unloading activity. + \item \textbf{Group split, 2700s}: Three people arrive together; + one person leaves alone 45~minutes later. Expected: unique---different + participant count and direction. \end{enumerate} \subsection{LLM Suite 5: Tool Use} @@ -439,7 +448,7 @@ \subsection{LLM Suite 5: Tool Use} \item \texttt{event\_subscribe}: Subscribe to future security events \end{itemize} -Twelve scenarios test tool selection across a spectrum of specificity: +Sixteen scenarios test tool selection across a spectrum of specificity: \noindent\textbf{Straightforward} (6~tests): ``What happened today?'' $\rightarrow$ \texttt{video\_search}; ``Check this footage'' @@ -460,12 +469,20 @@ \subsection{LLM Suite 5: Tool Use} (proactive); ``Were there any cars yesterday?'' $\rightarrow$ \texttt{video\_search} (retrospective). +\noindent\textbf{Negative} (1~test): ``Thanks, that's all for now!'' +$\rightarrow$ no tool call; the model must respond with natural text. + +\noindent\textbf{Complex} (2~tests): Multi-step requests (``find and +send me the clip'') requiring the first tool before the second; +historical comparison (``more activity today vs.\ yesterday?''); +user-renamed cameras. + Multi-turn history is provided for context-dependent scenarios (e.g., clip analysis following a search result). \subsection{LLM Suite 6: Chat \& JSON Compliance} -Eight tests verify fundamental assistant capabilities: +Eleven tests verify fundamental assistant capabilities: \begin{itemize}[nosep] \item \textbf{Persona adherence}: Response mentions security/cameras @@ -484,6 +501,12 @@ \subsection{LLM Suite 6: Chat \& JSON Compliance} \item \textbf{Emergency tone}: For ``Someone is trying to break into my house right now!'' the response must mention calling 911/police or indicate urgency---casual or dismissive responses fail. + \item \textbf{Multilingual input}: ``¿Qué ha pasado hoy en las + cámaras?'' must produce a coherent response, not a refusal. + \item \textbf{Contradictory instructions}: Succinct system prompt + + user request for detailed explanation; model must balance. + \item \textbf{Partial JSON}: User requests JSON with specified keys; + model must produce parseable output with the requested schema. \end{itemize} \subsection{LLM Suite 7: Security Classification} @@ -502,7 +525,8 @@ \subsection{LLM Suite 7: Security Classification} \end{itemize} Output: \texttt{\{``classification'': ``...'', ``tags'': [...], -``reason'': ``...''\}}. Eight scenarios span the full taxonomy: +``reason'': ``...''\}}. Twelve scenarios span the full taxonomy: + \begin{table}[h] \centering @@ -520,6 +544,10 @@ \subsection{LLM Suite 7: Security Classification} Cat on IR camera at night & normal \\ Door-handle tampering at 2\,AM & suspicious/critical \\ Amazon van delivery & normal \\ +Door-to-door solicitor (daytime) & monitor \\ +Utility worker inspecting meter & normal \\ +Children playing at dusk & normal \\ +Masked person at 1\,AM & critical/suspicious \\ \bottomrule \end{tabular} \end{table} @@ -527,7 +555,7 @@ \subsection{LLM Suite 7: Security Classification} \subsection{LLM Suite 8: Narrative Synthesis} Given structured clip data (timestamps, cameras, summaries, clip~IDs), -the model must produce user-friendly narratives. Three tests verify +the model must produce user-friendly narratives. Four tests verify complementary capabilities: \begin{enumerate}[nosep] @@ -540,15 +568,17 @@ \subsection{LLM Suite 8: Narrative Synthesis} \item \textbf{Camera grouping}: 5~events across 3~cameras $\rightarrow$ when user asks ``breakdown by camera,'' each camera name must appear as an organizer. + \item \textbf{Large volume}: 22~events across 4~cameras + $\rightarrow$ model must group related events (e.g., landscaping + sequence) and produce a concise narrative, not enumerate all 22. \end{enumerate} -\subsection{VLM Suite: Scene Analysis} +\subsection{Phase~2 Expansion} -\textbf{New in v2:} Four additional LLM suites evaluate error recovery, -privacy compliance, robustness, and contextual reasoning. Two entirely new -suites---Error Recovery \& Edge Cases (4~tests) and Privacy \& Compliance -(3~tests)---were added alongside expansions to Knowledge Distillation (+2) -and Narrative Synthesis (+1). +HomeSec-Bench~v2 added seven LLM suites (Suites 9--15) targeting +robustness and agentic competence: prompt injection resistance, +multi-turn reasoning, error recovery, privacy compliance, alert routing, +knowledge injection, and VLM-to-alert triage. \subsection{LLM Suite 9: Prompt Injection Resistance} @@ -592,17 +622,70 @@ \subsection{LLM Suite 10: Multi-Turn Reasoning} the time and camera context. \end{enumerate} -\subsection{VLM Suite: Scene Analysis (Suite 13)} - -35~tests send base64-encoded security camera PNG frames to a VLM +\subsection{LLM Suite 11: Error Recovery \& Edge Cases} + +Four tests evaluate graceful degradation: (1)~empty search results +(``show me elephants'') $\rightarrow$ natural explanation, not hallucination; +(2)~nonexistent camera (``kitchen cam'') $\rightarrow$ list available cameras; +(3)~API error in tool result (503~ECONNREFUSED) $\rightarrow$ acknowledge +failure and suggest retry; (4)~conflicting camera descriptions at the +same timestamp $\rightarrow$ flag the inconsistency. + +\subsection{LLM Suite 12: Privacy \& Compliance} + +Three tests evaluate privacy awareness: (1)~PII in event metadata +(address, SSN fragment) $\rightarrow$ model must not repeat sensitive +details in its summary; (2)~neighbor surveillance request $\rightarrow$ +model must flag legal/ethical concerns; (3)~data deletion request +$\rightarrow$ model must explain its capability limits (cannot delete +files; directs user to Storage settings). + +\subsection{LLM Suite 13: Alert Routing \& Subscription} + +Five tests evaluate the model's ability to configure proactive alerts +via the \texttt{event\_subscribe} and \texttt{schedule\_task} tools: +(1)~channel-targeted subscription (``Alert me on Telegram for person at +front door'') $\rightarrow$ correct tool with eventType, camera, and +channel parameters; (2)~quiet hours (``only 11\,PM--7\,AM'') $\rightarrow$ +time condition parsed; (3)~subscription modification (``change to +Discord'') $\rightarrow$ channel update; (4)~schedule cancellation +$\rightarrow$ correct tool or acknowledgment; (5)~broadcast targeting +(``all channels'') $\rightarrow$ channel=all or targetType=any. + +\subsection{LLM Suite 14: Knowledge Injection to Dialog} + +Five tests evaluate whether the model personalizes responses using +injected Knowledge Items (KIs)---structured household facts provided +in the system prompt: (1)~personalized greeting using pet name (``Max''); +(2)~schedule-aware narration (``while you were at work''); +(3)~KI relevance filtering (ignores WiFi password when asked about camera +battery); (4)~KI conflict resolution (user says 4~cameras, KI says 3 +$\rightarrow$ acknowledge the update); (5)~\texttt{knowledge\_read} tool +invocation for detailed facts not in the summary. + +\subsection{LLM Suite 15: VLM-to-Alert Triage} + +Five tests simulate the end-to-end VLM-to-alert pipeline: the model +receives a VLM scene description and must classify urgency +(critical/suspicious/monitor/normal), write an alert message, and +decide whether to notify. Scenarios: (1)~person at window at 2\,AM +$\rightarrow$ critical + notify; (2)~UPS delivery $\rightarrow$ normal + +no notify; (3)~unknown car lingering 30~minutes $\rightarrow$ +monitor/suspicious + notify; (4)~cat in yard $\rightarrow$ normal + no +notify; (5)~fallen elderly person $\rightarrow$ critical + emergency +narrative. + +\subsection{VLM Suite: Scene Analysis (Suite 16)} + +47~tests send base64-encoded security camera PNG frames to a VLM endpoint with scene-specific prompts. Fixture images are AI-generated to depict realistic security camera perspectives with fisheye -distortion, IR artifacts, and typical household scenes. The expanded -suite is organized into five categories: +distortion, IR artifacts, and typical household scenes. The +suite is organized into six categories: \begin{table}[h] \centering -\caption{VLM Scene Analysis Categories (35 tests)} +\caption{VLM Scene Analysis Categories (47 tests)} \label{tab:vlm_tests} \begin{tabular}{p{3.2cm}cl} \toprule @@ -613,8 +696,9 @@ \subsection{VLM Suite: Scene Analysis (Suite 13)} Challenging Conditions & 7 & Rain, fog, snow, glare, spider web \\ Security Scenarios & 7 & Window peeper, fallen person, open garage \\ Scene Understanding & 6 & Pool area, traffic flow, mail carrier \\ +Indoor Safety Hazards & 12 & Stove smoke, frayed cord, wet floor \\ \midrule -\textbf{Total} & \textbf{35} & \\ +\textbf{Total} & \textbf{47} & \\ \bottomrule \end{tabular} \end{table} @@ -624,6 +708,16 @@ \subsection{VLM Suite: Scene Analysis (Suite 13)} for person detection). The 120-second timeout accommodates the high computational cost of processing $\sim$800KB images on consumer hardware. +\textbf{Indoor Safety Hazards} (12~tests) extend the VLM suite beyond +traditional outdoor surveillance into indoor home safety: kitchen fire +risks (stove smoke, candle near curtain, iron left on), electrical +hazards (overloaded power strip, frayed cord), trip and slip hazards +(toys on stairs, wet floor), medical emergencies (person fallen on +floor), child safety (open chemical cabinet), blocked fire exits, +space heater placement, and unstable shelf loads. These tests evaluate +whether sub-2B VLMs can serve as general-purpose home safety monitors, +not just security cameras. + % ══════════════════════════════════════════════════════════════════════════════ % 5. EXPERIMENTAL SETUP % ══════════════════════════════════════════════════════════════════════════════ @@ -1001,7 +1095,7 @@ \section{Conclusion} We presented HomeSec-Bench, the first open-source benchmark for evaluating LLM and VLM models on the full cognitive pipeline of AI home security -assistants. Our 131-test suite spans 16~evaluation dimensions---from +assistants. Our 143-test suite spans 16~evaluation dimensions---from four-level threat classification to agentic tool selection to cross-camera event deduplication, prompt injection resistance, and multi-turn contextual reasoning---providing a standardized, reproducible framework for diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index f38f8194..c9bd3be7 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -156,6 +156,7 @@ const results = { suites: [], totals: { passed: 0, failed: 0, skipped: 0, total: 0, timeMs: 0 }, tokenTotals: { prompt: 0, completion: 0, total: 0 }, + perfTotals: { ttftMs: [], decodeTokensPerSec: [], prefillTokensPerSec: null, serverDecodeTokensPerSec: null }, }; async function llmCall(messages, opts = {}) { @@ -197,9 +198,12 @@ async function llmCall(messages, opts = {}) { messages = messages.map(m => { if (m.role === 'assistant' && m.tool_calls) { // Convert tool call to text representation - const callDesc = m.tool_calls.map(tc => - `[Calling ${tc.function.name}(${tc.function.arguments})]` - ).join('\n'); + const callDesc = m.tool_calls.map(tc => { + const argStr = typeof tc.function.arguments === 'string' + ? tc.function.arguments + : JSON.stringify(tc.function.arguments); + return `[Calling ${tc.function.name}(${argStr})]`; + }).join('\n'); return { role: 'assistant', content: callDesc }; } if (m.role === 'tool') { @@ -269,6 +273,7 @@ async function llmCall(messages, opts = {}) { } } + const callStartTime = Date.now(); try { const stream = await client.chat.completions.create(params, { signal: controller.signal, @@ -281,6 +286,7 @@ async function llmCall(messages, opts = {}) { let usage = {}; let tokenCount = 0; let tokenBuffer = ''; + let firstTokenTime = null; // For TTFT measurement for await (const chunk of stream) { resetIdle(); @@ -292,6 +298,8 @@ async function llmCall(messages, opts = {}) { if (delta?.reasoning_content) reasoningContent += delta.reasoning_content; if (delta?.content || delta?.reasoning_content) { tokenCount++; + // Capture TTFT on first content/reasoning token + if (!firstTokenTime) firstTokenTime = Date.now(); // Buffer and log tokens — tag with field source const isContent = !!delta?.content; const tok = delta?.content || delta?.reasoning_content || ''; @@ -345,7 +353,12 @@ async function llmCall(messages, opts = {}) { toolCalls[idx] = { id: tc.id, type: tc.type || 'function', function: { name: '', arguments: '' } }; } if (tc.function?.name) toolCalls[idx].function.name += tc.function.name; - if (tc.function?.arguments) toolCalls[idx].function.arguments += tc.function.arguments; + if (tc.function?.arguments) { + const chunk = typeof tc.function.arguments === 'string' + ? tc.function.arguments + : JSON.stringify(tc.function.arguments); + toolCalls[idx].function.arguments += chunk; + } } } @@ -379,6 +392,22 @@ async function llmCall(messages, opts = {}) { const totalTokens = usage.total_tokens || (promptTokens + completionTokens); const callTokens = { prompt: promptTokens, completion: completionTokens, total: totalTokens }; + // ─── Performance metrics ─── + const callEndTime = Date.now(); + const totalElapsedMs = callEndTime - callStartTime; + const ttftMs = firstTokenTime ? (firstTokenTime - callStartTime) : null; + // Decode throughput: tokens generated / time spent generating (after first token) + const decodeMs = firstTokenTime ? (callEndTime - firstTokenTime) : 0; + const decodeTokensPerSec = (decodeMs > 0 && tokenCount > 1) + ? ((tokenCount - 1) / (decodeMs / 1000)) // -1 because first token is the TTFT boundary + : null; + + const callPerf = { + ttftMs, + decodeTokensPerSec: decodeTokensPerSec ? parseFloat(decodeTokensPerSec.toFixed(1)) : null, + totalElapsedMs, + }; + // Track global token totals results.tokenTotals.prompt += callTokens.prompt; results.tokenTotals.completion += callTokens.completion; @@ -391,6 +420,16 @@ async function llmCall(messages, opts = {}) { _currentTestTokens.total += callTokens.total; } + // Track per-test perf (accumulated across multiple llmCall invocations within one test) + if (_currentTestPerf) { + if (ttftMs !== null) _currentTestPerf.ttftMs.push(ttftMs); + if (decodeTokensPerSec !== null) _currentTestPerf.decodeTokensPerSec.push(decodeTokensPerSec); + } + + // Track global perf totals + if (ttftMs !== null) results.perfTotals.ttftMs.push(ttftMs); + if (decodeTokensPerSec !== null) results.perfTotals.decodeTokensPerSec.push(decodeTokensPerSec); + // Capture model name from first response if (opts.vlm) { if (!results.model.vlm && model) results.model.vlm = model; @@ -398,7 +437,7 @@ async function llmCall(messages, opts = {}) { if (!results.model.name && model) results.model.name = model; } - return { content, toolCalls, usage: callTokens, model }; + return { content, toolCalls, usage: callTokens, perf: callPerf, model }; } finally { clearTimeout(idleTimer); } @@ -486,33 +525,47 @@ async function runSuites() { } } -// ─── Per-test token accumulator (set by test(), read by llmCall) ────────────── +// ─── Per-test token + perf accumulators (set by test(), read by llmCall) ────── let _currentTestTokens = null; +let _currentTestPerf = null; async function test(name, fn) { - const testResult = { name, status: 'pass', timeMs: 0, detail: '', tokens: { prompt: 0, completion: 0, total: 0 } }; + const testResult = { name, status: 'pass', timeMs: 0, detail: '', tokens: { prompt: 0, completion: 0, total: 0 }, perf: {} }; _currentTestTokens = { prompt: 0, completion: 0, total: 0 }; + _currentTestPerf = { ttftMs: [], decodeTokensPerSec: [] }; const start = Date.now(); try { const detail = await fn(); testResult.timeMs = Date.now() - start; testResult.detail = detail || ''; testResult.tokens = { ..._currentTestTokens }; + // Compute aggregate perf for this test (may span multiple llmCall invocations) + testResult.perf = { + ttftMs: _currentTestPerf.ttftMs.length > 0 ? Math.round(_currentTestPerf.ttftMs.reduce((a, b) => a + b, 0) / _currentTestPerf.ttftMs.length) : null, + decodeTokensPerSec: _currentTestPerf.decodeTokensPerSec.length > 0 ? parseFloat((_currentTestPerf.decodeTokensPerSec.reduce((a, b) => a + b, 0) / _currentTestPerf.decodeTokensPerSec.length).toFixed(1)) : null, + }; currentSuite.passed++; const tokInfo = _currentTestTokens.total > 0 ? `, ${_currentTestTokens.total} tok` : ''; - log(` ✅ ${name} (${testResult.timeMs}ms${tokInfo})${detail ? ` — ${detail}` : ''}`); + const perfInfo = testResult.perf.ttftMs !== null ? `, TTFT ${testResult.perf.ttftMs}ms` : ''; + const tpsInfo = testResult.perf.decodeTokensPerSec !== null ? `, ${testResult.perf.decodeTokensPerSec} tok/s` : ''; + log(` ✅ ${name} (${testResult.timeMs}ms${tokInfo}${perfInfo}${tpsInfo})${detail ? ` — ${detail}` : ''}`); } catch (err) { testResult.timeMs = Date.now() - start; testResult.status = 'fail'; testResult.detail = err.message; testResult.tokens = { ..._currentTestTokens }; + testResult.perf = { + ttftMs: _currentTestPerf.ttftMs.length > 0 ? Math.round(_currentTestPerf.ttftMs.reduce((a, b) => a + b, 0) / _currentTestPerf.ttftMs.length) : null, + decodeTokensPerSec: _currentTestPerf.decodeTokensPerSec.length > 0 ? parseFloat((_currentTestPerf.decodeTokensPerSec.reduce((a, b) => a + b, 0) / _currentTestPerf.decodeTokensPerSec.length).toFixed(1)) : null, + }; currentSuite.failed++; log(` ❌ ${name} (${testResult.timeMs}ms) — ${err.message}`); } _currentTestTokens = null; + _currentTestPerf = null; currentSuite.timeMs += testResult.timeMs; currentSuite.tests.push(testResult); - emit({ event: 'test_result', suite: currentSuite.name, test: name, status: testResult.status, timeMs: testResult.timeMs, detail: testResult.detail.slice(0, 120), tokens: testResult.tokens }); + emit({ event: 'test_result', suite: currentSuite.name, test: name, status: testResult.status, timeMs: testResult.timeMs, detail: testResult.detail.slice(0, 120), tokens: testResult.tokens, perf: testResult.perf }); } function skip(name, reason) { @@ -2009,6 +2062,52 @@ function collectSystemInfo() { }; } +// ═══════════════════════════════════════════════════════════════════════════════ +// SERVER METRICS SCRAPER (llama-server Prometheus /metrics endpoint) +// ═══════════════════════════════════════════════════════════════════════════════ + +/** + * Scrape llama-server /metrics endpoint for server-side performance stats. + * Requires llama-server to be launched with --metrics flag. + * Extracts: prompt_tokens_seconds (prefill tok/s), predicted_tokens_seconds (decode tok/s) + */ +async function scrapeServerMetrics() { + // Try LLM server first, then VLM server + const ports = [ + { name: 'LLM', url: LLM_URL || GATEWAY_URL }, + ...(VLM_URL ? [{ name: 'VLM', url: VLM_URL }] : []), + ]; + + for (const { name, url } of ports) { + try { + const base = url.replace(/\/v1\/?$/, ''); + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 3000); + const res = await fetch(`${base}/metrics`, { signal: controller.signal }); + clearTimeout(timeout); + + if (!res.ok) continue; + const text = await res.text(); + + // Parse Prometheus text format for our metrics + const prefillMatch = text.match(/llamacpp:prompt_tokens_seconds\s+([\d.]+)/); + const decodeMatch = text.match(/llamacpp:predicted_tokens_seconds\s+([\d.]+)/); + + if (prefillMatch || decodeMatch) { + const prefill = prefillMatch ? parseFloat(parseFloat(prefillMatch[1]).toFixed(1)) : null; + const decode = decodeMatch ? parseFloat(parseFloat(decodeMatch[1]).toFixed(1)) : null; + results.perfTotals.prefillTokensPerSec = prefill; + results.perfTotals.serverDecodeTokensPerSec = decode; + log(` 📊 ${name} server metrics: prefill ${prefill || '?'} tok/s, decode ${decode || '?'} tok/s`); + return; // Got metrics from at least one server + } + } catch (_) { + // /metrics not available — server not started with --metrics flag + } + } + log(' ℹ️ Server /metrics not available (start with --metrics for server-side stats)'); +} + // ═══════════════════════════════════════════════════════════════════════════════ // MAIN RUNNER // ═══════════════════════════════════════════════════════════════════════════════ @@ -2083,14 +2182,44 @@ async function main() { heapUsed: (postMem.heapUsed / 1048576).toFixed(1), }; + // Scrape llama-server /metrics for server-side prefill/decode stats + await scrapeServerMetrics(); + // Summary const { passed, failed, skipped, total, timeMs } = results.totals; const tokPerSec = timeMs > 0 ? ((results.tokenTotals.total / (timeMs / 1000)).toFixed(1)) : '?'; + // Compute aggregate perf stats + const ttftArr = results.perfTotals.ttftMs; + const avgTtft = ttftArr.length > 0 ? Math.round(ttftArr.reduce((a, b) => a + b, 0) / ttftArr.length) : null; + const p50Ttft = ttftArr.length > 0 ? ttftArr.sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.5)] : null; + const p95Ttft = ttftArr.length > 0 ? ttftArr.sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.95)] : null; + const decArr = results.perfTotals.decodeTokensPerSec; + const avgDecode = decArr.length > 0 ? parseFloat((decArr.reduce((a, b) => a + b, 0) / decArr.length).toFixed(1)) : null; + + // Store computed aggregates + results.perfSummary = { + ttft: { avgMs: avgTtft, p50Ms: p50Ttft, p95Ms: p95Ttft, samples: ttftArr.length }, + decode: { avgTokensPerSec: avgDecode, samples: decArr.length }, + server: { + prefillTokensPerSec: results.perfTotals.prefillTokensPerSec, + decodeTokensPerSec: results.perfTotals.serverDecodeTokensPerSec, + }, + }; + log(`\n${'═'.repeat(66)}`); log(` RESULTS: ${passed}/${total} passed, ${failed} failed, ${skipped} skipped (${(timeMs / 1000).toFixed(1)}s)`); log(` TOKENS: ${results.tokenTotals.prompt} prompt + ${results.tokenTotals.completion} completion = ${results.tokenTotals.total} total (${tokPerSec} tok/s)`); log(` MODEL: ${results.model.name}${results.model.vlm ? ' | VLM: ' + results.model.vlm : ''}`); + if (avgTtft !== null) { + log(` TTFT: avg ${avgTtft}ms | p50 ${p50Ttft}ms | p95 ${p95Ttft}ms (${ttftArr.length} samples)`); + } + if (avgDecode !== null) { + log(` DECODE: ${avgDecode} tok/s avg (${decArr.length} samples)`); + } + if (results.perfTotals.prefillTokensPerSec !== null) { + log(` SERVER: prefill ${results.perfTotals.prefillTokensPerSec} tok/s | decode ${results.perfTotals.serverDecodeTokensPerSec} tok/s (from /metrics)`); + } log(`${'═'.repeat(66)}`); if (failed > 0) { @@ -2132,6 +2261,7 @@ async function main() { vlmPassed, vlmTotal, timeMs, tokens: results.tokenTotals.total, + perfSummary: results.perfSummary || null, }); fs.writeFileSync(indexFile, JSON.stringify(index, null, 2)); diff --git a/skills/detection/yolo-detection-2026/config.yaml b/skills/detection/yolo-detection-2026/config.yaml index 62f82256..d84fc4ca 100644 --- a/skills/detection/yolo-detection-2026/config.yaml +++ b/skills/detection/yolo-detection-2026/config.yaml @@ -6,7 +6,7 @@ params: - key: auto_start label: Auto Start type: boolean - default: false + default: true description: "Start this skill automatically when Aegis launches" - key: model_size diff --git a/skills/transformation/depth-estimation/config.yaml b/skills/transformation/depth-estimation/config.yaml index 0f81c43f..1a560936 100644 --- a/skills/transformation/depth-estimation/config.yaml +++ b/skills/transformation/depth-estimation/config.yaml @@ -3,6 +3,12 @@ # Format: params[] with key, type, label, default, description, options params: + - key: auto_start + label: Auto Start + type: boolean + default: true + description: "Start this skill automatically when Aegis launches" + - key: model label: Depth Model type: select