From f3ebce78e494488aa8d99dcddfcdb08f65443d89 Mon Sep 17 00:00:00 2001 From: Noah Kiser Date: Thu, 7 May 2026 15:58:20 +0000 Subject: [PATCH 1/2] feat (browsers): create throughput benchmark for browser providers --- .../browser-throughput-benchmarks.yml | 190 +++++++++ THROUGHPUT.md | 189 +++++++++ package.json | 6 + results/browser-throughput/.gitkeep | 0 src/browser/generate-throughput-svg.ts | 266 ++++++++++++ src/browser/throughput-benchmark.ts | 378 ++++++++++++++++++ src/browser/throughput-providers.ts | 68 ++++ src/browser/throughput-scoring.ts | 97 +++++ src/browser/throughput-types.ts | 70 ++++ src/merge-results.ts | 109 ++++- src/run.ts | 82 +++- 11 files changed, 1451 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/browser-throughput-benchmarks.yml create mode 100644 THROUGHPUT.md create mode 100644 results/browser-throughput/.gitkeep create mode 100644 src/browser/generate-throughput-svg.ts create mode 100644 src/browser/throughput-benchmark.ts create mode 100644 src/browser/throughput-providers.ts create mode 100644 src/browser/throughput-scoring.ts create mode 100644 src/browser/throughput-types.ts diff --git a/.github/workflows/browser-throughput-benchmarks.yml b/.github/workflows/browser-throughput-benchmarks.yml new file mode 100644 index 0000000..6f0fde5 --- /dev/null +++ b/.github/workflows/browser-throughput-benchmarks.yml @@ -0,0 +1,190 @@ +name: Browser Throughput Benchmark + +on: + pull_request: + paths: + - 'src/browser/**' + - 'src/util/**' + - 'src/run.ts' + - 'src/merge-results.ts' + - 'package.json' + schedule: + - cron: '0 3 * * *' # Daily at 03:00 UTC (offset from main browser benchmark) + workflow_dispatch: + inputs: + iterations: + description: 'Iterations per provider (sessions)' + required: false + default: '10' + +concurrency: + group: browser-throughput-benchmarks + cancel-in-progress: true + +permissions: + contents: write + pull-requests: write + +jobs: + bench: + name: Bench ${{ matrix.provider }} + 
runs-on: namespace-profile-default + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + provider: + - browserbase + - hyperbrowser + - kernel + - steel + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 24 + cache: 'npm' + - name: Install dependencies + run: | + if [ "${{ github.event_name }}" = "schedule" ]; then + npm update + else + npm ci + fi + - name: Clear stale results from checkout + run: rm -rf results/browser-throughput/ + - name: Run browser throughput benchmark + env: + BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} + BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} + HYPERBROWSER_API_KEY: ${{ secrets.HYPERBROWSER_API_KEY }} + KERNEL_API_KEY: ${{ secrets.KERNEL_API_KEY }} + STEEL_API_KEY: ${{ secrets.STEEL_API_KEY }} + run: | + npm run bench -- \ + --mode browser-throughput \ + --provider ${{ matrix.provider }} \ + --iterations ${{ github.event_name == 'pull_request' && '3' || github.event.inputs.iterations || '10' }} + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: browser-throughput-results-${{ matrix.provider }} + path: results/browser-throughput/ + if-no-files-found: ignore + retention-days: 7 + + collect: + name: Collect Results + runs-on: namespace-profile-default + needs: bench + if: always() + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 24 + cache: 'npm' + - name: Install dependencies + run: | + if [ "${{ github.event_name }}" = "schedule" ]; then + npm update + else + npm ci + fi + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts/ + pattern: browser-throughput-results-* + - name: Merge results + run: npx tsx src/merge-results.ts --input artifacts --mode browser-throughput + - run: npm run generate-browser-throughput-svg + - name: Upload SVG as artifact + if: github.event_name == 'pull_request' + uses: actions/upload-artifact@v4 + 
with: + name: browser-throughput-benchmark-svg + path: browser-throughput.svg + if-no-files-found: ignore + retention-days: 7 + - name: Post results to PR + if: github.event_name == 'pull_request' + continue-on-error: true + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const path = require('path'); + + const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + const latestPath = path.join('results', 'browser-throughput', 'latest.json'); + + let body = '## Browser Throughput Benchmark Results\n\n'; + + if (!fs.existsSync(latestPath)) { + body += '> No browser-throughput benchmark results were generated.\n\n'; + } else { + const data = JSON.parse(fs.readFileSync(latestPath, 'utf-8')); + const results = data.results + .filter(r => !r.skipped) + .sort((a, b) => (b.compositeScore || 0) - (a.compositeScore || 0)); + + if (results.length === 0) { + body += '> No browser-throughput benchmark results were generated.\n\n'; + } else { + body += '| # | Provider | Score | APS (med) | Task (med) | Task (p95) | Screenshot | Status |\n'; + body += '|---|----------|-------|-----------|------------|------------|------------|--------|\n'; + + results.forEach((r, i) => { + const name = r.provider.charAt(0).toUpperCase() + r.provider.slice(1); + const score = r.compositeScore !== undefined ? 
r.compositeScore.toFixed(1) : '--'; + const aps = r.summary.actionsPerSecond.median.toFixed(2) + '/s'; + const taskMed = (r.summary.taskMs.median / 1000).toFixed(2) + 's'; + const taskP95 = (r.summary.taskMs.p95 / 1000).toFixed(2) + 's'; + const screenshotMed = Math.round(r.summary.perActionType.screenshot?.median || 0) + 'ms'; + const expectedActions = 50; + const ok = r.iterations.filter(it => !it.error && it.actionsCompleted === expectedActions).length; + const count = r.iterations.length; + body += `| ${i + 1} | ${name} | ${score} | ${aps} | ${taskMed} | ${taskP95} | ${screenshotMed} | ${ok}/${count} |\n`; + }); + + body += '\n'; + } + } + + body += `---\n*[View full run](${runUrl}) · SVG available as [build artifact](${runUrl}#artifacts)*`; + + const marker = '## Browser Throughput Benchmark Results'; + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const existing = comments.find(c => c.body.startsWith(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + } + - name: Commit and push + if: github.event_name != 'pull_request' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add package.json package-lock.json browser-throughput.svg results/browser-throughput/ + git diff --cached --quiet && echo "No changes to commit" && exit 0 + git commit -m "chore: update browser throughput benchmark results [skip ci]" + git push diff --git a/THROUGHPUT.md b/THROUGHPUT.md new file mode 100644 index 0000000..543acc9 --- /dev/null +++ b/THROUGHPUT.md @@ -0,0 +1,189 @@ +# Browser Step Throughput 
Benchmark + +This document describes the **browser step throughput benchmark** — a measurement of how fast a browser provider can execute sequential agent-style actions inside a single running session. It is a complement to the existing browser lifecycle benchmark, which measures session provisioning latency. + +## Why this benchmark exists + +The existing browser benchmark (`src/browser/benchmark.ts`) measures the **lifecycle**: + +``` +session create → CDP connect → single page load → release +``` + +That answers one important question: *how fast can I get a fresh browser?* It is the right metric for short-lived sessions where each task spins up a new browser. + +It does **not** answer the question that matters for long-running agent workloads: + +> *Once a session is up, how fast does each individual action complete, and does performance hold up over the course of a session?* + +A vision-based agent might run for thirty minutes to several hours and execute hundreds of browser actions inside a single session. For those workloads, provisioning speed is a negligible fraction of total runtime — the per-action throughput is the bottleneck. A provider that creates a session in 200ms but takes 800ms per screenshot will lose to a provider that takes 2s to create a session but only 100ms per screenshot, every single time. + +This benchmark closes that gap. + +## What gets measured + +For each provider, the benchmark runs **N sessions** (default 10, configurable). Each session executes a fixed sequence of **50 sequential actions** end-to-end inside one running browser. 
We record, in order: + +- Session creation time (`createMs`) +- CDP connection time (`connectMs`) +- Wall-clock duration of each individual action, tagged by action type +- Session release time (`releaseMs`) +- Total wall-clock time (`totalMs`) +- Sum of action durations (`taskMs`) +- Actions per second over the session (`actionsCompleted / (taskMs / 1000)`) + +From that raw data we summarize across iterations: + +- `actionsPerSecond` — median, p95, p99 +- `taskMs` — median, p95, p99 +- `totalMs` — median, p95, p99 +- `createMs` — median, p95, p99 +- `perActionType` — median, p95, p99 for each of the six action types + +## The 50-action sequence + +Each session repeats a 10-action loop five times against Wikipedia: + +``` +1. goto('https://en.wikipedia.org/wiki/Special:Random') +2. waitForSelector('#firstHeading') +3. screenshot() +4. textContent('#firstHeading') +5. click('#mw-content-text a[href^="/wiki/"]:not([href*=":"])') +6. waitForSelector('#firstHeading') +7. screenshot() +8. textContent('#firstHeading') +9. page.goBack({ waitUntil: 'commit' }) +10. waitForSelector('#firstHeading') +``` + +Five loops × ten actions = **50 actions per session**. + +This pattern simulates what a vision-based agent actually does on each turn: navigate, wait for the DOM, capture a screenshot for an LLM to look at, extract some text, take an action, observe the result, and move on. + +### Why Wikipedia + +Wikipedia's `Special:Random` endpoint is intentionally chosen over real-world target sites. It gives us: + +- **Global availability** — no geographic restrictions, no auth flows. +- **Consistent structure** — every article page has `#firstHeading` and a `#mw-content-text` body container, so the same selectors work everywhere. +- **A rich, deterministic link graph** — every random article exposes many `/wiki/...` outbound links to follow. +- **Stable, predictable load times** — Wikipedia's CDN serves pages quickly and consistently across regions. 
+- **No meaningful bot detection** for scripted, polite traffic. + +That isolates the variable we care about: the provider's per-action overhead. Page-level variance is small enough that differences between providers are real, not noise from the target site. + +### Why these six action types + +Together they cover the surface area of nearly every agent action: + +| Action type | Represents | +| ------------------ | ----------------------------------------------------------- | +| `navigate` | Full-page transitions (HTTP + page load + render) | +| `waitForSelector` | DOM polling — measures CDP round-trip + selector evaluation | +| `screenshot` | Pixel capture — relevant for vision-based agents | +| `textContent` | DOM read — cheapest possible action, isolates raw CDP cost | +| `click` | Synthetic input event + waiting for the navigation it triggers | +| `goBack` | History navigation, exercises bfcache behavior | + +Per-action breakdown matters: two providers can have identical end-to-end times but very different cost structures (one is screenshot-bound, the other is click-bound). The `perActionType` summary surfaces those differences. + +### Stealth + real viewport + +Every provider is configured with the settings agent workloads typically use: + +```typescript +sessionCreateOptions: { + stealth: true, + headless: true, + viewport: { width: 1920, height: 1080 }, +} +``` + +This makes the comparison apples-to-apples and reflects realistic agent conditions (stealth mode often changes performance characteristics, and a 1920×1080 viewport produces meaningfully larger screenshots than the default). + +## How the runner behaves + +A few deliberate choices in `runThroughputIteration`: + +- **Each action is timed individually** with `performance.now()` immediately before and after the Playwright call. The session timing is the *sum of action durations*, not measured separately — that way action-level numbers always add up to the session number. 
+- **A failing action does not abort the session.** If `click` times out on action 5, the loop records the failure and proceeds with action 6. This lets us measure partial completion rates and observe how providers degrade under stress, instead of throwing away an entire session because one action got unlucky. +- **The action index is recorded.** With 50 ordered actions per session, downstream analysis can detect if late-session actions are systematically slower than early-session ones — a useful signal for memory leaks or resource exhaustion in long-running sessions. +- **Action timeout is 30 seconds**, applied per-action via `withTimeout`. A single slow action can't hang an entire run, and the timeout lands well above any reasonable real action duration. +- **`page.goBack` uses `waitUntil: 'commit'`** rather than the Playwright default of `'load'`, because browsers restoring a page from the back-forward cache fire `pageshow` instead of `load` — `'load'` would hang for the full timeout on every bfcache restore. The next `waitForSelector` confirms arrival on the previous page. + +## Scoring + +The composite score is a single number (0–100, higher is better) for at-a-glance comparison. The weighting was chosen to reflect what actually matters for agent workloads: + +``` +score = ( + 0.40 × score(actionsPerSecond.median) // throughput is the primary signal + + 0.25 × score(taskMs.median) // total time per session + + 0.20 × score(taskMs.p95) // tail consistency (worst sessions) + + 0.15 × score(screenshot.median) // vision-agent proxy +) × successRate +``` + +Where the sub-scores are linear: + +- `score(actionsPerSecond)` — 0/sec → 0, 10/sec → 100 (linear). +- `score(latencyMs)` — 0ms → 100, 30,000ms → 0 (linear, clamped to 0). +- `successRate` — fraction of sessions that completed all 50 actions without error. A session that completes only 49/50 does not count toward `successRate`. 
This deliberately punishes flakiness — an agent that fails 1 action in 50 fails 1 in every 50, period. + +### Why these weights + +- **40% on throughput**, because actions/sec is the headline metric for agent workloads. Doubling APS halves the wall-clock cost of any agent task. +- **25% on median task time**, to reward the typical case. +- **20% on p95 task time**, to reward consistency. A provider with a great median but a long tail is dangerous for agents that run for hours — the tail is what you actually pay. +- **15% on screenshot median**, because vision agents bottleneck on screenshot capture. It's separated out so this specific cost can't hide inside the aggregate. +- **× successRate**, because partial successes aren't useful. A provider that wins on speed but fails 10% of sessions is worse than a slower one that finishes. + +### Why not just use APS + +A single-axis score would hide important detail. A provider can have great throughput but terrible p95 (one in twenty sessions falls off a cliff) — which is unusable for production agents. The composite score forces all four axes to be acceptable to score well. + +The full per-action distribution is preserved in the JSON output, so anyone who cares about a different weighting can compute their own score from the raw data. + +## Running it + +```bash +# Single provider, single session — useful for development +npm run bench:browser-throughput:browserbase -- --iterations 1 + +# All four providers, default 10 sessions each +npm run bench:browser-throughput + +# Specific provider with custom iteration count +npm run bench -- --mode browser-throughput --provider hyperbrowser --iterations 25 +``` + +Required environment variables (set in `.env` or your shell): + +- `BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID` +- `HYPERBROWSER_API_KEY` +- `KERNEL_API_KEY` +- `STEEL_API_KEY` + +Missing credentials cause that provider to be reported as `SKIPPED` rather than failing the run. 
+ +## Output + +Results are written to `results/browser-throughput/YYYY-MM-DD.json` and copied to `results/browser-throughput/latest.json`. Each iteration's JSON includes the full ordered action list with per-action durations, success flags, and errors — enough to reconstruct any per-action analysis without re-running. + +The SVG generator produces `browser-throughput.svg` with a ranked comparison table: + +```bash +npm run generate-browser-throughput-svg +``` + +## Scheduling + +The GitHub Actions workflow `browser-throughput-benchmarks.yml` runs daily at 03:00 UTC (offset from the lifecycle browser benchmark) with 10 iterations per provider. Pull requests touching browser code run a faster 3-iteration version and post a comparison table as a PR comment. + +## Limitations + +- Wikipedia's CDN is fast and globally distributed — providers in regions closer to Wikipedia's edge nodes will benefit. This is acceptable for a relative comparison but it is not representative of every real-world target site. +- A 50-action session is short relative to real agent workloads. It catches per-action overhead and basic session drift, but multi-hour memory leaks or long-tail GC pauses will not show up here. +- The benchmark does not currently model concurrent sessions per account. Some providers may have very different per-action latency under high concurrency. +- Wikipedia's HTML occasionally changes. If `#firstHeading` or `#mw-content-text` get renamed or restructured, the selectors in the runner will need updating. 
diff --git a/package.json b/package.json index 0a0a9fb..29641fb 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,11 @@ "bench:browser:steel": "tsx src/run.ts --mode browser --provider steel", "bench:browser:browseruse": "tsx src/run.ts --mode browser --provider browseruse", "bench:browser:anchorbrowser": "tsx src/run.ts --mode browser --provider anchorbrowser", + "bench:browser-throughput": "tsx src/run.ts --mode browser-throughput", + "bench:browser-throughput:browserbase": "tsx src/run.ts --mode browser-throughput --provider browserbase", + "bench:browser-throughput:hyperbrowser": "tsx src/run.ts --mode browser-throughput --provider hyperbrowser", + "bench:browser-throughput:kernel": "tsx src/run.ts --mode browser-throughput --provider kernel", + "bench:browser-throughput:steel": "tsx src/run.ts --mode browser-throughput --provider steel", "bench:storage": "tsx src/run.ts --mode storage", "bench:storage:s3": "tsx src/run.ts --mode storage --provider aws-s3", "bench:storage:r2": "tsx src/run.ts --mode storage --provider cloudflare-r2", @@ -45,6 +50,7 @@ "generate-svg:burst": "tsx src/sandbox/generate-svg.ts --mode burst", "generate-storage-svg": "tsx src/storage/generate-svg.ts", "generate-browser-svg": "tsx src/browser/generate-svg.ts", + "generate-browser-throughput-svg": "tsx src/browser/generate-throughput-svg.ts", "generate-pricing-svg": "tsx src/sandbox/generate-pricing-svg.ts" }, "dependencies": { diff --git a/results/browser-throughput/.gitkeep b/results/browser-throughput/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/browser/generate-throughput-svg.ts b/src/browser/generate-throughput-svg.ts new file mode 100644 index 0000000..7b28092 --- /dev/null +++ b/src/browser/generate-throughput-svg.ts @@ -0,0 +1,266 @@ +import fs from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; +import type { ThroughputBenchmarkResult } from './throughput-types.js'; +import { + computeThroughputCompositeScores, + 
sortThroughputByCompositeScore, +} from './throughput-scoring.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const ROOT = path.resolve(__dirname, '../..'); +const RESULTS_DIR = path.join(ROOT, 'results', 'browser-throughput'); +const SPONSORS_DIR_TIER1 = path.join(ROOT, 'sponsors', 'tier-1'); +const SPONSORS_DIR_TIER2 = path.join(ROOT, 'sponsors', 'tier-2'); + +function loadSponsorImages(): { dataUri: string; name: string }[] { + const allSponsors: { dataUri: string; name: string }[] = []; + + const mimeTypes: Record = { + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.svg': 'image/svg+xml', + }; + + const loadFromDir = (dir: string) => { + if (!fs.existsSync(dir)) return; + + const files = fs.readdirSync(dir) + .filter(f => /\.(png|jpe?g|svg)$/i.test(f)) + .sort(); + + for (const file of files) { + const ext = path.extname(file).toLowerCase(); + const mime = mimeTypes[ext] || 'image/png'; + const raw = fs.readFileSync(path.join(dir, file)); + const b64 = raw.toString('base64'); + const name = path.basename(file, ext); + allSponsors.push({ dataUri: `data:${mime};base64,${b64}`, name }); + } + }; + + loadFromDir(SPONSORS_DIR_TIER1); + loadFromDir(SPONSORS_DIR_TIER2); + + return allSponsors; +} + +const LOGO_C_PATH = `M1036.26,1002.28h237.87l-.93,19.09c-8.38,110.32-49.81,198.3-123.82,262.07-73.09,63.31-170.84,95.43-290.48,95.43-130.81,0-235.55-44.69-311.43-133.6-74.48-87.98-112.65-209.48-112.65-361.23v-60.51c0-96.83,17.7-183.41,51.68-257.43,34.91-74.95,85.19-133.61,149.89-173.63,64.7-40.04,140.12-60.52,225.3-60.52,117.77,0,214.13,32.12,286.29,95.9,72.62,63.3,114.98,153.61,126.15,267.67l1.86,19.08h-238.34l-.93-15.83c-4.65-59.11-20.95-101.94-47.95-127.08-27-25.6-69.83-38.17-127.08-38.17-61.91,0-107.06,20.95-137.33,65.17-31.65,45.15-47.94,117.77-48.87,215.53v74.48c0,102.41,15.36,177.83,45.62,223.91,28.86,44.22,74.01,65.63,137.79,65.63,58.19,0,101.48-12.57,128.95-38.17,26.99-25.14,43.29-66.1,47.48-121.5l.93-16.3Z`; + 
+interface ResultFile { + timestamp: string; + results: ThroughputBenchmarkResult[]; +} + +function formatProviderName(s: string): string { + return s.charAt(0).toUpperCase() + s.slice(1); +} + +function formatSeconds(ms: number): string { + return (ms / 1000).toFixed(2) + 's'; +} + +function formatMs(ms: number): string { + return `${Math.round(ms)}ms`; +} + +const sponsorImages = loadSponsorImages(); + +function generateSVG(results: ThroughputBenchmarkResult[], timestamp: string): string { + if (!results.every(r => r.compositeScore !== undefined)) { + computeThroughputCompositeScores(results); + } + + const sorted = sortThroughputByCompositeScore(results).filter(r => !r.skipped); + + const rowHeight = 44; + const headerHeight = 110; + const tableHeaderHeight = 44; + const padding = 24; + const width = 1280; + const tableTop = headerHeight + padding; + const tableBottom = tableTop + tableHeaderHeight + (sorted.length * rowHeight); + const footnoteHeight = 20; + + const height = tableBottom + padding + 30 + footnoteHeight; + + const cols = { + rank: 40, + provider: 80, + score: 240, + aps: 360, + task: 500, + taskP95: 640, + screenshot: 780, + create: 920, + total: 1040, + status: 1180, + }; + + const title = 'Browser Step Throughput Benchmarks'; + const subtitle = '50-action Wikipedia loop per session — agent-style sequential actions'; + + let svg = ` + + + + + + + + + + + + + + + + + + + + + ${title} + ${subtitle} +${sponsorImages.length > 0 ? 
(() => { + const logoW = 100; + const logoH = 32; + const logoGap = 12; + const totalLogosW = sponsorImages.length * logoW + (sponsorImages.length - 1) * logoGap; + const logosStartX = width - padding - totalLogosW; + return ` + + SPONSORED BY + ${sponsorImages.map((img, i) => ``).join('\n ')}`; +})() + : ''} + + + + + # + Provider + Score + APS (med) + Task (med) + Task (p95) + Screenshot + Create + Total + Status +`; + + sorted.forEach((r, i) => { + const y = tableTop + tableHeaderHeight + (i * rowHeight) + 30; + const rank = i + 1; + const aps = r.summary.actionsPerSecond.median; + const taskMed = r.summary.taskMs.median; + const taskP95 = r.summary.taskMs.p95; + const totalMs = r.summary.totalMs.median; + const screenshotMed = r.summary.perActionType.screenshot?.median ?? 0; + const createMed = r.summary.createMs.median; + + const expectedActions = 50; + const fullSuccess = r.iterations.filter(it => !it.error && it.actionsCompleted === expectedActions).length; + const total = r.iterations.length; + const allFailed = fullSuccess === 0; + const score = r.compositeScore !== undefined ? r.compositeScore.toFixed(1) : '--'; + + let speedClass = allFailed ? 'slow' : 'fast'; + if (!allFailed && aps < 2.0) speedClass = 'slow'; + else if (!allFailed && aps < 3.5) speedClass = 'medium'; + + let rankClass = 'rank'; + if (rank === 1) rankClass = 'rank rank-1'; + else if (rank === 2) rankClass = 'rank rank-2'; + else if (rank === 3) rankClass = 'rank rank-3'; + + const apsDisplay = allFailed ? '--' : `${aps.toFixed(2)}/s`; + const taskDisplay = allFailed ? '--' : formatSeconds(taskMed); + const taskP95Display = allFailed ? '--' : formatSeconds(taskP95); + const screenshotDisplay = allFailed ? '--' : formatMs(screenshotMed); + const createDisplay = allFailed ? '--' : formatSeconds(createMed); + const totalDisplay = allFailed ? 
'--' : formatSeconds(totalMs); + + svg += ` + + ${rank} + ${formatProviderName(r.provider)} + ${score} + ${apsDisplay} + ${taskDisplay} + ${taskP95Display} + ${screenshotDisplay} + ${createDisplay} + ${totalDisplay} + ${fullSuccess}/${total} +`; + + if (i < sorted.length - 1) { + const lineY = tableTop + tableHeaderHeight + ((i + 1) * rowHeight); + svg += ` +`; + } + }); + + const date = new Date(timestamp).toLocaleDateString('en-US', { + year: 'numeric', + month: 'short', + day: 'numeric', + hour: '2-digit', + minute: '2-digit', + timeZoneName: 'short' + }); + + svg += ` + + Last updated: ${date} + + + Each session runs 50 sequential actions (navigate, wait, screenshot, textContent, click, goBack) on Wikipedia. Higher APS is better. + +`; + + return svg; +} + +function main() { + const latestPath = path.join(RESULTS_DIR, 'latest.json'); + + if (!fs.existsSync(latestPath)) { + console.error(`No throughput benchmark results found at ${latestPath}`); + process.exit(1); + } + + const raw = fs.readFileSync(latestPath, 'utf-8'); + const data: ResultFile = JSON.parse(raw); + + const svg = generateSVG(data.results, data.timestamp); + const svgPath = path.join(ROOT, 'browser-throughput.svg'); + fs.writeFileSync(svgPath, svg); + console.log(`SVG written to ${svgPath}`); +} + +main(); diff --git a/src/browser/throughput-benchmark.ts b/src/browser/throughput-benchmark.ts new file mode 100644 index 0000000..4dc79f5 --- /dev/null +++ b/src/browser/throughput-benchmark.ts @@ -0,0 +1,378 @@ +import { chromium, type Browser, type Page } from 'playwright-core'; +import { withTimeout } from '../util/timeout.js'; +import { + ACTION_TYPES, + type ActionResult, + type ActionType, + type ThroughputBenchmarkResult, + type ThroughputProviderConfig, + type ThroughputStats, + type ThroughputStatsTriple, + type ThroughputTimingResult, +} from './throughput-types.js'; + +const RANDOM_URL = 'https://en.wikipedia.org/wiki/Special:Random'; +const FIRST_HEADING = '#firstHeading'; +const 
ARTICLE_LINK = '#mw-content-text a[href^="/wiki/"]:not([href*=":"])'; +const LOOPS_PER_SESSION = 5; +const ACTIONS_PER_LOOP = 10; +const ACTIONS_PER_SESSION = LOOPS_PER_SESSION * ACTIONS_PER_LOOP; // 50 + +const ACTION_TIMEOUT_MS = 30_000; + +function round(n: number): number { + return Math.round(n * 100) / 100; +} + +function percentile(sorted: number[], p: number): number { + if (sorted.length === 0) return 0; + const idx = Math.ceil((p / 100) * sorted.length) - 1; + return sorted[Math.min(Math.max(idx, 0), sorted.length - 1)]; +} + +function computeStats(values: number[]): ThroughputStatsTriple { + if (values.length === 0) return { median: 0, p95: 0, p99: 0 }; + + const sorted = [...values].sort((a, b) => a - b); + // Trim 5% tails when we have enough samples to make trimming meaningful + const trimCount = Math.floor(sorted.length * 0.05); + const trimmed = trimCount > 0 && sorted.length - 2 * trimCount > 0 + ? sorted.slice(trimCount, sorted.length - trimCount) + : sorted; + + const mid = Math.floor(trimmed.length / 2); + const median = trimmed.length % 2 === 0 + ? (trimmed[mid - 1] + trimmed[mid]) / 2 + : trimmed[mid]; + + return { + median, + p95: percentile(trimmed, 95), + p99: percentile(trimmed, 99), + }; +} + +async function timeAction( + fn: () => Promise, +): Promise<{ durationMs: number; success: boolean; error?: string; value?: T }> { + const start = performance.now(); + try { + const value = await withTimeout(fn(), ACTION_TIMEOUT_MS, 'Action timed out'); + return { durationMs: performance.now() - start, success: true, value }; + } catch (err) { + const error = err instanceof Error ? err.message : String(err); + return { durationMs: performance.now() - start, success: false, error }; + } +} + +async function runActionLoop(page: Page, results: ActionResult[]): Promise { + for (let loop = 0; loop < LOOPS_PER_SESSION; loop++) { + const baseIdx = loop * ACTIONS_PER_LOOP; + + // 1. 
Navigate to a random article + { + const r = await timeAction(() => + page.goto(RANDOM_URL, { waitUntil: 'load' }) as Promise, + ); + results.push({ index: baseIdx + 1, type: 'navigate', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 2. Wait for #firstHeading + { + const r = await timeAction(() => page.waitForSelector(FIRST_HEADING)); + results.push({ index: baseIdx + 2, type: 'waitForSelector', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 3. Screenshot + { + const r = await timeAction(() => page.screenshot()); + results.push({ index: baseIdx + 3, type: 'screenshot', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 4. Read text content of #firstHeading + { + const r = await timeAction(() => page.textContent(FIRST_HEADING)); + results.push({ index: baseIdx + 4, type: 'textContent', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 5. Click first article link (filter out meta pages like Help:, File:, etc.) + { + const r = await timeAction(async () => { + const link = await page.waitForSelector(ARTICLE_LINK, { timeout: 10_000 }); + await link.click(); + }); + results.push({ index: baseIdx + 5, type: 'click', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 6. Wait for #firstHeading on the new page + { + const r = await timeAction(() => page.waitForSelector(FIRST_HEADING)); + results.push({ index: baseIdx + 6, type: 'waitForSelector', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 7. Screenshot the new page + { + const r = await timeAction(() => page.screenshot()); + results.push({ index: baseIdx + 7, type: 'screenshot', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 8. 
Read text content of #firstHeading on the new page + { + const r = await timeAction(() => page.textContent(FIRST_HEADING)); + results.push({ index: baseIdx + 8, type: 'textContent', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 9. Go back. Use `waitUntil: 'commit'` because back-forward cache restores + // fire `pageshow` instead of `load`, and Playwright's default + // `waitUntil: 'load'` hangs for the full timeout on a bfcache restore. + // Resolving on commit returns as soon as the navigation lands; the next + // waitForSelector confirms #firstHeading is present. + { + const r = await timeAction(() => page.goBack({ waitUntil: 'commit' }) as Promise); + results.push({ index: baseIdx + 9, type: 'goBack', durationMs: r.durationMs, success: r.success, error: r.error }); + } + + // 10. Wait for #firstHeading on the previous page + { + const r = await timeAction(() => page.waitForSelector(FIRST_HEADING)); + results.push({ index: baseIdx + 10, type: 'waitForSelector', durationMs: r.durationMs, success: r.success, error: r.error }); + } + } +} + +async function runThroughputIteration( + provider: any, + timeout: number, + sessionCreateOptions: Record, +): Promise { + const totalStart = performance.now(); + const actions: ActionResult[] = []; + let createMs = 0; + let connectMs = 0; + let releaseMs = 0; + + let session: { sessionId: string; connectUrl: string } | undefined; + let browser: Browser | undefined; + let iterationError: string | undefined; + + try { + // 1. Create session + const createStart = performance.now(); + session = await withTimeout( + provider.session.create(sessionCreateOptions), + timeout, + 'Session creation timed out', + ) as { sessionId: string; connectUrl: string }; + createMs = performance.now() - createStart; + + // 2. 
Connect over CDP + const connectStart = performance.now(); + browser = await withTimeout( + chromium.connectOverCDP(session.connectUrl), + 30_000, + 'CDP connection timed out', + ); + + const [context] = browser.contexts(); + if (!context) throw new Error('No default browser context found'); + const [existingPage] = context.pages(); + const page = existingPage ?? await context.newPage(); + connectMs = performance.now() - connectStart; + + // 3. Run the 50-action loop. Individual action failures are recorded but + // do not abort the session. + await runActionLoop(page, actions); + } catch (err) { + iterationError = err instanceof Error ? err.message : String(err); + } finally { + if (browser) { + await browser.close().catch(() => { }); + } + if (session) { + const releaseStart = performance.now(); + try { + await withTimeout( + provider.session.destroy(session.sessionId), + 15_000, + 'Session destroy timed out', + ); + } catch { + // Swallow release errors — they're recorded via releaseMs but should + // not mask the more important action timings. + } + releaseMs = performance.now() - releaseStart; + } + } + + const totalMs = performance.now() - totalStart; + const taskMs = actions.reduce((sum, a) => sum + a.durationMs, 0); + const actionsCompleted = actions.filter(a => a.success).length; + const actionsPerSecond = taskMs > 0 ? actionsCompleted / (taskMs / 1000) : 0; + + return { + createMs, + connectMs, + actions, + releaseMs, + totalMs, + taskMs, + actionsCompleted, + actionsPerSecond, + ...(iterationError ? 
{ error: iterationError } : {}), + }; +} + +function summarizeIterations(iterations: ThroughputTimingResult[]): ThroughputStats { + const createValues = iterations.map(i => i.createMs).filter(v => v > 0); + const taskValues = iterations.map(i => i.taskMs).filter(v => v > 0); + const totalValues = iterations.map(i => i.totalMs).filter(v => v > 0); + const apsValues = iterations.map(i => i.actionsPerSecond).filter(v => v > 0); + + const perActionType = {} as Record; + for (const type of ACTION_TYPES) { + const values: number[] = []; + for (const iter of iterations) { + for (const a of iter.actions) { + if (a.type === type && a.success) values.push(a.durationMs); + } + } + perActionType[type] = computeStats(values); + } + + return { + createMs: computeStats(createValues), + taskMs: computeStats(taskValues), + totalMs: computeStats(totalValues), + actionsPerSecond: computeStats(apsValues), + perActionType, + }; +} + +function emptySummary(): ThroughputStats { + const empty: ThroughputStatsTriple = { median: 0, p95: 0, p99: 0 }; + const perActionType = {} as Record; + for (const t of ACTION_TYPES) perActionType[t] = { ...empty }; + return { + createMs: { ...empty }, + taskMs: { ...empty }, + totalMs: { ...empty }, + actionsPerSecond: { ...empty }, + perActionType, + }; +} + +export async function runThroughputBenchmark( + config: ThroughputProviderConfig, +): Promise { + const { + name, + iterations = 10, + timeout = 120_000, + requiredEnvVars, + sessionCreateOptions = {}, + } = config; + + const missingVars = requiredEnvVars.filter(v => !process.env[v]); + if (missingVars.length > 0) { + return { + provider: name, + mode: 'browser-throughput', + iterations: [], + summary: emptySummary(), + skipped: true, + skipReason: `Missing: ${missingVars.join(', ')}`, + }; + } + + const provider = config.createBrowserProvider(); + const results: ThroughputTimingResult[] = []; + + console.log(`\n--- Throughput Benchmark: ${name} (${iterations} sessions × ${ACTIONS_PER_SESSION} 
actions) ---`); + console.log('Sess Create Connect Task Release Total APS Actions'); + console.log('──── ─────── ─────── ─────── ─────── ─────── ───── ───────'); + + for (let i = 0; i < iterations; i++) { + const result = await runThroughputIteration(provider, timeout, sessionCreateOptions); + results.push(result); + + const pad = (n: number) => `${Math.round(n)}ms`.padStart(7); + const aps = result.actionsPerSecond.toFixed(1).padStart(5); + const status = `${result.actionsCompleted}/${ACTIONS_PER_SESSION}`; + const errSuffix = result.error ? ` ✗ ${result.error.slice(0, 50)}` : ''; + console.log( + `${String(i + 1).padStart(4)} ${pad(result.createMs)} ${pad(result.connectMs)} ${pad(result.taskMs)} ${pad(result.releaseMs)} ${pad(result.totalMs)} ${aps} ${status}${errSuffix}`, + ); + } + + return { + provider: name, + mode: 'browser-throughput', + iterations: results, + summary: summarizeIterations(results), + }; +} + +function roundStats(s: ThroughputStatsTriple): ThroughputStatsTriple { + return { median: round(s.median), p95: round(s.p95), p99: round(s.p99) }; +} + +export async function writeThroughputResultsJson( + results: ThroughputBenchmarkResult[], + outPath: string, +): Promise { + const fs = await import('fs'); + const os = await import('os'); + + const cleanResults = results.map(r => ({ + provider: r.provider, + mode: r.mode, + iterations: r.iterations.map(i => ({ + createMs: round(i.createMs), + connectMs: round(i.connectMs), + releaseMs: round(i.releaseMs), + totalMs: round(i.totalMs), + taskMs: round(i.taskMs), + actionsCompleted: i.actionsCompleted, + actionsPerSecond: round(i.actionsPerSecond), + actions: i.actions.map(a => ({ + index: a.index, + type: a.type, + durationMs: round(a.durationMs), + success: a.success, + ...(a.error ? { error: a.error } : {}), + })), + ...(i.error ? 
{ error: i.error } : {}), + })), + summary: { + createMs: roundStats(r.summary.createMs), + taskMs: roundStats(r.summary.taskMs), + totalMs: roundStats(r.summary.totalMs), + actionsPerSecond: roundStats(r.summary.actionsPerSecond), + perActionType: Object.fromEntries( + ACTION_TYPES.map(t => [t, roundStats(r.summary.perActionType[t])]), + ), + }, + ...(r.compositeScore !== undefined ? { compositeScore: round(r.compositeScore) } : {}), + ...(r.successRate !== undefined ? { successRate: round(r.successRate) } : {}), + ...(r.skipped ? { skipped: r.skipped, skipReason: r.skipReason } : {}), + })); + + const output = { + version: '1.0', + timestamp: new Date().toISOString(), + environment: { + node: process.version, + platform: os.platform(), + arch: os.arch(), + }, + config: { + iterations: results[0]?.iterations.length || 0, + actionsPerSession: ACTIONS_PER_SESSION, + timeoutMs: 120_000, + }, + results: cleanResults, + }; + + fs.writeFileSync(outPath, JSON.stringify(output, null, 2)); + console.log(`Results written to ${outPath}`); +} diff --git a/src/browser/throughput-providers.ts b/src/browser/throughput-providers.ts new file mode 100644 index 0000000..96c8549 --- /dev/null +++ b/src/browser/throughput-providers.ts @@ -0,0 +1,68 @@ +import { browserbase } from '@computesdk/browserbase'; +import { hyperbrowser } from '@computesdk/hyperbrowser'; +import { kernel } from '@computesdk/kernel'; +import { steel } from '@computesdk/steel'; +import type { ThroughputProviderConfig } from './throughput-types.js'; + +/** + * Throughput benchmark provider configurations. + * + * Mirrors src/browser/providers.ts but overrides sessionCreateOptions to + * include stealth + a 1920x1080 viewport for every provider — these are the + * settings agent workloads typically use. 
+ */ +const VIEWPORT = { width: 1920, height: 1080 }; + +export const throughputProviders: ThroughputProviderConfig[] = [ + { + name: 'browserbase', + requiredEnvVars: ['BROWSERBASE_API_KEY', 'BROWSERBASE_PROJECT_ID'], + createBrowserProvider: () => browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY!, + projectId: process.env.BROWSERBASE_PROJECT_ID!, + }), + sessionCreateOptions: { + region: 'us-east-1', + stealth: true, + headless: true, + viewport: VIEWPORT, + }, + }, + { + name: 'hyperbrowser', + requiredEnvVars: ['HYPERBROWSER_API_KEY'], + createBrowserProvider: () => hyperbrowser({ + apiKey: process.env.HYPERBROWSER_API_KEY!, + }), + sessionCreateOptions: { + region: 'us-east', + stealth: true, + headless: true, + viewport: VIEWPORT, + }, + }, + { + name: 'kernel', + requiredEnvVars: ['KERNEL_API_KEY'], + createBrowserProvider: () => kernel({ + apiKey: process.env.KERNEL_API_KEY!, + }), + sessionCreateOptions: { + stealth: true, + headless: true, + viewport: VIEWPORT, + }, + }, + { + name: 'steel', + requiredEnvVars: ['STEEL_API_KEY'], + createBrowserProvider: () => steel({ + apiKey: process.env.STEEL_API_KEY!, + }), + sessionCreateOptions: { + stealth: true, + headless: true, + viewport: VIEWPORT, + }, + }, +]; diff --git a/src/browser/throughput-scoring.ts b/src/browser/throughput-scoring.ts new file mode 100644 index 0000000..5637053 --- /dev/null +++ b/src/browser/throughput-scoring.ts @@ -0,0 +1,97 @@ +import type { ThroughputBenchmarkResult } from './throughput-types.js'; + +export interface ThroughputScoringWeights { + apsMedian: number; + taskMedian: number; + taskP95: number; + screenshotMedian: number; +} + +export const DEFAULT_THROUGHPUT_WEIGHTS: ThroughputScoringWeights = { + apsMedian: 0.40, // Throughput is the primary signal + taskMedian: 0.25, // Total task time + taskP95: 0.20, // Tail consistency + screenshotMedian: 0.15, // Vision-agent proxy +}; + +/** Linear score for actions/sec — 10 actions/sec saturates at 100. 
*/ +const APS_CEILING = 10; +/** Latency ceiling in ms — anything ≥ this scores 0. */ +const LATENCY_CEILING_MS = 30_000; + +function scoreThroughput(actionsPerSecond: number): number { + if (!Number.isFinite(actionsPerSecond) || actionsPerSecond <= 0) return 0; + return Math.max(0, Math.min(100, 100 * (actionsPerSecond / APS_CEILING))); +} + +function scoreLatency(valueMs: number): number { + if (!Number.isFinite(valueMs) || valueMs <= 0) return 0; + return Math.max(0, 100 * (1 - valueMs / LATENCY_CEILING_MS)); +} + +/** + * Compute the success rate for a throughput benchmark result (0 to 1). + * + * A session counts as successful iff it ran end-to-end without an iteration + * error AND completed all 50 actions. Partial completions still contribute + * timing data but are not counted as full successes. + */ +export function computeThroughputSuccessRate(result: ThroughputBenchmarkResult): number { + if (result.skipped || result.iterations.length === 0) return 0; + const expectedActions = 50; + const fullySuccessful = result.iterations.filter( + i => !i.error && i.actionsCompleted === expectedActions, + ).length; + return fullySuccessful / result.iterations.length; +} + +function computeThroughputScore( + result: ThroughputBenchmarkResult, + weights: ThroughputScoringWeights = DEFAULT_THROUGHPUT_WEIGHTS, +): number { + const screenshotMedian = result.summary.perActionType.screenshot?.median ?? 0; + return ( + weights.apsMedian * scoreThroughput(result.summary.actionsPerSecond.median) + + weights.taskMedian * scoreLatency(result.summary.taskMs.median) + + weights.taskP95 * scoreLatency(result.summary.taskMs.p95) + + weights.screenshotMedian * scoreLatency(screenshotMedian) + ); +} + +/** + * Compute composite scores for all throughput results and attach them. 
+ * + * Formula: compositeScore = throughputScore × successRate + */ +export function computeThroughputCompositeScores( + results: ThroughputBenchmarkResult[], + weights: ThroughputScoringWeights = DEFAULT_THROUGHPUT_WEIGHTS, +): void { + for (const result of results) { + const successRate = computeThroughputSuccessRate(result); + result.successRate = successRate; + + if (result.skipped || successRate === 0) { + result.compositeScore = 0; + continue; + } + + const baseScore = computeThroughputScore(result, weights); + result.compositeScore = Math.round(baseScore * successRate * 100) / 100; + } +} + +/** + * Sort throughput benchmark results by composite score (highest first). + * Skipped providers are always last. + */ +export function sortThroughputByCompositeScore( + results: ThroughputBenchmarkResult[], +): ThroughputBenchmarkResult[] { + return [...results].sort((a, b) => { + if (a.skipped && !b.skipped) return 1; + if (!a.skipped && b.skipped) return -1; + if (a.skipped && b.skipped) return 0; + return (b.compositeScore ?? 0) - (a.compositeScore ?? 
0); + }); +} diff --git a/src/browser/throughput-types.ts b/src/browser/throughput-types.ts new file mode 100644 index 0000000..4208921 --- /dev/null +++ b/src/browser/throughput-types.ts @@ -0,0 +1,70 @@ +export type ActionType = 'navigate' | 'waitForSelector' | 'screenshot' | 'textContent' | 'click' | 'goBack'; + +export const ACTION_TYPES: ActionType[] = [ + 'navigate', + 'waitForSelector', + 'screenshot', + 'textContent', + 'click', + 'goBack', +]; + +export interface ActionResult { + /** 1-based index of the action within the session (1-50) */ + index: number; + type: ActionType; + durationMs: number; + success: boolean; + error?: string; +} + +export interface ThroughputTimingResult { + createMs: number; + connectMs: number; + actions: ActionResult[]; + releaseMs: number; + totalMs: number; + /** How many of the 50 actions succeeded */ + actionsCompleted: number; + /** actionsCompleted / (taskMs / 1000) */ + actionsPerSecond: number; + /** Sum of action durations */ + taskMs: number; + error?: string; +} + +export interface ThroughputStatsTriple { + median: number; + p95: number; + p99: number; +} + +export interface ThroughputStats { + createMs: ThroughputStatsTriple; + taskMs: ThroughputStatsTriple; + totalMs: ThroughputStatsTriple; + actionsPerSecond: ThroughputStatsTriple; + perActionType: Record; +} + +export interface ThroughputBenchmarkResult { + provider: string; + mode: 'browser-throughput'; + iterations: ThroughputTimingResult[]; + summary: ThroughputStats; + /** Composite weighted score (0-100, higher = better). Computed post-benchmark. */ + compositeScore?: number; + /** Success rate as a fraction (0 to 1). Computed post-benchmark. 
*/ + successRate?: number; + skipped?: boolean; + skipReason?: string; +} + +export interface ThroughputProviderConfig { + name: string; + iterations?: number; + timeout?: number; + requiredEnvVars: string[]; + createBrowserProvider: () => any; + sessionCreateOptions?: Record; +} diff --git a/src/merge-results.ts b/src/merge-results.ts index eac75c2..2daad57 100644 --- a/src/merge-results.ts +++ b/src/merge-results.ts @@ -1,7 +1,7 @@ /** * Merge per-provider benchmark results into combined result files. * - * Usage: tsx src/merge-results.ts --input [--mode storage|browser] + * Usage: tsx src/merge-results.ts --input [--mode storage|browser|browser-throughput] * * By default, merges sandbox benchmark results: reads latest.json files from * the input directory, groups by mode (sequential/staggered/burst), computes @@ -14,6 +14,9 @@ * With --mode browser, merges browser benchmark results: deduplicates by * provider, computes browser-specific composite scores, and writes combined * files to results/browser/latest.json. + * + * With --mode browser-throughput, merges throughput benchmark results into + * results/browser-throughput/latest.json. 
*/ import fs from 'fs'; import path from 'path'; @@ -21,10 +24,15 @@ import { fileURLToPath } from 'url'; import { computeCompositeScores } from './sandbox/scoring.js'; import { computeStorageCompositeScores, sortStorageByCompositeScore } from './storage/scoring.js'; import { computeBrowserCompositeScores, sortBrowserByCompositeScore } from './browser/scoring.js'; +import { + computeThroughputCompositeScores, + sortThroughputByCompositeScore, +} from './browser/throughput-scoring.js'; import { printResultsTable, writeResultsJson } from './sandbox/table.js'; import type { BenchmarkResult } from './sandbox/types.js'; import type { StorageBenchmarkResult } from './storage/types.js'; import type { BrowserBenchmarkResult } from './browser/types.js'; +import type { ThroughputBenchmarkResult } from './browser/throughput-types.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const ROOT = path.resolve(__dirname, '..'); @@ -372,7 +380,104 @@ async function mainBrowser() { console.log(`Copied latest: ${latestPath}`); } -const runner = mergeMode === 'storage' ? mainStorage : mergeMode === 'browser' ? mainBrowser : main; +/** + * Print a browser-throughput results table to stdout. 
+ */ +function printThroughputResultsTable(results: ThroughputBenchmarkResult[]): void { + const sorted = sortThroughputByCompositeScore(results); + + console.log(`\n${'='.repeat(120)}`); + console.log(' BROWSER THROUGHPUT BENCHMARK RESULTS'); + console.log('='.repeat(120)); + console.log( + ['Provider', 'Score', 'APS (med)', 'Task (med)', 'Task (p95)', 'Screenshot', 'Create', 'Status'] + .map((h, i) => h.padEnd([14, 8, 12, 12, 12, 12, 12, 10][i])) + .join(' | ') + ); + console.log( + [14, 8, 12, 12, 12, 12, 12, 10].map(w => '-'.repeat(w)).join('-+-') + ); + + for (const r of sorted) { + if (r.skipped) { + console.log([r.provider.padEnd(14), '--'.padEnd(8), '--'.padEnd(12), '--'.padEnd(12), '--'.padEnd(12), '--'.padEnd(12), '--'.padEnd(12), 'SKIPPED'.padEnd(10)].join(' | ')); + continue; + } + const expectedActions = 50; + const fullSuccess = r.iterations.filter(i => !i.error && i.actionsCompleted === expectedActions).length; + const total = r.iterations.length; + const score = r.compositeScore !== undefined ? r.compositeScore.toFixed(1) : '--'; + const aps = `${r.summary.actionsPerSecond.median.toFixed(2)}/s`; + const taskMed = `${(r.summary.taskMs.median / 1000).toFixed(2)}s`; + const taskP95 = `${(r.summary.taskMs.p95 / 1000).toFixed(2)}s`; + const screenshotMed = `${Math.round(r.summary.perActionType.screenshot?.median ?? 0)}ms`; + const createMed = `${(r.summary.createMs.median / 1000).toFixed(2)}s`; + console.log([r.provider.padEnd(14), score.padEnd(8), aps.padEnd(12), taskMed.padEnd(12), taskP95.padEnd(12), screenshotMed.padEnd(12), createMed.padEnd(12), `${fullSuccess}/${total} OK`.padEnd(10)].join(' | ')); + } + console.log('='.repeat(120)); +} + +/** + * Merge browser-throughput benchmark results. 
+ */ +async function mainBrowserThroughput() { + const jsonFiles: string[] = []; + function walk(dir: string) { + if (!fs.existsSync(dir)) return; + for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { + const full = path.join(dir, entry.name); + if (entry.isDirectory()) walk(full); + else if (entry.name === 'latest.json') jsonFiles.push(full); + } + } + walk(inputDir!); + + if (jsonFiles.length === 0) { + console.error(`No latest.json files found in ${inputDir}`); + process.exit(1); + } + + console.log(`Found ${jsonFiles.length} result files`); + + const seen = new Map(); + + for (const file of jsonFiles) { + const raw = JSON.parse(fs.readFileSync(file, 'utf-8')) as { results: ThroughputBenchmarkResult[] }; + const fromSingleProvider = raw.results.length === 1; + for (const result of raw.results) { + const existing = seen.get(result.provider); + if (!existing || (fromSingleProvider && !existing.fromSingleProvider)) { + seen.set(result.provider, { result, fromSingleProvider }); + } + } + } + + const deduped = Array.from(seen.values()).map(e => e.result); + console.log(`\nMerging ${deduped.length} provider results for mode: browser-throughput`); + + computeThroughputCompositeScores(deduped); + printThroughputResultsTable(deduped); + + const { writeThroughputResultsJson } = await import('./browser/throughput-benchmark.js'); + const timestamp = new Date().toISOString().slice(0, 10); + const resultsDir = path.resolve(ROOT, 'results/browser-throughput'); + fs.mkdirSync(resultsDir, { recursive: true }); + + const outPath = path.join(resultsDir, `${timestamp}.json`); + await writeThroughputResultsJson(deduped, outPath); + + const latestPath = path.join(resultsDir, 'latest.json'); + fs.copyFileSync(outPath, latestPath); + console.log(`Copied latest: ${latestPath}`); +} + +const runner = mergeMode === 'storage' + ? mainStorage + : mergeMode === 'browser' + ? mainBrowser + : mergeMode === 'browser-throughput' + ? 
mainBrowserThroughput + : main; runner().catch(err => { console.error('Merge failed:', err); process.exit(1); diff --git a/src/run.ts b/src/run.ts index fdd5962..27f081c 100644 --- a/src/run.ts +++ b/src/run.ts @@ -9,16 +9,20 @@ import { runConcurrentBenchmark } from './sandbox/concurrent.js'; import { runStaggeredBenchmark } from './sandbox/staggered.js'; import { runStorageBenchmark, writeStorageResultsJson } from './storage/benchmark.js'; import { runBrowserBenchmark, writeBrowserResultsJson } from './browser/benchmark.js'; +import { runThroughputBenchmark, writeThroughputResultsJson } from './browser/throughput-benchmark.js'; import { printResultsTable, writeResultsJson } from './sandbox/table.js'; import { providers } from './sandbox/providers.js'; import { storageProviders } from './storage/providers.js'; import { browserProviders } from './browser/providers.js'; +import { throughputProviders } from './browser/throughput-providers.js'; import { computeCompositeScores } from './sandbox/scoring.js'; import { computeStorageCompositeScores } from './storage/scoring.js'; import { computeBrowserCompositeScores } from './browser/scoring.js'; +import { computeThroughputCompositeScores } from './browser/throughput-scoring.js'; import type { BenchmarkResult, BenchmarkMode } from './sandbox/types.js'; import type { StorageBenchmarkResult } from './storage/types.js'; import type { BrowserBenchmarkResult } from './browser/types.js'; +import type { ThroughputBenchmarkResult } from './browser/throughput-types.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); @@ -38,22 +42,24 @@ function getArgValue(args: string[], flag: string): string | undefined { } /** Resolve which modes to run */ -function getModesToRun(): BenchmarkMode[] | ['storage'] | ['browser'] { +function getModesToRun(): BenchmarkMode[] | ['storage'] | ['browser'] | ['browser-throughput'] { if (!rawMode) return ['sequential', 'staggered', 'burst']; if (rawMode === 'storage') return 
['storage']; if (rawMode === 'browser') return ['browser']; + if (rawMode === 'browser-throughput') return ['browser-throughput']; const m = rawMode === 'concurrent' ? 'burst' : rawMode as BenchmarkMode; return [m]; } /** Map mode to results subdirectory name */ -function modeToDir(m: BenchmarkMode | 'storage'): string { +function modeToDir(m: BenchmarkMode | 'storage' | 'browser-throughput'): string { switch (m) { case 'sequential': return 'sequential_tti'; case 'staggered': return 'staggered_tti'; case 'burst': case 'concurrent': return 'burst_tti'; case 'storage': return 'storage'; + case 'browser-throughput': return 'browser-throughput'; default: return `${m}_tti`; } } @@ -218,9 +224,81 @@ async function runBrowser(toRun: typeof browserProviders): Promise { console.log(`Copied latest: ${latestPath}`); } +async function runBrowserThroughput(toRun: typeof throughputProviders): Promise { + console.log('\n' + '='.repeat(70)); + console.log(' MODE: BROWSER THROUGHPUT'); + console.log(` Iterations per provider: ${iterations}`); + console.log('='.repeat(70)); + + const results: ThroughputBenchmarkResult[] = []; + + for (const providerConfig of toRun) { + const result = await runThroughputBenchmark({ ...providerConfig, iterations }); + results.push(result); + } + + // Compute composite scores + computeThroughputCompositeScores(results); + + // Print summary + console.log('\n--- Browser Throughput Benchmark Results ---'); + for (const r of results) { + if (r.skipped) { + console.log(`${r.provider}: SKIPPED (${r.skipReason})`); + continue; + } + const expectedActions = 50; + const fullSuccess = r.iterations.filter(i => !i.error && i.actionsCompleted === expectedActions).length; + const total = r.iterations.length; + const aps = r.summary.actionsPerSecond.median; + const taskMed = r.summary.taskMs.median; + const screenshotMed = r.summary.perActionType.screenshot?.median ?? 
0; + console.log(`${r.provider}:`); + console.log(` APS: ${aps.toFixed(2)}/s (median) — task ${(taskMed / 1000).toFixed(2)}s, screenshot ${Math.round(screenshotMed)}ms`); + console.log(` Score: ${r.compositeScore?.toFixed(1) || '--'} (${fullSuccess}/${total} OK)`); + } + + // Write JSON results to browser-throughput subdirectory + const timestamp = new Date().toISOString().slice(0, 10); + const resultsDir = path.resolve(__dirname, '../results/browser-throughput'); + fs.mkdirSync(resultsDir, { recursive: true }); + + const outPath = path.join(resultsDir, `${timestamp}.json`); + await writeThroughputResultsJson(results, outPath); + + // Copy results to latest.json + const latestPath = path.join(resultsDir, 'latest.json'); + fs.copyFileSync(outPath, latestPath); + console.log(`Copied latest: ${latestPath}`); +} + async function main() { const modes = getModesToRun(); + // Handle browser-throughput mode separately + if (modes[0] === 'browser-throughput') { + console.log('ComputeSDK Browser Throughput Benchmarks'); + console.log(`Date: ${new Date().toISOString()}\n`); + + const toRun = providerFilter + ? throughputProviders.filter(p => p.name === providerFilter) + : throughputProviders; + + if (toRun.length === 0) { + if (providerFilter) { + console.error(`Unknown browser-throughput provider: ${providerFilter}`); + console.error(`Available: ${throughputProviders.map(p => p.name).join(', ')}`); + } else { + console.error('No browser-throughput providers configured. 
Add entries to src/browser/throughput-providers.ts.'); + } + process.exit(1); + } + + await runBrowserThroughput(toRun); + console.log('\nAll browser-throughput tests complete.'); + return; + } + // Handle browser mode separately if (modes[0] === 'browser') { console.log('ComputeSDK Browser Provider Benchmarks'); From 06440b7fa4ded0f6204082b3b3cff9803d2cd373 Mon Sep 17 00:00:00 2001 From: Noah Kiser Date: Thu, 7 May 2026 17:18:39 +0000 Subject: [PATCH 2/2] fix: resolve PR comments --- THROUGHPUT.md | 2 +- src/browser/benchmark.ts | 14 +++++++++++--- src/browser/throughput-benchmark.ts | 9 +++++++-- src/browser/throughput-scoring.ts | 2 +- src/merge-results.ts | 2 +- src/run.ts | 23 ++++++++++++++++++----- 6 files changed, 39 insertions(+), 13 deletions(-) diff --git a/THROUGHPUT.md b/THROUGHPUT.md index 543acc9..e741c71 100644 --- a/THROUGHPUT.md +++ b/THROUGHPUT.md @@ -179,7 +179,7 @@ npm run generate-browser-throughput-svg ## Scheduling -The GitHub Actions workflow `browser-throughput-benchmarks.yml` runs daily at 01:00 UTC (offset from the lifecycle browser benchmark at 00:00) with 10 iterations per provider. Pull requests touching browser code run a faster 3-iteration version and post a comparison table as a PR comment. +The GitHub Actions workflow `browser-throughput-benchmarks.yml` runs daily at 03:00 UTC (offset from the lifecycle browser benchmark at 00:00) with 10 iterations per provider. Pull requests touching browser code run a faster 3-iteration version and post a comparison table as a PR comment. 
## Limitations diff --git a/src/browser/benchmark.ts b/src/browser/benchmark.ts index 7011561..75e7d43 100644 --- a/src/browser/benchmark.ts +++ b/src/browser/benchmark.ts @@ -163,7 +163,11 @@ function roundStats(s: { median: number; p95: number; p99: number }) { return { median: round(s.median), p95: round(s.p95), p99: round(s.p99) }; } -export async function writeBrowserResultsJson(results: BrowserBenchmarkResult[], outPath: string): Promise { +export async function writeBrowserResultsJson( + results: BrowserBenchmarkResult[], + outPath: string, + options: { timeoutMs?: number } = {}, +): Promise { const fs = await import('fs'); const os = await import('os'); @@ -190,6 +194,10 @@ export async function writeBrowserResultsJson(results: BrowserBenchmarkResult[], ...(r.skipped ? { skipped: r.skipped, skipReason: r.skipReason } : {}), })); + // Derive iteration count from the largest run across providers, so a + // skipped first provider doesn't make the header read 0. + const iterations = results.reduce((max, r) => Math.max(max, r.iterations.length), 0); + const output = { version: '1.0', timestamp: new Date().toISOString(), @@ -199,8 +207,8 @@ export async function writeBrowserResultsJson(results: BrowserBenchmarkResult[], arch: os.arch(), }, config: { - iterations: results[0]?.iterations.length || 0, - timeoutMs: 120000, + iterations, + timeoutMs: options.timeoutMs ?? 
120_000, }, results: cleanResults, }; diff --git a/src/browser/throughput-benchmark.ts b/src/browser/throughput-benchmark.ts index 4dc79f5..d89669b 100644 --- a/src/browser/throughput-benchmark.ts +++ b/src/browser/throughput-benchmark.ts @@ -319,6 +319,7 @@ function roundStats(s: ThroughputStatsTriple): ThroughputStatsTriple { export async function writeThroughputResultsJson( results: ThroughputBenchmarkResult[], outPath: string, + options: { timeoutMs?: number } = {}, ): Promise { const fs = await import('fs'); const os = await import('os'); @@ -357,6 +358,10 @@ export async function writeThroughputResultsJson( ...(r.skipped ? { skipped: r.skipped, skipReason: r.skipReason } : {}), })); + // Derive iteration count from the largest run across providers, so a + // skipped first provider doesn't make the header read 0. + const iterations = results.reduce((max, r) => Math.max(max, r.iterations.length), 0); + const output = { version: '1.0', timestamp: new Date().toISOString(), @@ -366,9 +371,9 @@ export async function writeThroughputResultsJson( arch: os.arch(), }, config: { - iterations: results[0]?.iterations.length || 0, + iterations, actionsPerSession: ACTIONS_PER_SESSION, - timeoutMs: 120_000, + timeoutMs: options.timeoutMs ?? 
120_000, }, results: cleanResults, }; diff --git a/src/browser/throughput-scoring.ts b/src/browser/throughput-scoring.ts index 5637053..e92f137 100644 --- a/src/browser/throughput-scoring.ts +++ b/src/browser/throughput-scoring.ts @@ -25,7 +25,7 @@ function scoreThroughput(actionsPerSecond: number): number { } function scoreLatency(valueMs: number): number { - if (!Number.isFinite(valueMs) || valueMs <= 0) return 0; + if (!Number.isFinite(valueMs)) return 0; return Math.max(0, 100 * (1 - valueMs / LATENCY_CEILING_MS)); } diff --git a/src/merge-results.ts b/src/merge-results.ts index 2daad57..cd1bc77 100644 --- a/src/merge-results.ts +++ b/src/merge-results.ts @@ -46,7 +46,7 @@ function getArgValue(flag: string): string | undefined { const inputDir = getArgValue('--input'); const mergeMode = getArgValue('--mode'); if (!inputDir) { - console.error('Usage: tsx src/merge-results.ts --input [--mode storage|browser]'); + console.error('Usage: tsx src/merge-results.ts --input [--mode storage|browser|browser-throughput]'); process.exit(1); } diff --git a/src/run.ts b/src/run.ts index 27f081c..28a01f6 100644 --- a/src/run.ts +++ b/src/run.ts @@ -29,7 +29,8 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url)); // Parse CLI args const args = process.argv.slice(2); const providerFilter = getArgValue(args, '--provider'); -const iterations = parseInt(getArgValue(args, '--iterations') || '100', 10); +const iterationsArg = getArgValue(args, '--iterations'); +const iterations = parseInt(iterationsArg || '100', 10); const rawMode = getArgValue(args, '--mode'); const concurrency = parseInt(getArgValue(args, '--concurrency') || '100', 10); const storageConcurrency = parseInt(getArgValue(args, '--storage-concurrency') || '1', 10); @@ -216,7 +217,8 @@ async function runBrowser(toRun: typeof browserProviders): Promise { fs.mkdirSync(resultsDir, { recursive: true }); const outPath = path.join(resultsDir, `${timestamp}.json`); - await writeBrowserResultsJson(results, outPath); 
+ const timeoutMs = toRun.reduce((max, p) => Math.max(max, p.timeout ?? 120_000), 0) || 120_000; + await writeBrowserResultsJson(results, outPath, { timeoutMs }); // Copy results to latest.json const latestPath = path.join(resultsDir, 'latest.json'); @@ -225,15 +227,25 @@ async function runBrowser(toRun: typeof browserProviders): Promise { } async function runBrowserThroughput(toRun: typeof throughputProviders): Promise { + // Throughput sessions are ~12s each, so we use a much lower default than + // the global iterations CLI value. Only override when --iterations was + // explicitly passed; otherwise let runThroughputBenchmark apply its own + // default (10 sessions per provider). + const throughputIterations = iterationsArg ? iterations : undefined; + console.log('\n' + '='.repeat(70)); console.log(' MODE: BROWSER THROUGHPUT'); - console.log(` Iterations per provider: ${iterations}`); + console.log(` Iterations per provider: ${throughputIterations ?? 10}`); console.log('='.repeat(70)); const results: ThroughputBenchmarkResult[] = []; for (const providerConfig of toRun) { - const result = await runThroughputBenchmark({ ...providerConfig, iterations }); + const result = await runThroughputBenchmark( + throughputIterations !== undefined + ? { ...providerConfig, iterations: throughputIterations } + : providerConfig, + ); results.push(result); } @@ -264,7 +276,8 @@ async function runBrowserThroughput(toRun: typeof throughputProviders): Promise< fs.mkdirSync(resultsDir, { recursive: true }); const outPath = path.join(resultsDir, `${timestamp}.json`); - await writeThroughputResultsJson(results, outPath); + const timeoutMs = toRun.reduce((max, p) => Math.max(max, p.timeout ?? 120_000), 0) || 120_000; + await writeThroughputResultsJson(results, outPath, { timeoutMs }); // Copy results to latest.json const latestPath = path.join(resultsDir, 'latest.json');