From 98e896390e74f5fadf8d5a2a978b97ae79a83d30 Mon Sep 17 00:00:00 2001 From: Jongsun Suh Date: Fri, 5 Jun 2026 14:54:15 -0400 Subject: [PATCH] Add platform domain plus testing and utility skills --- CHANGELOG.md | 4 + .../skills/resilient-api-collection/skill.md | 188 ++++++++++++++++++ .../browser-extension-profiling/skill.md | 65 ++++++ .../knowledge/extension-architecture.md | 89 +++++++++ .../platform/knowledge/mv3-service-worker.md | 94 +++++++++ .../repos/metamask-extension.md | 46 +++++ .../extension-errors-debugging/skill.md | 59 ++++++ .../extension-lifecycle-decoupling/skill.md | 55 +++++ .../benchmark-statistical-hygiene.md | 45 +++++ .../testing/skills/benchmark-design/skill.md | 71 +++++++ 10 files changed, 716 insertions(+) create mode 100644 domains/coding/skills/resilient-api-collection/skill.md create mode 100644 domains/performance/skills/browser-extension-profiling/skill.md create mode 100644 domains/platform/knowledge/extension-architecture.md create mode 100644 domains/platform/knowledge/mv3-service-worker.md create mode 100644 domains/platform/skills/extension-errors-debugging/repos/metamask-extension.md create mode 100644 domains/platform/skills/extension-errors-debugging/skill.md create mode 100644 domains/platform/skills/extension-lifecycle-decoupling/skill.md create mode 100644 domains/testing/knowledge/benchmark-statistical-hygiene.md create mode 100644 domains/testing/skills/benchmark-design/skill.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 383d49c7..a4a265e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add `platform` domain (extension errors/lifecycle debugging + architecture knowledge), `testing/benchmark-design`, `performance/browser-extension-profiling`, and `coding/resilient-api-collection` + ## [0.1.0] ### Added diff --git a/domains/coding/skills/resilient-api-collection/skill.md 
b/domains/coding/skills/resilient-api-collection/skill.md new file mode 100644 index 00000000..6d00ca51 --- /dev/null +++ b/domains/coding/skills/resilient-api-collection/skill.md @@ -0,0 +1,188 @@ +--- +name: resilient-api-collection +description: Build resilient data collection scripts that paginate APIs, handle rate limits, and retry transient errors. Use when writing scrapers, API collectors, data pipelines, or any script that fetches paginated data from external APIs (GitHub GraphQL, REST APIs, etc.). +--- + +# Resilient API Collection Scripts + +## Core Architecture + +Every collection script needs these layers: + +``` +run_query() → single request with retry + error classification +fetch_all_pages() → pagination loop with adaptive page sizing +main() → orchestration, dedup, persistence +``` + +## 1. Error Classification + +Classify errors **before** choosing a recovery strategy. Different errors need different fixes. + +| Error Type | Signal | Recovery | +|---|---|---| +| **Resource/complexity limit** | Query too expensive for server | Reduce page size | +| **Rate limit** (primary) | 429, `X-RateLimit-Remaining: 0` | Wait until reset time | +| **Rate limit** (secondary) | 403 + "secondary rate limit" | Exponential backoff (start 60s) | +| **Transient server error** | 502, 503, 504, stream reset | Retry with exponential backoff | +| **Client error** | 400, 401, 404 | Don't retry — fix the request | + +### CLI tools hide error details + +Tools like `gh`, `curl`, `httpie` surface errors differently than raw HTTP responses: + +- **`gh api graphql`**: "Resource limits exceeded" appears in `stderr` with non-zero exit code, NOT in the JSON response `errors` array. Always check `stderr` first, before checking `returncode`. +- Rate limit info may be in response headers (not visible via CLI) or in error messages. 
+ +```python +# Check stderr BEFORE returncode — some errors are in stderr even on exit 0 +stderr = result.stderr.strip() + +if "Resource limits" in stderr or "resource limit" in stderr.lower(): + return RESOURCE_LIMIT_SIGNAL # caller reduces page size + +if result.returncode == 0: + data = json.loads(result.stdout) + # Also check JSON errors (some APIs put limits here) + if "errors" in data: + msg = data["errors"][0].get("message", "") + if "Resource limits" in msg or "timeout" in msg.lower(): + return RESOURCE_LIMIT_SIGNAL + return data + +# Classify non-zero exit +is_transient = any(s in stderr for s in [ + "502", "503", "504", "429", "rate limit", + "secondary", "stream error", "CANCEL" +]) +``` + +## 2. Retry with Exponential Backoff + +```python +MAX_RETRIES = 5 +INITIAL_BACKOFF = 5 # seconds + +for attempt in range(1, MAX_RETRIES + 1): + result = execute_request(...) + + if success: + return result + if is_resource_limit(error): + return RESOURCE_LIMIT_SIGNAL # don't retry, reduce page size + if not is_transient(error): + return None # permanent failure + if attempt == MAX_RETRIES: + return None # exhausted + + wait = INITIAL_BACKOFF * (2 ** (attempt - 1)) + log(f"Transient error (attempt {attempt}/{MAX_RETRIES}), retrying in {wait}s") + time.sleep(wait) +``` + +Key: resource-limit errors should NOT be retried — the same query will fail identically. Signal the caller to reduce page size instead. + +## 3. Adaptive Page Sizing + +Start conservatively. Halve on resource-limit errors. Set a floor. 
+ +```python +MIN_PAGE_SIZE = 5 +MAX_REDUCTIONS = 4 +page_size = 50 # not 100 — nested sub-selections multiply complexity + +while has_more_pages: + data = run_query(..., page_size=page_size) + + if data == RESOURCE_LIMIT_SIGNAL: + reductions += 1 + if reductions > MAX_REDUCTIONS or page_size <= MIN_PAGE_SIZE: + break # can't go smaller + page_size = max(MIN_PAGE_SIZE, page_size // 2) + time.sleep(10) # cool down before retry + continue # retry same page with smaller size + + # process nodes, advance cursor... + time.sleep(2) # inter-page delay to avoid secondary rate limits +``` + +### Why 50, not 100? + +GraphQL query cost = `nodes × sub-selections`. A query fetching 100 PRs with `reviews(first:50)`, `participants(first:30)`, `commits(first:1)` easily exceeds GitHub's 500K node limit. Starting at 50 avoids most resource-limit errors. + +## 4. Deduplication and Incremental Collection + +Always dedup by natural key before writing. This lets re-runs extend existing data. + +```python +def dedup(existing, new, key_fn): + by_key = {} + for item in existing: + by_key[key_fn(item)] = item + for item in new: + by_key[key_fn(item)] = item # new overwrites old + return list(by_key.values()) + +# On write: +existing = load_json(path) if os.path.exists(path) else [] +final = dedup(existing, new_items, key_fn=lambda x: (x["repo"], x["number"])) +save_json(path, final) +``` + +## 5. Observability + +### Force unbuffered output + +Python buffers stdout when output is captured (subprocess, pipe, file redirect). Progress lines stay hidden until the buffer flushes, often not until the script exits. 
+ +```python +import sys +sys.stdout.reconfigure(line_buffering=True) +# OR run with: python3 -u script.py +``` + +### Log structure for monitoring + +``` +=== repo-name (query-type) === + Page 1: 50 nodes, hasNext=True (size=50) + Page 2: 50 nodes, hasNext=True (size=50) + Resource limit exceeded (page_size=50), signaling page-size reduction + Reducing page size to 25 and retrying page 3 (reduction 1/4) + Page 3: 25 nodes, hasNext=True (size=25) + ... + Total: 430 items, collected 430, 3169 sub-items +``` + +Every log line should include: page number, items returned, whether there are more pages, and current page size. + +## 6. Inter-Page Delays + +GitHub's secondary rate limit triggers on sustained request volume, not individual request cost. Add 2-3s between pages. + +```python +PAGE_DELAY = 2 # seconds + +# After each successful page: +time.sleep(PAGE_DELAY) + +# After a resource-limit reduction: +time.sleep(INITIAL_BACKOFF * 2) # longer cooldown +``` + +## Checklist + +When writing a collection script, verify: + +- [ ] Error classification distinguishes resource-limit from rate-limit from transient +- [ ] Resource-limit errors reduce page size (not retry same query) +- [ ] Transient errors retry with exponential backoff +- [ ] Non-retryable errors fail fast +- [ ] Page size starts at 50 or lower for nested queries +- [ ] Page size has a floor (5-10) and max-reduction cap +- [ ] Inter-page delay prevents secondary rate limits +- [ ] Output is unbuffered (`-u` flag or `reconfigure`) +- [ ] Each log line includes page number, count, hasNext, page size +- [ ] Data is deduped by natural key before writing +- [ ] Re-runs merge with existing data (incremental collection) +- [ ] Collection log records run metadata (timestamps, repos, filters) diff --git a/domains/performance/skills/browser-extension-profiling/skill.md b/domains/performance/skills/browser-extension-profiling/skill.md new file mode 100644 index 00000000..d956dea8 --- /dev/null +++ 
b/domains/performance/skills/browser-extension-profiling/skill.md @@ -0,0 +1,65 @@ +--- +maturity: experimental +name: browser-extension-profiling +description: Compare browser extension performance between branches using WDYR, React DevTools Profiler, and E2E benchmarks with statistical rigor. +--- + +# Browser Extension Profiling + +Methodology for profiling and comparing extension performance across branches or commits. + +## When To Use + +- Validating that a refactor reduces unnecessary re-renders (needs before/after comparison) +- Establishing baseline metrics for a performance initiative +- Investigating a reported UI slowdown in the extension + +## Do Not Use When + +- Single-run comparisons — statistical significance requires ≥10 runs per scenario +- The change touches only non-render paths (background scripts, network with no UI impact) +- Target behavior is server-side latency, not UI rendering + +## Workflow + +1. **Build both branches** with `yarn build:test` on the same machine and Chrome version + +2. **WDYR profiling** (unnecessary re-render counts) + ```bash + ENABLE_WHY_DID_YOU_RENDER=true yarn start + ``` + Flags to watch: + - `different objects that are equal by value` → object recreation + - `different functions with the same name` → callback recreation + - `props object itself changed but values equal` → parent cascade + +3. **React DevTools Profiler** for flame graphs and commit timings + ```bash + yarn devtools:react + ``` + +4. **E2E benchmarks** for scenario durations + ```bash + yarn test:e2e:benchmark + ``` + +5. **Collect ≥10 runs** per scenario. Discard top/bottom 10%. Report mean, median, stddev, p75, p95. + +6. **Statistical threshold:** absolute Cohen's d (|d|) > 0.5 for a meaningful difference; the sign only indicates direction. 
+ +## Common Pitfalls + +| Mistake | Correct Approach | +|---------|-----------------| +| Running branches on different machines or Chrome versions | Same machine, same Chrome, no other apps running | +| Pooling all runs including noisy late-session ones | Compute per-round stats first; report cleanest signal with explicit round attribution | +| Reporting absolute re-render counts without scenario context | Normalize per-action; cascade fixes show multiplied impact at root | +| Skipping cache and state reset between runs | Clear browser cache, reset extension state for each run | + +## Pre-Profiling Checklist + +- [ ] Both branches built with `yarn build:test` +- [ ] Same machine, same Chrome version +- [ ] No other tabs or applications running +- [ ] WDYR enabled: `ENABLE_WHY_DID_YOU_RENDER=true` +- [ ] Cache and extension state cleared between runs diff --git a/domains/platform/knowledge/extension-architecture.md b/domains/platform/knowledge/extension-architecture.md new file mode 100644 index 00000000..9b87656e --- /dev/null +++ b/domains/platform/knowledge/extension-architecture.md @@ -0,0 +1,89 @@ +--- +name: extension-architecture +domain: platform +description: MetaMask extension — background/UI boundary, state sync, build types, key directories +--- + +# Extension Architecture + +## Background / UI Boundary + +The extension runs two separate JavaScript contexts that cannot share memory. + +| Context | Entry | Access | +|---------|-------|--------| +| Background (Service Worker / background page) | `app/scripts/` | DOM-less; controllers, wallet logic | +| UI (popup/tab) | `ui/` | React + Redux; rendering only | +| Shared | `shared/` | Constants, utilities, type definitions | + +Communication is message-based (Chrome runtime messaging). Code in `app/scripts/` cannot `import` from `ui/` and vice versa. 
+ +## State Sync Flow + +``` +Controller state changes (app/scripts/) + ↓ +metamask-controller.js batches via debounce (200ms) + ↓ +UI receives batched state via sendUpdate + ↓ +Redux dispatches UPDATE_METAMASK_STATE + ↓ +Immer applies patches (structural sharing — unchanged paths keep stable references) + ↓ +useSelector evaluates; components re-render if output changed +``` + +Key file: `app/scripts/metamask-controller.js` — aggregates all controller state. + +## Build Types + +| Build | Command | Background | Security Policy | +|-------|---------|------------|-----------------| +| Development | `yarn start` | Webpack, hot reload | No LavaMoat | +| Production | `yarn dist` | Browserify | LavaMoat enforced | +| Test | `yarn build:test` | Browserify | Partial LavaMoat | + +LavaMoat restricts package capabilities at runtime. After adding/updating dependencies, run `yarn lavamoat:auto` to regenerate policies. + +## Manifest Versions + +| Version | Background | Lifecycle | +|---------|------------|-----------| +| MV3 (Chrome) | Service Worker | Can terminate and restart | +| MV2 (Firefox) | Background Page | Always running | + +Errors concentrated in MV3 (99%+) → root cause is service worker lifecycle, not application logic. 
+ +## Key Directories + +``` +app/scripts/ +├── controllers/ # Feature controllers (one per domain) +├── lib/ # Background utilities +└── metamask-controller.js # Main aggregator; 200ms debounce + +ui/ +├── components/ # Reusable React components +├── pages/ # Page-level components +│ ├── routes/ # routes.component.tsx (high selector count) +│ └── home/ # home.container.js (legacy connect()) +├── ducks/ # Redux slices +├── selectors/ # All selectors +│ ├── selectors.js # Main file (~2500 lines) +│ └── *.ts # Feature-specific selectors +└── contexts/ # React Context providers + +shared/ +├── constants/ +├── lib/ +└── modules/ + └── selectors/ + └── selector-creators.ts +``` + +## React Compiler Scope + +Enabled for `ui/components`, `ui/contexts`, `ui/hooks`, `ui/layouts`, `ui/pages`. + +Does NOT cross file boundaries — selector values from `useSelector` require manual `useMemo`. diff --git a/domains/platform/knowledge/mv3-service-worker.md b/domains/platform/knowledge/mv3-service-worker.md new file mode 100644 index 00000000..5acdc7a1 --- /dev/null +++ b/domains/platform/knowledge/mv3-service-worker.md @@ -0,0 +1,94 @@ +--- +name: mv3-service-worker +domain: platform +description: MV3 service worker lifecycle — Chrome background termination model, MetaMask's idle-termination mitigation, and cold-start failure modes +--- + +# MV3 Service Worker Lifecycle + +## MV2 vs MV3 + +| Manifest | Background | Default Lifecycle | Mitigated in MetaMask? | +|----------|------------|-------------------|------------------------| +| MV3 (Chrome) | Service Worker | Idle termination after 30s, hard cap ~5 min | Yes — see Idle Termination Mitigation | +| MV2 (Firefox) | Background Page | Always running | N/A | + +## Idle Termination Mitigation + +`app/scripts/background.js:750-758` runs a 2s `browser.storage.session` write loop. 
`saveTimestamp` (defined at `background.js:651-655`) writes an ISO timestamp into session storage: + + function saveTimestamp() { + const timestamp = new Date().toISOString(); + browser.storage.session.set({ timestamp }); + } + ... + const SAVE_TIMESTAMP_INTERVAL_MS = 2 * 1000; + saveTimestamp(); + setInterval(saveTimestamp, SAVE_TIMESTAMP_INTERVAL_MS); + +Each `chrome.*` / `browser.*` API call resets the 30s idle timer. At 2s cadence the worker stays alive indefinitely while the extension is active. `storage.session` (not `storage.local`) is deliberate — it is MV3-only, in-memory, and does not accumulate disk writes from a heartbeat. + +| Property | Value | +|---|---| +| API | `browser.storage.session.set` (MV3-only, in-memory) | +| Interval | 2000 ms (`SAVE_TIMESTAMP_INTERVAL_MS`) | +| Gate | `PreferencesController.enableMV3TimestampSave !== false` (default true) | +| Inline comment | `background.js:752` — "This keeps the service worker alive" | +| Pattern origin | De facto community consensus, not officially endorsed by Chrome DevRel | +| Re-verify if | Chromium policy change on idle-timer API interactions | + +Ongoing idle termination is **not** a live failure mode while the extension is running. Cold starts (browser launch, extension enable/reload, crash recovery) are the actual source of MV3-concentrated failures. + +## Verification Discipline + +Before attributing an MV3-concentrated error to "idle termination pressure": + +1. Verify `background.js:750-758` keepalive loop still exists and `saveTimestamp` still calls a `chrome.*` / `browser.*` API +2. Verify `enableMV3TimestampSave` is not disabled in affected Sentry events +3. Check whether error timing correlates with cold-start events, not idle periods + +If all three checks hold, the working hypothesis is cold-start cascade race, not ongoing termination. 
+ +## Error Concentration Signal + +| Distribution | Conclusion | +|---|---| +| ~50/50 MV3/MV2 | Application bug (affects both contexts equally) | +| 99%+ MV3 only | MV3 service worker lifecycle — check cold-start cascade before assuming idle termination | +| 99%+ MV2 only | Firefox-specific browser behavior | + +## Sentry Tag Dimensions + +Independent — do not conflate. + +| Tag | Meaning | +|-----|---------| +| `environment` | Build configuration (production, staging, development) | +| `installType` | How extension was loaded (normal, development, sideload, admin) | +| `dist` | Manifest version (mv3, mv2) | + +A production build can have `installType: development` if loaded unpacked. Filter carefully. + +## MV3-Specific Failure Modes + +| Failure | Cause | Mitigated? | +|---------|-------|------------| +| Cold-start cascade race (`APP_INIT_ALIVE` sent before UI listener bound) | `app-init.js` → dynamic-import `background.js` → listener registration races against an open port | No | +| `Background connection unresponsive` via ongoing idle termination | Worker idle-killed mid-session | Yes — 2s keepalive loop | +| `Background connection unresponsive` via cold-start latency | Cold start on browser launch + first-flush latency before `startUiSync` | No — keepalive does not apply before worker exists | +| Silent `postMessage` failure | Port disconnected during wake/termination, try/catch swallows error | No | +| In-memory state lost on cold start | New worker instance has empty in-memory state | No (fresh persistence read required) | + +## Sentry Diagnostic Instrumentation + +| Tag | Purpose | Status | +|-----|---------|--------| +| `uiStartup.receivedAppInitPing` | Distinguishes cold-start cascade race cases; `false` + `ALIVE` received ⇒ `APP_INIT_ALIVE` lost on cold start | Missing on `Background connection unresponsive` path as of 13.26.0 — instrumentation gap, being fixed | +| Phase-specific critical error types (`BACKGROUND_INITIALIZED`, `START_UI_SYNC`) | 
Distinguishes which startup phase hung | Added by 3-phase startup watchdog (PR #40306) | + +## When to Investigate MV3 Separately + +- Error volume is 10× higher in Chrome than Firefox +- Error involves background connectivity, keepalive, or startup handshake +- Error disappears when running with the worker kept alive manually +- Error correlates with browser-launch or extension-reload timestamps, not idle gaps diff --git a/domains/platform/skills/extension-errors-debugging/repos/metamask-extension.md b/domains/platform/skills/extension-errors-debugging/repos/metamask-extension.md new file mode 100644 index 00000000..5020917c --- /dev/null +++ b/domains/platform/skills/extension-errors-debugging/repos/metamask-extension.md @@ -0,0 +1,46 @@ +--- +repo: metamask-extension +parent: extension-errors-debugging +--- + +## Sentry Filters + +Filter by `dist` tag to isolate manifest version: +- `dist:mv3` — Chrome builds +- `dist:mv2` — Firefox builds + +Filter by `installType` to exclude developer-loaded builds: +- `installType:normal` — store-installed +- `installType:development` — sideloaded (unpacked); includes production builds loaded via developer mode + +## Build Commands + +```bash +# MV3 development (Chrome, service worker) +yarn start + +# MV2 development (Firefox, background page) +yarn start:mv2 + +# Production build (both manifests) +yarn dist + +# After dependency changes — regenerate LavaMoat policies +yarn lavamoat:auto +``` + +## Background Keepalive + +| Property | Value | +|---|---| +| Location | `app/scripts/background.js:750-758` | +| Function | `saveTimestamp` at `background.js:651-655` calls `browser.storage.session.set({ timestamp })` | +| Cadence | 2000 ms via `setInterval` | +| Effect | Each call resets Chrome's 30s SW idle timer — prevents idle eviction during active sessions | +| Gate | `PreferencesController.enableMV3TimestampSave !== false` | + +Active-session keepalive failures are rare and should be investigated as code bugs, not platform 
behavior. Cold-start cascade and first-flush latency are the actual MV3-concentrated failure modes — see `mv3-service-worker` knowledge for mechanism, failure modes table, and verification discipline. + +## Controller-Messenger Pattern + +Controllers communicate via `ControllerMessenger` (`@metamask/base-controller`). A controller's public API is its registered actions and events — not direct method calls. Cross-controller calls that bypass the messenger will not work across the background/UI boundary. diff --git a/domains/platform/skills/extension-errors-debugging/skill.md b/domains/platform/skills/extension-errors-debugging/skill.md new file mode 100644 index 00000000..885f59f8 --- /dev/null +++ b/domains/platform/skills/extension-errors-debugging/skill.md @@ -0,0 +1,59 @@ +--- +maturity: experimental +name: extension-errors-debugging +description: Diagnose browser extension errors — MV3 vs MV2, background/UI context, error tagging +--- + +# Extension Errors Debugging + +## When To Use + +- Errors appear in one manifest version but not the other +- Background connection or keepalive failures +- Errors that are hard to reproduce in development (they only appear in production) +- Diagnosing Sentry errors before attributing root cause + +## Do Not Use When + +- Local development errors with full stack traces and reliable repro +- Build/compile errors (TypeScript, ESLint, bundler) +- Test failures unrelated to extension runtime behavior + +## Workflow + +1. **Check distribution** — Filter by `dist` tag. Is the error 99%+ MV3, MV2, or split? +2. **Classify root cause** — MV3-only → service worker lifecycle (specifically cold-start cascade; ongoing idle termination is mitigated — see `mv3-service-worker` knowledge). Split → application logic. MV2-only → Firefox behavior. +3. **Identify context** — Is the error from background (`app/scripts/`) or UI (`ui/`)? Stack trace file paths reveal this. +4. 
**Check error tags** — Verify `environment`, `installType`, and `dist` are what you expect (these are independent dimensions). +5. **Reproduce** — Use `dist` tag filter to reproduce in the right manifest version. + +## Context Identification from Stack Traces + +| Path prefix in trace | Context | +|---------------------|---------| +| `app/scripts/controllers/` | Background controller | +| `app/scripts/metamask-controller.js` | Background aggregator | +| `ui/components/` or `ui/pages/` | UI (React) | +| `shared/` | Either — shared module | + +## Background-Specific Error Types + +| Error | MV3 Root Cause | Mitigated? | +|-------|---------------|------------| +| Background connection unresponsive (cold-start cascade) | `app-init.js` → `background.js` listener race on worker cold start | No | +| Background connection unresponsive (first-flush latency) | Cold start + background state aggregation before `startUiSync` | No | +| Background connection unresponsive (idle termination) | Worker idle-killed mid-session | Yes — 2s `browser.storage.session` keepalive | +| Port disconnected (wake/termination race) | Port closed during worker lifecycle transition; silent via try/catch | No | +| Keepalive timer missed (active session) | Would imply `browser.storage.session.set` interval failed — rare; investigate as application bug, not platform behavior | N/A | +| In-memory state lost (cold start) | New worker instance re-reads persisted state | No | + +## Common Pitfalls + +| Mistake | Correct Approach | +|---------|-----------------| +| Attribute 99% MV3 error to application code | Check if error requires running background; MV3 SW lifecycle is the likely root cause | +| Default to "SW was terminated mid-session" for MV3 errors | Ongoing idle termination is mitigated by the 2s `browser.storage.session` keepalive. 
The likely mechanism is cold-start cascade or first-flush latency — see `mv3-service-worker` knowledge | +| "Keepalive timer missed" ⇒ SW slept | The 2s keepalive prevents idle sleep while active. A missed keepalive during active session is a code bug, not platform behavior | +| Use `environment` to filter for dev builds | Use `installType: development` — a prod build can be sideloaded | +| Conflate `dist` and `environment` | They are independent; filter both when needed | +| Reproduce MV2-only error in Chrome | Use Firefox; `installType` doesn't replicate MV3/MV2 lifecycle difference | diff --git a/domains/platform/skills/extension-lifecycle-decoupling/skill.md b/domains/platform/skills/extension-lifecycle-decoupling/skill.md new file mode 100644 index 00000000..ef908aed --- /dev/null +++ b/domains/platform/skills/extension-lifecycle-decoupling/skill.md @@ -0,0 +1,55 @@ +--- +maturity: experimental +name: extension-lifecycle-decoupling +description: Verify platform lifecycle events before assuming they cause application-level side effects +--- + +# Extension Lifecycle Decoupling + +## When To Use + +- Estimating event frequency based on service worker eviction +- Debugging behavior that "should" trigger on lock/unlock but doesn't +- Investigating keepalive, timer, or state persistence behavior + +## Do Not Use When + +- Working on UI-only code with no background process interaction +- The behavior reproduces reliably in development without service worker eviction + +## Core Distinction + +| Layer | Examples | Characteristics | +|-------|---------|----------------| +| Platform lifecycle | SW eviction, page unload | Infrastructure-level | +| Application lifecycle | Lock, unlock, init | User-level | + +These layers are often **decoupled**. The mapping between them is an implementation detail — verify it, don't assume it. + +## Verification Checklist + +Before claiming a platform lifecycle event causes application behavior: + +1. 
Is there an explicit handler (`onSuspend`, `beforeunload`) that triggers the claimed effect? +2. Is there a keepalive mechanism preventing the lifecycle event? +3. Does relevant state persist across restarts (`chrome.storage.session`, IndexedDB)? +4. Are timers alarm-based (persist across SW restart) or `setTimeout`-based (don't)? +5. Is the guard/flag reset by the lifecycle event or by a separate application event? + +## MV3 MetaMask Specifics + +| Assumption | Reality | +|------------|---------| +| SW eviction triggers lock | No `onSuspend` lock handler — SW eviction does NOT trigger lock | +| Timers lost on SW restart | Auto-lock uses Chrome Alarms API — persists across SW restarts | +| State lost on SW restart | Wallet state persists in `chrome.storage.session` and IndexedDB | +| SW evicts frequently during active use | `background.js:750-758` calls `browser.storage.session.set` every 2s. Each `chrome.*`/`browser.*` call resets the 30s idle timer, so active-session eviction is effectively prevented. Cold starts (browser launch, extension reload) still happen. See `mv3-service-worker` knowledge for mechanism and verification discipline | + +## Common Pitfalls + +| Mistake | Correct Approach | +|---------|-----------------| +| "SW evicts N times/day → event fires N times/day" | Check if application code has handler for eviction | +| Assume frequency from platform behavior | Grep for actual handler chains in `background.js`, `app-state-controller.ts` | +| Conflate platform restart with application reset | Check which state is persisted vs re-initialized | +| "Keepalive uses `chrome.alarms`" | It does not — keepalive uses `browser.storage.session.set` at 2s cadence. 
`chrome.alarms` is used separately for auto-lock timers that must persist across SW restart | diff --git a/domains/testing/knowledge/benchmark-statistical-hygiene.md b/domains/testing/knowledge/benchmark-statistical-hygiene.md new file mode 100644 index 00000000..7015bbc5 --- /dev/null +++ b/domains/testing/knowledge/benchmark-statistical-hygiene.md @@ -0,0 +1,45 @@ +--- +name: benchmark-statistical-hygiene +domain: testing +description: Three patterns for defensible A/B benchmark results — per-round best-subset reporting, fix-vector isolation, and artifact sort-order trap. +--- + +# Benchmark Statistical Hygiene + +Three patterns that prevent the most common classes of invalid benchmark conclusions. + +## Pattern: Per-Round Best-Subset Reporting + +Later benchmark rounds accumulate system noise (background load, memory pressure, I/O contention). Pooling all rounds blindly treats noisy late-session data equally with clean early-session data. + +**Instead:** Compute per-round statistics first, then report the cleanest signal per metric with explicit round attribution. + +``` +Round 1 (clean): metric X → treatment wins, p=0.04, d=-1.7 +Round 2 (moderate): metric X → treatment wins, p=0.08, d=-0.9 +Round 3 (noisy): metric X → no effect, p=0.90, d=+0.04 + +Pooled (all): metric X → no effect, p=0.50, d=-0.2 ← signal destroyed + +Correct report: "X improved 49% (Round 1, n=5, p=0.04, d=-1.7). + Pooled n=20 loses significance due to Round 3 outliers." +``` + +A small N with large effect size (|d| > 1.5, p < 0.05) is more defensible than a large N where noise has diluted significance to nothing. + +## Pattern: Isolate the Fix Vector + +Design each benchmark flow to exercise the optimization's specific input vector as its primary signal source. Incidental coverage produces fragile results where signal-to-noise depends on how much of the measured duration is optimization-affected. 
+ +| | Weak | Strong | +|-|------|--------| +| Design | End-to-end flow that incidentally triggers target once among many other operations | Rapid sequence of actions each triggering the target with minimal other overhead | +| Optimization signal | ~5% of measured duration | ~80% of measured duration | + +## Pattern: Artifact Sort-Order Trap + +Unpadded iteration numbers in filenames break lexicographic sorting: `iteration-1, iteration-10, iteration-2, ...` interleaves data from different rounds when processed in glob order. + +**Rule:** When processing sequentially-numbered artifacts, extract the embedded timestamp or numeric value for sorting. Never rely on string sort order when numbers cross digit boundaries. + +**Diagnosis:** If pipeline results look implausible (p-values that are too perfect, round-level stats that don't match spot checks), print the actual file ordering the pipeline used. Check for lexicographic interleaving at digit boundaries. Re-sort by extracted timestamp or zero-padded key. diff --git a/domains/testing/skills/benchmark-design/skill.md b/domains/testing/skills/benchmark-design/skill.md new file mode 100644 index 00000000..578691df --- /dev/null +++ b/domains/testing/skills/benchmark-design/skill.md @@ -0,0 +1,71 @@ +--- +maturity: experimental +name: benchmark-design +description: Design, run, and analyze E2E performance benchmarks — session hygiene, per-round reporting, artifact grouping +--- + +# Benchmark Design + +## When To Use + +- Writing a new E2E benchmark flow +- Interpreting or presenting benchmark results +- Adding new metrics to existing benchmarks +- Diagnosing unexpected benchmark results + +## Do Not Use When + +- Adding unit, integration, or correctness E2E tests +- Profiling a single user-reported slowdown (use `selector-anti-pattern-review`) +- Writing micro-benchmarks outside the E2E harness + +## Workflow + +1. **Design the flow** — target ONE optimization vector per benchmark. 
Maximize ratio of optimization-affected time to total measured time. +2. **Run reference benchmarks first** in any session — session state degrades over time. +3. **Compute per-round statistics** before pooling. Check each round for stability (CV < 0.3 is a reasonable threshold). +4. **Group artifacts by timestamp**, not filename sort order. +5. **Report per-metric best subset** with explicit round attribution. Show pooled data as supplementary. + +## Flow Design by Optimization Type + +| Optimization | Primary cascade vector | Recommended flow | +|---|---|---| +| Selector memoization | State mutations | Multi-confirmation queue | +| Context memoization | Any state update | Account switching cycle | +| HOC stabilization | Route changes | Rapid route cycling (8+ transitions) | +| Dead code removal | Navigation | Return-to-home timer | + +## Session Hygiene + +System state degrades over long sessions — background load and memory pressure inflate variance and can **invert** treatment effects. + +- Run reference/critical benchmarks first +- If a late round contradicts clean earlier rounds, suspect session degradation before re-running the full suite + +## Artifact Grouping + +Filenames use `{test}-iteration-{N}-{ISO-timestamp}.json`. Unpadded N produces incorrect lexicographic sort. + +```javascript +// Extract seconds-since-midnight for round assignment +const match = filename.match(/T(\d{2})-(\d{2})-(\d{2})/); +const secondsOfDay = +match[1] * 3600 + +match[2] * 60 + +match[3]; +// Group by time range — never by filename position or array index +``` + +## Adding Metrics + +Extend `collectMetrics()` in `test/e2e/webdriver/driver.js` and register the metric key in `test/e2e/benchmarks/utils/constants.ts` → `ALL_METRICS`. + +- **Performance API metrics** (paint, navigation timing): collect directly inside `collectMetrics()` via `window.performance.getEntriesByType(...)`. 
+- **Long Task / TBT metrics**: already wired — `collectMetrics()` reads `window.stateHooks.getLongTaskMetricsWithTBT()`. Adding new long-task-derived metrics requires extending the `stateHooks` observer, not the driver. + +## Common Pitfalls + +| Mistake | Correct Approach | +|---------|-----------------| +| Pool all rounds before checking per-round stats | Per-round first — late-session noise can invert the treatment effect | +| Sort artifacts by filename | Extract ISO timestamp; sort by numeric time value | +| Benchmark flow that exercises multiple vectors | One vector per flow — mixed flows produce ambiguous signal | +| Report pooled p-value as primary result | Report cleanest per-metric signal with round attribution; pooled is supplementary |