Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,25 @@
/modules
/prompts
/.claude
.serena
IDEAS.md
node_modules/
*.log
*.jsonl
.mcp.json
CLAUDE.md
/.pos-supervisor

# Test artifacts — integration tests write to fixture .pos-supervisor dirs
**/.pos-supervisor/sessions/
**/.pos-supervisor/blobs/
**/.pos-supervisor/analytics.db
**/.pos-supervisor/analytics.db-wal
**/.pos-supervisor/analytics.db-shm

# Stray test/scratch files
/t
/2026-*.txt
.serena/project.yml
.gitignore~
.serena/project.yml
997 changes: 997 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

945 changes: 945 additions & 0 deletions SYSTEM_ARCHITECTURE.md

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@platformos/pos-supervisor",
"version": "0.5.2",
"version": "0.7.3",
"description": "platformOS domain-specific MCP server for LLM agents",
"type": "module",
"bin": {
Expand All @@ -13,7 +13,7 @@
},
"dependencies": {
"@modelcontextprotocol/sdk": "^1.29.0",
"@platformos/liquid-html-parser": "^0.0.11",
"@platformos/liquid-html-parser": "^0.0.17",
"js-yaml": "^4.1.1",
"zod": "^4.3.6"
},
Expand Down
94 changes: 94 additions & 0 deletions scripts/cleanup-live-console-rows.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env bun
/**
* One-off cleanup — remove analytics rows that originated from the dashboard
* Live Diagnostic Console before A3 introduced the `untracked` gate.
*
* Symptom those rows caused: `__pos_live_console__` files appearing in the
* `OrphanedPartial` and `pos-supervisor:MissingDocBlock` file distributions
* of the supervisor report. Every live-console validation wrote a
* `validator_emit` that the store replayed into diagnostics/outcomes.
*
* Usage:
* bun scripts/cleanup-live-console-rows.js [/path/to/project]
*
* Project path defaults to POS_SUPERVISOR_PROJECT_DIR or the current working
* directory. Runs against `.pos-supervisor/analytics.db` under that project.
*
* Safe to re-run — purely a DELETE of rows whose file column matches the
* live-console sentinel. No schema changes.
*/

import { join, resolve } from 'node:path';
import { existsSync } from 'node:fs';
import { openAnalyticsStore } from '../src/core/analytics-store.js';

// Sentinel substring used below to find rows that originated from the
// dashboard Live Diagnostic Console (matched against `file` / `payload`).
const LIVE_CONSOLE_NEEDLE = '__pos_live_console__';

/**
 * Work out which project directory the script should operate on.
 *
 * Lookup order: first CLI argument, then the POS_SUPERVISOR_PROJECT_DIR
 * environment variable, then the current working directory. The chosen value
 * is resolved to an absolute path.
 *
 * @returns {{ projectDir: string }} absolute project directory.
 */
function parseArgs() {
  const [, , cliPath] = process.argv;
  const chosen = cliPath ?? process.env.POS_SUPERVISOR_PROJECT_DIR ?? process.cwd();
  const projectDir = resolve(chosen);
  return { projectDir };
}

/**
 * Delete every analytics row that references the live-console sentinel.
 *
 * Steps:
 *   1. Locate `<project>/.pos-supervisor/analytics.db`; exit 0 if it is absent.
 *   2. Count matching rows per table (reported at the end).
 *   3. Delete the matching rows inside a single transaction, rolling back and
 *      rethrowing on any failure.
 *   4. Close the store, whether or not the deletion succeeded.
 *
 * Matching uses `LIKE ... ESCAPE '\'` with the needle's wildcard characters
 * escaped. The sentinel is made largely of underscores, and an unescaped `_`
 * in a LIKE pattern matches ANY single character, so the raw needle could
 * match (and delete) rows that do not contain the literal sentinel.
 */
function main() {
  const { projectDir } = parseArgs();
  const dbPath = join(projectDir, '.pos-supervisor', 'analytics.db');

  if (!existsSync(dbPath)) {
    console.error(`No analytics DB at ${dbPath}. Nothing to clean.`);
    process.exit(0);
  }

  // Escape `\`, `%` and `_` so the needle is matched literally.
  const pattern = `%${LIVE_CONSOLE_NEEDLE.replace(/[\\%_]/g, '\\$&')}%`;
  // The SQL text ends up as: LIKE ? ESCAPE '\'
  const LIKE_CLAUSE = `LIKE ? ESCAPE '\\'`;

  const store = openAnalyticsStore(dbPath);
  try {
    const db = store.db;
    const countRows = (sql) => db.prepare(sql).get(pattern).n;
    const deleteRows = (sql) => db.prepare(sql).run(pattern);

    const beforeCounts = {
      events: countRows(`SELECT COUNT(*) AS n FROM events WHERE payload ${LIKE_CLAUSE}`),
      diagnostics: countRows(`SELECT COUNT(*) AS n FROM diagnostics WHERE file ${LIKE_CLAUSE}`),
      outcomes: countRows(`SELECT COUNT(*) AS n FROM outcomes WHERE file ${LIKE_CLAUSE}`),
      windows: countRows(`SELECT COUNT(*) AS n FROM windows WHERE file ${LIKE_CLAUSE}`),
      proposed_fixes: countRows(
        `SELECT COUNT(*) AS n FROM proposed_fixes pf
         WHERE EXISTS (SELECT 1 FROM diagnostics d WHERE d.fp = pf.fp AND d.file ${LIKE_CLAUSE})`,
      ),
    };

    db.exec('BEGIN');
    try {
      // proposed_fixes is keyed through diagnostics.fp, so it must be cleared
      // before the diagnostics rows it is looked up by are removed.
      deleteRows(
        `DELETE FROM proposed_fixes
         WHERE fp IN (SELECT fp FROM diagnostics WHERE file ${LIKE_CLAUSE})`,
      );
      deleteRows(`DELETE FROM outcomes WHERE file ${LIKE_CLAUSE}`);
      deleteRows(`DELETE FROM windows WHERE file ${LIKE_CLAUSE}`);
      deleteRows(`DELETE FROM diagnostics WHERE file ${LIKE_CLAUSE}`);
      deleteRows(`DELETE FROM events WHERE payload ${LIKE_CLAUSE}`);

      db.exec('COMMIT');
    } catch (e) {
      db.exec('ROLLBACK');
      console.error('Cleanup failed; rolled back.');
      throw e;
    }

    console.log(`Removed live-console rows from ${dbPath}:`);
    for (const [table, count] of Object.entries(beforeCounts)) {
      console.log(`  ${table.padEnd(16)} ${count}`);
    }
  } finally {
    // Release the DB handle on the failure path as well.
    store.close();
  }
}

main();
57 changes: 57 additions & 0 deletions scripts/rebuild-analytics.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env node
/**
* Rebuild the analytics DB from session event logs.
*
* Usage:
* node scripts/rebuild-analytics.js /path/to/project
*
* The project must have a .pos-supervisor/ directory with sessions/ and analytics.db.
* The server must NOT be running when this script executes (WAL mode allows reads
* but schema migrations can conflict with a live server).
*/

import { join } from 'node:path';
import { existsSync } from 'node:fs';
import { openAnalyticsStore } from '../src/core/analytics-store.js';
import { openBlobStore } from '../src/core/blob-store.js';

// The project directory is a required positional argument.
const projectDir = process.argv[2];
if (!projectDir) {
  console.error('Usage: node scripts/rebuild-analytics.js /path/to/project');
  process.exit(1);
}

// All supervisor state lives under <project>/.pos-supervisor.
const supervisorDir = join(projectDir, '.pos-supervisor');
const dbPath = join(supervisorDir, 'analytics.db');
const sessionsDir = join(supervisorDir, 'sessions');
const blobsDir = join(supervisorDir, 'blobs');

// Fail fast when there is nothing to replay from.
if (!existsSync(supervisorDir)) {
  console.error(`No .pos-supervisor directory found at: ${supervisorDir}`);
  process.exit(1);
}
if (!existsSync(sessionsDir)) {
  console.error(`No sessions directory found at: ${sessionsDir}`);
  process.exit(1);
}

console.log(`DB:       ${dbPath}`);
console.log(`Sessions: ${sessionsDir}`);
console.log(`Blobs:    ${blobsDir}`);
console.log('Rebuilding...');

// Blob store is required for fix-adoption classification (reads start/end file
// snapshots and proposed-fix texts). Without it, every outcome row lands with
// fix_applied = null. Fine if the blobs dir doesn't exist yet — classification
// just degrades to null for that session.
let blobStore = null;
try {
  blobStore = openBlobStore(blobsDir);
} catch (e) {
  console.warn(`Blob store unavailable (${e.message}); fix adoption will not be classified.`);
}

const store = openAnalyticsStore(dbPath, { blobStore });
try {
  const { sessions, events } = store.rebuild(sessionsDir);
  console.log(`Done. Replayed ${events} events across ${sessions} sessions.`);
} finally {
  // Release the DB handle even when the rebuild throws.
  store.close();
}
126 changes: 126 additions & 0 deletions src/core/analytics-labels.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/**
* Analytics labels — single source of truth for the GOOD / OK / LOW / HARMFUL,
* AT RISK / UNMATCHED, and INSUFFICIENT_DATA presentation-layer labels.
*
* Pure functions, intentionally side-effect-free. The HTTP layer attaches
* `.label` to each scorecard / rule-performance row before serialising; the
* dashboard browser code and Markdown report consume that field directly so
* label logic isn't duplicated (or drifted) between server and client.
*
* INSUFFICIENT_DATA gate (`LABEL_MIN_OUTCOMES`) is the load-bearing change.
* Labels computed from a sample of one — `AT RISK -100%` on a single
* regression — are statistically meaningless and previously caused operators
* to chase ghosts of already-fixed rules. Below the threshold we return a
* neutral label that says "we don't know yet" instead of a confident wrong
* answer.
*
* The threshold is conservative on purpose: 5 outcomes lets a Beta(2,2)
* posterior collapse from "wide ribbon" to a meaningful interval. Engine-side
* decisions (auto-disable in case-base.ruleScores) use a stricter gate of 10
* because promotion/demotion is more consequential than display.
*/

/** Minimum outcome count below which the label functions in this module return INSUFFICIENT_DATA. */
export const LABEL_MIN_OUTCOMES = 5;

/**
 * Collapse a rate value into a plain number.
 *
 * - a number is returned unchanged (no clamping is applied);
 * - an object with a numeric `.mean` (e.g. a Beta posterior) yields that mean;
 * - anything else yields 0.
 *
 * Kept behaviourally aligned with the dashboard `rateVal()` helper so server
 * and browser produce the same labels.
 */
function asRate(r) {
  switch (typeof r) {
    case 'number':
      return r;
    case 'object':
      // `typeof null` is 'object', so null must be excluded explicitly.
      return r !== null && typeof r.mean === 'number' ? r.mean : 0;
    default:
      return 0;
  }
}

/**
 * Per-check scorecard label.
 *
 * Accepts a row from `checkScorecards()` carrying `.resolution_rate`,
 * `.mislead_rate`, and either `.sample_size` (preferred) or `.total_outcomes`.
 * Each rate may be a Beta posterior `{ mean, lower95, upper95 }` or a number.
 *
 * Effectiveness is `resolution_rate - mislead_rate`.
 *
 * Returns one of:
 *   - INSUFFICIENT_DATA — row is not an object, has fewer than
 *                         LABEL_MIN_OUTCOMES outcomes, or its effectiveness
 *                         is not a finite number
 *   - GOOD              — effectiveness > 0.5
 *   - OK                — 0.15 < effectiveness <= 0.5
 *   - LOW               — 0 <= effectiveness <= 0.15
 *   - HARMFUL           — effectiveness < 0
 *
 * @param {object} card scorecard row.
 * @returns {string} the label.
 */
export function checkLabel(card) {
  if (!card || typeof card !== 'object') return 'INSUFFICIENT_DATA';
  const sampleSize = Number(card.sample_size ?? card.total_outcomes ?? 0);
  if (!Number.isFinite(sampleSize) || sampleSize < LABEL_MIN_OUTCOMES) {
    return 'INSUFFICIENT_DATA';
  }
  const effectiveness = asRate(card.resolution_rate) - asRate(card.mislead_rate);
  // A NaN/Infinity rate would fail every comparison below and fall through to
  // HARMFUL. Treat it as "unknown", matching the guard in ruleLabel.
  if (!Number.isFinite(effectiveness)) return 'INSUFFICIENT_DATA';
  if (effectiveness > 0.5) return 'GOOD';
  if (effectiveness > 0.15) return 'OK';
  if (effectiveness >= 0) return 'LOW';
  return 'HARMFUL';
}

/**
 * Per-rule_id performance label.
 *
 * Takes a row from `rulePerformance()` / `ruleScores()` and reads its
 * `.unmatched`, `.total_outcomes` and `.effectiveness` fields.
 *
 * Decision order:
 *   1. UNMATCHED         — `.unmatched` is truthy. Checked before the sample
 *                          gate: a coverage gap is actionable regardless of
 *                          sample size.
 *   2. INSUFFICIENT_DATA — row is not an object, `total_outcomes` is below
 *                          LABEL_MIN_OUTCOMES (or not finite), or
 *                          `effectiveness` is not finite.
 *   3. AT RISK           — effectiveness < 0.15.
 *   4. OK                — everything else.
 *
 * `effectiveness` here is `resolution_rate - regression_rate`, so it can be
 * negative when a rule causes more regressions than it resolves.
 */
export function ruleLabel(rule) {
  const isRow = Boolean(rule) && typeof rule === 'object';
  if (!isRow) return 'INSUFFICIENT_DATA';

  if (rule.unmatched) return 'UNMATCHED';

  const outcomeCount = Number(rule.total_outcomes ?? 0);
  const hasEnoughData = Number.isFinite(outcomeCount) && outcomeCount >= LABEL_MIN_OUTCOMES;
  if (!hasEnoughData) return 'INSUFFICIENT_DATA';

  const score = Number(rule.effectiveness ?? 0);
  if (!Number.isFinite(score)) return 'INSUFFICIENT_DATA';

  return score < 0.15 ? 'AT RISK' : 'OK';
}

/**
 * Pick out the scorecards whose label is HARMFUL, for the headline of the
 * Markdown report's executive summary.
 *
 * Labelling is delegated to `checkLabel`, so its sample-size gate applies
 * here too: a check is not headlined as HARMFUL off a single regression.
 *
 * @param {Array<object>} scorecards scorecard rows; a non-array yields [].
 * @returns {Array<object>} the original row objects labelled HARMFUL.
 */
export function harmfulSummary(scorecards) {
  const isHarmful = (card) => checkLabel(card) === 'HARMFUL';
  return Array.isArray(scorecards) ? scorecards.filter(isHarmful) : [];
}

/**
 * Produce labelled copies of scorecard rows.
 *
 * Each row is shallow-copied and given a `.label` computed by `checkLabel`,
 * so the input array and its row objects are left untouched. A non-array
 * input yields an empty array.
 *
 * @param {Array<object>} scorecards scorecard rows.
 * @returns {Array<object>} new array of `{ ...row, label }` objects.
 */
export function withCheckLabels(scorecards) {
  if (!Array.isArray(scorecards)) return [];
  return scorecards.map((card) => {
    const labelled = { ...card };
    labelled.label = checkLabel(card);
    return labelled;
  });
}

/**
 * Produce labelled copies of rule-performance / rule-score rows.
 *
 * Each row is shallow-copied and given a `.label` computed by `ruleLabel`;
 * the input array and its rows are not mutated. A non-array input yields an
 * empty array.
 *
 * @param {Array<object>} rules rule rows.
 * @returns {Array<object>} new array of `{ ...row, label }` objects.
 */
export function withRuleLabels(rules) {
  if (!Array.isArray(rules)) return [];
  return rules.map((rule) => {
    const labelled = { ...rule };
    labelled.label = ruleLabel(rule);
    return labelled;
  });
}
Loading