diff --git a/CHANGELOG.md b/CHANGELOG.md index 95d22e0..1c06794 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to Supertag CLI are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed +- **Delta-sync could wedge indefinitely on a single bad node (HTTP 500)** — When Tana's Local API `/nodes/search` returns HTTP 500 while serializing one node in the changed set (a Tana-side serializer bug), the whole page failed. Because the watermark only advanced on a fully successful sync, the same poisoned request was retried every cycle and never recovered — observed stuck 8+ days. `DeltaSyncService` now isolates the offending node via offset/limit bisection, skips just that node (`poisonNodesSkipped` in the result), and advances the watermark so sync keeps making progress. A subsequent full `supertag sync index` re-captures any skipped node from the export. 500 is treated as a skippable poison node; 400/401/404/network errors still propagate as real failures. + +### Added +- **Delta-sync failure-streak escalation** — `DeltaSyncPoller` now tracks consecutive failed cycles and emits a loud, actionable warning ("failed N cycles in a row… run `supertag sync index`") once the streak crosses a threshold, instead of logging an identical per-cycle error that's easy to miss. Exposed via `getFailureState()`. Catches the "silently wedged for days" failure mode. + ## [2.5.7] - 2026-04-17 ### Fixed diff --git a/src/mcp/delta-sync-poller.ts b/src/mcp/delta-sync-poller.ts index 3390d8e..54d2881 100644 --- a/src/mcp/delta-sync-poller.ts +++ b/src/mcp/delta-sync-poller.ts @@ -80,6 +80,15 @@ export class DeltaSyncPoller { private wasHealthy = true; private lastResult: DeltaSyncResult | null = null; private tickCount = 0; + private consecutiveFailures = 0; + private lastErrorMessage: string | null = null; + + /** + * After this many consecutive failures, escalate from a per-cycle error log + * to a loud, actionable warning. Catches the "silently wedged for days" + * failure mode (e.g. Tana Local API 500 on every cycle). + */ + private static readonly FAILURE_ALERT_THRESHOLD = 3; constructor(private options: DeltaSyncPollerOptions) { this.service = new DeltaSyncService({ @@ -125,6 +134,8 @@ export class DeltaSyncPoller { async triggerNow(): Promise { const result = await this.service.sync(); this.lastResult = result; + this.consecutiveFailures = 0; + this.lastErrorMessage = null; return result; } @@ -188,6 +199,19 @@ export class DeltaSyncPoller { const result = await this.service.sync(); this.lastResult = result; + // A completed cycle (even one that only skipped poison nodes) clears the + // failure streak — sync is unwedged and making progress again. + this.consecutiveFailures = 0; + this.lastErrorMessage = null; + + if (result.poisonNodesSkipped > 0) { + this.options.logger?.warn( + "Delta-sync skipped node(s) that crash Tana Local API search (HTTP 500). " + + "Run 'supertag sync index' for a full re-sync to capture them.", + { poisonNodesSkipped: result.poisonNodesSkipped } + ); + } + if (result.nodesFound > 0) { this.options.logger?.info("Delta-sync cycle complete", { nodesFound: result.nodesFound, @@ -197,12 +221,38 @@ export class DeltaSyncPoller { }); } } catch (error) { + this.consecutiveFailures++; + this.lastErrorMessage = String(error); this.options.logger?.error("Delta-sync cycle failed", { error: String(error), + consecutiveFailures: this.consecutiveFailures, }); + + // Escalate once the streak crosses the threshold so a persistent wedge + // (e.g. Tana 500 on every cycle) can't run unnoticed for days. + if (this.consecutiveFailures >= DeltaSyncPoller.FAILURE_ALERT_THRESHOLD) { + this.options.logger?.error( + `Delta-sync has failed ${this.consecutiveFailures} cycles in a row. ` + + "Tana's Local API may be rejecting the request (e.g. HTTP 500). " + + "Run 'supertag sync index' for a full re-sync to recover.", + { consecutiveFailures: this.consecutiveFailures } + ); + } // Never crash - just log and continue polling } } + + /** + * Failure-streak state for status reporting. `consecutiveFailures` resets to + * 0 on any completed cycle; a non-zero value means delta-sync is currently + * wedged and `lastErrorMessage` holds the most recent failure. + */ + getFailureState(): { consecutiveFailures: number; lastErrorMessage: string | null } { + return { + consecutiveFailures: this.consecutiveFailures, + lastErrorMessage: this.lastErrorMessage, + }; + } } // ============================================================================= diff --git a/src/mcp/tools/__tests__/sync-delta.test.ts b/src/mcp/tools/__tests__/sync-delta.test.ts index 1f322df..b8e3b69 100644 --- a/src/mcp/tools/__tests__/sync-delta.test.ts +++ b/src/mcp/tools/__tests__/sync-delta.test.ts @@ -78,6 +78,7 @@ describe("MCP tana_sync delta mode (T-3.3)", () => { watermarkAfter: 2000000, durationMs: 2340, pages: 3, + poisonNodesSkipped: 0, }, }; diff --git a/src/services/delta-sync.ts b/src/services/delta-sync.ts index 62407ad..1f66dee 100644 --- a/src/services/delta-sync.ts +++ b/src/services/delta-sync.ts @@ -21,6 +21,38 @@ import type { /** Page size for API pagination */ const PAGE_SIZE = 100; +/** + * Hard cap on outer pagination iterations. Guards against an infinite loop if + * the API keeps returning full pages without ever emptying (or pathological + * poison-skip crawling). 100k iterations * PAGE_SIZE covers any realistic graph. + */ +const MAX_PAGE_ITERATIONS = 100_000; + +/** + * Detect a Tana Local API HTTP 500 "poison page" error. + * + * The Local API occasionally 500s while serializing the response for a + * specific changed node (a known class of Tana-side serializer bug). Because + * `edited.since` returns the whole changed set, a single such node poisons the + * entire page — and since the delta watermark only advances on a fully + * successful sync, the same poisoned request is retried forever, wedging sync + * indefinitely (observed: stuck 8+ days). + * + * We detect 500 specifically (not 502/503/504, which the client already retries + * as transient, nor 400/401/404, which are real client-side errors that must + * propagate). Shape-tolerant so it works with both the real LocalApiClient + * (StructuredError with `details.status`) and test doubles that throw plain + * Errors with "HTTP 500" in the message. + */ +function isPoisonPageError(error: unknown): boolean { + if (error && typeof error === "object") { + const status = (error as { details?: { status?: unknown } }).details?.status; + if (status === 500) return true; + } + if (error instanceof Error && /\bHTTP 500\b/.test(error.message)) return true; + return false; +} + /** * DeltaSyncService handles incremental sync of Tana nodes * from the local API into the SQLite database. @@ -251,27 +283,102 @@ export class DeltaSyncService { * no-op after its first run. We keep watermarks in ms internally (for * backward-compat with existing databases) and convert at this boundary. */ - async *fetchChangedNodes(sinceMs: number): AsyncGenerator { + async *fetchChangedNodes( + sinceMs: number, + stats: { poisonNodesSkipped: number } = { poisonNodesSkipped: 0 } + ): AsyncGenerator { // Convert ms → seconds. Floor so we don't skip edits that happened // within the sub-second of the previous watermark. Clamp to min 1 // because the API rejects `since=0` with a validation error. const sinceSec = Math.max(1, Math.floor(sinceMs / 1000)); let offset = 0; + let iterations = 0; while (true) { - const page = await this.localApiClient.searchNodes( - { edited: { since: sinceSec } }, - { limit: PAGE_SIZE, offset } - ); + if (++iterations > MAX_PAGE_ITERATIONS) { + this.logger.error( + "Delta-sync pagination exceeded safety limit; stopping", + { offset, iterations } + ); + break; + } + + const result = await this.fetchPageResilient(sinceSec, offset, PAGE_SIZE, stats); + + // null = the single node at `offset` crashes Tana's serializer (HTTP 500). + // Skip exactly that node and keep going so it can't wedge the whole sync. + if (result === null) { + offset += 1; + continue; + } - if (page.length === 0) break; + const { nodes, reduced } = result; - yield page; + if (nodes.length === 0) break; - if (page.length < PAGE_SIZE) break; + yield nodes; - offset += PAGE_SIZE; + offset += nodes.length; + + // Normal end-of-results short-circuit: a full-size request that returned a + // partial page means we've reached the end. We must NOT trust a short page + // when it was produced by bisection (reduced=true) — more nodes may follow. + if (!reduced && nodes.length < PAGE_SIZE) break; + } + } + + /** + * Fetch one page, isolating poison nodes via offset/limit bisection. + * + * On HTTP 500, the page contains a node Tana's search serializer crashes on. + * We can't tell which from a failed request, so we halve the window and retry + * at the same offset, recursively narrowing until either the window succeeds + * (the poison node is past it) or we reach limit=1 — at which point the single + * node at `offset` IS the poison node, so we skip it (return null) and let the + * caller advance past it. This keeps the watermark moving instead of retrying + * the same poisoned request forever. + * + * Non-500 errors (400 validation, 401 auth, network) are real failures and + * propagate unchanged — only 500 is treated as a skippable poison node. + * + * @returns `{ nodes, reduced }` where `reduced` is true if the window was + * narrowed below PAGE_SIZE; or `null` to signal "skip one node". + */ + private async fetchPageResilient( + sinceSec: number, + offset: number, + limit: number, + stats: { poisonNodesSkipped: number } + ): Promise<{ nodes: SearchResultNode[]; reduced: boolean } | null> { + try { + const nodes = await this.localApiClient.searchNodes( + { edited: { since: sinceSec } }, + { limit, offset } + ); + return { nodes, reduced: limit < PAGE_SIZE }; + } catch (error) { + // Real client-side error (400/401/404/network) — propagate, do not skip. + if (!isPoisonPageError(error)) throw error; + + if (limit <= 1) { + // The single node at this offset crashes Tana's search serializer. + // Skip it; a full `sync index` will re-capture it from the export. + this.logger.warn( + "Skipping a node that crashes Tana Local API search (HTTP 500). " + + "Run 'supertag sync index' for a full re-sync to capture it.", + { offset, sinceSec } + ); + stats.poisonNodesSkipped++; + return null; + } + + const half = Math.max(1, Math.floor(limit / 2)); + this.logger.warn( + "Delta-sync page returned HTTP 500; bisecting to isolate the bad node", + { offset, limit, retryLimit: half } + ); + return this.fetchPageResilient(sinceSec, offset, half, stats); } } @@ -304,6 +411,7 @@ export class DeltaSyncService { watermarkAfter: 0, durationMs: 0, pages: 0, + poisonNodesSkipped: 0, }; } @@ -333,8 +441,9 @@ export class DeltaSyncService { let fieldValuesCleared = 0; let pages = 0; const changedNodeIds: string[] = []; + const stats = { poisonNodesSkipped: 0 }; - for await (const page of this.fetchChangedNodes(sinceMs)) { + for await (const page of this.fetchChangedNodes(sinceMs, stats)) { pages++; nodesFound += page.length; @@ -363,9 +472,13 @@ export class DeltaSyncService { embeddingsGenerated = 0; } - // Step 5: Update watermark + // Step 5: Update watermark. + // Advance when we found nodes OR skipped poison nodes — skipping past a + // poison node is real progress, and advancing the watermark stops the + // next cycle from re-requesting (and re-skipping) the same bad node. const watermarkAfter = Date.now(); - if (nodesFound > 0) { + const madeProgress = nodesFound > 0 || stats.poisonNodesSkipped > 0; + if (madeProgress) { this.updateWatermark(watermarkAfter, nodesFound); } @@ -381,9 +494,10 @@ export class DeltaSyncService { embeddingsGenerated, embeddingsSkipped, watermarkBefore: sinceMs, - watermarkAfter: nodesFound > 0 ? watermarkAfter : sinceMs, + watermarkAfter: madeProgress ? watermarkAfter : sinceMs, durationMs, pages, + poisonNodesSkipped: stats.poisonNodesSkipped, }; } finally { // T-2.3: Always release lock diff --git a/src/types/local-api.ts b/src/types/local-api.ts index bcbaa35..413ebe1 100644 --- a/src/types/local-api.ts +++ b/src/types/local-api.ts @@ -390,6 +390,13 @@ export interface DeltaSyncResult { durationMs: number; /** Number of API pages fetched */ pages: number; + /** + * Number of nodes skipped because Tana's Local API returned HTTP 500 while + * serializing them (v2.6 poison-node isolation). A subsequent full + * `sync index` re-captures them from the export. >0 means delta sync made + * progress past a node that would otherwise have wedged it indefinitely. + */ + poisonNodesSkipped: number; } /** diff --git a/tests/unit/delta-sync-pagination.test.ts b/tests/unit/delta-sync-pagination.test.ts index 0e9c6d8..08ea61c 100644 --- a/tests/unit/delta-sync-pagination.test.ts +++ b/tests/unit/delta-sync-pagination.test.ts @@ -184,6 +184,115 @@ describe("DeltaSyncService - Pagination + Sync Orchestration (T-2.2)", () => { expect(q.edited.since).toBe(1); }); + it("isolates and skips a single poison node that 500s (v2.6)", async () => { + // Simulate Tana serializing fine for every node EXCEPT the one at + // absolute offset 1. Any request whose [offset, offset+limit) window + // includes offset 1 returns HTTP 500; otherwise it returns real nodes. + const POISON_OFFSET = 1; + const TOTAL = 3; + const calls: Array<{ offset: number; limit: number }> = []; + + service = new DeltaSyncService({ + dbPath, + localApiClient: { + searchNodes: async (_query, options) => { + const offset = options?.offset ?? 0; + const limit = options?.limit ?? 100; + calls.push({ offset, limit }); + const windowEnd = offset + limit; + if (POISON_OFFSET >= offset && POISON_OFFSET < windowEnd) { + throw Object.assign(new Error("Local API returned HTTP 500"), { + details: { status: 500 }, + }); + } + // Return nodes for [offset, min(windowEnd, TOTAL)) excluding poison. + const out: SearchResultNode[] = []; + for (let i = offset; i < Math.min(windowEnd, TOTAL); i++) { + if (i === POISON_OFFSET) continue; + out.push(createTestNode(`n-${i}`, `Node ${i}`)); + } + return out; + }, + health: async () => true, + }, + }); + + const stats = { poisonNodesSkipped: 0 }; + const collected: SearchResultNode[] = []; + for await (const page of service.fetchChangedNodes(1000, stats)) { + collected.push(...page); + } + + // Poison node skipped exactly once; the good nodes still come through. + expect(stats.poisonNodesSkipped).toBe(1); + const ids = collected.map((n) => n.id).sort(); + expect(ids).toEqual(["n-0", "n-2"]); + }); + + it("propagates non-500 errors instead of skipping (v2.6)", async () => { + // A 400/auth/network error is a real failure and must NOT be silently + // skipped as a poison node. + service = new DeltaSyncService({ + dbPath, + localApiClient: { + searchNodes: async () => { + throw Object.assign(new Error("Local API returned HTTP 400"), { + details: { status: 400 }, + }); + }, + health: async () => true, + }, + }); + + const iterate = async () => { + for await (const _page of service.fetchChangedNodes(1000)) { + // drain + } + }; + await expect(iterate()).rejects.toThrow("HTTP 400"); + }); + + it("advances watermark past a poison node so sync un-wedges (v2.6)", async () => { + // Every cycle's changed-set contains one poison node at offset 0. + // Pre-v2.6 this 500'd the whole sync, watermark never moved, and the + // next cycle re-requested the same poison forever. Now sync skips it and + // advances the watermark. + service = new DeltaSyncService({ + dbPath, + localApiClient: { + searchNodes: async (_query, options) => { + const offset = options?.offset ?? 0; + const limit = options?.limit ?? 100; + // Poison is the only node, at offset 0. + if (offset === 0 && limit >= 1) { + throw Object.assign(new Error("Local API returned HTTP 500"), { + details: { status: 500 }, + }); + } + return []; + }, + health: async () => true, + }, + }); + + const before = Date.now(); + const result = await service.sync(); + + expect(result.poisonNodesSkipped).toBe(1); + expect(result.nodesFound).toBe(0); + // Watermark must advance even with zero nodes found, because progress + // was made past the poison node. + expect(result.watermarkAfter).toBeGreaterThanOrEqual(before); + expect(result.watermarkAfter).toBeGreaterThan(result.watermarkBefore); + + const checkDb = new Database(dbPath, { readonly: true }); + const row = checkDb + .query("SELECT delta_sync_timestamp FROM sync_metadata WHERE id = 1") + .get() as { delta_sync_timestamp: number }; + expect(row.delta_sync_timestamp).toBeGreaterThanOrEqual(before); + checkDb.close(); + }); + it("handles single partial page", async () => { const nodes = [createTestNode("n-1", "Node 1"), createTestNode("n-2", "Node 2")];