Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ All notable changes to Supertag CLI are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed
- **Delta-sync could wedge indefinitely on a single bad node (HTTP 500)** — When Tana's Local API `/nodes/search` returns HTTP 500 while serializing one node in the changed set (a Tana-side serializer bug), the whole page failed. Because the watermark only advanced on a fully successful sync, the same poisoned request was retried every cycle and never recovered — observed stuck 8+ days. `DeltaSyncService` now isolates the offending node via offset/limit bisection, skips just that node (`poisonNodesSkipped` in the result), and advances the watermark so sync keeps making progress. A subsequent full `supertag sync index` re-captures any skipped node from the export. 500 is treated as a skippable poison node; 400/401/404/network errors still propagate as real failures.

### Added
- **Delta-sync failure-streak escalation** — `DeltaSyncPoller` now tracks consecutive failed cycles and, once the streak reaches a threshold (3 cycles), logs a loud, actionable error-level message ("failed N cycles in a row… run `supertag sync index`") on every further failed cycle, in addition to the per-cycle error log, which on its own is easy to miss. Exposed via `getFailureState()`. Catches the "silently wedged for days" failure mode.

## [2.5.7] - 2026-04-17

### Fixed
Expand Down
50 changes: 50 additions & 0 deletions src/mcp/delta-sync-poller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ export class DeltaSyncPoller {
private wasHealthy = true;
private lastResult: DeltaSyncResult | null = null;
private tickCount = 0;
private consecutiveFailures = 0;
private lastErrorMessage: string | null = null;

/**
* After this many consecutive failures, escalate from a per-cycle error log
* to a loud, actionable warning. Catches the "silently wedged for days"
* failure mode (e.g. Tana Local API 500 on every cycle).
*/
private static readonly FAILURE_ALERT_THRESHOLD = 3;

constructor(private options: DeltaSyncPollerOptions) {
this.service = new DeltaSyncService({
Expand Down Expand Up @@ -125,6 +134,8 @@ export class DeltaSyncPoller {
/**
 * Run one delta-sync immediately and return its result.
 *
 * On success the result is stored as the latest result and the failure
 * streak is cleared. If the sync rejects, the rejection propagates to the
 * caller and none of the poller's state is touched.
 */
async triggerNow(): Promise<DeltaSyncResult> {
  const syncResult = await this.service.sync();

  // Only reached when the sync completed: clear the streak, keep the result.
  this.lastErrorMessage = null;
  this.consecutiveFailures = 0;
  this.lastResult = syncResult;

  return syncResult;
}

Expand Down Expand Up @@ -188,6 +199,19 @@ export class DeltaSyncPoller {
const result = await this.service.sync();
this.lastResult = result;

// A completed cycle (even one that only skipped poison nodes) clears the
// failure streak — sync is unwedged and making progress again.
this.consecutiveFailures = 0;
this.lastErrorMessage = null;

if (result.poisonNodesSkipped > 0) {
this.options.logger?.warn(
"Delta-sync skipped node(s) that crash Tana Local API search (HTTP 500). " +
"Run 'supertag sync index' for a full re-sync to capture them.",
{ poisonNodesSkipped: result.poisonNodesSkipped }
);
}

if (result.nodesFound > 0) {
this.options.logger?.info("Delta-sync cycle complete", {
nodesFound: result.nodesFound,
Expand All @@ -197,12 +221,38 @@ export class DeltaSyncPoller {
});
}
} catch (error) {
this.consecutiveFailures++;
this.lastErrorMessage = String(error);
this.options.logger?.error("Delta-sync cycle failed", {
error: String(error),
consecutiveFailures: this.consecutiveFailures,
});

// Escalate once the streak crosses the threshold so a persistent wedge
// (e.g. Tana 500 on every cycle) can't run unnoticed for days.
if (this.consecutiveFailures >= DeltaSyncPoller.FAILURE_ALERT_THRESHOLD) {
this.options.logger?.error(
`Delta-sync has failed ${this.consecutiveFailures} cycles in a row. ` +
"Tana's Local API may be rejecting the request (e.g. HTTP 500). " +
"Run 'supertag sync index' for a full re-sync to recover.",
{ consecutiveFailures: this.consecutiveFailures }
);
}
// Never crash - just log and continue polling
}
}

/**
 * Snapshot of the failure streak, intended for status reporting.
 *
 * `consecutiveFailures` is set back to 0 whenever a sync completes, so a
 * non-zero value means the most recent cycle(s) failed; `lastErrorMessage`
 * then carries the stringified error of the latest failure (null otherwise).
 */
getFailureState(): { consecutiveFailures: number; lastErrorMessage: string | null } {
  const { consecutiveFailures, lastErrorMessage } = this;
  return { consecutiveFailures, lastErrorMessage };
}
}

// =============================================================================
Expand Down
1 change: 1 addition & 0 deletions src/mcp/tools/__tests__/sync-delta.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ describe("MCP tana_sync delta mode (T-3.3)", () => {
watermarkAfter: 2000000,
durationMs: 2340,
pages: 3,
poisonNodesSkipped: 0,
},
};

Expand Down
140 changes: 127 additions & 13 deletions src/services/delta-sync.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,38 @@ import type {
/** Page size for API pagination */
const PAGE_SIZE = 100;

/**
* Hard cap on outer pagination iterations. Guards against an infinite loop if
* the API keeps returning full pages without ever emptying (or pathological
* poison-skip crawling). 100k iterations * PAGE_SIZE covers any realistic graph.
*/
const MAX_PAGE_ITERATIONS = 100_000;

/**
* Detect a Tana Local API HTTP 500 "poison page" error.
*
* The Local API occasionally 500s while serializing the response for a
* specific changed node (a known class of Tana-side serializer bug). Because
* `edited.since` returns the whole changed set, a single such node poisons the
* entire page — and since the delta watermark only advances on a fully
* successful sync, the same poisoned request is retried forever, wedging sync
* indefinitely (observed: stuck 8+ days).
*
* We detect 500 specifically (not 502/503/504, which the client already retries
* as transient, nor 400/401/404, which are real client-side errors that must
* propagate). Shape-tolerant so it works with both the real LocalApiClient
* (StructuredError with `details.status`) and test doubles that throw plain
* Errors with "HTTP 500" in the message.
*/
function isPoisonPageError(error: unknown): boolean {
  // Structured shape: any non-null object exposing `details.status === 500`.
  const structuredStatus =
    typeof error === "object" && error !== null
      ? (error as { details?: { status?: unknown } }).details?.status
      : undefined;
  if (structuredStatus === 500) {
    return true;
  }

  // Plain-Error shape: the message mentions "HTTP 500" as a whole token
  // (the word boundaries keep e.g. "HTTP 5000" from matching).
  return error instanceof Error && /\bHTTP 500\b/.test(error.message);
}

/**
* DeltaSyncService handles incremental sync of Tana nodes
* from the local API into the SQLite database.
Expand Down Expand Up @@ -251,27 +283,102 @@ export class DeltaSyncService {
* no-op after its first run. We keep watermarks in ms internally (for
* backward-compat with existing databases) and convert at this boundary.
*/
async *fetchChangedNodes(sinceMs: number): AsyncGenerator<SearchResultNode[]> {
async *fetchChangedNodes(
sinceMs: number,
stats: { poisonNodesSkipped: number } = { poisonNodesSkipped: 0 }
): AsyncGenerator<SearchResultNode[]> {
// Convert ms → seconds. Floor so we don't skip edits that happened
// within the sub-second of the previous watermark. Clamp to min 1
// because the API rejects `since=0` with a validation error.
const sinceSec = Math.max(1, Math.floor(sinceMs / 1000));

let offset = 0;
let iterations = 0;

while (true) {
const page = await this.localApiClient.searchNodes(
{ edited: { since: sinceSec } },
{ limit: PAGE_SIZE, offset }
);
if (++iterations > MAX_PAGE_ITERATIONS) {
this.logger.error(
"Delta-sync pagination exceeded safety limit; stopping",
{ offset, iterations }
);
break;
}

const result = await this.fetchPageResilient(sinceSec, offset, PAGE_SIZE, stats);

// null = the single node at `offset` crashes Tana's serializer (HTTP 500).
// Skip exactly that node and keep going so it can't wedge the whole sync.
if (result === null) {
offset += 1;
continue;
}

if (page.length === 0) break;
const { nodes, reduced } = result;

yield page;
if (nodes.length === 0) break;

if (page.length < PAGE_SIZE) break;
yield nodes;

offset += PAGE_SIZE;
offset += nodes.length;

// Normal end-of-results short-circuit: a full-size request that returned a
// partial page means we've reached the end. We must NOT trust a short page
// when it was produced by bisection (reduced=true) — more nodes may follow.
if (!reduced && nodes.length < PAGE_SIZE) break;
}
}

/**
 * Fetch one page, isolating poison nodes via offset/limit bisection.
 *
 * On HTTP 500 the requested window contains a node the search endpoint
 * cannot serialize. A failed request does not say which node, so the window
 * is halved and retried at the same offset until either a request succeeds
 * (the poison node lies past the narrowed window) or the window is a single
 * node. In the latter case the node at `offset` is the poison node: it is
 * counted in `stats.poisonNodesSkipped` and `null` is returned so the caller
 * can step over it.
 *
 * Skipping is bounded. If the endpoint answers 500 for *every* request (a
 * systemic failure rather than one bad node), unbounded skipping would crawl
 * offset by offset, report a "successful" sync and let the watermark advance
 * past edits that were never fetched. Once the per-sync skip budget is used
 * up, the 500 is rethrown so the cycle fails visibly instead.
 *
 * Errors that are not recognized as a poison-page 500 (validation, auth,
 * network, ...) are rethrown unchanged.
 *
 * @param sinceSec - `edited.since` filter, in seconds.
 * @param offset - Offset of the first node in the requested window.
 * @param limit - Initial window size; narrowed on each 500.
 * @param stats - Mutable counters shared with the caller; `poisonNodesSkipped`
 *   is incremented for every skipped node.
 * @returns `{ nodes, reduced }` where `reduced` is true if the request that
 *   succeeded used a window smaller than PAGE_SIZE; or `null` to signal
 *   "skip the single node at `offset`".
 * @throws The original error for non-500 failures, or the 500 itself once
 *   the skip budget is exhausted.
 */
private async fetchPageResilient(
  sinceSec: number,
  offset: number,
  limit: number,
  stats: { poisonNodesSkipped: number }
): Promise<{ nodes: SearchResultNode[]; reduced: boolean } | null> {
  // Per-sync skip budget. A handful of poison nodes is plausible; a hundred
  // in one delta means the API itself is failing, not individual nodes.
  const maxPoisonSkips = 100;

  let windowSize = limit;

  for (;;) {
    try {
      const nodes = await this.localApiClient.searchNodes(
        { edited: { since: sinceSec } },
        { limit: windowSize, offset }
      );
      return { nodes, reduced: windowSize < PAGE_SIZE };
    } catch (error) {
      // Real failure (400/401/404/network) — propagate, do not skip.
      if (!isPoisonPageError(error)) throw error;

      if (windowSize <= 1) {
        if (stats.poisonNodesSkipped >= maxPoisonSkips) {
          // Too many "poison" nodes: treat as an API outage so the failure
          // surfaces and the watermark is not advanced over unfetched edits.
          this.logger.error(
            "Delta-sync skipped too many nodes on HTTP 500; treating as an API failure",
            { offset, sinceSec, poisonNodesSkipped: stats.poisonNodesSkipped }
          );
          throw error;
        }

        // The single node at this offset crashes Tana's search serializer.
        // Skip it; a full `sync index` will re-capture it from the export.
        this.logger.warn(
          "Skipping a node that crashes Tana Local API search (HTTP 500). " +
            "Run 'supertag sync index' for a full re-sync to capture it.",
          { offset, sinceSec }
        );
        stats.poisonNodesSkipped++;
        return null;
      }

      // Narrow the window and retry at the same offset.
      const half = Math.max(1, Math.floor(windowSize / 2));
      this.logger.warn(
        "Delta-sync page returned HTTP 500; bisecting to isolate the bad node",
        { offset, limit: windowSize, retryLimit: half }
      );
      windowSize = half;
    }
  }
}

Expand Down Expand Up @@ -304,6 +411,7 @@ export class DeltaSyncService {
watermarkAfter: 0,
durationMs: 0,
pages: 0,
poisonNodesSkipped: 0,
};
}

Expand Down Expand Up @@ -333,8 +441,9 @@ export class DeltaSyncService {
let fieldValuesCleared = 0;
let pages = 0;
const changedNodeIds: string[] = [];
const stats = { poisonNodesSkipped: 0 };

for await (const page of this.fetchChangedNodes(sinceMs)) {
for await (const page of this.fetchChangedNodes(sinceMs, stats)) {
pages++;
nodesFound += page.length;

Expand Down Expand Up @@ -363,9 +472,13 @@ export class DeltaSyncService {
embeddingsGenerated = 0;
}

// Step 5: Update watermark
// Step 5: Update watermark.
// Advance when we found nodes OR skipped poison nodes — skipping past a
// poison node is real progress, and advancing the watermark stops the
// next cycle from re-requesting (and re-skipping) the same bad node.
const watermarkAfter = Date.now();
if (nodesFound > 0) {
const madeProgress = nodesFound > 0 || stats.poisonNodesSkipped > 0;
if (madeProgress) {
this.updateWatermark(watermarkAfter, nodesFound);
}

Expand All @@ -381,9 +494,10 @@ export class DeltaSyncService {
embeddingsGenerated,
embeddingsSkipped,
watermarkBefore: sinceMs,
watermarkAfter: nodesFound > 0 ? watermarkAfter : sinceMs,
watermarkAfter: madeProgress ? watermarkAfter : sinceMs,
durationMs,
pages,
poisonNodesSkipped: stats.poisonNodesSkipped,
};
} finally {
// T-2.3: Always release lock
Expand Down
7 changes: 7 additions & 0 deletions src/types/local-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,13 @@ export interface DeltaSyncResult {
durationMs: number;
/** Number of API pages fetched */
pages: number;
/**
* Number of nodes skipped because Tana's Local API returned HTTP 500 while
* serializing them (v2.6 poison-node isolation). A subsequent full
* `sync index` re-captures them from the export. >0 means delta sync made
* progress past a node that would otherwise have wedged it indefinitely.
*/
poisonNodesSkipped: number;
}

/**
Expand Down
Loading
Loading