From 556f392373c5e2eea99d3fa75d9623fa47351676 Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Wed, 20 May 2026 11:33:40 -0400 Subject: [PATCH 1/7] fix(retrieval): mark deals cleaned_up when SP reports piece missing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-flight SP `/pdp/piece/:pieceCid/status` before IPNI verify + block fetch in the retrieval check path. On 404 mark the deal cleaned_up and skip — saves the 30s IPNI timeout plus downstream block-fetch 404 noise per stale candidate. Closes #555 --- .../src/retrieval/retrieval.service.spec.ts | 153 ++++++++++++++++++ .../src/retrieval/retrieval.service.ts | 44 +++++ 2 files changed, 197 insertions(+) diff --git a/apps/backend/src/retrieval/retrieval.service.spec.ts b/apps/backend/src/retrieval/retrieval.service.spec.ts index a467e92c..0fa333cf 100644 --- a/apps/backend/src/retrieval/retrieval.service.spec.ts +++ b/apps/backend/src/retrieval/retrieval.service.spec.ts @@ -630,3 +630,156 @@ describe("RetrievalService DB/provider drift", () => { expect(cleanedUpCall?.params).toEqual({ cleanedUp: false }); }); }); + +describe("RetrievalService SP piece status pre-flight", () => { + type RetrievalServicePrivate = RetrievalService & { + performAllRetrievals: (deal: Deal, signal?: AbortSignal) => Promise; + }; + + const mockRetrievalAddonsService = { + testAllRetrievalMethods: vi.fn().mockResolvedValue({ + dealId: "deal-1", + results: [], + summary: { totalMethods: 0, successfulMethods: 0, failedMethods: 0 }, + testedAt: new Date(), + }), + getApplicableStrategies: vi.fn().mockReturnValue([{}]), + }; + const mockConfigService = { + get: vi.fn((key: string) => { + if (key === "app") return { runMode: "api" }; + if (key === "jobs") return { pgbossSchedulerEnabled: false }; + if (key === "blockchain") return { network: "calibration" }; + if (key === "dataset") return { randomDatasetSizes: [] }; + return undefined; + }), + }; + const mockDealRepository = { + update: vi.fn().mockResolvedValue({ affected: 1 }), + }; + const mockSpRepository = { findOne: vi.fn() }; + const mockRetrievalMetrics = { + observeCheckDuration: vi.fn(), + recordStatus: vi.fn(), + observeFirstByteMs: vi.fn(), + observeLastByteMs: vi.fn(), + observeThroughput: vi.fn(), + recordHttpResponseCode: vi.fn(), + recordResultMetrics: vi.fn(), + }; + + afterEach(() => { + vi.restoreAllMocks(); + mockDealRepository.update.mockClear(); + mockRetrievalAddonsService.testAllRetrievalMethods.mockClear(); + }); + + const buildDeal = (overrides: Partial = {}): Deal => + ({ + id: "deal-1", + spAddress: "0xsp", + walletAddress: "0xwallet", + pieceCid: "bafy-piece", + ...overrides, + }) as Deal; + + const createService = async (): Promise => { + const module: TestingModule = await Test.createTestingModule({ + providers: [ + RetrievalService, + { provide: RetrievalAddonsService, useValue: mockRetrievalAddonsService }, + { provide: getRepositoryToken(Deal), useValue: mockDealRepository }, + { provide: getRepositoryToken(Retrieval), useValue: { create: vi.fn(), save: vi.fn() } }, + { provide: getRepositoryToken(StorageProvider), useValue: mockSpRepository }, + { provide: RetrievalCheckMetrics, useValue: mockRetrievalMetrics }, + { provide: DiscoverabilityCheckMetrics, useValue: { recordStatus: vi.fn(), observeIpniVerifyMs: vi.fn() } }, + { provide: IpniVerificationService, useValue: { verify: vi.fn() } }, + { provide: ConfigService, useValue: mockConfigService }, + { provide: ClickhouseService, useValue: { insert: vi.fn(), probeLocation: "test" } }, + ], + }).compile(); + return module.get(RetrievalService) as unknown as RetrievalServicePrivate; + }; + + it("marks deal cleaned_up and skips retrieval when SP returns 404 for piece status", async () => { + const service = await createService(); + mockSpRepository.findOne.mockResolvedValue({ + address: "0xsp", + providerId: 5, + isApproved: true, + name: "Test SP", + serviceUrl: "https://sp.example.com", + }); + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValue({ status: 404, ok: false } as Response), + ); + + const result = await service.performAllRetrievals(buildDeal()); + + expect(result).toEqual([]); + expect(mockDealRepository.update).toHaveBeenCalledWith( + { id: "deal-1", cleanedUp: false }, + expect.objectContaining({ cleanedUp: true, cleanedUpAt: expect.any(Date) }), + ); + expect(mockRetrievalAddonsService.testAllRetrievalMethods).not.toHaveBeenCalled(); + }); + + it("proceeds with retrieval when SP confirms piece exists", async () => { + const service = await createService(); + mockSpRepository.findOne.mockResolvedValue({ + address: "0xsp", + providerId: 5, + isApproved: true, + name: "Test SP", + serviceUrl: "https://sp.example.com", + }); + vi.stubGlobal( + "fetch", + vi.fn().mockResolvedValue({ status: 200, ok: true } as Response), + ); + + await service.performAllRetrievals(buildDeal()).catch(() => undefined); + + expect(mockDealRepository.update).not.toHaveBeenCalled(); + expect(mockRetrievalAddonsService.testAllRetrievalMethods).toHaveBeenCalled(); + }); + + it("proceeds with retrieval when SP probe fails with a network error (unknown)", async () => { + const service = await createService(); + mockSpRepository.findOne.mockResolvedValue({ + address: "0xsp", + providerId: 5, + isApproved: true, + name: "Test SP", + serviceUrl: "https://sp.example.com", + }); + vi.stubGlobal( + "fetch", + vi.fn().mockRejectedValue(new Error("network unreachable")), + ); + + await service.performAllRetrievals(buildDeal()).catch(() => undefined); + + expect(mockDealRepository.update).not.toHaveBeenCalled(); + expect(mockRetrievalAddonsService.testAllRetrievalMethods).toHaveBeenCalled(); + }); + + it("skips probe and proceeds when provider has no serviceUrl", async () => { + const service = await createService(); + mockSpRepository.findOne.mockResolvedValue({ + address: "0xsp", + providerId: 5, + isApproved: true, + name: "Test SP", + serviceUrl: null, + }); + const fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + + await service.performAllRetrievals(buildDeal()).catch(() => undefined); + + expect(fetchMock).not.toHaveBeenCalled(); + expect(mockDealRepository.update).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/backend/src/retrieval/retrieval.service.ts b/apps/backend/src/retrieval/retrieval.service.ts index fea7c897..d3d358bb 100644 --- a/apps/backend/src/retrieval/retrieval.service.ts +++ b/apps/backend/src/retrieval/retrieval.service.ts @@ -120,6 +120,25 @@ export class RetrievalService { return []; } + // Pre-flight: if the SP itself reports it no longer has this piece, mark the deal + // cleaned_up so it stops polluting future retrieval candidates and skip the check. + // Saves the 30s IPNI verify timeout + downstream block-fetch 404 noise per stale deal. + if (provider.serviceUrl && deal.pieceCid) { + const presence = await this.probeSpPieceStatus(provider.serviceUrl, deal.pieceCid, signal); + if (presence === "missing") { + await this.dealRepository.update( + { id: deal.id, cleanedUp: false }, + { cleanedUp: true, cleanedUpAt: new Date() }, + ); + this.logger.warn({ + ...retrievalLogContext, + event: "retrieval_skipped_piece_missing", + message: "SP reports piece missing; marked deal cleaned_up and skipped retrieval", + }); + return []; + } + } + type SubStatus = "success" | "failure.timedout" | "failure.other"; let terminalStatus: SubStatus | null = null; let retrievals: Retrieval[] = []; @@ -366,6 +385,31 @@ export class RetrievalService { return this.spRepository.findOne({ where: { address } }); } + /** + * Probe `${serviceUrl}/pdp/piece/:pieceCid/status` to determine whether the SP + * currently has the piece. Returns: + * - "missing": SP responded 404 (authoritative — SP does not have the piece) + * - "exists": SP responded 2xx (piece is held) + * - "unknown": network error, timeout, or other status (don't act on it) + */ + private async probeSpPieceStatus( + serviceUrl: string, + pieceCid: string, + outerSignal?: AbortSignal, + ): Promise<"missing" | "exists" | "unknown"> { + const url = `${serviceUrl.replace(/\/$/, "")}/pdp/piece/${pieceCid}/status`; + const timeoutSignal = AbortSignal.timeout(10_000); + const signal = outerSignal ? AbortSignal.any([outerSignal, timeoutSignal]) : timeoutSignal; + try { + const res = await fetch(url, { method: "GET", signal }); + if (res.status === 404) return "missing"; + if (res.ok) return "exists"; + return "unknown"; + } catch { + return "unknown"; + } + } + /** * We select a random successful deal (DEAL_CREATED only) for a given provider. * Uses Postgres ORDER BY RANDOM() since Dealbot is Postgres-only. From 642de456309978834db4f2d6b47c909dfc8cdb27 Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Wed, 20 May 2026 11:40:11 -0400 Subject: [PATCH 2/7] fix(retrieval): tighten piece-status probe per peer review - Build probe URL via WHATWG URL + encodeURIComponent (handles trailing slash + non-trivial pieceCid chars) - Re-throw outer-signal aborts; only probe-timeout/network fall through - Post-probe signal.throwIfAborted() to stop promptly on cancellation - 10s -> 5s probe timeout; safer pre-flight that beats 30s IPNI path - Record retrievalStatus="skipped.piece_missing" + log statusUrl/statusCode/probeDurationMs - User-Agent header on probe; unstubAllGlobals in tests for isolation - 2 new tests: outer-signal abort re-thrown; pieceCid URL-encoded --- .../src/retrieval/retrieval.service.spec.ts | 60 +++++++++++++++++-- .../src/retrieval/retrieval.service.ts | 47 +++++++++++---- 2 files changed, 90 insertions(+), 17 deletions(-) diff --git a/apps/backend/src/retrieval/retrieval.service.spec.ts b/apps/backend/src/retrieval/retrieval.service.spec.ts index 0fa333cf..40153c76 100644 --- a/apps/backend/src/retrieval/retrieval.service.spec.ts +++ b/apps/backend/src/retrieval/retrieval.service.spec.ts @@ -632,7 +632,8 @@ describe("RetrievalService DB/provider drift", () => { }); describe("RetrievalService SP piece status pre-flight", () => { - type RetrievalServicePrivate = RetrievalService & { + type PublicInterface = { [K in keyof T]: T[K] }; + type RetrievalServicePrivate = PublicInterface & { performAllRetrievals: (deal: Deal, signal?: AbortSignal) => Promise; }; @@ -669,9 +670,11 @@ describe("RetrievalService SP piece status pre-flight", () => { }; afterEach(() => { + vi.unstubAllGlobals(); vi.restoreAllMocks(); mockDealRepository.update.mockClear(); mockRetrievalAddonsService.testAllRetrievalMethods.mockClear(); + mockRetrievalMetrics.recordStatus.mockClear(); }); const buildDeal = (overrides: Partial = {}): Deal => @@ -710,10 +713,8 @@ describe("RetrievalService SP piece status pre-flight", () => { name: "Test SP", serviceUrl: "https://sp.example.com", }); - vi.stubGlobal( - "fetch", - vi.fn().mockResolvedValue({ status: 404, ok: false } as Response), - ); + const fetchMock = vi.fn().mockResolvedValue({ status: 404, ok: false } as Response); + vi.stubGlobal("fetch", fetchMock); const result = await service.performAllRetrievals(buildDeal()); @@ -723,6 +724,14 @@ describe("RetrievalService SP piece status pre-flight", () => { expect.objectContaining({ cleanedUp: true, cleanedUpAt: expect.any(Date) }), ); expect(mockRetrievalAddonsService.testAllRetrievalMethods).not.toHaveBeenCalled(); + expect(mockRetrievalMetrics.recordStatus).toHaveBeenCalledWith( + expect.any(Object), + "skipped.piece_missing", + ); + expect(fetchMock).toHaveBeenCalledWith( + "https://sp.example.com/pdp/piece/bafy-piece/status", + expect.objectContaining({ method: "GET" }), + ); }); it("proceeds with retrieval when SP confirms piece exists", async () => { @@ -782,4 +791,45 @@ describe("RetrievalService SP piece status pre-flight", () => { expect(fetchMock).not.toHaveBeenCalled(); expect(mockDealRepository.update).not.toHaveBeenCalled(); }); + + it("re-throws when the outer signal aborts during the probe", async () => { + const service = await createService(); + mockSpRepository.findOne.mockResolvedValue({ + address: "0xsp", + providerId: 5, + isApproved: true, + name: "Test SP", + serviceUrl: "https://sp.example.com", + }); + const ac = new AbortController(); + const fetchMock = vi.fn().mockImplementation(() => { + ac.abort(new Error("cancelled by job")); + return Promise.reject(new DOMException("aborted", "AbortError")); + }); + vi.stubGlobal("fetch", fetchMock); + + await expect(service.performAllRetrievals(buildDeal(), ac.signal)).rejects.toThrow(); + expect(mockDealRepository.update).not.toHaveBeenCalled(); + expect(mockRetrievalAddonsService.testAllRetrievalMethods).not.toHaveBeenCalled(); + }); + + it("URL-encodes the pieceCid in the probe URL", async () => { + const service = await createService(); + mockSpRepository.findOne.mockResolvedValue({ + address: "0xsp", + providerId: 5, + isApproved: true, + name: "Test SP", + serviceUrl: "https://sp.example.com/base/", + }); + const fetchMock = vi.fn().mockResolvedValue({ status: 200, ok: true } as Response); + vi.stubGlobal("fetch", fetchMock); + + await service.performAllRetrievals(buildDeal({ pieceCid: "bafy/with/slashes" })).catch(() => undefined); + + expect(fetchMock).toHaveBeenCalledWith( + "https://sp.example.com/pdp/piece/bafy%2Fwith%2Fslashes/status", + expect.any(Object), + ); + }); }); diff --git a/apps/backend/src/retrieval/retrieval.service.ts b/apps/backend/src/retrieval/retrieval.service.ts index d3d358bb..872756b7 100644 --- a/apps/backend/src/retrieval/retrieval.service.ts +++ b/apps/backend/src/retrieval/retrieval.service.ts @@ -25,6 +25,11 @@ import type { RetrievalTestResult, } from "../retrieval-addons/types.js"; +/** Timeout for the pre-flight SP piece-status probe. Short enough that an unresponsive + * SP still beats falling through to the 30s IPNI verify path; on timeout we treat + * the result as "unknown" and proceed with the normal retrieval. */ +const SP_PIECE_STATUS_PROBE_TIMEOUT_MS = 5_000; + @Injectable() export class RetrievalService { private readonly logger = new Logger(RetrievalService.name); @@ -124,16 +129,21 @@ export class RetrievalService { // cleaned_up so it stops polluting future retrieval candidates and skip the check. // Saves the 30s IPNI verify timeout + downstream block-fetch 404 noise per stale deal. if (provider.serviceUrl && deal.pieceCid) { - const presence = await this.probeSpPieceStatus(provider.serviceUrl, deal.pieceCid, signal); - if (presence === "missing") { + const probe = await this.probeSpPieceStatus(provider.serviceUrl, deal.pieceCid, signal); + signal?.throwIfAborted(); + if (probe.result === "missing") { await this.dealRepository.update( { id: deal.id, cleanedUp: false }, { cleanedUp: true, cleanedUpAt: new Date() }, ); + this.retrievalMetrics.recordStatus(providerLabels, "skipped.piece_missing"); this.logger.warn({ ...retrievalLogContext, event: "retrieval_skipped_piece_missing", message: "SP reports piece missing; marked deal cleaned_up and skipped retrieval", + statusUrl: probe.url, + statusCode: probe.statusCode, + probeDurationMs: probe.durationMs, }); return []; } @@ -390,23 +400,36 @@ export class RetrievalService { * currently has the piece. Returns: * - "missing": SP responded 404 (authoritative — SP does not have the piece) * - "exists": SP responded 2xx (piece is held) - * - "unknown": network error, timeout, or other status (don't act on it) + * - "unknown": network error, probe timeout, or other status (don't act on it) + * An outer-signal abort during the probe is re-thrown so the caller can stop. */ private async probeSpPieceStatus( serviceUrl: string, pieceCid: string, outerSignal?: AbortSignal, - ): Promise<"missing" | "exists" | "unknown"> { - const url = `${serviceUrl.replace(/\/$/, "")}/pdp/piece/${pieceCid}/status`; - const timeoutSignal = AbortSignal.timeout(10_000); + ): Promise<{ result: "missing" | "exists" | "unknown"; url: string; statusCode: number | null; durationMs: number }> { + const url = new URL(`/pdp/piece/${encodeURIComponent(pieceCid)}/status`, serviceUrl).toString(); + const timeoutSignal = AbortSignal.timeout(SP_PIECE_STATUS_PROBE_TIMEOUT_MS); const signal = outerSignal ? AbortSignal.any([outerSignal, timeoutSignal]) : timeoutSignal; + const start = Date.now(); try { - const res = await fetch(url, { method: "GET", signal }); - if (res.status === 404) return "missing"; - if (res.ok) return "exists"; - return "unknown"; - } catch { - return "unknown"; + const res = await fetch(url, { + method: "GET", + signal, + headers: { "User-Agent": "dealbot/probe" }, + }); + const durationMs = Date.now() - start; + if (res.status === 404) return { result: "missing", url, statusCode: 404, durationMs }; + if (res.ok) return { result: "exists", url, statusCode: res.status, durationMs }; + return { result: "unknown", url, statusCode: res.status, durationMs }; + } catch (error) { + // Re-throw caller-initiated aborts so retrieval stops promptly. Probe-timeout + // and network errors fall through as "unknown" — we don't want to mark deals + // cleaned_up on flaky infra. + if (outerSignal?.aborted) { + throw error; + } + return { result: "unknown", url, statusCode: null, durationMs: Date.now() - start }; } } From 97aa1515012857a2fe2cefb47f4feeca8f80352e Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Wed, 20 May 2026 11:41:10 -0400 Subject: [PATCH 3/7] fix(retrieval): drain probe response + report update affected count Address Copilot review: - Cancel fetch response body to release the undici socket back to the pool - Capture dealRepository.update affected count; log distinguishes "marked cleaned_up" from "concurrent writer already cleaned" --- apps/backend/src/retrieval/retrieval.service.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval/retrieval.service.ts b/apps/backend/src/retrieval/retrieval.service.ts index 872756b7..fc51516a 100644 --- a/apps/backend/src/retrieval/retrieval.service.ts +++ b/apps/backend/src/retrieval/retrieval.service.ts @@ -132,18 +132,23 @@ export class RetrievalService { const probe = await this.probeSpPieceStatus(provider.serviceUrl, deal.pieceCid, signal); signal?.throwIfAborted(); if (probe.result === "missing") { - await this.dealRepository.update( + const updateResult = await this.dealRepository.update( { id: deal.id, cleanedUp: false }, { cleanedUp: true, cleanedUpAt: new Date() }, ); + const affected = updateResult.affected ?? 0; this.retrievalMetrics.recordStatus(providerLabels, "skipped.piece_missing"); this.logger.warn({ ...retrievalLogContext, event: "retrieval_skipped_piece_missing", - message: "SP reports piece missing; marked deal cleaned_up and skipped retrieval", + message: + affected > 0 + ? "SP reports piece missing; marked deal cleaned_up and skipped retrieval" + : "SP reports piece missing; deal already cleaned_up (concurrent writer)", statusUrl: probe.url, statusCode: probe.statusCode, probeDurationMs: probe.durationMs, + affected, }); return []; } @@ -418,6 +423,9 @@ export class RetrievalService { signal, headers: { "User-Agent": "dealbot/probe" }, }); + // Drain/cancel the body so undici returns the socket to the pool instead of + // keeping it pinned to an unread response. Body content is irrelevant here. + await res.body?.cancel().catch(() => undefined); const durationMs = Date.now() - start; if (res.status === 404) return { result: "missing", url, statusCode: 404, durationMs }; if (res.ok) return { result: "exists", url, statusCode: res.status, durationMs }; From 134ec76901e4cbaf766a6f041fee199b2bc54cea Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Wed, 20 May 2026 11:50:46 -0400 Subject: [PATCH 4/7] style: apply biome format to retrieval.service.spec.ts --- .../src/retrieval/retrieval.service.spec.ts | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/apps/backend/src/retrieval/retrieval.service.spec.ts b/apps/backend/src/retrieval/retrieval.service.spec.ts index 40153c76..0563f123 100644 --- a/apps/backend/src/retrieval/retrieval.service.spec.ts +++ b/apps/backend/src/retrieval/retrieval.service.spec.ts @@ -724,10 +724,7 @@ describe("RetrievalService SP piece status pre-flight", () => { expect.objectContaining({ cleanedUp: true, cleanedUpAt: expect.any(Date) }), ); expect(mockRetrievalAddonsService.testAllRetrievalMethods).not.toHaveBeenCalled(); - expect(mockRetrievalMetrics.recordStatus).toHaveBeenCalledWith( - expect.any(Object), - "skipped.piece_missing", - ); + expect(mockRetrievalMetrics.recordStatus).toHaveBeenCalledWith(expect.any(Object), "skipped.piece_missing"); expect(fetchMock).toHaveBeenCalledWith( "https://sp.example.com/pdp/piece/bafy-piece/status", expect.objectContaining({ method: "GET" }), @@ -743,10 +740,7 @@ describe("RetrievalService SP piece status pre-flight", () => { name: "Test SP", serviceUrl: "https://sp.example.com", }); - vi.stubGlobal( - "fetch", - vi.fn().mockResolvedValue({ status: 200, ok: true } as Response), - ); + vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ status: 200, ok: true } as Response)); await service.performAllRetrievals(buildDeal()).catch(() => undefined); @@ -763,10 +757,7 @@ describe("RetrievalService SP piece status pre-flight", () => { name: "Test SP", serviceUrl: "https://sp.example.com", }); - vi.stubGlobal( - "fetch", - vi.fn().mockRejectedValue(new Error("network unreachable")), - ); + vi.stubGlobal("fetch", vi.fn().mockRejectedValue(new Error("network unreachable"))); await service.performAllRetrievals(buildDeal()).catch(() => undefined); From d6b197193f553747445628cf935f36cdc7f79911 Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Wed, 20 May 2026 11:53:04 -0400 Subject: [PATCH 5/7] fix(retrieval): drop speculative concurrent-writer log branch affected=0 today is effectively unreachable. Forward-looking branch was weakly justified. Log affected count and let readers interpret without speculating about cause. --- apps/backend/src/retrieval/retrieval.service.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/apps/backend/src/retrieval/retrieval.service.ts b/apps/backend/src/retrieval/retrieval.service.ts index fc51516a..774103bc 100644 --- a/apps/backend/src/retrieval/retrieval.service.ts +++ b/apps/backend/src/retrieval/retrieval.service.ts @@ -136,19 +136,15 @@ export class RetrievalService { { id: deal.id, cleanedUp: false }, { cleanedUp: true, cleanedUpAt: new Date() }, ); - const affected = updateResult.affected ?? 0; this.retrievalMetrics.recordStatus(providerLabels, "skipped.piece_missing"); this.logger.warn({ ...retrievalLogContext, event: "retrieval_skipped_piece_missing", - message: - affected > 0 - ? "SP reports piece missing; marked deal cleaned_up and skipped retrieval" - : "SP reports piece missing; deal already cleaned_up (concurrent writer)", + message: "SP reports piece missing; marked deal cleaned_up and skipped retrieval", statusUrl: probe.url, statusCode: probe.statusCode, probeDurationMs: probe.durationMs, - affected, + affected: updateResult.affected ?? 0, }); return []; } From 99f508aa23040226fe517955831165d86ace33db Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Wed, 20 May 2026 12:21:17 -0400 Subject: [PATCH 6/7] fix(retrieval): match existing serviceUrl join convention Synapse-sdk and dealbot's ipni.strategy + pull-check use direct concat on serviceUrl. Match that. Base paths are not part of the SP serviceUrl contract. --- apps/backend/src/retrieval/retrieval.service.spec.ts | 4 ++-- apps/backend/src/retrieval/retrieval.service.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/backend/src/retrieval/retrieval.service.spec.ts b/apps/backend/src/retrieval/retrieval.service.spec.ts index 0563f123..a5f286cf 100644 --- a/apps/backend/src/retrieval/retrieval.service.spec.ts +++ b/apps/backend/src/retrieval/retrieval.service.spec.ts @@ -804,14 +804,14 @@ describe("RetrievalService SP piece status pre-flight", () => { expect(mockRetrievalAddonsService.testAllRetrievalMethods).not.toHaveBeenCalled(); }); - it("URL-encodes the pieceCid in the probe URL", async () => { + it("strips trailing slash from serviceUrl and URL-encodes the pieceCid", async () => { const service = await createService(); mockSpRepository.findOne.mockResolvedValue({ address: "0xsp", providerId: 5, isApproved: true, name: "Test SP", - serviceUrl: "https://sp.example.com/base/", + serviceUrl: "https://sp.example.com/", }); const fetchMock = vi.fn().mockResolvedValue({ status: 200, ok: true } as Response); vi.stubGlobal("fetch", fetchMock); diff --git a/apps/backend/src/retrieval/retrieval.service.ts b/apps/backend/src/retrieval/retrieval.service.ts index 774103bc..fb47c2a4 100644 --- a/apps/backend/src/retrieval/retrieval.service.ts +++ b/apps/backend/src/retrieval/retrieval.service.ts @@ -409,7 +409,7 @@ export class RetrievalService { pieceCid: string, outerSignal?: AbortSignal, ): Promise<{ result: "missing" | "exists" | "unknown"; url: string; statusCode: number | null; durationMs: number }> { - const url = new URL(`/pdp/piece/${encodeURIComponent(pieceCid)}/status`, serviceUrl).toString(); + const url = `${serviceUrl.replace(/\/$/, "")}/pdp/piece/${encodeURIComponent(pieceCid)}/status`; const timeoutSignal = AbortSignal.timeout(SP_PIECE_STATUS_PROBE_TIMEOUT_MS); const signal = outerSignal ? AbortSignal.any([outerSignal, timeoutSignal]) : timeoutSignal; const start = Date.now(); From 6709f8a81fd791844573031d21c7acec613f8311 Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Wed, 20 May 2026 13:08:35 -0400 Subject: [PATCH 7/7] docs(retrieval): document skipped.piece_missing retrievalStatus Per PR review on #556. Adds a row note on retrievalStatus in events-and-metrics.md and a subsection in retrievals.md explaining when the status is emitted and how it differs from a transport failure. --- docs/checks/events-and-metrics.md | 2 +- docs/checks/retrievals.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 89fea66c..a1761fc8 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -140,7 +140,7 @@ sequenceDiagram | `dataStorageStatus` | Data Storage | When the Data Storage check completes (all four sub-statuses done) | `success`, `failure.timedout`, `failure.other` from [Deal Status Progression](./data-storage.md#deal-status-progression). | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `discoverabilityStatus` | Data Storage, Retrieval | [`ipniVerificationComplete`](#ipniVerificationComplete) | `success`, `failure.timedout`, `failure.other` from [Data Storage Sub-status meanings](./data-storage.md#sub-status-meanings). | | | `ipfsRetrievalHttpResponseCode` | Data Storage, Retrieval | [`ipfsRetrievalLastByteReceived`](#ipfsRetrievalLastByteReceived) | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` | [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts) | -| `retrievalStatus` | Data Storage, Retrieval | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | `success`, `failure.timedout`, `failure.other` from [Data Storage Sub-status meanings](./data-storage.md#sub-status-meanings). | | +| `retrievalStatus` | Data Storage, Retrieval | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | `success`, `failure.timedout`, `failure.other` from [Data Storage Sub-status meanings](./data-storage.md#sub-status-meanings). `skipped.piece_missing` is emitted on the Retrieval path only when the SP's `/pdp/piece/:pieceCid/status` returns 404 during the pre-flight probe; the deal is marked `cleaned_up=true` and removed from the retrieval candidate pool. | | | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | diff --git a/docs/checks/retrievals.md b/docs/checks/retrievals.md index 43cee258..a205a8e7 100644 --- a/docs/checks/retrievals.md +++ b/docs/checks/retrievals.md @@ -113,6 +113,10 @@ Metric definitions (including Prometheus metrics) live in [Dealbot Events & Metr `retrievalStatus` counts the `/ipfs` transport stage only (assertions 2 and 3). The IPNI assertion (1) is counted on `discoverabilityStatus`. There is no composite "retrieval check" counter; overall success comes from `dataStorageStatus`, which is `success` only when all sub-statuses succeed. See [Deal Status Progression](./data-storage.md#deal-status-progression). +### `skipped.piece_missing` + +Emitted when a retrieval pre-flight probe to `${serviceUrl}/pdp/piece/:pieceCid/status` returns `404`. The deal is marked `cleaned_up=true` and removed from future retrieval candidate selection. This is not a failure of the SP's transport surface, but a signal that the piece no longer exists on the SP while the dataset is still live (for example, the SP scheduled a piece removal via PDP, or the piece dropped without an on-chain notification). The probe runs before IPNI verification and transport, so a 30s IPNI timeout is avoided on stale candidates. Search logs for `retrieval_skipped_piece_missing` to correlate. + ## Configuration Key environment variables that control retrieval testing: