From c9bdfa4393b5cb28199c310ee81cea26a40c890a Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 21 Apr 2026 14:54:42 +0200 Subject: [PATCH 01/55] feat: anon piece selection and retrieval --- .gitignore | 2 + apps/backend/.env.example | 17 +- apps/backend/README.md | 2 +- apps/backend/src/app.module.ts | 2 + apps/backend/src/config/app.config.ts | 86 +++++- .../data-retention/data-retention.module.ts | 4 +- .../data-retention.service.spec.ts | 182 ++++++------- .../data-retention/data-retention.service.ts | 16 +- apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 100 +++++++ .../entities/job-schedule-state.entity.ts | 1 + .../1762000000000-CreateAnonRetrievals.ts | 64 +++++ .../http-client/http-client.service.spec.ts | 93 ++++++- .../src/http-client/http-client.service.ts | 87 +++++-- apps/backend/src/http-client/types.ts | 2 + apps/backend/src/jobs/job-queues.ts | 1 + apps/backend/src/jobs/jobs.module.ts | 2 + apps/backend/src/jobs/jobs.service.spec.ts | 128 ++++----- apps/backend/src/jobs/jobs.service.ts | 101 +++++++- .../metrics-prometheus/check-metric-labels.ts | 2 +- .../check-metrics.service.ts | 63 +++++ .../metrics-prometheus.module.ts | 53 ++++ .../src/pdp-subgraph/pdp-subgraph.module.ts | 8 - apps/backend/src/pdp-subgraph/queries.ts | 24 -- .../anon-piece-selector.service.spec.ts | 168 ++++++++++++ .../anon-piece-selector.service.ts | 208 +++++++++++++++ .../anon-retrieval.service.spec.ts | 189 ++++++++++++++ .../retrieval-anon/anon-retrieval.service.ts | 244 ++++++++++++++++++ .../retrieval-anon/car-validation.service.ts | 223 ++++++++++++++++ .../retrieval-anon/piece-retrieval.service.ts | 195 ++++++++++++++ .../retrieval-anon/retrieval-anon.module.ts | 27 ++ apps/backend/src/retrieval-anon/types.ts | 35 +++ apps/backend/src/subgraph/queries.ts | 78 ++++++ apps/backend/src/subgraph/subgraph.module.ts | 8 + .../subgraph.service.spec.ts} | 167 +++++++++++- .../subgraph.service.ts} | 232 ++++++++++++++--- 
.../{pdp-subgraph => subgraph}/types.spec.ts | 0 .../src/{pdp-subgraph => subgraph}/types.ts | 101 ++++++++ .../src/wallet-sdk/wallet-sdk.service.spec.ts | 2 +- docs/checks/data-retention.md | 10 +- ...-configuration-and-approval-methodology.md | 2 +- docs/environment-variables.md | 34 ++- .../local/backend-configmap-local.yaml | 2 +- pnpm-lock.yaml | 36 +-- 44 files changed, 2683 insertions(+), 327 deletions(-) create mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts create mode 100644 apps/backend/src/database/migrations/1762000000000-CreateAnonRetrievals.ts delete mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts delete mode 100644 apps/backend/src/pdp-subgraph/queries.ts create mode 100644 apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts create mode 100644 apps/backend/src/retrieval-anon/anon-piece-selector.service.ts create mode 100644 apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts create mode 100644 apps/backend/src/retrieval-anon/anon-retrieval.service.ts create mode 100644 apps/backend/src/retrieval-anon/car-validation.service.ts create mode 100644 apps/backend/src/retrieval-anon/piece-retrieval.service.ts create mode 100644 apps/backend/src/retrieval-anon/retrieval-anon.module.ts create mode 100644 apps/backend/src/retrieval-anon/types.ts create mode 100644 apps/backend/src/subgraph/queries.ts create mode 100644 apps/backend/src/subgraph/subgraph.module.ts rename apps/backend/src/{pdp-subgraph/pdp-subgraph.service.spec.ts => subgraph/subgraph.service.spec.ts} (79%) rename apps/backend/src/{pdp-subgraph/pdp-subgraph.service.ts => subgraph/subgraph.service.ts} (52%) rename apps/backend/src/{pdp-subgraph => subgraph}/types.spec.ts (100%) rename apps/backend/src/{pdp-subgraph => subgraph}/types.ts (58%) diff --git a/.gitignore b/.gitignore index fc72832b..cbf7f9d7 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ coverage/ # per-package lockfiles are stray apps/*/pnpm-lock.yaml 
!pnpm-lock.yaml + +.tool-versions diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 6815a66f..26469c52 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -23,7 +23,8 @@ WALLET_ADDRESS=0x0000000000000000000000000000000000000000 WALLET_PRIVATE_KEY=your_private_key_here CHECK_DATASET_CREATION_FEES=true USE_ONLY_APPROVED_PROVIDERS=true -PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +# Point at the dealbot-owned subgraph on Goldsky (see apps/subgraph/README.md). +SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public/<project_id>/subgraphs/dealbot-subgraph/<version>/gn # Minimum number of datasets per SP (default: 1). When > 1, a separate data_set_creation job provisions extra datasets. MIN_NUM_DATASETS_FOR_CHECKS=1 @@ -52,6 +53,9 @@ DEALBOT_MAINTENANCE_WINDOW_MINUTES=20 DEALS_PER_SP_PER_HOUR=2 DATASET_CREATIONS_PER_SP_PER_HOUR=1 RETRIEVALS_PER_SP_PER_HOUR=1 +RETRIEVALS_ANON_PER_SP_PER_HOUR= +ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT=5 +METRICS_PER_HOUR=2 PG_BOSS_LOCAL_CONCURRENCY=20 JOB_SCHEDULER_POLL_SECONDS=300 JOB_WORKER_POLL_SECONDS=60 @@ -60,6 +64,7 @@ JOB_SCHEDULE_PHASE_SECONDS=0 JOB_ENQUEUE_JITTER_SECONDS=0 DEAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for deal jobs (TODO: reduce default to 3m) RETRIEVAL_JOB_TIMEOUT_SECONDS=60 # 1m: Max runtime for retrieval jobs (TODO: reduce default to 30s) +ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for anon retrieval jobs (pieces up to ~70 MiB) IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IPFS DAGs DEALBOT_PGBOSS_POOL_MAX=1 DEALBOT_PGBOSS_SCHEDULER_ENABLED=true @@ -73,9 +78,13 @@ PROXY_LIST=http://username:password@host:port,http://username:password@host:port PROXY_LOCATIONS=l1,l2 # Timeout Configuration (in milliseconds) -CONNECT_TIMEOUT_MS=10000 # 10s: Initial connection timeout -HTTP_REQUEST_TIMEOUT_MS=240000 # 4m: Total transfer timeout for HTTP/1.1 (10MiB @ 170KB/s + overhead) -HTTP2_REQUEST_TIMEOUT_MS=240000 # 4m: Total transfer timeout
for HTTP/2 (10MiB @ 170KB/s + overhead) +CONNECT_TIMEOUT_MS=10000 # 10s: Connection + response-headers timeout (scoped to the header phase only) +# HTTP_REQUEST_TIMEOUT_MS and HTTP2_REQUEST_TIMEOUT_MS default to the longest job timeout above +# (max of DEAL_/RETRIEVAL_/ANON_RETRIEVAL_/DATA_SET_CREATION_/MAX_PIECE_CLEANUP_ * 1000 ms) so the +# HTTP-level ceiling never pre-empts a job-scoped AbortSignal. Only override when you have a non-job +# caller of HttpClientService that needs a specific deadline. +# HTTP_REQUEST_TIMEOUT_MS=360000 +# HTTP2_REQUEST_TIMEOUT_MS=360000 # SP Blocklists configuration # BLOCKED_SP_IDS=1234,5678 diff --git a/apps/backend/README.md b/apps/backend/README.md index 19ee970a..4805080f 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -104,7 +104,7 @@ All configuration is done via environment variables in `.env`. | `CHECK_DATASET_CREATION_FEES` | Check fees before dataset creation | `true` | | `ENABLE_IPNI_TESTING` | IPNI testing mode (`disabled`/`random`/`always`) | `always` | | `USE_ONLY_APPROVED_PROVIDERS` | Only use approved storage providers | `true` | -| `PDP_SUBGRAPH_ENDPOINT` | PDP subgraph API endpoint for PDP proof-set/data-retention | `https://api.thegraph.com/subgraphs/filecoin/pdp` | +| `SUBGRAPH_ENDPOINT` | Subgraph GraphQL endpoint for PDP proof-set/data-retention and anon-retrieval queries | `https://api.goldsky.com/api/public/<project_id>/subgraphs/dealbot-subgraph/<version>/gn` | ### Scheduling Configuration (pg-boss) diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts index 569ec5e4..0580f339 100644 --- a/apps/backend/src/app.module.ts +++ b/apps/backend/src/app.module.ts @@ -13,6 +13,7 @@ import { JobsModule } from "./jobs/jobs.module.js"; import { MetricsPrometheusModule } from "./metrics-prometheus/metrics-prometheus.module.js"; import { ProvidersModule } from "./providers/providers.module.js"; import { RetrievalModule } from "./retrieval/retrieval.module.js"; +import { RetrievalAnonModule } from
"./retrieval-anon/retrieval-anon.module.js"; @Module({ imports: [ @@ -28,6 +29,7 @@ import { RetrievalModule } from "./retrieval/retrieval.module.js"; JobsModule, DealModule, RetrievalModule, + RetrievalAnonModule, DataSourceModule, ProvidersModule, ...(process.env.ENABLE_DEV_MODE === "true" ? [DevToolsModule] : []), diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index b3b32a37..4e49e4d8 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -56,7 +56,7 @@ export const configValidationSchema = Joi.object({ USE_ONLY_APPROVED_PROVIDERS: Joi.boolean().default(true), DEALBOT_DATASET_VERSION: Joi.string().optional(), MIN_NUM_DATASETS_FOR_CHECKS: Joi.number().integer().min(1).default(1), - PDP_SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), + SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), // Scheduling PROVIDERS_REFRESH_INTERVAL_SECONDS: Joi.number().default(4 * 3600), @@ -80,6 +80,7 @@ export const configValidationSchema = Joi.object({ DEALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(4), DATASET_CREATIONS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), RETRIEVALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(2), + RETRIEVALS_ANON_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).optional(), // Polling interval for pg-boss scheduler (lower = more responsive, higher = less DB chatter). 
JOB_SCHEDULER_POLL_SECONDS: Joi.number().min(60).default(300), JOB_WORKER_POLL_SECONDS: Joi.number().min(5).default(60), @@ -91,8 +92,10 @@ export const configValidationSchema = Joi.object({ JOB_ENQUEUE_JITTER_SECONDS: Joi.number().min(0).default(0), DEAL_JOB_TIMEOUT_SECONDS: Joi.number().min(120).default(360), // 6 minutes max runtime for data storage jobs (TODO: reduce default to 3 minutes) RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(60), // 1 minute max runtime for retrieval jobs (TODO: reduce default to 30 seconds) + ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes max runtime for anon retrieval jobs (pieces can be up to ~70 MiB) DATA_SET_CREATION_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5 minutes max runtime for dataset creation jobs IPFS_BLOCK_FETCH_CONCURRENCY: Joi.number().integer().min(1).max(32).default(6), + ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT: Joi.number().integer().min(1).max(50).default(5), // Piece Cleanup MAX_DATASET_STORAGE_SIZE_BYTES: Joi.number() @@ -131,8 +134,9 @@ export const configValidationSchema = Joi.object({ // Timeouts (in milliseconds) CONNECT_TIMEOUT_MS: Joi.number().min(1000).default(10000), // 10 seconds to establish connection/receive headers - HTTP_REQUEST_TIMEOUT_MS: Joi.number().min(1000).default(240000), // 4 minutes total for HTTP requests (10MiB @ 170KB/s + overhead) - HTTP2_REQUEST_TIMEOUT_MS: Joi.number().min(1000).default(240000), // 4 minutes total for HTTP/2 requests (10MiB @ 170KB/s + overhead) + // Defaults intentionally omitted so loadConfig can derive them from the longest job timeout. 
+ HTTP_REQUEST_TIMEOUT_MS: Joi.number().min(1000).optional(), + HTTP2_REQUEST_TIMEOUT_MS: Joi.number().min(1000).optional(), IPNI_VERIFICATION_TIMEOUT_MS: Joi.number().min(1000).default(60000), // 60 seconds max time to wait for IPNI verification IPNI_VERIFICATION_POLLING_MS: Joi.number().min(250).default(2000), // 2 seconds between IPNI verification polls @@ -173,7 +177,7 @@ export interface IBlockchainConfig { useOnlyApprovedProviders: boolean; dealbotDataSetVersion?: string; minNumDataSetsForChecks: number; - pdpSubgraphEndpoint?: string; + subgraphEndpoint?: string; } export interface ISchedulingConfig { @@ -264,6 +268,14 @@ export interface IJobsConfig { * Uses AbortController to actively cancel job execution. */ retrievalJobTimeoutSeconds: number; + /** + * Maximum runtime (seconds) for anonymous retrieval jobs before forced abort. + * + * Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB), so this is + * typically larger than `retrievalJobTimeoutSeconds`. Uses AbortController + * to actively cancel job execution while still persisting partial metrics. + */ + anonRetrievalJobTimeoutSeconds: number; /** * Target number of piece cleanup runs per storage provider per hour. * @@ -278,6 +290,12 @@ export interface IJobsConfig { * Only used when `DEALBOT_JOBS_MODE=pgboss`. */ maxPieceCleanupRuntimeSeconds: number; + + /** + * Target number of anonymous retrieval tests per storage provider per hour. + * Defaults to retrievalsPerSpPerHour when not set. + */ + retrievalsAnonPerSpPerHour: number; } export interface IDatasetConfig { @@ -295,6 +313,10 @@ export interface ITimeoutConfig { export interface IRetrievalConfig { ipfsBlockFetchConcurrency: number; + /** + * Number of CAR blocks to sample for IPNI + block-fetch validation. 
+ */ + anonBlockSampleCount: number; } export interface IPieceCleanupConfig { @@ -336,6 +358,43 @@ export interface IConfig { } export function loadConfig(): IConfig { + const jobTimeoutSeconds = { + deal: Number.parseInt(process.env.DEAL_JOB_TIMEOUT_SECONDS || "360", 10), + retrieval: Number.parseInt(process.env.RETRIEVAL_JOB_TIMEOUT_SECONDS || "60", 10), + anonRetrieval: Number.parseInt(process.env.ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS || "360", 10), + dataSetCreation: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), + pieceCleanup: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), + }; + + // HTTP-level request timeouts default to the longest job timeout so the + // per-request ceiling never caps below the per-job budget. Any job-scoped + // AbortSignal fires first and is authoritative; the HTTP timer only kicks + // in for callers that do not pass a parent signal. + const longestJobTimeoutMs = Math.max(...Object.values(jobTimeoutSeconds)) * 1000; + + const httpRequestTimeoutMs = Number.parseInt(process.env.HTTP_REQUEST_TIMEOUT_MS || String(longestJobTimeoutMs), 10); + const http2RequestTimeoutMs = Number.parseInt( + process.env.HTTP2_REQUEST_TIMEOUT_MS || String(longestJobTimeoutMs), + 10, + ); + + // Misconfiguration guard: if someone explicitly sets an HTTP timeout below + // the longest job timeout, the HTTP-level timer will abort in-flight work + // before the job signal has a chance to report it. Warn loudly so this is + // caught at boot rather than inferred from short-timeout incidents later. + for (const [name, value] of [ + ["HTTP_REQUEST_TIMEOUT_MS", httpRequestTimeoutMs], + ["HTTP2_REQUEST_TIMEOUT_MS", http2RequestTimeoutMs], + ] as const) { + if (value < longestJobTimeoutMs) { + // eslint-disable-next-line no-console + console.warn( + `[config] ${name}=${value}ms is lower than the longest job timeout (${longestJobTimeoutMs}ms). 
` + + `HTTP requests may abort before the job signal fires, producing short, unexplained timeouts.`, + ); + } + } + return { app: { env: process.env.NODE_ENV || "development", @@ -378,7 +437,7 @@ export function loadConfig(): IConfig { useOnlyApprovedProviders: process.env.USE_ONLY_APPROVED_PROVIDERS !== "false", dealbotDataSetVersion: process.env.DEALBOT_DATASET_VERSION, minNumDataSetsForChecks: Number.parseInt(process.env.MIN_NUM_DATASETS_FOR_CHECKS || "1", 10), - pdpSubgraphEndpoint: process.env.PDP_SUBGRAPH_ENDPOINT || "", + subgraphEndpoint: process.env.SUBGRAPH_ENDPOINT || "", }, scheduling: { providersRefreshIntervalSeconds: Number.parseInt(process.env.PROVIDERS_REFRESH_INTERVAL_SECONDS || "14400", 10), @@ -401,11 +460,15 @@ export function loadConfig(): IConfig { catchupMaxEnqueue: Number.parseInt(process.env.JOB_CATCHUP_MAX_ENQUEUE || "10", 10), schedulePhaseSeconds: Number.parseInt(process.env.JOB_SCHEDULE_PHASE_SECONDS || "0", 10), enqueueJitterSeconds: Number.parseInt(process.env.JOB_ENQUEUE_JITTER_SECONDS || "0", 10), - dealJobTimeoutSeconds: Number.parseInt(process.env.DEAL_JOB_TIMEOUT_SECONDS || "360", 10), - retrievalJobTimeoutSeconds: Number.parseInt(process.env.RETRIEVAL_JOB_TIMEOUT_SECONDS || "60", 10), - dataSetCreationJobTimeoutSeconds: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), + dealJobTimeoutSeconds: jobTimeoutSeconds.deal, + retrievalJobTimeoutSeconds: jobTimeoutSeconds.retrieval, + anonRetrievalJobTimeoutSeconds: jobTimeoutSeconds.anonRetrieval, + retrievalsAnonPerSpPerHour: Number.parseFloat( + process.env.RETRIEVALS_ANON_PER_SP_PER_HOUR || process.env.RETRIEVALS_PER_SP_PER_HOUR || "2", + ), + dataSetCreationJobTimeoutSeconds: jobTimeoutSeconds.dataSetCreation, pieceCleanupPerSpPerHour: Number.parseFloat(process.env.JOB_PIECE_CLEANUP_PER_SP_PER_HOUR || String(1 / 24)), - maxPieceCleanupRuntimeSeconds: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), + 
maxPieceCleanupRuntimeSeconds: jobTimeoutSeconds.pieceCleanup, }, dataset: { localDatasetsPath: process.env.DEALBOT_LOCAL_DATASETS_PATH || DEFAULT_LOCAL_DATASETS_PATH, @@ -427,13 +490,14 @@ export function loadConfig(): IConfig { }, timeouts: { connectTimeoutMs: Number.parseInt(process.env.CONNECT_TIMEOUT_MS || "10000", 10), - httpRequestTimeoutMs: Number.parseInt(process.env.HTTP_REQUEST_TIMEOUT_MS || "240000", 10), - http2RequestTimeoutMs: Number.parseInt(process.env.HTTP2_REQUEST_TIMEOUT_MS || "240000", 10), + httpRequestTimeoutMs, + http2RequestTimeoutMs, ipniVerificationTimeoutMs: Number.parseInt(process.env.IPNI_VERIFICATION_TIMEOUT_MS || "60000", 10), ipniVerificationPollingMs: Number.parseInt(process.env.IPNI_VERIFICATION_POLLING_MS || "2000", 10), }, retrieval: { ipfsBlockFetchConcurrency: Number.parseInt(process.env.IPFS_BLOCK_FETCH_CONCURRENCY || "6", 10), + anonBlockSampleCount: Number.parseInt(process.env.ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT || "5", 10), }, clickhouse: { url: process.env.CLICKHOUSE_URL || undefined, diff --git a/apps/backend/src/data-retention/data-retention.module.ts b/apps/backend/src/data-retention/data-retention.module.ts index f459570a..f0aec1ec 100644 --- a/apps/backend/src/data-retention/data-retention.module.ts +++ b/apps/backend/src/data-retention/data-retention.module.ts @@ -2,12 +2,12 @@ import { Module } from "@nestjs/common"; import { TypeOrmModule } from "@nestjs/typeorm"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { PdpSubgraphModule } from "../pdp-subgraph/pdp-subgraph.module.js"; +import { SubgraphModule } from "../subgraph/subgraph.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { DataRetentionService } from "./data-retention.service.js"; @Module({ - imports: [WalletSdkModule, PdpSubgraphModule, 
TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], + imports: [WalletSdkModule, SubgraphModule, TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], providers: [DataRetentionService], exports: [DataRetentionService], }) diff --git a/apps/backend/src/data-retention/data-retention.service.spec.ts b/apps/backend/src/data-retention/data-retention.service.spec.ts index 87ced66a..d2d539cf 100644 --- a/apps/backend/src/data-retention/data-retention.service.spec.ts +++ b/apps/backend/src/data-retention/data-retention.service.spec.ts @@ -7,8 +7,8 @@ import type { IConfig } from "../config/app.config.js"; import type { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import type { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; -import type { ProviderDataSetResponse } from "../pdp-subgraph/types.js"; +import type { SubgraphService } from "../subgraph/subgraph.service.js"; +import type { ProviderDataSetResponse } from "../subgraph/types.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { DataRetentionService } from "./data-retention.service.js"; @@ -42,7 +42,7 @@ describe("DataRetentionService", () => { let walletSdkServiceMock: { getTestingProviders: ReturnType; }; - let pdpSubgraphServiceMock: { + let subgraphServiceMock: { fetchSubgraphMeta: ReturnType; fetchProvidersWithDatasets: ReturnType; }; @@ -69,7 +69,7 @@ describe("DataRetentionService", () => { configServiceMock = { get: vi.fn((key: keyof IConfig) => { if (key === "blockchain") { - return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; + return { subgraphEndpoint: "https://example.com/subgraph" }; } if (key === "spBlocklists") { return { ids: new Set(), addresses: new Set() }; @@ -95,7 +95,7 @@ 
describe("DataRetentionService", () => { ]), }; - pdpSubgraphServiceMock = { + subgraphServiceMock = { fetchSubgraphMeta: vi.fn().mockResolvedValue({ _meta: { block: { @@ -146,7 +146,7 @@ describe("DataRetentionService", () => { service = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - pdpSubgraphServiceMock as unknown as PDPSubgraphService, + subgraphServiceMock as unknown as SubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -155,15 +155,15 @@ describe("DataRetentionService", () => { ); }); - it("returns early when pdpSubgraphEndpoint is empty", async () => { + it("returns early when subgraphEndpoint is empty", async () => { (configServiceMock.get as ReturnType).mockReturnValue({ - pdpSubgraphEndpoint: "", + subgraphEndpoint: "", }); await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when no testing providers configured", async () => { @@ -171,31 +171,31 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when all providers are blocked for data-retention", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new 
Set([PROVIDER_A, PROVIDER_B]) }; }); await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("excludes blocked providers from data-retention polling while retaining unblocked ones", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new Set([PROVIDER_A]) }; }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); const allAddressesPolled: string[] = ( - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] + subgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] ).flatMap(([{ addresses }]) => addresses); expect(allAddressesPolled).toContain(PROVIDER_B.toLowerCase()); expect(allAddressesPolled).not.toContain(PROVIDER_A.toLowerCase()); @@ -206,16 +206,16 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("sets baseline on first poll without emitting counters (fresh deploy / new provider)", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); - 
expect(pdpSubgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ + expect(subgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ blockNumber: 1200, addresses: [PROVIDER_A, PROVIDER_B], }); @@ -239,20 +239,20 @@ describe("DataRetentionService", () => { it("computes deltas correctly on consecutive polls", async () => { // First poll: blockNumber=1200 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); const firstCallCount = counterMock.labels.mock.calls.length; // Second poll: blockNumber=1300, provider totals changed - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300, }, }, }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n, @@ -266,7 +266,7 @@ describe("DataRetentionService", () => { }); it("does not increment counters when deltas are zero", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll await service.pollDataRetention(); @@ -288,7 +288,7 @@ describe("DataRetentionService", () => { const providerA = makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 5n }); const providerB = makeProvider({ address: PROVIDER_B, totalFaultedPeriods: 20n }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); + 
subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); await service.pollDataRetention(); @@ -310,7 +310,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -333,7 +333,7 @@ describe("DataRetentionService", () => { }); it("handles empty providers array without errors", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); await service.pollDataRetention(); @@ -347,7 +347,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -370,7 +370,7 @@ describe("DataRetentionService", () => { }); it("catches and logs errors without rethrowing", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); + subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); // Should not throw await expect(service.pollDataRetention()).resolves.toBeUndefined(); @@ -378,14 +378,14 @@ describe("DataRetentionService", () => { it("resets baseline on negative deltas without incrementing counters", async () => { // First poll: high values - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); counterMock.labels.mockClear(); // Second poll: lower values (e.g., chain reorg or 
subgraph correction) - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -394,7 +394,7 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); // Third poll: values increase from new baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 52n, totalProvingPeriods: 105n }), ]); await service.pollDataRetention(); @@ -412,7 +412,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: largeValue, totalProvingPeriods: largeValue * 2n }), ]); @@ -436,7 +436,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: maxSafeInt, totalProvingPeriods: maxSafeInt * 2n }), ]); @@ -456,7 +456,7 @@ describe("DataRetentionService", () => { totalFaultedPeriods: 5n, totalProvingPeriods: 50n, }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -475,18 +475,18 @@ describe("DataRetentionService", () => { })); walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); - 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); await service.pollDataRetention(); // Should be called twice: once for first 50, once for remaining 25 - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { addresses: expect.arrayContaining([expect.any(String)]), blockNumber: 1200, }); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); + expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); + expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); }); it("continues processing next batch if one batch fails", async () => { @@ -499,20 +499,20 @@ describe("DataRetentionService", () => { walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); // First batch fails, second succeeds - pdpSubgraphServiceMock.fetchProvidersWithDatasets + subgraphServiceMock.fetchProvidersWithDatasets .mockRejectedValueOnce(new Error("Subgraph timeout")) .mockResolvedValueOnce([]); await service.pollDataRetention(); // Both batches should be attempted - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); }); it("logs error and skips counter update when provider not found in cache but returned from subgraph", async () => { // Provider C not in cache const PROVIDER_C = "0x1234567890123456789012345678901234567890"; - 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_C })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_C })]); await service.pollDataRetention(); @@ -523,7 +523,7 @@ describe("DataRetentionService", () => { describe("cleanupStaleProviders", () => { it("does not cleanup when no stale providers exist", async () => { // First poll establishes baseline for both providers - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), ]); @@ -536,7 +536,7 @@ describe("DataRetentionService", () => { it("successfully cleans up stale provider with valid database entry", async () => { // First poll: establish baseline for PROVIDER_A - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list, only PROVIDER_B active @@ -558,7 +558,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -589,7 +589,7 @@ describe("DataRetentionService", () => { it("skips cleanup entirely when database fetch fails", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await 
service.pollDataRetention(); // Second poll: provider removed, but DB fails @@ -604,7 +604,7 @@ describe("DataRetentionService", () => { mockSPRepository.find.mockRejectedValueOnce(new Error("Database connection failed")); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -624,7 +624,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -637,7 +637,7 @@ describe("DataRetentionService", () => { it("retains baseline when provider not found in database", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed from active list @@ -653,7 +653,7 @@ describe("DataRetentionService", () => { // Database returns empty array (provider not found) mockSPRepository.find.mockResolvedValueOnce([]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -670,7 +670,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, 
totalProvingPeriods: 105n }), ]); @@ -683,7 +683,7 @@ describe("DataRetentionService", () => { it("retains baseline when provider has null providerId", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -706,7 +706,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -716,7 +716,7 @@ describe("DataRetentionService", () => { it("retains baseline when counter removal throws error", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -743,7 +743,7 @@ describe("DataRetentionService", () => { throw new Error("Counter removal failed"); }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -760,7 +760,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 110n }), ]); @@ -781,7 +781,7 @@ 
describe("DataRetentionService", () => { { id: 3, serviceProvider: PROVIDER_C, name: "Provider C", isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), makeProvider({ address: PROVIDER_C }), @@ -799,7 +799,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_C, name: "Provider C", providerId: 3, isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); @@ -815,7 +815,7 @@ describe("DataRetentionService", () => { it("skips cleanup when processing errors occurred", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed, but processing has errors @@ -824,7 +824,7 @@ describe("DataRetentionService", () => { ]); // Simulate processing error - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); + subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); await service.pollDataRetention(); @@ -841,7 +841,7 @@ describe("DataRetentionService", () => { { id: 1, serviceProvider: PROVIDER_MIXED_CASE, name: "Provider A", isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_MIXED_CASE.toLowerCase() as `0x${string}` 
}), ]); @@ -861,7 +861,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -885,7 +885,7 @@ describe("DataRetentionService", () => { // Subgraph returns same values: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // With DB baseline: faultedDelta = 10 - 10 = 0, successDelta = 90 - 90 = 0 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -907,7 +907,7 @@ describe("DataRetentionService", () => { // Subgraph returns: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // faultedDelta = 10 - 8 = 2, successDelta = 90 - 85 = 5 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -922,7 +922,7 @@ describe("DataRetentionService", () => { }); it("reloads baselines from DB on every poll", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); await service.pollDataRetention(); await service.pollDataRetention(); @@ -932,13 +932,13 @@ describe("DataRetentionService", () => { }); it("does not double-count when poll ownership alternates across worker pods", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await 
service.pollDataRetention(); const secondPod = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - pdpSubgraphServiceMock as unknown as PDPSubgraphService, + subgraphServiceMock as unknown as SubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -946,8 +946,8 @@ describe("DataRetentionService", () => { { insert: vi.fn(), probeLocation: "test" } as unknown as ClickhouseService, ); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 11n, totalProvingPeriods: 102n }), ]); await secondPod.pollDataRetention(); @@ -955,8 +955,8 @@ describe("DataRetentionService", () => { counterMock.labels.mockClear(); counterMock.inc.mockClear(); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 104n }), ]); await service.pollDataRetention(); @@ -972,8 +972,8 @@ describe("DataRetentionService", () => { ]; mockBaselineRepository.upsert.mockRejectedValueOnce(new Error("DB write failed")); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { 
number: 1300 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -981,8 +981,8 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1003,12 +1003,12 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll: DB load fails, poll bails out to avoid emitting bloated values await service.pollDataRetention(); expect(mockBaselineRepository.find).toHaveBeenCalledTimes(1); - expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); expect(counterMock.labels).not.toHaveBeenCalled(); // Second poll: DB load succeeds, baselines restored, normal delta computation @@ -1021,16 +1021,16 @@ describe("DataRetentionService", () => { it("emits real deltas on second poll after fresh deploy baseline-only first poll", async () => { // First poll: fresh deploy, no baselines in DB // Baseline set to: faultedPeriods=10, successPeriods=90 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); counterMock.labels.mockClear(); counterMock.inc.mockClear(); // Second poll: values have increased - 
pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } }, }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1044,7 +1044,7 @@ describe("DataRetentionService", () => { it("deletes baseline from DB when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1056,7 +1056,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -1069,7 +1069,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge on first poll (baseline-only)", async () => { // Provider is overdue: currentBlock=1200, // estimatedOverduePeriods = (1200 - 901) / 100 = 2.99 -> 2 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -1086,7 +1086,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge = 0 when provider is not overdue", async () => { // nextDeadline=2000 > currentBlock=1200 - 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); await service.pollDataRetention(); @@ -1095,7 +1095,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge even on negative delta (baseline reset)", async () => { // First poll: high values - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); @@ -1103,7 +1103,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: lower values (negative delta) but still overdue - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -1115,7 +1115,7 @@ describe("DataRetentionService", () => { it("naturally resets gauge to 0 when subgraph catches up", async () => { // First poll: provider is overdue (currentBlock=1200, nextDeadline=1000) - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); expect(gaugeMock.set).toHaveBeenCalledWith(2); @@ -1124,7 +1124,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: subgraph caught up, nextDeadline advanced past currentBlock - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 102n, @@ -1140,7 +1140,7 @@ 
describe("DataRetentionService", () => { it("removes overdue gauge when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1152,7 +1152,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); diff --git a/apps/backend/src/data-retention/data-retention.service.ts b/apps/backend/src/data-retention/data-retention.service.ts index c6ece7b5..1422bbfd 100644 --- a/apps/backend/src/data-retention/data-retention.service.ts +++ b/apps/backend/src/data-retention/data-retention.service.ts @@ -11,8 +11,8 @@ import { IConfig } from "../config/app.config.js"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels, CheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; -import { type ProviderDataSetResponse } from "../pdp-subgraph/types.js"; +import { SubgraphService } from "../subgraph/subgraph.service.js"; +import { type ProviderDataSetResponse } from "../subgraph/types.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { type PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; @@ -41,7 +41,7 @@ export 
class DataRetentionService { constructor( private readonly configService: ConfigService, private readonly walletSdkService: WalletSdkService, - private readonly pdpSubgraphService: PDPSubgraphService, + private readonly subgraphService: SubgraphService, @InjectRepository(DataRetentionBaseline) private readonly baselineRepository: Repository, @InjectRepository(StorageProvider) @@ -59,10 +59,10 @@ export class DataRetentionService { * challenge delta since the last poll. */ async pollDataRetention(): Promise { - const pdpSubgraphEndpoint = this.configService.get("blockchain").pdpSubgraphEndpoint; - if (!pdpSubgraphEndpoint) { + const subgraphEndpoint = this.configService.get("blockchain").subgraphEndpoint; + if (!subgraphEndpoint) { this.logger.warn({ - event: "pdp_subgraph_endpoint_not_configured", + event: "subgraph_endpoint_not_configured", message: "No PDP subgraph endpoint configured", }); return; @@ -75,7 +75,7 @@ export class DataRetentionService { } try { - const subgraphMeta = await this.pdpSubgraphService.fetchSubgraphMeta(); + const subgraphMeta = await this.subgraphService.fetchSubgraphMeta(); const allProviderInfos = this.walletSdkService.getTestingProviders(); const spBlocklists = this.configService.get("spBlocklists"); const providerInfos = allProviderInfos?.filter((p) => !isSpBlocked(spBlocklists, p.serviceProvider, p.id)); @@ -104,7 +104,7 @@ export class DataRetentionService { ); try { - const providersFromSubgraph = await this.pdpSubgraphService.fetchProvidersWithDatasets({ + const providersFromSubgraph = await this.subgraphService.fetchProvidersWithDatasets({ blockNumber, addresses: batchAddresses, }); diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index 9249c3a9..f3f9ed09 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -7,6 +7,7 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; 
import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; +import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -49,7 +50,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { + await queryRunner.query(` + CREATE TABLE anon_retrievals ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + sp_address VARCHAR NOT NULL, + piece_cid VARCHAR NOT NULL, + data_set_id BIGINT NOT NULL, + piece_id BIGINT NOT NULL, + raw_size BIGINT NOT NULL, + with_ipfs_indexing BOOLEAN NOT NULL, + ipfs_root_cid VARCHAR NULL, + service_type VARCHAR NOT NULL DEFAULT 'direct_sp', + retrieval_endpoint VARCHAR NOT NULL, + status VARCHAR NOT NULL DEFAULT 'pending', + started_at TIMESTAMPTZ NOT NULL, + completed_at TIMESTAMPTZ NULL, + latency_ms INT NULL, + ttfb_ms INT NULL, + throughput_bps INT NULL, + bytes_retrieved BIGINT NULL, + response_code INT NULL, + error_message VARCHAR NULL, + commp_valid BOOLEAN NULL, + car_valid BOOLEAN NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() + ) + `); + + // Per-SP dashboards. + await queryRunner.query(` + CREATE INDEX "IDX_anon_retrievals_sp_address" + ON anon_retrievals (sp_address) + `); + + // Used by the recent-dedup query in AnonPieceSelectorService — keeps the + // most-recently-tested CIDs out of the next selection. + await queryRunner.query(` + CREATE INDEX "IDX_anon_retrievals_piece_cid" + ON anon_retrievals (piece_cid) + `); + + // Supports "last N anonymous retrievals" ordering used by the selector. 
+ await queryRunner.query(` + CREATE INDEX "IDX_anon_retrievals_created_at" + ON anon_retrievals (created_at DESC) + `); + } + + public async down(queryRunner: QueryRunner): Promise { + await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals`); + } +} diff --git a/apps/backend/src/http-client/http-client.service.spec.ts b/apps/backend/src/http-client/http-client.service.spec.ts index 96604139..511910ba 100644 --- a/apps/backend/src/http-client/http-client.service.spec.ts +++ b/apps/backend/src/http-client/http-client.service.spec.ts @@ -64,25 +64,94 @@ describe("HttpClientService", () => { expect(config.timeout).toBe(120000); }); - it("times out HTTP/2 requests using the connection timeout", async () => { + it("passes the configured headersTimeout to undici and translates its error", async () => { const service = await createService(); - if (typeof AbortSignal.timeout !== "function") { - (AbortSignal as any).timeout = () => new AbortController().signal; + let receivedHeadersTimeout: number | undefined; + undiciRequestMock.mockImplementationOnce((_url: string, options: { headersTimeout?: number }) => { + receivedHeadersTimeout = options.headersTimeout; + const err = new Error("Headers Timeout Error") as Error & { code?: string }; + err.name = "HeadersTimeoutError"; + err.code = "UND_ERR_HEADERS_TIMEOUT"; + return Promise.reject(err); + }); + + await expect(service.requestWithMetrics("http://example.com", { httpVersion: "2" })).rejects.toThrow( + "HTTP/2 connection/headers timed out after 25ms", + ); + + expect(receivedHeadersTimeout).toBe(25); + }); + + it("keeps the request signal alive after the connect timeout window elapses", async () => { + const service = await createService(); + + // Previously, connectTimeoutMs (25ms) was folded into the request signal, + // so any download lasting longer than 25ms was aborted mid-stream. The + // signal must now stay live until the transfer timeout or parent signal + // fires. 
+ let sawAbortBeforeResolve = false; + undiciRequestMock.mockImplementationOnce(async (_url: string, options: { signal?: AbortSignal }) => { + await new Promise((r) => setTimeout(r, 75)); + sawAbortBeforeResolve = options.signal?.aborted === true; + async function* body() { + yield Buffer.from("ok"); + } + return { statusCode: 200, body: body() }; + }); + + const result = await service.requestWithMetrics("http://example.com", { httpVersion: "2" }); + + expect(sawAbortBeforeResolve).toBe(false); + expect(result.aborted).toBeUndefined(); + expect(result.metrics.statusCode).toBe(200); + }); + + it("returns partial bytes and metrics when HTTP/2 download is aborted after headers", async () => { + const service = await createService(); + + const parentAbort = new AbortController(); + + async function* abortingBody() { + yield Buffer.from("hello"); + yield Buffer.from(" world"); + // Simulate an abort mid-stream after two chunks. + parentAbort.abort(new Error("Anon retrieval job timeout (60s) for sp1")); + throw new Error("aborted"); } - undiciRequestMock.mockImplementationOnce((_url: string, options: { signal?: AbortSignal }) => { - return new Promise((_resolve, reject) => { - options.signal?.addEventListener("abort", () => reject(new Error("aborted")), { once: true }); - }); + undiciRequestMock.mockImplementationOnce(async () => ({ + statusCode: 200, + body: abortingBody(), + })); + + const result = await service.requestWithMetrics("http://example.com/piece", { + httpVersion: "2", + signal: parentAbort.signal, }); - vi.useFakeTimers(); + expect(result.aborted).toBe(true); + expect(result.abortReason).toContain("timeout"); + expect(result.metrics.statusCode).toBe(200); + expect(result.metrics.responseSize).toBe(11); + expect(Buffer.isBuffer(result.data) ? 
result.data.toString() : "").toBe("hello world"); + }); + + it("rethrows non-abort download errors on HTTP/2", async () => { + const service = await createService(); - const promise = service.requestWithMetrics("http://example.com", { httpVersion: "2" }); - const assertion = expect(promise).rejects.toThrow("HTTP/2 connection/headers timed out after 25ms"); - await vi.advanceTimersByTimeAsync(25); + async function* brokenBody() { + yield Buffer.from("partial"); + throw new Error("network reset"); + } + + undiciRequestMock.mockImplementationOnce(async () => ({ + statusCode: 200, + body: brokenBody(), + })); - await assertion; + await expect(service.requestWithMetrics("http://example.com/piece", { httpVersion: "2" })).rejects.toThrow( + "network reset", + ); }); }); diff --git a/apps/backend/src/http-client/http-client.service.ts b/apps/backend/src/http-client/http-client.service.ts index 48e10e5c..81140162 100644 --- a/apps/backend/src/http-client/http-client.service.ts +++ b/apps/backend/src/http-client/http-client.service.ts @@ -81,12 +81,11 @@ export class HttpClientService { let ttfbTime = 0; let statusCode = 0; - /** - * Dual-timeout strategy for HTTP/2 requests: - * 1. AbortSignal.timeout() - Undici's native timeout (10 min default) - * 2. AbortSignal.timeout() for connection/headers (10 sec default) - */ - const { signal, connectTimeoutSignal } = this.buildHttp2Signals(options.signal); + // Dual-timeout strategy for HTTP/2 requests: + // - `headersTimeout` (undici): scopes the connect + response-headers phase. + // - Combined AbortSignal: transfer-timeout ceiling + parent (job) signal. + const transferTimeoutSignal = AbortSignal.timeout(this.http2TimeoutMs); + const signal = options.signal ? 
anySignal([transferTimeoutSignal, options.signal]) : transferTimeoutSignal; const requestOptions: any = { method, headers: { @@ -94,6 +93,7 @@ export class HttpClientService { ...headers, }, signal, + headersTimeout: this.connectTimeoutMs, }; if (data) { @@ -105,7 +105,8 @@ export class HttpClientService { try { response = await undiciRequest(url, requestOptions); } catch (error) { - if (connectTimeoutSignal.aborted) { + // discern connection error from transfer error + if (isHeadersTimeoutError(error)) { throw new Error(`HTTP/2 connection/headers timed out after ${this.connectTimeoutMs}ms`); } throw error; @@ -115,8 +116,15 @@ export class HttpClientService { statusCode = response.statusCode; const chunks: Buffer[] = []; - for await (const chunk of response.body) { - chunks.push(Buffer.from(chunk)); + let downloadError: unknown; + try { + for await (const chunk of response.body) { + chunks.push(Buffer.from(chunk)); + } + } catch (error) { + // Download-phase failures (e.g. abort signal) fall through so we can + // return the partial buffer + metrics collected so far. 
+ downloadError = error; } const dataBuffer = Buffer.concat(chunks); @@ -133,6 +141,29 @@ export class HttpClientService { httpVersion: "2", }; + if (downloadError !== undefined) { + const aborted = options.signal?.aborted === true || isAbortLikeError(downloadError); + if (!aborted) { + throw downloadError; + } + const abortReason = describeAbortReason(options.signal, downloadError); + this.logger.warn({ + event: "http2_download_aborted", + message: "HTTP/2 download aborted after headers; returning partial data", + url, + bytesReceived: dataBuffer.length, + totalTime: metrics.totalTime, + ttfb: metrics.ttfb, + abortReason, + }); + return { + data: dataBuffer as T, + metrics, + aborted: true, + abortReason, + }; + } + return { data: dataBuffer as T, metrics, @@ -255,24 +286,28 @@ export class HttpClientService { // Fallback for objects/arrays return Buffer.from(JSON.stringify(data)); } +} - private buildHttp2Signals(parentSignal?: AbortSignal): { - signal: AbortSignal; - connectTimeoutSignal: AbortSignal; - } { - const transferTimeoutSignal = AbortSignal.timeout(this.http2TimeoutMs); - const connectTimeoutSignal = AbortSignal.timeout(this.connectTimeoutMs); +function isAbortLikeError(error: unknown): boolean { + if (error instanceof Error) { + return error.name === "AbortError" || error.name === "TimeoutError" || /abort/i.test(error.message); + } + return false; +} - if (parentSignal) { - return { - signal: anySignal([transferTimeoutSignal, connectTimeoutSignal, parentSignal]), - connectTimeoutSignal, - }; - } +/** + * Determines if a given error represents a "Headers Timeout" error. 
+ */ +function isHeadersTimeoutError(error: unknown): boolean { + if (!(error instanceof Error)) return false; + const code = (error as Error & { code?: string }).code; + return error.name === "HeadersTimeoutError" || code === "UND_ERR_HEADERS_TIMEOUT"; +} - return { - signal: anySignal([transferTimeoutSignal, connectTimeoutSignal]), - connectTimeoutSignal, - }; - } +function describeAbortReason(signal: AbortSignal | undefined, fallback: unknown): string { + const reason = signal?.reason; + if (reason instanceof Error && reason.message) return reason.message; + if (typeof reason === "string" && reason.length > 0) return reason; + if (fallback instanceof Error && fallback.message) return fallback.message; + return "aborted"; } diff --git a/apps/backend/src/http-client/types.ts b/apps/backend/src/http-client/types.ts index 7e48ce7d..26892ee6 100644 --- a/apps/backend/src/http-client/types.ts +++ b/apps/backend/src/http-client/types.ts @@ -13,4 +13,6 @@ export interface RequestMetrics { export interface RequestWithMetrics { data: T; metrics: RequestMetrics; + aborted?: boolean; // Set when the request was aborted mid-download after response headers arrived. + abortReason?: string; // Error message when `aborted` is true; human-readable summary of the abort reason. 
} diff --git a/apps/backend/src/jobs/job-queues.ts b/apps/backend/src/jobs/job-queues.ts index 9488ce7b..db475d49 100644 --- a/apps/backend/src/jobs/job-queues.ts +++ b/apps/backend/src/jobs/job-queues.ts @@ -7,3 +7,4 @@ export const LEGACY_DEAL_QUEUE = "deal.run"; export const LEGACY_RETRIEVAL_QUEUE = "retrieval.run"; export const DATA_RETENTION_POLL_QUEUE = "data.retention.poll"; export const PROVIDERS_REFRESH_QUEUE = "providers.refresh"; +export const RETRIEVAL_ANON_QUEUE = "retrieval.anon.run"; diff --git a/apps/backend/src/jobs/jobs.module.ts b/apps/backend/src/jobs/jobs.module.ts index 15ad4d64..69f1edb1 100644 --- a/apps/backend/src/jobs/jobs.module.ts +++ b/apps/backend/src/jobs/jobs.module.ts @@ -7,6 +7,7 @@ import { StorageProvider } from "../database/entities/storage-provider.entity.js import { DealModule } from "../deal/deal.module.js"; import { PieceCleanupModule } from "../piece-cleanup/piece-cleanup.module.js"; import { RetrievalModule } from "../retrieval/retrieval.module.js"; +import { RetrievalAnonModule } from "../retrieval-anon/retrieval-anon.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { JobsService } from "./jobs.service.js"; import { JobScheduleRepository } from "./repositories/job-schedule.repository.js"; @@ -17,6 +18,7 @@ import { JobScheduleRepository } from "./repositories/job-schedule.repository.js TypeOrmModule.forFeature([StorageProvider, JobScheduleState]), DealModule, RetrievalModule, + RetrievalAnonModule, WalletSdkModule, DataRetentionModule, PieceCleanupModule, diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index d556f3d6..c20d0890 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -30,18 +30,18 @@ describe("JobsService schedule rows", () => { }; let dataRetentionServiceMock: { pollDataRetention: ReturnType }; let metricsMocks: { - jobsQueuedGauge: JobsServiceDeps[8]; - 
jobsRetryScheduledGauge: JobsServiceDeps[9]; - oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }; let baseConfigValues: Partial; let configService: JobsServiceDeps[0]; @@ -52,21 +52,22 @@ describe("JobsService schedule rows", () => { jobScheduleRepository: JobsServiceDeps[2]; dealService: JobsServiceDeps[3]; retrievalService: JobsServiceDeps[4]; - walletSdkService: JobsServiceDeps[5]; - dataRetentionService: JobsServiceDeps[6]; - pieceCleanupService: JobsServiceDeps[7]; - jobsQueuedGauge: JobsServiceDeps[8]; - jobsRetryScheduledGauge: JobsServiceDeps[9]; - oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + anonRetrievalService: JobsServiceDeps[5]; + walletSdkService: 
JobsServiceDeps[6]; + dataRetentionService: JobsServiceDeps[7]; + pieceCleanupService: JobsServiceDeps[8]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }>, ) => JobsService; @@ -96,18 +97,18 @@ describe("JobsService schedule rows", () => { }; metricsMocks = { - jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[8], - jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], - oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], - oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], - jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], - jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[13], - jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], - jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], - jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[16], - jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[17], - storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[18], - storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[19], + jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], + jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], + oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], + oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], + jobsInFlightGauge: { set: vi.fn() } as 
unknown as JobsServiceDeps[13], + jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], + jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], + jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[16], + jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[17], + jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[18], + storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[19], + storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[20], }; const emptySpBlocklists: ISpBlocklistConfig = { @@ -133,6 +134,7 @@ describe("JobsService schedule rows", () => { dataSetCreationJobTimeoutSeconds: 300, pieceCleanupPerSpPerHour: 1, maxPieceCleanupRuntimeSeconds: 300, + retrievalsAnonPerSpPerHour: 2, } as IConfig["jobs"], database: { host: "localhost", @@ -158,9 +160,10 @@ describe("JobsService schedule rows", () => { overrides.jobScheduleRepository ?? (jobScheduleRepositoryMock as unknown as JobsServiceDeps[2]), overrides.dealService ?? ({} as JobsServiceDeps[3]), overrides.retrievalService ?? ({} as JobsServiceDeps[4]), - overrides.walletSdkService ?? ({} as JobsServiceDeps[5]), - overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[6]), - overrides.pieceCleanupService ?? ({} as JobsServiceDeps[7]), + overrides.anonRetrievalService ?? ({} as JobsServiceDeps[5]), + overrides.walletSdkService ?? ({} as JobsServiceDeps[6]), + overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[7]), + overrides.pieceCleanupService ?? ({} as JobsServiceDeps[8]), overrides.jobsQueuedGauge ?? metricsMocks.jobsQueuedGauge, overrides.jobsRetryScheduledGauge ?? metricsMocks.jobsRetryScheduledGauge, overrides.oldestQueuedAgeGauge ?? 
metricsMocks.oldestQueuedAgeGauge, @@ -284,7 +287,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); // Trigger the timeout immediately by using fake timers @@ -343,7 +346,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); vi.useFakeTimers(); @@ -382,7 +385,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleRetrievalJob", { @@ -422,7 +425,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await expect( @@ -615,12 +618,13 @@ describe("JobsService schedule rows", () => { // Check upserts for providerB const upsertCalls = jobScheduleRepositoryMock.upsertSchedule.mock.calls; const upsertsForB = upsertCalls.filter((call) => call[1] === providerB.address); - expect(upsertsForB).toHaveLength(4); + expect(upsertsForB).toHaveLength(5); expect(upsertsForB.map((call) => call[0]).sort()).toEqual([ "data_set_creation", "deal", "piece_cleanup", "retrieval", + "retrieval_anon", ]); }); @@ -924,7 +928,7 @@ 
describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDealJob", { @@ -963,8 +967,8 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], - pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[7], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[8], }); await callPrivate(service, "handleDealJob", { @@ -976,7 +980,7 @@ describe("JobsService schedule rows", () => { expect(dealService.createDealForProvider).toHaveBeenCalledTimes(1); }); - it("deal job maps DealJobTerminatedDataSetError to handler_result=error", async () => { + it("data storage job does not run data-storage check when data-set selection aborts", async () => { const completedCounter = metricsMocks.jobsCompletedCounter as unknown as { inc: ReturnType }; vi.useFakeTimers(); vi.setSystemTime(new Date("2024-01-01T12:00:00Z")); @@ -996,7 +1000,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDealJob", { @@ -1025,7 +1029,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as 
ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1067,7 +1071,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1108,7 +1112,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1153,7 +1157,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1326,7 +1330,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }); await callPrivate(service, "handleDealJob", { @@ -1350,7 +1354,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }); await callPrivate(service, "handleRetrievalJob", { @@ -1379,7 +1383,7 @@ describe("JobsService 
schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1421,7 +1425,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }), expectCheckNotRun: () => expect(dealService.createDealForProvider).not.toHaveBeenCalled(), }, @@ -1431,7 +1435,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }), expectCheckNotRun: () => expect(retrievalService.performRandomRetrievalForProvider).not.toHaveBeenCalled(), }, @@ -1441,7 +1445,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 3600, service: buildService({ dealService: dataSetDealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }), expectCheckNotRun: () => expect(dataSetDealService.createDataSetWithPiece).not.toHaveBeenCalled(), }, diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index f8fe1d80..b070de5a 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -16,18 +16,32 @@ import { StorageProvider } from "../database/entities/storage-provider.entity.js import { DealService } from "../deal/deal.service.js"; import { PieceCleanupService } from 
"../piece-cleanup/piece-cleanup.service.js"; import { RetrievalService } from "../retrieval/retrieval.service.js"; +import { AnonRetrievalService } from "../retrieval-anon/anon-retrieval.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { provisionNextMissingDataSet } from "./data-set-creation.handler.js"; -import { DATA_RETENTION_POLL_QUEUE, PROVIDERS_REFRESH_QUEUE, SP_WORK_QUEUE } from "./job-queues.js"; +import { + DATA_RETENTION_POLL_QUEUE, + PROVIDERS_REFRESH_QUEUE, + RETRIEVAL_ANON_QUEUE, + SP_WORK_QUEUE, +} from "./job-queues.js"; import { JobScheduleRepository } from "./repositories/job-schedule.repository.js"; -type SpJobType = "deal" | "retrieval" | "data_set_creation" | "piece_cleanup"; -const SP_JOB_TYPES: ReadonlySet = new Set(["deal", "retrieval", "data_set_creation", "piece_cleanup"]); +type SpJobType = "deal" | "retrieval" | "data_set_creation" | "retrieval_anon" | "piece_cleanup"; +const SP_JOB_TYPES: ReadonlySet = new Set([ + "deal", + "retrieval", + "retrieval_anon", + "data_set_creation", + "piece_cleanup", +]); + function isSpJobType(jobType: string): jobType is SpJobType { return SP_JOB_TYPES.has(jobType); } type SpJobData = { jobType: SpJobType; spAddress: string; intervalSeconds: number }; +type AnonRetrievalJobData = { spAddress: string; intervalSeconds: number }; type ProvidersRefreshJobData = { intervalSeconds: number }; type SpJob = Job; type DataRetentionJobData = { intervalSeconds: number }; @@ -58,6 +72,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private readonly jobScheduleRepository: JobScheduleRepository, private readonly dealService: DealService, private readonly retrievalService: RetrievalService, + private readonly anonRetrievalService: AnonRetrievalService, private readonly walletSdkService: WalletSdkService, private readonly dataRetentionService: DataRetentionService, private readonly pieceCleanupService: PieceCleanupService, @@ -258,6 +273,7 @@ 
export class JobsService implements OnModuleInit, OnApplicationShutdown { await boss.createQueue(SP_WORK_QUEUE, { policy: "singleton" }); await boss.createQueue(PROVIDERS_REFRESH_QUEUE); await boss.createQueue(DATA_RETENTION_POLL_QUEUE); + await boss.createQueue(RETRIEVAL_ANON_QUEUE); } private registerWorkers(): void { @@ -335,6 +351,23 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { error: toStructuredError(error), }), ); + void this.boss + .work( + RETRIEVAL_ANON_QUEUE, + { batchSize: 1, localConcurrency: spConcurrency, pollingIntervalSeconds: workerPollSeconds }, + async ([job]) => { + if (!job) return; + await this.handleAnonRetrievalJob(job); + }, + ) + .catch((error) => + this.logger.error({ + event: "worker_register_failed", + message: "Failed to register worker", + queue: RETRIEVAL_ANON_QUEUE, + error: toStructuredError(error), + }), + ); } private getMaintenanceWindowStatus(now: Date = new Date()) { @@ -587,6 +620,51 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { }); } + private async handleAnonRetrievalJob(job: Job): Promise { + const data = job.data; + const spAddress = data.spAddress; + + // Create AbortController for job timeout enforcement + const abortController = new AbortController(); + const timeoutSeconds = this.configService.get("jobs").anonRetrievalJobTimeoutSeconds; + const timeoutMs = Math.max(60000, timeoutSeconds * 1000); + const effectiveTimeoutSeconds = Math.round(timeoutMs / 1000); + const abortReason = new Error(`Anon retrieval job timeout (${effectiveTimeoutSeconds}s) for ${spAddress}`); + const timeoutId = setTimeout(() => { + abortController.abort(abortReason); + }, timeoutMs); + + await this.recordJobExecution("retrieval_anon", async () => { + const logContext = await this.resolveProviderJobContext(spAddress, job.id); + try { + await this.anonRetrievalService.performForProvider(spAddress, abortController.signal, logContext); + return "success"; + } catch (error) { + if 
(abortController.signal.aborted) { + const reason = abortController.signal.reason; + const reasonMessage = reason instanceof Error ? reason.message : String(reason ?? ""); + this.logger.error({ + ...logContext, + event: "anon_retrieval_job_aborted", + message: reasonMessage || "Anon retrieval job aborted after timeout", + timeoutSeconds: effectiveTimeoutSeconds, + error: toStructuredError(reason ?? error), + }); + return "aborted"; + } + this.logger.error({ + ...logContext, + event: "anon_retrieval_job_failed", + message: "Anon retrieval job failed", + error: toStructuredError(error), + }); + throw error; + } finally { + clearTimeout(timeoutId); + } + }); + } + private async handleDataRetentionJob(data: DataRetentionJobData): Promise { void data; await this.recordJobExecution("data_retention_poll", async () => { @@ -865,6 +943,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private getIntervalSecondsForRates(): { dealIntervalSeconds: number; retrievalIntervalSeconds: number; + retrievalAnonIntervalSeconds: number; dataSetCreationIntervalSeconds: number; dataRetentionPollIntervalSeconds: number; providersRefreshIntervalSeconds: number; @@ -885,9 +964,13 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const dataRetentionPollIntervalSeconds = scheduling.dataRetentionPollIntervalSeconds; const providersRefreshIntervalSeconds = scheduling.providersRefreshIntervalSeconds; + const retrievalsAnonPerHour = jobsConfig.retrievalsAnonPerSpPerHour; + const retrievalAnonIntervalSeconds = Math.max(1, Math.round(3600 / retrievalsAnonPerHour)); + return { dealIntervalSeconds, retrievalIntervalSeconds, + retrievalAnonIntervalSeconds, dataSetCreationIntervalSeconds, dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, @@ -907,6 +990,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const { dealIntervalSeconds, retrievalIntervalSeconds, + retrievalAnonIntervalSeconds, 
dataSetCreationIntervalSeconds, dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, @@ -924,6 +1008,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const phaseMs = this.schedulePhaseSeconds() * 1000; const dealStartAt = new Date(now.getTime() + phaseMs); const retrievalStartAt = new Date(now.getTime() + phaseMs); + const retrievalAnonStartAt = new Date(now.getTime() + phaseMs); const dataSetCreationStartAt = new Date(now.getTime() + phaseMs); const dataRetentionPollStartAt = new Date(now.getTime() + phaseMs); const providersRefreshStartAt = new Date(now.getTime() + phaseMs); @@ -947,6 +1032,12 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { for (const address of unblockedAddresses) { await this.jobScheduleRepository.upsertSchedule("deal", address, dealIntervalSeconds, dealStartAt); await this.jobScheduleRepository.upsertSchedule("retrieval", address, retrievalIntervalSeconds, retrievalStartAt); + await this.jobScheduleRepository.upsertSchedule( + "retrieval_anon", + address, + retrievalAnonIntervalSeconds, + retrievalAnonStartAt, + ); if (minDataSets >= 1) { await this.jobScheduleRepository.upsertSchedule( "data_set_creation", @@ -1104,6 +1195,8 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { return SP_WORK_QUEUE; case "piece_cleanup": return SP_WORK_QUEUE; + case "retrieval_anon": + return RETRIEVAL_ANON_QUEUE; case "data_retention_poll": return DATA_RETENTION_POLL_QUEUE; case "providers_refresh": @@ -1123,6 +1216,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { if ( row.job_type === "deal" || row.job_type === "retrieval" || + row.job_type === "retrieval_anon" || row.job_type === "data_set_creation" || row.job_type === "piece_cleanup" ) { @@ -1195,6 +1289,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const jobTypes: JobType[] = [ "deal", "retrieval", + "retrieval_anon", "data_set_creation", 
"piece_cleanup", "data_retention_poll", diff --git a/apps/backend/src/metrics-prometheus/check-metric-labels.ts b/apps/backend/src/metrics-prometheus/check-metric-labels.ts index d8447160..9d776586 100644 --- a/apps/backend/src/metrics-prometheus/check-metric-labels.ts +++ b/apps/backend/src/metrics-prometheus/check-metric-labels.ts @@ -1,4 +1,4 @@ -export type CheckType = "dataStorage" | "retrieval" | "dataRetention" | "dataSetCreation"; +export type CheckType = "dataStorage" | "retrieval" | "anon_retrieval" | "dataRetention" | "dataSetCreation"; export type ProviderStatus = "approved" | "unapproved"; export type CheckMetricLabels = { diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 55975cad..85f1cdcf 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -248,3 +248,66 @@ export class DataSetCreationCheckMetrics { this.dataSetCreationStatusCounter.inc({ ...labels, value }); } } + +@Injectable() +export class AnonRetrievalCheckMetrics { + constructor( + @InjectMetric("anonPieceRetrievalFirstByteMs") + private readonly firstByteMs: Histogram, + @InjectMetric("anonPieceRetrievalLastByteMs") + private readonly lastByteMs: Histogram, + @InjectMetric("anonPieceRetrievalThroughputBps") + private readonly throughputBps: Histogram, + @InjectMetric("anonRetrievalCheckMs") + private readonly checkMs: Histogram, + @InjectMetric("anonRetrievalStatus") + private readonly statusCounter: Counter, + @InjectMetric("anonPieceHttpResponseCode") + private readonly httpResponseCounter: Counter, + @InjectMetric("anonCarParseStatus") + private readonly carParseCounter: Counter, + @InjectMetric("anonIpniStatus") + private readonly ipniCounter: Counter, + @InjectMetric("anonBlockFetchStatus") + private readonly blockFetchCounter: Counter, + ) {} + + observeFirstByteMs(labels: CheckMetricLabels, value: number 
| null | undefined): void { + observePositive(this.firstByteMs, labels, value); + } + + observeLastByteMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.lastByteMs, labels, value); + } + + observeThroughput(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.throughputBps, labels, value); + } + + observeCheckDuration(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.checkMs, labels, value); + } + + recordStatus(labels: CheckMetricLabels, value: string): void { + this.statusCounter.inc({ ...labels, value }); + } + + recordHttpResponseCode(labels: CheckMetricLabels, statusCode: number): void { + this.httpResponseCounter.inc({ + ...labels, + value: classifyHttpResponseCode(statusCode), + }); + } + + recordCarParseStatus(labels: CheckMetricLabels, parseable: boolean): void { + this.carParseCounter.inc({ ...labels, value: parseable ? "parseable" : "not_parseable" }); + } + + recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + this.ipniCounter.inc({ ...labels, value }); + } + + recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + this.blockFetchCounter.inc({ ...labels, value }); + } +} diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 18bda30d..45f728b6 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -8,6 +8,7 @@ import { } from "@willsoto/nestjs-prometheus"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { + AnonRetrievalCheckMetrics, DataSetCreationCheckMetrics, DataStorageCheckMetrics, DiscoverabilityCheckMetrics, @@ -207,6 +208,56 @@ const metricProviders = [ help: "Estimated number of unrecorded overdue proving periods per 
provider. Resets to 0 when the subgraph catches up.", labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, }), + // Anonymous Retrieval Metrics + makeHistogramProvider({ + name: "anonPieceRetrievalFirstByteMs", + help: "Time to first byte for anonymous piece retrievals via /piece/{cid} (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000], + }), + makeHistogramProvider({ + name: "anonPieceRetrievalLastByteMs", + help: "Total time to retrieve an anonymous piece via /piece/{cid} (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000], + }), + makeHistogramProvider({ + name: "anonPieceRetrievalThroughputBps", + help: "Throughput for anonymous piece retrievals (bytes/s)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: throughputBuckets, + }), + makeHistogramProvider({ + name: "anonRetrievalCheckMs", + help: "End-to-end anonymous retrieval check duration (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000, 600000], + }), + makeCounterProvider({ + name: "anonRetrievalStatus", + help: "Anonymous retrieval overall outcome", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonPieceHttpResponseCode", + help: "HTTP response codes for anonymous piece retrieval requests", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonCarParseStatus", + help: "Anonymous retrieval CAR parse outcomes (parseable / not_parseable)", + labelNames: ["checkType", "providerId", "providerName", 
"providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonIpniStatus", + help: "Anonymous retrieval IPNI check outcomes (valid / invalid / skipped)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonBlockFetchStatus", + help: "Anonymous retrieval block fetch validation outcomes (valid / invalid / skipped)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), // Storage provider metrics: absolute counts, independent of query filters. makeGaugeProvider({ name: "storage_providers_active", @@ -333,6 +384,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + AnonRetrievalCheckMetrics, WalletBalanceCollector, // HTTP metrics interceptor { @@ -347,6 +399,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + AnonRetrievalCheckMetrics, WalletBalanceCollector, ], }) diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts deleted file mode 100644 index 6e084fc1..00000000 --- a/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { Module } from "@nestjs/common"; -import { PDPSubgraphService } from "./pdp-subgraph.service.js"; - -@Module({ - providers: [PDPSubgraphService], - exports: [PDPSubgraphService], -}) -export class PdpSubgraphModule {} diff --git a/apps/backend/src/pdp-subgraph/queries.ts b/apps/backend/src/pdp-subgraph/queries.ts deleted file mode 100644 index a21a3991..00000000 --- a/apps/backend/src/pdp-subgraph/queries.ts +++ /dev/null @@ -1,24 +0,0 @@ -export const Queries = { - GET_PROVIDERS_WITH_DATASETS: ` - query GetProvidersWithDataSet($addresses: [Bytes!], $blockNumber: BigInt!) 
{ - providers(where: {address_in: $addresses}) { - address - totalFaultedPeriods - totalProvingPeriods - proofSets (where: {nextDeadline_lt: $blockNumber, status: PROVING}) { - nextDeadline - maxProvingPeriod - } - } - } - `, - GET_SUBGRAPH_META: ` - query GetSubgraphMeta { - _meta { - block { - number - } - } - } - `, -} as const; diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts new file mode 100644 index 00000000..b822fe5f --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -0,0 +1,168 @@ +import type { ConfigService } from "@nestjs/config"; +import type { Repository } from "typeorm"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig } from "../config/app.config.js"; +import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { SampleAnonPieceParams, SubgraphService } from "../subgraph/subgraph.service.js"; +import type { AnonCandidatePiece } from "../subgraph/types.js"; +import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; + +const SP_ADDRESS = "0xAaAaAAaAaaaAaAAAAaaaaAAaaAaaaAAaaaaa1111"; +const DEALBOT_PAYER = "0xBbBBBbBBbbbBbBBBBBbbbbbBBbbBbbbBBbbbb2222"; + +const makePiece = (overrides: Partial = {}): AnonCandidatePiece => ({ + pieceCid: `baga6ea4seaqpiece${Math.random().toString(36).slice(2, 10)}`, + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: true, + ipfsRootCid: "bafyroot", + indexedAtBlock: 12345, + pdpPaymentEndEpoch: null, + ...overrides, +}); + +const makeRetrievalRepository = (recentPieceCids: string[]): Repository => { + const queryBuilder = { + select: vi.fn().mockReturnThis(), + orderBy: vi.fn().mockReturnThis(), + limit: vi.fn().mockReturnThis(), + getRawMany: vi.fn().mockResolvedValue(recentPieceCids.map((c) => ({ pieceCid: c }))), + }; + return { + createQueryBuilder: 
vi.fn().mockReturnValue(queryBuilder), + } as unknown as Repository; +}; + +const makeConfigService = (): ConfigService => + ({ + get: vi.fn((key: string) => { + if (key === "blockchain") { + return { walletAddress: DEALBOT_PAYER }; + } + return undefined; + }), + }) as unknown as ConfigService; + +describe("AnonPieceSelectorService", () => { + let subgraphService: SubgraphService; + let sampleAnonPiece: ReturnType; + + beforeEach(() => { + sampleAnonPiece = vi.fn(); + subgraphService = { sampleAnonPiece } as unknown as SubgraphService; + }); + + it("returns null when every fallback attempt yields no piece", async () => { + sampleAnonPiece.mockResolvedValue(null); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result).toBeNull(); + expect(sampleAnonPiece).toHaveBeenCalled(); + }); + + it("returns the sampled piece with SP address lowercased", async () => { + sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: "baga-the-one" })); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result).not.toBeNull(); + expect(result?.pieceCid).toBe("baga-the-one"); + expect(result?.serviceProvider).toBe(SP_ADDRESS.toLowerCase()); + }); + + it("passes the dealbot payer address to sampleAnonPiece for exclusion", async () => { + sampleAnonPiece.mockResolvedValueOnce(makePiece()); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + + await service.selectPieceForProvider(SP_ADDRESS); + + const call = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + expect(call.payer).toBe(DEALBOT_PAYER); + expect(call.serviceProvider).toBe(SP_ADDRESS); + }); + + it("redraws when the first sampled piece's payment has already 
terminated", async () => { + const staleCid = "baga-terminated"; + const freshCid = "baga-live"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: staleCid, pdpPaymentEndEpoch: 100n, indexedAtBlock: 200 })) + .mockResolvedValueOnce(makePiece({ pieceCid: freshCid, pdpPaymentEndEpoch: null })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(freshCid); + }); + + it("redraws when the first sampled piece was recently tested", async () => { + const staleCid = "baga-stale"; + const freshCid = "baga-fresh"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: staleCid })) + .mockResolvedValueOnce(makePiece({ pieceCid: freshCid })); + + const service = new AnonPieceSelectorService( + subgraphService, + makeConfigService(), + makeRetrievalRepository([staleCid]), + ); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(freshCid); + }); + + it("falls back to the opposite pool when the preferred one is empty", async () => { + // First pool call returns nothing twice (both attempts), second pool succeeds. + const fresh = makePiece({ pieceCid: "baga-other-pool" }); + sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(null).mockResolvedValueOnce(fresh); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe("baga-other-pool"); + + // The second (fallback) call should target the opposite pool. 
+ const firstCall = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + const fallbackCall = sampleAnonPiece.mock.calls[2][0] as SampleAnonPieceParams; + expect(fallbackCall.pool).not.toBe(firstCall.pool); + }); + + it("widens size bucket to 'any' after both pools fail in the primary bucket", async () => { + // 4 empty attempts across (bucket × both pools × 2 draws each) then + // succeed on the first `any` bucket call. + sampleAnonPiece + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(makePiece({ pieceCid: "baga-any-bucket" })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe("baga-any-bucket"); + + // The 5th call (index 4) should be the widened-bucket attempt; its size + // range covers at least the 32 GiB ceiling of the "large" bucket. 
+ const widened = sampleAnonPiece.mock.calls[4][0] as SampleAnonPieceParams; + expect(BigInt(widened.maxSize)).toBeGreaterThanOrEqual(32n * 1024n * 1024n * 1024n); + expect(widened.minSize).toBe("0"); + }); + + it("draws a fresh sampleKey for each subgraph call", async () => { + sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(makePiece()); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + await service.selectPieceForProvider(SP_ADDRESS); + + const call1 = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + const call2 = sampleAnonPiece.mock.calls[1][0] as SampleAnonPieceParams; + expect(call1.sampleKey).toMatch(/^0x[0-9a-f]{64}$/); + expect(call2.sampleKey).toMatch(/^0x[0-9a-f]{64}$/); + expect(call1.sampleKey).not.toBe(call2.sampleKey); + }); +}); diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts new file mode 100644 index 00000000..acc19832 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -0,0 +1,208 @@ +import { randomBytes } from "node:crypto"; +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { InjectRepository } from "@nestjs/typeorm"; +import type { Repository } from "typeorm"; +import type { IConfig } from "../config/app.config.js"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { AnonPiecePool, SampleAnonPieceParams } from "../subgraph/subgraph.service.js"; +import { SubgraphService } from "../subgraph/subgraph.service.js"; +import type { AnonCandidatePiece } from "../subgraph/types.js"; +import type { AnonPiece } from "./types.js"; + +/** + * Number of most-recently-tested anonymous pieces to exclude from selection + * to avoid immediately retesting the same piece. 
Piece CIDs are globally + * unique and each one lives on a single SP's dataset, so scoping by CID + * is equivalent to scoping by (SP, CID) for this workload. + */ +const RECENT_DEDUP_WINDOW = 500; + +/** + * Piece size buckets, in raw (unpadded) bytes. Weighted sampling across + * these buckets keeps tests meaningful for bandwidth measurement without + * locking out SPs whose corpus skews small or large. + */ +type SizeBucket = "small" | "medium" | "large"; +type SizeRange = { min: bigint; max: bigint }; + +const MIB = 1024n * 1024n; + +// All downloads are buffered in-memory, so we need to keep piece sizes reasonable +const SIZE_BUCKETS: Record = { + small: { min: 1n * MIB, max: 20n * MIB - 1n }, + medium: { min: 20n * MIB, max: 100n * MIB - 1n }, + large: { min: 100n * MIB, max: 500n * MIB - 1n }, +}; + +/** Weights for choosing a bucket per selection. Must sum to 1. */ +const BUCKET_WEIGHTS: Record = { + small: 0.2, + medium: 0.5, + large: 0.3, +}; + +/** + * Probability the primary draw targets the withIPFSIndexing pool. + * The rest of the time we sample across all FWSS pieces so SPs can't + * optimise only their CAR corpus. + */ +const IPFS_INDEXED_SAMPLE_RATE = 0.8; + +@Injectable() +export class AnonPieceSelectorService { + private readonly logger = new Logger(AnonPieceSelectorService.name); + + constructor( + private readonly subgraphService: SubgraphService, + private readonly configService: ConfigService, + @InjectRepository(AnonRetrieval) + private readonly anonRetrievalRepository: Repository, + ) {} + + /** + * Select an anonymous piece to test against the given SP. + * + * Strategy: + * 1. Pick a size bucket by weighted random. + * 2. Pick a pool (`indexed` 80% / `any` 20%). + * 3. Generate a uniform-random sampleKey and query the subgraph for the + * smallest `Root.sampleKey ≥ $sampleKey` matching the filters. + * 4. Drop the pick if `pdpPaymentEndEpoch` has passed or it was tested + * recently; redraw once. + * 5. 
If still empty, fall back through: (same bucket, opposite pool) → + * (any bucket, indexed) → (any bucket, any). + */ + async selectPieceForProvider(spAddress: string): Promise { + const dealbotPayer = this.configService.get("blockchain", { infer: true }).walletAddress; + const recentlyTested = await this.loadRecentlyTestedPieceCids(); + + const bucket = this.pickBucket(); + const pool: AnonPiecePool = Math.random() < IPFS_INDEXED_SAMPLE_RATE ? "indexed" : "any"; + + const attempts: Array<{ bucket: SizeBucket | "any"; pool: AnonPiecePool }> = [ + { bucket, pool }, + { bucket, pool: pool === "indexed" ? "any" : "indexed" }, + { bucket: "any", pool: "indexed" }, + { bucket: "any", pool: "any" }, + ]; + + for (const attempt of attempts) { + const piece = await this.drawPiece({ + spAddress, + dealbotPayer, + bucket: attempt.bucket, + pool: attempt.pool, + recentlyTested, + }); + + if (piece) { + this.logger.log({ + event: "anon_piece_selected", + message: "Selected anonymous piece for retrieval test", + spAddress, + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + withIPFSIndexing: piece.withIPFSIndexing, + bucket: attempt.bucket, + pool: attempt.pool, + }); + return { + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + pieceId: piece.pieceId, + serviceProvider: spAddress.toLowerCase(), + withIPFSIndexing: piece.withIPFSIndexing, + ipfsRootCid: piece.ipfsRootCid, + rawSize: piece.rawSize, + }; + } + } + + this.logger.warn({ + event: "anon_no_candidates", + message: "No anonymous piece found after all fallbacks", + spAddress, + }); + return null; + } + + /** + * Try to draw a piece for one (bucket, pool) combination. Up to two draws + * with fresh sampleKeys, each filtered by dedup + epoch-termination. + */ + private async drawPiece(args: { + spAddress: string; + dealbotPayer: string; + bucket: SizeBucket | "any"; + pool: AnonPiecePool; + recentlyTested: Set; + }): Promise { + const range = args.bucket === "any" ? 
fullRange() : SIZE_BUCKETS[args.bucket]; + + for (let attempt = 0; attempt < 2; attempt++) { + const params: SampleAnonPieceParams = { + serviceProvider: args.spAddress, + payer: args.dealbotPayer, + sampleKey: randomSampleKey(), + minSize: range.min.toString(), + maxSize: range.max.toString(), + pool: args.pool, + }; + + const piece = await this.subgraphService.sampleAnonPiece(params); + if (!piece) { + continue; + } + + if (piece.pdpPaymentEndEpoch != null && piece.pdpPaymentEndEpoch <= BigInt(piece.indexedAtBlock)) { + continue; + } + + if (args.recentlyTested.has(piece.pieceCid)) { + continue; + } + + return piece; + } + + return null; + } + + private pickBucket(): SizeBucket { + const r = Math.random(); + let acc = 0; + for (const [name, weight] of Object.entries(BUCKET_WEIGHTS) as Array<[SizeBucket, number]>) { + acc += weight; + if (r < acc) { + return name; + } + } + return "medium"; + } + + /** + * Return the set of piece CIDs tested in the last RECENT_DEDUP_WINDOW + * anonymous retrievals across all SPs. + */ + private async loadRecentlyTestedPieceCids(): Promise> { + const rows = await this.anonRetrievalRepository + .createQueryBuilder("r") + .select("r.piece_cid", "pieceCid") + .orderBy("r.created_at", "DESC") + .limit(RECENT_DEDUP_WINDOW) + .getRawMany<{ pieceCid: string }>(); + + return new Set(rows.map((row) => row.pieceCid)); + } +} + +/** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ +function randomSampleKey(): string { + return `0x${randomBytes(32).toString("hex")}`; +} + +/** The full size range (used when bucket fallback is "any"). 
*/ +function fullRange(): SizeRange { + return { min: 0n, max: (1n << 63n) - 1n }; +} diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts new file mode 100644 index 00000000..61e97105 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -0,0 +1,189 @@ +import type { Repository } from "typeorm"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { RetrievalStatus } from "../database/types.js"; +import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; +import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; +import { AnonRetrievalService } from "./anon-retrieval.service.js"; +import type { CarValidationService } from "./car-validation.service.js"; +import type { PieceRetrievalService } from "./piece-retrieval.service.js"; +import type { PieceRetrievalResult } from "./types.js"; + +const SP_ADDRESS = "0xaaaa0000000000000000000000000000000000aa"; + +const PIECE = { + pieceCid: "baga6ea4seaqpiece", + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: false, + ipfsRootCid: null, + serviceProvider: SP_ADDRESS, +}; + +function makeProvider(): StorageProvider { + return { + address: SP_ADDRESS, + providerId: 7, + name: "sp-test", + isApproved: true, + } as unknown as StorageProvider; +} + +function makeService(opts: { + pieceResult: PieceRetrievalResult; + fetchPieceImpl?: (signal?: AbortSignal) => Promise; +}): { + service: AnonRetrievalService; + saveSpy: ReturnType; + fetchSpy: ReturnType; +} { + const saveSpy = vi.fn(async (entity: AnonRetrieval) => entity); + const createdEntities: Partial[] = 
[]; + const anonRetrievalRepository = { + create: vi.fn((data: Partial) => { + createdEntities.push(data); + return data; + }), + save: saveSpy, + } as unknown as Repository; + + const spRepository = { + findOne: vi.fn(async () => makeProvider()), + } as unknown as Repository; + + const anonPieceSelector = { + selectPieceForProvider: vi.fn(async () => PIECE), + } as unknown as AnonPieceSelectorService; + + const fetchSpy = vi.fn(opts.fetchPieceImpl ?? (async () => opts.pieceResult)); + const pieceRetrievalService = { + fetchPiece: fetchSpy, + } as unknown as PieceRetrievalService; + + const carValidationService = { + validateCarPiece: vi.fn(), + } as unknown as CarValidationService; + + const walletSdkService = { + getProviderInfo: vi.fn(() => ({ pdp: { serviceURL: "https://sp.test/" } })), + } as unknown as WalletSdkService; + + const metrics = { + observeFirstByteMs: vi.fn(), + observeLastByteMs: vi.fn(), + observeThroughput: vi.fn(), + observeCheckDuration: vi.fn(), + recordStatus: vi.fn(), + recordHttpResponseCode: vi.fn(), + recordCarParseStatus: vi.fn(), + recordIpniStatus: vi.fn(), + recordBlockFetchStatus: vi.fn(), + } as unknown as AnonRetrievalCheckMetrics; + + const service = new AnonRetrievalService( + anonPieceSelector, + pieceRetrievalService, + carValidationService, + walletSdkService, + metrics, + anonRetrievalRepository, + spRepository, + ); + + return { service, saveSpy, fetchSpy }; +} + +describe("AnonRetrievalService", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("persists partial metrics when fetchPiece returns aborted=true", async () => { + const partial: PieceRetrievalResult = { + success: false, + pieceCid: PIECE.pieceCid, + bytesReceived: 524288, + pieceBytes: null, + latencyMs: 42000, + ttfbMs: 150, + throughputBps: 12500, + statusCode: 200, + commPValid: false, + errorMessage: "Anon retrieval job timeout (60s) for sp1", + aborted: true, + }; + + const { service, saveSpy } = makeService({ pieceResult: partial }); + + 
await service.performForProvider(SP_ADDRESS); + + expect(saveSpy).toHaveBeenCalledTimes(1); + const saved = saveSpy.mock.calls[0][0] as Partial; + expect(saved.status).toBe(RetrievalStatus.FAILED); + expect(saved.bytesRetrieved).toBe(524288); + expect(saved.ttfbMs).toBe(150); + expect(saved.latencyMs).toBe(42000); + expect(saved.throughputBps).toBe(12500); + expect(saved.responseCode).toBe(200); + expect(saved.errorMessage).toContain("Anon retrieval job timeout"); + }); + + it("still saves a row when the signal aborts before fetchPiece runs", async () => { + const ac = new AbortController(); + ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); + + const never: PieceRetrievalResult = { + success: false, + pieceCid: PIECE.pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + }; + + const { service, saveSpy, fetchSpy } = makeService({ pieceResult: never }); + + await service.performForProvider(SP_ADDRESS, ac.signal); + + expect(fetchSpy).not.toHaveBeenCalled(); + expect(saveSpy).toHaveBeenCalledTimes(1); + const saved = saveSpy.mock.calls[0][0] as Partial; + expect(saved.status).toBe(RetrievalStatus.FAILED); + expect(saved.errorMessage).toContain("Anon retrieval job timeout"); + expect(saved.bytesRetrieved).toBeNull(); + expect(saved.ttfbMs).toBeNull(); + }); + + it("still saves a row when fetchPiece throws unexpectedly", async () => { + const never: PieceRetrievalResult = { + success: false, + pieceCid: PIECE.pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + }; + + const { service, saveSpy } = makeService({ + pieceResult: never, + fetchPieceImpl: async () => { + throw new Error("network down"); + }, + }); + + await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); + + expect(saveSpy).toHaveBeenCalledTimes(1); + const saved = saveSpy.mock.calls[0][0] as 
Partial; + expect(saved.status).toBe(RetrievalStatus.FAILED); + }); +}); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts new file mode 100644 index 00000000..d40fe315 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -0,0 +1,244 @@ +import { Injectable, Logger } from "@nestjs/common"; +import { InjectRepository } from "@nestjs/typeorm"; +import type { Repository } from "typeorm"; +import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { RetrievalStatus, ServiceType } from "../database/types.js"; +import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; +import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; +import { CarValidationService } from "./car-validation.service.js"; +import { PieceRetrievalService } from "./piece-retrieval.service.js"; +import type { CarValidationResult, PieceRetrievalResult } from "./types.js"; + +@Injectable() +export class AnonRetrievalService { + private readonly logger = new Logger(AnonRetrievalService.name); + + constructor( + private readonly anonPieceSelectorService: AnonPieceSelectorService, + private readonly pieceRetrievalService: PieceRetrievalService, + private readonly carValidationService: CarValidationService, + private readonly walletSdkService: WalletSdkService, + private readonly metrics: AnonRetrievalCheckMetrics, + @InjectRepository(AnonRetrieval) + private readonly anonRetrievalRepository: Repository, + @InjectRepository(StorageProvider) + private readonly spRepository: Repository, + ) {} + + 
async performForProvider( + spAddress: string, + signal?: AbortSignal, + logContext?: ProviderJobContext, + ): Promise { + // Build metric labels + const provider = await this.spRepository.findOne({ where: { address: spAddress } }); + const labels = buildCheckMetricLabels({ + checkType: "anon_retrieval", + providerId: provider?.providerId, + providerName: provider?.name, + providerIsApproved: provider?.isApproved, + }); + + // 1. Select an anonymous piece + const piece = await this.anonPieceSelectorService.selectPieceForProvider(spAddress); + if (!piece) { + this.logger.warn({ + ...logContext, + event: "anon_retrieval_no_piece", + message: "No anonymous piece found for SP", + spAddress, + }); + this.metrics.recordStatus(labels, "failure.no_piece"); + return null; + } + + this.logger.log({ + ...logContext, + event: "anon_retrieval_started", + message: "Starting anonymous retrieval test", + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + pieceId: piece.pieceId, + withIPFSIndexing: piece.withIPFSIndexing, + spAddress, + }); + + const checkStart = Date.now(); + const startedAt = new Date(); + + let pieceResult: PieceRetrievalResult | null = null; + let carResult: CarValidationResult | null = null; + let saved: AnonRetrieval | null = null; + + try { + // 2. Fetch the piece. fetchPiece never throws on abort — it returns a + // result with partial metrics so we can persist what we have. + if (signal?.aborted) { + pieceResult = buildAbortedPlaceholder(piece.pieceCid, signal.reason); + } else { + pieceResult = await this.pieceRetrievalService.fetchPiece(spAddress, piece.pieceCid, signal); + } + + // Emit piece retrieval metrics + this.metrics.observeFirstByteMs(labels, pieceResult.ttfbMs); + this.metrics.observeLastByteMs(labels, pieceResult.latencyMs); + this.metrics.observeThroughput(labels, pieceResult.throughputBps); + this.metrics.recordHttpResponseCode(labels, pieceResult.statusCode); + + // 3. 
CAR validation (only if piece was successfully retrieved and has IPFS indexing) + if ( + pieceResult.success && + piece.withIPFSIndexing && + piece.ipfsRootCid && + pieceResult.pieceBytes && + provider && + !signal?.aborted + ) { + try { + carResult = await this.carValidationService.validateCarPiece( + pieceResult.pieceBytes, + provider, + piece.ipfsRootCid, + signal, + ); + } catch (error) { + this.logger.warn({ + ...logContext, + event: "anon_retrieval_car_validation_failed", + message: "CAR validation threw an error", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + } + } + + // Emit CAR validation metrics + if (carResult) { + this.metrics.recordCarParseStatus(labels, carResult.carParseable); + this.metrics.recordIpniStatus( + labels, + carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid", + ); + this.metrics.recordBlockFetchStatus( + labels, + carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", + ); + } else if (!pieceResult.success) { + // Piece retrieval failed — IPNI and block fetch were skipped + this.metrics.recordIpniStatus(labels, "skipped"); + this.metrics.recordBlockFetchStatus(labels, "skipped"); + } + + // Overall check duration and status + this.metrics.observeCheckDuration(labels, Date.now() - checkStart); + this.metrics.recordStatus( + labels, + pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", + ); + } finally { + // Always save a record — even on abort or unexpected error — so we never + // lose the evidence (ttfb, bytes, response code) we already collected. 
+ pieceResult ??= buildAbortedPlaceholder(piece.pieceCid, signal?.reason); + saved = await this.saveRetrievalRecord(spAddress, piece, pieceResult, carResult, startedAt, logContext); + } + + return saved; + } + + private async saveRetrievalRecord( + spAddress: string, + piece: { + pieceCid: string; + dataSetId: string; + pieceId: string; + rawSize: string; + withIPFSIndexing: boolean; + ipfsRootCid: string | null; + }, + pieceResult: PieceRetrievalResult, + carResult: CarValidationResult | null, + startedAt: Date, + logContext?: ProviderJobContext, + ): Promise { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + + const retrieval = this.anonRetrievalRepository.create({ + spAddress, + pieceCid: piece.pieceCid, + dataSetId: BigInt(piece.dataSetId), + pieceId: BigInt(piece.pieceId), + rawSize: BigInt(piece.rawSize), + withIpfsIndexing: piece.withIPFSIndexing, + ipfsRootCid: piece.ipfsRootCid, + serviceType: ServiceType.DIRECT_SP, + retrievalEndpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + status: pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED, + startedAt, + completedAt: new Date(), + latencyMs: pieceResult.latencyMs > 0 ? Math.round(pieceResult.latencyMs) : null, + ttfbMs: pieceResult.ttfbMs > 0 ? Math.round(pieceResult.ttfbMs) : null, + throughputBps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, + bytesRetrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, + responseCode: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, + errorMessage: pieceResult.errorMessage ?? null, + commpValid: pieceResult.success ? pieceResult.commPValid : null, + carValid: carResult ? 
carResult.ipniValid !== false && carResult.blockFetchValid !== false : null, + }); + + try { + await this.anonRetrievalRepository.save(retrieval); + } catch (error) { + this.logger.warn({ + ...logContext, + event: "anon_retrieval_save_failed", + message: "Failed to save anonymous retrieval record", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + return null; + } + + this.logger.log({ + ...logContext, + event: "anon_retrieval_completed", + message: "Anonymous retrieval test completed", + pieceCid: piece.pieceCid, + spAddress, + success: pieceResult.success, + aborted: pieceResult.aborted === true, + latencyMs: pieceResult.latencyMs, + ttfbMs: pieceResult.ttfbMs, + bytesRetrieved: pieceResult.bytesReceived, + carParseable: carResult?.carParseable, + ipniValid: carResult?.ipniValid, + blockFetchValid: carResult?.blockFetchValid, + }); + + return retrieval; + } +} + +function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { + const message = + reason instanceof Error && reason.message ? reason.message : typeof reason === "string" ? 
reason : "aborted"; + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: message, + aborted: true, + }; +} diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts new file mode 100644 index 00000000..8019b8df --- /dev/null +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -0,0 +1,223 @@ +import { CarReader } from "@ipld/car"; +import * as dagPB from "@ipld/dag-pb"; +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { create as createBlock } from "multiformats/block"; +import { CID } from "multiformats/cid"; +import * as raw from "multiformats/codecs/raw"; +import { sha256 } from "multiformats/hashes/sha2"; +import { toStructuredError } from "../common/logging.js"; +import type { IConfig } from "../config/app.config.js"; +import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; +import { IpniVerificationService } from "../ipni/ipni-verification.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { CarValidationResult } from "./types.js"; + +// UnixFS DAGs use only dag-pb (interior nodes) and raw (leaf data) codecs +const unixfsCodecs: Record unknown }> = { + [dagPB.code]: dagPB, + [raw.code]: raw, +}; + +@Injectable() +export class CarValidationService { + private readonly logger = new Logger(CarValidationService.name); + + constructor( + private readonly configService: ConfigService, + private readonly httpClientService: HttpClientService, + private readonly walletSdkService: WalletSdkService, + private readonly ipniVerificationService: IpniVerificationService, + ) {} + + /** + * Validate an anonymous piece retrieved as a CAR: + * 1. 
parse the CAR, + * 2. sample random blocks, + * 3. confirm the SP is advertised for the root + sampled CIDs via IPNI, + * 4. fetch each sampled block from the SP and hash-verify it. + * + * CAR parse failure is attributed to the client (bad upload), not the SP. + */ + async validateCarPiece( + pieceBytes: Buffer, + provider: StorageProvider, + ipfsRootCid: string, + signal?: AbortSignal, + ): Promise { + const blocks = await this.parseCar(pieceBytes, provider.address, ipfsRootCid); + if (blocks === null) { + return { carParseable: false, blockCount: 0, sampledCidCount: 0, ipniValid: null, blockFetchValid: null }; + } + if (blocks.length === 0) { + return { + carParseable: true, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + blockFetchValid: null, + errorMessage: "CAR contained no blocks", + }; + } + + const sampleCount = this.configService.get("retrieval", { infer: true }).anonBlockSampleCount; + const shuffled = [...blocks].sort(() => Math.random() - 0.5); + const sampledBlocks = shuffled.slice(0, sampleCount); + + const ipniValid = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); + const blockFetchResult = await this.checkBlockFetch(sampledBlocks, provider.address, signal); + + return { + carParseable: true, + blockCount: blocks.length, + sampledCidCount: sampledBlocks.length, + ipniValid, + blockFetchValid: blockFetchResult.valid, + errorMessage: blockFetchResult.errorMessage, + }; + } + + private async parseCar( + pieceBytes: Buffer, + spAddress: string, + ipfsRootCid: string, + ): Promise<{ cid: CID; bytes: Uint8Array }[] | null> { + try { + const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); + const blocks: { cid: CID; bytes: Uint8Array }[] = []; + for await (const block of reader.blocks()) { + blocks.push({ cid: block.cid, bytes: block.bytes }); + } + return blocks; + } catch (error) { + this.logger.debug({ + event: "car_parse_failed", + message: "Failed to parse piece bytes as CAR - client fault, not SP", + 
spAddress, + ipfsRootCid, + error: toStructuredError(error), + }); + return null; + } + } + + /** + * Verify via IPNI that the SP is advertised for the root CID and each sampled child CID. + * Delegates to the shared IpniVerificationService which uses filecoin-pin's provider-scoped check. + */ + private async checkIpni( + provider: StorageProvider, + ipfsRootCid: string, + sampledBlocks: ReadonlyArray<{ cid: CID }>, + signal?: AbortSignal, + ): Promise { + const timeouts = this.configService.get("timeouts", { infer: true }); + let rootCid: CID; + try { + rootCid = CID.parse(ipfsRootCid); + } catch (error) { + this.logger.warn({ + event: "ipni_root_cid_invalid", + message: "Failed to parse ipfsRootCID", + ipfsRootCid, + providerAddress: provider.address, + error: toStructuredError(error), + }); + return false; + } + + const result = await this.ipniVerificationService.verify({ + rootCid, + blockCids: sampledBlocks.map((b) => b.cid), + storageProvider: provider, + timeoutMs: timeouts.ipniVerificationTimeoutMs, + pollIntervalMs: timeouts.ipniVerificationPollingMs, + signal, + }); + + return result.rootCIDVerified; + } + + /** + * Fetch each sampled block from the SP endpoint and hash-verify the response + * against the declared CID. Mirrors IpfsBlockRetrievalStrategy's per-block + * verification for the sampled subset (no DAG traversal). 
/**
 * Fetch each sampled block from the SP endpoint and hash-verify the response
 * against the declared CID (sampled subset only, no DAG traversal).
 *
 * The locally parsed `bytes` of a sampled block are not compared; verification
 * relies solely on re-hashing what the SP returns against `block.cid`.
 *
 * @param sampledBlocks Blocks whose CIDs are requested from the SP.
 * @param spAddress     Address used to look up the SP's service URL.
 * @param signal        Optional abort signal; a caller abort is rethrown, never
 *                      recorded as an SP failure.
 * @returns `valid: null` (with an error message) when provider info is missing;
 *          otherwise `valid` is true only if every sampled block was fetched
 *          with a 2xx status and passed hash verification.
 */
private async checkBlockFetch(
  sampledBlocks: ReadonlyArray<{ cid: CID; bytes: Uint8Array }>,
  spAddress: string,
  signal?: AbortSignal,
): Promise<{ valid: boolean | null; errorMessage?: string }> {
  const providerInfo = this.walletSdkService.getProviderInfo(spAddress);
  if (!providerInfo) {
    return { valid: null, errorMessage: `Provider info not found for ${spAddress}` };
  }

  const spBaseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, "");
  let allValid = true;

  for (const block of sampledBlocks) {
    signal?.throwIfAborted();
    const cidStr = block.cid.toString();

    // Check hash/codec support up front: a block we cannot verify fails
    // regardless of what the SP returns, so skip the pointless round-trip.
    if (block.cid.multihash.code !== sha256.code) {
      this.logger.warn({
        event: "block_unsupported_hash",
        message: `Unsupported hash algorithm 0x${block.cid.multihash.code.toString(16)}`,
        cid: cidStr,
        spAddress,
      });
      allValid = false;
      continue;
    }

    const codec = unixfsCodecs[block.cid.code];
    if (!codec) {
      this.logger.warn({
        event: "block_unsupported_codec",
        message: `Unsupported codec 0x${block.cid.code.toString(16)}`,
        cid: cidStr,
        spAddress,
      });
      allValid = false;
      continue;
    }

    const blockUrl = `${spBaseUrl}/ipfs/${cidStr}?format=raw`;

    try {
      const resp = await this.httpClientService.requestWithMetrics(blockUrl, {
        headers: { Accept: "application/vnd.ipld.raw" },
        httpVersion: "2",
        signal,
      });

      // If the caller aborted while the request was in flight, the response may
      // be partial; propagate the abort instead of blaming the SP.
      signal?.throwIfAborted();

      if (resp.metrics.statusCode < 200 || resp.metrics.statusCode >= 300) {
        allValid = false;
        this.logger.warn({
          event: "block_fetch_non_2xx",
          message: "Block fetch returned non-2xx status",
          cid: cidStr,
          spAddress,
          statusCode: resp.metrics.statusCode,
        });
        continue;
      }

      // Hash-verifies and decodes; throws on mismatch
      await createBlock({ bytes: resp.data, cid: block.cid, hasher: sha256, codec });
    } catch (error) {
      // A caller-initiated abort is not an SP fault: rethrow it (same behavior
      // as the check at the top of the loop) rather than marking the block invalid.
      signal?.throwIfAborted();

      allValid = false;
      this.logger.warn({
        event: "block_fetch_failed",
        message: "Block fetch or hash verification failed",
        cid: cidStr,
        spAddress,
        error: toStructuredError(error),
      });
    }
  }

  return { valid: allValid };
}
metrics.responseSize / (metrics.totalTime / 1000) : 0; + + if (result.aborted) { + this.logger.warn({ + event: "piece_fetch_aborted", + message: "Piece fetch aborted mid-download; returning partial metrics", + url, + pieceCid, + spAddress, + bytesReceived: metrics.responseSize, + ttfbMs: metrics.ttfb, + abortReason: result.abortReason, + }); + + return { + success: false, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes: null, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid: false, + errorMessage: result.abortReason ?? "aborted", + aborted: true, + }; + } + + if (!isSuccess) { + this.logger.warn({ + event: "piece_fetch_non_2xx", + message: "Piece fetch returned non-2xx status", + url, + statusCode: metrics.statusCode, + pieceCid, + spAddress, + }); + + return { + success: false, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes: null, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid: false, + errorMessage: `HTTP ${metrics.statusCode}`, + }; + } + + const pieceBytes = Buffer.isBuffer(result.data) ? 
result.data : Buffer.from(result.data); + const commPValid = await this.validateCommP(pieceBytes, pieceCid); + + this.logger.debug({ + event: "piece_fetch_success", + message: "Piece fetched successfully", + pieceCid, + spAddress, + bytesReceived: metrics.responseSize, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + }); + + return { + success: true, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid, + }; + } catch (error) { + const aborted = signal?.aborted === true; + this.logger.warn({ + event: "piece_fetch_failed", + message: "Piece fetch threw an error", + url, + pieceCid, + spAddress, + aborted, + error: toStructuredError(error), + }); + + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: error instanceof Error ? error.message : String(error), + aborted, + }; + } + } + + /** + * Compute the piece CID (sha2-256-trunc254-padded) of the retrieved bytes and compare + * against the expected CID. Returns false on parse failure, computation failure, or mismatch. 
/**
 * Recompute the piece CID of the retrieved bytes and compare it with the expected one.
 *
 * @param bytes    Bytes returned by the SP for the piece.
 * @param pieceCid Expected piece CID in string form.
 * @returns true only when the computed CID equals the expected CID; false when the
 *          expected CID cannot be parsed, the computation throws, or the CIDs differ.
 */
private async validateCommP(bytes: Buffer, pieceCid: string): Promise<boolean> {
  const expectedCid = asPieceCID(pieceCid);
  if (!expectedCid) {
    this.logger.warn({
      event: "commp_invalid_piece_cid",
      message: "Cannot parse expected piece CID for CommP validation",
      pieceCid,
    });
    return false;
  }

  try {
    const computedText = calculatePieceCid(bytes).toString();
    const expectedText = expectedCid.toString();
    if (computedText === expectedText) {
      return true;
    }

    // The SP served bytes that hash to something else than what was requested.
    this.logger.warn({
      event: "commp_mismatch",
      message: "Piece CID mismatch: SP-returned bytes hash to a different CID",
      expected: expectedText,
      computed: computedText,
    });
    return false;
  } catch (error) {
    this.logger.warn({
      event: "commp_validation_error",
      message: "CommP computation threw an error",
      pieceCid,
      error: toStructuredError(error),
    });
    return false;
  }
}
imports: [ + ConfigModule, + TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), + SubgraphModule, + WalletSdkModule, + HttpClientModule, + IpniModule, + ], + providers: [AnonPieceSelectorService, PieceRetrievalService, CarValidationService, AnonRetrievalService], + exports: [AnonRetrievalService], +}) +export class RetrievalAnonModule {} diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts new file mode 100644 index 00000000..2c3384d5 --- /dev/null +++ b/apps/backend/src/retrieval-anon/types.ts @@ -0,0 +1,35 @@ +/** The result of anonymous piece selection. */ +export type AnonPiece = { + pieceCid: string; + dataSetId: string; + pieceId: string; + serviceProvider: string; + withIPFSIndexing: boolean; + ipfsRootCid: string | null; + rawSize: string; +}; + +/** Result of piece retrieval. */ +export type PieceRetrievalResult = { + success: boolean; + pieceCid: string; + bytesReceived: number; + pieceBytes: Buffer | null; + latencyMs: number; + ttfbMs: number; + throughputBps: number; + statusCode: number; + commPValid: boolean; + errorMessage?: string; + aborted?: boolean; +}; + +/** Result of CAR validation. */ +export type CarValidationResult = { + carParseable: boolean; + blockCount: number; + sampledCidCount: number; + ipniValid: boolean | null; + blockFetchValid: boolean | null; + errorMessage?: string; +}; diff --git a/apps/backend/src/subgraph/queries.ts b/apps/backend/src/subgraph/queries.ts new file mode 100644 index 00000000..74802ddf --- /dev/null +++ b/apps/backend/src/subgraph/queries.ts @@ -0,0 +1,78 @@ +export const Queries = { + GET_PROVIDERS_WITH_DATASETS: ` + query GetProvidersWithDataSet($addresses: [Bytes!], $blockNumber: BigInt!) 
/**
 * Assemble the SampleAnonPiece GraphQL query for the requested pool.
 *
 * Both pools share the same variables, filters and selection set; the only
 * difference is that the "indexed" pool additionally pins
 * `withIPFSIndexing: true` inside the proofSet filter.
 *
 * @param pool "indexed" to restrict to IPFS-indexed datasets, "any" for no restriction.
 * @returns The GraphQL query text.
 */
export function buildSampleAnonPieceQuery(pool: "indexed" | "any"): string {
  let indexingFilter = "";
  if (pool === "indexed") {
    indexingFilter = "withIPFSIndexing: true";
  }

  return `
    query SampleAnonPiece(
      $serviceProvider: Bytes!
      $payer: Bytes!
      $sampleKey: Bytes!
      $minSize: BigInt!
      $maxSize: BigInt!
    ) {
      _meta {
        block {
          number
        }
      }
      roots(
        first: 1
        orderBy: sampleKey
        orderDirection: asc
        where: {
          sampleKey_gte: $sampleKey
          removed: false
          rawSize_gte: $minSize
          rawSize_lte: $maxSize
          proofSet_: {
            fwssServiceProvider: $serviceProvider
            fwssPayer_not: $payer
            isActive: true
            ${indexingFilter}
          }
        }
        subgraphError: allow
      ) {
        rootId
        cid
        rawSize
        ipfsRootCID
        proofSet {
          setId
          withIPFSIndexing
          fwssPayer
          pdpPaymentEndEpoch
        }
      }
    }
  `;
}
index 79% rename from apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts rename to apps/backend/src/subgraph/subgraph.service.spec.ts index cd3a1ea8..4dc2cd5e 100644 --- a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -1,7 +1,8 @@ import type { ConfigService } from "@nestjs/config"; +import { CID } from "multiformats/cid"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { IConfig } from "../config/app.config.js"; -import { PDPSubgraphService } from "./pdp-subgraph.service.js"; +import { SubgraphService } from "./subgraph.service.js"; const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; const SUBGRAPH_ENDPOINT = "https://api.thegraph.com/subgraphs/filecoin/pdp" as const; @@ -35,21 +36,57 @@ const makeSubgraphMetaResponse = (blockNumber = 12345) => ({ }, }); -describe("PDPSubgraphService", () => { - let service: PDPSubgraphService; +const FWSS_SP_ADDRESS = "0xAaaaAAaaaaAAaaaAaAaAaaAaaaAaAaAaaAaaa111"; +const FWSS_PAYER = "0xBBbbBBbbBBbBBbBbbBBbbBBbbbbBbBBbbBBbb222"; +const EXAMPLE_PIECE_CID = "baga6ea4seaqpzwrimvoc4jp4l7mk6knsknf6owsc2ev4krrs2peenl5qelh6u4y"; +const pieceCidHex = `0x${Buffer.from(CID.parse(EXAMPLE_PIECE_CID).bytes).toString("hex")}`; + +const makeSampleRoot = (overrides: Record = {}) => ({ + rootId: "1", + cid: pieceCidHex, + rawSize: "1048576", + ipfsRootCID: "bafyroot", + proofSet: { + setId: "42", + withIPFSIndexing: true, + fwssPayer: FWSS_PAYER.toLowerCase(), + pdpPaymentEndEpoch: null, + }, + ...overrides, +}); + +const makeSampleResponse = (roots: Record[] = [], blockNumber = 12345) => ({ + data: { + _meta: { block: { number: blockNumber } }, + roots, + }, +}); + +const SAMPLE_KEY = "0x0000000000000000000000000000000000000000000000000000000000000001"; +const defaultSampleParams = { + serviceProvider: FWSS_SP_ADDRESS, + payer: FWSS_PAYER, + sampleKey: SAMPLE_KEY, + minSize: "0", + maxSize: 
"1000000000000", + pool: "indexed" as const, +}; + +describe("SubgraphService", () => { + let service: SubgraphService; let fetchMock: ReturnType; beforeEach(() => { const configService = { get: vi.fn((key: keyof IConfig) => { if (key === "blockchain") { - return { pdpSubgraphEndpoint: SUBGRAPH_ENDPOINT }; + return { subgraphEndpoint: SUBGRAPH_ENDPOINT }; } return undefined; }), } as unknown as ConfigService; - service = new PDPSubgraphService(configService); + service = new SubgraphService(configService); fetchMock = vi.fn(); vi.stubGlobal("fetch", fetchMock); @@ -362,10 +399,10 @@ describe("PDPSubgraphService", () => { it("throws when PDP subgraph endpoint is not configured", async () => { const configService = { - get: vi.fn(() => ({ pdpSubgraphEndpoint: "" })), + get: vi.fn(() => ({ subgraphEndpoint: "" })), } as unknown as ConfigService; - const serviceWithoutEndpoint = new PDPSubgraphService(configService); + const serviceWithoutEndpoint = new SubgraphService(configService); await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No PDP subgraph endpoint configured"); }); @@ -691,4 +728,120 @@ describe("PDPSubgraphService", () => { expect(timestamps.length).toBe(1); }); }); + + describe("sampleAnonPiece", () => { + it("returns null when endpoint is not configured", async () => { + const noEndpointConfig = { + get: vi.fn(() => ({ subgraphEndpoint: "" })), + } as unknown as ConfigService; + const noEndpointService = new SubgraphService(noEndpointConfig); + + const piece = await noEndpointService.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it("returns null when the subgraph yields no matching root", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + }); + + it("parses the sampled root into a decoded candidate 
piece", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([makeSampleRoot()]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + + expect(piece).toMatchObject({ + pieceCid: EXAMPLE_PIECE_CID, + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: true, + ipfsRootCid: "bafyroot", + pdpPaymentEndEpoch: null, + indexedAtBlock: 12345, + }); + }); + + it("returns pdpPaymentEndEpoch as bigint when the dataset is terminating", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => + makeSampleResponse([ + makeSampleRoot({ + proofSet: { + setId: "42", + withIPFSIndexing: true, + fwssPayer: FWSS_PAYER.toLowerCase(), + pdpPaymentEndEpoch: "5000", + }, + }), + ]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece?.pdpPaymentEndEpoch).toBe(5000n); + }); + + it("lowercases SP and payer addresses before querying", async () => { + fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + + await service.sampleAnonPiece(defaultSampleParams); + + const [, opts] = fetchMock.mock.calls[0]; + const body = JSON.parse(opts.body as string); + expect(body.variables.serviceProvider).toBe(FWSS_SP_ADDRESS.toLowerCase()); + expect(body.variables.payer).toBe(FWSS_PAYER.toLowerCase()); + expect(body.query).toContain("withIPFSIndexing: true"); + }); + + it("uses the any-pool query when pool is 'any'", async () => { + fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + + await service.sampleAnonPiece({ ...defaultSampleParams, pool: "any" }); + + const [, opts] = fetchMock.mock.calls[0]; + const body = JSON.parse(opts.body as string); + expect(body.query).not.toContain("withIPFSIndexing: true"); + }); + + it("returns null when the sampled root has an undecodable CID", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => 
makeSampleResponse([makeSampleRoot({ cid: "0xdeadbeef" })]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + }); + + it("throws after max retries on repeated HTTP errors", async () => { + fetchMock.mockResolvedValue({ ok: false, status: 500, statusText: "Internal Server Error" }); + + const promise = service.sampleAnonPiece(defaultSampleParams); + promise.catch(() => {}); + await vi.runAllTimersAsync(); + + await expect(promise).rejects.toThrow("Failed to fetch subgraph sample_anon_piece_indexed after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("does not retry on schema validation failure", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: { _meta: { block: { number: 1 } } } }), // missing roots + }); + + await expect(service.sampleAnonPiece(defaultSampleParams)).rejects.toThrow(/validation failed/i); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + }); }); diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts similarity index 52% rename from apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts rename to apps/backend/src/subgraph/subgraph.service.ts index aedd8bce..55359179 100644 --- a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -2,9 +2,40 @@ import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import { toStructuredError } from "../common/logging.js"; import type { IBlockchainConfig, IConfig } from "../config/app.config.js"; -import { Queries } from "./queries.js"; -import type { GraphQLResponse, ProviderDataSetResponse, ProvidersWithDataSetsOptions, SubgraphMeta } from "./types.js"; -import { validateProviderDataSetResponse, validateSubgraphMetaResponse } from "./types.js"; +import { buildSampleAnonPieceQuery, Queries } from "./queries.js"; +import type { + 
AnonCandidatePiece, + GraphQLResponse, + ProviderDataSetResponse, + ProvidersWithDataSetsOptions, + RawSampleAnonPieceResponse, + SubgraphMeta, +} from "./types.js"; +import { + decodePieceCid, + validateProviderDataSetResponse, + validateSampleAnonPieceResponse, + validateSubgraphMetaResponse, +} from "./types.js"; + +/** Pool of pieces to sample from. */ +export type AnonPiecePool = "indexed" | "any"; + +/** Inputs for a single anonymous piece sample query. */ +export type SampleAnonPieceParams = { + /** Service provider address (lowercase hex). */ + serviceProvider: string; + /** Dealbot's own payer address (excluded to keep the sample non-dealbot). */ + payer: string; + /** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ + sampleKey: string; + /** Inclusive lower bound on raw piece size in bytes (decimal string). */ + minSize: string; + /** Inclusive upper bound on raw piece size in bytes (decimal string). */ + maxSize: string; + /** Which pool to sample from. */ + pool: AnonPiecePool; +}; /** * Error thrown when data validation fails. 
@@ -21,8 +52,8 @@ class ValidationError extends Error { } @Injectable() -export class PDPSubgraphService { - private readonly logger: Logger = new Logger(PDPSubgraphService.name); +export class SubgraphService { + private readonly logger: Logger = new Logger(SubgraphService.name); private readonly blockchainConfig: IBlockchainConfig; private static readonly MAX_PROVIDERS_PER_QUERY = 100; @@ -45,14 +76,14 @@ export class PDPSubgraphService { * @throws Error if endpoint is not configured or after MAX_RETRIES attempts */ async fetchSubgraphMeta(attempt: number = 1): Promise { - if (!this.blockchainConfig.pdpSubgraphEndpoint) { + if (!this.blockchainConfig.subgraphEndpoint) { throw new Error("No PDP subgraph endpoint configured"); } try { await this.enforceRateLimit(); - const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + const response = await fetch(this.blockchainConfig.subgraphEndpoint, { method: "POST", headers: { "Content-Type": "application/json", @@ -95,13 +126,13 @@ export class PDPSubgraphService { } // Retry on network/HTTP errors - if (attempt < PDPSubgraphService.MAX_RETRIES) { - const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + if (attempt < SubgraphService.MAX_RETRIES) { + const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); this.logger.warn({ event: "subgraph_meta_request_retry", message: "Subgraph meta request failed. 
Retrying...", attempt, - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, retryDelayMs: delay, error: toStructuredError(error), }); @@ -112,11 +143,11 @@ export class PDPSubgraphService { this.logger.error({ event: "subgraph_meta_request_failed", message: "Subgraph meta request failed after maximum retries", - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, error: toStructuredError(error), }); throw new Error( - `Failed to fetch subgraph metadata after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + `Failed to fetch subgraph metadata after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, ); } } @@ -136,13 +167,154 @@ export class PDPSubgraphService { return []; } - if (addresses.length <= PDPSubgraphService.MAX_PROVIDERS_PER_QUERY) { + if (addresses.length <= SubgraphService.MAX_PROVIDERS_PER_QUERY) { return this.fetchWithRetry(blockNumber, addresses); } return this.fetchMultipleBatchesWithRateLimit(blockNumber, addresses); } + /** + * Draw a single random anonymous piece for retrieval testing. + * + * Uses the Root.sampleKey (keccak256 of the entity id) to pick the + * smallest key ≥ `params.sampleKey` that matches the filters — a uniform + * random pick when `sampleKey` is generated uniformly. Server-side filters + * cover SP, payer-exclusion, active status, size range, and optionally + * `withIPFSIndexing`. Returns null when no piece matches (callers should + * retry with a fresh sampleKey or relax the pool/bucket). + * + * `pdpPaymentEndEpoch` is returned to the caller for a cheap client-side + * epoch comparison — GraphQL filters on nullable BigInts are awkward. 
/**
 * Draw a single random anonymous piece for retrieval testing.
 *
 * The query orders roots by `sampleKey` and takes the first one whose key is
 * ≥ `params.sampleKey`, with the SP, payer-exclusion, active-status, size-range
 * and (for the "indexed" pool) `withIPFSIndexing` filters applied server-side.
 * SP and payer addresses are lowercased before being sent.
 *
 * @returns The decoded candidate piece, or null when no endpoint is configured,
 *          no root matches, or the root's CID cannot be decoded.
 * @throws Whatever `executeQuery` throws (request failure after retries, or
 *         response validation failure).
 */
async sampleAnonPiece(params: SampleAnonPieceParams): Promise<AnonCandidatePiece | null> {
  if (!this.blockchainConfig.subgraphEndpoint) {
    return null;
  }

  const { pool, serviceProvider, payer, sampleKey, minSize, maxSize } = params;

  const response = await this.executeQuery<RawSampleAnonPieceResponse>(
    `sample_anon_piece_${pool}`,
    buildSampleAnonPieceQuery(pool),
    {
      serviceProvider: serviceProvider.toLowerCase(),
      payer: payer.toLowerCase(),
      sampleKey,
      minSize,
      maxSize,
    },
    validateSampleAnonPieceResponse,
  );

  // The query asks for `first: 1`, so there is at most one root.
  const [root] = response.roots;
  if (!root) {
    return null;
  }

  const { proofSet } = root;
  try {
    return {
      pieceCid: decodePieceCid(root.cid),
      pieceId: root.rootId,
      dataSetId: proofSet.setId,
      rawSize: root.rawSize,
      withIPFSIndexing: proofSet.withIPFSIndexing,
      ipfsRootCid: root.ipfsRootCID ?? null,
      indexedAtBlock: response._meta.block.number,
      // Returned to the caller so it can do the end-epoch comparison itself.
      pdpPaymentEndEpoch: proofSet.pdpPaymentEndEpoch != null ? BigInt(proofSet.pdpPaymentEndEpoch) : null,
    };
  } catch (error) {
    this.logger.warn({
      event: "anon_piece_cid_decode_failed",
      message: "Failed to decode piece CID from subgraph data",
      dataSetId: proofSet.setId,
      pieceId: root.rootId,
      error: toStructuredError(error),
    });
    return null;
  }
}
/**
 * Run one GraphQL query against the configured subgraph endpoint, with rate
 * limiting and retries, and pass `data` through `transform`.
 *
 * Network, HTTP and GraphQL errors are retried with a doubling delay until
 * `MAX_RETRIES` attempts have been made. A failure inside `transform` is
 * wrapped in a ValidationError and rethrown immediately without retrying.
 *
 * @param operationName Label used in log events and error messages.
 * @param query         GraphQL query text.
 * @param variables     GraphQL variables.
 * @param transform     Validator/mapper applied to the response `data`.
 * @param attempt       1-based number of the first attempt (defaults to 1).
 * @throws Error when no endpoint is configured or all attempts fail;
 *         ValidationError when `transform` throws.
 */
private async executeQuery<T>(
  operationName: string,
  query: string,
  variables: Record<string, unknown>,
  transform: (data: unknown) => T,
  attempt: number = 1,
): Promise<T> {
  const endpoint = this.blockchainConfig.subgraphEndpoint;
  if (!endpoint) {
    throw new Error("No PDP subgraph endpoint configured");
  }

  for (let currentAttempt = attempt; ; currentAttempt += 1) {
    try {
      await this.enforceRateLimit();

      const response = await fetch(endpoint, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ query, variables }),
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const result = (await response.json()) as GraphQLResponse;

      if (result.errors) {
        const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error";
        throw new Error(`GraphQL error: ${errorMessage}`);
      }

      try {
        return transform(result.data);
      } catch (validationError) {
        const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error";
        throw new ValidationError(`Data validation failed: ${errorMessage}`);
      }
    } catch (error) {
      // Malformed data will not improve on retry: surface it right away.
      if (error instanceof ValidationError) {
        this.logger.error({
          event: `subgraph_${operationName}_validation_failed`,
          message: `Subgraph ${operationName} validation failed`,
          error: toStructuredError(error),
        });
        throw error;
      }

      if (currentAttempt >= SubgraphService.MAX_RETRIES) {
        const errorMessage = error instanceof Error ? error.message : "Unknown error";
        this.logger.error({
          event: `subgraph_${operationName}_request_failed`,
          message: `Subgraph ${operationName} request failed after maximum retries`,
          maxRetries: SubgraphService.MAX_RETRIES,
          error: toStructuredError(error),
        });
        throw new Error(
          `Failed to fetch subgraph ${operationName} after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`,
        );
      }

      // Delay doubles with every attempt already made.
      const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (currentAttempt - 1));
      this.logger.warn({
        event: `subgraph_${operationName}_request_retry`,
        message: `Subgraph ${operationName} request failed. Retrying...`,
        attempt: currentAttempt,
        maxRetries: SubgraphService.MAX_RETRIES,
        retryDelayMs: delay,
        error: toStructuredError(error),
      });
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
(!this.blockchainConfig.subgraphEndpoint) { throw new Error("No PDP subgraph endpoint configured"); } @@ -190,7 +362,7 @@ export class PDPSubgraphService { try { await this.enforceRateLimit(); - const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + const response = await fetch(this.blockchainConfig.subgraphEndpoint, { method: "POST", headers: { "Content-Type": "application/json", @@ -235,13 +407,13 @@ export class PDPSubgraphService { } // Retry on network/HTTP errors - if (attempt < PDPSubgraphService.MAX_RETRIES) { - const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + if (attempt < SubgraphService.MAX_RETRIES) { + const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); this.logger.warn({ event: "subgraph_provider_request_retry", message: "Subgraph provider request failed. Retrying...", attempt, - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, retryDelayMs: delay, addressCount: addresses.length, error: toStructuredError(error), @@ -253,14 +425,12 @@ export class PDPSubgraphService { this.logger.error({ event: "subgraph_provider_request_failed", message: "Subgraph provider request failed after maximum retries", - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, blockNumber, addressCount: addresses.length, error: toStructuredError(error), }); - throw new Error( - `Failed to fetch provider data after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, - ); + throw new Error(`Failed to fetch provider data after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`); } } @@ -270,18 +440,18 @@ export class PDPSubgraphService { * Read more here: https://docs.goldsky.com/subgraphs/graphql-endpoints#public-endpoints */ private async enforceRateLimit(requestCount: number = 1): Promise { - if (requestCount > PDPSubgraphService.MAX_CONCURRENT_REQUESTS) { + if (requestCount > 
SubgraphService.MAX_CONCURRENT_REQUESTS) { throw new Error( - `Cannot request ${requestCount} items; exceeds rate limit window of ${PDPSubgraphService.MAX_CONCURRENT_REQUESTS}`, + `Cannot request ${requestCount} items; exceeds rate limit window of ${SubgraphService.MAX_CONCURRENT_REQUESTS}`, ); } const now = Date.now(); - const windowStart = now - PDPSubgraphService.RATE_LIMIT_WINDOW_MS; + const windowStart = now - SubgraphService.RATE_LIMIT_WINDOW_MS; this.requestTimestamps = this.requestTimestamps.filter((timestamp) => timestamp > windowStart); - const availableSlots = PDPSubgraphService.MAX_CONCURRENT_REQUESTS - this.requestTimestamps.length; + const availableSlots = SubgraphService.MAX_CONCURRENT_REQUESTS - this.requestTimestamps.length; if (requestCount > availableSlots) { const requiredSlots = requestCount - availableSlots; @@ -290,7 +460,7 @@ export class PDPSubgraphService { const oldestTimestamp = this.requestTimestamps[index] || now; // wait time with 10ms buffer - const waitTime = oldestTimestamp + PDPSubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; + const waitTime = oldestTimestamp + SubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; if (waitTime > 0) { await new Promise((resolve) => setTimeout(resolve, waitTime)); diff --git a/apps/backend/src/pdp-subgraph/types.spec.ts b/apps/backend/src/subgraph/types.spec.ts similarity index 100% rename from apps/backend/src/pdp-subgraph/types.spec.ts rename to apps/backend/src/subgraph/types.spec.ts diff --git a/apps/backend/src/pdp-subgraph/types.ts b/apps/backend/src/subgraph/types.ts similarity index 58% rename from apps/backend/src/pdp-subgraph/types.ts rename to apps/backend/src/subgraph/types.ts index ad8dcdc4..3a89f360 100644 --- a/apps/backend/src/pdp-subgraph/types.ts +++ b/apps/backend/src/subgraph/types.ts @@ -1,4 +1,5 @@ import Joi from "joi"; +import { CID } from "multiformats/cid"; import { Hex, isAddress } from "viem"; // ----------------------------------------- @@ -54,6 +55,58 @@ export type 
ProviderDataSetResponse = { }[]; }; +/** A piece eligible for anonymous retrieval. */ +export type AnonCandidatePiece = { + /** Decoded piece CID string (e.g. "bafk..."). */ + pieceCid: string; + /** On-chain piece ID (rootId) as a decimal string. */ + pieceId: string; + /** On-chain dataset ID (setId) as a decimal string. */ + dataSetId: string; + /** Raw piece size in bytes, as a decimal string. */ + rawSize: string; + /** True iff the parent dataset declared withIPFSIndexing metadata. */ + withIPFSIndexing: boolean; + /** IPFS root CID declared by the client when uploading, or null. */ + ipfsRootCid: string | null; + /** Subgraph-indexed block number at query time. */ + indexedAtBlock: number; + /** pdpPaymentEndEpoch from the parent dataset, or null. */ + pdpPaymentEndEpoch: bigint | null; +}; + +/** + * Validated raw shape of the anonymous piece sampling subgraph response. + * At most one root is returned (`first: 1`). + */ +export type RawSampleAnonPieceResponse = { + _meta: { block: { number: number } }; + roots: Array<{ + rootId: string; + cid: string; + rawSize: string; + ipfsRootCID: string | null; + proofSet: { + setId: string; + withIPFSIndexing: boolean; + fwssPayer: string | null; + pdpPaymentEndEpoch: string | null; + }; + }>; +}; + +// ----------------------------------------- +// Helpers +// ----------------------------------------- + +/** + * Decodes a hex-encoded CID (0x...) into its string representation. 
+ */ +export function decodePieceCid(hexData: string): string { + const bytes = Buffer.from(hexData.slice(2), "hex"); + return CID.decode(new Uint8Array(bytes)).toString(); +} + // ----------------------------------------- // Joi Custom Schema Converters // ----------------------------------------- @@ -117,6 +170,41 @@ const providerDataSetResponseSchema = Joi.object({ .unknown(true) .required(); +const sampleRootProofSetSchema = Joi.object({ + setId: Joi.string().pattern(/^\d+$/).required(), + withIPFSIndexing: Joi.boolean().required(), + fwssPayer: Joi.string() + .pattern(/^0x[0-9a-fA-F]{40}$/) + .allow(null) + .optional(), + pdpPaymentEndEpoch: Joi.string().pattern(/^\d+$/).allow(null).optional(), +}).unknown(true); + +const sampleRootSchema = Joi.object({ + rootId: Joi.string().pattern(/^\d+$/).required(), + cid: Joi.string() + .pattern(/^0x[0-9a-fA-F]+$/) + .required(), + rawSize: Joi.string().pattern(/^\d+$/).required(), + ipfsRootCID: Joi.string().allow(null).optional(), + proofSet: sampleRootProofSetSchema.required(), +}).unknown(true); + +const sampleAnonPieceResponseSchema = Joi.object({ + _meta: Joi.object({ + block: Joi.object({ + number: Joi.number().integer().positive().required(), + }) + .unknown(true) + .required(), + }) + .unknown(true) + .required(), + roots: Joi.array().items(sampleRootSchema).max(1).required(), +}) + .unknown(true) + .required(); + // ----------------------------------------- // Validator Functions // ----------------------------------------- @@ -149,3 +237,16 @@ export function validateProviderDataSetResponse(value: unknown): ProviderDataSet } return validated as ProviderDataSetResponse; } + +/** + * Validates the raw sampleAnonPiece response from the subgraph. 
+ * + * @throws Error if validation fails + */ +export function validateSampleAnonPieceResponse(value: unknown): RawSampleAnonPieceResponse { + const { error, value: validated } = sampleAnonPieceResponseSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid sampleAnonPiece response format: ${error.message}`); + } + return validated as RawSampleAnonPieceResponse; +} diff --git a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts index d6613a31..195db19f 100644 --- a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts +++ b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts @@ -18,7 +18,7 @@ const baseConfig: IBlockchainConfig = { checkDatasetCreationFees: false, useOnlyApprovedProviders: false, minNumDataSetsForChecks: 1, - pdpSubgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", + subgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", }; const makeProvider = (overrides: Partial): PDPProviderEx => diff --git a/docs/checks/data-retention.md b/docs/checks/data-retention.md index 605753e7..4eb7a912 100644 --- a/docs/checks/data-retention.md +++ b/docs/checks/data-retention.md @@ -27,7 +27,7 @@ Dealbot polls The Graph API endpoint for PDP (Proof of Data Possession) data at **Subgraph repository**: [FilOzone/pdp-explorer](https://github.com/FilOzone/pdp-explorer/blob/main/subgraph/src/pdp-verifier.ts) -**Subgraph endpoint**: Configured via `PDP_SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#pdp_subgraph_endpoint)) +**Subgraph endpoint**: Configured via `SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#subgraph_endpoint)) > **Note**: The production subgraph URL is currently being finalized [here](https://github.com/FilOzone/pdp-explorer/pull/86). 
@@ -48,7 +48,7 @@ From `GET_PROVIDERS_WITH_DATASETS` query for each provider: > **Note**: The subgraph query uses the field name `proofSets`, but this refers to "dataSets" in the current codebase. The terminology was updated from "proof set" to "data set" but the subgraph schema retains the old naming. -Source: [`pdp-subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) +Source: [`subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/subgraph/subgraph.service.ts) ### 2. Compute Challenge Totals and Overdue Estimates @@ -170,7 +170,7 @@ The PDP subgraph service enforces Goldsky's public endpoint rate limits: Rate limiting is enforced client-side to prevent 429 errors. -Source: [`pdp-subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) +Source: [`subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/subgraph/subgraph.service.ts) ## Metrics Recorded @@ -210,11 +210,11 @@ Key environment variables that control data retention check behavior: | Variable | Required | Default | Description | | ----------------------- | -------- | ------------ | ------------------------------------------------------------------------------------------------ | -| `PDP_SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | +| `SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | Source: [`app.config.ts`](../../apps/backend/src/config/app.config.ts) -See also: [`environment-variables.md`](../environment-variables.md#pdp_subgraph_endpoint) for the full configuration reference. +See also: [`environment-variables.md`](../environment-variables.md#subgraph_endpoint) for the full configuration reference. 
## Error Handling diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 5566904d..3d956aa4 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -40,7 +40,7 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| -| [`PDP_SUBGRAPH_ENDPOINT`](../environment-variables.md#pdp_subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | +| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | | [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. 
diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 359d86da..2f25943c 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -8,10 +8,10 @@ This document provides a comprehensive guide to all environment variables used b | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [Application](#application-configuration) | `NODE_ENV`, `DEALBOT_PORT`, `DEALBOT_HOST`, `DEALBOT_RUN_MODE`, `DEALBOT_METRICS_PORT`, `DEALBOT_METRICS_HOST`, `DEALBOT_ALLOWED_ORIGINS`, `ENABLE_DEV_MODE` | | [Database](#database-configuration) | `DATABASE_HOST`, `DATABASE_PORT`, `DATABASE_POOL_MAX`, `DATABASE_USER`, `DATABASE_PASSWORD`, `DATABASE_NAME` | -| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `PDP_SUBGRAPH_ENDPOINT` | +| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | | [Scheduling](#scheduling-configuration) | `PROVIDERS_REFRESH_INTERVAL_SECONDS`, `DATA_RETENTION_POLL_INTERVAL_SECONDS`, `DEALBOT_MAINTENANCE_WINDOWS_UTC`, `DEALBOT_MAINTENANCE_WINDOW_MINUTES` | -| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, 
`IPFS_BLOCK_FETCH_CONCURRENCY` | +| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `IPFS_BLOCK_FETCH_CONCURRENCY` | | [Dataset](#dataset-configuration) | `DEALBOT_LOCAL_DATASETS_PATH`, `RANDOM_PIECE_SIZES` | | [ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | @@ -425,22 +425,25 @@ Session keys are scoped (only storage operations, not deposits or withdrawals) a --- -### `PDP_SUBGRAPH_ENDPOINT` +### `SUBGRAPH_ENDPOINT` - **Type**: `string` (URL) - **Required**: No - **Default**: Empty string (feature disabled) -**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. This endpoint is used to retrieve data retention info for provider data. +**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. Drives the overdue-periods metric and the anonymous-retrieval candidate-piece query. + +The dealbot-owned subgraph lives at `apps/subgraph/` (package `@dealbot/subgraph`) and is deployed to Goldsky. Point this variable at one of those slots; the exact slugs are documented in `apps/subgraph/README.md`. **When to update**: -- When switching between different Graph API endpoints +- When swapping between the dealbot-owned subgraph slots on Goldsky (mainnet vs calibnet). +- When deploying a new subgraph version. 
**Example**: ```bash -PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public/<project_id>/subgraphs/dealbot-subgraph/<version>/gn ``` --- @@ -784,6 +787,25 @@ Use this to stagger multiple dealbot deployments that are not sharing a database **Note**: This is independent of HTTP-level timeouts. The job timeout enforces end-to-end execution time of a Retrieval Check job. +--- + +### `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS` + +- **Type**: `number` +- **Required**: No +- **Default**: `360` (6 minutes) +- **Minimum**: `60` +- **Enforced**: Yes (config validation) + +**Role**: Maximum runtime for anonymous retrieval jobs before forced abort. Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB) that were not produced by the dealbot, so this is typically larger than `RETRIEVAL_JOB_TIMEOUT_SECONDS`. When the timeout trips, partial metrics (`ttfb_ms`, `bytes_retrieved`, `response_code`) are still persisted so the abort is not silently lost. + +**When to update**: + +- Increase if large pieces are consistently being cut off mid-download +- Decrease to detect and fail stuck retrievals faster + +**Note**: This is independent of HTTP-level timeouts (`CONNECT_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`). The job timeout covers the end-to-end execution of an Anon Retrieval Check (piece selection, download, CommP validation, CAR/IPNI validation). 
+ --- ### `IPFS_BLOCK_FETCH_CONCURRENCY` diff --git a/kustomize/overlays/local/backend-configmap-local.yaml b/kustomize/overlays/local/backend-configmap-local.yaml index 9226d24e..b4febf61 100644 --- a/kustomize/overlays/local/backend-configmap-local.yaml +++ b/kustomize/overlays/local/backend-configmap-local.yaml @@ -26,7 +26,7 @@ data: PG_BOSS_LOCAL_CONCURRENCY: "3" JOB_WORKER_POLL_SECONDS: "60" RANDOM_PIECE_SIZES: "10485760" - PDP_SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" + SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" JOB_SCHEDULER_POLL_SECONDS: "60" CLICKHOUSE_URL: "http://default:@dealbot-clickhouse:8123/dealbot" DEALBOT_PROBE_LOCATION: "local" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0495aa11..8089b756 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1513,24 +1513,24 @@ packages: engines: {node: ^14.18.0 || >=16.10.0, npm: '>=5.10.0'} hasBin: true - '@oclif/core@4.10.5': - resolution: {integrity: sha512-qcdCF7NrdWPfme6Kr34wwljRCXbCVpL1WVxiNy0Ep6vbWKjxAjFQwuhqkoyL0yjI+KdwtLcOCGn5z2yzdijc8w==} + '@oclif/core@4.10.6': + resolution: {integrity: sha512-ySCOYnPKZE3KACT1V9It99hWG9b8E5MpagbRdWxPNRO3beMqmbr4SLUQoFtZ9XRtW++kks1ZVwZOdpnR8rpb9A==} engines: {node: '>=18.0.0'} '@oclif/core@4.5.5': resolution: {integrity: sha512-iQzlaJQgPeUXrtrX71OzDwxPikQ7c2FhNd8U8rBB7BCtj2XYfmzBT/Hmbc+g9OKDIG/JkbJT0fXaWMMBrhi+1A==} engines: {node: '>=18.0.0'} - '@oclif/plugin-autocomplete@3.2.45': - resolution: {integrity: sha512-ENrUg8rbVCjh40uvi3MC9kGbiUoEf11nyqE59RBzegeeLpRXNo/Zp27L9j1tUmPEqGgfS2/wvHPihNzkpK1FDw==} + '@oclif/plugin-autocomplete@3.2.46': + resolution: {integrity: sha512-TFvuD6JlmqEVsEvMqunyj3cyCz/l2Q4MqCjp/XtlSLS9x3xTlam7PGlqWi4WAhxl/K8CtpYqVlMYFEnlLTHspw==} engines: {node: '>=18.0.0'} - '@oclif/plugin-not-found@3.2.80': - resolution: {integrity: 
sha512-yTLjWvR1r/Rd/cO2LxHdMCDoL5sQhBYRUcOMCmxZtWVWhx4rAZ8KVUPDVsb+SvjJDV5ADTDBgt1H52fFx7YWqg==} + '@oclif/plugin-not-found@3.2.81': + resolution: {integrity: sha512-M88tLONBH36hLAbkFbmCo1hoZPSdU5l8Px1xEIlIgSmGMam+CoAzx4kGqpLbokgfpaHeP8/Jx3QJ18u9ef/2Qw==} engines: {node: '>=18.0.0'} - '@oclif/plugin-warn-if-update-available@3.1.60': - resolution: {integrity: sha512-cRKBZm14IuA6G8W84dfd3iXj3BTAoxQ5o3pUE8DKEQ4n/tVha20t5nkVeD+ISC68e0Fuw5koTMvRwXb1lJSnzg==} + '@oclif/plugin-warn-if-update-available@3.1.61': + resolution: {integrity: sha512-4XcrTxcCs+brR/eZ0BPeuiREiH3USlJiaHbUqPhnIBuyxhhUSYVd8ZO6s5MQN7AXJq4SMQ+B5zLaHq+ep/afIw==} engines: {node: '>=18.0.0'} '@open-draft/deferred-promise@2.2.0': @@ -7599,9 +7599,9 @@ snapshots: dependencies: '@float-capital/float-subgraph-uncrashable': 0.0.0-internal-testing.5 '@oclif/core': 4.5.5 - '@oclif/plugin-autocomplete': 3.2.45 - '@oclif/plugin-not-found': 3.2.80(@types/node@25.6.2) - '@oclif/plugin-warn-if-update-available': 3.1.60 + '@oclif/plugin-autocomplete': 3.2.46 + '@oclif/plugin-not-found': 3.2.81(@types/node@25.2.3) + '@oclif/plugin-warn-if-update-available': 3.1.61 '@pinax/graph-networks-registry': 0.7.1 '@whatwg-node/fetch': 0.10.13 assemblyscript: 0.19.23 @@ -8937,7 +8937,7 @@ snapshots: dependencies: consola: 3.4.2 - '@oclif/core@4.10.5': + '@oclif/core@4.10.6': dependencies: ansi-escapes: 4.3.2 ansis: 3.17.0 @@ -8979,7 +8979,7 @@ snapshots: wordwrap: 1.0.0 wrap-ansi: 7.0.0 - '@oclif/plugin-autocomplete@3.2.45': + '@oclif/plugin-autocomplete@3.2.46': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -8988,16 +8988,16 @@ snapshots: transitivePeerDependencies: - supports-color - '@oclif/plugin-not-found@3.2.80(@types/node@25.6.2)': + '@oclif/plugin-not-found@3.2.81(@types/node@25.2.3)': dependencies: - '@inquirer/prompts': 7.10.1(@types/node@25.6.2) - '@oclif/core': 4.10.5 + '@inquirer/prompts': 7.10.1(@types/node@25.2.3) + '@oclif/core': 4.10.6 ansis: 3.17.0 fast-levenshtein: 3.0.0 transitivePeerDependencies: - 
'@types/node' - '@oclif/plugin-warn-if-update-available@3.1.60': + '@oclif/plugin-warn-if-update-available@3.1.61': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -11779,7 +11779,7 @@ snapshots: dependencies: foreground-child: 3.3.1 jackspeak: 4.2.3 - minimatch: 10.2.4 + minimatch: 10.2.5 minipass: 7.1.2 package-json-from-dist: 1.0.1 path-scurry: 2.0.1 From 96c82c66f050f5de83c2530f74dda0b18c68618a Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 10:14:56 +0200 Subject: [PATCH 02/55] refactor(anon): only use clickhouse --- .../src/clickhouse/clickhouse.schema.ts | 36 ++++++ apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 100 ----------------- .../1762000000000-CreateAnonRetrievals.ts | 64 ----------- .../anon-piece-selector.service.spec.ts | 49 ++++----- .../anon-piece-selector.service.ts | 54 ++++----- .../anon-retrieval.service.spec.ts | 98 ++++++++++------- .../retrieval-anon/anon-retrieval.service.ts | 104 ++++++++++-------- .../retrieval-anon/retrieval-anon.module.ts | 3 +- 9 files changed, 208 insertions(+), 309 deletions(-) delete mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts delete mode 100644 apps/backend/src/database/migrations/1762000000000-CreateAnonRetrievals.ts diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 85d91052..8af769d7 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -62,6 +62,42 @@ export function buildMigrations(database: string): string[] { PARTITION BY toStartOfMonth(timestamp) TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.anon_retrieval_checks +( + timestamp DateTime64(3, 'UTC'), -- when the check completed + probe_location LowCardinality(String), -- dealbot location + sp_address String, -- storage provider address (lowercased) + sp_id Nullable(UInt64), -- storage 
provider numeric id + sp_name Nullable(String), -- storage provider name + + retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + + piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph + data_set_id UInt64, -- on-chain data set id + piece_id UInt64, -- on-chain piece id within the data set + raw_size UInt64, -- raw (unpadded) piece size, bytes + with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata + ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed + + service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) + retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) + + status LowCardinality(String), -- RetrievalStatus: 'success' | 'failed' | 'pending' | 'in_progress' | 'timeout' + http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure + first_byte_ms Nullable(Float64), -- time to first response byte + last_byte_ms Nullable(Float64), -- time to last response byte + bytes_retrieved Nullable(UInt64), -- bytes received from /piece/{cid} + throughput_bps Nullable(UInt64), -- effective throughput, bytes per second + + commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed + car_valid Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed) + + error_message Nullable(String) -- failure reason; null on success +) ENGINE MergeTree() + PRIMARY KEY (probe_location, sp_address, timestamp) + PARTITION BY toStartOfMonth(timestamp) + TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.data_retention_challenges ( timestamp DateTime64(3, 'UTC'), -- when the poll ran and detected these periods diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index f3f9ed09..9249c3a9 100644 --- a/apps/backend/src/database/database.module.ts +++ 
b/apps/backend/src/database/database.module.ts @@ -7,7 +7,6 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; -import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -50,7 +49,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { - await queryRunner.query(` - CREATE TABLE anon_retrievals ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - sp_address VARCHAR NOT NULL, - piece_cid VARCHAR NOT NULL, - data_set_id BIGINT NOT NULL, - piece_id BIGINT NOT NULL, - raw_size BIGINT NOT NULL, - with_ipfs_indexing BOOLEAN NOT NULL, - ipfs_root_cid VARCHAR NULL, - service_type VARCHAR NOT NULL DEFAULT 'direct_sp', - retrieval_endpoint VARCHAR NOT NULL, - status VARCHAR NOT NULL DEFAULT 'pending', - started_at TIMESTAMPTZ NOT NULL, - completed_at TIMESTAMPTZ NULL, - latency_ms INT NULL, - ttfb_ms INT NULL, - throughput_bps INT NULL, - bytes_retrieved BIGINT NULL, - response_code INT NULL, - error_message VARCHAR NULL, - commp_valid BOOLEAN NULL, - car_valid BOOLEAN NULL, - created_at TIMESTAMPTZ NOT NULL DEFAULT now(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT now() - ) - `); - - // Per-SP dashboards. - await queryRunner.query(` - CREATE INDEX "IDX_anon_retrievals_sp_address" - ON anon_retrievals (sp_address) - `); - - // Used by the recent-dedup query in AnonPieceSelectorService — keeps the - // most-recently-tested CIDs out of the next selection. - await queryRunner.query(` - CREATE INDEX "IDX_anon_retrievals_piece_cid" - ON anon_retrievals (piece_cid) - `); - - // Supports "last N anonymous retrievals" ordering used by the selector. 
- await queryRunner.query(` - CREATE INDEX "IDX_anon_retrievals_created_at" - ON anon_retrievals (created_at DESC) - `); - } - - public async down(queryRunner: QueryRunner): Promise { - await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals`); - } -} diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts index b822fe5f..6a787fbb 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -1,8 +1,6 @@ import type { ConfigService } from "@nestjs/config"; -import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { IConfig } from "../config/app.config.js"; -import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import type { SampleAnonPieceParams, SubgraphService } from "../subgraph/subgraph.service.js"; import type { AnonCandidatePiece } from "../subgraph/types.js"; import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; @@ -22,18 +20,6 @@ const makePiece = (overrides: Partial = {}): AnonCandidatePi ...overrides, }); -const makeRetrievalRepository = (recentPieceCids: string[]): Repository => { - const queryBuilder = { - select: vi.fn().mockReturnThis(), - orderBy: vi.fn().mockReturnThis(), - limit: vi.fn().mockReturnThis(), - getRawMany: vi.fn().mockResolvedValue(recentPieceCids.map((c) => ({ pieceCid: c }))), - }; - return { - createQueryBuilder: vi.fn().mockReturnValue(queryBuilder), - } as unknown as Repository; -}; - const makeConfigService = (): ConfigService => ({ get: vi.fn((key: string) => { @@ -55,7 +41,7 @@ describe("AnonPieceSelectorService", () => { it("returns null when every fallback attempt yields no piece", async () => { sampleAnonPiece.mockResolvedValue(null); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), 
makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); @@ -65,7 +51,7 @@ describe("AnonPieceSelectorService", () => { it("returns the sampled piece with SP address lowercased", async () => { sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: "baga-the-one" })); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); @@ -76,7 +62,7 @@ describe("AnonPieceSelectorService", () => { it("passes the dealbot payer address to sampleAnonPiece for exclusion", async () => { sampleAnonPiece.mockResolvedValueOnce(makePiece()); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); await service.selectPieceForProvider(SP_ADDRESS); @@ -92,27 +78,30 @@ describe("AnonPieceSelectorService", () => { .mockResolvedValueOnce(makePiece({ pieceCid: staleCid, pdpPaymentEndEpoch: 100n, indexedAtBlock: 200 })) .mockResolvedValueOnce(makePiece({ pieceCid: freshCid, pdpPaymentEndEpoch: null })); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); expect(result?.pieceCid).toBe(freshCid); }); - it("redraws when the first sampled piece was recently tested", async () => { + it("redraws when the first sampled piece was recently selected by this process", async () => { const staleCid = "baga-stale"; const freshCid = "baga-fresh"; + + const service = new AnonPieceSelectorService(subgraphService, 
makeConfigService()); + + // Prime the in-memory ring buffer by first selecting `staleCid`. + sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: staleCid })); + const first = await service.selectPieceForProvider(SP_ADDRESS); + expect(first?.pieceCid).toBe(staleCid); + + // Now the second selection should skip `staleCid` and use `freshCid`. sampleAnonPiece .mockResolvedValueOnce(makePiece({ pieceCid: staleCid })) .mockResolvedValueOnce(makePiece({ pieceCid: freshCid })); + const second = await service.selectPieceForProvider(SP_ADDRESS); - const service = new AnonPieceSelectorService( - subgraphService, - makeConfigService(), - makeRetrievalRepository([staleCid]), - ); - const result = await service.selectPieceForProvider(SP_ADDRESS); - - expect(result?.pieceCid).toBe(freshCid); + expect(second?.pieceCid).toBe(freshCid); }); it("falls back to the opposite pool when the preferred one is empty", async () => { @@ -120,7 +109,7 @@ describe("AnonPieceSelectorService", () => { const fresh = makePiece({ pieceCid: "baga-other-pool" }); sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(null).mockResolvedValueOnce(fresh); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); expect(result?.pieceCid).toBe("baga-other-pool"); @@ -141,7 +130,7 @@ describe("AnonPieceSelectorService", () => { .mockResolvedValueOnce(null) .mockResolvedValueOnce(makePiece({ pieceCid: "baga-any-bucket" })); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); expect(result?.pieceCid).toBe("baga-any-bucket"); @@ -156,7 +145,7 @@ 
describe("AnonPieceSelectorService", () => { it("draws a fresh sampleKey for each subgraph call", async () => { sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(makePiece()); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); await service.selectPieceForProvider(SP_ADDRESS); const call1 = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index acc19832..8de50fa3 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -1,10 +1,7 @@ import { randomBytes } from "node:crypto"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; -import { InjectRepository } from "@nestjs/typeorm"; -import type { Repository } from "typeorm"; import type { IConfig } from "../config/app.config.js"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import type { AnonPiecePool, SampleAnonPieceParams } from "../subgraph/subgraph.service.js"; import { SubgraphService } from "../subgraph/subgraph.service.js"; import type { AnonCandidatePiece } from "../subgraph/types.js"; @@ -15,6 +12,9 @@ import type { AnonPiece } from "./types.js"; * to avoid immediately retesting the same piece. Piece CIDs are globally * unique and each one lives on a single SP's dataset, so scoping by CID * is equivalent to scoping by (SP, CID) for this workload. + * + * The buffer is process-local: a duplicate piece that gets retested shortly + * after a restart is harmless (still a valid measurement, just less diverse). 
*/ const RECENT_DEDUP_WINDOW = 500; @@ -44,7 +44,7 @@ const BUCKET_WEIGHTS: Record = { /** * Probability the primary draw targets the withIPFSIndexing pool. - * The rest of the time we sample across all FWSS pieces so SPs can't + * The rest of the time we sample across all FWSS pieces, so SPs can't * optimise only their CAR corpus. */ const IPFS_INDEXED_SAMPLE_RATE = 0.8; @@ -53,11 +53,13 @@ const IPFS_INDEXED_SAMPLE_RATE = 0.8; export class AnonPieceSelectorService { private readonly logger = new Logger(AnonPieceSelectorService.name); + /** Bounded FIFO of recently-selected piece CIDs. Process-local; lost on restart. */ + private readonly recentlyTested = new Set(); + private readonly recentlyTestedQueue: string[] = []; + constructor( private readonly subgraphService: SubgraphService, private readonly configService: ConfigService, - @InjectRepository(AnonRetrieval) - private readonly anonRetrievalRepository: Repository, ) {} /** @@ -75,14 +77,13 @@ export class AnonPieceSelectorService { */ async selectPieceForProvider(spAddress: string): Promise { const dealbotPayer = this.configService.get("blockchain", { infer: true }).walletAddress; - const recentlyTested = await this.loadRecentlyTestedPieceCids(); const bucket = this.pickBucket(); const pool: AnonPiecePool = Math.random() < IPFS_INDEXED_SAMPLE_RATE ? "indexed" : "any"; const attempts: Array<{ bucket: SizeBucket | "any"; pool: AnonPiecePool }> = [ - { bucket, pool }, - { bucket, pool: pool === "indexed" ? "any" : "indexed" }, + { bucket: bucket, pool: pool }, + { bucket: bucket, pool: pool === "indexed" ? 
"any" : "indexed" }, { bucket: "any", pool: "indexed" }, { bucket: "any", pool: "any" }, ]; @@ -93,10 +94,10 @@ export class AnonPieceSelectorService { dealbotPayer, bucket: attempt.bucket, pool: attempt.pool, - recentlyTested, }); if (piece) { + this.rememberRecent(piece.pieceCid); this.logger.log({ event: "anon_piece_selected", message: "Selected anonymous piece for retrieval test", @@ -107,6 +108,7 @@ export class AnonPieceSelectorService { bucket: attempt.bucket, pool: attempt.pool, }); + return { pieceCid: piece.pieceCid, dataSetId: piece.dataSetId, @@ -124,6 +126,7 @@ export class AnonPieceSelectorService { message: "No anonymous piece found after all fallbacks", spAddress, }); + return null; } @@ -136,7 +139,6 @@ export class AnonPieceSelectorService { dealbotPayer: string; bucket: SizeBucket | "any"; pool: AnonPiecePool; - recentlyTested: Set; }): Promise { const range = args.bucket === "any" ? fullRange() : SIZE_BUCKETS[args.bucket]; @@ -159,7 +161,7 @@ export class AnonPieceSelectorService { continue; } - if (args.recentlyTested.has(piece.pieceCid)) { + if (this.recentlyTested.has(piece.pieceCid)) { continue; } @@ -181,19 +183,21 @@ export class AnonPieceSelectorService { return "medium"; } - /** - * Return the set of piece CIDs tested in the last RECENT_DEDUP_WINDOW - * anonymous retrievals across all SPs. - */ - private async loadRecentlyTestedPieceCids(): Promise> { - const rows = await this.anonRetrievalRepository - .createQueryBuilder("r") - .select("r.piece_cid", "pieceCid") - .orderBy("r.created_at", "DESC") - .limit(RECENT_DEDUP_WINDOW) - .getRawMany<{ pieceCid: string }>(); - - return new Set(rows.map((row) => row.pieceCid)); + /** Push a CID into the bounded FIFO; evict the oldest when at capacity. 
*/ + private rememberRecent(pieceCid: string): void { + if (this.recentlyTested.has(pieceCid)) { + return; + } + + this.recentlyTested.add(pieceCid); + this.recentlyTestedQueue.push(pieceCid); + + while (this.recentlyTestedQueue.length > RECENT_DEDUP_WINDOW) { + const evicted = this.recentlyTestedQueue.shift(); + if (evicted !== undefined) { + this.recentlyTested.delete(evicted); + } + } } } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 61e97105..e6619e32 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -1,6 +1,6 @@ import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { RetrievalStatus } from "../database/types.js"; import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; @@ -35,20 +35,18 @@ function makeProvider(): StorageProvider { function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; + clickhouseEnabled?: boolean; }): { service: AnonRetrievalService; - saveSpy: ReturnType; + insertSpy: ReturnType; fetchSpy: ReturnType; } { - const saveSpy = vi.fn(async (entity: AnonRetrieval) => entity); - const createdEntities: Partial[] = []; - const anonRetrievalRepository = { - create: vi.fn((data: Partial) => { - createdEntities.push(data); - return data; - }), - save: saveSpy, - } as unknown as Repository; + const insertSpy = vi.fn(); + const clickhouseService = { + insert: insertSpy, + enabled: opts.clickhouseEnabled ?? 
true, + probeLocation: "test-location", + } as unknown as ClickhouseService; const spRepository = { findOne: vi.fn(async () => makeProvider()), @@ -89,11 +87,11 @@ function makeService(opts: { carValidationService, walletSdkService, metrics, - anonRetrievalRepository, + clickhouseService, spRepository, ); - return { service, saveSpy, fetchSpy }; + return { service, insertSpy, fetchSpy }; } describe("AnonRetrievalService", () => { @@ -101,7 +99,7 @@ describe("AnonRetrievalService", () => { vi.clearAllMocks(); }); - it("persists partial metrics when fetchPiece returns aborted=true", async () => { + it("emits a ClickHouse row with partial metrics when fetchPiece returns aborted=true", async () => { const partial: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -116,22 +114,28 @@ describe("AnonRetrievalService", () => { aborted: true, }; - const { service, saveSpy } = makeService({ pieceResult: partial }); + const { service, insertSpy } = makeService({ pieceResult: partial }); await service.performForProvider(SP_ADDRESS); - expect(saveSpy).toHaveBeenCalledTimes(1); - const saved = saveSpy.mock.calls[0][0] as Partial; - expect(saved.status).toBe(RetrievalStatus.FAILED); - expect(saved.bytesRetrieved).toBe(524288); - expect(saved.ttfbMs).toBe(150); - expect(saved.latencyMs).toBe(42000); - expect(saved.throughputBps).toBe(12500); - expect(saved.responseCode).toBe(200); - expect(saved.errorMessage).toContain("Anon retrieval job timeout"); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [table, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(table).toBe("anon_retrieval_checks"); + expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.bytes_retrieved).toBe(524288); + expect(row.first_byte_ms).toBe(150); + expect(row.last_byte_ms).toBe(42000); + expect(row.throughput_bps).toBe(12500); + expect(row.http_response_code).toBe(200); + expect(row.error_message).toContain("Anon retrieval job timeout"); + 
expect(row.piece_cid).toBe(PIECE.pieceCid); + expect(row.sp_address).toBe(SP_ADDRESS); + expect(row.sp_id).toBe(7); + expect(row.probe_location).toBe("test-location"); + expect(typeof row.retrieval_id).toBe("string"); }); - it("still saves a row when the signal aborts before fetchPiece runs", async () => { + it("still emits a row when the signal aborts before fetchPiece runs", async () => { const ac = new AbortController(); ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); @@ -147,20 +151,20 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy, fetchSpy } = makeService({ pieceResult: never }); + const { service, insertSpy, fetchSpy } = makeService({ pieceResult: never }); await service.performForProvider(SP_ADDRESS, ac.signal); expect(fetchSpy).not.toHaveBeenCalled(); - expect(saveSpy).toHaveBeenCalledTimes(1); - const saved = saveSpy.mock.calls[0][0] as Partial; - expect(saved.status).toBe(RetrievalStatus.FAILED); - expect(saved.errorMessage).toContain("Anon retrieval job timeout"); - expect(saved.bytesRetrieved).toBeNull(); - expect(saved.ttfbMs).toBeNull(); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.error_message).toContain("Anon retrieval job timeout"); + expect(row.bytes_retrieved).toBeNull(); + expect(row.first_byte_ms).toBeNull(); }); - it("still saves a row when fetchPiece throws unexpectedly", async () => { + it("still emits a row when fetchPiece throws unexpectedly", async () => { const never: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -173,7 +177,7 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: never, fetchPieceImpl: async () => { throw new Error("network down"); @@ -182,8 +186,28 @@ describe("AnonRetrievalService", () => { 
await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); - expect(saveSpy).toHaveBeenCalledTimes(1); - const saved = saveSpy.mock.calls[0][0] as Partial; - expect(saved.status).toBe(RetrievalStatus.FAILED); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.status).toBe(RetrievalStatus.FAILED); + }); + + it("skips ClickHouse insert when ClickHouse is disabled", async () => { + const ok: PieceRetrievalResult = { + success: true, + pieceCid: PIECE.pieceCid, + bytesReceived: 1024, + pieceBytes: null, + latencyMs: 100, + ttfbMs: 10, + throughputBps: 10240, + statusCode: 200, + commPValid: true, + }; + + const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); + + await service.performForProvider(SP_ADDRESS); + + expect(insertSpy).not.toHaveBeenCalled(); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index d40fe315..1d56d2f0 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -1,8 +1,9 @@ +import { randomUUID } from "node:crypto"; import { Injectable, Logger } from "@nestjs/common"; import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; +import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { RetrievalStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; @@ -13,6 +14,8 @@ import { CarValidationService } from "./car-validation.service.js"; import { PieceRetrievalService } from 
"./piece-retrieval.service.js"; import type { CarValidationResult, PieceRetrievalResult } from "./types.js"; +const ANON_RETRIEVAL_CHECKS_TABLE = "anon_retrieval_checks"; + @Injectable() export class AnonRetrievalService { private readonly logger = new Logger(AnonRetrievalService.name); @@ -23,17 +26,12 @@ export class AnonRetrievalService { private readonly carValidationService: CarValidationService, private readonly walletSdkService: WalletSdkService, private readonly metrics: AnonRetrievalCheckMetrics, - @InjectRepository(AnonRetrieval) - private readonly anonRetrievalRepository: Repository, + private readonly clickhouseService: ClickhouseService, @InjectRepository(StorageProvider) private readonly spRepository: Repository, ) {} - async performForProvider( - spAddress: string, - signal?: AbortSignal, - logContext?: ProviderJobContext, - ): Promise { + async performForProvider(spAddress: string, signal?: AbortSignal, logContext?: ProviderJobContext): Promise { // Build metric labels const provider = await this.spRepository.findOne({ where: { address: spAddress } }); const labels = buildCheckMetricLabels({ @@ -53,7 +51,7 @@ export class AnonRetrievalService { spAddress, }); this.metrics.recordStatus(labels, "failure.no_piece"); - return null; + return; } this.logger.log({ @@ -72,7 +70,6 @@ export class AnonRetrievalService { let pieceResult: PieceRetrievalResult | null = null; let carResult: CarValidationResult | null = null; - let saved: AnonRetrieval | null = null; try { // 2. Fetch the piece. fetchPiece never throws on abort — it returns a @@ -141,16 +138,15 @@ export class AnonRetrievalService { pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", ); } finally { - // Always save a record — even on abort or unexpected error — so we never - // lose the evidence (ttfb, bytes, response code) we already collected. 
+ // Always emit a ClickHouse row — even on abort or unexpected error — so + // we never lose the evidence (ttfb, bytes, response code) we already + // collected. pieceResult ??= buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - saved = await this.saveRetrievalRecord(spAddress, piece, pieceResult, carResult, startedAt, logContext); + this.emitClickhouseRow(spAddress, piece, pieceResult, carResult, startedAt, provider, logContext); } - - return saved; } - private async saveRetrievalRecord( + private emitClickhouseRow( spAddress: string, piece: { pieceCid: string; @@ -163,52 +159,70 @@ export class AnonRetrievalService { pieceResult: PieceRetrievalResult, carResult: CarValidationResult | null, startedAt: Date, + provider: StorageProvider | null, logContext?: ProviderJobContext, - ): Promise { + ): void { + if (!this.clickhouseService.enabled) { + this.logger.debug({ + ...logContext, + event: "anon_retrieval_clickhouse_disabled", + message: "ClickHouse disabled — anon retrieval row not emitted", + pieceCid: piece.pieceCid, + spAddress, + }); + return; + } + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - - const retrieval = this.anonRetrievalRepository.create({ - spAddress, - pieceCid: piece.pieceCid, - dataSetId: BigInt(piece.dataSetId), - pieceId: BigInt(piece.pieceId), - rawSize: BigInt(piece.rawSize), - withIpfsIndexing: piece.withIPFSIndexing, - ipfsRootCid: piece.ipfsRootCid, - serviceType: ServiceType.DIRECT_SP, - retrievalEndpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - status: pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED, - startedAt, - completedAt: new Date(), - latencyMs: pieceResult.latencyMs > 0 ? Math.round(pieceResult.latencyMs) : null, - ttfbMs: pieceResult.ttfbMs > 0 ? Math.round(pieceResult.ttfbMs) : null, - throughputBps: pieceResult.throughputBps > 0 ? 
Math.round(pieceResult.throughputBps) : null, - bytesRetrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, - responseCode: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, - errorMessage: pieceResult.errorMessage ?? null, - commpValid: pieceResult.success ? pieceResult.commPValid : null, - carValid: carResult ? carResult.ipniValid !== false && carResult.blockFetchValid !== false : null, - }); + const status = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const carValid = carResult ? carResult.ipniValid !== false && carResult.blockFetchValid !== false : null; + const retrievalId = randomUUID(); try { - await this.anonRetrievalRepository.save(retrieval); + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + status, + http_response_code: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, + first_byte_ms: pieceResult.ttfbMs > 0 ? pieceResult.ttfbMs : null, + last_byte_ms: pieceResult.latencyMs > 0 ? pieceResult.latencyMs : null, + bytes_retrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, + throughput_bps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, + commp_valid: pieceResult.success ? pieceResult.commPValid : null, + car_valid: carValid, + error_message: pieceResult.errorMessage ?? 
null, + }); } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. this.logger.warn({ ...logContext, - event: "anon_retrieval_save_failed", - message: "Failed to save anonymous retrieval record", + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", pieceCid: piece.pieceCid, spAddress, error: toStructuredError(error), }); - return null; } this.logger.log({ ...logContext, event: "anon_retrieval_completed", message: "Anonymous retrieval test completed", + retrievalId, pieceCid: piece.pieceCid, spAddress, success: pieceResult.success, @@ -220,8 +234,6 @@ export class AnonRetrievalService { ipniValid: carResult?.ipniValid, blockFetchValid: carResult?.blockFetchValid, }); - - return retrieval; } } diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts index 4e9e38df..c05dcb5f 100644 --- a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -1,7 +1,6 @@ import { Module } from "@nestjs/common"; import { ConfigModule } from "@nestjs/config"; import { TypeOrmModule } from "@nestjs/typeorm"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { IpniModule } from "../ipni/ipni.module.js"; @@ -15,7 +14,7 @@ import { PieceRetrievalService } from "./piece-retrieval.service.js"; @Module({ imports: [ ConfigModule, - TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), + TypeOrmModule.forFeature([StorageProvider]), SubgraphModule, WalletSdkModule, HttpClientModule, From 81a38b1fa9fa62e8cd6707e74058bb1b0454c084 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein 
Date: Wed, 29 Apr 2026 11:01:55 +0200 Subject: [PATCH 03/55] feat(retrieval-anon): track ipni metrics --- .../src/clickhouse/clickhouse.schema.ts | 68 +++++++++++-------- .../anon-retrieval.service.spec.ts | 6 +- .../retrieval-anon/anon-retrieval.service.ts | 18 +++-- .../retrieval-anon/car-validation.service.ts | 65 ++++++++++++++---- apps/backend/src/retrieval-anon/types.ts | 5 ++ 5 files changed, 112 insertions(+), 50 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 8af769d7..e30f6151 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -64,35 +64,45 @@ export function buildMigrations(database: string): string[] { `CREATE TABLE IF NOT EXISTS ${database}.anon_retrieval_checks ( - timestamp DateTime64(3, 'UTC'), -- when the check completed - probe_location LowCardinality(String), -- dealbot location - sp_address String, -- storage provider address (lowercased) - sp_id Nullable(UInt64), -- storage provider numeric id - sp_name Nullable(String), -- storage provider name - - retrieval_id UUID, -- per-event correlation id (log/Prometheus join) - - piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph - data_set_id UInt64, -- on-chain data set id - piece_id UInt64, -- on-chain piece id within the data set - raw_size UInt64, -- raw (unpadded) piece size, bytes - with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata - ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed - - service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) - retrieval_endpoint String, -- URL probed (e.g. 
{spBaseUrl}/piece/{pieceCid}) - - status LowCardinality(String), -- RetrievalStatus: 'success' | 'failed' | 'pending' | 'in_progress' | 'timeout' - http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure - first_byte_ms Nullable(Float64), -- time to first response byte - last_byte_ms Nullable(Float64), -- time to last response byte - bytes_retrieved Nullable(UInt64), -- bytes received from /piece/{cid} - throughput_bps Nullable(UInt64), -- effective throughput, bytes per second - - commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed - car_valid Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed) - - error_message Nullable(String) -- failure reason; null on success + timestamp DateTime64(3, 'UTC'), -- when the check completed + probe_location LowCardinality(String), -- dealbot location + sp_address String, -- storage provider address (lowercased) + sp_id Nullable(UInt64), -- storage provider numeric id + sp_name Nullable(String), -- storage provider name + + retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + + piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph + data_set_id UInt64, -- on-chain data set id + piece_id UInt64, -- on-chain piece id within the data set + raw_size UInt64, -- raw (unpadded) piece size, bytes + with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata + ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed + + service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) + retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) + + piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). CAR/IPNI/block-fetch outcomes live in their own columns. 
+ http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure + first_byte_ms Nullable(Float64), -- time to first response byte + last_byte_ms Nullable(Float64), -- time to last response byte + bytes_retrieved Nullable(UInt64), -- bytes received from /piece/{cid} + throughput_bps Nullable(UInt64), -- effective throughput, bytes per second + + commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed + car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR + car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable + block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. {spBaseUrl}/ipfs/); null when skipped + block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified + block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw + block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) + + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) + ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped + ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI + ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable + + error_message Nullable(String) -- failure reason; null on success ) ENGINE MergeTree() PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index e6619e32..275a3de2 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ 
b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -121,7 +121,7 @@ describe("AnonRetrievalService", () => { expect(insertSpy).toHaveBeenCalledTimes(1); const [table, row] = insertSpy.mock.calls[0] as [string, Record]; expect(table).toBe("anon_retrieval_checks"); - expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.bytes_retrieved).toBe(524288); expect(row.first_byte_ms).toBe(150); expect(row.last_byte_ms).toBe(42000); @@ -158,7 +158,7 @@ describe("AnonRetrievalService", () => { expect(fetchSpy).not.toHaveBeenCalled(); expect(insertSpy).toHaveBeenCalledTimes(1); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.error_message).toContain("Anon retrieval job timeout"); expect(row.bytes_retrieved).toBeNull(); expect(row.first_byte_ms).toBeNull(); @@ -188,7 +188,7 @@ describe("AnonRetrievalService", () => { expect(insertSpy).toHaveBeenCalledTimes(1); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); }); it("skips ClickHouse insert when ClickHouse is disabled", async () => { diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 1d56d2f0..8f2e135a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -175,8 +175,9 @@ export class AnonRetrievalService { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const status = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const carValid = carResult ? 
carResult.ipniValid !== false && carResult.blockFetchValid !== false : null; + const pieceFetchStatus = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus = + carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; const retrievalId = randomUUID(); try { @@ -195,14 +196,23 @@ export class AnonRetrievalService { ipfs_root_cid: piece.ipfsRootCid, service_type: ServiceType.DIRECT_SP, retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - status, + piece_fetch_status: pieceFetchStatus, http_response_code: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, first_byte_ms: pieceResult.ttfbMs > 0 ? pieceResult.ttfbMs : null, last_byte_ms: pieceResult.latencyMs > 0 ? pieceResult.latencyMs : null, bytes_retrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, throughput_bps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, commp_valid: pieceResult.success ? pieceResult.commPValid : null, - car_valid: carValid, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult?.carParseable ? carResult.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, error_message: pieceResult.errorMessage ?? 
null, }); } catch (error) { diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 8019b8df..017a38e8 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -48,7 +48,18 @@ export class CarValidationService { ): Promise { const blocks = await this.parseCar(pieceBytes, provider.address, ipfsRootCid); if (blocks === null) { - return { carParseable: false, blockCount: 0, sampledCidCount: 0, ipniValid: null, blockFetchValid: null }; + return { + carParseable: false, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + ipniVerifyMs: null, + ipniVerifiedCidsCount: null, + ipniUnverifiedCidsCount: null, + blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, + }; } if (blocks.length === 0) { return { @@ -56,7 +67,12 @@ export class CarValidationService { blockCount: 0, sampledCidCount: 0, ipniValid: null, + ipniVerifyMs: null, + ipniVerifiedCidsCount: null, + ipniUnverifiedCidsCount: null, blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, errorMessage: "CAR contained no blocks", }; } @@ -65,15 +81,20 @@ export class CarValidationService { const shuffled = [...blocks].sort(() => Math.random() - 0.5); const sampledBlocks = shuffled.slice(0, sampleCount); - const ipniValid = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); + const ipni = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); const blockFetchResult = await this.checkBlockFetch(sampledBlocks, provider.address, signal); return { carParseable: true, blockCount: blocks.length, sampledCidCount: sampledBlocks.length, - ipniValid, + ipniValid: ipni.valid, + ipniVerifyMs: ipni.durationMs, + ipniVerifiedCidsCount: ipni.verifiedCount, + ipniUnverifiedCidsCount: ipni.unverifiedCount, blockFetchValid: blockFetchResult.valid, + blockFetchFailedCount: 
blockFetchResult.failedCount, + blockFetchEndpoint: blockFetchResult.endpoint, errorMessage: blockFetchResult.errorMessage, }; } @@ -111,7 +132,12 @@ export class CarValidationService { ipfsRootCid: string, sampledBlocks: ReadonlyArray<{ cid: CID }>, signal?: AbortSignal, - ): Promise { + ): Promise<{ + valid: boolean; + durationMs: number | null; + verifiedCount: number | null; + unverifiedCount: number | null; + }> { const timeouts = this.configService.get("timeouts", { infer: true }); let rootCid: CID; try { @@ -124,7 +150,7 @@ export class CarValidationService { providerAddress: provider.address, error: toStructuredError(error), }); - return false; + return { valid: false, durationMs: null, verifiedCount: null, unverifiedCount: null }; } const result = await this.ipniVerificationService.verify({ @@ -136,7 +162,12 @@ export class CarValidationService { signal, }); - return result.rootCIDVerified; + return { + valid: result.rootCIDVerified, + durationMs: result.durationMs, + verifiedCount: result.verified, + unverifiedCount: result.unverified, + }; } /** @@ -148,14 +179,20 @@ export class CarValidationService { sampledBlocks: ReadonlyArray<{ cid: CID; bytes: Uint8Array }>, spAddress: string, signal?: AbortSignal, - ): Promise<{ valid: boolean | null; errorMessage?: string }> { + ): Promise<{ valid: boolean | null; failedCount: number | null; endpoint: string | null; errorMessage?: string }> { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); if (!providerInfo) { - return { valid: null, errorMessage: `Provider info not found for ${spAddress}` }; + return { + valid: null, + failedCount: null, + endpoint: null, + errorMessage: `Provider info not found for ${spAddress}`, + }; } const spBaseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, ""); - let allValid = true; + const endpoint = `${spBaseUrl}/ipfs/`; + let failedCount = 0; for (const block of sampledBlocks) { signal?.throwIfAborted(); @@ -170,7 +207,7 @@ export class CarValidationService { 
}); if (resp.metrics.statusCode < 200 || resp.metrics.statusCode >= 300) { - allValid = false; + failedCount += 1; this.logger.warn({ event: "block_fetch_non_2xx", message: "Block fetch returned non-2xx status", @@ -188,7 +225,7 @@ export class CarValidationService { cid: cidStr, spAddress, }); - allValid = false; + failedCount += 1; continue; } @@ -200,14 +237,14 @@ export class CarValidationService { cid: cidStr, spAddress, }); - allValid = false; + failedCount += 1; continue; } // Hash-verifies and decodes; throws on mismatch await createBlock({ bytes: resp.data, cid: block.cid, hasher: sha256, codec }); } catch (error) { - allValid = false; + failedCount += 1; this.logger.warn({ event: "block_fetch_failed", message: "Block fetch or hash verification failed", @@ -218,6 +255,6 @@ export class CarValidationService { } } - return { valid: allValid }; + return { valid: failedCount === 0, failedCount, endpoint }; } } diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts index 2c3384d5..3ba2b9f9 100644 --- a/apps/backend/src/retrieval-anon/types.ts +++ b/apps/backend/src/retrieval-anon/types.ts @@ -30,6 +30,11 @@ export type CarValidationResult = { blockCount: number; sampledCidCount: number; ipniValid: boolean | null; + ipniVerifyMs: number | null; + ipniVerifiedCidsCount: number | null; + ipniUnverifiedCidsCount: number | null; blockFetchValid: boolean | null; + blockFetchFailedCount: number | null; + blockFetchEndpoint: string | null; errorMessage?: string; }; From 072a096b44ca2194bf2607f96abbba66364aae11 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 12:57:14 +0200 Subject: [PATCH 04/55] test(retrieval-anon): new ipni fields --- .../anon-retrieval.service.spec.ts | 157 +++++++++++++++++- 1 file changed, 153 insertions(+), 4 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 
275a3de2..812b8169 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -9,7 +9,7 @@ import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js" import { AnonRetrievalService } from "./anon-retrieval.service.js"; import type { CarValidationService } from "./car-validation.service.js"; import type { PieceRetrievalService } from "./piece-retrieval.service.js"; -import type { PieceRetrievalResult } from "./types.js"; +import type { AnonPiece, CarValidationResult, PieceRetrievalResult } from "./types.js"; const SP_ADDRESS = "0xaaaa0000000000000000000000000000000000aa"; @@ -36,10 +36,13 @@ function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; clickhouseEnabled?: boolean; + piece?: AnonPiece; + carResult?: CarValidationResult; }): { service: AnonRetrievalService; insertSpy: ReturnType; fetchSpy: ReturnType; + validateCarSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -53,7 +56,7 @@ function makeService(opts: { } as unknown as Repository; const anonPieceSelector = { - selectPieceForProvider: vi.fn(async () => PIECE), + selectPieceForProvider: vi.fn(async () => opts.piece ?? PIECE), } as unknown as AnonPieceSelectorService; const fetchSpy = vi.fn(opts.fetchPieceImpl ?? 
(async () => opts.pieceResult)); @@ -61,8 +64,9 @@ function makeService(opts: { fetchPiece: fetchSpy, } as unknown as PieceRetrievalService; + const validateCarSpy = vi.fn(async () => opts.carResult); const carValidationService = { - validateCarPiece: vi.fn(), + validateCarPiece: validateCarSpy, } as unknown as CarValidationService; const walletSdkService = { @@ -91,7 +95,7 @@ function makeService(opts: { spRepository, ); - return { service, insertSpy, fetchSpy }; + return { service, insertSpy, fetchSpy, validateCarSpy }; } describe("AnonRetrievalService", () => { @@ -133,6 +137,19 @@ describe("AnonRetrievalService", () => { expect(row.sp_id).toBe(7); expect(row.probe_location).toBe("test-location"); expect(typeof row.retrieval_id).toBe("string"); + + // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every + // dimension column should explicitly say "skipped" (ipni_status) or null. + expect(row.car_parseable).toBeNull(); + expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); }); it("still emits a row when the signal aborts before fetchPiece runs", async () => { @@ -210,4 +227,136 @@ describe("AnonRetrievalService", () => { expect(insertSpy).not.toHaveBeenCalled(); }); + + describe("with IPFS indexing", () => { + const INDEXED_PIECE: AnonPiece = { + ...PIECE, + withIPFSIndexing: true, + ipfsRootCid: "bafyrootcid", + }; + + function okPiece(bytes: Buffer): PieceRetrievalResult { + return { + success: true, + pieceCid: INDEXED_PIECE.pieceCid, + bytesReceived: bytes.length, + pieceBytes: bytes, + latencyMs: 200, + ttfbMs: 20, + throughputBps: 51200, + statusCode: 200, + commPValid: true, 
+ }; + } + + it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { + const carResult: CarValidationResult = { + carParseable: true, + blockCount: 42, + sampledCidCount: 5, + ipniValid: true, + ipniVerifyMs: 137, + ipniVerifiedCidsCount: 6, + ipniUnverifiedCidsCount: 0, + blockFetchValid: true, + blockFetchFailedCount: 0, + blockFetchEndpoint: "https://sp.test/ipfs/", + }; + + const { service, insertSpy, validateCarSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + carResult, + }); + + await service.performForProvider(SP_ADDRESS); + + expect(validateCarSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.commp_valid).toBe(true); + expect(row.car_parseable).toBe(true); + expect(row.car_block_count).toBe(42); + expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); + expect(row.block_fetch_valid).toBe(true); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(0); + expect(row.ipni_status).toBe("valid"); + expect(row.ipni_verify_ms).toBe(137); + expect(row.ipni_verified_cids_count).toBe(6); + expect(row.ipni_unverified_cids_count).toBe(0); + }); + + it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { + const carResult: CarValidationResult = { + carParseable: true, + blockCount: 100, + sampledCidCount: 5, + ipniValid: false, + ipniVerifyMs: 250, + ipniVerifiedCidsCount: 0, + ipniUnverifiedCidsCount: 6, + blockFetchValid: false, + blockFetchFailedCount: 2, + blockFetchEndpoint: "https://sp.test/ipfs/", + }; + + const { service, insertSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + carResult, + }); + + await service.performForProvider(SP_ADDRESS); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + // The 
piece-fetch path still succeeded — failures are surfaced as + // independent dimensions, not folded into piece_fetch_status. + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.car_parseable).toBe(true); + expect(row.ipni_status).toBe("invalid"); + expect(row.ipni_verified_cids_count).toBe(0); + expect(row.ipni_unverified_cids_count).toBe(6); + expect(row.block_fetch_valid).toBe(false); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(2); + }); + + it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { + const carResult: CarValidationResult = { + carParseable: false, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + ipniVerifyMs: null, + ipniVerifiedCidsCount: null, + ipniUnverifiedCidsCount: null, + blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, + }; + + const { service, insertSpy } = makeService({ + pieceResult: okPiece(Buffer.from("not-a-car")), + piece: INDEXED_PIECE, + carResult, + }); + + await service.performForProvider(SP_ADDRESS); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_parseable).toBe(false); + // car_block_count and block_fetch_sampled_count are gated on carParseable + // so an unparseable CAR doesn't emit a misleading 0. 
+ expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); + }); + }); }); From 1fcee6001cda14f6ead2117c68ee1c40b2b927ff Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 13:10:13 +0200 Subject: [PATCH 05/55] refactor(retrieval-anon): function signatures --- .../retrieval-anon/anon-retrieval.service.ts | 171 ++++++++---------- .../retrieval-anon/car-validation.service.ts | 40 ++-- 2 files changed, 93 insertions(+), 118 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 8f2e135a..4c6ade8a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -141,109 +141,90 @@ export class AnonRetrievalService { // Always emit a ClickHouse row — even on abort or unexpected error — so // we never lose the evidence (ttfb, bytes, response code) we already // collected. - pieceResult ??= buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - this.emitClickhouseRow(spAddress, piece, pieceResult, carResult, startedAt, provider, logContext); - } - } + const finalPieceResult = pieceResult ?? 
buildAbortedPlaceholder(piece.pieceCid, signal?.reason); + const retrievalId = randomUUID(); - private emitClickhouseRow( - spAddress: string, - piece: { - pieceCid: string; - dataSetId: string; - pieceId: string; - rawSize: string; - withIPFSIndexing: boolean; - ipfsRootCid: string | null; - }, - pieceResult: PieceRetrievalResult, - carResult: CarValidationResult | null, - startedAt: Date, - provider: StorageProvider | null, - logContext?: ProviderJobContext, - ): void { - if (!this.clickhouseService.enabled) { - this.logger.debug({ - ...logContext, - event: "anon_retrieval_clickhouse_disabled", - message: "ClickHouse disabled — anon retrieval row not emitted", - pieceCid: piece.pieceCid, - spAddress, - }); - return; - } + if (this.clickhouseService.enabled) { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus = + carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const pieceFetchStatus = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = - carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; - const retrievalId = randomUUID(); + try { + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? 
null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + piece_fetch_status: pieceFetchStatus, + http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, + throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, + commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult != null && carResult.carParseable ? carResult.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult != null && carResult.carParseable ? carResult.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, + error_message: finalPieceResult.errorMessage ?? null, + }); + } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. 
+ this.logger.warn({ + ...logContext, + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + } + } else { + this.logger.debug({ + ...logContext, + event: "anon_retrieval_clickhouse_disabled", + message: "ClickHouse disabled — anon retrieval row not emitted", + pieceCid: piece.pieceCid, + spAddress, + }); + } - try { - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { - timestamp: startedAt.getTime(), - probe_location: this.clickhouseService.probeLocation, - sp_address: spAddress, - sp_id: provider?.providerId != null ? Number(provider.providerId) : null, - sp_name: provider?.name ?? null, - retrieval_id: retrievalId, - piece_cid: piece.pieceCid, - data_set_id: piece.dataSetId, - piece_id: piece.pieceId, - raw_size: piece.rawSize, - with_ipfs_indexing: piece.withIPFSIndexing, - ipfs_root_cid: piece.ipfsRootCid, - service_type: ServiceType.DIRECT_SP, - retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - piece_fetch_status: pieceFetchStatus, - http_response_code: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, - first_byte_ms: pieceResult.ttfbMs > 0 ? pieceResult.ttfbMs : null, - last_byte_ms: pieceResult.latencyMs > 0 ? pieceResult.latencyMs : null, - bytes_retrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, - throughput_bps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, - commp_valid: pieceResult.success ? pieceResult.commPValid : null, - car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult?.carParseable ? carResult.blockCount : null, - block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, - block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult?.carParseable ? 
carResult.sampledCidCount : null, - block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, - ipni_status: ipniStatus, - ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, - error_message: pieceResult.errorMessage ?? null, - }); - } catch (error) { - // ClickhouseService.insert is buffered/non-throwing in normal operation, but - // guard against unexpected runtime errors so we don't break the probe cycle. - this.logger.warn({ + this.logger.log({ ...logContext, - event: "anon_retrieval_clickhouse_insert_failed", - message: "Failed to enqueue anonymous retrieval row to ClickHouse", + event: "anon_retrieval_completed", + message: "Anonymous retrieval test completed", + retrievalId, pieceCid: piece.pieceCid, spAddress, - error: toStructuredError(error), + success: finalPieceResult.success, + aborted: finalPieceResult.aborted === true, + latencyMs: finalPieceResult.latencyMs, + ttfbMs: finalPieceResult.ttfbMs, + bytesRetrieved: finalPieceResult.bytesReceived, + carParseable: carResult?.carParseable, + ipniValid: carResult?.ipniValid, + blockFetchValid: carResult?.blockFetchValid, }); } - - this.logger.log({ - ...logContext, - event: "anon_retrieval_completed", - message: "Anonymous retrieval test completed", - retrievalId, - pieceCid: piece.pieceCid, - spAddress, - success: pieceResult.success, - aborted: pieceResult.aborted === true, - latencyMs: pieceResult.latencyMs, - ttfbMs: pieceResult.ttfbMs, - bytesRetrieved: pieceResult.bytesReceived, - carParseable: carResult?.carParseable, - ipniValid: carResult?.ipniValid, - blockFetchValid: carResult?.blockFetchValid, - }); } } diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 017a38e8..789f5ba6 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ 
b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -46,8 +46,17 @@ export class CarValidationService { ipfsRootCid: string, signal?: AbortSignal, ): Promise { - const blocks = await this.parseCar(pieceBytes, provider.address, ipfsRootCid); - if (blocks === null) { + let blocks: { cid: CID; bytes: Uint8Array }[]; + try { + blocks = await this.parseCar(pieceBytes); + } catch (error) { + this.logger.debug({ + event: "car_parse_failed", + message: "Failed to parse piece bytes as CAR - client fault, not SP", + spAddress: provider.address, + ipfsRootCid, + error: toStructuredError(error), + }); return { carParseable: false, blockCount: 0, @@ -99,28 +108,13 @@ export class CarValidationService { }; } - private async parseCar( - pieceBytes: Buffer, - spAddress: string, - ipfsRootCid: string, - ): Promise<{ cid: CID; bytes: Uint8Array }[] | null> { - try { - const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); - const blocks: { cid: CID; bytes: Uint8Array }[] = []; - for await (const block of reader.blocks()) { - blocks.push({ cid: block.cid, bytes: block.bytes }); - } - return blocks; - } catch (error) { - this.logger.debug({ - event: "car_parse_failed", - message: "Failed to parse piece bytes as CAR - client fault, not SP", - spAddress, - ipfsRootCid, - error: toStructuredError(error), - }); - return null; + private async parseCar(pieceBytes: Buffer): Promise<{ cid: CID; bytes: Uint8Array }[]> { + const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); + const blocks: { cid: CID; bytes: Uint8Array }[] = []; + for await (const block of reader.blocks()) { + blocks.push({ cid: block.cid, bytes: block.bytes }); } + return blocks; } /** From 4527d292c1cb537287274eb9638a46a5641eff21 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 14:24:41 +0200 Subject: [PATCH 06/55] refactor(retrieval-anon): cleanup --- .../check-metrics.service.ts | 4 +- .../anon-piece-selector.service.spec.ts | 16 ++++ 
.../anon-piece-selector.service.ts | 11 +-- .../anon-retrieval.service.spec.ts | 53 +++++++++-- .../retrieval-anon/anon-retrieval.service.ts | 33 +++---- .../retrieval-anon/car-validation.service.ts | 1 - .../src/subgraph/subgraph.service.spec.ts | 10 ++- apps/backend/src/subgraph/subgraph.service.ts | 89 +++---------------- 8 files changed, 103 insertions(+), 114 deletions(-) diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 85f1cdcf..8d4be313 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -303,11 +303,11 @@ export class AnonRetrievalCheckMetrics { this.carParseCounter.inc({ ...labels, value: parseable ? "parseable" : "not_parseable" }); } - recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { this.ipniCounter.inc({ ...labels, value }); } - recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { this.blockFetchCounter.inc({ ...labels, value }); } } diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts index 6a787fbb..32d13719 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -84,6 +84,22 @@ describe("AnonPieceSelectorService", () => { expect(result?.pieceCid).toBe(freshCid); }); + it("treats payment-end exactly equal to current epoch as terminated (boundary)", async () => { + // pdpPaymentEndEpoch === indexedAtBlock should be rejected (<=, not <). 
+ // This guards against an off-by-one regression where pieces in the final + // payment epoch silently slip through. + const boundaryCid = "baga-boundary"; + const liveCid = "baga-still-live"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: boundaryCid, pdpPaymentEndEpoch: 200n, indexedAtBlock: 200 })) + .mockResolvedValueOnce(makePiece({ pieceCid: liveCid, pdpPaymentEndEpoch: 201n, indexedAtBlock: 200 })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(liveCid); + }); + it("redraws when the first sampled piece was recently selected by this process", async () => { const staleCid = "baga-stale"; const freshCid = "baga-fresh"; diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 8de50fa3..342a4780 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -8,13 +8,7 @@ import type { AnonCandidatePiece } from "../subgraph/types.js"; import type { AnonPiece } from "./types.js"; /** - * Number of most-recently-tested anonymous pieces to exclude from selection - * to avoid immediately retesting the same piece. Piece CIDs are globally - * unique and each one lives on a single SP's dataset, so scoping by CID - * is equivalent to scoping by (SP, CID) for this workload. - * - * The buffer is process-local: a duplicate piece that gets retested shortly - * after a restart is harmless (still a valid measurement, just less diverse). + * Number of most-recently-tested piece CIDs to exclude from re-selection. 
*/ const RECENT_DEDUP_WINDOW = 500; @@ -157,6 +151,9 @@ export class AnonPieceSelectorService { continue; } + // On Filecoin FEVM the EVM block number IS the chain epoch (one block per + // epoch), so the subgraph's indexedAtBlock is a safe proxy for "now" when + // checking if PDP payment for this piece has already terminated. if (piece.pdpPaymentEndEpoch != null && piece.pdpPaymentEndEpoch <= BigInt(piece.indexedAtBlock)) { continue; } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 812b8169..b5f17c57 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -36,13 +36,17 @@ function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; clickhouseEnabled?: boolean; - piece?: AnonPiece; + piece?: AnonPiece | null; carResult?: CarValidationResult; + validateCarImpl?: () => Promise; }): { service: AnonRetrievalService; insertSpy: ReturnType; fetchSpy: ReturnType; validateCarSpy: ReturnType; + metricsRecordStatusSpy: ReturnType; + metricsRecordIpniSpy: ReturnType; + metricsRecordBlockFetchSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -56,7 +60,7 @@ function makeService(opts: { } as unknown as Repository; const anonPieceSelector = { - selectPieceForProvider: vi.fn(async () => opts.piece ?? PIECE), + selectPieceForProvider: vi.fn(async () => (opts.piece === null ? null : (opts.piece ?? PIECE))), } as unknown as AnonPieceSelectorService; const fetchSpy = vi.fn(opts.fetchPieceImpl ?? (async () => opts.pieceResult)); @@ -64,7 +68,7 @@ function makeService(opts: { fetchPiece: fetchSpy, } as unknown as PieceRetrievalService; - const validateCarSpy = vi.fn(async () => opts.carResult); + const validateCarSpy = vi.fn(opts.validateCarImpl ?? 
(async () => opts.carResult)); const carValidationService = { validateCarPiece: validateCarSpy, } as unknown as CarValidationService; @@ -73,16 +77,19 @@ function makeService(opts: { getProviderInfo: vi.fn(() => ({ pdp: { serviceURL: "https://sp.test/" } })), } as unknown as WalletSdkService; + const metricsRecordStatusSpy = vi.fn(); + const metricsRecordIpniSpy = vi.fn(); + const metricsRecordBlockFetchSpy = vi.fn(); const metrics = { observeFirstByteMs: vi.fn(), observeLastByteMs: vi.fn(), observeThroughput: vi.fn(), observeCheckDuration: vi.fn(), - recordStatus: vi.fn(), + recordStatus: metricsRecordStatusSpy, recordHttpResponseCode: vi.fn(), recordCarParseStatus: vi.fn(), - recordIpniStatus: vi.fn(), - recordBlockFetchStatus: vi.fn(), + recordIpniStatus: metricsRecordIpniSpy, + recordBlockFetchStatus: metricsRecordBlockFetchSpy, } as unknown as AnonRetrievalCheckMetrics; const service = new AnonRetrievalService( @@ -95,7 +102,15 @@ function makeService(opts: { spRepository, ); - return { service, insertSpy, fetchSpy, validateCarSpy }; + return { + service, + insertSpy, + fetchSpy, + validateCarSpy, + metricsRecordStatusSpy, + metricsRecordIpniSpy, + metricsRecordBlockFetchSpy, + }; } describe("AnonRetrievalService", () => { @@ -322,6 +337,30 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBe(2); }); + it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { + // Distinguishes a real infra outage (e.g. IpniVerificationService down) + // from a piece that legitimately had no IPFS indexing. Without the + // distinction, an outage looks like normal non-IPFS volume in dashboards. 
+ const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + validateCarImpl: async () => { + throw new Error("IpniVerificationService down"); + }, + }); + + await service.performForProvider(SP_ADDRESS); + + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error"); + expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error"); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.ipni_status).toBe("error"); + // Piece-fetch path itself succeeded — only the validation pipeline failed. + expect(row.commp_valid).toBe(true); + expect(row.car_parseable).toBeNull(); + }); + it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { const carResult: CarValidationResult = { carParseable: false, diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 4c6ade8a..418ea8d2 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -70,6 +70,7 @@ export class AnonRetrievalService { let pieceResult: PieceRetrievalResult | null = null; let carResult: CarValidationResult | null = null; + let validatedCarPiece: boolean = false; try { // 2. Fetch the piece. fetchPiece never throws on abort — it returns a @@ -96,13 +97,24 @@ export class AnonRetrievalService { !signal?.aborted ) { try { + validatedCarPiece = true; carResult = await this.carValidationService.validateCarPiece( pieceResult.pieceBytes, provider, piece.ipfsRootCid, signal, ); + this.metrics.recordCarParseStatus(labels, carResult.carParseable); + this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult)); + this.metrics.recordBlockFetchStatus( + labels, + carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? 
"valid" : "invalid", + ); } catch (error) { + // Validation was attempted on a successful piece retrieval but threw. + this.metrics.recordCarParseStatus(labels, false); + this.metrics.recordIpniStatus(labels, "error"); + this.metrics.recordBlockFetchStatus(labels, "error"); this.logger.warn({ ...logContext, event: "anon_retrieval_car_validation_failed", @@ -112,19 +124,6 @@ export class AnonRetrievalService { error: toStructuredError(error), }); } - } - - // Emit CAR validation metrics - if (carResult) { - this.metrics.recordCarParseStatus(labels, carResult.carParseable); - this.metrics.recordIpniStatus( - labels, - carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid", - ); - this.metrics.recordBlockFetchStatus( - labels, - carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", - ); } else if (!pieceResult.success) { // Piece retrieval failed — IPNI and block fetch were skipped this.metrics.recordIpniStatus(labels, "skipped"); @@ -148,8 +147,7 @@ export class AnonRetrievalService { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = - carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; + const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; try { this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { @@ -228,6 +226,11 @@ export class AnonRetrievalService { } } +function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { + if (result.ipniValid === null) return "skipped"; + return result.ipniValid ? 
"valid" : "invalid"; +} + function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { const message = reason instanceof Error && reason.message ? reason.message : typeof reason === "string" ? reason : "aborted"; diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 789f5ba6..27ec2744 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -189,7 +189,6 @@ export class CarValidationService { let failedCount = 0; for (const block of sampledBlocks) { - signal?.throwIfAborted(); const cidStr = block.cid.toString(); const blockUrl = `${spBaseUrl}/ipfs/${cidStr}?format=raw`; diff --git a/apps/backend/src/subgraph/subgraph.service.spec.ts b/apps/backend/src/subgraph/subgraph.service.spec.ts index 4dc2cd5e..8703b2c5 100644 --- a/apps/backend/src/subgraph/subgraph.service.spec.ts +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -730,14 +730,18 @@ describe("SubgraphService", () => { }); describe("sampleAnonPiece", () => { - it("returns null when endpoint is not configured", async () => { + it("throws when endpoint is not configured (distinct from empty result)", async () => { + // Returning null here would make a misconfigured deployment indistinguishable + // from a genuinely empty candidate pool — every anon job would silently + // no-op forever. Fail loudly instead. 
const noEndpointConfig = { get: vi.fn(() => ({ subgraphEndpoint: "" })), } as unknown as ConfigService; const noEndpointService = new SubgraphService(noEndpointConfig); - const piece = await noEndpointService.sampleAnonPiece(defaultSampleParams); - expect(piece).toBeNull(); + await expect(noEndpointService.sampleAnonPiece(defaultSampleParams)).rejects.toThrow( + "No PDP subgraph endpoint configured", + ); expect(fetchMock).not.toHaveBeenCalled(); }); diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 55359179..3067532c 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -69,87 +69,12 @@ export class SubgraphService { } /** - * Fetch subgraph metadata including the latest indexed block number + * Fetch subgraph metadata including the latest indexed block number. * - * @param attempt - Current retry attempt number (default: 1) - * @returns Subgraph metadata with block number * @throws Error if endpoint is not configured or after MAX_RETRIES attempts */ - async fetchSubgraphMeta(attempt: number = 1): Promise { - if (!this.blockchainConfig.subgraphEndpoint) { - throw new Error("No PDP subgraph endpoint configured"); - } - - try { - await this.enforceRateLimit(); - - const response = await fetch(this.blockchainConfig.subgraphEndpoint, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - query: Queries.GET_SUBGRAPH_META, - }), - }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const result = (await response.json()) as GraphQLResponse; - - if (result.errors) { - const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; - throw new Error(`GraphQL error: ${errorMessage}`); - } - let validated: SubgraphMeta; - try { - validated = validateSubgraphMetaResponse(result.data); - } catch (validationError) { - const errorMessage = 
validationError instanceof Error ? validationError.message : "Unknown validation error"; - throw new ValidationError(`Data validation failed: ${errorMessage}`); - } - - return validated; - } catch (error) { - const errorMessage = error instanceof Error ? error.message : "Unknown error"; - - // No need to retry on validation errors - they indicate schema/data issues, not transient failures - if (error instanceof ValidationError) { - this.logger.error({ - event: "subgraph_meta_validation_failed", - message: "Subgraph data validation failed", - error: toStructuredError(error), - }); - throw error; - } - - // Retry on network/HTTP errors - if (attempt < SubgraphService.MAX_RETRIES) { - const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); - this.logger.warn({ - event: "subgraph_meta_request_retry", - message: "Subgraph meta request failed. Retrying...", - attempt, - maxRetries: SubgraphService.MAX_RETRIES, - retryDelayMs: delay, - error: toStructuredError(error), - }); - await new Promise((resolve) => setTimeout(resolve, delay)); - return this.fetchSubgraphMeta(attempt + 1); - } - - this.logger.error({ - event: "subgraph_meta_request_failed", - message: "Subgraph meta request failed after maximum retries", - maxRetries: SubgraphService.MAX_RETRIES, - error: toStructuredError(error), - }); - throw new Error( - `Failed to fetch subgraph metadata after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, - ); - } + async fetchSubgraphMeta(): Promise { + return this.executeQuery("metadata", Queries.GET_SUBGRAPH_META, {}, validateSubgraphMetaResponse); } /** @@ -189,7 +114,13 @@ export class SubgraphService { */ async sampleAnonPiece(params: SampleAnonPieceParams): Promise { if (!this.blockchainConfig.subgraphEndpoint) { - return null; + // Surface misconfiguration distinctly so it does not look like an empty + // candidate pool (which silently no-ops every anon retrieval job). 
+ this.logger.error({ + event: "subgraph_endpoint_not_configured", + message: "Cannot sample anonymous piece — no PDP subgraph endpoint configured", + }); + throw new Error("No PDP subgraph endpoint configured"); } const query = buildSampleAnonPieceQuery(params.pool); From a797c15255549fe57510301da22e6010086f2989 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 14:27:11 +0200 Subject: [PATCH 07/55] chore: format code --- apps/backend/src/retrieval-anon/anon-retrieval.service.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 418ea8d2..c11daa19 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -107,8 +107,8 @@ export class AnonRetrievalService { this.metrics.recordCarParseStatus(labels, carResult.carParseable); this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult)); this.metrics.recordBlockFetchStatus( - labels, - carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", + labels, + carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", ); } catch (error) { // Validation was attempted on a successful piece retrieval but threw. 
From 54cc48719c1fb24222ce63ef7f78216061f9c8bc Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 30 Apr 2026 09:59:58 +0200 Subject: [PATCH 08/55] fix: biome checks --- apps/backend/src/retrieval-anon/anon-retrieval.service.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index c11daa19..5343d59a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -173,10 +173,10 @@ export class AnonRetrievalService { throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult != null && carResult.carParseable ? carResult.blockCount : null, + car_block_count: carResult?.carParseable ? carResult?.blockCount : null, block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult != null && carResult.carParseable ? carResult.sampledCidCount : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, ipni_status: ipniStatus, ipni_verify_ms: carResult?.ipniVerifyMs ?? 
null, From fcfe569e5c4bac09e96388cd78a90951e493ddfd Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 30 Apr 2026 10:00:28 +0200 Subject: [PATCH 09/55] fix(ipni): return actual verified/unverified counts --- .../src/ipni/ipni-verification.service.ts | 122 +++++++++++------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/apps/backend/src/ipni/ipni-verification.service.ts b/apps/backend/src/ipni/ipni-verification.service.ts index 3d7d52f9..51fcc8e0 100644 --- a/apps/backend/src/ipni/ipni-verification.service.ts +++ b/apps/backend/src/ipni/ipni-verification.service.ts @@ -3,7 +3,7 @@ import { PDPProvider } from "filecoin-pin"; import { waitForIpniProviderResults } from "filecoin-pin/core/utils"; import { CID } from "multiformats/cid"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import type { IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; +import type { FailedCID, IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; export type IpniVerificationInput = { rootCid: CID; @@ -44,7 +44,6 @@ export class IpniVerificationService { const expectedProviders = [this.buildExpectedProviderInfo(storageProvider as StorageProviderWithUrl)]; const timeoutSignal = AbortSignal.timeout(timeoutMs); const verificationSignal = signal ? 
AbortSignal.any([signal, timeoutSignal]) : timeoutSignal; - let failureReason = "IPNI did not return expected provider results via filecoin-pin"; this.logger.log({ event: "ipni_verification_started", @@ -61,56 +60,69 @@ export class IpniVerificationService { }); const ipniVerificationStartTime = Date.now(); + const cidsToValidate: { cid: CID; isRoot: boolean }[] = [ + { cid: rootCid, isRoot: true }, + ...blockCids.map((cid) => ({ cid, isRoot: false })), + ]; - const ipniValidated = await waitForIpniProviderResults(rootCid, { - childBlocks: blockCids, - maxAttempts, - delayMs, - expectedProviders, - signal: verificationSignal, - }).catch((error) => { + let verified = 0; + const failedCIDs: FailedCID[] = []; + let rootCIDVerified = false; + + // waitForIpniProviderResults is all-or-nothing per call (throws on first failure), + // so we invoke it once per CID to get accurate per-CID verified/unverified counts. + // The shared verificationSignal bounds total wall-clock time across all CIDs. + for (const { cid, isRoot } of cidsToValidate) { if (signal?.aborted) { signal.throwIfAborted(); } + if (verificationSignal.aborted) { - failureReason = `IPNI verification timed out after ${timeoutMs}ms`; - this.logger.error({ - event: "ipni_verification_timed_out", - message: failureReason, - rootCID: rootCid.toString(), + failedCIDs.push({ cid: cid.toString(), reason: `IPNI verification timed out after ${timeoutMs}ms` }); + continue; + } + + try { + await waitForIpniProviderResults(cid, { + maxAttempts, + delayMs, + expectedProviders, + signal: verificationSignal, + }); + verified += 1; + if (isRoot) rootCIDVerified = true; + } catch (error) { + if (signal?.aborted) { + signal.throwIfAborted(); + } + + const reason = verificationSignal.aborted + ? `IPNI verification timed out after ${timeoutMs}ms` + : error instanceof Error + ? 
error.message + : String(error); + + failedCIDs.push({ cid: cid.toString(), reason }); + + this.logger.warn({ + event: "ipni_cid_verification_failed", + message: "IPNI verification failed for CID", + cid: cid.toString(), + isRoot, providerAddress: storageProvider.address, providerId: storageProvider.providerId, providerName: storageProvider.name, serviceUrl: storageProvider.serviceUrl, - blockCIDCount: blockCids.length, - timeoutMs, - pollIntervalMs: delayMs, - maxAttempts, + failureReason: reason, }); - return false; } - const errorMessage = error instanceof Error ? error.message : String(error); - failureReason = errorMessage; - this.logger.error({ - event: "ipni_verification_failed", - message: "IPNI verification failed", - rootCID: rootCid.toString(), - providerAddress: storageProvider.address, - providerId: storageProvider.providerId, - providerName: storageProvider.name, - serviceUrl: storageProvider.serviceUrl, - blockCIDCount: blockCids.length, - timeoutMs, - pollIntervalMs: delayMs, - maxAttempts, - failureReason, - }); - return false; - }); + } const ipniVerificationDurationMs = Date.now() - ipniVerificationStartTime; + const total = cidsToValidate.length; + const unverified = total - verified; - if (ipniValidated) { + if (verified === total) { this.logger.log({ event: "ipni_verification_succeeded", message: "IPNI verification succeeded", @@ -121,22 +133,32 @@ export class IpniVerificationService { verifyDurationMs: ipniVerificationDurationMs, blockCIDCount: blockCids.length, }); + } else { + this.logger.error({ + event: verificationSignal.aborted ? 
"ipni_verification_timed_out" : "ipni_verification_failed", + message: "IPNI verification did not fully succeed", + rootCID: rootCid.toString(), + providerAddress: storageProvider.address, + providerId: storageProvider.providerId, + providerName: storageProvider.name, + serviceUrl: storageProvider.serviceUrl, + blockCIDCount: blockCids.length, + timeoutMs, + pollIntervalMs: delayMs, + maxAttempts, + verified, + unverified, + total, + }); } return { - verified: ipniValidated ? 1 : 0, - unverified: ipniValidated ? 0 : 1, - total: 1, - rootCIDVerified: ipniValidated, + verified: verified, + unverified: unverified, + total: total, + rootCIDVerified: rootCIDVerified, durationMs: ipniVerificationDurationMs, - failedCIDs: ipniValidated - ? [] - : [ - { - cid: rootCid.toString(), - reason: failureReason, - }, - ], + failedCIDs: failedCIDs, verifiedAt: new Date().toISOString(), }; } From fb45bd076600779eac47999e0a8a26d45182c542 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 30 Apr 2026 13:17:03 +0200 Subject: [PATCH 10/55] refactor: store anon retrieval data primarily in postgres --- .../src/clickhouse/clickhouse.schema.ts | 14 +- apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 120 +++++++++++ .../1776300000000-CreateAnonRetrievals.ts | 72 +++++++ apps/backend/src/database/types.ts | 12 ++ .../anon-retrieval.service.spec.ts | 203 +++++++++++------- .../retrieval-anon/anon-retrieval.service.ts | 172 +++++++++------ .../retrieval-anon/retrieval-anon.module.ts | 3 +- 8 files changed, 444 insertions(+), 161 deletions(-) create mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts create mode 100644 apps/backend/src/database/migrations/1776300000000-CreateAnonRetrievals.ts diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index e30f6151..5a9a805e 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ 
b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -70,17 +70,12 @@ export function buildMigrations(database: string): string[] { sp_id Nullable(UInt64), -- storage provider numeric id sp_name Nullable(String), -- storage provider name - retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + retrieval_id UUID, -- per-event correlation id (matches anon_retrievals.id in Postgres) - piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph - data_set_id UInt64, -- on-chain data set id - piece_id UInt64, -- on-chain piece id within the data set raw_size UInt64, -- raw (unpadded) piece size, bytes with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata - ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) - retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). CAR/IPNI/block-fetch outcomes live in their own columns. http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure @@ -92,17 +87,14 @@ export function buildMigrations(database: string): string[] { commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable - block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. 
{spBaseUrl}/ipfs/); null when skipped block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI - ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable - - error_message Nullable(String) -- failure reason; null on success + ipni_unverified_cids_count Nullable(UInt32) -- CIDs checked but not findable ) ENGINE MergeTree() PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index 9249c3a9..f3f9ed09 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -7,6 +7,7 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; +import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -49,7 +50,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { + await queryRunner.query(` + 
CREATE TYPE anon_retrievals_piece_fetch_status_enum AS ENUM ('success', 'failed') + `); + await queryRunner.query(` + CREATE TYPE anon_retrievals_ipni_status_enum AS ENUM ('valid', 'invalid', 'skipped', 'error') + `); + await queryRunner.query(` + CREATE TYPE anon_retrievals_service_type_enum AS ENUM ('direct_sp', 'ipfs_pin') + `); + + await queryRunner.query(` + CREATE TABLE IF NOT EXISTS anon_retrievals ( + id UUID NOT NULL PRIMARY KEY DEFAULT gen_random_uuid(), + started_at TIMESTAMPTZ NOT NULL, + probe_location VARCHAR NOT NULL, + sp_address VARCHAR NOT NULL, + sp_id BIGINT, + sp_name VARCHAR, + piece_cid VARCHAR NOT NULL, + data_set_id BIGINT NOT NULL, + piece_id BIGINT NOT NULL, + raw_size BIGINT NOT NULL, + with_ipfs_indexing BOOLEAN NOT NULL, + ipfs_root_cid VARCHAR, + service_type anon_retrievals_service_type_enum NOT NULL DEFAULT 'direct_sp', + retrieval_endpoint VARCHAR NOT NULL, + piece_fetch_status anon_retrievals_piece_fetch_status_enum NOT NULL, + http_response_code INTEGER, + first_byte_ms DOUBLE PRECISION, + last_byte_ms DOUBLE PRECISION, + bytes_retrieved BIGINT, + throughput_bps BIGINT, + commp_valid BOOLEAN, + car_parseable BOOLEAN, + car_block_count INTEGER, + block_fetch_endpoint VARCHAR, + block_fetch_valid BOOLEAN, + block_fetch_sampled_count INTEGER, + block_fetch_failed_count INTEGER, + ipni_status anon_retrievals_ipni_status_enum NOT NULL, + ipni_verify_ms DOUBLE PRECISION, + ipni_verified_cids_count INTEGER, + ipni_unverified_cids_count INTEGER, + error_message VARCHAR, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ) + `); + + await queryRunner.query(` + CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_sp_address_started_at" + ON anon_retrievals (sp_address, started_at) + `); + + await queryRunner.query(` + CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_started_at" + ON anon_retrievals (started_at) + `); + } + + public async down(queryRunner: QueryRunner): Promise { + await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals 
CASCADE`); + await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_service_type_enum`); + await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_ipni_status_enum`); + await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_piece_fetch_status_enum`); + } +} diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index 46fd5d28..e09d1dd3 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,6 +28,18 @@ export enum IpniStatus { FAILED = "failed", } +export enum PieceFetchStatus { + SUCCESS = "success", + FAILED = "failed", +} + +export enum IpniCheckStatus { + VALID = "valid", + INVALID = "invalid", + SKIPPED = "skipped", + ERROR = "error", +} + /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index b5f17c57..4f775150 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -1,8 +1,9 @@ import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { ClickhouseService } from "../clickhouse/clickhouse.service.js"; +import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { RetrievalStatus } from "../database/types.js"; +import { IpniCheckStatus, PieceFetchStatus } from "../database/types.js"; import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; @@ -26,7 +27,7 @@ const PIECE = { function makeProvider(): StorageProvider { return { address: SP_ADDRESS, - providerId: 7, + 
providerId: 7n, name: "sp-test", isApproved: true, } as unknown as StorageProvider; @@ -39,6 +40,7 @@ function makeService(opts: { piece?: AnonPiece | null; carResult?: CarValidationResult; validateCarImpl?: () => Promise; + saveImpl?: (entity: AnonRetrieval) => Promise; }): { service: AnonRetrievalService; insertSpy: ReturnType; @@ -47,6 +49,7 @@ function makeService(opts: { metricsRecordStatusSpy: ReturnType; metricsRecordIpniSpy: ReturnType; metricsRecordBlockFetchSpy: ReturnType; + saveSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -59,6 +62,11 @@ function makeService(opts: { findOne: vi.fn(async () => makeProvider()), } as unknown as Repository; + const saveSpy = vi.fn(opts.saveImpl ?? (async (entity: AnonRetrieval) => entity)); + const anonRetrievalRepository = { + save: saveSpy, + } as unknown as Repository; + const anonPieceSelector = { selectPieceForProvider: vi.fn(async () => (opts.piece === null ? null : (opts.piece ?? PIECE))), } as unknown as AnonPieceSelectorService; @@ -100,6 +108,7 @@ function makeService(opts: { metrics, clickhouseService, spRepository, + anonRetrievalRepository, ); return { @@ -110,6 +119,7 @@ function makeService(opts: { metricsRecordStatusSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy, + saveSpy, }; } @@ -118,7 +128,7 @@ describe("AnonRetrievalService", () => { vi.clearAllMocks(); }); - it("emits a ClickHouse row with partial metrics when fetchPiece returns aborted=true", async () => { + it("persists a Postgres row with partial metrics when fetchPiece returns aborted=true", async () => { const partial: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -133,41 +143,59 @@ describe("AnonRetrievalService", () => { aborted: true, }; - const { service, insertSpy } = makeService({ pieceResult: partial }); + const { service, saveSpy, insertSpy } = makeService({ pieceResult: partial }); await service.performForProvider(SP_ADDRESS); + expect(saveSpy).toHaveBeenCalledTimes(1); + 
const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); + expect(entity.bytesRetrieved).toBe(524288n); + expect(entity.firstByteMs).toBe(150); + expect(entity.lastByteMs).toBe(42000); + expect(entity.throughputBps).toBe(12500n); + expect(entity.httpResponseCode).toBe(200); + expect(entity.errorMessage).toContain("Anon retrieval job timeout"); + expect(entity.pieceCid).toBe(PIECE.pieceCid); + expect(entity.spAddress).toBe(SP_ADDRESS); + expect(entity.spId).toBe(7n); + expect(entity.probeLocation).toBe("test-location"); + expect(entity.retrievalEndpoint).toBe(`https://sp.test/piece/${PIECE.pieceCid}`); + expect(typeof entity.id).toBe("string"); + + // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece. + expect(entity.carParseable).toBeNull(); + expect(entity.carBlockCount).toBeNull(); + expect(entity.blockFetchEndpoint).toBeNull(); + expect(entity.blockFetchValid).toBeNull(); + expect(entity.blockFetchSampledCount).toBeNull(); + expect(entity.blockFetchFailedCount).toBeNull(); + expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); + + // ClickHouse mirror is also written. 
expect(insertSpy).toHaveBeenCalledTimes(1); const [table, row] = insertSpy.mock.calls[0] as [string, Record]; expect(table).toBe("anon_retrieval_checks"); - expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); + expect(row.retrieval_id).toBe(entity.id); + expect(row.piece_fetch_status).toBe(PieceFetchStatus.FAILED); expect(row.bytes_retrieved).toBe(524288); expect(row.first_byte_ms).toBe(150); expect(row.last_byte_ms).toBe(42000); expect(row.throughput_bps).toBe(12500); expect(row.http_response_code).toBe(200); - expect(row.error_message).toContain("Anon retrieval job timeout"); - expect(row.piece_cid).toBe(PIECE.pieceCid); - expect(row.sp_address).toBe(SP_ADDRESS); - expect(row.sp_id).toBe(7); - expect(row.probe_location).toBe("test-location"); - expect(typeof row.retrieval_id).toBe("string"); - - // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every - // dimension column should explicitly say "skipped" (ipni_status) or null. - expect(row.car_parseable).toBeNull(); - expect(row.car_block_count).toBeNull(); - expect(row.block_fetch_endpoint).toBeNull(); - expect(row.block_fetch_valid).toBeNull(); - expect(row.block_fetch_sampled_count).toBeNull(); - expect(row.block_fetch_failed_count).toBeNull(); - expect(row.ipni_status).toBe("skipped"); - expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); + expect(row.ipni_status).toBe(IpniCheckStatus.SKIPPED); + + // Trimmed CH columns must NOT appear (they live only in Postgres). 
+ expect(row).not.toHaveProperty("piece_cid"); + expect(row).not.toHaveProperty("data_set_id"); + expect(row).not.toHaveProperty("piece_id"); + expect(row).not.toHaveProperty("ipfs_root_cid"); + expect(row).not.toHaveProperty("retrieval_endpoint"); + expect(row).not.toHaveProperty("block_fetch_endpoint"); + expect(row).not.toHaveProperty("error_message"); }); - it("still emits a row when the signal aborts before fetchPiece runs", async () => { + it("still persists when the signal aborts before fetchPiece runs", async () => { const ac = new AbortController(); ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); @@ -183,20 +211,21 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, insertSpy, fetchSpy } = makeService({ pieceResult: never }); + const { service, saveSpy, insertSpy, fetchSpy } = makeService({ pieceResult: never }); await service.performForProvider(SP_ADDRESS, ac.signal); expect(fetchSpy).not.toHaveBeenCalled(); + expect(saveSpy).toHaveBeenCalledTimes(1); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); + expect(entity.errorMessage).toContain("Anon retrieval job timeout"); + expect(entity.bytesRetrieved).toBeNull(); + expect(entity.firstByteMs).toBeNull(); expect(insertSpy).toHaveBeenCalledTimes(1); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); - expect(row.error_message).toContain("Anon retrieval job timeout"); - expect(row.bytes_retrieved).toBeNull(); - expect(row.first_byte_ms).toBeNull(); }); - it("still emits a row when fetchPiece throws unexpectedly", async () => { + it("still persists when fetchPiece throws unexpectedly", async () => { const never: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -209,7 +238,7 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, insertSpy } = makeService({ + 
const { service, saveSpy } = makeService({ pieceResult: never, fetchPieceImpl: async () => { throw new Error("network down"); @@ -218,12 +247,12 @@ describe("AnonRetrievalService", () => { await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); - expect(insertSpy).toHaveBeenCalledTimes(1); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); + expect(saveSpy).toHaveBeenCalledTimes(1); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); }); - it("skips ClickHouse insert when ClickHouse is disabled", async () => { + it("does not throw when Postgres save fails and still attempts the CH insert", async () => { const ok: PieceRetrievalResult = { success: true, pieceCid: PIECE.pieceCid, @@ -236,11 +265,20 @@ describe("AnonRetrievalService", () => { commPValid: true, }; - const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); + const { service, saveSpy, insertSpy } = makeService({ + pieceResult: ok, + saveImpl: async () => { + throw new Error("connection refused"); + }, + }); - await service.performForProvider(SP_ADDRESS); + await expect(service.performForProvider(SP_ADDRESS)).resolves.toBeUndefined(); - expect(insertSpy).not.toHaveBeenCalled(); + expect(saveSpy).toHaveBeenCalledTimes(1); + // CH still gets the row keyed by the client-side uuid. 
+ expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(typeof row.retrieval_id).toBe("string"); }); describe("with IPFS indexing", () => { @@ -264,7 +302,7 @@ describe("AnonRetrievalService", () => { }; } - it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { + it("populates CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { const carResult: CarValidationResult = { carParseable: true, blockCount: 42, @@ -278,7 +316,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, insertSpy, validateCarSpy } = makeService({ + const { service, saveSpy, insertSpy, validateCarSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -287,19 +325,24 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); expect(validateCarSpy).toHaveBeenCalledTimes(1); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); + expect(entity.commpValid).toBe(true); + expect(entity.carParseable).toBe(true); + expect(entity.carBlockCount).toBe(42); + expect(entity.blockFetchEndpoint).toBe("https://sp.test/ipfs/"); + expect(entity.blockFetchValid).toBe(true); + expect(entity.blockFetchSampledCount).toBe(5); + expect(entity.blockFetchFailedCount).toBe(0); + expect(entity.ipniStatus).toBe(IpniCheckStatus.VALID); + expect(entity.ipniVerifyMs).toBe(137); + expect(entity.ipniVerifiedCidsCount).toBe(6); + expect(entity.ipniUnverifiedCidsCount).toBe(0); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); - expect(row.commp_valid).toBe(true); + expect(row.piece_fetch_status).toBe(PieceFetchStatus.SUCCESS); expect(row.car_parseable).toBe(true); - expect(row.car_block_count).toBe(42); - 
expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); - expect(row.block_fetch_valid).toBe(true); - expect(row.block_fetch_sampled_count).toBe(5); - expect(row.block_fetch_failed_count).toBe(0); - expect(row.ipni_status).toBe("valid"); - expect(row.ipni_verify_ms).toBe(137); - expect(row.ipni_verified_cids_count).toBe(6); - expect(row.ipni_unverified_cids_count).toBe(0); + expect(row.ipni_status).toBe(IpniCheckStatus.VALID); }); it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { @@ -316,7 +359,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, insertSpy } = makeService({ + const { service, saveSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -324,24 +367,24 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; // The piece-fetch path still succeeded — failures are surfaced as // independent dimensions, not folded into piece_fetch_status. 
- expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); - expect(row.car_parseable).toBe(true); - expect(row.ipni_status).toBe("invalid"); - expect(row.ipni_verified_cids_count).toBe(0); - expect(row.ipni_unverified_cids_count).toBe(6); - expect(row.block_fetch_valid).toBe(false); - expect(row.block_fetch_sampled_count).toBe(5); - expect(row.block_fetch_failed_count).toBe(2); + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); + expect(entity.carParseable).toBe(true); + expect(entity.ipniStatus).toBe(IpniCheckStatus.INVALID); + expect(entity.ipniVerifiedCidsCount).toBe(0); + expect(entity.ipniUnverifiedCidsCount).toBe(6); + expect(entity.blockFetchValid).toBe(false); + expect(entity.blockFetchSampledCount).toBe(5); + expect(entity.blockFetchFailedCount).toBe(2); }); it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { // Distinguishes a real infra outage (e.g. IpniVerificationService down) // from a piece that legitimately had no IPFS indexing. Without the // distinction, an outage looks like normal non-IPFS volume in dashboards. - const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + const { service, saveSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, validateCarImpl: async () => { @@ -354,11 +397,11 @@ describe("AnonRetrievalService", () => { expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error"); expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error"); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.ipni_status).toBe("error"); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.ipniStatus).toBe(IpniCheckStatus.ERROR); // Piece-fetch path itself succeeded — only the validation pipeline failed. 
- expect(row.commp_valid).toBe(true); - expect(row.car_parseable).toBeNull(); + expect(entity.commpValid).toBe(true); + expect(entity.carParseable).toBeNull(); }); it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { @@ -375,7 +418,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: null, }; - const { service, insertSpy } = makeService({ + const { service, saveSpy } = makeService({ pieceResult: okPiece(Buffer.from("not-a-car")), piece: INDEXED_PIECE, carResult, @@ -383,19 +426,19 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.car_parseable).toBe(false); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.carParseable).toBe(false); // car_block_count and block_fetch_sampled_count are gated on carParseable // so an unparseable CAR doesn't emit a misleading 0. - expect(row.car_block_count).toBeNull(); - expect(row.block_fetch_sampled_count).toBeNull(); - expect(row.block_fetch_endpoint).toBeNull(); - expect(row.block_fetch_valid).toBeNull(); - expect(row.block_fetch_failed_count).toBeNull(); - expect(row.ipni_status).toBe("skipped"); - expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); + expect(entity.carBlockCount).toBeNull(); + expect(entity.blockFetchSampledCount).toBeNull(); + expect(entity.blockFetchEndpoint).toBeNull(); + expect(entity.blockFetchValid).toBeNull(); + expect(entity.blockFetchFailedCount).toBeNull(); + expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); + expect(entity.ipniVerifyMs).toBeNull(); + expect(entity.ipniVerifiedCidsCount).toBeNull(); + expect(entity.ipniUnverifiedCidsCount).toBeNull(); }); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 
5343d59a..d8298776 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -4,8 +4,9 @@ import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { RetrievalStatus, ServiceType } from "../database/types.js"; +import { IpniCheckStatus, PieceFetchStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; @@ -29,6 +30,8 @@ export class AnonRetrievalService { private readonly clickhouseService: ClickhouseService, @InjectRepository(StorageProvider) private readonly spRepository: Repository, + @InjectRepository(AnonRetrieval) + private readonly anonRetrievalRepository: Repository, ) {} async performForProvider(spAddress: string, signal?: AbortSignal, logContext?: ProviderJobContext): Promise { @@ -137,80 +140,75 @@ export class AnonRetrievalService { pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", ); } finally { - // Always emit a ClickHouse row — even on abort or unexpected error — so - // we never lose the evidence (ttfb, bytes, response code) we already - // collected. + // Always persist a row — even on abort or unexpected error — so we never + // lose the evidence (ttfb, bytes, response code) we already collected. const finalPieceResult = pieceResult ?? 
buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - const retrievalId = randomUUID(); + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const retrievalEndpoint = `${spBaseUrl}/piece/${piece.pieceCid}`; + const pieceFetchStatus = finalPieceResult.success ? PieceFetchStatus.SUCCESS : PieceFetchStatus.FAILED; + const ipniStatus: IpniCheckStatus = !validatedCarPiece + ? IpniCheckStatus.SKIPPED + : carResult + ? ipniStatusFromResult(carResult) + : IpniCheckStatus.ERROR; - if (this.clickhouseService.enabled) { - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; + const entity: AnonRetrieval = { + id: randomUUID(), + createdAt: startedAt, + startedAt, + probeLocation: this.clickhouseService.probeLocation, + spAddress, + spId: provider?.providerId ?? null, + spName: provider?.name ?? null, + pieceCid: piece.pieceCid, + dataSetId: BigInt(piece.dataSetId), + pieceId: BigInt(piece.pieceId), + rawSize: BigInt(piece.rawSize), + withIpfsIndexing: piece.withIPFSIndexing, + ipfsRootCid: piece.ipfsRootCid, + serviceType: ServiceType.DIRECT_SP, + retrievalEndpoint, + pieceFetchStatus, + httpResponseCode: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + firstByteMs: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + lastByteMs: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytesRetrieved: finalPieceResult.bytesReceived > 0 ? BigInt(finalPieceResult.bytesReceived) : null, + throughputBps: finalPieceResult.throughputBps > 0 ? 
BigInt(Math.round(finalPieceResult.throughputBps)) : null, + commpValid: finalPieceResult.success ? finalPieceResult.commPValid : null, + carParseable: carResult ? carResult.carParseable : null, + carBlockCount: carResult?.carParseable ? carResult.blockCount : null, + blockFetchEndpoint: carResult?.blockFetchEndpoint ?? null, + blockFetchValid: carResult ? carResult.blockFetchValid : null, + blockFetchSampledCount: carResult?.carParseable ? carResult.sampledCidCount : null, + blockFetchFailedCount: carResult?.blockFetchFailedCount ?? null, + ipniStatus, + ipniVerifyMs: carResult?.ipniVerifyMs ?? null, + ipniVerifiedCidsCount: carResult?.ipniVerifiedCidsCount ?? null, + ipniUnverifiedCidsCount: carResult?.ipniUnverifiedCidsCount ?? null, + errorMessage: finalPieceResult.errorMessage ?? null, + }; - try { - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { - timestamp: startedAt.getTime(), - probe_location: this.clickhouseService.probeLocation, - sp_address: spAddress, - sp_id: provider?.providerId != null ? Number(provider.providerId) : null, - sp_name: provider?.name ?? null, - retrieval_id: retrievalId, - piece_cid: piece.pieceCid, - data_set_id: piece.dataSetId, - piece_id: piece.pieceId, - raw_size: piece.rawSize, - with_ipfs_indexing: piece.withIPFSIndexing, - ipfs_root_cid: piece.ipfsRootCid, - service_type: ServiceType.DIRECT_SP, - retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - piece_fetch_status: pieceFetchStatus, - http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, - first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, - last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, - bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, - throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, - commp_valid: finalPieceResult.success ? 
finalPieceResult.commPValid : null, - car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult?.carParseable ? carResult?.blockCount : null, - block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, - block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, - block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, - ipni_status: ipniStatus, - ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, - error_message: finalPieceResult.errorMessage ?? null, - }); - } catch (error) { - // ClickhouseService.insert is buffered/non-throwing in normal operation, but - // guard against unexpected runtime errors so we don't break the probe cycle. - this.logger.warn({ - ...logContext, - event: "anon_retrieval_clickhouse_insert_failed", - message: "Failed to enqueue anonymous retrieval row to ClickHouse", - pieceCid: piece.pieceCid, - spAddress, - error: toStructuredError(error), - }); - } - } else { - this.logger.debug({ + try { + await this.anonRetrievalRepository.save(entity); + } catch (error) { + this.logger.warn({ ...logContext, - event: "anon_retrieval_clickhouse_disabled", - message: "ClickHouse disabled — anon retrieval row not emitted", + event: "anon_retrieval_save_failed", + message: "Failed to persist anonymous retrieval row to Postgres", pieceCid: piece.pieceCid, spAddress, + error: toStructuredError(error), }); } + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, toClickhouseRow(entity)); + this.logger.log({ ...logContext, event: "anon_retrieval_completed", message: "Anonymous retrieval test completed", - retrievalId, + retrievalId: entity.id, pieceCid: piece.pieceCid, spAddress, success: finalPieceResult.success, @@ -226,9 +224,53 @@ export class AnonRetrievalService { } } 
-function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { - if (result.ipniValid === null) return "skipped"; - return result.ipniValid ? "valid" : "invalid"; +function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { + switch (result.ipniValid) { + case null: + return IpniCheckStatus.SKIPPED; + case true: + return IpniCheckStatus.VALID; + case false: + return IpniCheckStatus.INVALID; + default: + throw new Error(`Unexpected IPNI validation result: ${result.ipniValid}`); + } +} + +/** + * Project an AnonRetrieval entity to the chartable subset stored in ClickHouse. + * High-cardinality identifiers (piece_cid, data_set_id, piece_id, ipfs_root_cid), + * URLs (retrieval_endpoint, block_fetch_endpoint), and free-text columns + * (error_message) are intentionally dropped — they live only in Postgres. + */ +function toClickhouseRow(entity: AnonRetrieval): Record { + return { + timestamp: entity.startedAt.getTime(), + probe_location: entity.probeLocation, + sp_address: entity.spAddress, + sp_id: entity.spId != null ? Number(entity.spId) : null, + sp_name: entity.spName, + retrieval_id: entity.id, + raw_size: Number(entity.rawSize), + with_ipfs_indexing: entity.withIpfsIndexing, + service_type: entity.serviceType, + piece_fetch_status: entity.pieceFetchStatus, + http_response_code: entity.httpResponseCode, + first_byte_ms: entity.firstByteMs, + last_byte_ms: entity.lastByteMs, + bytes_retrieved: entity.bytesRetrieved != null ? Number(entity.bytesRetrieved) : null, + throughput_bps: entity.throughputBps != null ? 
Number(entity.throughputBps) : null, + commp_valid: entity.commpValid, + car_parseable: entity.carParseable, + car_block_count: entity.carBlockCount, + block_fetch_valid: entity.blockFetchValid, + block_fetch_sampled_count: entity.blockFetchSampledCount, + block_fetch_failed_count: entity.blockFetchFailedCount, + ipni_status: entity.ipniStatus, + ipni_verify_ms: entity.ipniVerifyMs, + ipni_verified_cids_count: entity.ipniVerifiedCidsCount, + ipni_unverified_cids_count: entity.ipniUnverifiedCidsCount, + }; } function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts index c05dcb5f..4e9e38df 100644 --- a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -1,6 +1,7 @@ import { Module } from "@nestjs/common"; import { ConfigModule } from "@nestjs/config"; import { TypeOrmModule } from "@nestjs/typeorm"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { IpniModule } from "../ipni/ipni.module.js"; @@ -14,7 +15,7 @@ import { PieceRetrievalService } from "./piece-retrieval.service.js"; @Module({ imports: [ ConfigModule, - TypeOrmModule.forFeature([StorageProvider]), + TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), SubgraphModule, WalletSdkModule, HttpClientModule, From 92c40a85fb4798aa74ad03d8490ea4f1e0e62899 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 08:12:26 +0200 Subject: [PATCH 11/55] Revert "refactor: store anon retrieval data primarily in postgres" This reverts commit 6824f752b106f8bbd8e443aa2f74f680a8afe4c1. 
--- .../src/clickhouse/clickhouse.schema.ts | 14 +- apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 120 ----------- .../1776300000000-CreateAnonRetrievals.ts | 72 ------- apps/backend/src/database/types.ts | 12 -- .../anon-retrieval.service.spec.ts | 203 +++++++----------- .../retrieval-anon/anon-retrieval.service.ts | 172 ++++++--------- .../retrieval-anon/retrieval-anon.module.ts | 3 +- 8 files changed, 161 insertions(+), 444 deletions(-) delete mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts delete mode 100644 apps/backend/src/database/migrations/1776300000000-CreateAnonRetrievals.ts diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 5a9a805e..e30f6151 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -70,12 +70,17 @@ export function buildMigrations(database: string): string[] { sp_id Nullable(UInt64), -- storage provider numeric id sp_name Nullable(String), -- storage provider name - retrieval_id UUID, -- per-event correlation id (matches anon_retrievals.id in Postgres) + retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph + data_set_id UInt64, -- on-chain data set id + piece_id UInt64, -- on-chain piece id within the data set raw_size UInt64, -- raw (unpadded) piece size, bytes with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata + ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) + retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). 
CAR/IPNI/block-fetch outcomes live in their own columns. http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure @@ -87,14 +92,17 @@ export function buildMigrations(database: string): string[] { commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable + block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. {spBaseUrl}/ipfs/); null when skipped block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI - ipni_unverified_cids_count Nullable(UInt32) -- CIDs checked but not findable + ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable + + error_message Nullable(String) -- failure reason; null on success ) ENGINE MergeTree() PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index f3f9ed09..9249c3a9 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -7,7 
+7,6 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; -import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -50,7 +49,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { - await queryRunner.query(` - CREATE TYPE anon_retrievals_piece_fetch_status_enum AS ENUM ('success', 'failed') - `); - await queryRunner.query(` - CREATE TYPE anon_retrievals_ipni_status_enum AS ENUM ('valid', 'invalid', 'skipped', 'error') - `); - await queryRunner.query(` - CREATE TYPE anon_retrievals_service_type_enum AS ENUM ('direct_sp', 'ipfs_pin') - `); - - await queryRunner.query(` - CREATE TABLE IF NOT EXISTS anon_retrievals ( - id UUID NOT NULL PRIMARY KEY DEFAULT gen_random_uuid(), - started_at TIMESTAMPTZ NOT NULL, - probe_location VARCHAR NOT NULL, - sp_address VARCHAR NOT NULL, - sp_id BIGINT, - sp_name VARCHAR, - piece_cid VARCHAR NOT NULL, - data_set_id BIGINT NOT NULL, - piece_id BIGINT NOT NULL, - raw_size BIGINT NOT NULL, - with_ipfs_indexing BOOLEAN NOT NULL, - ipfs_root_cid VARCHAR, - service_type anon_retrievals_service_type_enum NOT NULL DEFAULT 'direct_sp', - retrieval_endpoint VARCHAR NOT NULL, - piece_fetch_status anon_retrievals_piece_fetch_status_enum NOT NULL, - http_response_code INTEGER, - first_byte_ms DOUBLE PRECISION, - last_byte_ms DOUBLE PRECISION, - bytes_retrieved BIGINT, - throughput_bps BIGINT, - commp_valid BOOLEAN, - car_parseable BOOLEAN, - car_block_count INTEGER, - block_fetch_endpoint VARCHAR, - block_fetch_valid BOOLEAN, - block_fetch_sampled_count INTEGER, - block_fetch_failed_count INTEGER, - ipni_status 
anon_retrievals_ipni_status_enum NOT NULL, - ipni_verify_ms DOUBLE PRECISION, - ipni_verified_cids_count INTEGER, - ipni_unverified_cids_count INTEGER, - error_message VARCHAR, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() - ) - `); - - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_sp_address_started_at" - ON anon_retrievals (sp_address, started_at) - `); - - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_started_at" - ON anon_retrievals (started_at) - `); - } - - public async down(queryRunner: QueryRunner): Promise { - await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals CASCADE`); - await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_service_type_enum`); - await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_ipni_status_enum`); - await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_piece_fetch_status_enum`); - } -} diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index e09d1dd3..46fd5d28 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,18 +28,6 @@ export enum IpniStatus { FAILED = "failed", } -export enum PieceFetchStatus { - SUCCESS = "success", - FAILED = "failed", -} - -export enum IpniCheckStatus { - VALID = "valid", - INVALID = "invalid", - SKIPPED = "skipped", - ERROR = "error", -} - /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 4f775150..b5f17c57 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -1,9 +1,8 @@ import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { ClickhouseService } from "../clickhouse/clickhouse.service.js"; -import type { AnonRetrieval } from 
"../database/entities/anon-retrieval.entity.js"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { IpniCheckStatus, PieceFetchStatus } from "../database/types.js"; +import { RetrievalStatus } from "../database/types.js"; import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; @@ -27,7 +26,7 @@ const PIECE = { function makeProvider(): StorageProvider { return { address: SP_ADDRESS, - providerId: 7n, + providerId: 7, name: "sp-test", isApproved: true, } as unknown as StorageProvider; @@ -40,7 +39,6 @@ function makeService(opts: { piece?: AnonPiece | null; carResult?: CarValidationResult; validateCarImpl?: () => Promise; - saveImpl?: (entity: AnonRetrieval) => Promise; }): { service: AnonRetrievalService; insertSpy: ReturnType; @@ -49,7 +47,6 @@ function makeService(opts: { metricsRecordStatusSpy: ReturnType; metricsRecordIpniSpy: ReturnType; metricsRecordBlockFetchSpy: ReturnType; - saveSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -62,11 +59,6 @@ function makeService(opts: { findOne: vi.fn(async () => makeProvider()), } as unknown as Repository; - const saveSpy = vi.fn(opts.saveImpl ?? (async (entity: AnonRetrieval) => entity)); - const anonRetrievalRepository = { - save: saveSpy, - } as unknown as Repository; - const anonPieceSelector = { selectPieceForProvider: vi.fn(async () => (opts.piece === null ? null : (opts.piece ?? 
PIECE))), } as unknown as AnonPieceSelectorService; @@ -108,7 +100,6 @@ function makeService(opts: { metrics, clickhouseService, spRepository, - anonRetrievalRepository, ); return { @@ -119,7 +110,6 @@ function makeService(opts: { metricsRecordStatusSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy, - saveSpy, }; } @@ -128,7 +118,7 @@ describe("AnonRetrievalService", () => { vi.clearAllMocks(); }); - it("persists a Postgres row with partial metrics when fetchPiece returns aborted=true", async () => { + it("emits a ClickHouse row with partial metrics when fetchPiece returns aborted=true", async () => { const partial: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -143,59 +133,41 @@ describe("AnonRetrievalService", () => { aborted: true, }; - const { service, saveSpy, insertSpy } = makeService({ pieceResult: partial }); + const { service, insertSpy } = makeService({ pieceResult: partial }); await service.performForProvider(SP_ADDRESS); - expect(saveSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); - expect(entity.bytesRetrieved).toBe(524288n); - expect(entity.firstByteMs).toBe(150); - expect(entity.lastByteMs).toBe(42000); - expect(entity.throughputBps).toBe(12500n); - expect(entity.httpResponseCode).toBe(200); - expect(entity.errorMessage).toContain("Anon retrieval job timeout"); - expect(entity.pieceCid).toBe(PIECE.pieceCid); - expect(entity.spAddress).toBe(SP_ADDRESS); - expect(entity.spId).toBe(7n); - expect(entity.probeLocation).toBe("test-location"); - expect(entity.retrievalEndpoint).toBe(`https://sp.test/piece/${PIECE.pieceCid}`); - expect(typeof entity.id).toBe("string"); - - // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece. 
- expect(entity.carParseable).toBeNull(); - expect(entity.carBlockCount).toBeNull(); - expect(entity.blockFetchEndpoint).toBeNull(); - expect(entity.blockFetchValid).toBeNull(); - expect(entity.blockFetchSampledCount).toBeNull(); - expect(entity.blockFetchFailedCount).toBeNull(); - expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); - - // ClickHouse mirror is also written. expect(insertSpy).toHaveBeenCalledTimes(1); const [table, row] = insertSpy.mock.calls[0] as [string, Record]; expect(table).toBe("anon_retrieval_checks"); - expect(row.retrieval_id).toBe(entity.id); - expect(row.piece_fetch_status).toBe(PieceFetchStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.bytes_retrieved).toBe(524288); expect(row.first_byte_ms).toBe(150); expect(row.last_byte_ms).toBe(42000); expect(row.throughput_bps).toBe(12500); expect(row.http_response_code).toBe(200); - expect(row.ipni_status).toBe(IpniCheckStatus.SKIPPED); - - // Trimmed CH columns must NOT appear (they live only in Postgres). - expect(row).not.toHaveProperty("piece_cid"); - expect(row).not.toHaveProperty("data_set_id"); - expect(row).not.toHaveProperty("piece_id"); - expect(row).not.toHaveProperty("ipfs_root_cid"); - expect(row).not.toHaveProperty("retrieval_endpoint"); - expect(row).not.toHaveProperty("block_fetch_endpoint"); - expect(row).not.toHaveProperty("error_message"); + expect(row.error_message).toContain("Anon retrieval job timeout"); + expect(row.piece_cid).toBe(PIECE.pieceCid); + expect(row.sp_address).toBe(SP_ADDRESS); + expect(row.sp_id).toBe(7); + expect(row.probe_location).toBe("test-location"); + expect(typeof row.retrieval_id).toBe("string"); + + // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every + // dimension column should explicitly say "skipped" (ipni_status) or null. 
+ expect(row.car_parseable).toBeNull(); + expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); }); - it("still persists when the signal aborts before fetchPiece runs", async () => { + it("still emits a row when the signal aborts before fetchPiece runs", async () => { const ac = new AbortController(); ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); @@ -211,21 +183,20 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy, insertSpy, fetchSpy } = makeService({ pieceResult: never }); + const { service, insertSpy, fetchSpy } = makeService({ pieceResult: never }); await service.performForProvider(SP_ADDRESS, ac.signal); expect(fetchSpy).not.toHaveBeenCalled(); - expect(saveSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); - expect(entity.errorMessage).toContain("Anon retrieval job timeout"); - expect(entity.bytesRetrieved).toBeNull(); - expect(entity.firstByteMs).toBeNull(); expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); + expect(row.error_message).toContain("Anon retrieval job timeout"); + expect(row.bytes_retrieved).toBeNull(); + expect(row.first_byte_ms).toBeNull(); }); - it("still persists when fetchPiece throws unexpectedly", async () => { + it("still emits a row when fetchPiece throws unexpectedly", async () => { const never: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -238,7 +209,7 @@ 
describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: never, fetchPieceImpl: async () => { throw new Error("network down"); @@ -247,12 +218,12 @@ describe("AnonRetrievalService", () => { await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); - expect(saveSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); }); - it("does not throw when Postgres save fails and still attempts the CH insert", async () => { + it("skips ClickHouse insert when ClickHouse is disabled", async () => { const ok: PieceRetrievalResult = { success: true, pieceCid: PIECE.pieceCid, @@ -265,20 +236,11 @@ describe("AnonRetrievalService", () => { commPValid: true, }; - const { service, saveSpy, insertSpy } = makeService({ - pieceResult: ok, - saveImpl: async () => { - throw new Error("connection refused"); - }, - }); + const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); - await expect(service.performForProvider(SP_ADDRESS)).resolves.toBeUndefined(); + await service.performForProvider(SP_ADDRESS); - expect(saveSpy).toHaveBeenCalledTimes(1); - // CH still gets the row keyed by the client-side uuid. 
- expect(insertSpy).toHaveBeenCalledTimes(1); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(typeof row.retrieval_id).toBe("string"); + expect(insertSpy).not.toHaveBeenCalled(); }); describe("with IPFS indexing", () => { @@ -302,7 +264,7 @@ describe("AnonRetrievalService", () => { }; } - it("populates CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { + it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { const carResult: CarValidationResult = { carParseable: true, blockCount: 42, @@ -316,7 +278,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, saveSpy, insertSpy, validateCarSpy } = makeService({ + const { service, insertSpy, validateCarSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -325,24 +287,19 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); expect(validateCarSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); - expect(entity.commpValid).toBe(true); - expect(entity.carParseable).toBe(true); - expect(entity.carBlockCount).toBe(42); - expect(entity.blockFetchEndpoint).toBe("https://sp.test/ipfs/"); - expect(entity.blockFetchValid).toBe(true); - expect(entity.blockFetchSampledCount).toBe(5); - expect(entity.blockFetchFailedCount).toBe(0); - expect(entity.ipniStatus).toBe(IpniCheckStatus.VALID); - expect(entity.ipniVerifyMs).toBe(137); - expect(entity.ipniVerifiedCidsCount).toBe(6); - expect(entity.ipniUnverifiedCidsCount).toBe(0); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(PieceFetchStatus.SUCCESS); + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.commp_valid).toBe(true); expect(row.car_parseable).toBe(true); - 
expect(row.ipni_status).toBe(IpniCheckStatus.VALID); + expect(row.car_block_count).toBe(42); + expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); + expect(row.block_fetch_valid).toBe(true); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(0); + expect(row.ipni_status).toBe("valid"); + expect(row.ipni_verify_ms).toBe(137); + expect(row.ipni_verified_cids_count).toBe(6); + expect(row.ipni_unverified_cids_count).toBe(0); }); it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { @@ -359,7 +316,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -367,24 +324,24 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + const [, row] = insertSpy.mock.calls[0] as [string, Record]; // The piece-fetch path still succeeded — failures are surfaced as // independent dimensions, not folded into piece_fetch_status. 
- expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); - expect(entity.carParseable).toBe(true); - expect(entity.ipniStatus).toBe(IpniCheckStatus.INVALID); - expect(entity.ipniVerifiedCidsCount).toBe(0); - expect(entity.ipniUnverifiedCidsCount).toBe(6); - expect(entity.blockFetchValid).toBe(false); - expect(entity.blockFetchSampledCount).toBe(5); - expect(entity.blockFetchFailedCount).toBe(2); + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.car_parseable).toBe(true); + expect(row.ipni_status).toBe("invalid"); + expect(row.ipni_verified_cids_count).toBe(0); + expect(row.ipni_unverified_cids_count).toBe(6); + expect(row.block_fetch_valid).toBe(false); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(2); }); it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { // Distinguishes a real infra outage (e.g. IpniVerificationService down) // from a piece that legitimately had no IPFS indexing. Without the // distinction, an outage looks like normal non-IPFS volume in dashboards. - const { service, saveSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, validateCarImpl: async () => { @@ -397,11 +354,11 @@ describe("AnonRetrievalService", () => { expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error"); expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error"); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.ipniStatus).toBe(IpniCheckStatus.ERROR); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.ipni_status).toBe("error"); // Piece-fetch path itself succeeded — only the validation pipeline failed. 
- expect(entity.commpValid).toBe(true); - expect(entity.carParseable).toBeNull(); + expect(row.commp_valid).toBe(true); + expect(row.car_parseable).toBeNull(); }); it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { @@ -418,7 +375,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: null, }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: okPiece(Buffer.from("not-a-car")), piece: INDEXED_PIECE, carResult, @@ -426,19 +383,19 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.carParseable).toBe(false); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_parseable).toBe(false); // car_block_count and block_fetch_sampled_count are gated on carParseable // so an unparseable CAR doesn't emit a misleading 0. - expect(entity.carBlockCount).toBeNull(); - expect(entity.blockFetchSampledCount).toBeNull(); - expect(entity.blockFetchEndpoint).toBeNull(); - expect(entity.blockFetchValid).toBeNull(); - expect(entity.blockFetchFailedCount).toBeNull(); - expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); - expect(entity.ipniVerifyMs).toBeNull(); - expect(entity.ipniVerifiedCidsCount).toBeNull(); - expect(entity.ipniUnverifiedCidsCount).toBeNull(); + expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); }); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 
d8298776..5343d59a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -4,9 +4,8 @@ import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { IpniCheckStatus, PieceFetchStatus, ServiceType } from "../database/types.js"; +import { RetrievalStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; @@ -30,8 +29,6 @@ export class AnonRetrievalService { private readonly clickhouseService: ClickhouseService, @InjectRepository(StorageProvider) private readonly spRepository: Repository, - @InjectRepository(AnonRetrieval) - private readonly anonRetrievalRepository: Repository, ) {} async performForProvider(spAddress: string, signal?: AbortSignal, logContext?: ProviderJobContext): Promise { @@ -140,75 +137,80 @@ export class AnonRetrievalService { pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", ); } finally { - // Always persist a row — even on abort or unexpected error — so we never - // lose the evidence (ttfb, bytes, response code) we already collected. + // Always emit a ClickHouse row — even on abort or unexpected error — so + // we never lose the evidence (ttfb, bytes, response code) we already + // collected. const finalPieceResult = pieceResult ?? 
buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const retrievalEndpoint = `${spBaseUrl}/piece/${piece.pieceCid}`; - const pieceFetchStatus = finalPieceResult.success ? PieceFetchStatus.SUCCESS : PieceFetchStatus.FAILED; - const ipniStatus: IpniCheckStatus = !validatedCarPiece - ? IpniCheckStatus.SKIPPED - : carResult - ? ipniStatusFromResult(carResult) - : IpniCheckStatus.ERROR; + const retrievalId = randomUUID(); - const entity: AnonRetrieval = { - id: randomUUID(), - createdAt: startedAt, - startedAt, - probeLocation: this.clickhouseService.probeLocation, - spAddress, - spId: provider?.providerId ?? null, - spName: provider?.name ?? null, - pieceCid: piece.pieceCid, - dataSetId: BigInt(piece.dataSetId), - pieceId: BigInt(piece.pieceId), - rawSize: BigInt(piece.rawSize), - withIpfsIndexing: piece.withIPFSIndexing, - ipfsRootCid: piece.ipfsRootCid, - serviceType: ServiceType.DIRECT_SP, - retrievalEndpoint, - pieceFetchStatus, - httpResponseCode: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, - firstByteMs: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, - lastByteMs: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, - bytesRetrieved: finalPieceResult.bytesReceived > 0 ? BigInt(finalPieceResult.bytesReceived) : null, - throughputBps: finalPieceResult.throughputBps > 0 ? BigInt(Math.round(finalPieceResult.throughputBps)) : null, - commpValid: finalPieceResult.success ? finalPieceResult.commPValid : null, - carParseable: carResult ? carResult.carParseable : null, - carBlockCount: carResult?.carParseable ? carResult.blockCount : null, - blockFetchEndpoint: carResult?.blockFetchEndpoint ?? null, - blockFetchValid: carResult ? carResult.blockFetchValid : null, - blockFetchSampledCount: carResult?.carParseable ? 
carResult.sampledCidCount : null, - blockFetchFailedCount: carResult?.blockFetchFailedCount ?? null, - ipniStatus, - ipniVerifyMs: carResult?.ipniVerifyMs ?? null, - ipniVerifiedCidsCount: carResult?.ipniVerifiedCidsCount ?? null, - ipniUnverifiedCidsCount: carResult?.ipniUnverifiedCidsCount ?? null, - errorMessage: finalPieceResult.errorMessage ?? null, - }; + if (this.clickhouseService.enabled) { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; - try { - await this.anonRetrievalRepository.save(entity); - } catch (error) { - this.logger.warn({ + try { + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + piece_fetch_status: pieceFetchStatus, + http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, + throughput_bps: finalPieceResult.throughputBps > 0 ? 
Math.round(finalPieceResult.throughputBps) : null, + commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult?.carParseable ? carResult?.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, + error_message: finalPieceResult.errorMessage ?? null, + }); + } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. 
+ this.logger.warn({ + ...logContext, + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + } + } else { + this.logger.debug({ ...logContext, - event: "anon_retrieval_save_failed", - message: "Failed to persist anonymous retrieval row to Postgres", + event: "anon_retrieval_clickhouse_disabled", + message: "ClickHouse disabled — anon retrieval row not emitted", pieceCid: piece.pieceCid, spAddress, - error: toStructuredError(error), }); } - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, toClickhouseRow(entity)); - this.logger.log({ ...logContext, event: "anon_retrieval_completed", message: "Anonymous retrieval test completed", - retrievalId: entity.id, + retrievalId, pieceCid: piece.pieceCid, spAddress, success: finalPieceResult.success, @@ -224,53 +226,9 @@ export class AnonRetrievalService { } } -function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { - switch (result.ipniValid) { - case null: - return IpniCheckStatus.SKIPPED; - case true: - return IpniCheckStatus.VALID; - case false: - return IpniCheckStatus.INVALID; - default: - throw new Error(`Unexpected IPNI validation result: ${result.ipniValid}`); - } -} - -/** - * Project an AnonRetrieval entity to the chartable subset stored in ClickHouse. - * High-cardinality identifiers (piece_cid, data_set_id, piece_id, ipfs_root_cid), - * URLs (retrieval_endpoint, block_fetch_endpoint), and free-text columns - * (error_message) are intentionally dropped — they live only in Postgres. - */ -function toClickhouseRow(entity: AnonRetrieval): Record { - return { - timestamp: entity.startedAt.getTime(), - probe_location: entity.probeLocation, - sp_address: entity.spAddress, - sp_id: entity.spId != null ? 
Number(entity.spId) : null, - sp_name: entity.spName, - retrieval_id: entity.id, - raw_size: Number(entity.rawSize), - with_ipfs_indexing: entity.withIpfsIndexing, - service_type: entity.serviceType, - piece_fetch_status: entity.pieceFetchStatus, - http_response_code: entity.httpResponseCode, - first_byte_ms: entity.firstByteMs, - last_byte_ms: entity.lastByteMs, - bytes_retrieved: entity.bytesRetrieved != null ? Number(entity.bytesRetrieved) : null, - throughput_bps: entity.throughputBps != null ? Number(entity.throughputBps) : null, - commp_valid: entity.commpValid, - car_parseable: entity.carParseable, - car_block_count: entity.carBlockCount, - block_fetch_valid: entity.blockFetchValid, - block_fetch_sampled_count: entity.blockFetchSampledCount, - block_fetch_failed_count: entity.blockFetchFailedCount, - ipni_status: entity.ipniStatus, - ipni_verify_ms: entity.ipniVerifyMs, - ipni_verified_cids_count: entity.ipniVerifiedCidsCount, - ipni_unverified_cids_count: entity.ipniUnverifiedCidsCount, - }; +function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { + if (result.ipniValid === null) return "skipped"; + return result.ipniValid ? 
"valid" : "invalid"; } function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts index 4e9e38df..c05dcb5f 100644 --- a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -1,7 +1,6 @@ import { Module } from "@nestjs/common"; import { ConfigModule } from "@nestjs/config"; import { TypeOrmModule } from "@nestjs/typeorm"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { IpniModule } from "../ipni/ipni.module.js"; @@ -15,7 +14,7 @@ import { PieceRetrievalService } from "./piece-retrieval.service.js"; @Module({ imports: [ ConfigModule, - TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), + TypeOrmModule.forFeature([StorageProvider]), SubgraphModule, WalletSdkModule, HttpClientModule, From d4f7d802f93a3c48cec49bc9f145bc28c2815ea3 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 08:29:28 +0200 Subject: [PATCH 12/55] refactor(retrieval-anon): introduce IpniCheckStatus enum and drop redundant clickhouse-enabled gate - Replace string literals ("valid"|"invalid"|"skipped"|"error") with IpniCheckStatus enum in anon-retrieval.service.ts - Drop the `if (clickhouseService.enabled)` wrapper around the insert call; ClickhouseService.insert is already a no-op when disabled, matching the pattern used by other retrieval flows - Fix outdated ipni_status schema comment to include the 'error' value --- .../src/clickhouse/clickhouse.schema.ts | 2 +- apps/backend/src/database/types.ts | 7 + .../anon-retrieval.service.spec.ts | 23 +-- .../retrieval-anon/anon-retrieval.service.ts | 133 +++++++++--------- 4 files changed, 74 insertions(+), 
91 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index e30f6151..05684154 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -97,7 +97,7 @@ export function buildMigrations(database: string): string[] { block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index 46fd5d28..c56b355a 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,6 +28,13 @@ export enum IpniStatus { FAILED = "failed", } +export enum IpniCheckStatus { + VALID = "valid", + INVALID = "invalid", + SKIPPED = "skipped", + ERROR = "error", +} + /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index b5f17c57..c82eed76 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -35,7 +35,6 @@ function makeProvider(): StorageProvider { function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; - clickhouseEnabled?: boolean; 
piece?: AnonPiece | null; carResult?: CarValidationResult; validateCarImpl?: () => Promise; @@ -51,7 +50,7 @@ function makeService(opts: { const insertSpy = vi.fn(); const clickhouseService = { insert: insertSpy, - enabled: opts.clickhouseEnabled ?? true, + enabled: true, probeLocation: "test-location", } as unknown as ClickhouseService; @@ -223,26 +222,6 @@ describe("AnonRetrievalService", () => { expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); }); - it("skips ClickHouse insert when ClickHouse is disabled", async () => { - const ok: PieceRetrievalResult = { - success: true, - pieceCid: PIECE.pieceCid, - bytesReceived: 1024, - pieceBytes: null, - latencyMs: 100, - ttfbMs: 10, - throughputBps: 10240, - statusCode: 200, - commPValid: true, - }; - - const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); - - await service.performForProvider(SP_ADDRESS); - - expect(insertSpy).not.toHaveBeenCalled(); - }); - describe("with IPFS indexing", () => { const INDEXED_PIECE: AnonPiece = { ...PIECE, diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 5343d59a..c1d08c0e 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -5,7 +5,7 @@ import type { Repository } from "typeorm"; import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { RetrievalStatus, ServiceType } from "../database/types.js"; +import { IpniCheckStatus, RetrievalStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService 
} from "../wallet-sdk/wallet-sdk.service.js"; @@ -108,13 +108,17 @@ export class AnonRetrievalService { this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult)); this.metrics.recordBlockFetchStatus( labels, - carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", + carResult.blockFetchValid === null + ? IpniCheckStatus.SKIPPED + : carResult.blockFetchValid + ? IpniCheckStatus.VALID + : IpniCheckStatus.INVALID, ); } catch (error) { // Validation was attempted on a successful piece retrieval but threw. this.metrics.recordCarParseStatus(labels, false); - this.metrics.recordIpniStatus(labels, "error"); - this.metrics.recordBlockFetchStatus(labels, "error"); + this.metrics.recordIpniStatus(labels, IpniCheckStatus.ERROR); + this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.ERROR); this.logger.warn({ ...logContext, event: "anon_retrieval_car_validation_failed", @@ -126,8 +130,8 @@ export class AnonRetrievalService { } } else if (!pieceResult.success) { // Piece retrieval failed — IPNI and block fetch were skipped - this.metrics.recordIpniStatus(labels, "skipped"); - this.metrics.recordBlockFetchStatus(labels, "skipped"); + this.metrics.recordIpniStatus(labels, IpniCheckStatus.SKIPPED); + this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.SKIPPED); } // Overall check duration and status @@ -139,70 +143,63 @@ export class AnonRetrievalService { } finally { // Always emit a ClickHouse row — even on abort or unexpected error — so // we never lose the evidence (ttfb, bytes, response code) we already - // collected. + // collected. ClickhouseService.insert is a no-op when disabled. const finalPieceResult = pieceResult ?? buildAbortedPlaceholder(piece.pieceCid, signal?.reason); const retrievalId = randomUUID(); - - if (this.clickhouseService.enabled) { - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? 
spAddress; - const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; - - try { - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { - timestamp: startedAt.getTime(), - probe_location: this.clickhouseService.probeLocation, - sp_address: spAddress, - sp_id: provider?.providerId != null ? Number(provider.providerId) : null, - sp_name: provider?.name ?? null, - retrieval_id: retrievalId, - piece_cid: piece.pieceCid, - data_set_id: piece.dataSetId, - piece_id: piece.pieceId, - raw_size: piece.rawSize, - with_ipfs_indexing: piece.withIPFSIndexing, - ipfs_root_cid: piece.ipfsRootCid, - service_type: ServiceType.DIRECT_SP, - retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - piece_fetch_status: pieceFetchStatus, - http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, - first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, - last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, - bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, - throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, - commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, - car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult?.carParseable ? carResult?.blockCount : null, - block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, - block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, - block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, - ipni_status: ipniStatus, - ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? 
null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, - error_message: finalPieceResult.errorMessage ?? null, - }); - } catch (error) { - // ClickhouseService.insert is buffered/non-throwing in normal operation, but - // guard against unexpected runtime errors so we don't break the probe cycle. - this.logger.warn({ - ...logContext, - event: "anon_retrieval_clickhouse_insert_failed", - message: "Failed to enqueue anonymous retrieval row to ClickHouse", - pieceCid: piece.pieceCid, - spAddress, - error: toStructuredError(error), - }); - } - } else { - this.logger.debug({ + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus: IpniCheckStatus = !validatedCarPiece + ? IpniCheckStatus.SKIPPED + : carResult + ? ipniStatusFromResult(carResult) + : IpniCheckStatus.ERROR; + + try { + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + piece_fetch_status: pieceFetchStatus, + http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytes_retrieved: finalPieceResult.bytesReceived > 0 ? 
finalPieceResult.bytesReceived : null, + throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, + commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult?.carParseable ? carResult?.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, + error_message: finalPieceResult.errorMessage ?? null, + }); + } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. + this.logger.warn({ ...logContext, - event: "anon_retrieval_clickhouse_disabled", - message: "ClickHouse disabled — anon retrieval row not emitted", + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", pieceCid: piece.pieceCid, spAddress, + error: toStructuredError(error), }); } @@ -226,9 +223,9 @@ export class AnonRetrievalService { } } -function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { - if (result.ipniValid === null) return "skipped"; - return result.ipniValid ? "valid" : "invalid"; +function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { + if (result.ipniValid === null) return IpniCheckStatus.SKIPPED; + return result.ipniValid ? 
IpniCheckStatus.VALID : IpniCheckStatus.INVALID; } function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { From ab3748a047415581dbc1aa2ed09651ff4f11d80e Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 09:00:59 +0200 Subject: [PATCH 13/55] remove(retrieval-anon): dedup window logic --- .../anon-piece-selector.service.spec.ts | 20 ------------ .../anon-piece-selector.service.ts | 31 ------------------- 2 files changed, 51 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts index 32d13719..30a04486 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -100,26 +100,6 @@ describe("AnonPieceSelectorService", () => { expect(result?.pieceCid).toBe(liveCid); }); - it("redraws when the first sampled piece was recently selected by this process", async () => { - const staleCid = "baga-stale"; - const freshCid = "baga-fresh"; - - const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); - - // Prime the in-memory ring buffer by first selecting `staleCid`. - sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: staleCid })); - const first = await service.selectPieceForProvider(SP_ADDRESS); - expect(first?.pieceCid).toBe(staleCid); - - // Now the second selection should skip `staleCid` and use `freshCid`. - sampleAnonPiece - .mockResolvedValueOnce(makePiece({ pieceCid: staleCid })) - .mockResolvedValueOnce(makePiece({ pieceCid: freshCid })); - const second = await service.selectPieceForProvider(SP_ADDRESS); - - expect(second?.pieceCid).toBe(freshCid); - }); - it("falls back to the opposite pool when the preferred one is empty", async () => { // First pool call returns nothing twice (both attempts), second pool succeeds. 
const fresh = makePiece({ pieceCid: "baga-other-pool" }); diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 342a4780..0ee51fc7 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -7,11 +7,6 @@ import { SubgraphService } from "../subgraph/subgraph.service.js"; import type { AnonCandidatePiece } from "../subgraph/types.js"; import type { AnonPiece } from "./types.js"; -/** - * Number of most-recently-tested piece CIDs to exclude from re-selection. - */ -const RECENT_DEDUP_WINDOW = 500; - /** * Piece size buckets, in raw (unpadded) bytes. Weighted sampling across * these buckets keeps tests meaningful for bandwidth measurement without @@ -47,10 +42,6 @@ const IPFS_INDEXED_SAMPLE_RATE = 0.8; export class AnonPieceSelectorService { private readonly logger = new Logger(AnonPieceSelectorService.name); - /** Bounded FIFO of recently-selected piece CIDs. Process-local; lost on restart. */ - private readonly recentlyTested = new Set(); - private readonly recentlyTestedQueue: string[] = []; - constructor( private readonly subgraphService: SubgraphService, private readonly configService: ConfigService, @@ -91,7 +82,6 @@ export class AnonPieceSelectorService { }); if (piece) { - this.rememberRecent(piece.pieceCid); this.logger.log({ event: "anon_piece_selected", message: "Selected anonymous piece for retrieval test", @@ -158,10 +148,6 @@ export class AnonPieceSelectorService { continue; } - if (this.recentlyTested.has(piece.pieceCid)) { - continue; - } - return piece; } @@ -179,23 +165,6 @@ export class AnonPieceSelectorService { } return "medium"; } - - /** Push a CID into the bounded FIFO; evict the oldest when at capacity. 
*/ - private rememberRecent(pieceCid: string): void { - if (this.recentlyTested.has(pieceCid)) { - return; - } - - this.recentlyTested.add(pieceCid); - this.recentlyTestedQueue.push(pieceCid); - - while (this.recentlyTestedQueue.length > RECENT_DEDUP_WINDOW) { - const evicted = this.recentlyTestedQueue.shift(); - if (evicted !== undefined) { - this.recentlyTested.delete(evicted); - } - } - } } /** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ From beffac7be083ae84e56bfba5818a48257d4b4922 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 09:09:36 +0200 Subject: [PATCH 14/55] revert(ipni): sequential block CID verification Context: https://github.com/filecoin-project/filecoin-pin/issues/417 --- .../src/clickhouse/clickhouse.schema.ts | 4 +- .../src/ipni/ipni-verification.service.ts | 122 +++++++----------- .../anon-retrieval.service.spec.ts | 16 +-- .../retrieval-anon/anon-retrieval.service.ts | 2 - .../retrieval-anon/car-validation.service.ts | 12 +- apps/backend/src/retrieval-anon/types.ts | 2 - 6 files changed, 53 insertions(+), 105 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 05684154..e8612056 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -97,10 +97,8 @@ export function buildMigrations(database: string): string[] { block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' — all-or-nothing across the root CID and the sampled child CIDs (filecoin-pin verifies them as a single batch) ipni_verify_ms Nullable(Float64), -- 
IPNI verification duration; null when skipped - ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI - ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable error_message Nullable(String) -- failure reason; null on success ) ENGINE MergeTree() diff --git a/apps/backend/src/ipni/ipni-verification.service.ts b/apps/backend/src/ipni/ipni-verification.service.ts index 51fcc8e0..3d7d52f9 100644 --- a/apps/backend/src/ipni/ipni-verification.service.ts +++ b/apps/backend/src/ipni/ipni-verification.service.ts @@ -3,7 +3,7 @@ import { PDPProvider } from "filecoin-pin"; import { waitForIpniProviderResults } from "filecoin-pin/core/utils"; import { CID } from "multiformats/cid"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import type { FailedCID, IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; +import type { IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; export type IpniVerificationInput = { rootCid: CID; @@ -44,6 +44,7 @@ export class IpniVerificationService { const expectedProviders = [this.buildExpectedProviderInfo(storageProvider as StorageProviderWithUrl)]; const timeoutSignal = AbortSignal.timeout(timeoutMs); const verificationSignal = signal ? 
AbortSignal.any([signal, timeoutSignal]) : timeoutSignal; + let failureReason = "IPNI did not return expected provider results via filecoin-pin"; this.logger.log({ event: "ipni_verification_started", @@ -60,69 +61,56 @@ export class IpniVerificationService { }); const ipniVerificationStartTime = Date.now(); - const cidsToValidate: { cid: CID; isRoot: boolean }[] = [ - { cid: rootCid, isRoot: true }, - ...blockCids.map((cid) => ({ cid, isRoot: false })), - ]; - let verified = 0; - const failedCIDs: FailedCID[] = []; - let rootCIDVerified = false; - - // waitForIpniProviderResults is all-or-nothing per call (throws on first failure), - // so we invoke it once per CID to get accurate per-CID verified/unverified counts. - // The shared verificationSignal bounds total wall-clock time across all CIDs. - for (const { cid, isRoot } of cidsToValidate) { + const ipniValidated = await waitForIpniProviderResults(rootCid, { + childBlocks: blockCids, + maxAttempts, + delayMs, + expectedProviders, + signal: verificationSignal, + }).catch((error) => { if (signal?.aborted) { signal.throwIfAborted(); } - if (verificationSignal.aborted) { - failedCIDs.push({ cid: cid.toString(), reason: `IPNI verification timed out after ${timeoutMs}ms` }); - continue; - } - - try { - await waitForIpniProviderResults(cid, { - maxAttempts, - delayMs, - expectedProviders, - signal: verificationSignal, - }); - verified += 1; - if (isRoot) rootCIDVerified = true; - } catch (error) { - if (signal?.aborted) { - signal.throwIfAborted(); - } - - const reason = verificationSignal.aborted - ? `IPNI verification timed out after ${timeoutMs}ms` - : error instanceof Error - ? 
error.message - : String(error); - - failedCIDs.push({ cid: cid.toString(), reason }); - - this.logger.warn({ - event: "ipni_cid_verification_failed", - message: "IPNI verification failed for CID", - cid: cid.toString(), - isRoot, + failureReason = `IPNI verification timed out after ${timeoutMs}ms`; + this.logger.error({ + event: "ipni_verification_timed_out", + message: failureReason, + rootCID: rootCid.toString(), providerAddress: storageProvider.address, providerId: storageProvider.providerId, providerName: storageProvider.name, serviceUrl: storageProvider.serviceUrl, - failureReason: reason, + blockCIDCount: blockCids.length, + timeoutMs, + pollIntervalMs: delayMs, + maxAttempts, }); + return false; } - } + const errorMessage = error instanceof Error ? error.message : String(error); + failureReason = errorMessage; + this.logger.error({ + event: "ipni_verification_failed", + message: "IPNI verification failed", + rootCID: rootCid.toString(), + providerAddress: storageProvider.address, + providerId: storageProvider.providerId, + providerName: storageProvider.name, + serviceUrl: storageProvider.serviceUrl, + blockCIDCount: blockCids.length, + timeoutMs, + pollIntervalMs: delayMs, + maxAttempts, + failureReason, + }); + return false; + }); const ipniVerificationDurationMs = Date.now() - ipniVerificationStartTime; - const total = cidsToValidate.length; - const unverified = total - verified; - if (verified === total) { + if (ipniValidated) { this.logger.log({ event: "ipni_verification_succeeded", message: "IPNI verification succeeded", @@ -133,32 +121,22 @@ export class IpniVerificationService { verifyDurationMs: ipniVerificationDurationMs, blockCIDCount: blockCids.length, }); - } else { - this.logger.error({ - event: verificationSignal.aborted ? 
"ipni_verification_timed_out" : "ipni_verification_failed", - message: "IPNI verification did not fully succeed", - rootCID: rootCid.toString(), - providerAddress: storageProvider.address, - providerId: storageProvider.providerId, - providerName: storageProvider.name, - serviceUrl: storageProvider.serviceUrl, - blockCIDCount: blockCids.length, - timeoutMs, - pollIntervalMs: delayMs, - maxAttempts, - verified, - unverified, - total, - }); } return { - verified: verified, - unverified: unverified, - total: total, - rootCIDVerified: rootCIDVerified, + verified: ipniValidated ? 1 : 0, + unverified: ipniValidated ? 0 : 1, + total: 1, + rootCIDVerified: ipniValidated, durationMs: ipniVerificationDurationMs, - failedCIDs: failedCIDs, + failedCIDs: ipniValidated + ? [] + : [ + { + cid: rootCid.toString(), + reason: failureReason, + }, + ], verifiedAt: new Date().toISOString(), }; } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index c82eed76..adc75920 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -162,8 +162,6 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBeNull(); expect(row.ipni_status).toBe("skipped"); expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); }); it("still emits a row when the signal aborts before fetchPiece runs", async () => { @@ -250,8 +248,6 @@ describe("AnonRetrievalService", () => { sampledCidCount: 5, ipniValid: true, ipniVerifyMs: 137, - ipniVerifiedCidsCount: 6, - ipniUnverifiedCidsCount: 0, blockFetchValid: true, blockFetchFailedCount: 0, blockFetchEndpoint: "https://sp.test/ipfs/", @@ -277,19 +273,15 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBe(0); expect(row.ipni_status).toBe("valid"); 
expect(row.ipni_verify_ms).toBe(137); - expect(row.ipni_verified_cids_count).toBe(6); - expect(row.ipni_unverified_cids_count).toBe(0); }); - it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { + it("distinguishes IPNI invalid from block-fetch failures", async () => { const carResult: CarValidationResult = { carParseable: true, blockCount: 100, sampledCidCount: 5, ipniValid: false, ipniVerifyMs: 250, - ipniVerifiedCidsCount: 0, - ipniUnverifiedCidsCount: 6, blockFetchValid: false, blockFetchFailedCount: 2, blockFetchEndpoint: "https://sp.test/ipfs/", @@ -309,8 +301,6 @@ describe("AnonRetrievalService", () => { expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); expect(row.car_parseable).toBe(true); expect(row.ipni_status).toBe("invalid"); - expect(row.ipni_verified_cids_count).toBe(0); - expect(row.ipni_unverified_cids_count).toBe(6); expect(row.block_fetch_valid).toBe(false); expect(row.block_fetch_sampled_count).toBe(5); expect(row.block_fetch_failed_count).toBe(2); @@ -347,8 +337,6 @@ describe("AnonRetrievalService", () => { sampledCidCount: 0, ipniValid: null, ipniVerifyMs: null, - ipniVerifiedCidsCount: null, - ipniUnverifiedCidsCount: null, blockFetchValid: null, blockFetchFailedCount: null, blockFetchEndpoint: null, @@ -373,8 +361,6 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBeNull(); expect(row.ipni_status).toBe("skipped"); expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); }); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index c1d08c0e..25b34e82 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -186,8 +186,6 @@ export class AnonRetrievalService { block_fetch_failed_count: 
carResult?.blockFetchFailedCount ?? null, ipni_status: ipniStatus, ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, error_message: finalPieceResult.errorMessage ?? null, }); } catch (error) { diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 27ec2744..c3a6c717 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -63,8 +63,6 @@ export class CarValidationService { sampledCidCount: 0, ipniValid: null, ipniVerifyMs: null, - ipniVerifiedCidsCount: null, - ipniUnverifiedCidsCount: null, blockFetchValid: null, blockFetchFailedCount: null, blockFetchEndpoint: null, @@ -77,8 +75,6 @@ export class CarValidationService { sampledCidCount: 0, ipniValid: null, ipniVerifyMs: null, - ipniVerifiedCidsCount: null, - ipniUnverifiedCidsCount: null, blockFetchValid: null, blockFetchFailedCount: null, blockFetchEndpoint: null, @@ -99,8 +95,6 @@ export class CarValidationService { sampledCidCount: sampledBlocks.length, ipniValid: ipni.valid, ipniVerifyMs: ipni.durationMs, - ipniVerifiedCidsCount: ipni.verifiedCount, - ipniUnverifiedCidsCount: ipni.unverifiedCount, blockFetchValid: blockFetchResult.valid, blockFetchFailedCount: blockFetchResult.failedCount, blockFetchEndpoint: blockFetchResult.endpoint, @@ -129,8 +123,6 @@ export class CarValidationService { ): Promise<{ valid: boolean; durationMs: number | null; - verifiedCount: number | null; - unverifiedCount: number | null; }> { const timeouts = this.configService.get("timeouts", { infer: true }); let rootCid: CID; @@ -144,7 +136,7 @@ export class CarValidationService { providerAddress: provider.address, error: toStructuredError(error), }); - return { valid: false, durationMs: null, verifiedCount: null, unverifiedCount: null 
}; + return { valid: false, durationMs: null }; } const result = await this.ipniVerificationService.verify({ @@ -159,8 +151,6 @@ export class CarValidationService { return { valid: result.rootCIDVerified, durationMs: result.durationMs, - verifiedCount: result.verified, - unverifiedCount: result.unverified, }; } diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts index 3ba2b9f9..9013a5ea 100644 --- a/apps/backend/src/retrieval-anon/types.ts +++ b/apps/backend/src/retrieval-anon/types.ts @@ -31,8 +31,6 @@ export type CarValidationResult = { sampledCidCount: number; ipniValid: boolean | null; ipniVerifyMs: number | null; - ipniVerifiedCidsCount: number | null; - ipniUnverifiedCidsCount: number | null; blockFetchValid: boolean | null; blockFetchFailedCount: number | null; blockFetchEndpoint: string | null; From f26744b8dfc661e779fae511cdc56a9985942e2c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 11:18:50 +0200 Subject: [PATCH 15/55] docs(retrieval-anon): flow description and metrics definitions --- .../anon-piece-selector.service.ts | 6 +- docs/checks/README.md | 3 +- docs/checks/anon-retrievals.md | 145 ++++++++++++++++++ docs/checks/events-and-metrics.md | 20 +++ 4 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 docs/checks/anon-retrievals.md diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 0ee51fc7..d354a222 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -18,13 +18,15 @@ type SizeRange = { min: bigint; max: bigint }; const MIB = 1024n * 1024n; // All downloads are buffered in-memory, so we need to keep piece sizes reasonable +// When changing these values, also update ./docs/checks/anon-retrievals.md#piece-selection const SIZE_BUCKETS: Record = { small: { min: 1n * MIB, max: 20n * 
MIB - 1n }, medium: { min: 20n * MIB, max: 100n * MIB - 1n }, large: { min: 100n * MIB, max: 500n * MIB - 1n }, }; -/** Weights for choosing a bucket per selection. Must sum to 1. */ +// Weights for choosing a bucket per selection. Must sum to 1. +// When changing these values, also update ./docs/checks/anon-retrievals.md#piece-selection const BUCKET_WEIGHTS: Record = { small: 0.2, medium: 0.5, @@ -35,6 +37,8 @@ const BUCKET_WEIGHTS: Record = { * Probability the primary draw targets the withIPFSIndexing pool. * The rest of the time we sample across all FWSS pieces, so SPs can't * optimise only their CAR corpus. + * + * When changing this value, also update ./docs/checks/anon-retrievals.md#piece-selection */ const IPFS_INDEXED_SAMPLE_RATE = 0.8; diff --git a/docs/checks/README.md b/docs/checks/README.md index 74b1a872..136349ee 100644 --- a/docs/checks/README.md +++ b/docs/checks/README.md @@ -4,6 +4,7 @@ The files are: - [production-configuration-and-approval-methodology.md](./production-configuration-and-approval-methodology.md): Defines the production configuration and approval methodology. - [data-storage.md](./data-storage.md): Defines the "data storage check" and how it is calculated. - [retrievals.md](./retrievals.md): Defines the "retrieval check" and how it is calculated. +- [anon-retrievals.md](./anon-retrievals.md): Defines the "anonymous retrieval check" (sampled public pieces, not dealbot-uploaded) and how it is calculated. - [data-retention.md](./data-retention.md): Defines the "data retention check" and how it is calculated. - [events-and-metrics.md](./events-and-metrics.md): Defines the events and metrics that are used to assess SP performance. @@ -14,7 +15,7 @@ DealBot creates synthetic traffic for SPs in the onchain SP registry and monitor ## Terminology ### Check -A "check" refers to a task type that dealbot performs on a SP. We currently have [Data Storage](./data-storage.md) and [Retrieval](./retrievals.md) checks. 
+A "check" refers to a task type that dealbot performs on an SP. We currently have [Data Storage](./data-storage.md), [Retrieval](./retrievals.md), [Anonymous Retrieval](./anon-retrievals.md), and [Data Retention](./data-retention.md) checks. ### Deal This is synonym for "Data Storage Check". This is covered in the [data-storage.md](./data-storage.md). diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md new file mode 100644 index 00000000..0a303462 --- /dev/null +++ b/docs/checks/anon-retrievals.md @@ -0,0 +1,145 @@ +# Anonymous Retrieval Check + +This document is the **source of truth** for how dealbot's Anonymous Retrieval check works. + +Source code links throughout this document point to the current implementation. + +For event and metric definitions to be used by the dashboard, see [Dealbot Events & Metrics](./events-and-metrics.md). + +## Overview + +The Anonymous Retrieval check (sometimes referred to internally as [retrieval++](https://github.com/FilOzone/dealbot/pull/427)) tests publicly discoverable pieces on a storage provider (pieces that were *not* uploaded by dealbot). The intent is to measure SP retrievability against real-world tenant data, not just dealbot's own corpus. + +This is distinct from the [Retrieval check](./retrievals.md), which exercises pieces dealbot itself uploaded as part of a [Data Storage check](./data-storage.md). The Anonymous Retrieval check answers a different question: does the SP serve arbitrary pieces from its broader public corpus, with the same correctness and performance properties as dealbot's controlled pieces? + +### Definition of Successful Retrieval + +A successful anonymous retrieval requires: + +1. **Piece fetch** — `GET {spBaseUrl}/piece/{pieceCid}` returns HTTP 2xx and the response bytes hash to the declared CommP (piece CID). + +If the piece advertises IPFS indexing (`withIPFSIndexing = true` and a non-null `ipfsRootCid`), three additional dimensions are validated *independently*. 
Importantly, they do not gate the overall `piece_fetch_status`, and each is recorded as its own outcome column / metric: + +2. **CAR parseable:** the fetched bytes parse as a CAR file. +3. **IPNI:** the SP is advertised as a provider for the root CID and a sample of child CIDs via filecoinpin.contact. +4. **Block fetch:** a sample of CIDs from the parsed CAR is re-fetched via `{spBaseUrl}/ipfs/{cid}?format=raw` and each response is hash-verified against its declared CID. + +A piece without IPFS indexing is exercised only at step (1). + +Operational timeouts exist to prevent jobs from running indefinitely. If the job exceeds `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, it is aborted; a row is still emitted so that partial metrics (TTFB, bytes, response code) are not lost. + +## Piece Selection + +Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not retrieve from its own deals. Pieces are sampled from the on-chain subgraph of all FWSS-served pieces for the SP under test. + +Selection strategy (per scheduled job, per SP): + +1. **Pick a size bucket** by weighted random: + - `small` (1–20 MiB) — 20% + - `medium` (20–100 MiB) — 50% + - `large` (100–500 MiB) — 30% +2. **Pick a pool**: + - `indexed` (IPFS-indexed pieces) — 80% + - `any` (all FWSS pieces) — 20% +3. **Generate a uniform-random `sampleKey`** and query the subgraph for the smallest `Root.sampleKey ≥ $sampleKey` matching the SP, payer, size range, and pool filters. +4. **Drop the candidate** if `pdpPaymentEndEpoch` has passed. +5. **Fall back** through: (same bucket, opposite pool) → (any bucket, indexed) → (any bucket, any). + +The 80/20 split for `indexed` vs `any` exists so that SPs cannot optimize only their CAR corpus and still appear healthy on this check. + +> [!NOTE] +> The bucket sizes were chosen such that the whole file will still fit into memory. In the future we may implement a streaming verification and parsing. 
+ +Source: [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) + +## What Happens Each Cycle + +```mermaid +flowchart TD + Select["Sample anonymous piece for SP from subgraph"] --> Fetch["GET /piece/{pieceCid}"] + Fetch --> CommP["Hash bytes → verify CommP"] + CommP --> HasIpfs{"piece.withIPFSIndexing
and ipfsRootCid?"} + HasIpfs -- "no" --> Record["Persist row + metrics"] + HasIpfs -- "yes" --> ParseCar["Parse bytes as CAR"] + ParseCar --> SampleBlocks["Pick N random CIDs
(ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT)"] + SampleBlocks --> Ipni["IPNI: verify SP advertises root + sampled CIDs"] + SampleBlocks --> BlockFetch["GET /ipfs/{cid}?format=raw for each sampled CID"] + BlockFetch --> HashCheck["Hash-verify each response against its CID"] + Ipni --> Record + HashCheck --> Record +``` + +### Piece Fetch + +- **URL:** `{spBaseUrl}/piece/{pieceCid}` (HTTP/2) +- **Buffered in memory** — piece sizes are capped at 500 MiB by selection. +- **Validates CommP** — the CommP of the response bytes must match `pieceCid`. + +Source: [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) + +### CAR Validation (only when piece advertises IPFS indexing) + +When the selected piece has `withIPFSIndexing = true` and a non-null `ipfsRootCid`, the fetched bytes are parsed as a CAR and a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is exercised: + +- **IPNI check:** `IpniVerificationService.verify(rootCid, sampledCids, sp)` polls filecoinpin.contact until the root CID and every sampled CID resolve to the SP under test, or until `IPNI_VERIFICATION_TIMEOUT_MS` elapses. The root and sampled CIDs are verified as a single batch, so the outcome is all-or-nothing. +- **Block fetch check:** for each sampled CID, fetch `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verify the response against the CID. A non-2xx response, hash mismatch, unsupported codec, or transport error each marks that block as failed. + +Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) + +## What Gets Asserted + +| # | Assertion | How It's Checked | Retries | Relevant Metric | Implemented?
| +|---|-----------|------------------|:---:|------------------|:---:| +| 1 | SP serves the piece | `GET /piece/{pieceCid}` returns HTTP 2xx | 0 | [`anonPieceRetrievalLastByteMs`](./events-and-metrics.md#anonPieceRetrievalLastByteMs) | Yes | +| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonRetrievalStatus`](./events-and-metrics.md#anonRetrievalStatus) | Yes | +| 3 | Bytes parse as a CAR (IPFS-indexed pieces only) | `@ipld/car` parses the response | 0 | [`anonCarParseStatus`](./events-and-metrics.md#anonCarParseStatus) | Yes | +| 4 | SP is advertised on IPNI for root + sampled CIDs | filecoinpin.contact returns provider records | polling until timeout | [`anonIpniStatus`](./events-and-metrics.md#anonIpniStatus) | Yes | +| 5 | Sampled blocks fetch + hash-verify | `/ipfs/{cid}?format=raw` for each sample | 0 | [`anonBlockFetchStatus`](./events-and-metrics.md#anonBlockFetchStatus) | Yes | + +## Result Recording + +Each anonymous retrieval attempt writes one row to the `anon_retrieval_checks` ClickHouse table. The row is emitted **even on abort or unexpected error** so that the partial evidence (TTFB, bytes, response code) is preserved. + +The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend/src/clickhouse/clickhouse.schema.ts) are authoritative. The summary below is for orientation. 
+ +| Column | Meaning | +|--------|---------| +| `timestamp` | When the check started (ms UTC) | +| `probe_location` | Dealbot probe location (`DEALBOT_PROBE_LOCATION`) | +| `sp_address`, `sp_id`, `sp_name` | SP identity | +| `retrieval_id` | Per-event UUID; correlates row to logs and Prometheus | +| `piece_cid`, `data_set_id`, `piece_id`, `raw_size` | Sampled piece identity | +| `with_ipfs_indexing`, `ipfs_root_cid` | Whether the piece advertises IPNI metadata | +| `service_type` | Always `direct_sp` today | +| `retrieval_endpoint` | URL probed for piece fetch | +| `piece_fetch_status` | `success` or `failed` — outcome of `/piece/{cid}` (HTTP 2xx **and** CommP match). CAR/IPNI/block-fetch outcomes live in their own columns and do **not** flip this status. | +| `http_response_code` | Raw HTTP status; null on transport failure | +| `first_byte_ms`, `last_byte_ms`, `bytes_retrieved`, `throughput_bps` | Piece-fetch performance | +| `commp_valid` | Null when retrieval failed before CommP could be hashed | +| `car_parseable`, `car_block_count` | Null when CAR validation was skipped (no IPFS indexing or piece fetch failed) | +| `block_fetch_endpoint`, `block_fetch_valid`, `block_fetch_sampled_count`, `block_fetch_failed_count` | Block-fetch outcomes; null when skipped | +| `ipni_status` | `valid` \| `invalid` \| `skipped` \| `error` — all-or-nothing across the root CID and the sampled child CIDs | +| `ipni_verify_ms` | IPNI verification duration; null when skipped | +| `error_message` | Failure reason; null on success | + +Source: [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) + +## Metrics Recorded + +Anonymous-retrieval Prometheus metric definitions live in [Dealbot Events & Metrics](./events-and-metrics.md). All anon-retrieval metrics carry `checkType=anon_retrieval`.
+ +## Configuration + +Key environment variables that control anonymous retrieval testing: + +| Variable | Description | +|----------|-------------| +| `RETRIEVALS_ANON_PER_SP_PER_HOUR` | Anonymous retrieval rate per SP. Falls back to `RETRIEVALS_PER_SP_PER_HOUR` when unset. | +| `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS` | Max end-to-end anon retrieval job runtime before forced abort (default 360s). | +| `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` | Number of CIDs sampled from the parsed CAR for IPNI + block-fetch verification (default 5, max 50). | +| `IPNI_VERIFICATION_TIMEOUT_MS` | Max time to wait for IPNI provider verification (shared with the Retrieval check). | +| `IPNI_VERIFICATION_POLLING_MS` | Poll interval between IPNI verification attempts (shared). | +| `CONNECT_TIMEOUT_MS` | Connection/header timeout for HTTP requests. | +| `HTTP2_REQUEST_TIMEOUT_MS` | Total timeout for HTTP/2 retrieval requests. | + +See also: [`docs/environment-variables.md`](../environment-variables.md) for the full configuration reference. diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 6c461f7f..f5d89b23 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -4,6 +4,16 @@ This document is the intended **source of truth** for the events emitted by deal > **Note on "events":** the entries in the [Event List](#event-list) are named **timing markers** used to define metric Timer Starts/Ends — they are not all emitted as discrete Prometheus events or log lines. Each marker is anchored in code (as a timestamp variable, log line, or status transition) and used to compute the metrics in the [Metrics](#metrics) section. +## Anonymous Retrieval Event Model + +The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per piece: select → fetch piece → (optional) parse CAR + IPNI + block fetch → write one ClickHouse row. + +It is not modeled as a sequence of named lifecycle events. 
Instead it emits: + +- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends. +- **One row per attempt** in the `anon_retrieval_checks` [ClickHouse table](#clickhouse-tables), emitted even on abort or unexpected error. +- **Structured log lines** (`anon_retrieval_started`, `anon_retrieval_completed`, `anon_retrieval_no_piece`, `anon_retrieval_car_validation_failed`, `anon_retrieval_clickhouse_insert_failed`) carrying a `retrievalId` so each row can be joined back to log evidence. + ## Data Storage Event Model Below are the sequence of events for a [Data Storage check](./data-storage.md). The Data Storage flow is used because it encapsulates a [Retrieval check](./retrievals.md) as well. @@ -87,6 +97,10 @@ sequenceDiagram | `dataStorageCheckMs` | Data Storage | [`uploadToSpStart`](#uploadToSpStart) | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Data Storage check | | | `retrievalCheckMs` | Retrieval | Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | +| `anonPieceRetrievalFirstByteMs` | Anonymous Retrieval | Piece fetch start | First byte received from `/piece/{pieceCid}` | Time to first byte for anonymous piece retrievals | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalLastByteMs` | Anonymous Retrieval | Piece fetch start | Last byte received from `/piece/{pieceCid}` | 
Total time to retrieve an anonymous piece | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalThroughputBps` | Anonymous Retrieval | n/a | n/a | `(bytesRetrieved / anonPieceRetrievalLastByteMs) * 1000` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonRetrievalCheckMs` | Anonymous Retrieval | Anon retrieval check start | After CAR/IPNI/block-fetch validation completes (or on abort) | End-to-end anonymous retrieval check duration | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ### Status Count Related Metrics @@ -106,6 +120,11 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. 
| Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success`, `failure.http`, `failure.aborted`, `failure.no_piece` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonBlockFetchStatus` | Anonymous Retrieval | After block-fetch sampling runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ## ClickHouse Tables @@ -115,6 +134,7 @@ When `CLICKHOUSE_URL` is configured, dealbot writes one row per check result to - **`data_storage_checks`** — one row written each time a deal is saved (on every status transition). Populated by [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts). - **`retrieval_checks`** — one row per retrieval attempt. 
Populated by [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts). +- **`anon_retrieval_checks`** — one row per [Anonymous Retrieval check](./anon-retrievals.md) attempt; emitted even on abort or unexpected error. Populated by [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts). See [Anonymous Retrieval § Result Recording](./anon-retrievals.md#result-recording) for column-level meanings. - **`data_retention_challenges`** — one row per provider per poll cycle. Populated by [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts). All tables share the primary key `(probe_location, sp_address, timestamp)`: From 5cee3ee85975342302fe8b1e418e8758c723aaf1 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 12:05:14 +0200 Subject: [PATCH 16/55] docs: add missing anonymous retrieval env vars --- docs/environment-variables.md | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 2f25943c..e2b23735 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -11,7 +11,7 @@ This document provides a comprehensive guide to all environment variables used b | [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | | [Scheduling](#scheduling-configuration) | `PROVIDERS_REFRESH_INTERVAL_SECONDS`, `DATA_RETENTION_POLL_INTERVAL_SECONDS`, `DEALBOT_MAINTENANCE_WINDOWS_UTC`, `DEALBOT_MAINTENANCE_WINDOW_MINUTES` | -| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, 
`JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `IPFS_BLOCK_FETCH_CONCURRENCY` | +| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `RETRIEVALS_ANON_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT`, `IPFS_BLOCK_FETCH_CONCURRENCY` | | [Dataset](#dataset-configuration) | `DEALBOT_LOCAL_DATASETS_PATH`, `RANDOM_PIECE_SIZES` | | [ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | @@ -622,6 +622,19 @@ rate-based (per hour) and persisted in Postgres so restarts do not reset timing. --- +### `RETRIEVALS_ANON_PER_SP_PER_HOUR` + +- **Type**: `number` +- **Required**: No +- **Default**: Falls back to `RETRIEVALS_PER_SP_PER_HOUR`, which itself defaults to `2` +- **Limits**: `0.001` – `20` + +**Role**: Target [anonymous retrieval](./checks/anon-retrievals.md) check rate per storage provider. Anonymous retrievals fetch arbitrary FWSS pieces sampled from the on-chain subgraph (not pieces dealbot uploaded), so this rate controls coverage of the SP's broader public corpus independently of the dealbot-owned [retrieval check](./checks/retrievals.md) rate. 
+ +**Notes**: Fractional values are supported. For example, `0.5` means one anon retrieval every 2 hours per storage provider. + +--- + ### `DATASET_CREATIONS_PER_SP_PER_HOUR` - **Type**: `number` @@ -806,6 +819,31 @@ Use this to stagger multiple dealbot deployments that are not sharing a database **Note**: This is independent of HTTP-level timeouts (`CONNECT_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`). The job timeout covers the end-to-end execution of an Anon Retrieval Check (piece selection, download, CommP validation, CAR/IPNI validation). +--- + +### `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` + +- **Type**: `number` (integer) +- **Required**: No +- **Default**: `5` +- **Minimum**: `1` +- **Maximum**: `50` +- **Enforced**: Yes (config validation) + +**Role**: Number of CIDs randomly sampled from the parsed CAR for IPNI verification and block-fetch validation during an [anonymous retrieval check](./checks/anon-retrievals.md). Only applies to pieces with IPFS indexing enabled — pieces without an `ipfsRootCid` skip CAR validation entirely. + +For each sampled CID, dealbot: + +1. Confirms via filecoinpin.contact that the SP is advertised as a provider for the CID. +2. Re-fetches the block via `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verifies the response. + +**When to update**: + +- Increase for stronger statistical confidence that the SP serves the entire DAG correctly (more IPNI queries + per-block fetches per check) +- Decrease to reduce per-check load on the SP and on filecoinpin.contact + +**Note**: A higher sample count multiplies both IPNI traffic and block-fetch traffic per check. The IPNI step is all-or-nothing across the root CID and the sampled child CIDs — see [Anonymous Retrieval § CAR Validation](./checks/anon-retrievals.md#car-validation-only-when-piece-advertises-ipfs-indexing). 
+ --- ### `IPFS_BLOCK_FETCH_CONCURRENCY` From 95a2dff643b032ea02878251a0f9986a9a12f825 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 12:07:11 +0200 Subject: [PATCH 17/55] docs: fix obsolete reference to the pdp-explorer-owned subgraph --- .../production-configuration-and-approval-methodology.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 3d956aa4..6b2859aa 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -40,8 +40,8 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| -| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | -| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | +| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | - | Points at a Goldsky deployment of the dealbot-owned subgraph in [`apps/subgraph/`](../../apps/subgraph/) (package `@dealbot/subgraph`). | +| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. 
Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. From cff31713aa28d97ce4ba41135c6e73d95ca2a17f Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 12:30:56 +0200 Subject: [PATCH 18/55] improve: clarity around piece fetch status and commp validation --- apps/backend/src/clickhouse/clickhouse.schema.ts | 2 +- .../backend/src/retrieval-anon/anon-retrieval.service.ts | 9 ++++++++- docs/checks/events-and-metrics.md | 6 +++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index e8612056..b27ba0e2 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -82,7 +82,7 @@ export function buildMigrations(database: string): string[] { service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) - piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). CAR/IPNI/block-fetch outcomes live in their own columns. + piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — HTTP transport outcome of GET /piece/ (HTTP 2xx). CommP validity, CAR/IPNI/block-fetch outcomes live in their own columns. 
http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure first_byte_ms Nullable(Float64), -- time to first response byte last_byte_ms Nullable(Float64), -- time to last response byte diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 25b34e82..eddc88f0 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -136,9 +136,16 @@ export class AnonRetrievalService { // Overall check duration and status this.metrics.observeCheckDuration(labels, Date.now() - checkStart); + const pieceServedCorrectly = pieceResult.success && pieceResult.commPValid; this.metrics.recordStatus( labels, - pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", + pieceServedCorrectly + ? "success" + : pieceResult.aborted + ? "failure.aborted" + : pieceResult.success + ? "failure.commp" + : "failure.http", ); } finally { // Always emit a ClickHouse row — even on abort or unexpected error — so diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index f5d89b23..fba8b003 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -118,9 +118,9 @@ sequenceDiagram | `ipfsRetrievalHttpResponseCode` | Data Storage, Retrieval | [`ipfsRetrievalLastByteReceived`](#ipfsRetrievalLastByteReceived) | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` | [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts) | | `retrievalStatus` | Data Storage, Retrieval | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | `success`, `failure.timedout`, `failure.other` from [Data Storage Sub-status meanings](./data-storage.md#sub-status-meanings). 
| | | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | -| `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success`, `failure.http`, `failure.aborted`, `failure.no_piece` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `dataSetChallengeStatus` | Data Retention | Not tied to an [event above](#event-list) but rather to the periodic chain-checking done in the [Data Retention Check](./data-retention.md) | `success`, `failure` | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `pdp_provider_overdue_periods` | Data Retention | Emitted on every poll | Gauge value (estimated overdue periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` 
(HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From 3c2a69899944ca5d4aa8acfe6d8a95e26e2c454e Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 5 May 2026 09:04:48 +0200 Subject: [PATCH 19/55] refactor: let two subgraph endpoints coexist --- apps/backend/.env.example | 5 +- apps/backend/README.md | 3 +- apps/backend/src/config/app.config.ts | 13 +- .../data-retention/data-retention.module.ts | 4 +- .../data-retention.service.spec.ts | 184 ++--- .../data-retention/data-retention.service.ts | 16 +- .../src/pdp-subgraph/pdp-subgraph.module.ts | 8 + .../pdp-subgraph/pdp-subgraph.service.spec.ts | 694 ++++++++++++++++++ .../src/pdp-subgraph/pdp-subgraph.service.ts | 306 ++++++++ apps/backend/src/pdp-subgraph/queries.ts | 24 + apps/backend/src/pdp-subgraph/types.spec.ts | 245 +++++++ apps/backend/src/pdp-subgraph/types.ts | 151 ++++ apps/backend/src/subgraph/subgraph.service.ts | 15 + 
.../src/wallet-sdk/wallet-sdk.service.spec.ts | 2 +- docs/checks/data-retention.md | 10 +- ...-configuration-and-approval-methodology.md | 2 +- docs/environment-variables.md | 26 +- .../local/backend-configmap-local.yaml | 1 + 18 files changed, 1595 insertions(+), 114 deletions(-) create mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts create mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts create mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts create mode 100644 apps/backend/src/pdp-subgraph/queries.ts create mode 100644 apps/backend/src/pdp-subgraph/types.spec.ts create mode 100644 apps/backend/src/pdp-subgraph/types.ts diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 26469c52..30556e7a 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -23,7 +23,10 @@ WALLET_ADDRESS=0x0000000000000000000000000000000000000000 WALLET_PRIVATE_KEY=your_private_key_here CHECK_DATASET_CREATION_FEES=true USE_ONLY_APPROVED_PROVIDERS=true -# Point at the dealbot-owned subgraph on Goldsky (see apps/subgraph/README.md). +# Upstream pdp-explorer subgraph — drives the data-retention / overdue-periods path. +PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +# Dealbot-owned subgraph on Goldsky (see apps/subgraph/README.md) — drives only +# the new anonymous-retrieval candidate-piece query for now. SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn # Minimum number of datasets per SP (default: 1). When > 1, a separate data_set_creation job provisions extra datasets. diff --git a/apps/backend/README.md b/apps/backend/README.md index 4805080f..e4dafd6e 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -104,7 +104,8 @@ All configuration is done via environment variables in `.env`. 
| `CHECK_DATASET_CREATION_FEES` | Check fees before dataset creation | `true` | | `ENABLE_IPNI_TESTING` | IPNI testing mode (`disabled`/`random`/`always`) | `always` | | `USE_ONLY_APPROVED_PROVIDERS` | Only use approved storage providers | `true` | -| `SUBGRAPH_ENDPOINT` | Subgraph GraphQL endpoint for PDP proof-set/data-retention and anon-retrieval queries | `https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn` | +| `PDP_SUBGRAPH_ENDPOINT` | Upstream pdp-explorer subgraph GraphQL endpoint for PDP proof-set/data-retention queries | `https://api.thegraph.com/subgraphs/filecoin/pdp` | +| `SUBGRAPH_ENDPOINT` | Dealbot-owned subgraph GraphQL endpoint for anon-retrieval queries | `https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn` | ### Scheduling Configuration (pg-boss) diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index 4e49e4d8..7906be8c 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -56,6 +56,15 @@ export const configValidationSchema = Joi.object({ USE_ONLY_APPROVED_PROVIDERS: Joi.boolean().default(true), DEALBOT_DATASET_VERSION: Joi.string().optional(), MIN_NUM_DATASETS_FOR_CHECKS: Joi.number().integer().min(1).default(1), + // Two subgraph endpoints coexist intentionally to limit blast radius while we + // migrate off the upstream pdp-explorer subgraph: + // - PDP_SUBGRAPH_ENDPOINT drives the established overdue-periods / data + // retention path against the existing pdp-explorer subgraph. + // - SUBGRAPH_ENDPOINT drives only the new anonymous-retrieval candidate + // piece query against the dealbot-owned subgraph. + // Once the dealbot-owned subgraph has soaked in production we can drop + // PDP_SUBGRAPH_ENDPOINT and route everything through SUBGRAPH_ENDPOINT. 
+ PDP_SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), // Scheduling @@ -177,7 +186,8 @@ export interface IBlockchainConfig { useOnlyApprovedProviders: boolean; dealbotDataSetVersion?: string; minNumDataSetsForChecks: number; - subgraphEndpoint?: string; + pdpSubgraphEndpoint?: string; + subgraphEndpoint?: string; // Endpoint of the dealbot-owned subgraph. Eventually replaces `pdpSubgraphEndpoint` } export interface ISchedulingConfig { @@ -437,6 +447,7 @@ export function loadConfig(): IConfig { useOnlyApprovedProviders: process.env.USE_ONLY_APPROVED_PROVIDERS !== "false", dealbotDataSetVersion: process.env.DEALBOT_DATASET_VERSION, minNumDataSetsForChecks: Number.parseInt(process.env.MIN_NUM_DATASETS_FOR_CHECKS || "1", 10), + pdpSubgraphEndpoint: process.env.PDP_SUBGRAPH_ENDPOINT || "", subgraphEndpoint: process.env.SUBGRAPH_ENDPOINT || "", }, scheduling: { diff --git a/apps/backend/src/data-retention/data-retention.module.ts b/apps/backend/src/data-retention/data-retention.module.ts index f0aec1ec..f459570a 100644 --- a/apps/backend/src/data-retention/data-retention.module.ts +++ b/apps/backend/src/data-retention/data-retention.module.ts @@ -2,12 +2,12 @@ import { Module } from "@nestjs/common"; import { TypeOrmModule } from "@nestjs/typeorm"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { SubgraphModule } from "../subgraph/subgraph.module.js"; +import { PdpSubgraphModule } from "../pdp-subgraph/pdp-subgraph.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { DataRetentionService } from "./data-retention.service.js"; @Module({ - imports: [WalletSdkModule, SubgraphModule, TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], + imports: [WalletSdkModule, PdpSubgraphModule, 
TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], providers: [DataRetentionService], exports: [DataRetentionService], }) diff --git a/apps/backend/src/data-retention/data-retention.service.spec.ts b/apps/backend/src/data-retention/data-retention.service.spec.ts index d2d539cf..3fde29e8 100644 --- a/apps/backend/src/data-retention/data-retention.service.spec.ts +++ b/apps/backend/src/data-retention/data-retention.service.spec.ts @@ -7,8 +7,8 @@ import type { IConfig } from "../config/app.config.js"; import type { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import type { SubgraphService } from "../subgraph/subgraph.service.js"; -import type { ProviderDataSetResponse } from "../subgraph/types.js"; +import type { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; +import type { ProviderDataSetResponse } from "../pdp-subgraph/types.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { DataRetentionService } from "./data-retention.service.js"; @@ -42,7 +42,7 @@ describe("DataRetentionService", () => { let walletSdkServiceMock: { getTestingProviders: ReturnType; }; - let subgraphServiceMock: { + let pdpSubgraphServiceMock: { fetchSubgraphMeta: ReturnType; fetchProvidersWithDatasets: ReturnType; }; @@ -69,7 +69,7 @@ describe("DataRetentionService", () => { configServiceMock = { get: vi.fn((key: keyof IConfig) => { if (key === "blockchain") { - return { subgraphEndpoint: "https://example.com/subgraph" }; + return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; } if (key === "spBlocklists") { return { ids: new Set(), addresses: new Set() }; @@ -95,7 +95,7 @@ describe("DataRetentionService", () => { ]), }; - subgraphServiceMock = { + pdpSubgraphServiceMock = { fetchSubgraphMeta: 
vi.fn().mockResolvedValue({ _meta: { block: { @@ -146,7 +146,7 @@ describe("DataRetentionService", () => { service = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - subgraphServiceMock as unknown as SubgraphService, + pdpSubgraphServiceMock as unknown as PDPSubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -155,15 +155,15 @@ describe("DataRetentionService", () => { ); }); - it("returns early when subgraphEndpoint is empty", async () => { + it("returns early when pdpSubgraphEndpoint is empty", async () => { (configServiceMock.get as ReturnType).mockReturnValue({ - subgraphEndpoint: "", + pdpSubgraphEndpoint: "", }); await service.pollDataRetention(); - expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); - expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when no testing providers configured", async () => { @@ -171,31 +171,31 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when all providers are blocked for data-retention", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new Set([PROVIDER_A, PROVIDER_B]) }; }); await service.pollDataRetention(); - 
expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("excludes blocked providers from data-retention polling while retaining unblocked ones", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new Set([PROVIDER_A]) }; }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); const allAddressesPolled: string[] = ( - subgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] ).flatMap(([{ addresses }]) => addresses); expect(allAddressesPolled).toContain(PROVIDER_B.toLowerCase()); expect(allAddressesPolled).not.toContain(PROVIDER_A.toLowerCase()); @@ -206,16 +206,16 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("sets baseline on first poll without emitting counters (fresh deploy / new provider)", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); - expect(subgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); - 
expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ + expect(pdpSubgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ blockNumber: 1200, addresses: [PROVIDER_A, PROVIDER_B], }); @@ -239,20 +239,20 @@ describe("DataRetentionService", () => { it("computes deltas correctly on consecutive polls", async () => { // First poll: blockNumber=1200 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); const firstCallCount = counterMock.labels.mock.calls.length; // Second poll: blockNumber=1300, provider totals changed - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300, }, }, }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n, @@ -266,7 +266,7 @@ describe("DataRetentionService", () => { }); it("does not increment counters when deltas are zero", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll await service.pollDataRetention(); @@ -288,7 +288,7 @@ describe("DataRetentionService", () => { const providerA = makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 5n }); const providerB = makeProvider({ address: PROVIDER_B, totalFaultedPeriods: 20n }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); await service.pollDataRetention(); @@ 
-310,7 +310,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -333,7 +333,7 @@ describe("DataRetentionService", () => { }); it("handles empty providers array without errors", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); await service.pollDataRetention(); @@ -347,7 +347,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -370,7 +370,7 @@ describe("DataRetentionService", () => { }); it("catches and logs errors without rethrowing", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); // Should not throw await expect(service.pollDataRetention()).resolves.toBeUndefined(); @@ -378,14 +378,14 @@ describe("DataRetentionService", () => { it("resets baseline on negative deltas without incrementing counters", async () => { // First poll: high values - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); counterMock.labels.mockClear(); // Second poll: lower values (e.g., chain reorg or subgraph correction) - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -394,7 +394,7 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); // Third poll: values increase from new baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 52n, totalProvingPeriods: 105n }), ]); await service.pollDataRetention(); @@ -412,7 +412,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: largeValue, totalProvingPeriods: largeValue * 2n }), ]); @@ -436,7 +436,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: maxSafeInt, totalProvingPeriods: maxSafeInt * 2n }), ]); @@ -456,7 +456,7 @@ describe("DataRetentionService", () => { totalFaultedPeriods: 5n, totalProvingPeriods: 50n, }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -475,18 +475,18 @@ describe("DataRetentionService", () => { })); walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); await service.pollDataRetention(); // Should be called twice: once for first 50, once for remaining 25 - expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); - expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { addresses: expect.arrayContaining([expect.any(String)]), blockNumber: 1200, }); - expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); - expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); }); it("continues processing next batch if one batch fails", async () => { @@ -499,20 +499,20 @@ describe("DataRetentionService", () => { walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); // First batch fails, second succeeds - subgraphServiceMock.fetchProvidersWithDatasets + pdpSubgraphServiceMock.fetchProvidersWithDatasets .mockRejectedValueOnce(new Error("Subgraph timeout")) .mockResolvedValueOnce([]); await service.pollDataRetention(); // Both batches should be attempted - expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); }); it("logs error and skips counter update when provider not found in cache but returned from subgraph", async () => { // Provider C not in cache const PROVIDER_C = "0x1234567890123456789012345678901234567890"; - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: 
PROVIDER_C })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_C })]); await service.pollDataRetention(); @@ -523,7 +523,7 @@ describe("DataRetentionService", () => { describe("cleanupStaleProviders", () => { it("does not cleanup when no stale providers exist", async () => { // First poll establishes baseline for both providers - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), ]); @@ -536,7 +536,7 @@ describe("DataRetentionService", () => { it("successfully cleans up stale provider with valid database entry", async () => { // First poll: establish baseline for PROVIDER_A - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list, only PROVIDER_B active @@ -558,7 +558,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -589,7 +589,7 @@ describe("DataRetentionService", () => { it("skips cleanup entirely when database fetch fails", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed, but DB fails @@ -604,7 +604,7 @@ 
describe("DataRetentionService", () => { mockSPRepository.find.mockRejectedValueOnce(new Error("Database connection failed")); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -624,7 +624,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -637,7 +637,7 @@ describe("DataRetentionService", () => { it("retains baseline when provider not found in database", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed from active list @@ -653,7 +653,7 @@ describe("DataRetentionService", () => { // Database returns empty array (provider not found) mockSPRepository.find.mockResolvedValueOnce([]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -670,7 +670,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -683,7 +683,7 @@ describe("DataRetentionService", () => { it("retains 
baseline when provider has null providerId", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -706,7 +706,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -716,7 +716,7 @@ describe("DataRetentionService", () => { it("retains baseline when counter removal throws error", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -743,7 +743,7 @@ describe("DataRetentionService", () => { throw new Error("Counter removal failed"); }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -760,7 +760,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 110n }), ]); @@ -781,7 +781,7 @@ describe("DataRetentionService", () => { { id: 3, serviceProvider: PROVIDER_C, name: "Provider C", 
isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), makeProvider({ address: PROVIDER_C }), @@ -799,7 +799,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_C, name: "Provider C", providerId: 3, isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); @@ -815,7 +815,7 @@ describe("DataRetentionService", () => { it("skips cleanup when processing errors occurred", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed, but processing has errors @@ -824,7 +824,7 @@ describe("DataRetentionService", () => { ]); // Simulate processing error - subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); await service.pollDataRetention(); @@ -841,7 +841,7 @@ describe("DataRetentionService", () => { { id: 1, serviceProvider: PROVIDER_MIXED_CASE, name: "Provider A", isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_MIXED_CASE.toLowerCase() as `0x${string}` }), ]); @@ -861,7 +861,7 @@ describe("DataRetentionService", () => { }, ]); - 
subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -885,7 +885,7 @@ describe("DataRetentionService", () => { // Subgraph returns same values: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // With DB baseline: faultedDelta = 10 - 10 = 0, successDelta = 90 - 90 = 0 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -907,7 +907,7 @@ describe("DataRetentionService", () => { // Subgraph returns: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // faultedDelta = 10 - 8 = 2, successDelta = 90 - 85 = 5 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -921,8 +921,8 @@ describe("DataRetentionService", () => { expect(incCalls).toEqual(expect.arrayContaining([[10], [25]])); }); - it("reloads baselines from DB on every poll", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + it("only loads baselines from DB once across multiple polls", async () => { + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); await service.pollDataRetention(); await service.pollDataRetention(); @@ -932,13 +932,13 @@ describe("DataRetentionService", () => { }); it("does not double-count when poll ownership alternates across worker pods", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); const secondPod = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - subgraphServiceMock as unknown as SubgraphService, + pdpSubgraphServiceMock as unknown as PDPSubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -946,8 +946,8 @@ describe("DataRetentionService", () => { { insert: vi.fn(), probeLocation: "test" } as unknown as ClickhouseService, ); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 11n, totalProvingPeriods: 102n }), ]); await secondPod.pollDataRetention(); @@ -955,8 +955,8 @@ describe("DataRetentionService", () => { counterMock.labels.mockClear(); counterMock.inc.mockClear(); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 104n }), ]); await service.pollDataRetention(); @@ -972,8 +972,8 @@ describe("DataRetentionService", () => { ]; mockBaselineRepository.upsert.mockRejectedValueOnce(new Error("DB write failed")); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - 
subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -981,8 +981,8 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1003,12 +1003,12 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll: DB load fails, poll bails out to avoid emitting bloated values await service.pollDataRetention(); expect(mockBaselineRepository.find).toHaveBeenCalledTimes(1); - expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); expect(counterMock.labels).not.toHaveBeenCalled(); // Second poll: DB load succeeds, baselines restored, normal delta computation @@ -1021,16 +1021,16 @@ describe("DataRetentionService", () => { it("emits real deltas on second poll after fresh deploy baseline-only first poll", async () => { // First poll: fresh deploy, no baselines in DB // Baseline set to: faultedPeriods=10, successPeriods=90 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); counterMock.labels.mockClear(); counterMock.inc.mockClear(); // Second poll: values have increased - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } }, }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1044,7 +1044,7 @@ describe("DataRetentionService", () => { it("deletes baseline from DB when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1056,7 +1056,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -1069,7 +1069,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge on first poll (baseline-only)", async () => { // Provider is overdue: currentBlock=1200, // estimatedOverduePeriods = (1200 - 901) / 100 = 2.99 -> 2 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ 
-1086,7 +1086,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge = 0 when provider is not overdue", async () => { // nextDeadline=2000 > currentBlock=1200 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); await service.pollDataRetention(); @@ -1095,7 +1095,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge even on negative delta (baseline reset)", async () => { // First poll: high values - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); @@ -1103,7 +1103,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: lower values (negative delta) but still overdue - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -1115,7 +1115,7 @@ describe("DataRetentionService", () => { it("naturally resets gauge to 0 when subgraph catches up", async () => { // First poll: provider is overdue (currentBlock=1200, nextDeadline=1000) - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); expect(gaugeMock.set).toHaveBeenCalledWith(2); @@ -1124,7 +1124,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: subgraph caught up, nextDeadline advanced past currentBlock - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 102n, @@ -1140,7 +1140,7 @@ describe("DataRetentionService", () => { it("removes overdue gauge when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1152,7 +1152,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); diff --git a/apps/backend/src/data-retention/data-retention.service.ts b/apps/backend/src/data-retention/data-retention.service.ts index 1422bbfd..c6ece7b5 100644 --- a/apps/backend/src/data-retention/data-retention.service.ts +++ b/apps/backend/src/data-retention/data-retention.service.ts @@ -11,8 +11,8 @@ import { IConfig } from "../config/app.config.js"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels, CheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import { SubgraphService } from "../subgraph/subgraph.service.js"; -import { type ProviderDataSetResponse } from "../subgraph/types.js"; +import { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; +import { type ProviderDataSetResponse } from "../pdp-subgraph/types.js"; import { 
WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { type PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; @@ -41,7 +41,7 @@ export class DataRetentionService { constructor( private readonly configService: ConfigService, private readonly walletSdkService: WalletSdkService, - private readonly subgraphService: SubgraphService, + private readonly pdpSubgraphService: PDPSubgraphService, @InjectRepository(DataRetentionBaseline) private readonly baselineRepository: Repository, @InjectRepository(StorageProvider) @@ -59,10 +59,10 @@ export class DataRetentionService { * challenge delta since the last poll. */ async pollDataRetention(): Promise { - const subgraphEndpoint = this.configService.get("blockchain").subgraphEndpoint; - if (!subgraphEndpoint) { + const pdpSubgraphEndpoint = this.configService.get("blockchain").pdpSubgraphEndpoint; + if (!pdpSubgraphEndpoint) { this.logger.warn({ - event: "subgraph_endpoint_not_configured", + event: "pdp_subgraph_endpoint_not_configured", message: "No PDP subgraph endpoint configured", }); return; @@ -75,7 +75,7 @@ export class DataRetentionService { } try { - const subgraphMeta = await this.subgraphService.fetchSubgraphMeta(); + const subgraphMeta = await this.pdpSubgraphService.fetchSubgraphMeta(); const allProviderInfos = this.walletSdkService.getTestingProviders(); const spBlocklists = this.configService.get("spBlocklists"); const providerInfos = allProviderInfos?.filter((p) => !isSpBlocked(spBlocklists, p.serviceProvider, p.id)); @@ -104,7 +104,7 @@ export class DataRetentionService { ); try { - const providersFromSubgraph = await this.subgraphService.fetchProvidersWithDatasets({ + const providersFromSubgraph = await this.pdpSubgraphService.fetchProvidersWithDatasets({ blockNumber, addresses: batchAddresses, }); diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts new file mode 100644 index 00000000..6e084fc1 --- /dev/null +++ 
b/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts @@ -0,0 +1,8 @@ +import { Module } from "@nestjs/common"; +import { PDPSubgraphService } from "./pdp-subgraph.service.js"; + +@Module({ + providers: [PDPSubgraphService], + exports: [PDPSubgraphService], +}) +export class PdpSubgraphModule {} diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts new file mode 100644 index 00000000..cd3a1ea8 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts @@ -0,0 +1,694 @@ +import type { ConfigService } from "@nestjs/config"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig } from "../config/app.config.js"; +import { PDPSubgraphService } from "./pdp-subgraph.service.js"; + +const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; +const SUBGRAPH_ENDPOINT = "https://api.thegraph.com/subgraphs/filecoin/pdp" as const; + +const makeSubgraphResponse = (providers: Record[] = []) => ({ + data: { providers }, +}); + +const makeValidProvider = (overrides: Record = {}) => ({ + address: VALID_ADDRESS, + totalFaultedPeriods: "10", + totalProvingPeriods: "100", + proofSets: [ + { + totalFaultedPeriods: "2", + currentDeadlineCount: "5", + nextDeadline: "1000", + maxProvingPeriod: "100", + }, + ], + ...overrides, +}); + +const makeSubgraphMetaResponse = (blockNumber = 12345) => ({ + data: { + _meta: { + block: { + number: blockNumber, + }, + }, + }, +}); + +describe("PDPSubgraphService", () => { + let service: PDPSubgraphService; + let fetchMock: ReturnType; + + beforeEach(() => { + const configService = { + get: vi.fn((key: keyof IConfig) => { + if (key === "blockchain") { + return { pdpSubgraphEndpoint: SUBGRAPH_ENDPOINT }; + } + return undefined; + }), + } as unknown as ConfigService; + + service = new PDPSubgraphService(configService); + + fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + + 
vi.useFakeTimers(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + }); + + describe("fetchProvidersWithDatasets", () => { + it("fetches and returns validated providers with bigint fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + expect(fetchMock).toHaveBeenCalledWith(SUBGRAPH_ENDPOINT, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: expect.stringContaining('"addresses"'), + }); + + expect(providers).toHaveLength(1); + expect(providers[0].address).toBe(VALID_ADDRESS); + expect(providers[0].totalFaultedPeriods).toBe(10n); + expect(providers[0].totalProvingPeriods).toBe(100n); + expect(providers[0].proofSets[0].maxProvingPeriod).toBe(100n); + }); + + it("returns empty array when no providers exist", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([]), + }); + + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + expect(providers).toEqual([]); + }); + + it("returns empty array when addresses array is empty", async () => { + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [], + }); + + expect(providers).toEqual([]); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it("throws on HTTP error response", async () => { + fetchMock.mockResolvedValue({ + ok: false, + status: 500, + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + // This stops Node.js from throwing an Unhandled Rejection during fast-forward. 
+ promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("throws on GraphQL errors in response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: null, + errors: [{ message: "Query failed" }], + }), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("throws on network failure", async () => { + fetchMock.mockRejectedValueOnce(new Error("Network error")); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); // Initial + 2 retries = 3 total + }); + + it("throws immediately on validation error without retrying", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: "invalid" }] }, + }), + }); + + await expect( + service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }), + ).rejects.toThrow("Data validation failed"); + + // Should only be called once - no retries for validation errors + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("throws immediately when response data is missing required fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: 
VALID_ADDRESS }] }, // Missing required fields + }), + }); + + await expect( + service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }), + ).rejects.toThrow("Data validation failed"); + + // Should only be called once - no retries for validation errors + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("sends blockNumber as string in the GraphQL variables", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + await service.fetchProvidersWithDatasets({ + blockNumber: 12345, + addresses: [VALID_ADDRESS], + }); + + const body = JSON.parse(fetchMock.mock.calls[0][1].body); + expect(body.variables.blockNumber).toBe("12345"); + }); + + it("retries network errors but not validation errors", async () => { + // First attempt: network error (should retry) + fetchMock.mockRejectedValueOnce(new Error("Network timeout")); + + // Second attempt: succeeds but validation fails (should not retry) + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: "invalid" }] }, + }), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Data validation failed"); + + // Should be called twice: initial network error + 1 retry that fails validation + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("sends addresses array in the GraphQL variables", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const addresses = [VALID_ADDRESS, "0xAb5801a7D398351b8bE11C439e05C5B3259aeC9B"]; + await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses, + }); + + const body = 
JSON.parse(fetchMock.mock.calls[0][1].body); + expect(body.variables.addresses).toEqual(addresses); + }); + + it("batches large address lists into chunks of MAX_PROVIDERS_PER_QUERY", async () => { + // Create 150 addresses (should be split into 2 batches: 100 + 50) + const addresses = Array.from({ length: 150 }, (_, i) => `0x${i.toString().padStart(40, "0")}`); + + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphResponse([]), + }); + + await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses, + }); + + // Should make 2 requests + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("retries failed requests with exponential backoff", async () => { + // Fail on first attempt, succeed on second attempt (1 retry) + fetchMock.mockRejectedValueOnce(new Error("Network timeout")).mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + await vi.runAllTimersAsync(); + + // Now await the final promise to resolve + const providers = await promise; + + expect(fetchMock).toHaveBeenCalledTimes(2); // Initial attempt + 1 retry + expect(providers).toHaveLength(1); + }); + + it("processes batches with concurrency control", async () => { + // Create 120 addresses (should be 2 batches of 100 each, but processed with concurrency limit) + const addresses = Array.from({ length: 120 }, (_, i) => `0x${i.toString().padStart(40, "0")}`); + + let concurrentCalls = 0; + let maxConcurrentCalls = 0; + + fetchMock.mockImplementation(async () => { + concurrentCalls++; + maxConcurrentCalls = Math.max(maxConcurrentCalls, concurrentCalls); + await new Promise((resolve) => setTimeout(resolve, 10)); + concurrentCalls--; + return { + ok: true, + json: async () => makeSubgraphResponse([]), + }; + }); + + const fetchPromise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + 
addresses, + }); + + await vi.runAllTimersAsync(); + + await fetchPromise; + + // Should respect MAX_CONCURRENT_REQUESTS (50) + expect(maxConcurrentCalls).toBeLessThanOrEqual(50); + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + }); + + describe("fetchSubgraphMeta", () => { + it("fetches and returns subgraph metadata with block number", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const meta = await service.fetchSubgraphMeta(); + + expect(fetchMock).toHaveBeenCalledWith(SUBGRAPH_ENDPOINT, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: expect.stringContaining("GetSubgraphMeta"), + }); + + expect(meta).toEqual({ + _meta: { + block: { + number: 12345, + }, + }, + }); + }); + + it("throws when PDP subgraph endpoint is not configured", async () => { + const configService = { + get: vi.fn(() => ({ pdpSubgraphEndpoint: "" })), + } as unknown as ConfigService; + + const serviceWithoutEndpoint = new PDPSubgraphService(configService); + + await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No PDP subgraph endpoint configured"); + }); + + it("throws on HTTP error response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: false, + status: 500, + statusText: "Internal Server Error", + }); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + }); + + it("throws on GraphQL errors in response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + errors: [{ message: "Query timeout" }], + }), + }); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await 
expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + }); + + it("throws on validation failure without retry", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { + _meta: { + block: { + number: "not-a-number", // Invalid - should be number + }, + }, + }, + }), + }); + + await expect(service.fetchSubgraphMeta()).rejects.toThrow("Data validation failed"); + expect(fetchMock).toHaveBeenCalledTimes(1); // Should not retry validation errors + }); + + it("throws on missing required fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { + _meta: { + block: { + number: undefined, // missing required field + }, + }, + }, + }), + }); + + await expect(service.fetchSubgraphMeta()).rejects.toThrow("Data validation failed"); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("retries on network failures with exponential backoff", async () => { + fetchMock.mockRejectedValueOnce(new Error("Network timeout")).mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const promise = service.fetchSubgraphMeta(); + + await vi.runAllTimersAsync(); + + // Now await the second promise to resolve + const meta = await promise; + + expect(fetchMock).toHaveBeenCalledTimes(2); // Initial + 1 retry + expect(meta._meta.block.number).toBe(12345); + }); + + it("throws after MAX_RETRIES attempts on persistent network errors", async () => { + fetchMock.mockRejectedValue(new Error("Network timeout")); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + }); + + describe("enforceRateLimit (sliding window)", () => { + it("allows requests when under the rate 
limit", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const startTime = Date.now(); + + // Make 5 requests - should all go through immediately + const promises = Array.from({ length: 5 }, () => service.fetchSubgraphMeta()); + + await Promise.all(promises); + + const endTime = Date.now(); + const elapsed = endTime - startTime; + + // Should complete quickly (no waiting) + expect(elapsed).toBeLessThan(100); + expect(fetchMock).toHaveBeenCalledTimes(5); + }); + + it("enforces rate limit when exceeding MAX_CONCURRENT_REQUESTS", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill up the rate limit window with 50 requests + const initialPromises = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + // Try to make one more request - should wait for oldest to expire + const promise = service.fetchSubgraphMeta(); + + // Advance past the 10 second window + buffer + await vi.advanceTimersByTimeAsync(10010); + await promise; + + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("throws error when requestCount exceeds MAX_CONCURRENT_REQUESTS", async () => { + // Access private method via type assertion for testing + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + + await expect(enforceRateLimit(51)).rejects.toThrow("Cannot request 51 items; exceeds rate limit window of 50"); + }); + + it("correctly calculates wait time for multiple required slots", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill 48 slots + const initialPromises = Array.from({ length: 48 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + // Request 5 more slots (need 3 to free up: 5 
- 2 available = 3) + // Should wait for the 3rd oldest timestamp to expire + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + const promise = enforceRateLimit(5); + + // The 3rd request should expire at ~10 seconds + await vi.advanceTimersByTimeAsync(10010); + await promise; + + // Verify slots were reserved + // After 10s, the first 48 expired, so we should only have the 5 new ones + const timestamps = (service as any).requestTimestamps; + expect(timestamps.length).toBe(5); // Only the 5 new slots remain + }); + + it("handles sliding window correctly as old requests expire", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Make 30 requests at t=0 + const batch1 = Array.from({ length: 30 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance 5 seconds + await vi.advanceTimersByTimeAsync(5000); + + // Make 20 more requests at t=5000 + const batch2 = Array.from({ length: 20 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch2); + + // Now at t=5000, we have 50 requests in the window + // Advance to t=10100 - first 30 should expire + await vi.advanceTimersByTimeAsync(5100); + + fetchMock.mockClear(); + + // Should be able to make 30 more requests immediately + const batch3 = Array.from({ length: 30 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch3); + + expect(fetchMock).toHaveBeenCalledTimes(30); + }); + + it("adds 10ms buffer to prevent timing edge cases", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill the window + const initialPromises = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + const promise = 
service.fetchSubgraphMeta(); + + // Advance past the window + buffer + await vi.advanceTimersByTimeAsync(10010); + await promise; + + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("recursively waits when multiple batches need to expire", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill window with 50 requests + const batch1 = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance 5 seconds + await vi.advanceTimersByTimeAsync(5000); + + fetchMock.mockClear(); + + // Try to request 30 slots (need to wait for 30 to expire) + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + const promise = enforceRateLimit(30); + + // First recursion: wait for 30th oldest to expire (~10s from start) + await vi.advanceTimersByTimeAsync(5010); + + // Should recursively check and complete + await promise; + + const timestamps = (service as any).requestTimestamps; + // After 10s from start, all 50 initial requests expired, only 30 new ones remain + expect(timestamps.length).toBe(30); // Only the 30 new slots + }); + + it("reserves slots immediately to prevent race conditions", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill 47 slots + const initial = Array.from({ length: 47 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initial); + + // Now we have 3 available slots + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + + // Request 3 slots - should succeed immediately + await enforceRateLimit(3); + + const timestamps = (service as any).requestTimestamps; + expect(timestamps.length).toBe(50); // 47 + 3 = 50 (full) + + // Try to request 1 more - should need to wait + const promise = enforceRateLimit(1); + + // Advance time to free up a slot + 
await vi.advanceTimersByTimeAsync(10010); + await promise; + + // After waiting, the old slots expired and new one was added + const finalTimestamps = (service as any).requestTimestamps; + expect(finalTimestamps.length).toBe(1); // Only the new request remains + }); + + it("filters out expired timestamps from the sliding window", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Make 20 requests + const batch1 = Array.from({ length: 20 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance past the window + await vi.advanceTimersByTimeAsync(11000); + + fetchMock.mockClear(); + + // Make another request - should have full window available + await service.fetchSubgraphMeta(); + + const timestamps = (service as any).requestTimestamps; + // Should only have 1 timestamp (the new one), old ones filtered out + expect(timestamps.length).toBe(1); + }); + }); +}); diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts new file mode 100644 index 00000000..aedd8bce --- /dev/null +++ b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts @@ -0,0 +1,306 @@ +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { toStructuredError } from "../common/logging.js"; +import type { IBlockchainConfig, IConfig } from "../config/app.config.js"; +import { Queries } from "./queries.js"; +import type { GraphQLResponse, ProviderDataSetResponse, ProvidersWithDataSetsOptions, SubgraphMeta } from "./types.js"; +import { validateProviderDataSetResponse, validateSubgraphMetaResponse } from "./types.js"; + +/** + * Error thrown when data validation fails. + * These errors should not be retried as they indicate schema/data issues. 
+ */ +class ValidationError extends Error { + constructor(message: string) { + super(message); + this.name = "ValidationError"; + if (Error.captureStackTrace) { + Error.captureStackTrace(this, ValidationError); + } + } +} + +@Injectable() +export class PDPSubgraphService { + private readonly logger: Logger = new Logger(PDPSubgraphService.name); + private readonly blockchainConfig: IBlockchainConfig; + + private static readonly MAX_PROVIDERS_PER_QUERY = 100; + private static readonly MAX_CONCURRENT_REQUESTS = 50; + private static readonly RATE_LIMIT_WINDOW_MS = 10000; + private static readonly MAX_RETRIES = 3; + private static readonly INITIAL_RETRY_DELAY_MS = 1000; + + private requestTimestamps: number[] = []; + + constructor(private readonly configService: ConfigService) { + this.blockchainConfig = this.configService.get("blockchain"); + } + + /** + * Fetch subgraph metadata including the latest indexed block number + * + * @param attempt - Current retry attempt number (default: 1) + * @returns Subgraph metadata with block number + * @throws Error if endpoint is not configured or after MAX_RETRIES attempts + */ + async fetchSubgraphMeta(attempt: number = 1): Promise { + if (!this.blockchainConfig.pdpSubgraphEndpoint) { + throw new Error("No PDP subgraph endpoint configured"); + } + + try { + await this.enforceRateLimit(); + + const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query: Queries.GET_SUBGRAPH_META, + }), + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const result = (await response.json()) as GraphQLResponse; + + if (result.errors) { + const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; + throw new Error(`GraphQL error: ${errorMessage}`); + } + let validated: SubgraphMeta; + try { + validated = validateSubgraphMetaResponse(result.data); + } catch 
(validationError) { + const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error"; + throw new ValidationError(`Data validation failed: ${errorMessage}`); + } + + return validated; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + + // No need to retry on validation errors - they indicate schema/data issues, not transient failures + if (error instanceof ValidationError) { + this.logger.error({ + event: "subgraph_meta_validation_failed", + message: "Subgraph data validation failed", + error: toStructuredError(error), + }); + throw error; + } + + // Retry on network/HTTP errors + if (attempt < PDPSubgraphService.MAX_RETRIES) { + const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + this.logger.warn({ + event: "subgraph_meta_request_retry", + message: "Subgraph meta request failed. Retrying...", + attempt, + maxRetries: PDPSubgraphService.MAX_RETRIES, + retryDelayMs: delay, + error: toStructuredError(error), + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + return this.fetchSubgraphMeta(attempt + 1); + } + + this.logger.error({ + event: "subgraph_meta_request_failed", + message: "Subgraph meta request failed after maximum retries", + maxRetries: PDPSubgraphService.MAX_RETRIES, + error: toStructuredError(error), + }); + throw new Error( + `Failed to fetch subgraph metadata after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + ); + } + } + + /** + * Fetch provider-level totals from subgraph with batching, pagination, and rate limiting + * + * @param options - Options containing block number and provider addresses + * @returns Array of providers with their data sets currently proving + */ + async fetchProvidersWithDatasets( + options: ProvidersWithDataSetsOptions, + ): Promise { + const { blockNumber, addresses } = options; + + if (addresses.length === 0) { + return []; + } + + if (addresses.length <= 
PDPSubgraphService.MAX_PROVIDERS_PER_QUERY) { + return this.fetchWithRetry(blockNumber, addresses); + } + + return this.fetchMultipleBatchesWithRateLimit(blockNumber, addresses); + } + + /** + * Fetch multiple batches with rate limiting and concurrency control + */ + private async fetchMultipleBatchesWithRateLimit( + blockNumber: number, + addresses: string[], + ): Promise { + const batches: string[][] = []; + for (let i = 0; i < addresses.length; i += PDPSubgraphService.MAX_PROVIDERS_PER_QUERY) { + const addressesLimit = Math.min(addresses.length, i + PDPSubgraphService.MAX_PROVIDERS_PER_QUERY); + batches.push(addresses.slice(i, addressesLimit)); + } + + const allProviders: ProviderDataSetResponse["providers"] = []; + + for (let i = 0; i < batches.length; i += PDPSubgraphService.MAX_CONCURRENT_REQUESTS) { + const batchGroup = batches.slice(i, i + PDPSubgraphService.MAX_CONCURRENT_REQUESTS); + + const results = await Promise.all(batchGroup.map((batch) => this.fetchWithRetry(blockNumber, batch))); + + allProviders.push(...results.flat()); + } + + return allProviders; + } + + /** + * Fetch with exponential backoff retry mechanism + * Assuming initial request to be first attempt + */ + private async fetchWithRetry( + blockNumber: number, + addresses: string[], + attempt: number = 1, + ): Promise { + if (!this.blockchainConfig.pdpSubgraphEndpoint) { + throw new Error("No PDP subgraph endpoint configured"); + } + + const variables = { + blockNumber: blockNumber.toString(), + addresses, + }; + + try { + await this.enforceRateLimit(); + + const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query: Queries.GET_PROVIDERS_WITH_DATASETS, + variables, + }), + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const result = (await response.json()) as GraphQLResponse; + + if (result.errors) { + const 
errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; + throw new Error(`GraphQL error: ${errorMessage}`); + } + + let validated: ProviderDataSetResponse; + try { + validated = validateProviderDataSetResponse(result.data); + } catch (validationError) { + const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error"; + throw new ValidationError(`Data validation failed: ${errorMessage}`); + } + + return validated.providers; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + + // No need to retry on validation errors - they indicate schema/data issues, not transient failures + if (error instanceof ValidationError) { + this.logger.error({ + event: "subgraph_provider_data_validation_failed", + message: "Subgraph data validation failed", + error: toStructuredError(error), + }); + throw error; + } + + // Retry on network/HTTP errors + if (attempt < PDPSubgraphService.MAX_RETRIES) { + const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + this.logger.warn({ + event: "subgraph_provider_request_retry", + message: "Subgraph provider request failed. 
Retrying...", + attempt, + maxRetries: PDPSubgraphService.MAX_RETRIES, + retryDelayMs: delay, + addressCount: addresses.length, + error: toStructuredError(error), + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + return this.fetchWithRetry(blockNumber, addresses, attempt + 1); + } + + this.logger.error({ + event: "subgraph_provider_request_failed", + message: "Subgraph provider request failed after maximum retries", + maxRetries: PDPSubgraphService.MAX_RETRIES, + blockNumber, + addressCount: addresses.length, + error: toStructuredError(error), + }); + throw new Error( + `Failed to fetch provider data after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + ); + } + } + + /** + * Enforce rate limiting: max 50 requests per 10 seconds + * This rate limit is applied by Goldsky on their public endpoints + * Read more here: https://docs.goldsky.com/subgraphs/graphql-endpoints#public-endpoints + */ + private async enforceRateLimit(requestCount: number = 1): Promise { + if (requestCount > PDPSubgraphService.MAX_CONCURRENT_REQUESTS) { + throw new Error( + `Cannot request ${requestCount} items; exceeds rate limit window of ${PDPSubgraphService.MAX_CONCURRENT_REQUESTS}`, + ); + } + + const now = Date.now(); + const windowStart = now - PDPSubgraphService.RATE_LIMIT_WINDOW_MS; + + this.requestTimestamps = this.requestTimestamps.filter((timestamp) => timestamp > windowStart); + + const availableSlots = PDPSubgraphService.MAX_CONCURRENT_REQUESTS - this.requestTimestamps.length; + + if (requestCount > availableSlots) { + const requiredSlots = requestCount - availableSlots; + + const index = Math.min(this.requestTimestamps.length, requiredSlots) - 1; + const oldestTimestamp = this.requestTimestamps[index] || now; + + // wait time with 10ms buffer + const waitTime = oldestTimestamp + PDPSubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; + + if (waitTime > 0) { + await new Promise((resolve) => setTimeout(resolve, waitTime)); + return 
this.enforceRateLimit(requestCount); + } + } + + // Reserve the slots NOW + for (let i = 0; i < requestCount; i++) { + this.requestTimestamps.push(Date.now()); + } + } +} diff --git a/apps/backend/src/pdp-subgraph/queries.ts b/apps/backend/src/pdp-subgraph/queries.ts new file mode 100644 index 00000000..a21a3991 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/queries.ts @@ -0,0 +1,24 @@ +export const Queries = { + GET_PROVIDERS_WITH_DATASETS: ` + query GetProvidersWithDataSet($addresses: [Bytes!], $blockNumber: BigInt!) { + providers(where: {address_in: $addresses}) { + address + totalFaultedPeriods + totalProvingPeriods + proofSets (where: {nextDeadline_lt: $blockNumber, status: PROVING}) { + nextDeadline + maxProvingPeriod + } + } + } + `, + GET_SUBGRAPH_META: ` + query GetSubgraphMeta { + _meta { + block { + number + } + } + } + `, +} as const; diff --git a/apps/backend/src/pdp-subgraph/types.spec.ts b/apps/backend/src/pdp-subgraph/types.spec.ts new file mode 100644 index 00000000..02e6eee0 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/types.spec.ts @@ -0,0 +1,245 @@ +import { describe, expect, it } from "vitest"; +import { validateProviderDataSetResponse, validateSubgraphMetaResponse } from "./types.js"; + +// Subgraph stores addresses in lowercase +const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; + +const makeValidProvider = (overrides: Record = {}) => ({ + address: VALID_ADDRESS, + totalFaultedPeriods: "10", + totalProvingPeriods: "100", + proofSets: [ + { + nextDeadline: "1000", + maxProvingPeriod: "100", + }, + ], + ...overrides, +}); + +const makeValidResponse = (providers = [makeValidProvider()]) => ({ + providers, +}); + +describe("validateProviderDataSetResponse", () => { + it("validates and transforms a well-formed response", () => { + const result = validateProviderDataSetResponse(makeValidResponse()); + + expect(result.providers).toHaveLength(1); + const provider = result.providers[0]; + 
expect(provider.address).toBe(VALID_ADDRESS); + expect(provider.totalFaultedPeriods).toBe(10n); + expect(provider.totalProvingPeriods).toBe(100n); + + const proofSet = provider.proofSets[0]; + expect(proofSet.nextDeadline).toBe(1000n); + expect(proofSet.maxProvingPeriod).toBe(100n); + }); + + it("converts string numbers to bigint", () => { + const result = validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + totalFaultedPeriods: "999999999999999999", + totalProvingPeriods: "1000000000000000000", + }), + ]), + ); + + expect(typeof result.providers[0].totalFaultedPeriods).toBe("bigint"); + expect(result.providers[0].totalFaultedPeriods).toBe(999999999999999999n); + expect(result.providers[0].totalProvingPeriods).toBe(1000000000000000000n); + }); + + it("accepts an empty providers array", () => { + const result = validateProviderDataSetResponse({ providers: [] }); + expect(result.providers).toEqual([]); + }); + + it("accepts a provider with empty proofSets", () => { + const result = validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ proofSets: [] })])); + expect(result.providers[0].proofSets).toEqual([]); + }); + + it("preserves unknown fields (schema uses .unknown(true))", () => { + const result = validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ extraField: "hello" })])); + expect((result.providers[0] as Record).extraField).toBe("hello"); + }); + + it("throws on missing providers field", () => { + expect(() => validateProviderDataSetResponse({})).toThrow("Invalid provider dataset response format"); + }); + + it("throws on null input", () => { + expect(() => validateProviderDataSetResponse(null)).toThrow("Invalid provider dataset response format"); + }); + + it("throws on missing required provider fields", () => { + expect(() => + validateProviderDataSetResponse({ + providers: [{ address: VALID_ADDRESS }], + }), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on invalid 
Ethereum address", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ address: "not-an-address" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on non-numeric string for bigint fields", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ totalFaultedPeriods: "abc" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on negative number string for bigint fields", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ totalFaultedPeriods: "-1" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on missing proofSet fields", () => { + expect(() => + validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + proofSets: [{ totalFaultedPeriods: "1" }], + }), + ]), + ), + ).toThrow("Invalid provider dataset response format"); + }); + + it("validates multiple providers in a single response", () => { + const provider1 = makeValidProvider({ address: VALID_ADDRESS, totalFaultedPeriods: "5" }); + const provider2 = makeValidProvider({ + address: "0xAb5801a7D398351b8bE11C439e05C5B3259aeC9B", + totalFaultedPeriods: "15", + }); + + const result = validateProviderDataSetResponse(makeValidResponse([provider1, provider2])); + + expect(result.providers).toHaveLength(2); + expect(result.providers[0].totalFaultedPeriods).toBe(5n); + expect(result.providers[1].totalFaultedPeriods).toBe(15n); + }); + + it("handles zero values correctly", () => { + const result = validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + totalFaultedPeriods: "0", + totalProvingPeriods: "0", + proofSets: [ + { + nextDeadline: "0", + maxProvingPeriod: "0", + }, + ], + }), + ]), + ); + + expect(result.providers[0].totalFaultedPeriods).toBe(0n); + expect(result.providers[0].totalProvingPeriods).toBe(0n); + 
expect(result.providers[0].proofSets[0].maxProvingPeriod).toBe(0n); + }); +}); + +describe("validateSubgraphMetaResponse", () => { + it("validates a well-formed subgraph meta response", () => { + const input = { + _meta: { + block: { + number: 12345, + }, + }, + }; + + const result = validateSubgraphMetaResponse(input); + + expect(result._meta.block.number).toBe(12345); + }); + + it("accepts large block numbers", () => { + const input = { + _meta: { + block: { + number: 999999999, + }, + }, + }; + + const result = validateSubgraphMetaResponse(input); + + expect(result._meta.block.number).toBe(999999999); + }); + + it("accepts numeric strings block number", () => { + const result = validateSubgraphMetaResponse({ + _meta: { + block: { + number: "12345", + }, + }, + }); + + expect(result._meta.block.number).toBe(12345); + }); + + it("throws on missing _meta field", () => { + expect(() => validateSubgraphMetaResponse({})).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on missing block field", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: {}, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on missing number field", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: {}, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on null input", () => { + expect(() => validateSubgraphMetaResponse(null)).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on undefined input", () => { + expect(() => validateSubgraphMetaResponse(undefined)).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on negative block number", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: { + number: -1, + }, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on floating point block number", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: { + 
number: 123.45, + }, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); +}); diff --git a/apps/backend/src/pdp-subgraph/types.ts b/apps/backend/src/pdp-subgraph/types.ts new file mode 100644 index 00000000..ad8dcdc4 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/types.ts @@ -0,0 +1,151 @@ +import Joi from "joi"; +import { Hex, isAddress } from "viem"; + +// ----------------------------------------- +// Types +// ----------------------------------------- + +/** The response from the subgraph GraphQL query */ +export type GraphQLResponse = { + /** The data from the query */ + data?: unknown; + /** The errors from the query */ + errors?: { message: string }[]; +}; + +/** + * Options for fetching providers with data sets + */ +export type ProvidersWithDataSetsOptions = { + addresses: string[]; + blockNumber: number; +}; + +/** + * Validated response from the PDP subgraph meta query. + */ +export type SubgraphMeta = { + _meta: { + block: { + number: number; + }; + }; +}; + +/** + * A single proof set within a provider, representing deadline-related proving data. + * All numeric fields are bigints converted from the subgraph string representation. + */ +export type DataSet = { + nextDeadline: bigint; + maxProvingPeriod: bigint; +}; + +/** + * Validated and transformed response from the PDP subgraph providers query. + * Numeric fields are converted from subgraph string representation to bigint. + */ +export type ProviderDataSetResponse = { + providers: { + address: Hex; + totalFaultedPeriods: bigint; + totalProvingPeriods: bigint; + proofSets: DataSet[]; + }[]; +}; + +// ----------------------------------------- +// Joi Custom Schema Converters +// ----------------------------------------- + +/** Joi custom validator that converts a numeric string to bigint. 
*/ +const toBigInt = (value: unknown, helpers: Joi.CustomHelpers) => { + try { + return BigInt(value as string); + } catch { + return helpers.error("any.invalid", { + message: "Invalid bigint value", + }); + } +}; + +/** Joi custom validator to validate an Ethereum address and normalize to lowercase. */ +const toEthereumAddress = (value: unknown, helpers: Joi.CustomHelpers) => { + if (!isAddress(value as string)) { + return helpers.error("any.invalid", { message: "Invalid Ethereum address" }); + } + + // Normalize to lowercase for consistent key lookups + return (value as string).toLowerCase() as Hex; +}; + +// ----------------------------------------- +// Joi Schemas +// ----------------------------------------- + +const metaSchema = Joi.object({ + _meta: Joi.object({ + block: Joi.object({ + number: Joi.number().integer().positive().required(), + }) + .unknown(true) + .required(), + }) + .unknown(true) + .required(), +}) + .unknown(true) + .required(); + +const dataSetSchema = Joi.object({ + nextDeadline: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + maxProvingPeriod: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), +}).unknown(true); + +const providerDataSetResponseSchema = Joi.object({ + providers: Joi.array() + .items( + Joi.object({ + address: Joi.string().required().custom(toEthereumAddress), + totalFaultedPeriods: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + totalProvingPeriods: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + proofSets: Joi.array().items(dataSetSchema).required(), + }).unknown(true), + ) + .required(), +}) + .unknown(true) + .required(); + +// ----------------------------------------- +// Validator Functions +// ----------------------------------------- + +/** + * Validates a raw subgraph meta response into SubgraphMeta. 
+ * + * @param value - The raw parsed JSON from the subgraph + * @throws Error if validation fails + */ +export function validateSubgraphMetaResponse(value: unknown): SubgraphMeta { + const { error, value: validated } = metaSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid subgraph meta response format: ${error.message}`); + } + return validated as SubgraphMeta; +} + +/** + * Validates and transforms a raw subgraph response into ProviderDataSetResponse. + * Converts string fields to bigint. + * + * @param value - The raw parsed JSON from the subgraph + * @throws Error if validation fails + */ +export function validateProviderDataSetResponse(value: unknown): ProviderDataSetResponse { + const { error, value: validated } = providerDataSetResponseSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid provider dataset response format: ${error.message}`); + } + return validated as ProviderDataSetResponse; +} diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 3067532c..97472c3c 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -51,6 +51,21 @@ class ValidationError extends Error { } } +/** + * Client for the dealbot-owned subgraph (driven by `SUBGRAPH_ENDPOINT`). + * + * Functionally a superset of `PDPSubgraphService`: it exposes the same + * `fetchSubgraphMeta` / `fetchProvidersWithDatasets` surface plus the new + * `sampleAnonPiece` query used by anonymous retrievals. + * + * The two services intentionally coexist while we migrate off the upstream + * pdp-explorer subgraph: `PDPSubgraphService` continues to drive the + * established data-retention path against `PDP_SUBGRAPH_ENDPOINT`, and + * `SubgraphService` is scoped to the new anonymous-retrieval flow only. 
+ * Once the dealbot-owned subgraph has soaked in production, this service + * should become the single drop-in replacement for `PDPSubgraphService` + * and `PDP_SUBGRAPH_ENDPOINT` can be retired. + */ @Injectable() export class SubgraphService { private readonly logger: Logger = new Logger(SubgraphService.name); diff --git a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts index 195db19f..d6613a31 100644 --- a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts +++ b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts @@ -18,7 +18,7 @@ const baseConfig: IBlockchainConfig = { checkDatasetCreationFees: false, useOnlyApprovedProviders: false, minNumDataSetsForChecks: 1, - subgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", + pdpSubgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", }; const makeProvider = (overrides: Partial): PDPProviderEx => diff --git a/docs/checks/data-retention.md b/docs/checks/data-retention.md index 4eb7a912..605753e7 100644 --- a/docs/checks/data-retention.md +++ b/docs/checks/data-retention.md @@ -27,7 +27,7 @@ Dealbot polls The Graph API endpoint for PDP (Proof of Data Possession) data at **Subgraph repository**: [FilOzone/pdp-explorer](https://github.com/FilOzone/pdp-explorer/blob/main/subgraph/src/pdp-verifier.ts) -**Subgraph endpoint**: Configured via `SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#subgraph_endpoint)) +**Subgraph endpoint**: Configured via `PDP_SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#pdp_subgraph_endpoint)) > **Note**: The production subgraph URL is currently being finalized [here](https://github.com/FilOzone/pdp-explorer/pull/86). 
@@ -48,7 +48,7 @@ From `GET_PROVIDERS_WITH_DATASETS` query for each provider: > **Note**: The subgraph query uses the field name `proofSets`, but this refers to "dataSets" in the current codebase. The terminology was updated from "proof set" to "data set" but the subgraph schema retains the old naming. -Source: [`subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/subgraph/subgraph.service.ts) +Source: [`pdp-subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) ### 2. Compute Challenge Totals and Overdue Estimates @@ -170,7 +170,7 @@ The PDP subgraph service enforces Goldsky's public endpoint rate limits: Rate limiting is enforced client-side to prevent 429 errors. -Source: [`subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/subgraph/subgraph.service.ts) +Source: [`pdp-subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) ## Metrics Recorded @@ -210,11 +210,11 @@ Key environment variables that control data retention check behavior: | Variable | Required | Default | Description | | ----------------------- | -------- | ------------ | ------------------------------------------------------------------------------------------------ | -| `SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | +| `PDP_SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | Source: [`app.config.ts`](../../apps/backend/src/config/app.config.ts) -See also: [`environment-variables.md`](../environment-variables.md#subgraph_endpoint) for the full configuration reference. +See also: [`environment-variables.md`](../environment-variables.md#pdp_subgraph_endpoint) for the full configuration reference. 
## Error Handling diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 6b2859aa..2e89a45d 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -40,7 +40,7 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| -| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | - | Points at a Goldsky deployment of the dealbot-owned subgraph in [`apps/subgraph/`](../../apps/subgraph/) (package `@dealbot/subgraph`). | +| [`PDP_SUBGRAPH_ENDPOINT`](../environment-variables.md#pdp_subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | | [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. 
diff --git a/docs/environment-variables.md b/docs/environment-variables.md index e2b23735..91e28abc 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -8,7 +8,7 @@ This document provides a comprehensive guide to all environment variables used b | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [Application](#application-configuration) | `NODE_ENV`, `DEALBOT_PORT`, `DEALBOT_HOST`, `DEALBOT_RUN_MODE`, `DEALBOT_METRICS_PORT`, `DEALBOT_METRICS_HOST`, `DEALBOT_ALLOWED_ORIGINS`, `ENABLE_DEV_MODE` | | [Database](#database-configuration) | `DATABASE_HOST`, `DATABASE_PORT`, `DATABASE_POOL_MAX`, `DATABASE_USER`, `DATABASE_PASSWORD`, `DATABASE_NAME` | -| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `SUBGRAPH_ENDPOINT` | +| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `PDP_SUBGRAPH_ENDPOINT`, `SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | | [Scheduling](#scheduling-configuration) | `PROVIDERS_REFRESH_INTERVAL_SECONDS`, `DATA_RETENTION_POLL_INTERVAL_SECONDS`, `DEALBOT_MAINTENANCE_WINDOWS_UTC`, `DEALBOT_MAINTENANCE_WINDOW_MINUTES` | | [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `RETRIEVALS_ANON_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, 
`RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT`, `IPFS_BLOCK_FETCH_CONCURRENCY` | @@ -425,13 +425,35 @@ Session keys are scoped (only storage operations, not deposits or withdrawals) a --- +### `PDP_SUBGRAPH_ENDPOINT` + +- **Type**: `string` (URL) +- **Required**: No +- **Default**: Empty string (feature disabled) + +**Role**: The Graph API endpoint for the upstream pdp-explorer subgraph. Drives the data-retention overdue-periods metric. + +This variable is kept distinct from [`SUBGRAPH_ENDPOINT`](#subgraph_endpoint) so the dealbot-owned subgraph can be rolled out incrementally — only the new anonymous-retrieval flow points at the new endpoint while the established data-retention path stays on the upstream subgraph. + +**When to update**: + +- When switching between different Graph API endpoints for the pdp-explorer subgraph. + +**Example**: + +```bash +PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +``` + +--- + ### `SUBGRAPH_ENDPOINT` - **Type**: `string` (URL) - **Required**: No - **Default**: Empty string (feature disabled) -**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. Drives the overdue-periods metric and the anonymous-retrieval candidate-piece query. +**Role**: The Graph API endpoint for the dealbot-owned subgraph. Currently drives only the [anonymous-retrieval](./checks/anon-retrievals.md) candidate-piece query. Once the dealbot-owned subgraph has soaked in production it is intended to replace [`PDP_SUBGRAPH_ENDPOINT`](#pdp_subgraph_endpoint). The dealbot-owned subgraph lives at `apps/subgraph/` (package `@dealbot/subgraph`) and is deployed to Goldsky. Point this variable at one of those slots; the exact slugs are documented in `apps/subgraph/README.md`. 
diff --git a/kustomize/overlays/local/backend-configmap-local.yaml b/kustomize/overlays/local/backend-configmap-local.yaml index b4febf61..52918aa2 100644 --- a/kustomize/overlays/local/backend-configmap-local.yaml +++ b/kustomize/overlays/local/backend-configmap-local.yaml @@ -26,6 +26,7 @@ data: PG_BOSS_LOCAL_CONCURRENCY: "3" JOB_WORKER_POLL_SECONDS: "60" RANDOM_PIECE_SIZES: "10485760" + PDP_SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" JOB_SCHEDULER_POLL_SECONDS: "60" CLICKHOUSE_URL: "http://default:@dealbot-clickhouse:8123/dealbot" From d82222f530489c9a054e7166a2a61fcc86bbec5c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 5 May 2026 09:17:25 +0200 Subject: [PATCH 20/55] refactor: reduce pr diff noise --- apps/backend/src/jobs/jobs.module.ts | 2 +- apps/backend/src/jobs/jobs.service.spec.ts | 52 +++++++++++----------- apps/backend/src/jobs/jobs.service.ts | 3 +- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/apps/backend/src/jobs/jobs.module.ts b/apps/backend/src/jobs/jobs.module.ts index 69f1edb1..fb708e09 100644 --- a/apps/backend/src/jobs/jobs.module.ts +++ b/apps/backend/src/jobs/jobs.module.ts @@ -18,10 +18,10 @@ import { JobScheduleRepository } from "./repositories/job-schedule.repository.js TypeOrmModule.forFeature([StorageProvider, JobScheduleState]), DealModule, RetrievalModule, - RetrievalAnonModule, WalletSdkModule, DataRetentionModule, PieceCleanupModule, + RetrievalAnonModule, ], providers: [JobsService, JobScheduleRepository], }) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index c20d0890..8983c723 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -52,10 +52,10 @@ describe("JobsService 
schedule rows", () => { jobScheduleRepository: JobsServiceDeps[2]; dealService: JobsServiceDeps[3]; retrievalService: JobsServiceDeps[4]; - anonRetrievalService: JobsServiceDeps[5]; - walletSdkService: JobsServiceDeps[6]; - dataRetentionService: JobsServiceDeps[7]; - pieceCleanupService: JobsServiceDeps[8]; + walletSdkService: JobsServiceDeps[5]; + dataRetentionService: JobsServiceDeps[6]; + pieceCleanupService: JobsServiceDeps[7]; + anonRetrievalService: JobsServiceDeps[8]; jobsQueuedGauge: JobsServiceDeps[9]; jobsRetryScheduledGauge: JobsServiceDeps[10]; oldestQueuedAgeGauge: JobsServiceDeps[11]; @@ -160,10 +160,10 @@ describe("JobsService schedule rows", () => { overrides.jobScheduleRepository ?? (jobScheduleRepositoryMock as unknown as JobsServiceDeps[2]), overrides.dealService ?? ({} as JobsServiceDeps[3]), overrides.retrievalService ?? ({} as JobsServiceDeps[4]), - overrides.anonRetrievalService ?? ({} as JobsServiceDeps[5]), - overrides.walletSdkService ?? ({} as JobsServiceDeps[6]), - overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[7]), - overrides.pieceCleanupService ?? ({} as JobsServiceDeps[8]), + overrides.walletSdkService ?? ({} as JobsServiceDeps[5]), + overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[6]), + overrides.pieceCleanupService ?? ({} as JobsServiceDeps[7]), + overrides.anonRetrievalService ?? ({} as JobsServiceDeps[8]), overrides.jobsQueuedGauge ?? metricsMocks.jobsQueuedGauge, overrides.jobsRetryScheduledGauge ?? metricsMocks.jobsRetryScheduledGauge, overrides.oldestQueuedAgeGauge ?? 
metricsMocks.oldestQueuedAgeGauge, @@ -287,7 +287,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); // Trigger the timeout immediately by using fake timers @@ -346,7 +346,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); vi.useFakeTimers(); @@ -385,7 +385,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleRetrievalJob", { @@ -425,7 +425,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await expect( @@ -928,7 +928,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDealJob", { @@ -967,8 +967,8 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as 
ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], - pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[8], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[7], }); await callPrivate(service, "handleDealJob", { @@ -1000,7 +1000,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDealJob", { @@ -1029,7 +1029,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1071,7 +1071,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1112,7 +1112,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1157,7 +1157,7 @@ describe("JobsService schedule rows", () => { service = buildService({ 
configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1330,7 +1330,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); await callPrivate(service, "handleDealJob", { @@ -1354,7 +1354,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); await callPrivate(service, "handleRetrievalJob", { @@ -1383,7 +1383,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1425,7 +1425,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }), expectCheckNotRun: () => expect(dealService.createDealForProvider).not.toHaveBeenCalled(), }, @@ -1435,7 +1435,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as 
JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }), expectCheckNotRun: () => expect(retrievalService.performRandomRetrievalForProvider).not.toHaveBeenCalled(), }, @@ -1445,7 +1445,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 3600, service: buildService({ dealService: dataSetDealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }), expectCheckNotRun: () => expect(dataSetDealService.createDataSetWithPiece).not.toHaveBeenCalled(), }, diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index b070de5a..e09cf42c 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -72,10 +72,11 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private readonly jobScheduleRepository: JobScheduleRepository, private readonly dealService: DealService, private readonly retrievalService: RetrievalService, - private readonly anonRetrievalService: AnonRetrievalService, private readonly walletSdkService: WalletSdkService, private readonly dataRetentionService: DataRetentionService, private readonly pieceCleanupService: PieceCleanupService, + private readonly anonRetrievalService: AnonRetrievalService, + @InjectMetric("jobs_queued") private readonly jobsQueuedGauge: Gauge, @InjectMetric("jobs_retry_scheduled") From 527283fcc8f082f8af2920fa8367f84e4f87cdb8 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 5 May 2026 09:33:25 +0200 Subject: [PATCH 21/55] remove: residual references to a pdp subgraph in the subgraph module --- apps/backend/src/subgraph/subgraph.service.spec.ts | 6 +++--- apps/backend/src/subgraph/subgraph.service.ts | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/apps/backend/src/subgraph/subgraph.service.spec.ts 
b/apps/backend/src/subgraph/subgraph.service.spec.ts index 8703b2c5..64f28435 100644 --- a/apps/backend/src/subgraph/subgraph.service.spec.ts +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -397,14 +397,14 @@ describe("SubgraphService", () => { }); }); - it("throws when PDP subgraph endpoint is not configured", async () => { + it("throws when subgraph endpoint is not configured", async () => { const configService = { get: vi.fn(() => ({ subgraphEndpoint: "" })), } as unknown as ConfigService; const serviceWithoutEndpoint = new SubgraphService(configService); - await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No PDP subgraph endpoint configured"); + await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No subgraph endpoint configured"); }); it("throws on HTTP error response", async () => { @@ -740,7 +740,7 @@ describe("SubgraphService", () => { const noEndpointService = new SubgraphService(noEndpointConfig); await expect(noEndpointService.sampleAnonPiece(defaultSampleParams)).rejects.toThrow( - "No PDP subgraph endpoint configured", + "No subgraph endpoint configured", ); expect(fetchMock).not.toHaveBeenCalled(); }); diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 97472c3c..3d4e8370 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -133,9 +133,9 @@ export class SubgraphService { // candidate pool (which silently no-ops every anon retrieval job). 
this.logger.error({ event: "subgraph_endpoint_not_configured", - message: "Cannot sample anonymous piece — no PDP subgraph endpoint configured", + message: "Cannot sample anonymous piece — no subgraph endpoint configured", }); - throw new Error("No PDP subgraph endpoint configured"); + throw new Error("No subgraph endpoint configured"); } const query = buildSampleAnonPieceQuery(params.pool); @@ -194,7 +194,7 @@ export class SubgraphService { attempt: number = 1, ): Promise { if (!this.blockchainConfig.subgraphEndpoint) { - throw new Error("No PDP subgraph endpoint configured"); + throw new Error("No subgraph endpoint configured"); } try { @@ -297,7 +297,7 @@ export class SubgraphService { attempt: number = 1, ): Promise { if (!this.blockchainConfig.subgraphEndpoint) { - throw new Error("No PDP subgraph endpoint configured"); + throw new Error("No subgraph endpoint configured"); } const variables = { From 8dfb3ca9f2508cb24dea95f22d5380d65643d51c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 21:10:42 +0200 Subject: [PATCH 22/55] Apply suggestion from @BigLep Co-authored-by: Steve Loeppky --- docs/checks/anon-retrievals.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 0a303462..2c15f3ed 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -30,7 +30,7 @@ Operational timeouts exist to prevent jobs from running indefinitely. If the job ## Piece Selection -Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not retrieve from its own deals. Pieces are sampled from the on-chain subgraph of all FWSS-served pieces for the SP under test. +Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not retrieve from its own deals. Pieces are sampled from the [on-chain subgraph](../../apps/subgraph/) of all FWSS-served pieces for the SP under test.
Selection strategy (per scheduled job, per SP): @@ -59,7 +59,7 @@ flowchart TD Select["Sample anonymous piece for SP from subgraph"] --> Fetch["GET /piece/{pieceCid}"] Fetch --> CommP["Hash bytes → verify CommP"] CommP --> HasIpfs{"piece.withIPFSIndexing
and ipfsRootCid?"} - HasIpfs -- "no" --> Record["Persist row + metrics"] + HasIpfs -- "no" --> Record["Persist ClickHouse row + emit Prometheus metrics"] HasIpfs -- "yes" --> ParseCar["Parse bytes as CAR"] ParseCar --> SampleBlocks["Pick N random CIDs
(ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT)"] SampleBlocks --> Ipni["IPNI: verify SP advertises root + sampled CIDs"] From b8a2621ce5747d2d351066147d4d3487dbd56169 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:01:13 +0200 Subject: [PATCH 23/55] chore: align pnpm-lock.yaml with main --- pnpm-lock.yaml | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8089b756..0495aa11 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1513,24 +1513,24 @@ packages: engines: {node: ^14.18.0 || >=16.10.0, npm: '>=5.10.0'} hasBin: true - '@oclif/core@4.10.6': - resolution: {integrity: sha512-ySCOYnPKZE3KACT1V9It99hWG9b8E5MpagbRdWxPNRO3beMqmbr4SLUQoFtZ9XRtW++kks1ZVwZOdpnR8rpb9A==} + '@oclif/core@4.10.5': + resolution: {integrity: sha512-qcdCF7NrdWPfme6Kr34wwljRCXbCVpL1WVxiNy0Ep6vbWKjxAjFQwuhqkoyL0yjI+KdwtLcOCGn5z2yzdijc8w==} engines: {node: '>=18.0.0'} '@oclif/core@4.5.5': resolution: {integrity: sha512-iQzlaJQgPeUXrtrX71OzDwxPikQ7c2FhNd8U8rBB7BCtj2XYfmzBT/Hmbc+g9OKDIG/JkbJT0fXaWMMBrhi+1A==} engines: {node: '>=18.0.0'} - '@oclif/plugin-autocomplete@3.2.46': - resolution: {integrity: sha512-TFvuD6JlmqEVsEvMqunyj3cyCz/l2Q4MqCjp/XtlSLS9x3xTlam7PGlqWi4WAhxl/K8CtpYqVlMYFEnlLTHspw==} + '@oclif/plugin-autocomplete@3.2.45': + resolution: {integrity: sha512-ENrUg8rbVCjh40uvi3MC9kGbiUoEf11nyqE59RBzegeeLpRXNo/Zp27L9j1tUmPEqGgfS2/wvHPihNzkpK1FDw==} engines: {node: '>=18.0.0'} - '@oclif/plugin-not-found@3.2.81': - resolution: {integrity: sha512-M88tLONBH36hLAbkFbmCo1hoZPSdU5l8Px1xEIlIgSmGMam+CoAzx4kGqpLbokgfpaHeP8/Jx3QJ18u9ef/2Qw==} + '@oclif/plugin-not-found@3.2.80': + resolution: {integrity: sha512-yTLjWvR1r/Rd/cO2LxHdMCDoL5sQhBYRUcOMCmxZtWVWhx4rAZ8KVUPDVsb+SvjJDV5ADTDBgt1H52fFx7YWqg==} engines: {node: '>=18.0.0'} - '@oclif/plugin-warn-if-update-available@3.1.61': - resolution: {integrity: sha512-4XcrTxcCs+brR/eZ0BPeuiREiH3USlJiaHbUqPhnIBuyxhhUSYVd8ZO6s5MQN7AXJq4SMQ+B5zLaHq+ep/afIw==} 
+ '@oclif/plugin-warn-if-update-available@3.1.60': + resolution: {integrity: sha512-cRKBZm14IuA6G8W84dfd3iXj3BTAoxQ5o3pUE8DKEQ4n/tVha20t5nkVeD+ISC68e0Fuw5koTMvRwXb1lJSnzg==} engines: {node: '>=18.0.0'} '@open-draft/deferred-promise@2.2.0': @@ -7599,9 +7599,9 @@ snapshots: dependencies: '@float-capital/float-subgraph-uncrashable': 0.0.0-internal-testing.5 '@oclif/core': 4.5.5 - '@oclif/plugin-autocomplete': 3.2.46 - '@oclif/plugin-not-found': 3.2.81(@types/node@25.2.3) - '@oclif/plugin-warn-if-update-available': 3.1.61 + '@oclif/plugin-autocomplete': 3.2.45 + '@oclif/plugin-not-found': 3.2.80(@types/node@25.6.2) + '@oclif/plugin-warn-if-update-available': 3.1.60 '@pinax/graph-networks-registry': 0.7.1 '@whatwg-node/fetch': 0.10.13 assemblyscript: 0.19.23 @@ -8937,7 +8937,7 @@ snapshots: dependencies: consola: 3.4.2 - '@oclif/core@4.10.6': + '@oclif/core@4.10.5': dependencies: ansi-escapes: 4.3.2 ansis: 3.17.0 @@ -8979,7 +8979,7 @@ snapshots: wordwrap: 1.0.0 wrap-ansi: 7.0.0 - '@oclif/plugin-autocomplete@3.2.46': + '@oclif/plugin-autocomplete@3.2.45': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -8988,16 +8988,16 @@ snapshots: transitivePeerDependencies: - supports-color - '@oclif/plugin-not-found@3.2.81(@types/node@25.2.3)': + '@oclif/plugin-not-found@3.2.80(@types/node@25.6.2)': dependencies: - '@inquirer/prompts': 7.10.1(@types/node@25.2.3) - '@oclif/core': 4.10.6 + '@inquirer/prompts': 7.10.1(@types/node@25.6.2) + '@oclif/core': 4.10.5 ansis: 3.17.0 fast-levenshtein: 3.0.0 transitivePeerDependencies: - '@types/node' - '@oclif/plugin-warn-if-update-available@3.1.61': + '@oclif/plugin-warn-if-update-available@3.1.60': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -11779,7 +11779,7 @@ snapshots: dependencies: foreground-child: 3.3.1 jackspeak: 4.2.3 - minimatch: 10.2.5 + minimatch: 10.2.4 minipass: 7.1.2 package-json-from-dist: 1.0.1 path-scurry: 2.0.1 From 70af7c07ad194a71b8c82f05e391e386daba1827 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: 
Fri, 15 May 2026 22:16:25 +0200 Subject: [PATCH 24/55] fix: wrong reference to an old maximum anon retrieval piece size --- apps/backend/.env.example | 2 +- apps/backend/src/config/app.config.ts | 4 ++-- docs/environment-variables.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 30556e7a..807de908 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -67,7 +67,7 @@ JOB_SCHEDULE_PHASE_SECONDS=0 JOB_ENQUEUE_JITTER_SECONDS=0 DEAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for deal jobs (TODO: reduce default to 3m) RETRIEVAL_JOB_TIMEOUT_SECONDS=60 # 1m: Max runtime for retrieval jobs (TODO: reduce default to 30s) -ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for anon retrieval jobs (pieces up to ~70 MiB) +ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for anon retrieval jobs (pieces up to ~500 MiB) IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IPFS DAGs DEALBOT_PGBOSS_POOL_MAX=1 DEALBOT_PGBOSS_SCHEDULER_ENABLED=true diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index 7906be8c..49b55606 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -101,7 +101,7 @@ export const configValidationSchema = Joi.object({ JOB_ENQUEUE_JITTER_SECONDS: Joi.number().min(0).default(0), DEAL_JOB_TIMEOUT_SECONDS: Joi.number().min(120).default(360), // 6 minutes max runtime for data storage jobs (TODO: reduce default to 3 minutes) RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(60), // 1 minute max runtime for retrieval jobs (TODO: reduce default to 30 seconds) - ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes max runtime for anon retrieval jobs (pieces can be up to ~70 MiB) + ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes max runtime for anon retrieval jobs (pieces can be up 
to 500 MiB) DATA_SET_CREATION_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5 minutes max runtime for dataset creation jobs IPFS_BLOCK_FETCH_CONCURRENCY: Joi.number().integer().min(1).max(32).default(6), ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT: Joi.number().integer().min(1).max(50).default(5), @@ -281,7 +281,7 @@ export interface IJobsConfig { /** * Maximum runtime (seconds) for anonymous retrieval jobs before forced abort. * - * Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB), so this is + * Anonymous retrievals fetch arbitrary pieces (up to ~500 MiB), so this is * typically larger than `retrievalJobTimeoutSeconds`. Uses AbortController * to actively cancel job execution while still persisting partial metrics. */ diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 91e28abc..547170ac 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -832,7 +832,7 @@ Use this to stagger multiple dealbot deployments that are not sharing a database - **Minimum**: `60` - **Enforced**: Yes (config validation) -**Role**: Maximum runtime for anonymous retrieval jobs before forced abort. Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB) that were not produced by the dealbot, so this is typically larger than `RETRIEVAL_JOB_TIMEOUT_SECONDS`. When the timeout trips, partial metrics (`ttfb_ms`, `bytes_retrieved`, `response_code`) are still persisted so the abort is not silently lost. +**Role**: Maximum runtime for anonymous retrieval jobs before forced abort. Anonymous retrievals fetch arbitrary pieces (up to ~500 MiB) that were not produced by the dealbot, so this is typically larger than `RETRIEVAL_JOB_TIMEOUT_SECONDS`. When the timeout trips, partial metrics (`ttfb_ms`, `bytes_retrieved`, `response_code`) are still persisted so the abort is not silently lost. 
**When to update**: From b003d78250412cecf36a86fa5f0f78f60876cc47 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:17:16 +0200 Subject: [PATCH 25/55] docs: improve anon retrieval documentation --- docs/checks/events-and-metrics.md | 2 +- docs/environment-variables.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index fba8b003..9c8a5ae0 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -4,7 +4,7 @@ This document is the intended **source of truth** for the events emitted by deal > **Note on "events":** the entries in the [Event List](#event-list) are named **timing markers** used to define metric Timer Starts/Ends — they are not all emitted as discrete Prometheus events or log lines. Each marker is anchored in code (as a timestamp variable, log line, or status transition) and used to compute the metrics in the [Metrics](#metrics) section. -## Anonymous Retrieval Event Model +## Data Storage Event Model The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per piece: select → fetch piece → (optional) parse CAR + IPNI + block fetch → write one ClickHouse row. diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 547170ac..72fadca0 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -431,9 +431,9 @@ Session keys are scoped (only storage operations, not deposits or withdrawals) a - **Required**: No - **Default**: Empty string (feature disabled) -**Role**: The Graph API endpoint for the upstream pdp-explorer subgraph. Drives the data-retention overdue-periods metric. +**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. This endpoint is used to retrieve data retention info for provider data. 
-This variable is kept distinct from [`SUBGRAPH_ENDPOINT`](#subgraph_endpoint) so the dealbot-owned subgraph can be rolled out incrementally — only the new anonymous-retrieval flow points at the new endpoint while the established data-retention path stays on the upstream subgraph. +This variable is kept distinct from [`SUBGRAPH_ENDPOINT`](#subgraph_endpoint) so the [dealbot-owned subgraph](../apps/subgraph) can be rolled out incrementally. Only the newer [anonymous-retrieval check](./checks/anon-retrievals.md) points at the new endpoint while the established [data-retention check](./checks/data-retention.md) stays on the upstream subgraph. **When to update**: @@ -455,7 +455,7 @@ PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp **Role**: The Graph API endpoint for the dealbot-owned subgraph. Currently drives only the [anonymous-retrieval](./checks/anon-retrievals.md) candidate-piece query. Once the dealbot-owned subgraph has soaked in production it is intended to replace [`PDP_SUBGRAPH_ENDPOINT`](#pdp_subgraph_endpoint). -The dealbot-owned subgraph lives at `apps/subgraph/` (package `@dealbot/subgraph`) and is deployed to Goldsky. Point this variable at one of those slots; the exact slugs are documented in `apps/subgraph/README.md`. +The dealbot-owned subgraph lives at [`apps/subgraph/`](../apps/subgraph) (package `@dealbot/subgraph`) and is deployed to [Goldsky](https://goldsky.com).
**When to update**: From 21b4f2d5045dc6261b915c7e14c75521ddb83d89 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:54:51 +0200 Subject: [PATCH 26/55] docs: fix accidental changes to untouched event descriptions --- docs/checks/events-and-metrics.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 9c8a5ae0..1e9d8583 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -118,8 +118,8 @@ sequenceDiagram | `ipfsRetrievalHttpResponseCode` | Data Storage, Retrieval | [`ipfsRetrievalLastByteReceived`](#ipfsRetrievalLastByteReceived) | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` | [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts) | | `retrievalStatus` | Data Storage, Retrieval | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | `success`, `failure.timedout`, `failure.other` from [Data Storage Sub-status meanings](./data-storage.md#sub-status-meanings). 
| | | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | -| `dataSetChallengeStatus` | Data Retention | Not tied to an [event above](#event-list) but rather to the periodic chain-checking done in the [Data Retention Check](./data-retention.md) | `success`, `failure` | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `pdp_provider_overdue_periods` | Data Retention | Emitted on every poll | Gauge value (estimated overdue periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From a4f0b38fdb789de01d1dbff7e8977434320c0008 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:56:32 +0200 Subject: [PATCH 27/55] rename: metric anonRetrievalStatus to anonPieceRetrievalStatus https://github.com/FilOzone/dealbot/pull/487/changes#r3245245410 --- apps/backend/src/metrics-prometheus/check-metrics.service.ts | 2 +- .../src/metrics-prometheus/metrics-prometheus.module.ts | 4 ++-- docs/checks/anon-retrievals.md | 2 +- docs/checks/events-and-metrics.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 8d4be313..76a8ee31 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -260,7 +260,7 @@ export class AnonRetrievalCheckMetrics { private readonly throughputBps: Histogram, @InjectMetric("anonRetrievalCheckMs") private readonly checkMs: Histogram, - @InjectMetric("anonRetrievalStatus") + @InjectMetric("anonPieceRetrievalStatus") private readonly statusCounter: Counter, @InjectMetric("anonPieceHttpResponseCode") private readonly httpResponseCounter: Counter, diff --git 
a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 45f728b6..4ebeb01a 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -234,8 +234,8 @@ const metricProviders = [ buckets: [100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000, 600000], }), makeCounterProvider({ - name: "anonRetrievalStatus", - help: "Anonymous retrieval overall outcome", + name: "anonPieceRetrievalStatus", + help: "Anonymous piece retrieval overall outcome", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), makeCounterProvider({ diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 2c15f3ed..c3b69610 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -91,7 +91,7 @@ Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car- | # | Assertion | How It's Checked | Retries | Relevant Metric | Implemented? 
| |---|-----------|------------------|:---:|------------------|:---:| | 1 | SP serves the piece | `GET /piece/{pieceCid}` returns HTTP 2xx | 0 | [`anonPieceRetrievalLastByteMs`](./events-and-metrics.md#anonPieceRetrievalLastByteMs) | Yes | -| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonRetrievalStatus`](./events-and-metrics.md#anonRetrievalStatus) | Yes | +| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonPieceRetrievalStatus`](./events-and-metrics.md#anonPieceRetrievalStatus) | Yes | | 3 | Bytes parse as a CAR (IPFS-indexed pieces only) | `@ipld/car` parses the response | 0 | [`anonCarParseStatus`](./events-and-metrics.md#anonCarParseStatus) | Yes | | 4 | SP is advertised on IPNI for root + sampled CIDs | filecoinpin.contact returns provider records | polling until timeout | [`anonIpniStatus`](./events-and-metrics.md#anonIpniStatus) | Yes | | 5 | Sampled blocks fetch + hash-verify | `/ipfs/{cid}?format=raw` for each sample | 0 | [`anonBlockFetchStatus`](./events-and-metrics.md#anonBlockFetchStatus) | Yes | diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 1e9d8583..2421242c 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -10,7 +10,7 @@ The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per It is not modeled as a sequence of named lifecycle events. Instead it emits: -- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends. 
+- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonPieceRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends. - **One row per attempt** in the `anon_retrieval_checks` [ClickHouse table](#clickhouse-tables), emitted even on abort or unexpected error. - **Structured log lines** (`anon_retrieval_started`, `anon_retrieval_completed`, `anon_retrieval_no_piece`, `anon_retrieval_car_validation_failed`, `anon_retrieval_clickhouse_insert_failed`) carrying a `retrievalId` so each row can be joined back to log evidence. @@ -120,7 +120,7 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. 
| Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From 1a32373e29e7868c39f19d68ded25e3d238b9858 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:58:07 +0200 Subject: [PATCH 28/55] fix: interpret abort signal as timed out for metric --- 
apps/backend/src/retrieval-anon/anon-retrieval.service.ts | 2 +- docs/checks/events-and-metrics.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index eddc88f0..a74c2bf0 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -142,7 +142,7 @@ export class AnonRetrievalService { pieceServedCorrectly ? "success" : pieceResult.aborted - ? "failure.aborted" + ? "failure.timedout" : pieceResult.success ? "failure.commp" : "failure.http", diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 2421242c..37761e89 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -120,7 +120,7 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. 
| Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.timedout`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From fc8f3785586f56730b563d574ec63fec7660744e Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 23:03:43 +0200 Subject: [PATCH 29/55] docs: incorporate pr feedback --- docs/checks/events-and-metrics.md | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 37761e89..65c735a4 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -121,7 +121,7 @@ sequenceDiagram | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.timedout`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | Same as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonBlockFetchStatus` | Anonymous Retrieval | After block-fetch sampling runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From 12b5eb67c8cdd63cd2141b529ea1386fbd487137 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 23:08:17 +0200 Subject: [PATCH 30/55] remove: unnecessary whitespace --- .../checks/production-configuration-and-approval-methodology.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 2e89a45d..5566904d 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -41,7 +41,7 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| | [`PDP_SUBGRAPH_ENDPOINT`](../environment-variables.md#pdp_subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). 
| -| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | +| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. 
From 79b6bbcba17632762f2310ddd30d4bae90898c8d Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 07:52:01 +0200 Subject: [PATCH 31/55] fix(anon-retrieval): defer job during maintenance window --- apps/backend/src/jobs/jobs.service.spec.ts | 52 +++++++++++++++++++++- apps/backend/src/jobs/jobs.service.ts | 19 +++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index 8983c723..944a8daf 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -1,7 +1,12 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { DealJobTerminatedDataSetError } from "../common/errors.js"; import type { IConfig, ISpBlocklistConfig } from "../config/app.config.js"; -import { DATA_RETENTION_POLL_QUEUE, PROVIDERS_REFRESH_QUEUE, SP_WORK_QUEUE } from "./job-queues.js"; +import { + DATA_RETENTION_POLL_QUEUE, + PROVIDERS_REFRESH_QUEUE, + RETRIEVAL_ANON_QUEUE, + SP_WORK_QUEUE, +} from "./job-queues.js"; import { JobsService } from "./jobs.service.js"; type JobsServiceDeps = ConstructorParameters; @@ -912,6 +917,51 @@ describe("JobsService schedule rows", () => { ); }); + it("anon retrieval job defers to RETRIEVAL_ANON_QUEUE during maintenance window", async () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2024-01-01T07:05:00Z")); + + baseConfigValues = { + ...baseConfigValues, + scheduling: { + ...baseConfigValues.scheduling, + maintenanceWindowsUtc: ["07:00"], + maintenanceWindowMinutes: 20, + } as IConfig["scheduling"], + }; + configService = { + get: vi.fn((key: keyof IConfig) => baseConfigValues[key]), + } as unknown as JobsServiceDeps[0]; + + const anonRetrievalService = { performForProvider: vi.fn() }; + const walletSdkService = { + getProviderInfo: vi.fn(() => ({ id: 1n, name: "sp" })), + }; + + service = buildService({ + configService, + anonRetrievalService: 
anonRetrievalService as unknown as JobsServiceDeps[8], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + }); + + const safeSend = vi.fn().mockResolvedValue(true); + (service as unknown as { safeSend: typeof safeSend }).safeSend = safeSend; + + await callPrivate(service, "handleAnonRetrievalJob", { + id: "job-anon-maintenance", + data: { spAddress: "0xaaa", intervalSeconds: 60 }, + }); + + const expectedResumeAt = new Date("2024-01-01T07:20:00Z"); + expect(anonRetrievalService.performForProvider).not.toHaveBeenCalled(); + expect(safeSend).toHaveBeenCalledWith( + "retrieval_anon", + RETRIEVAL_ANON_QUEUE, + { jobType: "retrieval_anon", spAddress: "0xaaa", intervalSeconds: 60 }, + { startAfter: expectedResumeAt }, + ); + }); + it("deal job delegates to createDealForProvider", async () => { vi.useFakeTimers(); vi.setSystemTime(new Date("2024-01-01T12:00:00Z")); diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index e09cf42c..3f3eb452 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -624,6 +624,23 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private async handleAnonRetrievalJob(job: Job): Promise { const data = job.data; const spAddress = data.spAddress; + const now = new Date(); + const maintenance = this.getMaintenanceWindowStatus(now); + if (maintenance.active) { + this.logMaintenanceSkip(`retrieval_anon job for ${spAddress}`, maintenance.window?.label, { + jobId: job.id, + providerAddress: spAddress, + providerId: this.walletSdkService.getProviderInfo(spAddress)?.id, + providerName: this.walletSdkService.getProviderInfo(spAddress)?.name, + }); + await this.deferJobForMaintenance( + "retrieval_anon", + { jobType: "retrieval_anon", spAddress, intervalSeconds: data.intervalSeconds }, + maintenance, + now, + ); + return; + } // Create AbortController for job timeout enforcement const abortController = new AbortController(); @@ 
-891,7 +908,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { if (resumeAt == null) { return; } - await this.safeSend(jobType, SP_WORK_QUEUE, data, { startAfter: resumeAt }); + await this.safeSend(jobType, this.mapJobName(jobType), data, { startAfter: resumeAt }); } /** From 9ae0457893787ae7ab1be0835f132dffb6ab4a8c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 07:52:20 +0200 Subject: [PATCH 32/55] fix(retrieval-anon): skip job if SP is blocked --- apps/backend/src/jobs/jobs.service.spec.ts | 24 ++++++++++++++++++++++ apps/backend/src/jobs/jobs.service.ts | 11 +++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index 944a8daf..ab99915c 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -1444,6 +1444,30 @@ describe("JobsService schedule rows", () => { expect(dealService.createDataSetWithPiece).not.toHaveBeenCalled(); }); + it("anon retrieval job is skipped at runtime when provider is blocked", async () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date("2024-01-01T12:00:00Z")); + + baseConfigValues.spBlocklists = { ids: new Set(["4"]), addresses: new Set() }; + + const anonRetrievalService = { performForProvider: vi.fn() }; + const walletSdkService = { + getProviderInfo: vi.fn(() => ({ id: 4n, name: "sp" })), + }; + + service = buildService({ + anonRetrievalService: anonRetrievalService as unknown as JobsServiceDeps[8], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + }); + + await callPrivate(service, "handleAnonRetrievalJob", { + id: "job-blocked-anon", + data: { spAddress: "0xaaa", intervalSeconds: 60 }, + }); + + expect(anonRetrievalService.performForProvider).not.toHaveBeenCalled(); + }); + it("SP jobs skip address-blocked providers before resolving missing provider context", async () => { vi.useFakeTimers(); 
vi.setSystemTime(new Date("2024-01-01T12:00:00Z")); diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index 3f3eb452..0b821613 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -653,7 +653,16 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { }, timeoutMs); await this.recordJobExecution("retrieval_anon", async () => { - const logContext = await this.resolveProviderJobContext(spAddress, job.id); + const logContext = await this.resolveRunnableProviderJobContext( + "retrieval_anon", + spAddress, + job.id, + "Anon retrieval job skipped: provider is blocked for scheduled retrieval checks", + ); + if (logContext == null) { + clearTimeout(timeoutId); + return "success"; + } try { await this.anonRetrievalService.performForProvider(spAddress, abortController.signal, logContext); return "success"; From d82bfaf6329c85a93a96eee27957b361aa940254 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 08:07:51 +0200 Subject: [PATCH 33/55] fix(retrieval-anon): gate job creation on subgraphEndpoint presence --- apps/backend/src/jobs/jobs.service.spec.ts | 33 +++++++++++++++++++++- apps/backend/src/jobs/jobs.service.ts | 18 ++++++++---- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index ab99915c..8127655b 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -123,7 +123,11 @@ describe("JobsService schedule rows", () => { baseConfigValues = { app: { runMode: "both" } as IConfig["app"], - blockchain: { useOnlyApprovedProviders: false, minNumDataSetsForChecks: 1 } as IConfig["blockchain"], + blockchain: { + useOnlyApprovedProviders: false, + minNumDataSetsForChecks: 1, + subgraphEndpoint: "https://example.com/subgraph", + } as IConfig["blockchain"], scheduling: { providersRefreshIntervalSeconds: 4 * 
3600, dataRetentionPollIntervalSeconds: 3600, @@ -633,6 +637,33 @@ describe("JobsService schedule rows", () => { ]); }); + it("skips retrieval_anon schedule when subgraph endpoint is not configured", async () => { + baseConfigValues = { + ...baseConfigValues, + blockchain: { ...baseConfigValues.blockchain, subgraphEndpoint: "" } as IConfig["blockchain"], + }; + configService = { + get: vi.fn((key: keyof IConfig) => baseConfigValues[key]), + } as unknown as JobsServiceDeps[0]; + + service = buildService({ configService }); + + const providerA = { address: "0xaaa" }; + storageProviderRepositoryMock.find.mockResolvedValueOnce([providerA]); + + await callPrivate(service, "ensureScheduleRows"); + + const upsertsForA = jobScheduleRepositoryMock.upsertSchedule.mock.calls.filter( + (call) => call[1] === providerA.address, + ); + expect(upsertsForA.map((call) => call[0]).sort()).toEqual([ + "data_set_creation", + "deal", + "piece_cleanup", + "retrieval", + ]); + }); + it("deletes schedule rows for providers no longer present", async () => { const providerA = { address: "0xaaa" }; storageProviderRepositoryMock.find.mockResolvedValueOnce([providerA]); diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index 0b821613..5cc40fe2 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -1043,6 +1043,10 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const minDataSets = this.configService.get("blockchain").minNumDataSetsForChecks; const cleanupStartAt = new Date(now.getTime() + phaseMs); + // Anon retrieval depends on the dealbot-owned subgraph. Without SUBGRAPH_ENDPOINT every + // job would fail in SubgraphService.sampleAnonPiece(), so gate schedule creation on it. 
+ const anonRetrievalEnabled = Boolean(this.configService.get("blockchain").subgraphEndpoint); + const spBlocklistsCfg = this.configService.get("spBlocklists"); const unblockedAddresses = providers .filter(({ address, providerId }) => !isSpBlocked(spBlocklistsCfg, address, providerId)) @@ -1059,12 +1063,14 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { for (const address of unblockedAddresses) { await this.jobScheduleRepository.upsertSchedule("deal", address, dealIntervalSeconds, dealStartAt); await this.jobScheduleRepository.upsertSchedule("retrieval", address, retrievalIntervalSeconds, retrievalStartAt); - await this.jobScheduleRepository.upsertSchedule( - "retrieval_anon", - address, - retrievalAnonIntervalSeconds, - retrievalAnonStartAt, - ); + if (anonRetrievalEnabled) { + await this.jobScheduleRepository.upsertSchedule( + "retrieval_anon", + address, + retrievalAnonIntervalSeconds, + retrievalAnonStartAt, + ); + } if (minDataSets >= 1) { await this.jobScheduleRepository.upsertSchedule( "data_set_creation", From 42f756867c57c1d42569946dfd87d09d367ed361 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 08:12:05 +0200 Subject: [PATCH 34/55] refactor: use IpniCheckStatus enum values for check metrics --- apps/backend/src/metrics-prometheus/check-metrics.service.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 76a8ee31..62d2d137 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -2,6 +2,7 @@ import { Injectable, Logger } from "@nestjs/common"; import { InjectMetric } from "@willsoto/nestjs-prometheus"; import type { Counter, Histogram } from "prom-client"; import type { Deal } from "../database/entities/deal.entity.js"; +import { IpniCheckStatus } from 
"../database/types.js"; import type { RetrievalExecutionResult } from "../retrieval-addons/types.js"; import { buildCheckMetricLabels, type CheckMetricLabels } from "./check-metric-labels.js"; @@ -303,7 +304,7 @@ export class AnonRetrievalCheckMetrics { this.carParseCounter.inc({ ...labels, value: parseable ? "parseable" : "not_parseable" }); } - recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { + recordIpniStatus(labels: CheckMetricLabels, value: IpniCheckStatus): void { this.ipniCounter.inc({ ...labels, value }); } From 19e3af3e1d12d461d009773e0b49002e6c33eef5 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 08:19:26 +0200 Subject: [PATCH 35/55] fix(retrieval-anon): skip downstream validation for invalid commp --- .../anon-retrieval.service.spec.ts | 43 +++++++++++++++++++ .../retrieval-anon/anon-retrieval.service.ts | 9 ++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index adc75920..519ec413 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -330,6 +330,49 @@ describe("AnonRetrievalService", () => { expect(row.car_parseable).toBeNull(); }); + it("skips CAR/IPNI/block-fetch when SP returns 2xx with wrong bytes (commPValid=false)", async () => { + // If commP doesn't match, downstream parsing/IPNI/block-fetch would be + // checking unrelated data and record meaningless failures under the wrong + // dimension. The overall status must surface as failure.commp. 
+ const wrongBytes: PieceRetrievalResult = { + success: true, + pieceCid: INDEXED_PIECE.pieceCid, + bytesReceived: 1024, + pieceBytes: Buffer.from("garbage-bytes"), + latencyMs: 200, + ttfbMs: 20, + throughputBps: 51200, + statusCode: 200, + commPValid: false, + }; + + const { + service, + insertSpy, + validateCarSpy, + metricsRecordStatusSpy, + metricsRecordIpniSpy, + metricsRecordBlockFetchSpy, + } = makeService({ + pieceResult: wrongBytes, + piece: INDEXED_PIECE, + }); + + await service.performForProvider(SP_ADDRESS); + + expect(validateCarSpy).not.toHaveBeenCalled(); + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "skipped"); + expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "skipped"); + expect(metricsRecordStatusSpy).toHaveBeenCalledWith(expect.anything(), "failure.commp"); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.commp_valid).toBe(false); + expect(row.car_parseable).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.block_fetch_valid).toBeNull(); + }); + it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { const carResult: CarValidationResult = { carParseable: false, diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index a74c2bf0..9aa9998f 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -87,9 +87,10 @@ export class AnonRetrievalService { this.metrics.observeThroughput(labels, pieceResult.throughputBps); this.metrics.recordHttpResponseCode(labels, pieceResult.statusCode); - // 3. CAR validation (only if piece was successfully retrieved and has IPFS indexing) + // 3. CAR validation (only if piece was successfully retrieved with matching commp and has IPFS indexing). 
if ( pieceResult.success && + pieceResult.commPValid && piece.withIPFSIndexing && piece.ipfsRootCid && pieceResult.pieceBytes && @@ -128,8 +129,10 @@ export class AnonRetrievalService { error: toStructuredError(error), }); } - } else if (!pieceResult.success) { - // Piece retrieval failed — IPNI and block fetch were skipped + } else if (!pieceResult.success || !pieceResult.commPValid) { + // Piece retrieval failed or SP returned bytes that don't match the requested + // commP — downstream validation was skipped because there is nothing + // trustworthy to validate. this.metrics.recordIpniStatus(labels, IpniCheckStatus.SKIPPED); this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.SKIPPED); } From 2ef91a1b190c5f0a4e5540b6b28479f9cb7a7002 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 08:39:21 +0200 Subject: [PATCH 36/55] refactor(retrieval-anon): avoid nested ternaries --- .../retrieval-anon/anon-retrieval.service.ts | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 9aa9998f..6f0953cf 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -139,17 +139,7 @@ export class AnonRetrievalService { // Overall check duration and status this.metrics.observeCheckDuration(labels, Date.now() - checkStart); - const pieceServedCorrectly = pieceResult.success && pieceResult.commPValid; - this.metrics.recordStatus( - labels, - pieceServedCorrectly - ? "success" - : pieceResult.aborted - ? "failure.timedout" - : pieceResult.success - ? 
"failure.commp" - : "failure.http", - ); + this.metrics.recordStatus(labels, anonPieceRetrievalStatus(pieceResult)); } finally { // Always emit a ClickHouse row — even on abort or unexpected error — so // we never lose the evidence (ttfb, bytes, response code) we already @@ -236,6 +226,13 @@ function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { return result.ipniValid ? IpniCheckStatus.VALID : IpniCheckStatus.INVALID; } +function anonPieceRetrievalStatus(pieceResult: PieceRetrievalResult): string { + if (pieceResult.success && pieceResult.commPValid) return "success"; + if (pieceResult.aborted) return "failure.timedout"; + if (pieceResult.success) return "failure.commp"; + return "failure.http"; +} + function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { const message = reason instanceof Error && reason.message ? reason.message : typeof reason === "string" ? reason : "aborted"; From 30ca84233cb9be00645eb0eed19600c859703ee6 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 08:56:49 +0200 Subject: [PATCH 37/55] refactor(retrieval-anon): wire abort signal to piece selection --- .../anon-piece-selector.service.spec.ts | 13 +++++++++ .../anon-piece-selector.service.ts | 12 ++++++-- .../retrieval-anon/anon-retrieval.service.ts | 2 +- apps/backend/src/subgraph/subgraph.service.ts | 28 +++++++++++++------ 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts index 30a04486..c5acef79 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -60,6 +60,19 @@ describe("AnonPieceSelectorService", () => { expect(result?.serviceProvider).toBe(SP_ADDRESS.toLowerCase()); }); + it("returns null without sampling when the signal is already aborted", async () => 
{ + sampleAnonPiece.mockResolvedValue(makePiece()); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + + const ac = new AbortController(); + ac.abort(new Error("Anon retrieval job timeout")); + + const result = await service.selectPieceForProvider(SP_ADDRESS, ac.signal); + + expect(result).toBeNull(); + expect(sampleAnonPiece).not.toHaveBeenCalled(); + }); + it("passes the dealbot payer address to sampleAnonPiece for exclusion", async () => { sampleAnonPiece.mockResolvedValueOnce(makePiece()); const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index d354a222..719f637f 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -64,7 +64,7 @@ export class AnonPieceSelectorService { * 5. If still empty, fall back through: (same bucket, opposite pool) → * (any bucket, indexed) → (any bucket, any). */ - async selectPieceForProvider(spAddress: string): Promise { + async selectPieceForProvider(spAddress: string, signal?: AbortSignal): Promise { const dealbotPayer = this.configService.get("blockchain", { infer: true }).walletAddress; const bucket = this.pickBucket(); @@ -78,11 +78,15 @@ export class AnonPieceSelectorService { ]; for (const attempt of attempts) { + if (signal?.aborted) { + return null; + } const piece = await this.drawPiece({ spAddress, dealbotPayer, bucket: attempt.bucket, pool: attempt.pool, + signal, }); if (piece) { @@ -127,10 +131,14 @@ export class AnonPieceSelectorService { dealbotPayer: string; bucket: SizeBucket | "any"; pool: AnonPiecePool; + signal?: AbortSignal; }): Promise { const range = args.bucket === "any" ? 
fullRange() : SIZE_BUCKETS[args.bucket]; for (let attempt = 0; attempt < 2; attempt++) { + if (args.signal?.aborted) { + return null; + } const params: SampleAnonPieceParams = { serviceProvider: args.spAddress, payer: args.dealbotPayer, @@ -140,7 +148,7 @@ export class AnonPieceSelectorService { pool: args.pool, }; - const piece = await this.subgraphService.sampleAnonPiece(params); + const piece = await this.subgraphService.sampleAnonPiece(params, args.signal); if (!piece) { continue; } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 6f0953cf..79f0eeb5 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -42,7 +42,7 @@ export class AnonRetrievalService { }); // 1. Select an anonymous piece - const piece = await this.anonPieceSelectorService.selectPieceForProvider(spAddress); + const piece = await this.anonPieceSelectorService.selectPieceForProvider(spAddress, signal); if (!piece) { this.logger.warn({ ...logContext, diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 3d4e8370..1f8cd0de 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -1,5 +1,6 @@ import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; +import { delay } from "../common/abort-utils.js"; import { toStructuredError } from "../common/logging.js"; import type { IBlockchainConfig, IConfig } from "../config/app.config.js"; import { buildSampleAnonPieceQuery, Queries } from "./queries.js"; @@ -127,7 +128,7 @@ export class SubgraphService { * `pdpPaymentEndEpoch` is returned to the caller for a cheap client-side * epoch comparison — GraphQL filters on nullable BigInts are awkward. 
*/ - async sampleAnonPiece(params: SampleAnonPieceParams): Promise { + async sampleAnonPiece(params: SampleAnonPieceParams, signal?: AbortSignal): Promise { if (!this.blockchainConfig.subgraphEndpoint) { // Surface misconfiguration distinctly so it does not look like an empty // candidate pool (which silently no-ops every anon retrieval job). @@ -152,6 +153,7 @@ export class SubgraphService { query, variables, validateSampleAnonPieceResponse, + signal, ); const root = validated.roots[0]; @@ -191,6 +193,7 @@ export class SubgraphService { query: string, variables: Record, transform: (data: unknown) => T, + signal?: AbortSignal, attempt: number = 1, ): Promise { if (!this.blockchainConfig.subgraphEndpoint) { @@ -198,12 +201,13 @@ export class SubgraphService { } try { - await this.enforceRateLimit(); + await this.enforceRateLimit(1, signal); const response = await fetch(this.blockchainConfig.subgraphEndpoint, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ query, variables }), + signal, }); if (!response.ok) { @@ -235,18 +239,24 @@ export class SubgraphService { throw error; } + // Aborted requests should surface immediately — retrying after the job + // budget has been spent only wastes the remaining MAX_RETRIES slots. + if (signal?.aborted || (error instanceof Error && error.name === "AbortError")) { + throw error; + } + if (attempt < SubgraphService.MAX_RETRIES) { - const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + const delayMs = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); this.logger.warn({ event: `subgraph_${operationName}_request_retry`, message: `Subgraph ${operationName} request failed. 
Retrying...`, attempt, maxRetries: SubgraphService.MAX_RETRIES, - retryDelayMs: delay, + retryDelayMs: delayMs, error: toStructuredError(error), }); - await new Promise((resolve) => setTimeout(resolve, delay)); - return this.executeQuery(operationName, query, variables, transform, attempt + 1); + await delay(delayMs, signal); + return this.executeQuery(operationName, query, variables, transform, signal, attempt + 1); } this.logger.error({ @@ -385,7 +395,7 @@ export class SubgraphService { * This rate limit is applied by Goldsky on their public endpoints * Read more here: https://docs.goldsky.com/subgraphs/graphql-endpoints#public-endpoints */ - private async enforceRateLimit(requestCount: number = 1): Promise { + private async enforceRateLimit(requestCount: number = 1, signal?: AbortSignal): Promise { if (requestCount > SubgraphService.MAX_CONCURRENT_REQUESTS) { throw new Error( `Cannot request ${requestCount} items; exceeds rate limit window of ${SubgraphService.MAX_CONCURRENT_REQUESTS}`, @@ -409,8 +419,8 @@ export class SubgraphService { const waitTime = oldestTimestamp + SubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; if (waitTime > 0) { - await new Promise((resolve) => setTimeout(resolve, waitTime)); - return this.enforceRateLimit(requestCount); + await delay(waitTime, signal); + return this.enforceRateLimit(requestCount, signal); } } From 33a3d99b0e2a72f96a3d6d8397f25a2a7a0e9468 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 09:13:02 +0200 Subject: [PATCH 38/55] refactor(retrieval-anon): wire abort signal to car validation --- apps/backend/src/retrieval-anon/car-validation.service.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index c3a6c717..bc14720c 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ 
-48,8 +48,11 @@ export class CarValidationService { ): Promise { let blocks: { cid: CID; bytes: Uint8Array }[]; try { - blocks = await this.parseCar(pieceBytes); + blocks = await this.parseCar(pieceBytes, signal); } catch (error) { + if (signal?.aborted || (error instanceof Error && error.name === "AbortError")) { + throw error; + } this.logger.debug({ event: "car_parse_failed", message: "Failed to parse piece bytes as CAR - client fault, not SP", @@ -102,10 +105,11 @@ export class CarValidationService { }; } - private async parseCar(pieceBytes: Buffer): Promise<{ cid: CID; bytes: Uint8Array }[]> { + private async parseCar(pieceBytes: Buffer, signal?: AbortSignal): Promise<{ cid: CID; bytes: Uint8Array }[]> { const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); const blocks: { cid: CID; bytes: Uint8Array }[] = []; for await (const block of reader.blocks()) { + signal?.throwIfAborted(); blocks.push({ cid: block.cid, bytes: block.bytes }); } return blocks; From dadd4f8166b5bcd547804a117d8473492ee5a581 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 09:13:46 +0200 Subject: [PATCH 39/55] fix(retrieval-anon): empty-string value fails Joi number coercion --- apps/backend/src/config/app.config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index 49b55606..2655ba0b 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -89,7 +89,7 @@ export const configValidationSchema = Joi.object({ DEALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(4), DATASET_CREATIONS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), RETRIEVALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(2), - RETRIEVALS_ANON_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).optional(), + RETRIEVALS_ANON_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).empty("").optional(), // Polling interval for pg-boss 
scheduler (lower = more responsive, higher = less DB chatter). JOB_SCHEDULER_POLL_SECONDS: Joi.number().min(60).default(300), JOB_WORKER_POLL_SECONDS: Joi.number().min(5).default(60), From 48ecadb21bb9eecf73ea2e7c8209a007ff4d4050 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 20 May 2026 11:05:48 +0200 Subject: [PATCH 40/55] fix(retrieval-anon): treat commP mismatch as a fetch failure --- .../anon-retrieval.service.spec.ts | 18 ++++++--- .../retrieval-anon/anon-retrieval.service.ts | 22 ++++++----- .../retrieval-anon/piece-retrieval.service.ts | 39 ++++++++++++++++++- apps/backend/src/retrieval-anon/types.ts | 1 + 4 files changed, 63 insertions(+), 17 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 519ec413..922e6ba2 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -127,6 +127,7 @@ describe("AnonRetrievalService", () => { ttfbMs: 150, throughputBps: 12500, statusCode: 200, + httpSuccess: false, commPValid: false, errorMessage: "Anon retrieval job timeout (60s) for sp1", aborted: true, @@ -177,6 +178,7 @@ describe("AnonRetrievalService", () => { ttfbMs: 0, throughputBps: 0, statusCode: 0, + httpSuccess: false, commPValid: false, }; @@ -203,6 +205,7 @@ describe("AnonRetrievalService", () => { ttfbMs: 0, throughputBps: 0, statusCode: 0, + httpSuccess: false, commPValid: false, }; @@ -237,6 +240,7 @@ describe("AnonRetrievalService", () => { ttfbMs: 20, throughputBps: 51200, statusCode: 200, + httpSuccess: true, commPValid: true, }; } @@ -331,11 +335,13 @@ describe("AnonRetrievalService", () => { }); it("skips CAR/IPNI/block-fetch when SP returns 2xx with wrong bytes (commPValid=false)", async () => { - // If commP doesn't match, downstream parsing/IPNI/block-fetch would be - // checking unrelated data and record meaningless failures under the wrong - 
// dimension. The overall status must surface as failure.commp. + // fetchPiece flips success=false on a commP mismatch (a 2xx response with + // the wrong bytes is a retrieval failure, not a success). Downstream + // parsing/IPNI/block-fetch must therefore be skipped, and the overall + // status must surface as failure.commp — distinguished from failure.http + // by the still-2xx statusCode. const wrongBytes: PieceRetrievalResult = { - success: true, + success: false, pieceCid: INDEXED_PIECE.pieceCid, bytesReceived: 1024, pieceBytes: Buffer.from("garbage-bytes"), @@ -343,7 +349,9 @@ describe("AnonRetrievalService", () => { ttfbMs: 20, throughputBps: 51200, statusCode: 200, + httpSuccess: true, commPValid: false, + errorMessage: `CommP mismatch: bytes do not match ${INDEXED_PIECE.pieceCid}`, }; const { @@ -366,7 +374,7 @@ describe("AnonRetrievalService", () => { expect(metricsRecordStatusSpy).toHaveBeenCalledWith(expect.anything(), "failure.commp"); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.commp_valid).toBe(false); expect(row.car_parseable).toBeNull(); expect(row.ipni_status).toBe("skipped"); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 79f0eeb5..d5ddd485 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -87,10 +87,11 @@ export class AnonRetrievalService { this.metrics.observeThroughput(labels, pieceResult.throughputBps); this.metrics.recordHttpResponseCode(labels, pieceResult.statusCode); - // 3. CAR validation (only if piece was successfully retrieved with matching commp and has IPFS indexing). + // 3. CAR validation (only if piece was successfully retrieved and has IPFS indexing). 
+ // `pieceResult.success` already encodes "HTTP 2xx AND commP matches" — fetchPiece + // flips success=false on a commP mismatch so we never parse mismatched bytes. if ( pieceResult.success && - pieceResult.commPValid && piece.withIPFSIndexing && piece.ipfsRootCid && pieceResult.pieceBytes && @@ -129,10 +130,9 @@ export class AnonRetrievalService { error: toStructuredError(error), }); } - } else if (!pieceResult.success || !pieceResult.commPValid) { - // Piece retrieval failed or SP returned bytes that don't match the requested - // commP — downstream validation was skipped because there is nothing - // trustworthy to validate. + } else if (!pieceResult.success) { + // Piece retrieval failed (HTTP error or commP mismatch) — downstream + // validation was skipped because there is nothing trustworthy to validate. this.metrics.recordIpniStatus(labels, IpniCheckStatus.SKIPPED); this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.SKIPPED); } @@ -177,7 +177,7 @@ export class AnonRetrievalService { last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, - commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, + commp_valid: !finalPieceResult.aborted && finalPieceResult.httpSuccess ? finalPieceResult.commPValid : null, car_parseable: carResult ? carResult.carParseable : null, car_block_count: carResult?.carParseable ? carResult?.blockCount : null, block_fetch_endpoint: carResult?.blockFetchEndpoint ?? 
null, @@ -227,10 +227,11 @@ function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { } function anonPieceRetrievalStatus(pieceResult: PieceRetrievalResult): string { - if (pieceResult.success && pieceResult.commPValid) return "success"; + if (pieceResult.success) return "success"; if (pieceResult.aborted) return "failure.timedout"; - if (pieceResult.success) return "failure.commp"; - return "failure.http"; + if (!pieceResult.httpSuccess) return "failure.http"; + if (!pieceResult.commPValid) return "failure.commp"; + return "failure.other"; } function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { @@ -245,6 +246,7 @@ function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrie ttfbMs: 0, throughputBps: 0, statusCode: 0, + httpSuccess: false, commPValid: false, errorMessage: message, aborted: true, diff --git a/apps/backend/src/retrieval-anon/piece-retrieval.service.ts b/apps/backend/src/retrieval-anon/piece-retrieval.service.ts index 51150661..805fe4a8 100644 --- a/apps/backend/src/retrieval-anon/piece-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/piece-retrieval.service.ts @@ -34,6 +34,7 @@ export class PieceRetrievalService { ttfbMs: 0, throughputBps: 0, statusCode: 0, + httpSuccess: false, commPValid: false, errorMessage: `Provider info not found for ${spAddress}`, }; @@ -49,7 +50,7 @@ export class PieceRetrievalService { }); const { metrics } = result; - const isSuccess = metrics.statusCode >= 200 && metrics.statusCode < 300; + const isHttpSuccess = metrics.statusCode >= 200 && metrics.statusCode < 300; const throughputBps = metrics.totalTime > 0 ? metrics.responseSize / (metrics.totalTime / 1000) : 0; if (result.aborted) { @@ -73,13 +74,14 @@ export class PieceRetrievalService { ttfbMs: metrics.ttfb, throughputBps, statusCode: metrics.statusCode, + httpSuccess: false, commPValid: false, errorMessage: result.abortReason ?? 
"aborted", aborted: true, }; } - if (!isSuccess) { + if (!isHttpSuccess) { this.logger.warn({ event: "piece_fetch_non_2xx", message: "Piece fetch returned non-2xx status", @@ -98,6 +100,7 @@ export class PieceRetrievalService { ttfbMs: metrics.ttfb, throughputBps, statusCode: metrics.statusCode, + httpSuccess: false, commPValid: false, errorMessage: `HTTP ${metrics.statusCode}`, }; @@ -106,6 +109,36 @@ export class PieceRetrievalService { const pieceBytes = Buffer.isBuffer(result.data) ? result.data : Buffer.from(result.data); const commPValid = await this.validateCommP(pieceBytes, pieceCid); + if (!commPValid) { + // A 2xx response with bytes that don't hash to the requested piece CID + // is a retrieval failure, not a success — downstream consumers must not + // treat it as a successfully-served piece. Don't propagate the wrong + // bytes either, so a misbehaving SP can't drag CAR parsing into the + // failure mode. + this.logger.warn({ + event: "piece_fetch_commp_mismatch", + message: "Piece fetched but bytes do not match requested piece CID", + url, + pieceCid, + spAddress, + bytesReceived: metrics.responseSize, + }); + + return { + success: false, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes: null, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + httpSuccess: isHttpSuccess, + commPValid: false, + errorMessage: `CommP mismatch: bytes do not match ${pieceCid}`, + }; + } + this.logger.debug({ event: "piece_fetch_success", message: "Piece fetched successfully", @@ -125,6 +158,7 @@ export class PieceRetrievalService { ttfbMs: metrics.ttfb, throughputBps, statusCode: metrics.statusCode, + httpSuccess: isHttpSuccess, commPValid, }; } catch (error) { @@ -148,6 +182,7 @@ export class PieceRetrievalService { ttfbMs: 0, throughputBps: 0, statusCode: 0, + httpSuccess: false, commPValid: false, errorMessage: error instanceof Error ? 
error.message : String(error), aborted, diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts index 9013a5ea..da3d64c0 100644 --- a/apps/backend/src/retrieval-anon/types.ts +++ b/apps/backend/src/retrieval-anon/types.ts @@ -19,6 +19,7 @@ export type PieceRetrievalResult = { ttfbMs: number; throughputBps: number; statusCode: number; + httpSuccess: boolean; commPValid: boolean; errorMessage?: string; aborted?: boolean; From 2850db821e2a340a28c4c3b0a9645fbc4ffb21d2 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 26 May 2026 09:16:56 +0200 Subject: [PATCH 41/55] docs: improve anon retrieval event model / timeline --- docs/checks/anon-retrievals.md | 54 +++++++++++++++++++--- docs/checks/events-and-metrics.md | 75 +++++++++++++++++++++++-------- 2 files changed, 104 insertions(+), 25 deletions(-) diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index c3b69610..d73a007b 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -88,13 +88,53 @@ Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car- ## What Gets Asserted -| # | Assertion | How It's Checked | Retries | Relevant Metric | Implemented?
| -|---|-----------|------------------|:---:|------------------|:---:| -| 1 | SP serves the piece | `GET /piece/{pieceCid}` returns HTTP 2xx | 0 | [`anonPieceRetrievalLastByteMs`](./events-and-metrics.md#anonPieceRetrievalLastByteMs) | Yes | -| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonPieceRetrievalStatus`](./events-and-metrics.md#anonPieceRetrievalStatus) | Yes | -| 3 | Bytes parse as a CAR (IPFS-indexed pieces only) | `@ipld/car` parses the response | 0 | [`anonCarParseStatus`](./events-and-metrics.md#anonCarParseStatus) | Yes | -| 4 | SP is advertised on IPNI for root + sampled CIDs | filecoinpin.contact returns provider records | polling until timeout | [`anonIpniStatus`](./events-and-metrics.md#anonIpniStatus) | Yes | -| 5 | Sampled blocks fetch + hash-verify | `/ipfs/{cid}?format=raw` for each sample | 0 | [`anonBlockFetchStatus`](./events-and-metrics.md#anonBlockFetchStatus) | Yes | +| # | Assertion | How It's Checked | Retries | Relevant Metric | +|---|-----------|------------------|:---:|------------------| +| 1 | SP serves the piece | `GET /piece/{pieceCid}` returns HTTP 2xx | 0 | [`anonPieceRetrievalLastByteMs`](./events-and-metrics.md#anonPieceRetrievalLastByteMs) | +| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonPieceRetrievalStatus`](./events-and-metrics.md#anonPieceRetrievalStatus) | +| 3 | Bytes parse as a CAR (IPFS-indexed pieces only) | `@ipld/car` parses the response | 0 | [`anonCarParseStatus`](./events-and-metrics.md#anonCarParseStatus) | +| 4 | SP is advertised on IPNI for root + sampled CIDs | filecoinpin.contact returns provider records | polling until timeout | [`anonIpniStatus`](./events-and-metrics.md#anonIpniStatus) | +| 5 | Sampled blocks fetch + hash-verify | `/ipfs/{cid}?format=raw` for each sample | 0 | [`anonBlockFetchStatus`](./events-and-metrics.md#anonBlockFetchStatus) | + +## Sub-status meanings + +Unlike a Data Storage deal, anonymous 
retrieval does **not** have a single rolled-up status. CAR / IPNI / block-fetch outcomes are recorded independently and do **not** flip the piece-fetch verdict. Each status metric below is emitted exactly once per check (except `anonPieceRetrievalStatus`, which is also emitted as `failure.no_piece` when piece selection itself fails). + +| anonPieceRetrievalStatus | Meaning | +|--------|---------| +| `success` | `GET /piece/{pieceCid}` returned HTTP 2xx **and** the response bytes hashed to the declared CommP. | +| `failure.http` | Piece fetch did not return HTTP 2xx, or the request failed at the transport layer (DNS, TLS, connection reset, etc.). | +| `failure.commp` | Piece fetch returned HTTP 2xx, but the response bytes hashed to a different CID than `pieceCid`. The bytes are discarded — downstream CAR / IPNI / block-fetch validation is skipped to avoid amplifying a misbehaving SP. | +| `failure.timedout` | The job-level `AbortSignal` fired (most often `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`). Partial timing/byte evidence is still persisted. | +| `failure.no_piece` | The subgraph returned no candidate piece for the SP after all selection fallbacks. No HTTP request was attempted. | +| `failure.other` | Catch-all for retrieval failures that do not match any of the categories above. | + +| anonCarParseStatus | Meaning | +|--------|---------| +| `parseable` | The fetched piece bytes were successfully parsed as a CAR by `@ipld/car`. | +| `not_parseable` | The fetched piece bytes could not be parsed as a CAR (malformed header, truncated content, unexpected encoding, or parser threw). | + +> Emitted only when piece fetch succeeded **and** the piece advertises IPFS indexing (`withIPFSIndexing = true` with a non-null `ipfsRootCid`). Skipped otherwise; no row value is recorded. 
+ +| anonIpniStatus | Meaning | +|--------|---------| +| `valid` | filecoinpin.contact returned the SP as a provider for the root CID **and** every sampled child CID within `IPNI_VERIFICATION_TIMEOUT_MS`. | +| `invalid` | IPNI was queried but at least one CID never resolved to the SP under test before the timeout (or the timeout fired with unresolved CIDs). | +| `skipped` | IPNI verification was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, or the SP is not registered with `WalletSdkService` so no IPNI sampling could run. | +| `error` | IPNI verification was attempted but the CAR-validation step threw before producing a result (e.g. invalid root CID, transport error, unexpected exception). | + +| anonBlockFetchStatus | Meaning | +|--------|---------| +| `valid` | Every sampled CID was fetched via `GET {spBaseUrl}/ipfs/{cid}?format=raw` and the response bytes hash-verified against the declared CID. | +| `invalid` | At least one sampled block fetch failed: non-2xx HTTP, hash mismatch, unsupported codec, unsupported hash, or transport error. Each failed sample counts as one failed block. | +| `skipped` | Block-fetch sampling was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, or CAR parsing produced no sampleable CIDs. | +| `error` | Block-fetch sampling was attempted but the CAR-validation step threw before completing (e.g. CAR parser threw, unexpected exception). 
| + +Sources: +- [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) — emits the four status metrics +- [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) — classifies piece-fetch outcomes +- [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) — produces CAR / IPNI / block-fetch outcomes +- [`types.ts` (`IpniCheckStatus`)](../../apps/backend/src/database/types.ts) — enum source of truth ## Result Recording diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 65c735a4..6f91bd03 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -6,16 +6,6 @@ This document is the intended **source of truth** for the events emitted by deal ## Data Storage Event Model -The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per piece: select → fetch piece → (optional) parse CAR + IPNI + block fetch → write one ClickHouse row. - -It is not modeled as a sequence of named lifecycle events. Instead it emits: - -- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonPieceRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends. -- **One row per attempt** in the `anon_retrieval_checks` [ClickHouse table](#clickhouse-tables), emitted even on abort or unexpected error. -- **Structured log lines** (`anon_retrieval_started`, `anon_retrieval_completed`, `anon_retrieval_no_piece`, `anon_retrieval_car_validation_failed`, `anon_retrieval_clickhouse_insert_failed`) carrying a `retrievalId` so each row can be joined back to log evidence. - -## Data Storage Event Model - Below are the sequence of events for a [Data Storage check](./data-storage.md). 
The Data Storage flow is used because it encapsulates a [Retrieval check](./retrievals.md) as well. ### Data Storage Event Timeline @@ -65,6 +55,55 @@ sequenceDiagram | `ipfsRetrievalLastByteReceived` | Last byte received from `/ipfs/{rootCid}`. | Data Storage, Retrieval | [`retrieval-addons.service.ts`](../../apps/backend/src/retrieval-addons/retrieval-addons.service.ts) (drives `ipfsRetrievalLastByteMs`) | | `ipfsRetrievalIntegrityChecked` | Retrieved content matches expected CID (per-block sha256 hash verification via `createBlock`). Inline check at end of DAG traversal; no discrete event emission. | Data Storage, Retrieval | [`ipfs-block.strategy.ts`](../../apps/backend/src/retrieval-addons/strategies/ipfs-block.strategy.ts) | +## Anon Retrieval Check Event Model + +Below are the events for an [Anonymous Retrieval check](./anon-retrievals.md). Unlike the Data Storage flow, anonymous retrieval is a single-shot per-piece flow: select a publicly-discoverable piece from the FWSS subgraph → fetch the piece from the SP → optionally parse the CAR, verify IPNI, and sample block fetches → write one row to ClickHouse. The check emits one row to `anon_retrieval_checks` even on abort or unexpected error, so partial timing/byte evidence is never lost. 
+ +### Anon Retrieval Check Event Timeline + +```mermaid +sequenceDiagram + autonumber + participant Dealbot + participant Subgraph as Subgraph + participant SP as Storage Provider + participant IPNI as filecoinpin.contact + + Dealbot->>Subgraph: anonPieceSelectionStart + Subgraph-->>Dealbot: anonPieceSelected + Dealbot->>SP: anonPieceFetchStart (GET /piece/{pieceCid}) + SP-->>Dealbot: anonPieceFetchFirstByteReceived + SP-->>Dealbot: anonPieceFetchLastByteReceived + Dealbot-->>Dealbot: anonCommPVerified + + opt if piece advertises IPFS indexing + Dealbot-->>Dealbot: anonCarParsed + Dealbot->>IPNI: anonIpniVerificationStart + IPNI-->>Dealbot: anonIpniVerificationComplete + Dealbot->>SP: anonBlockFetchStart (GET /ipfs/{cid}?format=raw, sampled) + SP-->>Dealbot: anonBlockFetchComplete + end + + Dealbot-->>Dealbot: anonRetrievalCheckComplete +``` + +### Anon Retrieval Check Event List + +| Event | Definition | Source of truth | +|------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| `anonPieceSelectionStart` | Dealbot begins selecting an anonymous piece for the SP under test from Dealbot's subgraph (size-bucket + indexed/any pool sampling with fallbacks). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | +| `anonPieceSelected` | Subgraph returned a candidate piece (or all fallbacks were exhausted and the check is recorded as `failure.no_piece`). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | +| `anonPieceFetchStart` | Dealbot initiates `GET {spBaseUrl}/piece/{pieceCid}`. 
| [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (logs `anon_retrieval_started`) | +| `anonPieceFetchFirstByteReceived` | First byte received from `/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalFirstByteMs`) | +| `anonPieceFetchLastByteReceived` | Last byte received from `/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalLastByteMs`) | +| `anonCommPVerified` | Response bytes hashed and the resulting CommP compared against the declared `pieceCid`. Inline check; no discrete event emission. Failure flips piece fetch to `failure.commp`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) | +| `anonCarParsed` | Fetched piece bytes are parsed as a CAR and a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is selected. Only runs when the piece advertises IPFS indexing. | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | +| `anonIpniVerificationStart` | Dealbot begins polling filecoinpin.contact for provider records of the root CID and the sampled child CIDs. | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | +| `anonIpniVerificationComplete` | IPNI verification finishes (all CIDs resolved to the SP, `IPNI_VERIFICATION_TIMEOUT_MS` reached, or error). | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) (drives `ipni_verify_ms`) | +| `anonBlockFetchStart` | Dealbot starts fetching the sampled CIDs via `GET {spBaseUrl}/ipfs/{cid}?format=raw`.
| [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | +| `anonBlockFetchComplete` | All sampled block fetches finished; each response was hash-verified against its declared CID (any non-2xx, hash mismatch, unsupported codec, or transport error counts as a failed block). | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | +| `anonRetrievalCheckComplete` | Anonymous retrieval check terminates — successful piece fetch (plus optional CAR/IPNI/block-fetch validations) or any failure / abort. Drives the `anon_retrieval_checks` ClickHouse row and `anonRetrievalCheckMs`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) (logs `anon_retrieval_completed`) | + ## Metrics * Many of the metrics below are derived from the [events above](#event-list). @@ -97,10 +136,10 @@ sequenceDiagram | `dataStorageCheckMs` | Data Storage | [`uploadToSpStart`](#uploadToSpStart) | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Data Storage check | | | `retrievalCheckMs` | Retrieval | Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | -| `anonPieceRetrievalFirstByteMs` | Anonymous Retrieval | Piece fetch start | First byte received from `/piece/{pieceCid}` | Time to first byte for anonymous piece retrievals | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonPieceRetrievalLastByteMs` | Anonymous Retrieval | Piece fetch start | Last byte received from `/piece/{pieceCid}` | Total time to retrieve an anonymous piece | 
[`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalFirstByteMs` | Anonymous Retrieval | [`anonPieceFetchStart`](#anonPieceFetchStart) | [`anonPieceFetchFirstByteReceived`](#anonPieceFetchFirstByteReceived) | Time to first byte for anonymous piece retrievals | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalLastByteMs` | Anonymous Retrieval | [`anonPieceFetchStart`](#anonPieceFetchStart) | [`anonPieceFetchLastByteReceived`](#anonPieceFetchLastByteReceived) | Total time to retrieve an anonymous piece | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceRetrievalThroughputBps` | Anonymous Retrieval | n/a | n/a | `(bytesRetrieved / anonPieceRetrievalLastByteMs) * 1000` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonRetrievalCheckMs` | Anonymous Retrieval | Anon retrieval check start | After CAR/IPNI/block-fetch validation completes (or on abort) | End-to-end anonymous retrieval check duration | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonRetrievalCheckMs` | Anonymous Retrieval | [`anonPieceSelected`](#anonPieceSelected) | [`anonRetrievalCheckComplete`](#anonRetrievalCheckComplete) | End-to-end anonymous retrieval check duration (excludes piece selection; includes CAR/IPNI/block-fetch validation when applicable). Emitted even on abort. 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ### Status Count Related Metrics @@ -120,11 +159,11 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.timedout`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | Same as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonBlockFetchStatus` | Anonymous Retrieval | After block-fetch sampling runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalStatus` | Anonymous Retrieval | [`anonRetrievalCheckComplete`](#anonRetrievalCheckComplete) | `success`, `failure.http`, `failure.commp`, `failure.timedout`, `failure.no_piece`, `failure.other` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceHttpResponseCode` | Anonymous Retrieval | [`anonPieceFetchLastByteReceived`](#anonPieceFetchLastByteReceived) | Same as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonCarParseStatus` | Anonymous Retrieval | [`anonCarParsed`](#anonCarParsed) (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonIpniStatus` | Anonymous Retrieval | [`anonIpniVerificationComplete`](#anonIpniVerificationComplete), **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonBlockFetchStatus` | Anonymous Retrieval | [`anonBlockFetchComplete`](#anonBlockFetchComplete), **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ## ClickHouse Tables From c325038fa326a555b7911babb3864009ec73ed08 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 26 May 2026 11:24:30 +0200 Subject: [PATCH 42/55] change(retrieval-anon): maximum size bucket to 100MiB --- apps/backend/src/config/app.config.ts | 2 +- .../src/retrieval-anon/anon-piece-selector.service.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index 2655ba0b..4c62aa0b 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -281,7 +281,7 @@ export interface IJobsConfig { /** * Maximum runtime (seconds) for anonymous retrieval jobs before forced abort. * - * Anonymous retrievals fetch arbitrary pieces (up to ~500 MiB), so this is + * Anonymous retrievals fetch arbitrary pieces (up to 100 MiB), so this is * typically larger than `retrievalJobTimeoutSeconds`. Uses AbortController * to actively cancel job execution while still persisting partial metrics. 
*/ diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 719f637f..3f8009eb 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -20,9 +20,9 @@ const MIB = 1024n * 1024n; // All downloads are buffered in-memory, so we need to keep piece sizes reasonable // When changing these values, also update ./docs/checks/anon-retrievals.md#piece-selection const SIZE_BUCKETS: Record = { - small: { min: 1n * MIB, max: 20n * MIB - 1n }, - medium: { min: 20n * MIB, max: 100n * MIB - 1n }, - large: { min: 100n * MIB, max: 500n * MIB - 1n }, + small: { min: 1n * MIB, max: 10n * MIB - 1n }, + medium: { min: 10n * MIB, max: 50n * MIB - 1n }, + large: { min: 50n * MIB, max: 100n * MIB - 1n }, }; // Weights for choosing a bucket per selection. Must sum to 1. From d098e37ddd07b39bb8920d1371bb2a8e34973b2f Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 10:27:46 +0200 Subject: [PATCH 43/55] fix(retrieval-anon): wrap-around sampleKey subgraph query --- .../anon-piece-selector.service.ts | 18 ++-- apps/backend/src/subgraph/queries.ts | 19 ++-- .../src/subgraph/subgraph.service.spec.ts | 41 +++++++-- apps/backend/src/subgraph/subgraph.service.ts | 89 +++++++++++-------- 4 files changed, 111 insertions(+), 56 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 3f8009eb..2b272c90 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -58,9 +58,10 @@ export class AnonPieceSelectorService { * 1. Pick a size bucket by weighted random. * 2. Pick a pool (`indexed` 80% / `any` 20%). * 3. 
Generate a uniform-random sampleKey and query the subgraph for the - * smallest `Root.sampleKey ≥ $sampleKey` matching the filters. - * 4. Drop the pick if `pdpPaymentEndEpoch` has passed or it was tested - * recently; redraw once. + * piece closest to that key. `sampleAnonPiece` handles the wrap-around + * dead zone internally via a reverse-direction fallback. + * 4. Drop the pick if `pdpPaymentEndEpoch` has passed; redraw once with a + * fresh sampleKey. * 5. If still empty, fall back through: (same bucket, opposite pool) → * (any bucket, indexed) → (any bucket, any). */ @@ -123,8 +124,15 @@ export class AnonPieceSelectorService { } /** - * Try to draw a piece for one (bucket, pool) combination. Up to two draws - * with fresh sampleKeys, each filtered by dedup + epoch-termination. + * Try to draw a piece for one (bucket, pool) combination. Up to two + * draws with fresh sampleKeys so we can retry past a piece whose + * `pdpPaymentEndEpoch` has already terminated. Boundary handling + * (random key above all matching sampleKeys) lives inside + * `sampleAnonPiece`, so the retry here is solely for epoch-termination. + * + * Change this logic when https://github.com/FilOzone/dealbot/issues/579 has + * landed. Then we don't need to retry because sampleAnonPiece can directly + * query for pieces that have not already terminated. */ private async drawPiece(args: { spAddress: string; diff --git a/apps/backend/src/subgraph/queries.ts b/apps/backend/src/subgraph/queries.ts index 74802ddf..14254467 100644 --- a/apps/backend/src/subgraph/queries.ts +++ b/apps/backend/src/subgraph/queries.ts @@ -24,13 +24,18 @@ export const Queries = { } as const; /** - * Build a sampleAnonPiece query scoped to the requested pool. The single - * piece of query shape that differs is whether the proofSet filter pins - * `withIPFSIndexing: true`; assembling the fragment here keeps the rest - * of the query and the returned selection set shared. 
+ * Build a sampleAnonPiece query scoped to the requested pool. The query + * shape varies in two ways: whether the proofSet filter pins + * `withIPFSIndexing: true`, and whether sampleKey is searched forward + * (`_gte` + asc — smallest key at or above the target) or backward + * (`_lt` + desc — largest key below the target). Filter direction and + * sort direction move together so both modes return the piece closest + * to the target sampleKey. */ -export function buildSampleAnonPieceQuery(pool: "indexed" | "any"): string { +export function buildSampleAnonPieceQuery(pool: "indexed" | "any", reverse: boolean = false): string { const indexingFilter = pool === "indexed" ? "withIPFSIndexing: true" : ""; + const sampleKeyFilter = reverse ? "sampleKey_lt" : "sampleKey_gte"; + const orderDirection = reverse ? "desc" : "asc"; return ` query SampleAnonPiece( $serviceProvider: Bytes! @@ -47,9 +52,9 @@ export function buildSampleAnonPieceQuery(pool: "indexed" | "any"): string { roots( first: 1 orderBy: sampleKey - orderDirection: asc + orderDirection: ${orderDirection} where: { - sampleKey_gte: $sampleKey + ${sampleKeyFilter}: $sampleKey removed: false rawSize_gte: $minSize rawSize_lte: $maxSize diff --git a/apps/backend/src/subgraph/subgraph.service.spec.ts b/apps/backend/src/subgraph/subgraph.service.spec.ts index 64f28435..0a176c73 100644 --- a/apps/backend/src/subgraph/subgraph.service.spec.ts +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -745,14 +745,43 @@ describe("SubgraphService", () => { expect(fetchMock).not.toHaveBeenCalled(); }); - it("returns null when the subgraph yields no matching root", async () => { - fetchMock.mockResolvedValueOnce({ + it("returns null when neither directional query yields a matching root", async () => { + fetchMock.mockResolvedValue({ ok: true, json: async () => makeSampleResponse([]), }); const piece = await service.sampleAnonPiece(defaultSampleParams); expect(piece).toBeNull(); + // Forward then reverse — confirms the 
wrap-around fallback fires when forward is empty. + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("falls back to the reverse-direction query when the forward query is empty", async () => { + fetchMock + .mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }) + .mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([makeSampleRoot()]) }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + + expect(piece).not.toBeNull(); + expect(fetchMock).toHaveBeenCalledTimes(2); + const forwardQuery = JSON.parse(fetchMock.mock.calls[0][1].body as string).query as string; + const reverseQuery = JSON.parse(fetchMock.mock.calls[1][1].body as string).query as string; + expect(forwardQuery).toContain("sampleKey_gte"); + expect(forwardQuery).toContain("orderDirection: asc"); + expect(reverseQuery).toContain("sampleKey_lt"); + expect(reverseQuery).toContain("orderDirection: desc"); + }); + + it("skips the reverse query when the forward query already returned a root", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([makeSampleRoot()]), + }); + + await service.sampleAnonPiece(defaultSampleParams); + expect(fetchMock).toHaveBeenCalledTimes(1); }); it("parses the sampled root into a decoded candidate piece", async () => { @@ -796,7 +825,7 @@ describe("SubgraphService", () => { }); it("lowercases SP and payer addresses before querying", async () => { - fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + fetchMock.mockResolvedValue({ ok: true, json: async () => makeSampleResponse([]) }); await service.sampleAnonPiece(defaultSampleParams); @@ -808,7 +837,7 @@ describe("SubgraphService", () => { }); it("uses the any-pool query when pool is 'any'", async () => { - fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + fetchMock.mockResolvedValue({ ok: true, json: async () => makeSampleResponse([]) 
}); await service.sampleAnonPiece({ ...defaultSampleParams, pool: "any" }); @@ -834,7 +863,9 @@ describe("SubgraphService", () => { promise.catch(() => {}); await vi.runAllTimersAsync(); - await expect(promise).rejects.toThrow("Failed to fetch subgraph sample_anon_piece_indexed after 3 attempts"); + await expect(promise).rejects.toThrow( + "Failed to fetch subgraph sample_anon_piece_indexed_forward after 3 attempts", + ); expect(fetchMock).toHaveBeenCalledTimes(3); }); diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 1f8cd0de..0e2de339 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -118,15 +118,22 @@ export class SubgraphService { /** * Draw a single random anonymous piece for retrieval testing. * - * Uses the Root.sampleKey (keccak256 of the entity id) to pick the - * smallest key ≥ `params.sampleKey` that matches the filters — a uniform - * random pick when `sampleKey` is generated uniformly. Server-side filters - * cover SP, payer-exclusion, active status, size range, and optionally - * `withIPFSIndexing`. Returns null when no piece matches (callers should - * retry with a fresh sampleKey or relax the pool/bucket). + * Uses the Root.sampleKey (keccak256 of the entity id) to pick the piece + * closest to `params.sampleKey`. Runs the forward query first + * (`sampleKey_gte`, asc — smallest key at or above the target) and falls + * back to the reverse query (`sampleKey_lt`, desc — largest key below the + * target) only when the forward query returns nothing. The fallback covers + * the wrap-around dead zone where the random key happens to exceed every + * matching sampleKey; without it those draws would waste a fresh + * sampleKey roundtrip in the caller. + * + * Server-side filters cover SP, payer-exclusion, active status, size + * range, and optionally `withIPFSIndexing`. 
Returns null only when no + * piece in either direction matches the filters. * * `pdpPaymentEndEpoch` is returned to the caller for a cheap client-side * epoch comparison — GraphQL filters on nullable BigInts are awkward. + * However this will be changed in the context of https://github.com/FilOzone/dealbot/issues/579. */ async sampleAnonPiece(params: SampleAnonPieceParams, signal?: AbortSignal): Promise { if (!this.blockchainConfig.subgraphEndpoint) { @@ -139,7 +146,6 @@ export class SubgraphService { throw new Error("No subgraph endpoint configured"); } - const query = buildSampleAnonPieceQuery(params.pool); const variables = { serviceProvider: params.serviceProvider.toLowerCase(), payer: params.payer.toLowerCase(), @@ -148,40 +154,45 @@ export class SubgraphService { maxSize: params.maxSize, }; - const validated = await this.executeQuery( - `sample_anon_piece_${params.pool}`, - query, - variables, - validateSampleAnonPieceResponse, - signal, - ); - - const root = validated.roots[0]; - if (!root) { - return null; - } + for (const reverse of [false, true]) { + const validated = await this.executeQuery( + `sample_anon_piece_${params.pool}_${reverse ? "reverse" : "forward"}`, + buildSampleAnonPieceQuery(params.pool, reverse), + variables, + validateSampleAnonPieceResponse, + signal, + ); - try { - return { - pieceCid: decodePieceCid(root.cid), - pieceId: root.rootId, - dataSetId: root.proofSet.setId, - rawSize: root.rawSize, - withIPFSIndexing: root.proofSet.withIPFSIndexing, - ipfsRootCid: root.ipfsRootCID ?? null, - indexedAtBlock: validated._meta.block.number, - pdpPaymentEndEpoch: root.proofSet.pdpPaymentEndEpoch != null ? 
BigInt(root.proofSet.pdpPaymentEndEpoch) : null, - }; - } catch (error) { - this.logger.warn({ - event: "anon_piece_cid_decode_failed", - message: "Failed to decode piece CID from subgraph data", - dataSetId: root.proofSet.setId, - pieceId: root.rootId, - error: toStructuredError(error), - }); - return null; + const root = validated.roots[0]; + if (!root) { + continue; + } + + try { + return { + pieceCid: decodePieceCid(root.cid), + pieceId: root.rootId, + dataSetId: root.proofSet.setId, + rawSize: root.rawSize, + withIPFSIndexing: root.proofSet.withIPFSIndexing, + ipfsRootCid: root.ipfsRootCID ?? null, + indexedAtBlock: validated._meta.block.number, + pdpPaymentEndEpoch: + root.proofSet.pdpPaymentEndEpoch != null ? BigInt(root.proofSet.pdpPaymentEndEpoch) : null, + }; + } catch (error) { + this.logger.warn({ + event: "anon_piece_cid_decode_failed", + message: "Failed to decode piece CID from subgraph data", + dataSetId: root.proofSet.setId, + pieceId: root.rootId, + error: toStructuredError(error), + }); + return null; + } } + + return null; } /** From 70686e188118ea40d21b785a5d7698414eea1da4 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 10:28:16 +0200 Subject: [PATCH 44/55] fix(retrieval-anon): handle aborted signal in car response --- .../src/retrieval-anon/car-validation.service.ts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index bc14720c..f8d08a76 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -193,6 +193,18 @@ export class CarValidationService { signal, }); + if (resp.aborted) { + failedCount += 1; + this.logger.warn({ + event: "block_fetch_aborted", + message: "Block fetch was aborted", + cid: cidStr, + spAddress, + abortReason: resp.abortReason, + }); + continue; + } + if (resp.metrics.statusCode < 200 || 
resp.metrics.statusCode >= 300) { failedCount += 1; this.logger.warn({ From 007cfd7d694d7f4ac2032f913c2b685a2532c778 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 10:49:19 +0200 Subject: [PATCH 45/55] fix(retrieval-anon): correctly interpret operator-driven cancellation --- .../anon-retrieval.service.spec.ts | 26 +++++++++++++ .../retrieval-anon/anon-retrieval.service.ts | 38 +++++++++++++------ 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 922e6ba2..8d437e26 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -310,6 +310,32 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBe(2); }); + it("does not record SP-fault metrics when CAR validation is interrupted by signal abort", async () => { + // An operator-driven abort (job timeout, shutdown) that interrupts + // validateCarPiece must not show up as carParseable=false + + // ipni/blockFetch=ERROR — that would misattribute our cancellation to + // the SP and pollute scoreboards. 
+ const ac = new AbortController(); + const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + validateCarImpl: async () => { + ac.abort(new Error("Anon retrieval job timeout")); + throw Object.assign(new Error("aborted"), { name: "AbortError" }); + }, + }); + + await service.performForProvider(SP_ADDRESS, ac.signal); + + expect(metricsRecordIpniSpy).not.toHaveBeenCalledWith(expect.anything(), "error"); + expect(metricsRecordBlockFetchSpy).not.toHaveBeenCalledWith(expect.anything(), "error"); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.ipni_status).toBe("skipped"); + expect(row.car_parseable).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + }); + it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { // Distinguishes a real infra outage (e.g. IpniVerificationService down) // from a piece that legitimately had no IPFS indexing. Without the diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index d5ddd485..2f062def 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -117,18 +117,32 @@ export class AnonRetrievalService { : IpniCheckStatus.INVALID, ); } catch (error) { - // Validation was attempted on a successful piece retrieval but threw. 
- this.metrics.recordCarParseStatus(labels, false); - this.metrics.recordIpniStatus(labels, IpniCheckStatus.ERROR); - this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.ERROR); - this.logger.warn({ - ...logContext, - event: "anon_retrieval_car_validation_failed", - message: "CAR validation threw an error", - pieceCid: piece.pieceCid, - spAddress, - error: toStructuredError(error), - }); + if (signal?.aborted) { + // Operator-driven cancellation, not an SP fault. Suppress the + // SP-fault metrics and downgrade the downstream ClickHouse status + // so we don't pollute SP scoreboards with our own aborts. + validatedCarPiece = false; + this.logger.warn({ + ...logContext, + event: "anon_retrieval_car_validation_aborted", + message: "CAR validation aborted before completion", + pieceCid: piece.pieceCid, + spAddress, + }); + } else { + // Validation was attempted on a successful piece retrieval but threw. + this.metrics.recordCarParseStatus(labels, false); + this.metrics.recordIpniStatus(labels, IpniCheckStatus.ERROR); + this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.ERROR); + this.logger.warn({ + ...logContext, + event: "anon_retrieval_car_validation_failed", + message: "CAR validation threw an error", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + } } } else if (!pieceResult.success) { // Piece retrieval failed (HTTP error or commP mismatch) — downstream From 9611e6c4f45a4978e9de78c8efb9cb150ef61bf8 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 10:49:43 +0200 Subject: [PATCH 46/55] docs(anon-retrievals): incorporate pr feedback --- docs/checks/anon-retrievals.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index d73a007b..08211211 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -36,8 +36,8 @@ Selection strategy (per scheduled job, per SP): 1. 
**Pick a size bucket** by weighted random: - `small` (1–20 MiB) — 20% - - `medium` (20–100 MiB) — 50% - - `large` (100–500 MiB) — 30% + - `medium` (20–50 MiB) — 50% + - `large` (50–100 MiB) — 30% 2. **Pick a pool**: - `indexed` (IPFS-indexed pieces) — 80% - `any` (all FWSS pieces) — 20% @@ -98,7 +98,7 @@ Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car- ## Sub-status meanings -Unlike a Data Storage deal, anonymous retrieval does **not** have a single rolled-up status. CAR / IPNI / block-fetch outcomes are recorded independently and do **not** flip the piece-fetch verdict. Each status metric below is emitted exactly once per check (except `anonPieceRetrievalStatus`, which is also emitted as `failure.no_piece` when piece selection itself fails). +Unlike the [Data Storage check](./data-storage.md#deal-status-progression), anonymous retrieval does **not** have a rolled-up status (e.g., `anonRetrievalStatus`). Piece retrieval, CAR parsing, IPNI verification, and block-fetch outcomes are recorded independently. Each status metric below is emitted exactly once per check, except when `anonPieceRetrievalStatus=failure.no_piece` because selection itself fails.
| anonPieceRetrievalStatus | Meaning | |--------|---------| From aa19374494cd3a56c9769bc733a7df62cd36cc8c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 12:26:53 +0200 Subject: [PATCH 47/55] refactor(anon-retrieval): decouple CAR parse / IPNI verify / Block Fetch --- .../src/clickhouse/clickhouse.schema.ts | 6 +- apps/backend/src/database/types.ts | 13 + .../check-metrics.service.ts | 10 +- .../anon-retrieval.service.spec.ts | 320 +++++++++++------- .../retrieval-anon/anon-retrieval.service.ts | 156 +++++---- .../retrieval-anon/car-validation.service.ts | 259 -------------- .../piece-validation.service.ts | 281 +++++++++++++++ .../retrieval-anon/retrieval-anon.module.ts | 4 +- apps/backend/src/retrieval-anon/types.ts | 44 ++- docs/checks/anon-retrievals.md | 53 +-- docs/checks/events-and-metrics.md | 30 +- docs/environment-variables.md | 2 +- 12 files changed, 665 insertions(+), 513 deletions(-) delete mode 100644 apps/backend/src/retrieval-anon/car-validation.service.ts create mode 100644 apps/backend/src/retrieval-anon/piece-validation.service.ts diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index b27ba0e2..7fab8022 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -90,10 +90,10 @@ export function buildMigrations(database: string): string[] { throughput_bps Nullable(UInt64), -- effective throughput, bytes per second commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed - car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR - car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable + car_status LowCardinality(String), -- 'parseable' | 'not_parseable' | 'skipped' | 'error' — mirrors anonCarParseStatus; skipped when piece fetch 
failed, piece is not IPFS-indexed, or the job aborted before parsing + car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or not parseable block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. {spBaseUrl}/ipfs/); null when skipped - block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified + block_fetch_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' — mirrors anonBlockFetchStatus; skipped when CAR validation didn't run or SP info missing block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index c56b355a..58465abf 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -35,6 +35,19 @@ export enum IpniCheckStatus { ERROR = "error", } +export enum CarParseStatus { + PARSEABLE = "parseable", + NOT_PARSEABLE = "not_parseable", + SKIPPED = "skipped", +} + +export enum BlockFetchStatus { + SUCCESS = "success", + FAILURE = "failure", + SKIPPED = "skipped", + ERROR = "error", +} + /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 62d2d137..35961a55 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -2,7 +2,7 @@ import { Injectable, Logger } from "@nestjs/common"; import { InjectMetric } from "@willsoto/nestjs-prometheus"; import type { Counter, Histogram } from "prom-client"; import type { Deal } from "../database/entities/deal.entity.js"; -import { 
IpniCheckStatus } from "../database/types.js"; +import { BlockFetchStatus, CarParseStatus, IpniCheckStatus } from "../database/types.js"; import type { RetrievalExecutionResult } from "../retrieval-addons/types.js"; import { buildCheckMetricLabels, type CheckMetricLabels } from "./check-metric-labels.js"; @@ -289,7 +289,7 @@ export class AnonRetrievalCheckMetrics { observePositive(this.checkMs, labels, value); } - recordStatus(labels: CheckMetricLabels, value: string): void { + recordPieceRetrievalStatus(labels: CheckMetricLabels, value: string): void { this.statusCounter.inc({ ...labels, value }); } @@ -300,15 +300,15 @@ export class AnonRetrievalCheckMetrics { }); } - recordCarParseStatus(labels: CheckMetricLabels, parseable: boolean): void { - this.carParseCounter.inc({ ...labels, value: parseable ? "parseable" : "not_parseable" }); + recordCarParseStatus(labels: CheckMetricLabels, value: CarParseStatus): void { + this.carParseCounter.inc({ ...labels, value }); } recordIpniStatus(labels: CheckMetricLabels, value: IpniCheckStatus): void { this.ipniCounter.inc({ ...labels, value }); } - recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { + recordBlockFetchStatus(labels: CheckMetricLabels, value: BlockFetchStatus): void { this.blockFetchCounter.inc({ ...labels, value }); } } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 8d437e26..eca9da56 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -2,14 +2,21 @@ import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { RetrievalStatus } from 
"../database/types.js"; +import { BlockFetchStatus, CarParseStatus, IpniCheckStatus, RetrievalStatus } from "../database/types.js"; import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; import { AnonRetrievalService } from "./anon-retrieval.service.js"; -import type { CarValidationService } from "./car-validation.service.js"; import type { PieceRetrievalService } from "./piece-retrieval.service.js"; -import type { AnonPiece, CarValidationResult, PieceRetrievalResult } from "./types.js"; +import type { PieceValidationService } from "./piece-validation.service.js"; +import type { + AnonPiece, + BlockFetchOutcome, + CarParseOutcome, + IpniCheckOutcome, + PieceRetrievalResult, + SampledBlock, +} from "./types.js"; const SP_ADDRESS = "0xaaaa0000000000000000000000000000000000aa"; @@ -23,6 +30,8 @@ const PIECE = { serviceProvider: SP_ADDRESS, }; +const SAMPLED_BLOCKS = [] as SampledBlock[]; + function makeProvider(): StorageProvider { return { address: SP_ADDRESS, @@ -36,14 +45,21 @@ function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; piece?: AnonPiece | null; - carResult?: CarValidationResult; - validateCarImpl?: () => Promise; + parseCarOutcome?: CarParseOutcome; + parseCarImpl?: (bytes: Buffer, signal?: AbortSignal) => Promise; + checkIpniOutcome?: IpniCheckOutcome; + checkIpniImpl?: () => Promise; + checkBlockFetchOutcome?: BlockFetchOutcome; + checkBlockFetchImpl?: () => Promise; }): { service: AnonRetrievalService; insertSpy: ReturnType; fetchSpy: ReturnType; - validateCarSpy: ReturnType; + parseCarSpy: ReturnType; + checkIpniSpy: ReturnType; + checkBlockFetchSpy: ReturnType; metricsRecordStatusSpy: ReturnType; + metricsRecordCarParseSpy: ReturnType; metricsRecordIpniSpy: ReturnType; 
metricsRecordBlockFetchSpy: ReturnType; } { @@ -67,16 +83,40 @@ function makeService(opts: { fetchPiece: fetchSpy, } as unknown as PieceRetrievalService; - const validateCarSpy = vi.fn(opts.validateCarImpl ?? (async () => opts.carResult)); - const carValidationService = { - validateCarPiece: validateCarSpy, - } as unknown as CarValidationService; + const parseCarSpy = vi.fn( + opts.parseCarImpl ?? + (async () => + opts.parseCarOutcome ?? { + status: CarParseStatus.PARSEABLE, + blockCount: 0, + sampledBlocks: SAMPLED_BLOCKS, + }), + ); + const checkIpniSpy = vi.fn( + opts.checkIpniImpl ?? (async () => opts.checkIpniOutcome ?? { status: IpniCheckStatus.VALID, durationMs: 0 }), + ); + const checkBlockFetchSpy = vi.fn( + opts.checkBlockFetchImpl ?? + (async () => + opts.checkBlockFetchOutcome ?? { + status: IpniCheckStatus.VALID, + sampledCount: 0, + failedCount: 0, + endpoint: "https://sp.test/ipfs/", + }), + ); + const pieceValidationService = { + parseCar: parseCarSpy, + checkIpni: checkIpniSpy, + checkBlockFetch: checkBlockFetchSpy, + } as unknown as PieceValidationService; const walletSdkService = { getProviderInfo: vi.fn(() => ({ pdp: { serviceURL: "https://sp.test/" } })), } as unknown as WalletSdkService; const metricsRecordStatusSpy = vi.fn(); + const metricsRecordCarParseSpy = vi.fn(); const metricsRecordIpniSpy = vi.fn(); const metricsRecordBlockFetchSpy = vi.fn(); const metrics = { @@ -86,7 +126,7 @@ function makeService(opts: { observeCheckDuration: vi.fn(), recordStatus: metricsRecordStatusSpy, recordHttpResponseCode: vi.fn(), - recordCarParseStatus: vi.fn(), + recordCarParseStatus: metricsRecordCarParseSpy, recordIpniStatus: metricsRecordIpniSpy, recordBlockFetchStatus: metricsRecordBlockFetchSpy, } as unknown as AnonRetrievalCheckMetrics; @@ -94,7 +134,7 @@ function makeService(opts: { const service = new AnonRetrievalService( anonPieceSelector, pieceRetrievalService, - carValidationService, + pieceValidationService, walletSdkService, metrics, 
clickhouseService, @@ -105,8 +145,11 @@ function makeService(opts: { service, insertSpy, fetchSpy, - validateCarSpy, + parseCarSpy, + checkIpniSpy, + checkBlockFetchSpy, metricsRecordStatusSpy, + metricsRecordCarParseSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy, }; @@ -154,11 +197,11 @@ describe("AnonRetrievalService", () => { expect(typeof row.retrieval_id).toBe("string"); // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every - // dimension column should explicitly say "skipped" (ipni_status) or null. - expect(row.car_parseable).toBeNull(); + // dimension status should explicitly say "skipped". + expect(row.car_status).toBe("skipped"); expect(row.car_block_count).toBeNull(); expect(row.block_fetch_endpoint).toBeNull(); - expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_status).toBe("skipped"); expect(row.block_fetch_sampled_count).toBeNull(); expect(row.block_fetch_failed_count).toBeNull(); expect(row.ipni_status).toBe("skipped"); @@ -246,33 +289,31 @@ describe("AnonRetrievalService", () => { } it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { - const carResult: CarValidationResult = { - carParseable: true, - blockCount: 42, - sampledCidCount: 5, - ipniValid: true, - ipniVerifyMs: 137, - blockFetchValid: true, - blockFetchFailedCount: 0, - blockFetchEndpoint: "https://sp.test/ipfs/", - }; - - const { service, insertSpy, validateCarSpy } = makeService({ + const { service, insertSpy, parseCarSpy, checkIpniSpy, checkBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - carResult, + parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 42, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.VALID, durationMs: 137 }, + checkBlockFetchOutcome: { + status: BlockFetchStatus.SUCCESS, + sampledCount: 5, + failedCount: 0, + endpoint: "https://sp.test/ipfs/", + }, }); await 
service.performForProvider(SP_ADDRESS); - expect(validateCarSpy).toHaveBeenCalledTimes(1); + expect(parseCarSpy).toHaveBeenCalledTimes(1); + expect(checkIpniSpy).toHaveBeenCalledTimes(1); + expect(checkBlockFetchSpy).toHaveBeenCalledTimes(1); const [, row] = insertSpy.mock.calls[0] as [string, Record]; expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); expect(row.commp_valid).toBe(true); - expect(row.car_parseable).toBe(true); + expect(row.car_status).toBe("parseable"); expect(row.car_block_count).toBe(42); expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); - expect(row.block_fetch_valid).toBe(true); + expect(row.block_fetch_status).toBe("success"); expect(row.block_fetch_sampled_count).toBe(5); expect(row.block_fetch_failed_count).toBe(0); expect(row.ipni_status).toBe("valid"); @@ -280,21 +321,17 @@ describe("AnonRetrievalService", () => { }); it("distinguishes IPNI invalid from block-fetch failures", async () => { - const carResult: CarValidationResult = { - carParseable: true, - blockCount: 100, - sampledCidCount: 5, - ipniValid: false, - ipniVerifyMs: 250, - blockFetchValid: false, - blockFetchFailedCount: 2, - blockFetchEndpoint: "https://sp.test/ipfs/", - }; - const { service, insertSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - carResult, + parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 100, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.INVALID, durationMs: 250 }, + checkBlockFetchOutcome: { + status: BlockFetchStatus.FAILURE, + sampledCount: 5, + failedCount: 2, + endpoint: "https://sp.test/ipfs/", + }, }); await service.performForProvider(SP_ADDRESS); @@ -303,61 +340,143 @@ describe("AnonRetrievalService", () => { // The piece-fetch path still succeeded — failures are surfaced as // independent dimensions, not folded into piece_fetch_status. 
expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); - expect(row.car_parseable).toBe(true); + expect(row.car_status).toBe("parseable"); expect(row.ipni_status).toBe("invalid"); - expect(row.block_fetch_valid).toBe(false); + expect(row.block_fetch_status).toBe("failure"); expect(row.block_fetch_sampled_count).toBe(5); expect(row.block_fetch_failed_count).toBe(2); }); - it("does not record SP-fault metrics when CAR validation is interrupted by signal abort", async () => { - // An operator-driven abort (job timeout, shutdown) that interrupts - // validateCarPiece must not show up as carParseable=false + - // ipni/blockFetch=ERROR — that would misattribute our cancellation to - // the SP and pollute scoreboards. - const ac = new AbortController(); - const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ - pieceResult: okPiece(Buffer.from("car-bytes")), + it("skips downstream dimensions when parseCar returns NOT_PARSEABLE", async () => { + // The decoupled service guarantees that an unparseable CAR never even + // attempts IPNI or block fetch — there are no CIDs to verify or fetch. 
+ const { service, insertSpy, parseCarSpy, checkIpniSpy, checkBlockFetchSpy } = makeService({ + pieceResult: okPiece(Buffer.from("not-a-car")), piece: INDEXED_PIECE, - validateCarImpl: async () => { - ac.abort(new Error("Anon retrieval job timeout")); - throw Object.assign(new Error("aborted"), { name: "AbortError" }); - }, + parseCarOutcome: { status: CarParseStatus.NOT_PARSEABLE }, }); - await service.performForProvider(SP_ADDRESS, ac.signal); + await service.performForProvider(SP_ADDRESS); - expect(metricsRecordIpniSpy).not.toHaveBeenCalledWith(expect.anything(), "error"); - expect(metricsRecordBlockFetchSpy).not.toHaveBeenCalledWith(expect.anything(), "error"); + expect(parseCarSpy).toHaveBeenCalledTimes(1); + expect(checkIpniSpy).not.toHaveBeenCalled(); + expect(checkBlockFetchSpy).not.toHaveBeenCalled(); const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_status).toBe("not_parseable"); + expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_status).toBe("skipped"); + expect(row.block_fetch_failed_count).toBeNull(); expect(row.ipni_status).toBe("skipped"); - expect(row.car_parseable).toBeNull(); - expect(row.block_fetch_valid).toBeNull(); + expect(row.ipni_verify_ms).toBeNull(); }); - it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { - // Distinguishes a real infra outage (e.g. IpniVerificationService down) - // from a piece that legitimately had no IPFS indexing. Without the - // distinction, an outage looks like normal non-IPFS volume in dashboards. 
- const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + it("propagates checkIpni's SKIPPED status to the row (root CID unparseable)", async () => { + // Previously this case was bucketed as INVALID, which misattributed a + // client-side data problem (bad root CID from the subgraph) to the SP. + const { service, insertSpy, metricsRecordIpniSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - validateCarImpl: async () => { - throw new Error("IpniVerificationService down"); + parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.SKIPPED, durationMs: null }, + checkBlockFetchOutcome: { + status: BlockFetchStatus.SUCCESS, + sampledCount: 1, + failedCount: 0, + endpoint: "https://sp.test/ipfs/", }, }); await service.performForProvider(SP_ADDRESS); - expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error"); - expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error"); + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), IpniCheckStatus.SKIPPED); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.ipni_status).toBe("skipped"); + // car_status / block_fetch_status remain whatever their own steps returned. + expect(row.car_status).toBe("parseable"); + expect(row.block_fetch_status).toBe("success"); + }); + + it("propagates checkIpni's ERROR status only to ipni_status (not other dimensions)", async () => { + // The whole point of decoupling: an unexpected throw in IPNI verification + // cannot bleed into car_status or block_fetch_status. 
+ const { service, insertSpy, metricsRecordIpniSpy, metricsRecordCarParseSpy, metricsRecordBlockFetchSpy } = + makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.ERROR, durationMs: null }, + checkBlockFetchOutcome: { + status: BlockFetchStatus.SUCCESS, + sampledCount: 1, + failedCount: 0, + endpoint: "https://sp.test/ipfs/", + }, + }); + + await service.performForProvider(SP_ADDRESS); + + expect(metricsRecordCarParseSpy).toHaveBeenCalledWith(expect.anything(), CarParseStatus.PARSEABLE); + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), IpniCheckStatus.ERROR); + expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), BlockFetchStatus.SUCCESS); const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_status).toBe("parseable"); expect(row.ipni_status).toBe("error"); - // Piece-fetch path itself succeeded — only the validation pipeline failed. 
- expect(row.commp_valid).toBe(true); - expect(row.car_parseable).toBeNull(); + expect(row.block_fetch_status).toBe("success"); + }); + + it("propagates checkBlockFetch's SKIPPED status (SP info missing) without affecting other dimensions", async () => { + const { service, insertSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.VALID, durationMs: 50 }, + checkBlockFetchOutcome: { + status: BlockFetchStatus.SKIPPED, + sampledCount: 1, + failedCount: null, + endpoint: null, + errorMessage: "Provider info not found", + }, + }); + + await service.performForProvider(SP_ADDRESS); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_status).toBe("parseable"); + expect(row.ipni_status).toBe("valid"); + expect(row.block_fetch_status).toBe("skipped"); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + }); + + it("marks every dimension SKIPPED when the signal aborts during parseCar", async () => { + // Operator-driven aborts must never charge an SP-fault bucket. The + // service propagates the abort; orchestrator's helpers default to SKIPPED. 
+ const ac = new AbortController(); + const { service, insertSpy, metricsRecordCarParseSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = + makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + parseCarImpl: async () => { + ac.abort(new Error("Anon retrieval job timeout")); + throw Object.assign(new Error("aborted"), { name: "AbortError" }); + }, + }); + + await service.performForProvider(SP_ADDRESS, ac.signal); + + expect(metricsRecordCarParseSpy).toHaveBeenCalledWith(expect.anything(), CarParseStatus.SKIPPED); + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), IpniCheckStatus.SKIPPED); + expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), IpniCheckStatus.SKIPPED); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_status).toBe("skipped"); + expect(row.ipni_status).toBe("skipped"); + expect(row.block_fetch_status).toBe("skipped"); }); it("skips CAR/IPNI/block-fetch when SP returns 2xx with wrong bytes (commPValid=false)", async () => { @@ -383,8 +502,11 @@ describe("AnonRetrievalService", () => { const { service, insertSpy, - validateCarSpy, + parseCarSpy, + checkIpniSpy, + checkBlockFetchSpy, metricsRecordStatusSpy, + metricsRecordCarParseSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy, } = makeService({ @@ -394,50 +516,20 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - expect(validateCarSpy).not.toHaveBeenCalled(); - expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "skipped"); - expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "skipped"); + expect(parseCarSpy).not.toHaveBeenCalled(); + expect(checkIpniSpy).not.toHaveBeenCalled(); + expect(checkBlockFetchSpy).not.toHaveBeenCalled(); + expect(metricsRecordCarParseSpy).toHaveBeenCalledWith(expect.anything(), CarParseStatus.SKIPPED); + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), 
IpniCheckStatus.SKIPPED); + expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), IpniCheckStatus.SKIPPED); expect(metricsRecordStatusSpy).toHaveBeenCalledWith(expect.anything(), "failure.commp"); const [, row] = insertSpy.mock.calls[0] as [string, Record]; expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.commp_valid).toBe(false); - expect(row.car_parseable).toBeNull(); + expect(row.car_status).toBe("skipped"); expect(row.ipni_status).toBe("skipped"); - expect(row.block_fetch_valid).toBeNull(); - }); - - it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { - const carResult: CarValidationResult = { - carParseable: false, - blockCount: 0, - sampledCidCount: 0, - ipniValid: null, - ipniVerifyMs: null, - blockFetchValid: null, - blockFetchFailedCount: null, - blockFetchEndpoint: null, - }; - - const { service, insertSpy } = makeService({ - pieceResult: okPiece(Buffer.from("not-a-car")), - piece: INDEXED_PIECE, - carResult, - }); - - await service.performForProvider(SP_ADDRESS); - - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.car_parseable).toBe(false); - // car_block_count and block_fetch_sampled_count are gated on carParseable - // so an unparseable CAR doesn't emit a misleading 0. 
- expect(row.car_block_count).toBeNull(); - expect(row.block_fetch_sampled_count).toBeNull(); - expect(row.block_fetch_endpoint).toBeNull(); - expect(row.block_fetch_valid).toBeNull(); - expect(row.block_fetch_failed_count).toBeNull(); - expect(row.ipni_status).toBe("skipped"); - expect(row.ipni_verify_ms).toBeNull(); + expect(row.block_fetch_status).toBe("skipped"); }); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 2f062def..7d3226cb 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -1,18 +1,19 @@ import { randomUUID } from "node:crypto"; import { Injectable, Logger } from "@nestjs/common"; import { InjectRepository } from "@nestjs/typeorm"; +import { CID } from "multiformats/cid"; import type { Repository } from "typeorm"; import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { IpniCheckStatus, RetrievalStatus, ServiceType } from "../database/types.js"; +import { BlockFetchStatus, CarParseStatus, IpniCheckStatus, RetrievalStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; -import { CarValidationService } from "./car-validation.service.js"; import { PieceRetrievalService } from "./piece-retrieval.service.js"; -import type { CarValidationResult, PieceRetrievalResult } from "./types.js"; +import { PieceValidationService } from "./piece-validation.service.js"; +import type 
{ BlockFetchOutcome, CarParseOutcome, IpniCheckOutcome, PieceRetrievalResult } from "./types.js"; const ANON_RETRIEVAL_CHECKS_TABLE = "anon_retrieval_checks"; @@ -23,7 +24,7 @@ export class AnonRetrievalService { constructor( private readonly anonPieceSelectorService: AnonPieceSelectorService, private readonly pieceRetrievalService: PieceRetrievalService, - private readonly carValidationService: CarValidationService, + private readonly pieceValidationService: PieceValidationService, private readonly walletSdkService: WalletSdkService, private readonly metrics: AnonRetrievalCheckMetrics, private readonly clickhouseService: ClickhouseService, @@ -50,7 +51,7 @@ export class AnonRetrievalService { message: "No anonymous piece found for SP", spAddress, }); - this.metrics.recordStatus(labels, "failure.no_piece"); + this.metrics.recordPieceRetrievalStatus(labels, "skipped"); return; } @@ -69,8 +70,9 @@ export class AnonRetrievalService { const startedAt = new Date(); let pieceResult: PieceRetrievalResult | null = null; - let carResult: CarValidationResult | null = null; - let validatedCarPiece: boolean = false; + let parse: CarParseOutcome | null = null; + let ipni: IpniCheckOutcome | null = null; + let blockFetch: BlockFetchOutcome | null = null; try { // 2. Fetch the piece. fetchPiece never throws on abort — it returns a @@ -87,9 +89,9 @@ export class AnonRetrievalService { this.metrics.observeThroughput(labels, pieceResult.throughputBps); this.metrics.recordHttpResponseCode(labels, pieceResult.statusCode); - // 3. CAR validation (only if piece was successfully retrieved and has IPFS indexing). - // `pieceResult.success` already encodes "HTTP 2xx AND commP matches" — fetchPiece - // flips success=false on a commP mismatch so we never parse mismatched bytes. + // 3. CAR / IPNI / block-fetch validation (only when piece was successfully + // retrieved, advertises IPFS indexing, and the job hasn't been cancelled). 
+ // Each dimension is computed independently if ( pieceResult.success && piece.withIPFSIndexing && @@ -99,61 +101,36 @@ export class AnonRetrievalService { !signal?.aborted ) { try { - validatedCarPiece = true; - carResult = await this.carValidationService.validateCarPiece( - pieceResult.pieceBytes, - provider, - piece.ipfsRootCid, - signal, - ); - this.metrics.recordCarParseStatus(labels, carResult.carParseable); - this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult)); - this.metrics.recordBlockFetchStatus( - labels, - carResult.blockFetchValid === null - ? IpniCheckStatus.SKIPPED - : carResult.blockFetchValid - ? IpniCheckStatus.VALID - : IpniCheckStatus.INVALID, - ); - } catch (error) { - if (signal?.aborted) { - // Operator-driven cancellation, not an SP fault. Suppress the - // SP-fault metrics and downgrade the downstream ClickHouse status - // so we don't pollute SP scoreboards with our own aborts. - validatedCarPiece = false; - this.logger.warn({ - ...logContext, - event: "anon_retrieval_car_validation_aborted", - message: "CAR validation aborted before completion", - pieceCid: piece.pieceCid, - spAddress, - }); - } else { - // Validation was attempted on a successful piece retrieval but threw. 
- this.metrics.recordCarParseStatus(labels, false); - this.metrics.recordIpniStatus(labels, IpniCheckStatus.ERROR); - this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.ERROR); - this.logger.warn({ - ...logContext, - event: "anon_retrieval_car_validation_failed", - message: "CAR validation threw an error", - pieceCid: piece.pieceCid, - spAddress, - error: toStructuredError(error), - }); + parse = await this.pieceValidationService.parseCar(pieceResult.pieceBytes, signal); + + if (parse.status === CarParseStatus.PARSEABLE) { + ipni = await this.pieceValidationService.checkIpni( + provider, + piece.ipfsRootCid, + parse.sampledBlocks, + signal, + ); + blockFetch = await this.pieceValidationService.checkBlockFetch(parse.sampledBlocks, spAddress, signal); } + } catch (error) { + // pieceValidationService methods only throw on abort (via signal.throwIfAborted in + // their catch blocks). Operator-driven cancellation must not bubble + // out of performForProvider — the finally block still emits the row, + // and the status helpers default whatever didn't run to SKIPPED. + // Anything else is a genuine bug and is re-thrown. + if (!signal?.aborted) throw error; } - } else if (!pieceResult.success) { - // Piece retrieval failed (HTTP error or commP mismatch) — downstream - // validation was skipped because there is nothing trustworthy to validate. 
- this.metrics.recordIpniStatus(labels, IpniCheckStatus.SKIPPED); - this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.SKIPPED); } - // Overall check duration and status + const carStatus = carStatusForRow(parse); + const ipniStatus = ipniStatusForRow(parse, ipni); + const blockFetchStatus = blockFetchStatusForRow(parse, blockFetch); + + this.metrics.recordCarParseStatus(labels, carStatus); + this.metrics.recordIpniStatus(labels, ipniStatus); + this.metrics.recordBlockFetchStatus(labels, blockFetchStatus); this.metrics.observeCheckDuration(labels, Date.now() - checkStart); - this.metrics.recordStatus(labels, anonPieceRetrievalStatus(pieceResult)); + this.metrics.recordPieceRetrievalStatus(labels, anonPieceRetrievalStatus(pieceResult)); } finally { // Always emit a ClickHouse row — even on abort or unexpected error — so // we never lose the evidence (ttfb, bytes, response code) we already @@ -163,11 +140,10 @@ export class AnonRetrievalService { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus: IpniCheckStatus = !validatedCarPiece - ? IpniCheckStatus.SKIPPED - : carResult - ? ipniStatusFromResult(carResult) - : IpniCheckStatus.ERROR; + + const carStatus = carStatusForRow(parse); + const ipniStatus = ipniStatusForRow(parse, ipni); + const blockFetchStatus = blockFetchStatusForRow(parse, blockFetch); try { this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { @@ -192,14 +168,15 @@ export class AnonRetrievalService { bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, commp_valid: !finalPieceResult.aborted && finalPieceResult.httpSuccess ? 
finalPieceResult.commPValid : null, - car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult?.carParseable ? carResult?.blockCount : null, - block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, - block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, - block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + car_status: carStatus, + car_block_count: parse && parse.status === CarParseStatus.PARSEABLE ? parse.blockCount : null, + block_fetch_endpoint: blockFetch?.endpoint ?? null, + block_fetch_status: blockFetchStatus, + block_fetch_sampled_count: + parse?.status === CarParseStatus.PARSEABLE && blockFetch ? blockFetch.sampledCount : null, + block_fetch_failed_count: blockFetch?.failedCount ?? null, ipni_status: ipniStatus, - ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verify_ms: ipni?.durationMs ?? null, error_message: finalPieceResult.errorMessage ?? null, }); } catch (error) { @@ -227,19 +204,14 @@ export class AnonRetrievalService { latencyMs: finalPieceResult.latencyMs, ttfbMs: finalPieceResult.ttfbMs, bytesRetrieved: finalPieceResult.bytesReceived, - carParseable: carResult?.carParseable, - ipniValid: carResult?.ipniValid, - blockFetchValid: carResult?.blockFetchValid, + carStatus, + ipniStatus, + blockFetchStatus, }); } } } -function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { - if (result.ipniValid === null) return IpniCheckStatus.SKIPPED; - return result.ipniValid ? 
IpniCheckStatus.VALID : IpniCheckStatus.INVALID; -} - function anonPieceRetrievalStatus(pieceResult: PieceRetrievalResult): string { if (pieceResult.success) return "success"; if (pieceResult.aborted) return "failure.timedout"; @@ -248,6 +220,30 @@ function anonPieceRetrievalStatus(pieceResult: PieceRetrievalResult): string { return "failure.other"; } +/** + * The per-dimension statuses default to SKIPPED whenever the dimension's + * prerequisite wasn't met — no IPFS indexing, piece fetch failed, the job + * was aborted, or an upstream dimension didn't produce a usable result. + * Service methods only ever return their concrete outcomes (VALID, INVALID, + * NOT_PARSEABLE, etc.); SKIPPED is the helper's contribution. + */ +function carStatusForRow(parse: CarParseOutcome | null): CarParseStatus { + if (!parse) return CarParseStatus.SKIPPED; + return parse.status; +} + +function ipniStatusForRow(parse: CarParseOutcome | null, ipni: IpniCheckOutcome | null): IpniCheckStatus { + if (!parse || parse.status !== CarParseStatus.PARSEABLE) return IpniCheckStatus.SKIPPED; + if (!ipni) return IpniCheckStatus.SKIPPED; + return ipni.status; +} + +function blockFetchStatusForRow(parse: CarParseOutcome | null, blockFetch: BlockFetchOutcome | null): BlockFetchStatus { + if (!parse || parse.status !== CarParseStatus.PARSEABLE) return BlockFetchStatus.SKIPPED; + if (!blockFetch) return BlockFetchStatus.SKIPPED; + return blockFetch.status; +} + function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { const message = reason instanceof Error && reason.message ? reason.message : typeof reason === "string" ? 
reason : "aborted"; diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts deleted file mode 100644 index f8d08a76..00000000 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ /dev/null @@ -1,259 +0,0 @@ -import { CarReader } from "@ipld/car"; -import * as dagPB from "@ipld/dag-pb"; -import { Injectable, Logger } from "@nestjs/common"; -import { ConfigService } from "@nestjs/config"; -import { create as createBlock } from "multiformats/block"; -import { CID } from "multiformats/cid"; -import * as raw from "multiformats/codecs/raw"; -import { sha256 } from "multiformats/hashes/sha2"; -import { toStructuredError } from "../common/logging.js"; -import type { IConfig } from "../config/app.config.js"; -import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { HttpClientService } from "../http-client/http-client.service.js"; -import { IpniVerificationService } from "../ipni/ipni-verification.service.js"; -import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; -import type { CarValidationResult } from "./types.js"; - -// UnixFS DAGs use only dag-pb (interior nodes) and raw (leaf data) codecs -const unixfsCodecs: Record unknown }> = { - [dagPB.code]: dagPB, - [raw.code]: raw, -}; - -@Injectable() -export class CarValidationService { - private readonly logger = new Logger(CarValidationService.name); - - constructor( - private readonly configService: ConfigService, - private readonly httpClientService: HttpClientService, - private readonly walletSdkService: WalletSdkService, - private readonly ipniVerificationService: IpniVerificationService, - ) {} - - /** - * Validate an anonymous piece retrieved as a CAR: - * 1. parse the CAR, - * 2. sample random blocks, - * 3. confirm the SP is advertised for the root + sampled CIDs via IPNI, - * 4. fetch each sampled block from the SP and hash-verify it. 
- * - * CAR parse failure is attributed to the client (bad upload), not the SP. - */ - async validateCarPiece( - pieceBytes: Buffer, - provider: StorageProvider, - ipfsRootCid: string, - signal?: AbortSignal, - ): Promise { - let blocks: { cid: CID; bytes: Uint8Array }[]; - try { - blocks = await this.parseCar(pieceBytes, signal); - } catch (error) { - if (signal?.aborted || (error instanceof Error && error.name === "AbortError")) { - throw error; - } - this.logger.debug({ - event: "car_parse_failed", - message: "Failed to parse piece bytes as CAR - client fault, not SP", - spAddress: provider.address, - ipfsRootCid, - error: toStructuredError(error), - }); - return { - carParseable: false, - blockCount: 0, - sampledCidCount: 0, - ipniValid: null, - ipniVerifyMs: null, - blockFetchValid: null, - blockFetchFailedCount: null, - blockFetchEndpoint: null, - }; - } - if (blocks.length === 0) { - return { - carParseable: true, - blockCount: 0, - sampledCidCount: 0, - ipniValid: null, - ipniVerifyMs: null, - blockFetchValid: null, - blockFetchFailedCount: null, - blockFetchEndpoint: null, - errorMessage: "CAR contained no blocks", - }; - } - - const sampleCount = this.configService.get("retrieval", { infer: true }).anonBlockSampleCount; - const shuffled = [...blocks].sort(() => Math.random() - 0.5); - const sampledBlocks = shuffled.slice(0, sampleCount); - - const ipni = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); - const blockFetchResult = await this.checkBlockFetch(sampledBlocks, provider.address, signal); - - return { - carParseable: true, - blockCount: blocks.length, - sampledCidCount: sampledBlocks.length, - ipniValid: ipni.valid, - ipniVerifyMs: ipni.durationMs, - blockFetchValid: blockFetchResult.valid, - blockFetchFailedCount: blockFetchResult.failedCount, - blockFetchEndpoint: blockFetchResult.endpoint, - errorMessage: blockFetchResult.errorMessage, - }; - } - - private async parseCar(pieceBytes: Buffer, signal?: AbortSignal): Promise<{ 
cid: CID; bytes: Uint8Array }[]> { - const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); - const blocks: { cid: CID; bytes: Uint8Array }[] = []; - for await (const block of reader.blocks()) { - signal?.throwIfAborted(); - blocks.push({ cid: block.cid, bytes: block.bytes }); - } - return blocks; - } - - /** - * Verify via IPNI that the SP is advertised for the root CID and each sampled child CID. - * Delegates to the shared IpniVerificationService which uses filecoin-pin's provider-scoped check. - */ - private async checkIpni( - provider: StorageProvider, - ipfsRootCid: string, - sampledBlocks: ReadonlyArray<{ cid: CID }>, - signal?: AbortSignal, - ): Promise<{ - valid: boolean; - durationMs: number | null; - }> { - const timeouts = this.configService.get("timeouts", { infer: true }); - let rootCid: CID; - try { - rootCid = CID.parse(ipfsRootCid); - } catch (error) { - this.logger.warn({ - event: "ipni_root_cid_invalid", - message: "Failed to parse ipfsRootCID", - ipfsRootCid, - providerAddress: provider.address, - error: toStructuredError(error), - }); - return { valid: false, durationMs: null }; - } - - const result = await this.ipniVerificationService.verify({ - rootCid, - blockCids: sampledBlocks.map((b) => b.cid), - storageProvider: provider, - timeoutMs: timeouts.ipniVerificationTimeoutMs, - pollIntervalMs: timeouts.ipniVerificationPollingMs, - signal, - }); - - return { - valid: result.rootCIDVerified, - durationMs: result.durationMs, - }; - } - - /** - * Fetch each sampled block from the SP endpoint and hash-verify the response - * against the declared CID. Mirrors IpfsBlockRetrievalStrategy's per-block - * verification for the sampled subset (no DAG traversal). 
- */ - private async checkBlockFetch( - sampledBlocks: ReadonlyArray<{ cid: CID; bytes: Uint8Array }>, - spAddress: string, - signal?: AbortSignal, - ): Promise<{ valid: boolean | null; failedCount: number | null; endpoint: string | null; errorMessage?: string }> { - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - if (!providerInfo) { - return { - valid: null, - failedCount: null, - endpoint: null, - errorMessage: `Provider info not found for ${spAddress}`, - }; - } - - const spBaseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, ""); - const endpoint = `${spBaseUrl}/ipfs/`; - let failedCount = 0; - - for (const block of sampledBlocks) { - const cidStr = block.cid.toString(); - const blockUrl = `${spBaseUrl}/ipfs/${cidStr}?format=raw`; - - try { - const resp = await this.httpClientService.requestWithMetrics(blockUrl, { - headers: { Accept: "application/vnd.ipld.raw" }, - httpVersion: "2", - signal, - }); - - if (resp.aborted) { - failedCount += 1; - this.logger.warn({ - event: "block_fetch_aborted", - message: "Block fetch was aborted", - cid: cidStr, - spAddress, - abortReason: resp.abortReason, - }); - continue; - } - - if (resp.metrics.statusCode < 200 || resp.metrics.statusCode >= 300) { - failedCount += 1; - this.logger.warn({ - event: "block_fetch_non_2xx", - message: "Block fetch returned non-2xx status", - cid: cidStr, - spAddress, - statusCode: resp.metrics.statusCode, - }); - continue; - } - - if (block.cid.multihash.code !== sha256.code) { - this.logger.warn({ - event: "block_unsupported_hash", - message: `Unsupported hash algorithm 0x${block.cid.multihash.code.toString(16)}`, - cid: cidStr, - spAddress, - }); - failedCount += 1; - continue; - } - - const codec = unixfsCodecs[block.cid.code]; - if (!codec) { - this.logger.warn({ - event: "block_unsupported_codec", - message: `Unsupported codec 0x${block.cid.code.toString(16)}`, - cid: cidStr, - spAddress, - }); - failedCount += 1; - continue; - } - - // Hash-verifies and decodes; 
throws on mismatch - await createBlock({ bytes: resp.data, cid: block.cid, hasher: sha256, codec }); - } catch (error) { - failedCount += 1; - this.logger.warn({ - event: "block_fetch_failed", - message: "Block fetch or hash verification failed", - cid: cidStr, - spAddress, - error: toStructuredError(error), - }); - } - } - - return { valid: failedCount === 0, failedCount, endpoint }; - } -} diff --git a/apps/backend/src/retrieval-anon/piece-validation.service.ts b/apps/backend/src/retrieval-anon/piece-validation.service.ts new file mode 100644 index 00000000..191b5887 --- /dev/null +++ b/apps/backend/src/retrieval-anon/piece-validation.service.ts @@ -0,0 +1,281 @@ +import { CarReader } from "@ipld/car"; +import * as dagPB from "@ipld/dag-pb"; +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { create as createBlock } from "multiformats/block"; +import { CID } from "multiformats/cid"; +import * as raw from "multiformats/codecs/raw"; +import { sha256 } from "multiformats/hashes/sha2"; +import { toStructuredError } from "../common/logging.js"; +import type { IConfig } from "../config/app.config.js"; +import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { BlockFetchStatus, CarParseStatus, IpniCheckStatus } from "../database/types.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; +import { IpniVerificationService } from "../ipni/ipni-verification.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { BlockFetchOutcome, CarParseOutcome, IpniCheckOutcome, SampledBlock } from "./types.js"; + +// UnixFS DAGs use only dag-pb (interior nodes) and raw (leaf data) codecs +const unixfsCodecs: Record unknown }> = { + [dagPB.code]: dagPB, + [raw.code]: raw, +}; + +/** + * Per-dimension validators for anonymous-retrieval pieces that advertise + * IPFS indexing. 
Each method is independent: a failure in one cannot bleed
+ * into another's status. Service methods only throw on abort; every other
+ * failure mode is encoded in the returned outcome's `status`.
+ */
+@Injectable()
+export class PieceValidationService {
+  private readonly logger = new Logger(PieceValidationService.name);
+
+  constructor(
+    private readonly configService: ConfigService,
+    private readonly httpClientService: HttpClientService,
+    private readonly walletSdkService: WalletSdkService,
+    private readonly ipniVerificationService: IpniVerificationService,
+  ) {}
+
+  /**
+   * Parse the fetched piece bytes as a CAR and pre-sample a random subset
+   * for downstream IPNI + block-fetch checks. CAR parse failure is
+   * attributed to the client (bad upload), not the SP.
+   *
+   * The sample is drawn uniformly at random (Fisher–Yates) and contains at
+   * most `retrieval.anonBlockSampleCount` blocks.
+   *
+   * @param pieceBytes - Raw bytes returned by the piece fetch.
+   * @param signal - Optional job-level abort signal.
+   * @returns `PARSEABLE` with the total block count and the sampled blocks
+   *   (an empty CAR yields `blockCount: 0` and no samples), or
+   *   `NOT_PARSEABLE` on parser exceptions.
+   * @throws The signal's abort reason when `signal` is aborted.
+   */
+  async parseCar(pieceBytes: Buffer, signal?: AbortSignal): Promise<CarParseOutcome> {
+    let blocks: SampledBlock[];
+    try {
+      blocks = await this.readBlocks(pieceBytes, signal);
+    } catch (error) {
+      // An abort surfaces here as an exception too; rethrow it instead of
+      // misreporting a cancelled job as an unparseable CAR.
+      signal?.throwIfAborted();
+      this.logger.debug({
+        event: "car_parse_failed",
+        message: "Failed to parse piece bytes as CAR - client fault, not SP",
+        error: toStructuredError(error),
+      });
+      return { status: CarParseStatus.NOT_PARSEABLE };
+    }
+
+    if (blocks.length === 0) {
+      return {
+        status: CarParseStatus.PARSEABLE,
+        blockCount: 0,
+        sampledBlocks: [],
+      };
+    }
+
+    const sampleCount = this.configService.get("retrieval", { infer: true }).anonBlockSampleCount;
+    // A random sort comparator (`Math.random() - 0.5`) is inconsistent and
+    // yields an engine-dependent, non-uniform order; use a real shuffle.
+    const sampledBlocks = this.shuffle(blocks).slice(0, sampleCount);
+
+    return {
+      status: CarParseStatus.PARSEABLE,
+      blockCount: blocks.length,
+      sampledBlocks,
+    };
+  }
+
+  /**
+   * Verify via IPNI that the SP is advertised for the root CID and each
+   * sampled child CID. SKIPPED when the root CID can't be parsed (a client
+   * upload artifact, not something to attempt against the SP). ERROR is
+   * reserved for unexpected exceptions from the verifier.
+   *
+   * @param provider - Storage provider under test.
+   * @param ipfsRootCid - Root CID string advertised for the piece.
+   * @param sampledBlocks - Blocks sampled by `parseCar`; only their CIDs are used.
+   * @param signal - Optional job-level abort signal.
+   * @throws The signal's abort reason when the verifier throws and `signal` is aborted.
+   */
+  async checkIpni(
+    provider: StorageProvider,
+    ipfsRootCid: string,
+    sampledBlocks: ReadonlyArray<SampledBlock>,
+    signal?: AbortSignal,
+  ): Promise<IpniCheckOutcome> {
+    let rootCid: CID;
+    try {
+      rootCid = CID.parse(ipfsRootCid);
+    } catch (error) {
+      this.logger.warn({
+        event: "ipni_root_cid_invalid",
+        message: "Failed to parse ipfsRootCID — skipping IPNI verification",
+        ipfsRootCid,
+        providerAddress: provider.address,
+        error: toStructuredError(error),
+      });
+      return { status: IpniCheckStatus.SKIPPED, durationMs: null };
+    }
+
+    const timeouts = this.configService.get("timeouts", { infer: true });
+
+    try {
+      const result = await this.ipniVerificationService.verify({
+        rootCid,
+        blockCids: sampledBlocks.map((b) => b.cid),
+        storageProvider: provider,
+        timeoutMs: timeouts.ipniVerificationTimeoutMs,
+        pollIntervalMs: timeouts.ipniVerificationPollingMs,
+        signal,
+      });
+      // NOTE(review): the status is derived from `rootCIDVerified` alone.
+      // Whether that flag also covers the sampled block CIDs depends on
+      // IpniVerificationService — confirm it matches the contract above.
+      return {
+        status: result.rootCIDVerified ? IpniCheckStatus.VALID : IpniCheckStatus.INVALID,
+        durationMs: result.durationMs,
+      };
+    } catch (error) {
+      signal?.throwIfAborted();
+      this.logger.warn({
+        event: "ipni_verification_failed",
+        message: "IPNI verification threw unexpectedly",
+        providerAddress: provider.address,
+        ipfsRootCid,
+        error: toStructuredError(error),
+      });
+      return { status: IpniCheckStatus.ERROR, durationMs: null };
+    }
+  }
+
+  /**
+   * Fetch each sampled block from the SP endpoint and hash-verify the
+   * response against the declared CID. SKIPPED when SP info is missing
+   * (not the SP's fault — we couldn't even find the gateway). ERROR is
+   * reserved for unexpected exceptions outside the per-block loop;
+   * per-block failures aggregate into `failedCount` and map to FAILURE.
+   *
+   * Blocks are fetched sequentially. A job abort is propagated instead of
+   * being counted as failed blocks against the SP.
+   *
+   * @param sampledBlocks - Blocks sampled by `parseCar`.
+   * @param spAddress - Address used to look up the SP's service URL.
+   * @param signal - Optional job-level abort signal.
+   * @throws The signal's abort reason when `signal` is aborted.
+   */
+  async checkBlockFetch(
+    sampledBlocks: ReadonlyArray<SampledBlock>,
+    spAddress: string,
+    signal?: AbortSignal,
+  ): Promise<BlockFetchOutcome> {
+    const providerInfo = this.walletSdkService.getProviderInfo(spAddress);
+    if (!providerInfo) {
+      return {
+        status: BlockFetchStatus.SKIPPED,
+        sampledCount: sampledBlocks.length,
+        failedCount: null,
+        endpoint: null,
+        errorMessage: `Provider info not found for ${spAddress}`,
+      };
+    }
+
+    // Strip one trailing slash so the joined URLs never contain `//`.
+    const spBaseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, "");
+    const endpoint = `${spBaseUrl}/ipfs/`;
+
+    try {
+      let failedCount = 0;
+      for (const block of sampledBlocks) {
+        // Stop probing as soon as the job is cancelled.
+        signal?.throwIfAborted();
+        if (!(await this.fetchAndVerifyBlock(block, spBaseUrl, spAddress, signal))) {
+          // fetchAndVerifyBlock swallows every error, abort included. Re-check
+          // here so a cancelled job is not attributed to the SP as a failure.
+          signal?.throwIfAborted();
+          failedCount += 1;
+        }
+      }
+      return {
+        status: failedCount === 0 ? BlockFetchStatus.SUCCESS : BlockFetchStatus.FAILURE,
+        sampledCount: sampledBlocks.length,
+        failedCount,
+        endpoint,
+      };
+    } catch (error) {
+      signal?.throwIfAborted();
+      this.logger.warn({
+        event: "block_fetch_unexpected_error",
+        message: "Block fetch loop threw unexpectedly",
+        spAddress,
+        error: toStructuredError(error),
+      });
+      return {
+        status: BlockFetchStatus.ERROR,
+        sampledCount: sampledBlocks.length,
+        failedCount: null,
+        endpoint,
+        errorMessage: error instanceof Error ? error.message : "Unknown block-fetch error",
+      };
+    }
+  }
+
+  /**
+   * Decode every block of the CAR into memory, checking the abort signal
+   * between blocks. Parser exceptions propagate to the caller.
+   */
+  private async readBlocks(pieceBytes: Buffer, signal?: AbortSignal): Promise<SampledBlock[]> {
+    const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes));
+    const blocks: SampledBlock[] = [];
+    for await (const block of reader.blocks()) {
+      signal?.throwIfAborted();
+      blocks.push({ cid: block.cid, bytes: block.bytes });
+    }
+    return blocks;
+  }
+
+  /**
+   * Return a uniformly shuffled copy of `items` (Fisher–Yates). The input
+   * array is not mutated.
+   */
+  private shuffle<T>(items: ReadonlyArray<T>): T[] {
+    const out = [...items];
+    for (let i = out.length - 1; i > 0; i -= 1) {
+      const j = Math.floor(Math.random() * (i + 1));
+      // Both indices are in range; the casts only satisfy strict index checks.
+      const tmp = out[i] as T;
+      out[i] = out[j] as T;
+      out[j] = tmp;
+    }
+    return out;
+  }
+
+  /**
+   * Fetch one sampled block and hash-verify it. Returns true on success.
+   * Per-block failures are logged at warn; they never throw out of the
+   * caller's loop, so transient block errors stay attributable to that
+   * single block rather than terminating the whole check.
+   */
+  private async fetchAndVerifyBlock(
+    block: SampledBlock,
+    spBaseUrl: string,
+    spAddress: string,
+    signal?: AbortSignal,
+  ): Promise<boolean> {
+    const cidStr = block.cid.toString();
+    const blockUrl = `${spBaseUrl}/ipfs/${cidStr}?format=raw`;
+
+    try {
+      const resp = await this.httpClientService.requestWithMetrics(blockUrl, {
+        headers: { Accept: "application/vnd.ipld.raw" },
+        httpVersion: "2",
+        signal,
+      });
+
+      if (resp.aborted) {
+        this.logger.warn({
+          event: "block_fetch_aborted",
+          message: "Block fetch was aborted",
+          cid: cidStr,
+          spAddress,
+          abortReason: resp.abortReason,
+        });
+        return false;
+      }
+
+      if (resp.metrics.statusCode < 200 || resp.metrics.statusCode >= 300) {
+        this.logger.warn({
+          event: "block_fetch_non_2xx",
+          message: "Block fetch returned non-2xx status",
+          cid: cidStr,
+          spAddress,
+          statusCode: resp.metrics.statusCode,
+        });
+        return false;
+      }
+
+      // Only sha2-256 multihashes can be re-hashed with the hasher used below.
+      if (block.cid.multihash.code !== sha256.code) {
+        this.logger.warn({
+          event: "block_unsupported_hash",
+          message: `Unsupported hash algorithm 0x${block.cid.multihash.code.toString(16)}`,
+          cid: cidStr,
+          spAddress,
+        });
+        return false;
+      }
+
+      const codec = unixfsCodecs[block.cid.code];
+      if (!codec) {
+        this.logger.warn({
+          event: "block_unsupported_codec",
+          message: `Unsupported codec 0x${block.cid.code.toString(16)}`,
+          cid: cidStr,
+          spAddress,
+        });
+        return false;
+      }
+
+      // Hash-verifies and decodes; throws on mismatch
+      await createBlock({ bytes: resp.data, cid: block.cid, hasher: sha256, codec });
+      return true;
+    } catch (error) {
+      this.logger.warn({
+        event: "block_fetch_failed",
+        message: "Block fetch or hash verification failed",
+        cid: cidStr,
+        spAddress,
+        error: toStructuredError(error),
+      });
+      return false;
+    }
+  }
+}
diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts
index c05dcb5f..f1199650 100644
--- a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts
+++
b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -8,8 +8,8 @@ import { SubgraphModule } from "../subgraph/subgraph.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; import { AnonRetrievalService } from "./anon-retrieval.service.js"; -import { CarValidationService } from "./car-validation.service.js"; import { PieceRetrievalService } from "./piece-retrieval.service.js"; +import { PieceValidationService } from "./piece-validation.service.js"; @Module({ imports: [ @@ -20,7 +20,7 @@ import { PieceRetrievalService } from "./piece-retrieval.service.js"; HttpClientModule, IpniModule, ], - providers: [AnonPieceSelectorService, PieceRetrievalService, CarValidationService, AnonRetrievalService], + providers: [AnonPieceSelectorService, PieceRetrievalService, PieceValidationService, AnonRetrievalService], exports: [AnonRetrievalService], }) export class RetrievalAnonModule {} diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts index da3d64c0..959d5c0f 100644 --- a/apps/backend/src/retrieval-anon/types.ts +++ b/apps/backend/src/retrieval-anon/types.ts @@ -1,3 +1,6 @@ +import type { CID } from "multiformats/cid"; +import { BlockFetchStatus, CarParseStatus, IpniCheckStatus } from "../database/types.js"; + /** The result of anonymous piece selection. */ export type AnonPiece = { pieceCid: string; @@ -25,15 +28,36 @@ export type PieceRetrievalResult = { aborted?: boolean; }; -/** Result of CAR validation. */ -export type CarValidationResult = { - carParseable: boolean; - blockCount: number; - sampledCidCount: number; - ipniValid: boolean | null; - ipniVerifyMs: number | null; - blockFetchValid: boolean | null; - blockFetchFailedCount: number | null; - blockFetchEndpoint: string | null; +/** A block decoded from the CAR, retained for IPNI verification + block fetch. 
*/ +export type SampledBlock = { cid: CID; bytes: Uint8Array }; + +/** + * Result of CAR parsing. SKIPPED / ERROR are never produced here — the + * caller decides "this dimension never ran" semantics. + */ +export type CarParseOutcome = + | { status: CarParseStatus.PARSEABLE; blockCount: number; sampledBlocks: SampledBlock[] } + | { status: CarParseStatus.NOT_PARSEABLE; errorMessage?: string }; + +/** + * Result of an IPNI verification attempt. `SKIPPED` is returned when a + * structural prerequisite couldn't be met (root CID won't parse). `ERROR` + * is reserved for unexpected exceptions raised by the verifier. + */ +export type IpniCheckOutcome = { + status: IpniCheckStatus; + durationMs: number | null; +}; + +/** + * Result of the block-fetch sampling step. `SKIPPED` is returned when a + * structural prerequisite couldn't be met (SP info not registered). + * `ERROR` is reserved for unexpected exceptions raised by the fetcher. + */ +export type BlockFetchOutcome = { + status: BlockFetchStatus; + sampledCount: number; + failedCount: number | null; + endpoint: string | null; errorMessage?: string; }; diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 08211211..f167b306 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -77,14 +77,17 @@ flowchart TD Source: [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) -### CAR Validation (only when piece advertises IPFS indexing) +### CAR / IPNI / block-fetch validation (only when piece advertises IPFS indexing) -When the selected piece has `withIPFSIndexing = true` and a non-null `ipfsRootCid`, the fetched bytes are parsed as a CAR and a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is exercised: +When the selected piece has `withIPFSIndexing = true` and a non-null `ipfsRootCid`, three dimensions are exercised by `PieceValidationService`. 
Each dimension has an independent outcome; a failure or skip in one never bleeds into another's status. -- **IPNI check:** `IpniVerificationService.verify(rootCid, sampledCids, sp)` polls filecoinpin.contact until each CID resolves to the SP under test, the timeout fires, or `IPNI_VERIFICATION_TIMEOUT_MS` is reached. -- **Block fetch check:** for each sampled CID, fetch `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verify the response against the CID. Non-2xx, hash mismatch, unsupported codec, or transport errors all count as a single failed block. +1. **CAR parse:** `@ipld/car` parses the response bytes; a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is selected for the next two steps. +2. **IPNI check:** `IpniVerificationService.verify(rootCid, sampledCids, sp)` polls filecoinpin.contact until each CID resolves to the SP under test, the timeout fires, or `IPNI_VERIFICATION_TIMEOUT_MS` is reached. +3. **Block fetch check:** for each sampled CID, fetch `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verify the response against the CID. Non-2xx, hash mismatch, unsupported codec, or transport errors all count as a single failed block. -Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) +CAR parse failure (`not_parseable`) is attributed to the client (bad upload), not the SP. When the CAR is unparseable, IPNI and block fetch are skipped because there are no sampleable CIDs to verify or fetch. + +Source: [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) ## What Gets Asserted @@ -103,38 +106,37 @@ Unlike the [Data Storage check](./data-storage.md#deal-status-progression), anon | anonPieceRetrievalStatus | Meaning | |--------|---------| | `success` | `GET /piece/{pieceCid}` returned HTTP 2xx **and** the response bytes hashed to the declared CommP. | +| `skipped` | The subgraph returned no candidate piece for the SP after all selection fallbacks. 
No HTTP request was attempted. | | `failure.http` | Piece fetch did not return HTTP 2xx, or the request failed at the transport layer (DNS, TLS, connection reset, etc.). | | `failure.commp` | Piece fetch returned HTTP 2xx, but the response bytes hashed to a different CID than `pieceCid`. The bytes are discarded — downstream CAR / IPNI / block-fetch validation is skipped to avoid amplifying a misbehaving SP. | | `failure.timedout` | The job-level `AbortSignal` fired (most often `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`). Partial timing/byte evidence is still persisted. | -| `failure.no_piece` | The subgraph returned no candidate piece for the SP after all selection fallbacks. No HTTP request was attempted. | | `failure.other` | Catch-all for retrieval failures that do not match any of the categories above. | -| anonCarParseStatus | Meaning | -|--------|---------| -| `parseable` | The fetched piece bytes were successfully parsed as a CAR by `@ipld/car`. | -| `not_parseable` | The fetched piece bytes could not be parsed as a CAR (malformed header, truncated content, unexpected encoding, or parser threw). | - -> Emitted only when piece fetch succeeded **and** the piece advertises IPFS indexing (`withIPFSIndexing = true` with a non-null `ipfsRootCid`). Skipped otherwise; no row value is recorded. +| anonCarParseStatus | Meaning | +|--------|-------------------------------------------------------------------------------------------------------------------------------------------------------| +| `parseable` | The fetched piece bytes were successfully parsed as a CAR by `@ipld/car`. | +| `not_parseable` | The fetched piece bytes could not be parsed as a CAR (malformed header, truncated content, unexpected encoding, or parser threw an error). | +| `skipped` | CAR parsing was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, or the job aborted before parsing. 
| | anonIpniStatus | Meaning | |--------|---------| | `valid` | filecoinpin.contact returned the SP as a provider for the root CID **and** every sampled child CID within `IPNI_VERIFICATION_TIMEOUT_MS`. | | `invalid` | IPNI was queried but at least one CID never resolved to the SP under test before the timeout (or the timeout fired with unresolved CIDs). | -| `skipped` | IPNI verification was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, or the SP is not registered with `WalletSdkService` so no IPNI sampling could run. | -| `error` | IPNI verification was attempted but the CAR-validation step threw before producing a result (e.g. invalid root CID, transport error, unexpected exception). | +| `skipped` | IPNI verification was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, CAR parsing returned `not_parseable`, the root CID itself failed to parse, or the job aborted. | +| `error` | IPNI verification was attempted and `IpniVerificationService.verify` threw unexpectedly (transport error, service down, etc.). | | anonBlockFetchStatus | Meaning | |--------|---------| -| `valid` | Every sampled CID was fetched via `GET {spBaseUrl}/ipfs/{cid}?format=raw` and the response bytes hash-verified against the declared CID. | -| `invalid` | At least one sampled block fetch failed: non-2xx HTTP, hash mismatch, unsupported codec, unsupported hash, or transport error. Each failed sample counts as one failed block. | -| `skipped` | Block-fetch sampling was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, or CAR parsing produced no sampleable CIDs. | -| `error` | Block-fetch sampling was attempted but the CAR-validation step threw before completing (e.g. CAR parser threw, unexpected exception). | +| `success` | Every sampled CID was fetched via `GET {spBaseUrl}/ipfs/{cid}?format=raw` and the response bytes hash-verified against the declared CID. 
| +| `failure` | At least one sampled block fetch failed: non-2xx HTTP, hash mismatch, unsupported codec, unsupported hash, or transport error. Each failed sample counts as one failed block. | +| `skipped` | Block-fetch sampling was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, CAR parsing returned `not_parseable`, or the job aborted. | +| `error` | Block-fetch sampling was attempted but the loop threw unexpectedly outside the per-block try/catch. | Sources: -- [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) — emits the four status metrics +- [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) — orchestrates the dimensions and emits the four status metrics - [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) — classifies piece-fetch outcomes -- [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) — produces CAR / IPNI / block-fetch outcomes -- [`types.ts` (`IpniCheckStatus`)](../../apps/backend/src/database/types.ts) — enum source of truth +- [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) — produces CAR / IPNI / block-fetch outcomes independently +- [`types.ts` (`CarParseStatus`, `IpniCheckStatus`)](../../apps/backend/src/database/types.ts) — enum source of truth ## Result Recording @@ -156,9 +158,12 @@ The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend | `http_response_code` | Raw HTTP status; null on transport failure | | `first_byte_ms`, `last_byte_ms`, `bytes_retrieved`, `throughput_bps` | Piece-fetch performance | | `commp_valid` | Null when retrieval failed before CommP could be hashed | -| `car_parseable`, `car_block_count` | Null when CAR validation was skipped (no IPFS indexing or piece fetch failed) | -| `block_fetch_endpoint`, `block_fetch_valid`, 
`block_fetch_sampled_count`, `block_fetch_failed_count` | Block-fetch outcomes; null when skipped | -| `ipni_status` | `valid` \| `invalid` \| `skipped` \| `error` | +| `car_status` | `parseable` \| `not_parseable` \| `skipped` \| `error` — mirrors `anonCarParseStatus` | +| `car_block_count` | Total CAR block count; null unless `car_status='parseable'` | +| `block_fetch_endpoint` | Gateway base URL probed; null when skipped or SP info missing | +| `block_fetch_status` | `valid` \| `invalid` \| `skipped` \| `error` — mirrors `anonBlockFetchStatus` | +| `block_fetch_sampled_count`, `block_fetch_failed_count` | Sampled / failed block counts; null when skipped | +| `ipni_status` | `valid` \| `invalid` \| `skipped` \| `error` — mirrors `anonIpniStatus` | | `ipni_verify_ms`, `ipni_verified_cids_count`, `ipni_unverified_cids_count` | IPNI check details | | `error_message` | Failure reason; null on success | diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 6f91bd03..c8cdd691 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -89,20 +89,20 @@ sequenceDiagram ### Anon Retrieval Check Event List -| Event | Definition | Source of truth | -|------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| -| `anonPieceSelectionStart` | Dealbot begins selecting an anonymous piece for the SP under test from Dealbot's subgraph (size-bucket + indexed/any pool sampling with fallbacks). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | -| `anonPieceSelected` | Subgraph returned a candidate piece (or all fallbacks were exhausted and the check is recorded as `failure.no_piece`). 
| [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | -| `anonPieceFetchStart` | Dealbot initiates `GET {spBaseUrl}/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (logs `anon_retrieval_started`) | -| `anonPieceFetchFirstByteReceived` | First byte received from `/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalFirstByteMs`) | -| `anonPieceFetchLastByteReceived` | Last byte received from `/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalLastByteMs`) | -| `anonCommPVerified` | Response bytes hashed and the resulting CommP compared against the declared `pieceCid`. Inline check; no discrete event emission. Failure flips piece fetch to `failure.commp`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) | -| `anonCarParsed` | Fetched piece bytes are parsed as a CAR and a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is selected. Only runs when the piece advertises IPFS indexing. | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | -| `anonIpniVerificationStart` | Dealbot begins polling filecoinpin.contact for `` provider records. | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | -| `anonIpniVerificationComplete` | IPNI verification finishes (all CIDs resolved to the SP, `IPNI_VERIFICATION_TIMEOUT_MS` reached, or error). | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) (drives `ipni_verify_ms`) | -| `anonBlockFetchStart` | Dealbot starts fetching the sampled CIDs via `GET {spBaseUrl}/ipfs/{cid}?format=raw`. 
| [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | -| `anonBlockFetchComplete` | All sampled block fetches finished; each response was hash-verified against its declared CID (any non-2xx, hash mismatch, unsupported codec, or transport error counts as a failed block). | [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) | -| `anonRetrievalCheckComplete` | Anonymous retrieval check terminates — successful piece fetch (plus optional CAR/IPNI/block-fetch validations) or any failure / abort. Drives the `anon_retrieval_checks` ClickHouse row and `anonRetrievalCheckMs`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) (logs `anon_retrieval_completed`) | +| Event | Definition | Source of truth | +|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| `anonPieceSelectionStart` | Dealbot begins selecting an anonymous piece for the SP under test from Dealbot's subgraph (size-bucket + indexed/any pool sampling with fallbacks). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | +| `anonPieceSelected` | Subgraph returned a candidate piece (or all fallbacks were exhausted and the check is recorded as `failure.no_piece`). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | +| `anonPieceFetchStart` | Dealbot initiates `GET {spBaseUrl}/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (logs `anon_retrieval_started`) | +| `anonPieceFetchFirstByteReceived` | First byte received from `/piece/{pieceCid}`. 
| [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalFirstByteMs`) | +| `anonPieceFetchLastByteReceived` | Last byte received from `/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalLastByteMs`) | +| `anonCommPVerified` | Response bytes hashed and the resulting CommP compared against the declared `pieceCid`. Inline check; no discrete event emission. Failure flips piece fetch to `failure.commp`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) | +| `anonCarParsed` | Fetched piece bytes are parsed as a CAR and a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is selected. Only emitted when the piece advertises IPFS indexing. | [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) | +| `anonIpniVerificationStart` | Dealbot begins polling filecoinpin.contact for `` provider records. | [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) | +| `anonIpniVerificationComplete` | IPNI verification finishes (all CIDs resolved to the SP, `IPNI_VERIFICATION_TIMEOUT_MS` reached, or error). | [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) (drives `ipni_verify_ms`) | +| `anonBlockFetchStart` | Dealbot starts fetching the sampled CIDs via `GET {spBaseUrl}/ipfs/{cid}?format=raw`. | [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) | +| `anonBlockFetchComplete` | All sampled block fetches finished; each response was hash-verified against its declared CID (any non-2xx, hash mismatch, unsupported codec, a timeout, or transport error counts as a failed block). 
| [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) | +| `anonRetrievalCheckComplete` | Anonymous retrieval check terminates — successful piece fetch (plus optional CAR/IPNI/block-fetch validations) or any failure / abort / timeout. Drives the `anon_retrieval_checks` ClickHouse row and `anonRetrievalCheckMs`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) (logs `anon_retrieval_completed`) | ## Metrics @@ -161,7 +161,7 @@ sequenceDiagram | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `anonPieceRetrievalStatus` | Anonymous Retrieval | [`anonRetrievalCheckComplete`](#anonRetrievalCheckComplete) | `success`, `failure.http`, `failure.commp`, `failure.timedout`, `failure.no_piece`, `failure.other` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | [`anonPieceFetchLastByteReceived`](#anonPieceFetchLastByteReceived) | Same as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonCarParseStatus` | Anonymous Retrieval | [`anonCarParsed`](#anonCarParsed) (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonCarParseStatus` | Anonymous Retrieval | [`anonCarParsed`](#anonCarParsed), **or** when CAR parsing didn't run (records `skipped`) | `parseable`, `not_parseable`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | [`anonIpniVerificationComplete`](#anonIpniVerificationComplete), **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonBlockFetchStatus` | Anonymous Retrieval | [`anonBlockFetchComplete`](#anonBlockFetchComplete), **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 72fadca0..6daa1424 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -864,7 +864,7 @@ For each sampled CID, dealbot: - Increase for stronger statistical confidence that the SP serves the entire DAG correctly (more IPNI queries + per-block fetches per check) - Decrease to reduce per-check load on the SP and on filecoinpin.contact -**Note**: A higher sample count multiplies both IPNI traffic and block-fetch traffic per check. The IPNI step is all-or-nothing across the root CID and the sampled child CIDs — see [Anonymous Retrieval § CAR Validation](./checks/anon-retrievals.md#car-validation-only-when-piece-advertises-ipfs-indexing). 
+**Note**: A higher sample count multiplies both IPNI traffic and block-fetch traffic per check. The IPNI step is all-or-nothing across the root CID and the sampled child CIDs — see [Anonymous Retrieval § CAR / IPNI / block-fetch validation](./checks/anon-retrievals.md#car--ipni--block-fetch-validation-only-when-piece-advertises-ipfs-indexing). --- ### `IPFS_BLOCK_FETCH_CONCURRENCY` From 24ef113d4ae5cc4b020de22e976967d26f2105f9 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 12:49:55 +0200 Subject: [PATCH 48/55] docs: claude review --- .../metrics-prometheus.module.ts | 6 +++--- docs/checks/anon-retrievals.md | 16 ++++++++-------- docs/checks/events-and-metrics.md | 8 ++++---- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 4ebeb01a..52d872ec 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -245,17 +245,17 @@ const metricProviders = [ }), makeCounterProvider({ name: "anonCarParseStatus", - help: "Anonymous retrieval CAR parse outcomes (parseable / not_parseable)", + help: "Anonymous retrieval CAR parse outcomes (parseable / not_parseable / skipped)", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), makeCounterProvider({ name: "anonIpniStatus", - help: "Anonymous retrieval IPNI check outcomes (valid / invalid / skipped)", + help: "Anonymous retrieval IPNI check outcomes (valid / invalid / skipped / error)", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), makeCounterProvider({ name: "anonBlockFetchStatus", - help: "Anonymous retrieval block fetch validation outcomes (valid / invalid / skipped)", + help: "Anonymous retrieval block fetch validation outcomes (success / failure / skipped / error)", 
labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), // Storage provider metrics: absolute counts, independent of query filters. diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index f167b306..9e3b729a 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -35,13 +35,13 @@ Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not Selection strategy (per scheduled job, per SP): 1. **Pick a size bucket** by weighted random: - - `small` (1–20 MiB) — 20% - - `medium` (20–50 MiB) — 50% + - `small` (1–10 MiB) — 20% + - `medium` (10–50 MiB) — 50% - `large` (50–100 MiB) — 30% 2. **Pick a pool**: - `indexed` (IPFS-indexed pieces) — 80% - `any` (all FWSS pieces) — 20% -3. **Generate a uniform-random `sampleKey`** and query the subgraph for the smallest `Root.sampleKey ≥ $sampleKey` matching the SP, payer, size range, and pool filters. +3. **Generate a uniform-random `sampleKey`** and query the subgraph for the smallest `Root.sampleKey ≥ $sampleKey` matching the SP, payer, size range, and pool filters. If no such row exists (the random key fell above every matching `sampleKey`), `sampleAnonPiece` retries in the reverse direction (largest `sampleKey < $sampleKey`) so the highest keys are not a dead zone. 4. **Drop the candidate** if `pdpPaymentEndEpoch` has passed. 5. **Fall back** through: (same bucket, opposite pool) → (any bucket, indexed) → (any bucket, any). @@ -59,7 +59,7 @@ flowchart TD Select["Sample anonymous piece for SP from subgraph"] --> Fetch["GET /piece/{pieceCid}"] Fetch --> CommP["Hash bytes → verify CommP"] CommP --> HasIpfs{"piece.withIPFSIndexing
and ipfsRootCid?"} - HasIpfs -- "no" --> Record["Persist Clickhosue row + emit Prometheus metrics"] + HasIpfs -- "no" --> Record["Persist ClickHouse row + emit Prometheus metrics"] HasIpfs -- "yes" --> ParseCar["Parse bytes as CAR"] ParseCar --> SampleBlocks["Pick N random CIDs
(ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT)"] SampleBlocks --> Ipni["IPNI: verify SP advertises root + sampled CIDs"] @@ -72,7 +72,7 @@ flowchart TD ### Piece Fetch - **URL:** `{spBaseUrl}/piece/{pieceCid}` (HTTP/2) -- **Buffered in memory** — piece sizes are capped at 500 MiB by selection. +- **Buffered in memory** — piece sizes are capped at 100 MiB by selection (the upper bound of the `large` bucket). - **Validates CommP** — the CommP of the response bytes must match `pieceCid`. Source: [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) @@ -158,13 +158,13 @@ The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend | `http_response_code` | Raw HTTP status; null on transport failure | | `first_byte_ms`, `last_byte_ms`, `bytes_retrieved`, `throughput_bps` | Piece-fetch performance | | `commp_valid` | Null when retrieval failed before CommP could be hashed | -| `car_status` | `parseable` \| `not_parseable` \| `skipped` \| `error` — mirrors `anonCarParseStatus` | +| `car_status` | `parseable` \| `not_parseable` \| `skipped` — mirrors `anonCarParseStatus` | | `car_block_count` | Total CAR block count; null unless `car_status='parseable'` | | `block_fetch_endpoint` | Gateway base URL probed; null when skipped or SP info missing | -| `block_fetch_status` | `valid` \| `invalid` \| `skipped` \| `error` — mirrors `anonBlockFetchStatus` | +| `block_fetch_status` | `success` \| `failure` \| `skipped` \| `error` — mirrors `anonBlockFetchStatus` | | `block_fetch_sampled_count`, `block_fetch_failed_count` | Sampled / failed block counts; null when skipped | | `ipni_status` | `valid` \| `invalid` \| `skipped` \| `error` — mirrors `anonIpniStatus` | -| `ipni_verify_ms`, `ipni_verified_cids_count`, `ipni_unverified_cids_count` | IPNI check details | +| `ipni_verify_ms` | IPNI verification duration; null when skipped | | `error_message` | Failure reason; null on success | Source: 
[`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index c8cdd691..dbe6d756 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -110,7 +110,7 @@ sequenceDiagram * They are exported via Prometheus. * All Prometheus/OpenTelemetry metrics have label/attributes for: - `network=calibration|mainnet` - - `checkType=dataStorage|retrieval|dataRetention|dataSetCreation` — attribute metrics to a particular check/job + - `checkType=dataStorage|retrieval|anon_retrieval|dataRetention|dataSetCreation` — attribute metrics to a particular check/job - `providerId` — attribute metrics to a particular SP - `providerName` — human-readable name of the SP (defaults to `"unknown"` when not available) - `providerStatus=approved|unapproved` — attribute metrics to only approved SPs for example @@ -159,11 +159,11 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. 
| Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `anonPieceRetrievalStatus` | Anonymous Retrieval | [`anonRetrievalCheckComplete`](#anonRetrievalCheckComplete) | `success`, `failure.http`, `failure.commp`, `failure.timedout`, `failure.no_piece`, `failure.other` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalStatus` | Anonymous Retrieval | [`anonRetrievalCheckComplete`](#anonRetrievalCheckComplete) | `success`, `skipped`, `failure.http`, `failure.commp`, `failure.timedout`, `failure.other` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | [`anonPieceFetchLastByteReceived`](#anonPieceFetchLastByteReceived) | Same as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonCarParseStatus` | Anonymous Retrieval | [`anonCarParsed`](#anonCarParsed), **or** when CAR parsing didn't run (records `skipped`) | `parseable`, `not_parseable`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonCarParseStatus` | Anonymous Retrieval | [`anonCarParsed`](#anonCarParsed), **or** when CAR parsing didn't run (records `skipped`) | `parseable`, `not_parseable`, `skipped` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | [`anonIpniVerificationComplete`](#anonIpniVerificationComplete), **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonBlockFetchStatus` | Anonymous Retrieval | [`anonBlockFetchComplete`](#anonBlockFetchComplete), **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonBlockFetchStatus` | Anonymous Retrieval | [`anonBlockFetchComplete`](#anonBlockFetchComplete), **or** when piece fetch failed (records `skipped`) | `success`, `failure`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ## ClickHouse Tables From 9099370248f3d2ac068b5bcc3a7197f919848867 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 13:20:09 +0200 Subject: [PATCH 49/55] docs: claude consolidation --- apps/backend/src/clickhouse/clickhouse.schema.ts | 4 ++-- docs/checks/anon-retrievals.md | 2 +- docs/checks/events-and-metrics.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index d2298cc4..5584735e 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -90,10 +90,10 @@ export function buildMigrations(database: string): string[] { throughput_bps Nullable(UInt64), -- effective throughput, bytes per second commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed - car_status LowCardinality(String), -- 'parseable' | 'not_parseable' | 'skipped' | 'error' — mirrors anonCarParseStatus; skipped when piece fetch failed, piece is not IPFS-indexed, or the job aborted before parsing + car_status LowCardinality(String), -- 'parseable' | 'not_parseable' | 'skipped' — mirrors anonCarParseStatus; skipped when piece fetch failed, piece is not IPFS-indexed, or the job aborted before parsing car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or not parseable block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. 
{spBaseUrl}/ipfs/); null when skipped - block_fetch_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' — mirrors anonBlockFetchStatus; skipped when CAR validation didn't run or SP info missing + block_fetch_status LowCardinality(String), -- 'success' | 'failure' | 'skipped' | 'error' — mirrors anonBlockFetchStatus; skipped when CAR validation didn't run or SP info missing block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 9e3b729a..55f446e6 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -101,7 +101,7 @@ Source: [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/pi ## Sub-status meanings -Unlike the [Data Storage check](./data-storage.md#deal-status-progression), anonymous retrieval does **not** have a rolled-up status (e.g., `anonRetrievalStatus). Piece retrieval, CAR parsing, IPNI verification, block-fetch outcomes are recorded independently. Each status metric below is emitted exactly once per check, except when `anonPieceRetrievalStatus=failure.no_piece` because selection itself fails. +Unlike the [Data Storage check](./data-storage.md#deal-status-progression), anonymous retrieval does **not** have a rolled-up status (e.g., `anonRetrievalStatus). Piece retrieval, CAR parsing, IPNI verification, block-fetch outcomes are recorded independently. Each status metric below is emitted exactly once per check, except when `anonPieceRetrievalStatus=skipped` because selection itself failed and no HTTP/CAR/IPNI/block-fetch work ran. 
| anonPieceRetrievalStatus | Meaning | |--------|---------| diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 6d430044..36f542a6 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -126,7 +126,7 @@ sequenceDiagram | Event | Definition | Source of truth | |------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| | `anonPieceSelectionStart` | Dealbot begins selecting an anonymous piece for the SP under test from Dealbot's subgraph (size-bucket + indexed/any pool sampling with fallbacks). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | -| `anonPieceSelected` | Subgraph returned a candidate piece (or all fallbacks were exhausted and the check is recorded as `failure.no_piece`). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | +| `anonPieceSelected` | Subgraph returned a candidate piece (or all fallbacks were exhausted and the check is recorded as `skipped`). | [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) | | `anonPieceFetchStart` | Dealbot initiates `GET {spBaseUrl}/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (logs `anon_retrieval_started`) | | `anonPieceFetchFirstByteReceived` | First byte received from `/piece/{pieceCid}`. | [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalFirstByteMs`) | | `anonPieceFetchLastByteReceived` | Last byte received from `/piece/{pieceCid}`. 
| [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) (drives `anonPieceRetrievalLastByteMs`) | From 8d4c1f8cf6bdc3ab409e85cbc33ad8d12aadde78 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 13:22:11 +0200 Subject: [PATCH 50/55] refactor: put newest anon retrieval table last --- .../src/clickhouse/clickhouse.schema.ts | 80 +++++++++---------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 5584735e..5b51a0a1 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -62,6 +62,46 @@ export function buildMigrations(database: string): string[] { PARTITION BY toStartOfMonth(timestamp) TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.data_retention_challenges +( + timestamp DateTime64(3, 'UTC'), -- when the poll ran and detected these periods + probe_location LowCardinality(String), -- dealbot location + sp_address String, -- storage provider address + sp_id Nullable(UInt64), -- storage provider numeric id + sp_name Nullable(String), -- storage provider name + + total_periods_due UInt32, -- cumulative proving periods due (confirmed by subgraph) + total_faulted_periods UInt32, -- cumulative periods where proof was not submitted + total_success_periods UInt32, -- cumulative periods where proof was submitted (= due - faulted) + estimated_overdue_periods UInt32 -- estimated periods not yet recorded on-chain but past deadline +) ENGINE MergeTree() + PRIMARY KEY (probe_location, sp_address, timestamp) + PARTITION BY toStartOfMonth(timestamp) + TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + + `CREATE TABLE IF NOT EXISTS ${database}.pull_checks +( + timestamp DateTime64(3, 'UTC'), -- when the pull check terminated + probe_location LowCardinality(String), -- dealbot location + sp_address String, 
-- storage provider address + sp_id Nullable(UInt64), -- storage provider numeric id + sp_name Nullable(String), -- storage provider name + + piece_cid Nullable(String), -- piece CID of the synthetic test piece; null if preparation failed + piece_size_bytes Nullable(UInt64), -- size of the synthetic piece in bytes; null if preparation failed + + status LowCardinality(String), -- 'success' | 'failure.timedout' | 'failure.other' + provider_status LowCardinality(Nullable(String)), -- raw SP-reported terminal pull status (e.g. 'complete', 'failed'); null if the request was never acknowledged or if waiting for pull status errored or timed out + + acknowledgement_latency_ms Nullable(Float64), -- time from pullPieces submission to SP acknowledgement (ms) + completion_latency_ms Nullable(Float64), -- time from pullPieces submission to terminal SP pull status (ms) + first_byte_ms Nullable(Float64), -- time from pullPieces submission to SP reading first byte of hosted piece (ms); null when check failed before first byte + throughput_bps Nullable(Float64) -- approx bytes/sec = piece_size_bytes / completion_latency_ms * 1000; null on failure +) ENGINE MergeTree() + PRIMARY KEY (probe_location, sp_address, timestamp) + PARTITION BY toStartOfMonth(timestamp) + TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.anon_retrieval_checks ( timestamp DateTime64(3, 'UTC'), -- when the check completed @@ -105,45 +145,5 @@ export function buildMigrations(database: string): string[] { PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, - - `CREATE TABLE IF NOT EXISTS ${database}.data_retention_challenges -( - timestamp DateTime64(3, 'UTC'), -- when the poll ran and detected these periods - probe_location LowCardinality(String), -- dealbot location - sp_address String, -- storage provider address - sp_id Nullable(UInt64), -- storage provider numeric id - sp_name 
Nullable(String), -- storage provider name - - total_periods_due UInt32, -- cumulative proving periods due (confirmed by subgraph) - total_faulted_periods UInt32, -- cumulative periods where proof was not submitted - total_success_periods UInt32, -- cumulative periods where proof was submitted (= due - faulted) - estimated_overdue_periods UInt32 -- estimated periods not yet recorded on-chain but past deadline -) ENGINE MergeTree() - PRIMARY KEY (probe_location, sp_address, timestamp) - PARTITION BY toStartOfMonth(timestamp) - TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, - - `CREATE TABLE IF NOT EXISTS ${database}.pull_checks -( - timestamp DateTime64(3, 'UTC'), -- when the pull check terminated - probe_location LowCardinality(String), -- dealbot location - sp_address String, -- storage provider address - sp_id Nullable(UInt64), -- storage provider numeric id - sp_name Nullable(String), -- storage provider name - - piece_cid Nullable(String), -- piece CID of the synthetic test piece; null if preparation failed - piece_size_bytes Nullable(UInt64), -- size of the synthetic piece in bytes; null if preparation failed - - status LowCardinality(String), -- 'success' | 'failure.timedout' | 'failure.other' - provider_status LowCardinality(Nullable(String)), -- raw SP-reported terminal pull status (e.g. 
'complete', 'failed'); null if the request was never acknowledged or if waiting for pull status errored or timed out - - acknowledgement_latency_ms Nullable(Float64), -- time from pullPieces submission to SP acknowledgement (ms) - completion_latency_ms Nullable(Float64), -- time from pullPieces submission to terminal SP pull status (ms) - first_byte_ms Nullable(Float64), -- time from pullPieces submission to SP reading first byte of hosted piece (ms); null when check failed before first byte - throughput_bps Nullable(Float64) -- approx bytes/sec = piece_size_bytes / completion_latency_ms * 1000; null on failure -) ENGINE MergeTree() - PRIMARY KEY (probe_location, sp_address, timestamp) - PARTITION BY toStartOfMonth(timestamp) - TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, ]; } From 2066a31664ee8b00ffe8be7ca81047d979e602fc Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 13:34:16 +0200 Subject: [PATCH 51/55] fix(retrieval-anon): mark job as failed if no piece was found --- .../backend/src/retrieval-anon/anon-retrieval.service.ts | 9 +-------- docs/checks/anon-retrievals.md | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index c95c7361..a85bd1a2 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -44,14 +44,7 @@ export class AnonRetrievalService { // 1. 
Select an anonymous piece const piece = await this.anonPieceSelectorService.selectPieceForProvider(spAddress, signal); if (!piece) { - this.logger.warn({ - ...logContext, - event: "anon_retrieval_no_piece", - message: "No anonymous piece found for SP", - spAddress, - }); - this.metrics.recordPieceRetrievalStatus(labels, "skipped"); - return; + throw new Error(`No anonymous piece found for SP ${spAddress}`); } this.logger.log({ diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 55f446e6..23a91a02 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -140,7 +140,7 @@ Sources: ## Result Recording -Each anonymous retrieval attempt writes one row to the `anon_retrieval_checks` ClickHouse table. The row is emitted **even on abort or unexpected error** so that the partial evidence (TTFB, bytes, response code) is preserved. +Each anonymous retrieval attempt writes one row to the `anon_retrieval_checks` ClickHouse table unless we could not find a piece to probe for the SP. The row is emitted **even on abort or unexpected error** so that the partial evidence (TTFB, bytes, response code) is preserved. The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend/src/clickhouse/clickhouse.schema.ts) are authoritative. The summary below is for orientation. 
From 24c9f552e726faa7d206bd11157f8f6f79228e28 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 27 May 2026 13:41:30 +0200 Subject: [PATCH 52/55] fix: job service dep indices --- apps/backend/src/jobs/jobs.service.spec.ts | 52 +++++++++++----------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index fad2444e..68acf62b 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -36,18 +36,18 @@ describe("JobsService schedule rows", () => { }; let dataRetentionServiceMock: { pollDataRetention: ReturnType }; let metricsMocks: { - jobsQueuedGauge: JobsServiceDeps[9]; - jobsRetryScheduledGauge: JobsServiceDeps[10]; - oldestQueuedAgeGauge: JobsServiceDeps[11]; - oldestInFlightAgeGauge: JobsServiceDeps[12]; - jobsInFlightGauge: JobsServiceDeps[13]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; - jobsStartedCounter: JobsServiceDeps[15]; - jobsCompletedCounter: JobsServiceDeps[16]; - jobsPausedGauge: JobsServiceDeps[17]; - jobDuration: JobsServiceDeps[18]; - storageProvidersActive: JobsServiceDeps[19]; - storageProvidersTested: JobsServiceDeps[20]; + jobsQueuedGauge: JobsServiceDeps[10]; + jobsRetryScheduledGauge: JobsServiceDeps[11]; + oldestQueuedAgeGauge: JobsServiceDeps[12]; + oldestInFlightAgeGauge: JobsServiceDeps[13]; + jobsInFlightGauge: JobsServiceDeps[14]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[15]; + jobsStartedCounter: JobsServiceDeps[16]; + jobsCompletedCounter: JobsServiceDeps[17]; + jobsPausedGauge: JobsServiceDeps[18]; + jobDuration: JobsServiceDeps[19]; + storageProvidersActive: JobsServiceDeps[20]; + storageProvidersTested: JobsServiceDeps[21]; }; let baseConfigValues: Partial; let configService: JobsServiceDeps[0]; @@ -104,18 +104,18 @@ describe("JobsService schedule rows", () => { }; metricsMocks = { - jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], - 
jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], - oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], - oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], - jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[13], - jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], - jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], - jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[16], - jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[17], - jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[18], - storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[19], - storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[20], + jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], + jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], + oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], + oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[13], + jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[14], + jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], + jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[16], + jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[17], + jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[18], + jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[19], + storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[20], + storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[21], }; const emptySpBlocklists: ISpBlocklistConfig = { @@ -994,7 +994,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, - anonRetrievalService: anonRetrievalService as unknown as JobsServiceDeps[8], + 
anonRetrievalService: anonRetrievalService as unknown as JobsServiceDeps[9], walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); @@ -1510,7 +1510,7 @@ describe("JobsService schedule rows", () => { }; service = buildService({ - anonRetrievalService: anonRetrievalService as unknown as JobsServiceDeps[8], + anonRetrievalService: anonRetrievalService as unknown as JobsServiceDeps[9], walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); From 4714bf8b258b34fafafb39b8565689d5699430d5 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 29 May 2026 08:56:04 +0200 Subject: [PATCH 53/55] Update docs/checks/anon-retrievals.md Co-authored-by: Steve Loeppky --- docs/checks/anon-retrievals.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 23a91a02..e54a04f6 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -101,7 +101,7 @@ Source: [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/pi ## Sub-status meanings -Unlike the [Data Storage check](./data-storage.md#deal-status-progression), anonymous retrieval does **not** have a rolled-up status (e.g., `anonRetrievalStatus). Piece retrieval, CAR parsing, IPNI verification, block-fetch outcomes are recorded independently. Each status metric below is emitted exactly once per check, except when `anonPieceRetrievalStatus=skipped` because selection itself failed and no HTTP/CAR/IPNI/block-fetch work ran. +Unlike the [Data Storage check](./data-storage.md#deal-status-progression), anonymous retrieval does **not** have a rolled-up status (e.g., `anonRetrievalStatus`). Piece retrieval, CAR parsing, IPNI verification, block-fetch outcomes are recorded independently. Each status metric below is emitted exactly once per check, except when `anonPieceRetrievalStatus=skipped` because selection itself failed and no HTTP/CAR/IPNI/block-fetch work ran. 
| anonPieceRetrievalStatus | Meaning | |--------|---------| From b3d7731f771a0f2908d5c220874001b4d02beb94 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 29 May 2026 09:40:21 +0200 Subject: [PATCH 54/55] refactor(retrieval-anon): align check statuses to success/failure.* convention --- .../src/clickhouse/clickhouse.schema.ts | 6 +- apps/backend/src/database/types.ts | 13 ++-- .../anon-retrieval.service.spec.ts | 60 +++++++++---------- .../retrieval-anon/anon-retrieval.service.ts | 14 ++--- .../piece-validation.service.ts | 22 +++---- apps/backend/src/retrieval-anon/types.ts | 13 ++-- docs/checks/anon-retrievals.md | 25 ++++---- docs/checks/events-and-metrics.md | 6 +- 8 files changed, 79 insertions(+), 80 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 5b51a0a1..028837a4 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -130,14 +130,14 @@ export function buildMigrations(database: string): string[] { throughput_bps Nullable(UInt64), -- effective throughput, bytes per second commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed - car_status LowCardinality(String), -- 'parseable' | 'not_parseable' | 'skipped' — mirrors anonCarParseStatus; skipped when piece fetch failed, piece is not IPFS-indexed, or the job aborted before parsing + car_status LowCardinality(String), -- 'success' | 'skipped' | 'failure.not_parseable' — mirrors anonCarParseStatus; skipped when piece fetch failed, piece is not IPFS-indexed, or the job aborted before parsing car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or not parseable block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. 
{spBaseUrl}/ipfs/); null when skipped - block_fetch_status LowCardinality(String), -- 'success' | 'failure' | 'skipped' | 'error' — mirrors anonBlockFetchStatus; skipped when CAR validation didn't run or SP info missing + block_fetch_status LowCardinality(String), -- 'success' | 'skipped' | 'failure.other' — mirrors anonBlockFetchStatus; skipped when CAR validation didn't run or SP info missing block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' — all-or-nothing across the root CID and the sampled child CIDs (filecoin-pin verifies them as a single batch) + ipni_status LowCardinality(String), -- 'success' | 'skipped' | 'failure.timedout' | 'failure.other' — mirrors anonIpniStatus; all-or-nothing across the root CID and the sampled child CIDs (filecoin-pin verifies them as a single batch) ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped error_message Nullable(String) -- failure reason; null on success diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index 58465abf..bf9fbd68 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -29,23 +29,22 @@ export enum IpniStatus { } export enum IpniCheckStatus { - VALID = "valid", - INVALID = "invalid", + SUCCESS = "success", SKIPPED = "skipped", - ERROR = "error", + FAILURE_TIMEDOUT = "failure.timedout", + FAILURE_OTHER = "failure.other", } export enum CarParseStatus { - PARSEABLE = "parseable", - NOT_PARSEABLE = "not_parseable", + SUCCESS = "success", SKIPPED = "skipped", + FAILURE_NOT_PARSEABLE = "failure.not_parseable", } export enum BlockFetchStatus { SUCCESS = "success", - FAILURE = "failure", SKIPPED = "skipped", - ERROR = "error", + 
FAILURE_OTHER = "failure.other", } /** diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index ec767817..9a2298e9 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -87,19 +87,19 @@ function makeService(opts: { opts.parseCarImpl ?? (async () => opts.parseCarOutcome ?? { - status: CarParseStatus.PARSEABLE, + status: CarParseStatus.SUCCESS, blockCount: 0, sampledBlocks: SAMPLED_BLOCKS, }), ); const checkIpniSpy = vi.fn( - opts.checkIpniImpl ?? (async () => opts.checkIpniOutcome ?? { status: IpniCheckStatus.VALID, durationMs: 0 }), + opts.checkIpniImpl ?? (async () => opts.checkIpniOutcome ?? { status: IpniCheckStatus.SUCCESS, durationMs: 0 }), ); const checkBlockFetchSpy = vi.fn( opts.checkBlockFetchImpl ?? (async () => opts.checkBlockFetchOutcome ?? { - status: IpniCheckStatus.VALID, + status: IpniCheckStatus.SUCCESS, sampledCount: 0, failedCount: 0, endpoint: "https://sp.test/ipfs/", @@ -292,8 +292,8 @@ describe("AnonRetrievalService", () => { const { service, insertSpy, parseCarSpy, checkIpniSpy, checkBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 42, sampledBlocks: SAMPLED_BLOCKS }, - checkIpniOutcome: { status: IpniCheckStatus.VALID, durationMs: 137 }, + parseCarOutcome: { status: CarParseStatus.SUCCESS, blockCount: 42, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.SUCCESS, durationMs: 137 }, checkBlockFetchOutcome: { status: BlockFetchStatus.SUCCESS, sampledCount: 5, @@ -310,24 +310,24 @@ describe("AnonRetrievalService", () => { const [, row] = insertSpy.mock.calls[0] as [string, Record]; expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); expect(row.commp_valid).toBe(true); - 
expect(row.car_status).toBe("parseable"); + expect(row.car_status).toBe("success"); expect(row.car_block_count).toBe(42); expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); expect(row.block_fetch_status).toBe("success"); expect(row.block_fetch_sampled_count).toBe(5); expect(row.block_fetch_failed_count).toBe(0); - expect(row.ipni_status).toBe("valid"); + expect(row.ipni_status).toBe("success"); expect(row.ipni_verify_ms).toBe(137); }); - it("distinguishes IPNI invalid from block-fetch failures", async () => { + it("distinguishes IPNI failure.timedout from block-fetch failure.other", async () => { const { service, insertSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 100, sampledBlocks: SAMPLED_BLOCKS }, - checkIpniOutcome: { status: IpniCheckStatus.INVALID, durationMs: 250 }, + parseCarOutcome: { status: CarParseStatus.SUCCESS, blockCount: 100, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.FAILURE_TIMEDOUT, durationMs: 250 }, checkBlockFetchOutcome: { - status: BlockFetchStatus.FAILURE, + status: BlockFetchStatus.FAILURE_OTHER, sampledCount: 5, failedCount: 2, endpoint: "https://sp.test/ipfs/", @@ -340,20 +340,20 @@ describe("AnonRetrievalService", () => { // The piece-fetch path still succeeded — failures are surfaced as // independent dimensions, not folded into piece_fetch_status. 
expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); - expect(row.car_status).toBe("parseable"); - expect(row.ipni_status).toBe("invalid"); - expect(row.block_fetch_status).toBe("failure"); + expect(row.car_status).toBe("success"); + expect(row.ipni_status).toBe("failure.timedout"); + expect(row.block_fetch_status).toBe("failure.other"); expect(row.block_fetch_sampled_count).toBe(5); expect(row.block_fetch_failed_count).toBe(2); }); - it("skips downstream dimensions when parseCar returns NOT_PARSEABLE", async () => { + it("skips downstream dimensions when parseCar returns failure.not_parseable", async () => { // The decoupled service guarantees that an unparseable CAR never even // attempts IPNI or block fetch — there are no CIDs to verify or fetch. const { service, insertSpy, parseCarSpy, checkIpniSpy, checkBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("not-a-car")), piece: INDEXED_PIECE, - parseCarOutcome: { status: CarParseStatus.NOT_PARSEABLE }, + parseCarOutcome: { status: CarParseStatus.FAILURE_NOT_PARSEABLE }, }); await service.performForProvider(SP_ADDRESS); @@ -363,7 +363,7 @@ describe("AnonRetrievalService", () => { expect(checkBlockFetchSpy).not.toHaveBeenCalled(); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.car_status).toBe("not_parseable"); + expect(row.car_status).toBe("failure.not_parseable"); expect(row.car_block_count).toBeNull(); expect(row.block_fetch_sampled_count).toBeNull(); expect(row.block_fetch_endpoint).toBeNull(); @@ -379,7 +379,7 @@ describe("AnonRetrievalService", () => { const { service, insertSpy, metricsRecordIpniSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, + parseCarOutcome: { status: CarParseStatus.SUCCESS, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, checkIpniOutcome: { status: IpniCheckStatus.SKIPPED, durationMs: null }, 
checkBlockFetchOutcome: { status: BlockFetchStatus.SUCCESS, @@ -395,19 +395,19 @@ describe("AnonRetrievalService", () => { const [, row] = insertSpy.mock.calls[0] as [string, Record]; expect(row.ipni_status).toBe("skipped"); // car_status / block_fetch_status remain whatever their own steps returned. - expect(row.car_status).toBe("parseable"); + expect(row.car_status).toBe("success"); expect(row.block_fetch_status).toBe("success"); }); - it("propagates checkIpni's ERROR status only to ipni_status (not other dimensions)", async () => { + it("propagates checkIpni's failure.other status only to ipni_status (not other dimensions)", async () => { // The whole point of decoupling: an unexpected throw in IPNI verification // cannot bleed into car_status or block_fetch_status. const { service, insertSpy, metricsRecordIpniSpy, metricsRecordCarParseSpy, metricsRecordBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, - checkIpniOutcome: { status: IpniCheckStatus.ERROR, durationMs: null }, + parseCarOutcome: { status: CarParseStatus.SUCCESS, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.FAILURE_OTHER, durationMs: null }, checkBlockFetchOutcome: { status: BlockFetchStatus.SUCCESS, sampledCount: 1, @@ -418,13 +418,13 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - expect(metricsRecordCarParseSpy).toHaveBeenCalledWith(expect.anything(), CarParseStatus.PARSEABLE); - expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), IpniCheckStatus.ERROR); + expect(metricsRecordCarParseSpy).toHaveBeenCalledWith(expect.anything(), CarParseStatus.SUCCESS); + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), IpniCheckStatus.FAILURE_OTHER); expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), 
BlockFetchStatus.SUCCESS); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.car_status).toBe("parseable"); - expect(row.ipni_status).toBe("error"); + expect(row.car_status).toBe("success"); + expect(row.ipni_status).toBe("failure.other"); expect(row.block_fetch_status).toBe("success"); }); @@ -432,8 +432,8 @@ describe("AnonRetrievalService", () => { const { service, insertSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, - parseCarOutcome: { status: CarParseStatus.PARSEABLE, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, - checkIpniOutcome: { status: IpniCheckStatus.VALID, durationMs: 50 }, + parseCarOutcome: { status: CarParseStatus.SUCCESS, blockCount: 1, sampledBlocks: SAMPLED_BLOCKS }, + checkIpniOutcome: { status: IpniCheckStatus.SUCCESS, durationMs: 50 }, checkBlockFetchOutcome: { status: BlockFetchStatus.SKIPPED, sampledCount: 1, @@ -446,8 +446,8 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.car_status).toBe("parseable"); - expect(row.ipni_status).toBe("valid"); + expect(row.car_status).toBe("success"); + expect(row.ipni_status).toBe("success"); expect(row.block_fetch_status).toBe("skipped"); expect(row.block_fetch_endpoint).toBeNull(); expect(row.block_fetch_failed_count).toBeNull(); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index a85bd1a2..6c514985 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -95,7 +95,7 @@ export class AnonRetrievalService { try { parse = await this.pieceValidationService.parseCar(pieceResult.pieceBytes, signal); - if (parse.status === CarParseStatus.PARSEABLE) { + if (parse.status === CarParseStatus.SUCCESS) { ipni = await this.pieceValidationService.checkIpni( provider, 
piece.ipfsRootCid, @@ -161,11 +161,11 @@ export class AnonRetrievalService { throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, commp_valid: !finalPieceResult.aborted && finalPieceResult.httpSuccess ? finalPieceResult.commPValid : null, car_status: carStatus, - car_block_count: parse && parse.status === CarParseStatus.PARSEABLE ? parse.blockCount : null, + car_block_count: parse && parse.status === CarParseStatus.SUCCESS ? parse.blockCount : null, block_fetch_endpoint: blockFetch?.endpoint ?? null, block_fetch_status: blockFetchStatus, block_fetch_sampled_count: - parse?.status === CarParseStatus.PARSEABLE && blockFetch ? blockFetch.sampledCount : null, + parse?.status === CarParseStatus.SUCCESS && blockFetch ? blockFetch.sampledCount : null, block_fetch_failed_count: blockFetch?.failedCount ?? null, ipni_status: ipniStatus, ipni_verify_ms: ipni?.durationMs ?? null, @@ -216,8 +216,8 @@ function anonPieceRetrievalStatus(pieceResult: PieceRetrievalResult): string { * The per-dimension statuses default to SKIPPED whenever the dimension's * prerequisite wasn't met — no IPFS indexing, piece fetch failed, the job * was aborted, or an upstream dimension didn't produce a usable result. - * Service methods only ever return their concrete outcomes (VALID, INVALID, - * NOT_PARSEABLE, etc.); SKIPPED is the helper's contribution. + * Service methods only ever return their concrete outcomes (success, + * failure.*, etc.); SKIPPED is the helper's contribution. 
*/ function carStatusForRow(parse: CarParseOutcome | null): CarParseStatus { if (!parse) return CarParseStatus.SKIPPED; @@ -225,13 +225,13 @@ function carStatusForRow(parse: CarParseOutcome | null): CarParseStatus { } function ipniStatusForRow(parse: CarParseOutcome | null, ipni: IpniCheckOutcome | null): IpniCheckStatus { - if (!parse || parse.status !== CarParseStatus.PARSEABLE) return IpniCheckStatus.SKIPPED; + if (!parse || parse.status !== CarParseStatus.SUCCESS) return IpniCheckStatus.SKIPPED; if (!ipni) return IpniCheckStatus.SKIPPED; return ipni.status; } function blockFetchStatusForRow(parse: CarParseOutcome | null, blockFetch: BlockFetchOutcome | null): BlockFetchStatus { - if (!parse || parse.status !== CarParseStatus.PARSEABLE) return BlockFetchStatus.SKIPPED; + if (!parse || parse.status !== CarParseStatus.SUCCESS) return BlockFetchStatus.SKIPPED; if (!blockFetch) return BlockFetchStatus.SKIPPED; return blockFetch.status; } diff --git a/apps/backend/src/retrieval-anon/piece-validation.service.ts b/apps/backend/src/retrieval-anon/piece-validation.service.ts index 191b5887..99cfcc44 100644 --- a/apps/backend/src/retrieval-anon/piece-validation.service.ts +++ b/apps/backend/src/retrieval-anon/piece-validation.service.ts @@ -43,7 +43,7 @@ export class PieceValidationService { * for downstream IPNI + block-fetch checks. CAR parse failure is * attributed to the client (bad upload), not the SP. * - * Returns `NOT_PARSEABLE` on parser exceptions. Propagates abort. + * Returns `failure.not_parseable` on parser exceptions. Propagates abort. 
*/ async parseCar(pieceBytes: Buffer, signal?: AbortSignal): Promise { let blocks: SampledBlock[]; @@ -56,12 +56,12 @@ export class PieceValidationService { message: "Failed to parse piece bytes as CAR - client fault, not SP", error: toStructuredError(error), }); - return { status: CarParseStatus.NOT_PARSEABLE }; + return { status: CarParseStatus.FAILURE_NOT_PARSEABLE }; } if (blocks.length === 0) { return { - status: CarParseStatus.PARSEABLE, + status: CarParseStatus.SUCCESS, blockCount: 0, sampledBlocks: [], }; @@ -72,7 +72,7 @@ export class PieceValidationService { const sampledBlocks = shuffled.slice(0, sampleCount); return { - status: CarParseStatus.PARSEABLE, + status: CarParseStatus.SUCCESS, blockCount: blocks.length, sampledBlocks, }; @@ -116,7 +116,7 @@ export class PieceValidationService { signal, }); return { - status: result.rootCIDVerified ? IpniCheckStatus.VALID : IpniCheckStatus.INVALID, + status: result.rootCIDVerified ? IpniCheckStatus.SUCCESS : IpniCheckStatus.FAILURE_TIMEDOUT, durationMs: result.durationMs, }; } catch (error) { @@ -128,16 +128,16 @@ export class PieceValidationService { ipfsRootCid, error: toStructuredError(error), }); - return { status: IpniCheckStatus.ERROR, durationMs: null }; + return { status: IpniCheckStatus.FAILURE_OTHER, durationMs: null }; } } /** * Fetch each sampled block from the SP endpoint and hash-verify the * response against the declared CID. SKIPPED when SP info is missing - * (not the SP's fault — we couldn't even find the gateway). ERROR is - * reserved for unexpected exceptions outside the per-block loop; - * per-block failures aggregate into `failedCount` and map to INVALID. + * (not the SP's fault — we couldn't even find the gateway). Both per-block + * verification failures (aggregated into `failedCount`) and unexpected + * exceptions outside the per-block loop map to FAILURE_OTHER. 
*/ async checkBlockFetch( sampledBlocks: ReadonlyArray, @@ -166,7 +166,7 @@ export class PieceValidationService { } } return { - status: failedCount === 0 ? BlockFetchStatus.SUCCESS : BlockFetchStatus.FAILURE, + status: failedCount === 0 ? BlockFetchStatus.SUCCESS : BlockFetchStatus.FAILURE_OTHER, sampledCount: sampledBlocks.length, failedCount, endpoint, @@ -180,7 +180,7 @@ export class PieceValidationService { error: toStructuredError(error), }); return { - status: BlockFetchStatus.ERROR, + status: BlockFetchStatus.FAILURE_OTHER, sampledCount: sampledBlocks.length, failedCount: null, endpoint, diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts index 959d5c0f..02d7d8c4 100644 --- a/apps/backend/src/retrieval-anon/types.ts +++ b/apps/backend/src/retrieval-anon/types.ts @@ -32,17 +32,17 @@ export type PieceRetrievalResult = { export type SampledBlock = { cid: CID; bytes: Uint8Array }; /** - * Result of CAR parsing. SKIPPED / ERROR are never produced here — the + * Result of CAR parsing. SKIPPED is never produced here — the * caller decides "this dimension never ran" semantics. */ export type CarParseOutcome = - | { status: CarParseStatus.PARSEABLE; blockCount: number; sampledBlocks: SampledBlock[] } - | { status: CarParseStatus.NOT_PARSEABLE; errorMessage?: string }; + | { status: CarParseStatus.SUCCESS; blockCount: number; sampledBlocks: SampledBlock[] } + | { status: CarParseStatus.FAILURE_NOT_PARSEABLE; errorMessage?: string }; /** * Result of an IPNI verification attempt. `SKIPPED` is returned when a - * structural prerequisite couldn't be met (root CID won't parse). `ERROR` - * is reserved for unexpected exceptions raised by the verifier. + * structural prerequisite couldn't be met (root CID won't parse). + * `FAILURE_OTHER` is reserved for unexpected exceptions raised by the verifier. 
*/ export type IpniCheckOutcome = { status: IpniCheckStatus; @@ -52,7 +52,8 @@ export type IpniCheckOutcome = { /** * Result of the block-fetch sampling step. `SKIPPED` is returned when a * structural prerequisite couldn't be met (SP info not registered). - * `ERROR` is reserved for unexpected exceptions raised by the fetcher. + * `FAILURE_OTHER` covers both block verification failures and unexpected + * exceptions raised by the fetcher. */ export type BlockFetchOutcome = { status: BlockFetchStatus; diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index e54a04f6..292b63bc 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -85,7 +85,7 @@ When the selected piece has `withIPFSIndexing = true` and a non-null `ipfsRootCi 2. **IPNI check:** `IpniVerificationService.verify(rootCid, sampledCids, sp)` polls filecoinpin.contact until each CID resolves to the SP under test, the timeout fires, or `IPNI_VERIFICATION_TIMEOUT_MS` is reached. 3. **Block fetch check:** for each sampled CID, fetch `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verify the response against the CID. Non-2xx, hash mismatch, unsupported codec, or transport errors all count as a single failed block. -CAR parse failure (`not_parseable`) is attributed to the client (bad upload), not the SP. When the CAR is unparseable, IPNI and block fetch are skipped because there are no sampleable CIDs to verify or fetch. +CAR parse failure (`failure.not_parseable`) is attributed to the client (bad upload), not the SP. When the CAR is unparseable, IPNI and block fetch are skipped because there are no sampleable CIDs to verify or fetch. 
Source: [`piece-validation.service.ts`](../../apps/backend/src/retrieval-anon/piece-validation.service.ts) @@ -114,23 +114,22 @@ Unlike the [Data Storage check](./data-storage.md#deal-status-progression), anon | anonCarParseStatus | Meaning | |--------|-------------------------------------------------------------------------------------------------------------------------------------------------------| -| `parseable` | The fetched piece bytes were successfully parsed as a CAR by `@ipld/car`. | -| `not_parseable` | The fetched piece bytes could not be parsed as a CAR (malformed header, truncated content, unexpected encoding, or parser threw an error). | +| `success` | The fetched piece bytes were successfully parsed as a CAR by `@ipld/car`. | | `skipped` | CAR parsing was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, or the job aborted before parsing. | +| `failure.not_parseable` | The fetched piece bytes could not be parsed as a CAR (malformed header, truncated content, unexpected encoding, or parser threw an error). Attributed to the client (bad upload), not the SP. | | anonIpniStatus | Meaning | |--------|---------| -| `valid` | filecoinpin.contact returned the SP as a provider for the root CID **and** every sampled child CID within `IPNI_VERIFICATION_TIMEOUT_MS`. | -| `invalid` | IPNI was queried but at least one CID never resolved to the SP under test before the timeout (or the timeout fired with unresolved CIDs). | -| `skipped` | IPNI verification was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, CAR parsing returned `not_parseable`, the root CID itself failed to parse, or the job aborted. | -| `error` | IPNI verification was attempted and `IpniVerificationService.verify` threw unexpectedly (transport error, service down, etc.). | +| `success` | filecoinpin.contact returned the SP as a provider for the root CID **and** every sampled child CID within `IPNI_VERIFICATION_TIMEOUT_MS`. 
| +| `skipped` | IPNI verification was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, CAR parsing returned `failure.not_parseable`, the root CID itself failed to parse, or the job aborted. | +| `failure.timedout` | IPNI was queried but at least one CID never resolved to the SP under test before `IPNI_VERIFICATION_TIMEOUT_MS` (the poll loop exhausted its timeout with unresolved CIDs). | +| `failure.other` | IPNI verification was attempted and `IpniVerificationService.verify` threw unexpectedly (transport error, service down, etc.). | | anonBlockFetchStatus | Meaning | |--------|---------| | `success` | Every sampled CID was fetched via `GET {spBaseUrl}/ipfs/{cid}?format=raw` and the response bytes hash-verified against the declared CID. | -| `failure` | At least one sampled block fetch failed: non-2xx HTTP, hash mismatch, unsupported codec, unsupported hash, or transport error. Each failed sample counts as one failed block. | -| `skipped` | Block-fetch sampling was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, CAR parsing returned `not_parseable`, or the job aborted. | -| `error` | Block-fetch sampling was attempted but the loop threw unexpectedly outside the per-block try/catch. | +| `skipped` | Block-fetch sampling was not attempted — piece fetch failed, the piece does not advertise IPFS indexing, CAR parsing returned `failure.not_parseable`, or the job aborted. | +| `failure.other` | At least one sampled block fetch failed (non-2xx HTTP, hash mismatch, unsupported codec, unsupported hash, or transport error — each failed sample counts as one failed block), **or** the sampling loop threw unexpectedly outside the per-block try/catch. Per-block granularity lives in `block_fetch_failed_count`. 
| Sources: - [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) — orchestrates the dimensions and emits the four status metrics @@ -158,12 +157,12 @@ The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend | `http_response_code` | Raw HTTP status; null on transport failure | | `first_byte_ms`, `last_byte_ms`, `bytes_retrieved`, `throughput_bps` | Piece-fetch performance | | `commp_valid` | Null when retrieval failed before CommP could be hashed | -| `car_status` | `parseable` \| `not_parseable` \| `skipped` — mirrors `anonCarParseStatus` | +| `car_status` | `success` \| `skipped` \| `failure.not_parseable` — mirrors `anonCarParseStatus` | | `car_block_count` | Total CAR block count; null unless `car_status='success'` | | `block_fetch_endpoint` | Gateway base URL probed; null when skipped or SP info missing | -| `block_fetch_status` | `success` \| `failure` \| `skipped` \| `error` — mirrors `anonBlockFetchStatus` | +| `block_fetch_status` | `success` \| `skipped` \| `failure.other` — mirrors `anonBlockFetchStatus` | | `block_fetch_sampled_count`, `block_fetch_failed_count` | Sampled / failed block counts; null when skipped | -| `ipni_status` | `valid` \| `invalid` \| `skipped` \| `error` — mirrors `anonIpniStatus` | +| `ipni_status` | `success` \| `skipped` \| `failure.timedout` \| `failure.other` — mirrors `anonIpniStatus` | | `ipni_verify_ms` | IPNI verification duration; null when skipped | | `error_message` | Failure reason; null on success | diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 5ca64056..7d0bf6fd 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -201,9 +201,9 @@ sequenceDiagram | `pullCheckStatus` | Pull | When the [Pull Check](./pull-check.md) terminates (success after direct piece validation, or any failure). Recorded exactly once per check. | `success`, `failure.timedout`, `failure.other`.
Failure classification follows [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) (timeout-keyed errors → `failure.timedout`, everything else → `failure.other`). | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `anonPieceRetrievalStatus` | Anonymous Retrieval | [`anonRetrievalCheckComplete`](#anonRetrievalCheckComplete) | `success`, `skipped`, `failure.http`, `failure.commp`, `failure.timedout`, `failure.other` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | [`anonPieceFetchLastByteReceived`](#anonPieceFetchLastByteReceived) | Same as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonCarParseStatus` | Anonymous Retrieval | [`anonCarParsed`](#anonCarParsed), **or** when CAR parsing didn't run (records `skipped`) | `parseable`, `not_parseable`, `skipped` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonIpniStatus` | Anonymous Retrieval | [`anonIpniVerificationComplete`](#anonIpniVerificationComplete), **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | -| `anonBlockFetchStatus` | Anonymous Retrieval | [`anonBlockFetchComplete`](#anonBlockFetchComplete), **or** when piece fetch failed (records `skipped`) | `success`, `failure`, `skipped`, `error` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonCarParseStatus` | Anonymous Retrieval | [`anonCarParsed`](#anonCarParsed), **or** when CAR parsing didn't run (records `skipped`) | `success`, `skipped`, `failure.not_parseable` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonIpniStatus` | Anonymous Retrieval | [`anonIpniVerificationComplete`](#anonIpniVerificationComplete), **or** when piece fetch failed (records `skipped`) | `success`, `skipped`, `failure.timedout`, `failure.other` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonBlockFetchStatus` | Anonymous Retrieval | [`anonBlockFetchComplete`](#anonBlockFetchComplete), **or** when piece fetch failed (records `skipped`) | `success`, `skipped`, `failure.other` from [Anonymous Retrieval Sub-status meanings](./anon-retrievals.md#sub-status-meanings). 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ## ClickHouse Tables From 4666e0cc8056bedcdd663bdca06f7d0091a93ea7 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 29 May 2026 09:51:30 +0200 Subject: [PATCH 55/55] docs: clarify currently unused probe location field --- docs/checks/anon-retrievals.md | 2 +- docs/environment-variables.md | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 292b63bc..ba4b7495 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -146,7 +146,7 @@ The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend | Column | Meaning | |--------|---------| | `timestamp` | When the check started (ms UTC) | -| `probe_location` | Dealbot probe location (`DEALBOT_PROBE_LOCATION`) | +| `probe_location` | Dealbot probe location (`DEALBOT_PROBE_LOCATION`) - currently unused and set to `unknown` until https://github.com/FilOzone/dealbot/issues/246 lands | | `sp_address`, `sp_id`, `sp_name` | SP identity | | `retrieval_id` | Per-event UUID; correlates row to logs and Prometheus | | `piece_cid`, `data_set_id`, `piece_id`, `raw_size` | Sampled piece identity | diff --git a/docs/environment-variables.md b/docs/environment-variables.md index babce872..95275675 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -1268,6 +1268,9 @@ CLICKHOUSE_URL=http://default:password@clickhouse-host:8123/dealbot ### `DEALBOT_PROBE_LOCATION` +> [!NOTE] +> Currently unused and set to `unknown` until https://github.com/FilOzone/dealbot/issues/246 is resolved. + - **Type**: `string` - **Required**: No - **Default**: `unknown`