From f0855e2df09d39ae6df2257289e25f3bdc3663ae Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 29 Apr 2026 18:48:48 +0530 Subject: [PATCH 01/44] chore: upgrade synapse to latest --- apps/backend/package.json | 4 +- pnpm-lock.yaml | 98 +++++++++++++++++++++++++++++++++++---- 2 files changed, 91 insertions(+), 11 deletions(-) diff --git a/apps/backend/package.json b/apps/backend/package.json index ec4ea66a..8994e7e4 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -30,8 +30,8 @@ }, "dependencies": { "@clickhouse/client": "^1.11.0", - "@filoz/synapse-core": "0.3.3", - "@filoz/synapse-sdk": "0.40.2", + "@filoz/synapse-core": "0.4.1", + "@filoz/synapse-sdk": "0.40.4", "@ipld/car": "^5.4.2", "@ipld/dag-pb": "^4.1.4", "@nestjs/axios": "^4.0.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6f0f3e90..022a2231 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -45,11 +45,11 @@ importers: specifier: ^1.11.0 version: 1.18.2 '@filoz/synapse-core': - specifier: 0.3.3 - version: 0.3.3(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) + specifier: 0.4.1 + version: 0.4.1(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) '@filoz/synapse-sdk': - specifier: 0.40.2 - version: 0.40.2(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) + specifier: 0.40.4 + version: 0.40.4(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) '@ipld/car': specifier: ^5.4.2 version: 5.4.2 @@ -561,24 +561,28 @@ packages: engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] + libc: [musl] '@biomejs/cli-linux-arm64@2.3.14': resolution: {integrity: sha512-KT67FKfzIw6DNnUNdYlBg+eU24Go3n75GWK6NwU4+yJmDYFe9i/MjiI+U/iEzKvo0g7G7MZqoyrhIYuND2w8QQ==} engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] + libc: [glibc] '@biomejs/cli-linux-x64-musl@2.3.14': resolution: {integrity: sha512-KQU7EkbBBuHPW3/rAcoiVmhlPtDSGOGRPv9js7qJVpYTzjQmVR+C9Rfcz+ti8YCH+zT1J52tuBybtP4IodjxZQ==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] + libc: [musl] 
'@biomejs/cli-linux-x64@2.3.14': resolution: {integrity: sha512-ZsZzQsl9U+wxFrGGS4f6UxREUlgHwmEfu1IrXlgNFrNnd5Th6lIJr8KmSzu/+meSa9f4rzFrbEW9LBBA6ScoMA==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] + libc: [glibc] '@biomejs/cli-win32-arm64@2.3.14': resolution: {integrity: sha512-+IKYkj/pUBbnRf1G1+RlyA3LWiDgra1xpS7H2g4BuOzzRbRB+hmlw0yFsLprHhbbt7jUzbzAbAjK/Pn0FDnh1A==} @@ -864,8 +868,13 @@ packages: peerDependencies: viem: 2.x - '@filoz/synapse-sdk@0.40.2': - resolution: {integrity: sha512-JRzCMYAquFsYtd72Lw0FJB90ad06g/qPBCxs7lmfj5nxjbBwiiBA1IUA0vBbud1YQ5FEvXDBAs8J/FiRd6p5+A==} + '@filoz/synapse-core@0.4.1': + resolution: {integrity: sha512-Psj2YpIxNh+nxJN0wQdYMBTQRRhq1gR/C9kosI39Kx6y+lV8ppw02c6mPeHEaa47AG3KfUqQyMb3xqurOlwraQ==} + peerDependencies: + viem: 2.x + + '@filoz/synapse-sdk@0.40.4': + resolution: {integrity: sha512-MRofQ3EixagTglo3nqWPvTyw15LtlPIFryCWN+1swgZk7JTSuFIqVunBphF5R5yFiMC0bbGohBCqrI4CIw+eCQ==} peerDependencies: viem: 2.x @@ -2011,56 +2020,67 @@ packages: resolution: {integrity: sha512-UPMMNeC4LXW7ZSHxeP3Edv09aLsFUMaD1TSVW6n1CWMECnUIJMFFB7+XC2lZTdPtvB36tYC0cJWc86mzSsaviw==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.53.4': resolution: {integrity: sha512-H8uwlV0otHs5Q7WAMSoyvjV9DJPiy5nJ/xnHolY0QptLPjaSsuX7tw+SPIfiYH6cnVx3fe4EWFafo6gH6ekZKA==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.53.4': resolution: {integrity: sha512-BLRwSRwICXz0TXkbIbqJ1ibK+/dSBpTJqDClF61GWIrxTXZWQE78ROeIhgl5MjVs4B4gSLPCFeD4xML9vbzvCQ==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.53.4': resolution: {integrity: sha512-6bySEjOTbmVcPJAywjpGLckK793A0TJWSbIa0sVwtVGfe/Nz6gOWHOwkshUIAp9j7wg2WKcA4Snu7Y1nUZyQew==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loong64-gnu@4.53.4': resolution: {integrity: sha512-U0ow3bXYJZ5MIbchVusxEycBw7bO6C2u5UvD31i5IMTrnt2p4Fh4ZbHSdc/31TScIJQYHwxbj05BpevB3201ug==} cpu: [loong64] os: [linux] + libc: [glibc] 
'@rollup/rollup-linux-ppc64-gnu@4.53.4': resolution: {integrity: sha512-iujDk07ZNwGLVn0YIWM80SFN039bHZHCdCCuX9nyx3Jsa2d9V/0Y32F+YadzwbvDxhSeVo9zefkoPnXEImnM5w==} cpu: [ppc64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.53.4': resolution: {integrity: sha512-MUtAktiOUSu+AXBpx1fkuG/Bi5rhlorGs3lw5QeJ2X3ziEGAq7vFNdWVde6XGaVqi0LGSvugwjoxSNJfHFTC0g==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.53.4': resolution: {integrity: sha512-btm35eAbDfPtcFEgaXCI5l3c2WXyzwiE8pArhd66SDtoLWmgK5/M7CUxmUglkwtniPzwvWioBKKl6IXLbPf2sQ==} cpu: [riscv64] os: [linux] + libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.53.4': resolution: {integrity: sha512-uJlhKE9ccUTCUlK+HUz/80cVtx2RayadC5ldDrrDUFaJK0SNb8/cCmC9RhBhIWuZ71Nqj4Uoa9+xljKWRogdhA==} cpu: [s390x] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.53.4': resolution: {integrity: sha512-jjEMkzvASQBbzzlzf4os7nzSBd/cvPrpqXCUOqoeCh1dQ4BP3RZCJk8XBeik4MUln3m+8LeTJcY54C/u8wb3DQ==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.53.4': resolution: {integrity: sha512-lu90KG06NNH19shC5rBPkrh6mrTpq5kviFylPBXQVpdEu0yzb0mDgyxLr6XdcGdBIQTH/UAhDJnL+APZTBu1aQ==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-openharmony-arm64@4.53.4': resolution: {integrity: sha512-dFDcmLwsUzhAm/dn0+dMOQZoONVYBtgik0VuY/d5IJUUb787L3Ko/ibvTvddqhb3RaB7vFEozYevHN4ox22R/w==} @@ -2198,24 +2218,28 @@ packages: engines: {node: '>=10'} cpu: [arm64] os: [linux] + libc: [glibc] '@swc/core-linux-arm64-musl@1.15.11': resolution: {integrity: sha512-PYftgsTaGnfDK4m6/dty9ryK1FbLk+LosDJ/RJR2nkXGc8rd+WenXIlvHjWULiBVnS1RsjHHOXmTS4nDhe0v0w==} engines: {node: '>=10'} cpu: [arm64] os: [linux] + libc: [musl] '@swc/core-linux-x64-gnu@1.15.11': resolution: {integrity: sha512-DKtnJKIHiZdARyTKiX7zdRjiDS1KihkQWatQiCHMv+zc2sfwb4Glrodx2VLOX4rsa92NLR0Sw8WLcPEMFY1szQ==} engines: {node: '>=10'} cpu: [x64] os: [linux] + libc: [glibc] '@swc/core-linux-x64-musl@1.15.11': resolution: {integrity: 
sha512-mUjjntHj4+8WBaiDe5UwRNHuEzLjIWBTSGTw0JT9+C9/Yyuh4KQqlcEQ3ro6GkHmBGXBFpGIj/o5VMyRWfVfWw==} engines: {node: '>=10'} cpu: [x64] os: [linux] + libc: [musl] '@swc/core-win32-arm64-msvc@1.15.11': resolution: {integrity: sha512-ZkNNG5zL49YpaFzfl6fskNOSxtcZ5uOYmWBkY4wVAvgbSAQzLRVBp+xArGWh2oXlY/WgL99zQSGTv7RI5E6nzA==} @@ -2292,24 +2316,28 @@ packages: engines: {node: '>= 20'} cpu: [arm64] os: [linux] + libc: [glibc] '@tailwindcss/oxide-linux-arm64-musl@4.2.4': resolution: {integrity: sha512-bBADEGAbo4ASnppIziaQJelekCxdMaxisrk+fB7Thit72IBnALp9K6ffA2G4ruj90G9XRS2VQ6q2bCKbfFV82g==} engines: {node: '>= 20'} cpu: [arm64] os: [linux] + libc: [musl] '@tailwindcss/oxide-linux-x64-gnu@4.2.4': resolution: {integrity: sha512-7Mx25E4WTfnht0TVRTyC00j3i0M+EeFe7wguMDTlX4mRxafznw0CA8WJkFjWYH5BlgELd1kSjuU2JiPnNZbJDA==} engines: {node: '>= 20'} cpu: [x64] os: [linux] + libc: [glibc] '@tailwindcss/oxide-linux-x64-musl@4.2.4': resolution: {integrity: sha512-2wwJRF7nyhOR0hhHoChc04xngV3iS+akccHTGtz965FwF0up4b2lOdo6kI1EbDaEXKgvcrFBYcYQQ/rrnWFVfA==} engines: {node: '>= 20'} cpu: [x64] os: [linux] + libc: [musl] '@tailwindcss/oxide-wasm32-wasi@4.2.4': resolution: {integrity: sha512-FQsqApeor8Fo6gUEklzmaa9994orJZZDBAlQpK2Mq+DslRKFJeD6AjHpBQ0kZFQohVr8o85PPh8eOy86VlSCmw==} @@ -4301,6 +4329,10 @@ packages: resolution: {integrity: sha512-6oIwpsgRfnDiyEDLMay/GqCl3HoAtH5+RUKW29gYkL0QA+ipzpDLA16yQs7/RHCSu+BwgbJaOUqa4A99qNVQVw==} engines: {node: '>=16'} + is-network-error@1.3.1: + resolution: {integrity: sha512-6QCxa49rQbmUWLfk0nuGqzql9U8uaV2H6279bRErPBHe/109hCzsLUBUHfbEtvLIHBd6hyXbgedBSHevm43Edw==} + engines: {node: '>=16'} + is-node-process@1.2.0: resolution: {integrity: sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==} @@ -4373,6 +4405,9 @@ packages: iso-web@2.1.1: resolution: {integrity: sha512-P3qFt9hVgJx5lgUHY6TBoI575SHT7vt6BswXbcqd3BTZkBtEH59QxP6gMCtAACHxoWezbK2lTPj4yBoTBADDxQ==} + iso-web@2.2.1: + resolution: {integrity: 
sha512-4pkaxMAK089Gt+ua046Y0vGu7V7V7+P/2fEzlxYK0ssMvQFufaqkETHhoid+422L/kjU7aw9j1BxorpG6jmtXw==} + isomorphic-ws@4.0.1: resolution: {integrity: sha512-BhBvN2MBpWTaSHdWRb/bwdZJ1WaehQ2L1KngkCkfLUGF0mAWAT1sQUQacEmQ0jXkFw/czDXPNQSL5u2/Krsz1w==} peerDependencies: @@ -4686,24 +4721,28 @@ packages: engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + libc: [glibc] lightningcss-linux-arm64-musl@1.32.0: resolution: {integrity: sha512-UpQkoenr4UJEzgVIYpI80lDFvRmPVg6oqboNHfoH4CQIfNA+HOrZ7Mo7KZP02dC6LjghPQJeBsvXhJod/wnIBg==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] + libc: [musl] lightningcss-linux-x64-gnu@1.32.0: resolution: {integrity: sha512-V7Qr52IhZmdKPVr+Vtw8o+WLsQJYCTd8loIfpDaMRWGUZfBOYEJeyJIkqGIDMZPwPx24pUMfwSxxI8phr/MbOA==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [glibc] lightningcss-linux-x64-musl@1.32.0: resolution: {integrity: sha512-bYcLp+Vb0awsiXg/80uCRezCYHNg1/l3mt0gzHnWV9XP1W5sKa5/TCdGWaR/zBM2PeF/HbsQv/j2URNOiVuxWg==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] + libc: [musl] lightningcss-win32-arm64-msvc@1.32.0: resolution: {integrity: sha512-8SbC8BR40pS6baCM8sbtYDSwEVQd4JlFTOlaD3gWGHfThTcABnNDBda6eTZeqbofalIJhFx0qKzgHJmcPTnGdw==} @@ -5309,10 +5348,18 @@ packages: resolution: {integrity: sha512-O/ZPaXuQV29uSLbxWBGGZO1mCQXV2BLIwUr59JUU9SoH76mnYvtms7aafH/isNSNGwuEfP6W/4xD0/TJXxrizw==} engines: {node: '>=20'} + p-queue@9.2.0: + resolution: {integrity: sha512-dWgLE8AH0HjQ9fe74pUkKkvzzYT18Inp4zra3lKHnnwqGvcfcUBrvF2EAVX+envufDNBOzpPq/IBUONDbI7+3g==} + engines: {node: '>=20'} + p-retry@7.1.1: resolution: {integrity: sha512-J5ApzjyRkkf601HpEeykoiCvzHQjWxPAHhyjFcEUP2SWq0+35NKh8TLhpLw+Dkq5TZBFvUM6UigdE9hIVYTl5w==} engines: {node: '>=20'} + p-retry@8.0.0: + resolution: {integrity: sha512-kFVqH1HxOHp8LupNsOys7bSV09VYTRLxarH/mokO4Rqhk6wGi70E0jh4VzvVGXfEVNggHoHLAMWsQqHyU1Ey9A==} + engines: {node: '>=22'} + p-some@7.0.0: resolution: {integrity: 
sha512-9ldWF6puBzuchsUq7M1THjwwmoiXesqRdpB4WH0D7urKXdGkIaDqVhQS2BSfRRYZ970j9gm3U4/h9hHQx2G1Ug==} engines: {node: '>=20'} @@ -7444,9 +7491,24 @@ snapshots: transitivePeerDependencies: - typescript - '@filoz/synapse-sdk@0.40.2(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6))': + '@filoz/synapse-core@0.4.1(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6))': dependencies: - '@filoz/synapse-core': 0.3.3(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) + '@web3-storage/data-segment': 5.3.0 + dnum: 2.17.0 + iso-web: 2.2.1 + multiformats: 13.4.2 + ox: 0.14.5(typescript@5.9.3)(zod@4.3.6) + p-locate: 7.0.0 + p-queue: 9.2.0 + p-some: 7.0.0 + viem: 2.47.5(typescript@5.9.3)(zod@4.3.6) + zod: 4.3.6 + transitivePeerDependencies: + - typescript + + '@filoz/synapse-sdk@0.40.4(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6))': + dependencies: + '@filoz/synapse-core': 0.4.1(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) multiformats: 13.4.2 viem: 2.47.5(typescript@5.9.3)(zod@4.3.6) transitivePeerDependencies: @@ -11249,7 +11311,7 @@ snapshots: '@chainsafe/libp2p-yamux': 8.0.1 '@clack/prompts': 1.1.0 '@filoz/synapse-core': 0.3.3(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) - '@filoz/synapse-sdk': 0.40.2(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) + '@filoz/synapse-sdk': 0.40.4(typescript@5.9.3)(viem@2.47.5(typescript@5.9.3)(zod@4.3.6)) '@helia/block-brokers': 5.1.4 '@helia/unixfs': 7.1.0(encoding@0.1.13) '@ipld/car': 5.4.2 @@ -11859,6 +11921,8 @@ snapshots: is-network-error@1.3.0: {} + is-network-error@1.3.1: {} + is-node-process@1.2.0: {} is-number@7.0.0: {} @@ -11918,6 +11982,13 @@ snapshots: iso-kv: 3.1.1 p-retry: 7.1.1 + iso-web@2.2.1: + dependencies: + delay: 7.0.0 + is-network-error: 1.3.1 + iso-kv: 3.1.1 + p-retry: 8.0.0 + isomorphic-ws@4.0.1(ws@7.5.10): dependencies: ws: 7.5.10 @@ -13064,10 +13135,19 @@ snapshots: eventemitter3: 5.0.4 p-timeout: 7.0.1 + p-queue@9.2.0: + dependencies: + 
eventemitter3: 5.0.4 + p-timeout: 7.0.1 + p-retry@7.1.1: dependencies: is-network-error: 1.3.0 + p-retry@8.0.0: + dependencies: + is-network-error: 1.3.1 + p-some@7.0.0: {} p-timeout@6.1.4: {} From a51bd3319e0a8bd212f3b31a7790a0266177c514 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 1 May 2026 00:49:46 +0530 Subject: [PATCH 02/44] feat: initial pull flow testing job --- apps/backend/.env.example | 10 + apps/backend/src/app.module.ts | 2 + apps/backend/src/config/app.config.ts | 53 ++ apps/backend/src/database/database.module.ts | 9 +- .../entities/job-schedule-state.entity.ts | 1 + .../database/entities/pull-check.entity.ts | 81 +++ .../1780000000000-CreatePullChecks.ts | 77 +++ apps/backend/src/database/types.ts | 17 + .../src/dev-tools/dev-tools.controller.ts | 53 ++ .../backend/src/dev-tools/dev-tools.module.ts | 3 +- .../src/dev-tools/dev-tools.service.ts | 71 +++ .../dev-tools/dto/trigger-pull-check.dto.ts | 73 +++ apps/backend/src/jobs/jobs.module.ts | 2 + apps/backend/src/jobs/jobs.service.spec.ts | 77 +-- apps/backend/src/jobs/jobs.service.ts | 101 +++- .../repositories/job-schedule.repository.ts | 4 +- .../metrics-prometheus/check-metric-labels.ts | 2 +- .../check-metrics.service.ts | 30 ++ .../metrics-prometheus.module.ts | 26 + .../src/pull-check/hosted-piece.registry.ts | 62 +++ .../src/pull-check/piece-source.controller.ts | 79 +++ .../src/pull-check/pull-check.module.ts | 18 + .../src/pull-check/pull-check.service.ts | 462 ++++++++++++++++++ .../src/pull-check/pull-check.types.ts | 23 + .../src/wallet-sdk/wallet-sdk.service.ts | 12 + apps/backend/src/worker.module.ts | 2 + 26 files changed, 1302 insertions(+), 48 deletions(-) create mode 100644 apps/backend/src/database/entities/pull-check.entity.ts create mode 100644 apps/backend/src/database/migrations/1780000000000-CreatePullChecks.ts create mode 100644 apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts create mode 100644 apps/backend/src/pull-check/hosted-piece.registry.ts create 
mode 100644 apps/backend/src/pull-check/piece-source.controller.ts create mode 100644 apps/backend/src/pull-check/pull-check.module.ts create mode 100644 apps/backend/src/pull-check/pull-check.service.ts create mode 100644 apps/backend/src/pull-check/pull-check.types.ts diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 6815a66f..a9943dd1 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -7,6 +7,9 @@ DEALBOT_PORT=8080 # Specify host for dealbot service (optional) DEALBOT_HOST=localhost +# Optional public base URL for DealBot HTTP API (used to construct hosted-piece source URLs for SP pull checks) +# DEALBOT_API_PUBLIC_URL=https://dealbot.example.com + # Comma-separated list of allowed origins for CORS (for web dev) DEALBOT_ALLOWED_ORIGINS=http://localhost:5173,http://127.0.0.1:5173 @@ -61,6 +64,13 @@ JOB_ENQUEUE_JITTER_SECONDS=0 DEAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for deal jobs (TODO: reduce default to 3m) RETRIEVAL_JOB_TIMEOUT_SECONDS=60 # 1m: Max runtime for retrieval jobs (TODO: reduce default to 30s) IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IPFS DAGs + +# Pull Check Configuration +PULL_CHECKS_PER_SP_PER_HOUR=1 # SP pull-pathway checks scheduled per provider per hour +PULL_CHECK_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for pull-check jobs +PULL_CHECK_HOSTED_PIECE_TTL_SECONDS=900 # 15m: Hosted piece source TTL exposed at /api/piece/:pieceCid +PULL_CHECK_POLL_INTERVAL_SECONDS=10 # SP pull status polling interval +PULL_CHECK_PIECE_SIZE_BYTES=10485760 # 10 MiB synthetic test piece size per pull check DEALBOT_PGBOSS_POOL_MAX=1 DEALBOT_PGBOSS_SCHEDULER_ENABLED=true diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts index 569ec5e4..ed29e2ce 100644 --- a/apps/backend/src/app.module.ts +++ b/apps/backend/src/app.module.ts @@ -12,6 +12,7 @@ import { DevToolsModule } from "./dev-tools/dev-tools.module.js"; import { JobsModule } from "./jobs/jobs.module.js"; 
import { MetricsPrometheusModule } from "./metrics-prometheus/metrics-prometheus.module.js"; import { ProvidersModule } from "./providers/providers.module.js"; +import { PullCheckModule } from "./pull-check/pull-check.module.js"; import { RetrievalModule } from "./retrieval/retrieval.module.js"; @Module({ @@ -30,6 +31,7 @@ import { RetrievalModule } from "./retrieval/retrieval.module.js"; RetrievalModule, DataSourceModule, ProvidersModule, + PullCheckModule, ...(process.env.ENABLE_DEV_MODE === "true" ? [DevToolsModule] : []), ], controllers: [AppController], diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index b3b32a37..c1c5d490 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -29,6 +29,7 @@ export const configValidationSchema = Joi.object({ DEALBOT_RUN_MODE: Joi.string().lowercase().valid("api", "worker", "both").default("both"), DEALBOT_PORT: Joi.number().default(3000), DEALBOT_HOST: Joi.string().default("127.0.0.1"), + DEALBOT_API_PUBLIC_URL: Joi.string().uri().optional().allow(""), DEALBOT_METRICS_PORT: Joi.number().default(9090), DEALBOT_METRICS_HOST: Joi.string().default("0.0.0.0"), ENABLE_DEV_MODE: Joi.boolean().default(false), @@ -94,6 +95,16 @@ export const configValidationSchema = Joi.object({ DATA_SET_CREATION_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5 minutes max runtime for dataset creation jobs IPFS_BLOCK_FETCH_CONCURRENCY: Joi.number().integer().min(1).max(32).default(6), + // Pull Check + PULL_CHECKS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), + PULL_CHECK_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6m max runtime for pull check jobs + PULL_CHECK_HOSTED_PIECE_TTL_SECONDS: Joi.number().min(60).default(900), // 15m hosted piece TTL + PULL_CHECK_POLL_INTERVAL_SECONDS: Joi.number().min(1).default(10), + PULL_CHECK_PIECE_SIZE_BYTES: Joi.number() + .integer() + .min(1024) + .default(10 * 1024 * 1024), // 10 MiB + 
// Piece Cleanup MAX_DATASET_STORAGE_SIZE_BYTES: Joi.number() .integer() @@ -146,6 +157,12 @@ export interface IAppConfig { runMode: "api" | "worker" | "both"; port: number; host: string; + /** + * Optional publicly reachable DealBot API base URL (e.g. `https://dealbot.example.com`). + * Used to construct hosted-piece source URLs that SPs can fetch during pull checks. + * When unset, falls back to `http://${host}:${port}`. + */ + apiPublicUrl: string | null; metricsPort: number; metricsHost: string; enableDevMode: boolean; @@ -278,6 +295,32 @@ export interface IJobsConfig { * Only used when `DEALBOT_JOBS_MODE=pgboss`. */ maxPieceCleanupRuntimeSeconds: number; + /** + * Target number of pull checks per storage provider per hour. + * + * Pull checks validate the SP pull-to-park pathway by serving a temporary piece URL + * from DealBot and asking the SP to pull and park it. Independent of `deal` and `retrieval`. + */ + pullChecksPerSpPerHour: number; + /** + * Maximum runtime (seconds) for pull-check jobs before forced abort. + * + * Bounds the polling window for terminal SP pull status. + */ + pullCheckJobTimeoutSeconds: number; + /** + * Time-to-live (seconds) for the temporary hosted piece source served at + * `/api/piece/:pieceCid` while a pull check is in flight. + */ + pullCheckHostedPieceTtlSeconds: number; + /** + * Polling interval (seconds) used while waiting for a terminal SP pull status. + */ + pullCheckPollIntervalSeconds: number; + /** + * Size (bytes) of the synthetic test piece DealBot generates per pull check. 
+ */ + pullCheckPieceSizeBytes: number; } export interface IDatasetConfig { @@ -347,6 +390,11 @@ export function loadConfig(): IConfig { })(), port: Number.parseInt(process.env.DEALBOT_PORT || "3000", 10), host: process.env.DEALBOT_HOST || "127.0.0.1", + apiPublicUrl: (() => { + const raw = process.env.DEALBOT_API_PUBLIC_URL; + if (raw == null || raw.trim().length === 0) return null; + return raw.trim().replace(/\/+$/, ""); + })(), metricsPort: Number.parseInt(process.env.DEALBOT_METRICS_PORT || "9090", 10), metricsHost: process.env.DEALBOT_METRICS_HOST || "0.0.0.0", enableDevMode: process.env.ENABLE_DEV_MODE === "true", @@ -406,6 +454,11 @@ export function loadConfig(): IConfig { dataSetCreationJobTimeoutSeconds: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), pieceCleanupPerSpPerHour: Number.parseFloat(process.env.JOB_PIECE_CLEANUP_PER_SP_PER_HOUR || String(1 / 24)), maxPieceCleanupRuntimeSeconds: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), + pullChecksPerSpPerHour: Number.parseFloat(process.env.PULL_CHECKS_PER_SP_PER_HOUR || "1"), + pullCheckJobTimeoutSeconds: Number.parseInt(process.env.PULL_CHECK_JOB_TIMEOUT_SECONDS || "360", 10), + pullCheckHostedPieceTtlSeconds: Number.parseInt(process.env.PULL_CHECK_HOSTED_PIECE_TTL_SECONDS || "900", 10), + pullCheckPollIntervalSeconds: Number.parseInt(process.env.PULL_CHECK_POLL_INTERVAL_SECONDS || "10", 10), + pullCheckPieceSizeBytes: Number.parseInt(process.env.PULL_CHECK_PIECE_SIZE_BYTES || String(10 * 1024 * 1024), 10), }, dataset: { localDatasetsPath: process.env.DEALBOT_LOCAL_DATASETS_PATH || DEFAULT_LOCAL_DATASETS_PATH, diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index 9249c3a9..2e8ddf72 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -10,6 +10,7 @@ import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config. 
import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; +import { PullCheck } from "./entities/pull-check.entity.js"; import { Retrieval } from "./entities/retrieval.entity.js"; import { StorageProvider } from "./entities/storage-provider.entity.js"; @@ -49,7 +50,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { + await queryRunner.query(` + DO $$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'pull_checks_status_enum') THEN + CREATE TYPE "pull_checks_status_enum" AS ENUM ( + 'pending', + 'requesting', + 'polling', + 'verifying', + 'success', + 'failed', + 'timed_out' + ); + END IF; + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'pull_checks_verification_status_enum') THEN + CREATE TYPE "pull_checks_verification_status_enum" AS ENUM ( + 'pending', + 'passed', + 'failed', + 'skipped' + ); + END IF; + END$$; + `); + + await queryRunner.query(` + CREATE TABLE IF NOT EXISTS pull_checks ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + sp_address VARCHAR NOT NULL, + piece_cid VARCHAR NOT NULL, + source_url TEXT NOT NULL, + request_id VARCHAR NULL, + status pull_checks_status_enum NOT NULL DEFAULT 'pending', + provider_status VARCHAR NULL, + failure_reason TEXT NULL, + request_started_at TIMESTAMPTZ NULL, + request_completed_at TIMESTAMPTZ NULL, + completed_at TIMESTAMPTZ NULL, + verification_status pull_checks_verification_status_enum NULL, + verification_completed_at TIMESTAMPTZ NULL, + verification_message TEXT NULL, + hosted_piece_expires_at TIMESTAMPTZ NOT NULL, + hosted_piece_cleaned_up_at TIMESTAMPTZ NULL, + error_code VARCHAR NULL, + error_message TEXT NULL, + retry_count INTEGER NOT NULL DEFAULT 0, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ) + `); + + await queryRunner.query(` + CREATE INDEX IF NOT 
EXISTS idx_pull_checks_sp_address ON pull_checks (sp_address) + `); + await queryRunner.query(` + CREATE INDEX IF NOT EXISTS idx_pull_checks_status ON pull_checks (status) + `); + await queryRunner.query(` + CREATE INDEX IF NOT EXISTS idx_pull_checks_created_at ON pull_checks (created_at) + `); + } + + public async down(queryRunner: QueryRunner): Promise { + await queryRunner.query(`DROP INDEX IF EXISTS idx_pull_checks_created_at`); + await queryRunner.query(`DROP INDEX IF EXISTS idx_pull_checks_status`); + await queryRunner.query(`DROP INDEX IF EXISTS idx_pull_checks_sp_address`); + await queryRunner.query(`DROP TABLE IF EXISTS pull_checks`); + await queryRunner.query(`DROP TYPE IF EXISTS pull_checks_verification_status_enum`); + await queryRunner.query(`DROP TYPE IF EXISTS pull_checks_status_enum`); + } +} diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index 46fd5d28..b7f9559c 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,6 +28,23 @@ export enum IpniStatus { FAILED = "failed", } +export enum PullCheckStatus { + PENDING = "pending", + REQUESTING = "requesting", + POLLING = "polling", + VERIFYING = "verifying", + SUCCESS = "success", + FAILED = "failed", + TIMED_OUT = "timed_out", +} + +export enum PullVerificationStatus { + PENDING = "pending", + PASSED = "passed", + FAILED = "failed", + SKIPPED = "skipped", +} + /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/dev-tools/dev-tools.controller.ts b/apps/backend/src/dev-tools/dev-tools.controller.ts index 7ae09d0e..10a3baf8 100644 --- a/apps/backend/src/dev-tools/dev-tools.controller.ts +++ b/apps/backend/src/dev-tools/dev-tools.controller.ts @@ -2,6 +2,11 @@ import { Controller, Get, Logger, Param, Query, UsePipes, ValidationPipe } from import { ApiOperation, ApiQuery, ApiResponse, ApiTags } from "@nestjs/swagger"; import { DevToolsService } from "./dev-tools.service.js"; import { 
TriggerDealQueryDto, TriggerDealResponseDto } from "./dto/trigger-deal.dto.js"; +import { + PullCheckStatusResponseDto, + TriggerPullCheckQueryDto, + TriggerPullCheckResponseDto, +} from "./dto/trigger-pull-check.dto.js"; import { TriggerRetrievalQueryDto, TriggerRetrievalResponseDto } from "./dto/trigger-retrieval.dto.js"; @ApiTags("Dev Tools") @@ -121,4 +126,52 @@ export class DevToolsController { }); return this.devToolsService.triggerRetrieval(query.dealId, query.spAddress); } + + @Get("pull") + @ApiOperation({ + summary: "Trigger a manual SP pull check (returns immediately, processing in background)", + }) + @ApiQuery({ + name: "spAddress", + required: true, + description: "Storage provider address", + example: "0x1234567890abcdef1234567890abcdef12345678", + }) + @ApiResponse({ + status: 200, + description: "Pull check accepted - use /api/dev/pulls/:pullCheckId to check progress", + type: TriggerPullCheckResponseDto, + }) + @ApiResponse({ status: 400, description: "Storage provider is not eligible for pull checks" }) + @ApiResponse({ status: 404, description: "Storage provider not found" }) + @UsePipes(new ValidationPipe({ transform: true })) + async triggerPullCheck(@Query() query: TriggerPullCheckQueryDto): Promise { + this.logger.log({ + event: "api_request", + message: "GET /api/dev/pull", + endpoint: "/api/dev/pull", + method: "GET", + spAddress: query.spAddress, + }); + return this.devToolsService.triggerPullCheck(query.spAddress); + } + + @Get("pulls/:pullCheckId") + @ApiOperation({ summary: "Get pull-check status by ID" }) + @ApiResponse({ + status: 200, + description: "Pull check status", + type: PullCheckStatusResponseDto, + }) + @ApiResponse({ status: 404, description: "Pull check not found" }) + async getPullCheck(@Param("pullCheckId") pullCheckId: string): Promise { + this.logger.log({ + event: "api_request", + message: "GET /api/dev/pulls/:pullCheckId", + endpoint: "/api/dev/pulls/:pullCheckId", + method: "GET", + pullCheckId, + }); + return 
this.devToolsService.getPullCheck(pullCheckId); + } } diff --git a/apps/backend/src/dev-tools/dev-tools.module.ts b/apps/backend/src/dev-tools/dev-tools.module.ts index 30db84f6..f54dcdae 100644 --- a/apps/backend/src/dev-tools/dev-tools.module.ts +++ b/apps/backend/src/dev-tools/dev-tools.module.ts @@ -2,13 +2,14 @@ import { Module } from "@nestjs/common"; import { TypeOrmModule } from "@nestjs/typeorm"; import { Deal } from "../database/entities/deal.entity.js"; import { DealModule } from "../deal/deal.module.js"; +import { PullCheckModule } from "../pull-check/pull-check.module.js"; import { RetrievalModule } from "../retrieval/retrieval.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { DevToolsController } from "./dev-tools.controller.js"; import { DevToolsService } from "./dev-tools.service.js"; @Module({ - imports: [TypeOrmModule.forFeature([Deal]), WalletSdkModule, DealModule, RetrievalModule], + imports: [TypeOrmModule.forFeature([Deal]), WalletSdkModule, DealModule, RetrievalModule, PullCheckModule], controllers: [DevToolsController], providers: [DevToolsService], }) diff --git a/apps/backend/src/dev-tools/dev-tools.service.ts b/apps/backend/src/dev-tools/dev-tools.service.ts index 8b08e046..124b2248 100644 --- a/apps/backend/src/dev-tools/dev-tools.service.ts +++ b/apps/backend/src/dev-tools/dev-tools.service.ts @@ -3,11 +3,14 @@ import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; import { type DealLogContext, toStructuredError } from "../common/logging.js"; import { Deal } from "../database/entities/deal.entity.js"; +import type { PullCheck } from "../database/entities/pull-check.entity.js"; import { DealStatus, RetrievalStatus } from "../database/types.js"; import { DealService } from "../deal/deal.service.js"; +import { PullCheckService } from "../pull-check/pull-check.service.js"; import { RetrievalService } from "../retrieval/retrieval.service.js"; import { 
WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { TriggerDealResponseDto } from "./dto/trigger-deal.dto.js"; +import type { PullCheckStatusResponseDto, TriggerPullCheckResponseDto } from "./dto/trigger-pull-check.dto.js"; import type { RetrievalMethodResultDto, TriggerRetrievalResponseDto } from "./dto/trigger-retrieval.dto.js"; @Injectable() @@ -18,6 +21,7 @@ export class DevToolsService { private readonly walletSdkService: WalletSdkService, private readonly dealService: DealService, private readonly retrievalService: RetrievalService, + private readonly pullCheckService: PullCheckService, @InjectRepository(Deal) private readonly dealRepository: Repository, ) {} @@ -293,6 +297,73 @@ export class DevToolsService { }; } + /** + * Trigger a manual SP pull check. Returns immediately with the pull-check ID; + * the actual pull request and polling run in the background, isolated from + * the existing direct-upload data-storage check. + */ + async triggerPullCheck(spAddress: string): Promise { + this.logger.log({ + event: "pull_check_trigger_requested", + message: "Triggering manual pull check for storage provider", + spAddress, + }); + + const record = await this.pullCheckService.triggerManualPullCheck(spAddress); + return this.pullCheckToTriggerDto(record); + } + + /** + * Get current pull-check state by id. Surfaces SP-reported pull status, + * verification status, and computed latencies so operators can review + * outcomes without inspecting database rows directly. 
+ */ + async getPullCheck(pullCheckId: string): Promise { + const record = await this.pullCheckService.getPullCheck(pullCheckId); + return this.pullCheckToStatusDto(record); + } + + private pullCheckToTriggerDto(record: PullCheck): TriggerPullCheckResponseDto { /* Projects the record onto the fields returned when a pull check is triggered. */ + return { + id: record.id, + spAddress: record.spAddress, + pieceCid: record.pieceCid, + status: record.status, + sourceUrl: record.sourceUrl, + createdAt: record.createdAt, + }; + } + + /** Maps a PullCheck record to the status DTO: nullish columns become undefined and latencies are derived from the stored timestamps. */ + private pullCheckToStatusDto(record: PullCheck): PullCheckStatusResponseDto { + const requestStartedAt = record.requestStartedAt ?? undefined; + const completedAt = record.completedAt ?? undefined; + /* Both latencies are measured from requestStartedAt and stay undefined until both of their timestamps are set. */ + const requestLatencyMs = + record.requestStartedAt && record.requestCompletedAt + ? record.requestCompletedAt.getTime() - record.requestStartedAt.getTime() + : undefined; + const completionLatencyMs = + record.requestStartedAt && record.completedAt + ? record.completedAt.getTime() - record.requestStartedAt.getTime() + : undefined; + return { + id: record.id, + spAddress: record.spAddress, + pieceCid: record.pieceCid, + status: record.status, + providerStatus: record.providerStatus ?? undefined, + verificationStatus: record.verificationStatus ?? undefined, + requestLatencyMs, + completionLatencyMs, + failureReason: record.failureReason ?? undefined, + errorMessage: record.errorMessage ??
undefined, + sourceUrl: record.sourceUrl, + requestStartedAt, + completedAt, + }; + } + /** * Find a deal by ID or most recent deal for an SP */ diff --git a/apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts b/apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts new file mode 100644 index 00000000..8d2bf609 --- /dev/null +++ b/apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts @@ -0,0 +1,73 @@ +import { ApiProperty } from "@nestjs/swagger"; +import { IsNotEmpty, IsString } from "class-validator"; + +export class TriggerPullCheckQueryDto { + @ApiProperty({ + description: "Storage provider address to run the pull check against", + example: "0x1234567890abcdef1234567890abcdef12345678", + }) + @IsString() + @IsNotEmpty() + spAddress: string; +} + +export class TriggerPullCheckResponseDto { + @ApiProperty({ description: "Pull check identifier" }) + id: string; + + @ApiProperty({ description: "Storage provider address" }) + spAddress: string; + + @ApiProperty({ description: "Hosted piece CID for this pull check" }) + pieceCid: string; + + @ApiProperty({ description: "Pull-check lifecycle status" }) + status: string; + + @ApiProperty({ description: "Hosted piece source URL the SP must pull from" }) + sourceUrl: string; + + @ApiProperty({ description: "Pull-check creation timestamp" }) + createdAt: Date; +} + +export class PullCheckStatusResponseDto { + @ApiProperty({ description: "Pull check identifier" }) + id: string; + + @ApiProperty({ description: "Storage provider address" }) + spAddress: string; + + @ApiProperty({ description: "Hosted piece CID" }) + pieceCid: string; + + @ApiProperty({ description: "Pull-check lifecycle status" }) + status: string; + + @ApiProperty({ description: "Latest provider-reported pull status", required: false }) + providerStatus?: string; + + @ApiProperty({ description: "Verification status, when applicable", required: false }) + verificationStatus?: string; + + @ApiProperty({ description: "Time from request submission to
SP acknowledgement (ms)", required: false }) + requestLatencyMs?: number; + + @ApiProperty({ description: "Time from request submission to terminal SP status (ms)", required: false }) + completionLatencyMs?: number; + + @ApiProperty({ description: "Failure reason, when applicable", required: false }) + failureReason?: string; + + @ApiProperty({ description: "Underlying error message, when applicable", required: false }) + errorMessage?: string; + + @ApiProperty({ description: "Hosted piece source URL the SP was asked to pull from" }) + sourceUrl: string; + + @ApiProperty({ description: "Time at which DealBot started the pull request", required: false }) + requestStartedAt?: Date; + + @ApiProperty({ description: "Time at which DealBot reached a terminal pull state", required: false }) + completedAt?: Date; +} diff --git a/apps/backend/src/jobs/jobs.module.ts b/apps/backend/src/jobs/jobs.module.ts index 15ad4d64..12328093 100644 --- a/apps/backend/src/jobs/jobs.module.ts +++ b/apps/backend/src/jobs/jobs.module.ts @@ -6,6 +6,7 @@ import { JobScheduleState } from "../database/entities/job-schedule-state.entity import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { DealModule } from "../deal/deal.module.js"; import { PieceCleanupModule } from "../piece-cleanup/piece-cleanup.module.js"; +import { PullCheckModule } from "../pull-check/pull-check.module.js"; import { RetrievalModule } from "../retrieval/retrieval.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { JobsService } from "./jobs.service.js"; @@ -20,6 +21,7 @@ import { JobScheduleRepository } from "./repositories/job-schedule.repository.js WalletSdkModule, DataRetentionModule, PieceCleanupModule, + PullCheckModule, ], providers: [JobsService, JobScheduleRepository], }) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index 5b8c58bc..e9adc6ac 100644 --- 
a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -29,18 +29,18 @@ describe("JobsService schedule rows", () => { }; let dataRetentionServiceMock: { pollDataRetention: ReturnType }; let metricsMocks: { - jobsQueuedGauge: JobsServiceDeps[8]; - jobsRetryScheduledGauge: JobsServiceDeps[9]; - oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }; let baseConfigValues: Partial; let configService: JobsServiceDeps[0]; @@ -54,18 +54,19 @@ describe("JobsService schedule rows", () => { walletSdkService: JobsServiceDeps[5]; dataRetentionService: JobsServiceDeps[6]; pieceCleanupService: JobsServiceDeps[7]; - jobsQueuedGauge: JobsServiceDeps[8]; - jobsRetryScheduledGauge: JobsServiceDeps[9]; - oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: 
JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + pullCheckService: JobsServiceDeps[8]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }>, ) => JobsService; @@ -95,18 +96,18 @@ describe("JobsService schedule rows", () => { }; metricsMocks = { - jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[8], - jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], - oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], - oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], - jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], - jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[13], - jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], - jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], - jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[16], - jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[17], - storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[18], - storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[19], + jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], + jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], + oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], + oldestInFlightAgeGauge: { set: vi.fn() } as unknown as 
JobsServiceDeps[12], + jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[13], + jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], + jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], + jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[16], + jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[17], + jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[18], + storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[19], + storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[20], }; const emptySpBlocklists: ISpBlocklistConfig = { @@ -160,6 +161,7 @@ describe("JobsService schedule rows", () => { overrides.walletSdkService ?? ({} as JobsServiceDeps[5]), overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[6]), overrides.pieceCleanupService ?? ({} as JobsServiceDeps[7]), + overrides.pullCheckService ?? ({} as JobsServiceDeps[8]), overrides.jobsQueuedGauge ?? metricsMocks.jobsQueuedGauge, overrides.jobsRetryScheduledGauge ?? metricsMocks.jobsRetryScheduledGauge, overrides.oldestQueuedAgeGauge ?? 
metricsMocks.oldestQueuedAgeGauge, @@ -614,11 +616,12 @@ describe("JobsService schedule rows", () => { // Check upserts for providerB const upsertCalls = jobScheduleRepositoryMock.upsertSchedule.mock.calls; const upsertsForB = upsertCalls.filter((call) => call[1] === providerB.address); - expect(upsertsForB).toHaveLength(4); + expect(upsertsForB).toHaveLength(5); expect(upsertsForB.map((call) => call[0]).sort()).toEqual([ "data_set_creation", "deal", "piece_cleanup", + "pull_check", "retrieval", ]); }); diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index 01357225..9435b244 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -14,14 +14,21 @@ import type { JobType } from "../database/entities/job-schedule-state.entity.js" import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { DealService } from "../deal/deal.service.js"; import { PieceCleanupService } from "../piece-cleanup/piece-cleanup.service.js"; +import { PullCheckService } from "../pull-check/pull-check.service.js"; import { RetrievalService } from "../retrieval/retrieval.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { provisionNextMissingDataSet } from "./data-set-creation.handler.js"; import { DATA_RETENTION_POLL_QUEUE, PROVIDERS_REFRESH_QUEUE, SP_WORK_QUEUE } from "./job-queues.js"; import { JobScheduleRepository } from "./repositories/job-schedule.repository.js"; -type SpJobType = "deal" | "retrieval" | "data_set_creation" | "piece_cleanup"; -const SP_JOB_TYPES: ReadonlySet = new Set(["deal", "retrieval", "data_set_creation", "piece_cleanup"]); +type SpJobType = "deal" | "retrieval" | "data_set_creation" | "piece_cleanup" | "pull_check"; +const SP_JOB_TYPES: ReadonlySet = new Set([ + "deal", + "retrieval", + "data_set_creation", + "piece_cleanup", + "pull_check", +]); function isSpJobType(jobType: string): jobType is SpJobType { return 
SP_JOB_TYPES.has(jobType); } @@ -60,6 +67,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private readonly walletSdkService: WalletSdkService, private readonly dataRetentionService: DataRetentionService, private readonly pieceCleanupService: PieceCleanupService, + private readonly pullCheckService: PullCheckService, @InjectMetric("jobs_queued") private readonly jobsQueuedGauge: Gauge, @InjectMetric("jobs_retry_scheduled") @@ -290,6 +298,10 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { await this.handlePieceCleanupJob(job); return; } + if (job.data.jobType === "pull_check") { + await this.handlePullCheckJob(job); + return; + } this.logger.warn({ event: "unknown_sp_job_type", message: "Skipping unknown SP job type", @@ -645,6 +657,73 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { }); } + private async handlePullCheckJob(job: SpJob): Promise { /* Runs one scheduled pull check for job.data.spAddress: hands the job to deferJobForMaintenance while a maintenance window is active, otherwise runs the check under an abort timeout inside recordJobExecution. */ + const data = job.data; + const spAddress = data.spAddress; + const now = new Date(); + const maintenance = this.getMaintenanceWindowStatus(now); + if (maintenance.active) { + this.logMaintenanceSkip(`pull_check job for ${spAddress}`, maintenance.window?.label, { + jobId: job.id, + providerAddress: spAddress, + providerId: this.walletSdkService.getProviderInfo(spAddress)?.id, + providerName: this.walletSdkService.getProviderInfo(spAddress)?.name, + }); + await this.deferJobForMaintenance("pull_check", data, maintenance, now); + return; + } + + const abortController = new AbortController(); + const timeoutSeconds = this.configService.get("jobs").pullCheckJobTimeoutSeconds; + const timeoutMs = Math.max(60000, timeoutSeconds * 1000); /* The configured timeout is floored at 60000 ms. */ + const effectiveTimeoutSeconds = Math.round(timeoutMs / 1000); + const abortReason = new Error(`Pull check job timeout (${effectiveTimeoutSeconds}s) for ${spAddress}`); + const timeoutId = setTimeout(() => { + abortController.abort(abortReason); + }, timeoutMs); /* NOTE(review): this timer is armed before recordJobExecution runs; the callback awaits resolveRunnableProviderJobContext outside its try/finally, so a rejection there would leave the timer pending until it fires - confirm that is acceptable. */ + + await this.recordJobExecution("pull_check",
async () => { + const logContext = await this.resolveRunnableProviderJobContext( + "pull_check", + spAddress, + job.id, + "Pull check job skipped: provider is blocked for scheduled pull checks", + ); + if (logContext == null) { /* No job context was resolved: cancel the timer and report success without running a check. */ + clearTimeout(timeoutId); + return "success"; + } + try { /* NOTE(review): triggerManualPullCheck is described elsewhere in this patch as starting the pull in the background; following it with runPullCheck on the same id may run the check twice - confirm. */ + const triggered = await this.pullCheckService.triggerManualPullCheck(spAddress); + await this.pullCheckService.runPullCheck(triggered.id, abortController.signal, logContext); + return "success"; + } catch (error) { + if (abortController.signal.aborted) { /* Abort path: log the abort reason and resolve with the aborted outcome instead of rethrowing. */ + const reason = abortController.signal.reason; + const reasonMessage = reason instanceof Error ? reason.message : String(reason ?? ""); + this.logger.error({ + ...logContext, + event: "pull_check_job_aborted", + message: reasonMessage || "Pull check job aborted after timeout", + timeoutSeconds: effectiveTimeoutSeconds, + error: toStructuredError(reason ?? error), + }); + return "aborted"; + } + this.logger.error({ + ...logContext, + event: "pull_check_job_failed", + message: "Pull check job failed", + error: toStructuredError(error), + }); + // Jobs are not retried once attempted; failures are handled by the next schedule tick.
+ throw error; + } finally { /* Clears the timeout timer on every exit from the try block. */ + clearTimeout(timeoutId); + } + }); + } + private async handlePieceCleanupJob(job: SpJob): Promise { const data = job.data; const spAddress = data.spAddress; @@ -903,6 +982,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { dataRetentionPollIntervalSeconds: number; providersRefreshIntervalSeconds: number; pieceCleanupIntervalSeconds: number; + pullCheckIntervalSeconds: number; } { const jobsConfig = this.configService.get("jobs", { infer: true }); const scheduling = this.configService.get("scheduling", { infer: true }); @@ -911,11 +991,13 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const retrievalsPerHour = jobsConfig.retrievalsPerSpPerHour; const dataSetCreationsPerHour = jobsConfig.dataSetCreationsPerSpPerHour; const pieceCleanupPerHour = jobsConfig.pieceCleanupPerSpPerHour; + const pullChecksPerHour = jobsConfig.pullChecksPerSpPerHour; const dealIntervalSeconds = Math.max(1, Math.round(3600 / dealsPerHour)); const retrievalIntervalSeconds = Math.max(1, Math.round(3600 / retrievalsPerHour)); const dataSetCreationIntervalSeconds = Math.max(1, Math.round(3600 / dataSetCreationsPerHour)); const pieceCleanupIntervalSeconds = Math.max(1, Math.round(3600 / pieceCleanupPerHour)); + const pullCheckIntervalSeconds = Math.max(1, Math.round(3600 / pullChecksPerHour)); /* NOTE(review): a rate of 0 would make this Infinity, as for the sibling intervals - confirm config validation rejects 0. */ const dataRetentionPollIntervalSeconds = scheduling.dataRetentionPollIntervalSeconds; const providersRefreshIntervalSeconds = scheduling.providersRefreshIntervalSeconds; @@ -926,6 +1008,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, pieceCleanupIntervalSeconds, + pullCheckIntervalSeconds, }; } @@ -945,6 +1028,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, pieceCleanupIntervalSeconds, + pullCheckIntervalSeconds, } =
this.getIntervalSecondsForRates(); const useOnlyApprovedProviders = this.configService.get("blockchain").useOnlyApprovedProviders; @@ -964,6 +1048,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const minDataSets = this.configService.get("blockchain").minNumDataSetsForChecks; const cleanupStartAt = new Date(now.getTime() + phaseMs); + const pullCheckStartAt = new Date(now.getTime() + phaseMs); const spBlocklistsCfg = this.configService.get("spBlocklists"); const unblockedAddresses = providers @@ -995,6 +1080,12 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { pieceCleanupIntervalSeconds, cleanupStartAt, ); + await this.jobScheduleRepository.upsertSchedule( + "pull_check", + address, + pullCheckIntervalSeconds, + pullCheckStartAt, + ); } if (providers.length > 0) { @@ -1138,6 +1229,8 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { return SP_WORK_QUEUE; case "piece_cleanup": return SP_WORK_QUEUE; + case "pull_check": + return SP_WORK_QUEUE; case "data_retention_poll": return DATA_RETENTION_POLL_QUEUE; case "providers_refresh": @@ -1158,7 +1251,8 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { row.job_type === "deal" || row.job_type === "retrieval" || row.job_type === "data_set_creation" || - row.job_type === "piece_cleanup" + row.job_type === "piece_cleanup" || + row.job_type === "pull_check" ) { return { jobType: row.job_type, spAddress: row.sp_address, intervalSeconds: row.interval_seconds }; } @@ -1231,6 +1325,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { "retrieval", "data_set_creation", "piece_cleanup", + "pull_check", "data_retention_poll", "providers_refresh", ]; diff --git a/apps/backend/src/jobs/repositories/job-schedule.repository.ts b/apps/backend/src/jobs/repositories/job-schedule.repository.ts index f0da6a80..8595cf39 100644 --- a/apps/backend/src/jobs/repositories/job-schedule.repository.ts +++ 
b/apps/backend/src/jobs/repositories/job-schedule.repository.ts @@ -74,7 +74,7 @@ export class JobScheduleRepository { const [rows] = (await this.dataSource.query( ` DELETE FROM job_schedule_state - WHERE job_type IN ('deal', 'retrieval', 'data_set_creation', 'piece_cleanup') + WHERE job_type IN ('deal', 'retrieval', 'data_set_creation', 'piece_cleanup', 'pull_check') AND sp_address <> '' RETURNING sp_address `, @@ -85,7 +85,7 @@ export class JobScheduleRepository { const [rows] = (await this.dataSource.query( ` DELETE FROM job_schedule_state - WHERE job_type IN ('deal', 'retrieval', 'data_set_creation', 'piece_cleanup') + WHERE job_type IN ('deal', 'retrieval', 'data_set_creation', 'piece_cleanup', 'pull_check') AND sp_address <> '' AND sp_address <> ALL($1::text[]) RETURNING sp_address diff --git a/apps/backend/src/metrics-prometheus/check-metric-labels.ts b/apps/backend/src/metrics-prometheus/check-metric-labels.ts index d8447160..07415d45 100644 --- a/apps/backend/src/metrics-prometheus/check-metric-labels.ts +++ b/apps/backend/src/metrics-prometheus/check-metric-labels.ts @@ -1,4 +1,4 @@ -export type CheckType = "dataStorage" | "retrieval" | "dataRetention" | "dataSetCreation"; +export type CheckType = "dataStorage" | "retrieval" | "dataRetention" | "dataSetCreation" | "pullCheck"; export type ProviderStatus = "approved" | "unapproved"; export type CheckMetricLabels = { diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 55975cad..7f5df0e1 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -248,3 +248,33 @@ export class DataSetCreationCheckMetrics { this.dataSetCreationStatusCounter.inc({ ...labels, value }); } } + +@Injectable() +export class PullCheckCheckMetrics { + constructor( + @InjectMetric("pullCheckRequestLatencyMs") + private readonly pullCheckRequestLatencyMs: 
Histogram, + @InjectMetric("pullCheckCompletionLatencyMs") + private readonly pullCheckCompletionLatencyMs: Histogram, + @InjectMetric("pullCheckStatus") + private readonly pullCheckStatusCounter: Counter, + @InjectMetric("pullCheckProviderStatus") + private readonly pullCheckProviderStatusCounter: Counter, + ) {} + + observeRequestLatencyMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullCheckRequestLatencyMs, labels, value); + } + + observeCompletionLatencyMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullCheckCompletionLatencyMs, labels, value); + } + + recordStatus(labels: CheckMetricLabels, value: string): void { + this.pullCheckStatusCounter.inc({ ...labels, value }); + } + + recordProviderStatus(labels: CheckMetricLabels, value: string): void { + this.pullCheckProviderStatusCounter.inc({ ...labels, value }); + } +} diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 18bda30d..d276aafb 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -11,6 +11,7 @@ import { DataSetCreationCheckMetrics, DataStorageCheckMetrics, DiscoverabilityCheckMetrics, + PullCheckCheckMetrics, RetrievalCheckMetrics, } from "./check-metrics.service.js"; import { MetricsPrometheusInterceptor } from "./metrics-prometheus.interceptor.js"; @@ -196,6 +197,29 @@ const metricProviders = [ help: "Data-set creation status counts", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), + // Pull check metrics (docs/checks/pull-check.md) + makeHistogramProvider({ + name: "pullCheckRequestLatencyMs", + help: "Time from pull request submission to SP request acknowledgement (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + 
buckets: [10, 50, 100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000], + }), + makeHistogramProvider({ + name: "pullCheckCompletionLatencyMs", + help: "Time from pull request submission to terminal SP pull status (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000, 600000], + }), + makeCounterProvider({ + name: "pullCheckStatus", + help: "Pull-check terminal status counts (success | failure.timedout | failure.other | pending)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "pullCheckProviderStatus", + help: "Raw SP-reported pull statuses observed by DealBot during polling", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), // Data Retention Metrics makeCounterProvider({ name: "dataSetChallengeStatus", @@ -333,6 +357,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + PullCheckCheckMetrics, WalletBalanceCollector, // HTTP metrics interceptor { @@ -347,6 +372,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + PullCheckCheckMetrics, WalletBalanceCollector, ], }) diff --git a/apps/backend/src/pull-check/hosted-piece.registry.ts b/apps/backend/src/pull-check/hosted-piece.registry.ts new file mode 100644 index 00000000..aa4ca9e0 --- /dev/null +++ b/apps/backend/src/pull-check/hosted-piece.registry.ts @@ -0,0 +1,62 @@ +import { Injectable, Logger } from "@nestjs/common"; +import type { HostedPieceRegistration } from "./pull-check.types.js"; + +/** + * In-memory registry of hosted piece sources backing pull-check requests. 
+ * + * The first slice keeps this in process memory because there is one DealBot + * API process serving `/api/piece/:pieceCid` and pull checks are bounded by + * the configured hosted-piece TTL. + */ +@Injectable() +export class HostedPieceRegistry { /* NOTE(review): nothing in this class removes expired entries on its own; they stay in the map until forget is called - confirm callers do so. */ + private readonly logger = new Logger(HostedPieceRegistry.name); + private readonly entries = new Map(); /* Keyed by pieceCid (see register). */ + + register(registration: HostedPieceRegistration): void { /* Stores the registration under its pieceCid, replacing any entry already held for that CID, then logs it. */ + this.entries.set(registration.pieceCid, registration); + this.logger.log({ + event: "hosted_piece_registered", + message: "Registered hosted piece source", + pieceCid: registration.pieceCid, + pullCheckId: registration.pullCheckId, + expiresAt: registration.expiresAt.toISOString(), + byteLength: registration.byteLength, + }); + } + + /** + * Resolve a hosted piece by CID. Returns null when the entry is missing, + * already cleaned up, or has expired. + */ + resolveActive(pieceCid: string, now: Date = new Date()): HostedPieceRegistration | null { + const entry = this.entries.get(pieceCid); + if (!entry) return null; + if (entry.cleanedUp) return null; + if (entry.expiresAt.getTime() <= now.getTime()) return null; /* An entry whose expiry equals now already counts as expired. */ + return entry; + } + + /** + * Resolve a hosted piece by CID even when expired/cleaned-up. Used by the + * controller to differentiate a 410 Gone from a 404 Not Found. + */ + resolveAny(pieceCid: string): HostedPieceRegistration | null { + return this.entries.get(pieceCid) ??
null; + } + + markCleanedUp(pieceCid: string): void { /* Flags the stored entry as cleaned up in place; the entry stays in the map, so resolveAny still returns it. Does nothing for an unknown CID. */ + const entry = this.entries.get(pieceCid); + if (!entry) return; + entry.cleanedUp = true; + this.logger.log({ + event: "hosted_piece_cleaned_up", + message: "Marked hosted piece source as cleaned up", + pieceCid, + }); + } + + forget(pieceCid: string): void { /* Deletes the entry; afterwards both resolve methods return null for this CID. */ + this.entries.delete(pieceCid); + } +} diff --git a/apps/backend/src/pull-check/piece-source.controller.ts b/apps/backend/src/pull-check/piece-source.controller.ts new file mode 100644 index 00000000..aaf29758 --- /dev/null +++ b/apps/backend/src/pull-check/piece-source.controller.ts @@ -0,0 +1,79 @@ +import { Controller, Get, Logger, NotFoundException, Param, Res } from "@nestjs/common"; +import { ApiOperation, ApiResponse, ApiTags } from "@nestjs/swagger"; +import type { Response } from "express"; +import { HostedPieceRegistry } from "./hosted-piece.registry.js"; +import { PullCheckService } from "./pull-check.service.js"; + +/** + * Serves the temporary hosted-piece bytes that a storage provider must fetch + * during a pull check. Bound to the same `/api/*` prefix as other DealBot HTTP + * endpoints. The path component must end with `/piece/{pieceCid}` so that + * SP-side pull workers can address the resource directly.
+ */ +@ApiTags("Pull Check") +@Controller("api") +export class PieceSourceController { + private readonly logger = new Logger(PieceSourceController.name); + + constructor( + private readonly pullCheckService: PullCheckService, + private readonly hostedPieceRegistry: HostedPieceRegistry, + ) {} + + @Get("piece/:pieceCid") + @ApiOperation({ + summary: "Stream a temporary hosted piece for an in-flight SP pull check", + }) + @ApiResponse({ status: 200, description: "Raw piece bytes streamed to the caller" }) + @ApiResponse({ status: 404, description: "No active hosted piece exists for this pieceCid" }) + @ApiResponse({ status: 410, description: "Hosted piece existed but has expired or been cleaned up" }) + servePiece(@Param("pieceCid") pieceCid: string, @Res() res: Response): void { /* Pipes the hosted piece for pieceCid into the response. When openHostedPieceStream returns nothing, answers 410 if the registry still knows the CID and 404 otherwise. */ + if (!pieceCid || pieceCid.trim().length === 0) { + throw new NotFoundException("pieceCid is required"); + } + + const opened = this.pullCheckService.openHostedPieceStream(pieceCid); + if (!opened) { + const known = this.hostedPieceRegistry.resolveAny(pieceCid); + if (known) { + this.logger.warn({ + event: "pull_check_piece_gone", + message: "Hosted piece source no longer active", + pieceCid, + cleanedUp: known.cleanedUp, + expiresAt: known.expiresAt.toISOString(), + }); + res.status(410).send("Hosted piece source has expired or been cleaned up"); + return; + } + this.logger.warn({ + event: "pull_check_piece_unknown", + message: "Hosted piece source not found", + pieceCid, + }); + res.status(404).send("Hosted piece source not found"); + return; + } + + const { registration, stream } = opened; + res.setHeader("Content-Type", registration.contentType); + res.setHeader("Content-Length", registration.byteLength.toString()); + res.setHeader("Cache-Control", "no-store"); + res.setHeader("X-Pull-Check-Piece-CID", registration.pieceCid); + + stream.on("error", (error) => { /* Stream failure: log it, then answer 500 if headers are still unsent, otherwise destroy the response with the error. */ + this.logger.error({ + event: "pull_check_piece_stream_error", + message: "Failed to stream hosted piece", + pieceCid, +
error: error.message, + }); + if (!res.headersSent) { + res.status(500).send("Failed to stream hosted piece"); + return; + } + res.destroy(error); + }); + stream.pipe(res); + } +} diff --git a/apps/backend/src/pull-check/pull-check.module.ts b/apps/backend/src/pull-check/pull-check.module.ts new file mode 100644 index 00000000..74972846 --- /dev/null +++ b/apps/backend/src/pull-check/pull-check.module.ts @@ -0,0 +1,18 @@ +import { Module } from "@nestjs/common"; +import { TypeOrmModule } from "@nestjs/typeorm"; +import { DatabaseModule } from "../database/database.module.js"; +import { PullCheck } from "../database/entities/pull-check.entity.js"; +import { DataSourceModule } from "../dataSource/dataSource.module.js"; +import { DealModule } from "../deal/deal.module.js"; +import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; +import { HostedPieceRegistry } from "./hosted-piece.registry.js"; +import { PieceSourceController } from "./piece-source.controller.js"; +import { PullCheckService } from "./pull-check.service.js"; + +@Module({ /* NOTE(review): PullCheckService also injects PullCheckCheckMetrics, which this module neither provides nor imports; presumably the metrics module is registered globally - confirm. */ + imports: [DatabaseModule, TypeOrmModule.forFeature([PullCheck]), WalletSdkModule, DataSourceModule, DealModule], + controllers: [PieceSourceController], + providers: [PullCheckService, HostedPieceRegistry], + exports: [PullCheckService, HostedPieceRegistry], +}) +export class PullCheckModule {} diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts new file mode 100644 index 00000000..fba9303b --- /dev/null +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -0,0 +1,462 @@ +import * as fs from "node:fs"; +import * as path from "node:path"; +import { calculate, parse as parsePieceCid } from "@filoz/synapse-core/piece"; +import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; +import { getDataSet } from "@filoz/synapse-core/warm-storage"; +import { METADATA_KEYS, Synapse } from "@filoz/synapse-sdk"; +import { BadRequestException,
Injectable, Logger, NotFoundException } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { InjectRepository } from "@nestjs/typeorm"; +import type { Repository } from "typeorm"; +import type { Account, Address, Chain, Client, Transport } from "viem"; +import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; +import { createSynapseFromConfig } from "../common/synapse-factory.js"; +import type { IAppConfig, IBlockchainConfig, IConfig, IDatasetConfig, IJobsConfig } from "../config/app.config.js"; +import { PullCheck } from "../database/entities/pull-check.entity.js"; +import { PullCheckStatus, PullVerificationStatus } from "../database/types.js"; +import { DataSourceService } from "../dataSource/dataSource.service.js"; +import { DealService } from "../deal/deal.service.js"; +import { + buildCheckMetricLabels, + type CheckMetricLabels, + classifyFailureStatus, +} from "../metrics-prometheus/check-metric-labels.js"; +import { PullCheckCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import { HostedPieceRegistry } from "./hosted-piece.registry.js"; +import type { HostedPiecePrepared } from "./pull-check.types.js"; + +type SynapseViemClient = Client; + +@Injectable() +export class PullCheckService { + private readonly logger = new Logger(PullCheckService.name); + private readonly blockchainConfig: IBlockchainConfig; + private sharedSynapse?: Synapse; + + constructor( + private readonly configService: ConfigService, + @InjectRepository(PullCheck) + private readonly pullCheckRepository: Repository, + private readonly walletSdkService: WalletSdkService, + private readonly dataSourceService: DataSourceService, + private readonly hostedPieceRegistry: HostedPieceRegistry, + private readonly pullCheckMetrics: PullCheckCheckMetrics, + private readonly dealService: DealService, + ) { + this.blockchainConfig = 
this.configService.get("blockchain", { infer: true }); + } + + async onModuleInit() { + this.logger.log({ + event: "synapse_initialization", + message: "Creating shared Synapse instance", + }); + this.sharedSynapse = await this.createSynapseInstance(); + } + + async onModuleDestroy(): Promise { + if (this.sharedSynapse) { + this.sharedSynapse = undefined; + } + } + + /** + * Trigger a manual pull check for one provider. Returns immediately with the + * pull-check identifier; the actual pull request and polling run in the + * background. Used by `/api/dev/pull`. + */ + async triggerManualPullCheck(spAddress: string): Promise { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + if (!providerInfo) { + throw new NotFoundException(`Storage provider not found: ${spAddress}`); + } + if (!providerInfo.isActive) { + throw new BadRequestException(`Storage provider is not active: ${spAddress}`); + } + if (providerInfo.id == null) { + throw new BadRequestException(`Storage provider is missing providerId: ${spAddress}`); + } + if (!providerInfo.pdp.serviceURL) { + throw new BadRequestException(`Storage provider is missing serviceURL: ${spAddress}`); + } + + const ttlSeconds = this.getJobsConfig().pullCheckHostedPieceTtlSeconds; + const pending = this.pullCheckRepository.create({ + spAddress, + pieceCid: "pending", + sourceUrl: "pending", + status: PullCheckStatus.PENDING, + hostedPieceExpiresAt: new Date(Date.now() + ttlSeconds * 1000), + }); + const saved = await this.pullCheckRepository.save(pending); + + this.logger.log({ + event: "pull_check_manual_triggered", + message: "Manual pull check requested", + pullCheckId: saved.id, + spAddress, + providerId: providerInfo.id, + providerName: providerInfo.name, + }); + + const logContext: ProviderJobContext = { + jobId: "dev_tools_manual_pull", + providerAddress: spAddress, + providerId: providerInfo.id, + providerName: providerInfo.name, + }; + + // Fire-and-forget: orchestrate in the background so the API 
responds fast. + void this.runPullCheckSafe(saved.id, undefined, logContext); + + return saved; + } + + /** + * Resolve a pull check by id. + */ + async getPullCheck(pullCheckId: string): Promise { + const record = await this.pullCheckRepository.findOne({ where: { id: pullCheckId } }); + if (!record) { + throw new NotFoundException(`Pull check not found: ${pullCheckId}`); + } + return record; + } + + /** + * Wraps `runPullCheck` so callers (manual trigger, scheduler) get consistent + * background error handling without unhandled promise rejections. + */ + async runPullCheckSafe( + pullCheckId: string, + signal: AbortSignal | undefined, + logContext: ProviderJobContext, + ): Promise { + try { + await this.runPullCheck(pullCheckId, signal, logContext); + } catch (error) { + this.logger.error({ + ...logContext, + pullCheckId, + event: "pull_check_unhandled_error", + message: "Unhandled pull check execution error", + error: toStructuredError(error), + }); + } + } + + /** + * Drive one pull check through its full lifecycle: + * prepare hosted piece -> submit pull -> poll terminal status -> verify. 
+ */ + async runPullCheck( + pullCheckId: string, + signal: AbortSignal | undefined, + logContext: ProviderJobContext, + ): Promise { + let record = await this.getPullCheck(pullCheckId); + const providerInfo = this.walletSdkService.getProviderInfo(record.spAddress); + if (!providerInfo || providerInfo.id == null || !providerInfo.pdp.serviceURL) { + record.status = PullCheckStatus.FAILED; + record.failureReason = "provider_not_eligible"; + record.errorMessage = `Provider ${record.spAddress} not eligible for pull check`; + record.completedAt = new Date(); + return this.pullCheckRepository.save(record); + } + + const labels = buildCheckMetricLabels({ + checkType: "pullCheck", + providerId: providerInfo.id, + providerName: providerInfo.name, + providerIsApproved: providerInfo.isApproved, + }); + this.pullCheckMetrics.recordStatus(labels, "pending"); + + let prepared: HostedPiecePrepared | null = null; + let requestSubmittedAt: Date | null = null; + + try { + signal?.throwIfAborted(); + prepared = await this.prepareHostedPiece(record); + record.pieceCid = prepared.registration.pieceCid; + record.sourceUrl = prepared.sourceUrl; + record.hostedPieceExpiresAt = prepared.registration.expiresAt; + record.status = PullCheckStatus.REQUESTING; + record.requestStartedAt = new Date(); + record = await this.pullCheckRepository.save(record); + + const synapseClient = this.requireSynapseClient(); + const synapse = this.sharedSynapse ?? (await this.createSynapseInstance()); + const storage = await synapse.storage.createContext({ + providerId: providerInfo.id, + metadata: this.dealService.getBaseDataSetMetadata(), + }); + const dataSetId = storage.dataSetId; + const clientDataSetId = dataSetId ? 
(await getDataSet(synapseClient, { dataSetId }))?.clientDataSetId : undefined; + const pieceCidParsed = parsePieceCid(record.pieceCid); + const payee = providerInfo.payee as Address; + const serviceURL = providerInfo.pdp.serviceURL; + + const pullPiecesOptions = { + serviceURL, + pieces: [ + { + pieceCid: pieceCidParsed, + sourceUrl: prepared.sourceUrl, + }, + ], + ...(dataSetId && clientDataSetId ? { dataSetId, clientDataSetId } : { payee }), + signal, + }; + requestSubmittedAt = new Date(); + const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); + const requestCompletedAt = new Date(); + record.requestCompletedAt = requestCompletedAt; + record.providerStatus = pullResponse.status; + record.status = PullCheckStatus.POLLING; + record = await this.pullCheckRepository.save(record); + + this.pullCheckMetrics.observeRequestLatencyMs( + labels, + requestCompletedAt.getTime() - requestSubmittedAt.getTime(), + ); + this.pullCheckMetrics.recordProviderStatus(labels, pullResponse.status); + this.logger.log({ + ...logContext, + pullCheckId, + event: "pull_check_request_submitted", + message: "Pull request submitted to provider", + pieceCid: record.pieceCid, + providerStatus: pullResponse.status, + }); + + const jobsConfig = this.getJobsConfig(); + const waitForPullPiecesOptions = { + ...pullPiecesOptions, + timeout: jobsConfig.pullCheckJobTimeoutSeconds * 1000, + pollInterval: jobsConfig.pullCheckPollIntervalSeconds * 1000, + onStatus: (response) => { + this.pullCheckMetrics.recordProviderStatus(labels, response.status); + this.logger.debug({ + ...logContext, + pullCheckId, + event: "pull_check_status_observed", + message: "Observed pull status", + providerStatus: response.status, + }); + }, + }; + const finalResponse = await waitForPullPieces(synapseClient, waitForPullPiecesOptions); + + const pieceResults = finalResponse.pieces.map((piece: { pieceCid: string; status: string }) => { + const pieceCid = pullPiecesOptions.pieces.find((p) => p.toString() === 
piece.pieceCid); + return { + pieceCid: pieceCid?.pieceCid || piece.pieceCid, + status: piece.status === "complete" ? ("complete" as const) : ("failed" as const), + }; + }); + + const allComplete = pieceResults.every((p: { status: string }) => p.status === "complete"); + + const completedAt = new Date(); + record.providerStatus = finalResponse.status; + record.completedAt = completedAt; + + if (allComplete) { + record.status = PullCheckStatus.VERIFYING; + // First-slice verification: rely on the SP's own confirmation of the parked + // piece. Future iterations should add an explicit byte/proof check. + record.verificationStatus = PullVerificationStatus.SKIPPED; + record.verificationCompletedAt = completedAt; + record.verificationMessage = "First-slice verification deferred: SP terminal status accepted"; + record.status = PullCheckStatus.SUCCESS; + this.pullCheckMetrics.recordStatus(labels, "success"); + } else { + record.status = PullCheckStatus.FAILED; + record.failureReason = "provider_reported_failure"; + record.verificationStatus = PullVerificationStatus.FAILED; + record.verificationCompletedAt = completedAt; + this.pullCheckMetrics.recordStatus(labels, "failure.other"); + } + + this.pullCheckMetrics.observeCompletionLatencyMs(labels, completedAt.getTime() - requestSubmittedAt.getTime()); + + record = await this.pullCheckRepository.save(record); + + const commitResult = await storage.commit({ + pieces: pullPiecesOptions.pieces.map((pullPiece) => ({ + pieceCid: pullPiece.pieceCid, + pieceMetadata: { + [METADATA_KEYS.IPFS_ROOT_CID]: pullPiece.pieceCid.toString(), + }, + })), + }); + + this.logger.log({ + event: "pull_check_commit_result", + message: "Pull check commit result", + commitResult, + }); + + return record; + } catch (error) { + const failureClass = classifyFailureStatus(error); + const completedAt = new Date(); + record.completedAt = completedAt; + if (failureClass === "failure.timedout") { + record.status = PullCheckStatus.TIMED_OUT; + 
record.failureReason = "pull_check_timeout"; + } else { + record.status = PullCheckStatus.FAILED; + record.failureReason = "pull_check_error"; + } + record.errorMessage = error instanceof Error ? error.message : String(error); + record.errorCode = (error as { code?: string }).code ?? null; + this.pullCheckMetrics.recordStatus(labels, failureClass); + if (requestSubmittedAt) { + this.pullCheckMetrics.observeCompletionLatencyMs(labels, completedAt.getTime() - requestSubmittedAt.getTime()); + } + this.logger.error({ + ...logContext, + pullCheckId, + event: "pull_check_failed", + message: "Pull check failed", + error: toStructuredError(error), + }); + return this.pullCheckRepository.save(record); + } finally { + if (prepared) { + await this.cleanupHostedPiece(prepared.registration.pieceCid, pullCheckId); + } + } + } + + /** + * Generate a synthetic test piece, compute its piece CID, register it for + * `/api/piece/:pieceCid` serving, and return the source URL plus registration. + */ + async prepareHostedPiece(record: PullCheck): Promise { + const jobsConfig = this.getJobsConfig(); + const datasetConfig = this.configService.get("dataset"); + const targetSize = jobsConfig.pullCheckPieceSizeBytes; + + const dataFile = await this.dataSourceService.generateRandomDataset(targetSize, targetSize); + const filePath = path.join(datasetConfig.localDatasetsPath, dataFile.name); + const dataBytes = + dataFile.data instanceof Uint8Array ? 
dataFile.data : new Uint8Array(dataFile.data as ArrayBufferLike); + const pieceCid = calculate(dataBytes); + const pieceCidStr = pieceCid.toString(); + const baseUrl = this.resolvePublicBaseUrl(); + const sourceUrl = `${baseUrl}/api/piece/${pieceCidStr}`; + const expiresAt = new Date(Date.now() + jobsConfig.pullCheckHostedPieceTtlSeconds * 1000); + + const registration = { + pieceCid: pieceCidStr, + filePath, + fileName: dataFile.name, + byteLength: dataFile.size, + contentType: "application/octet-stream", + pullCheckId: record.id, + expiresAt, + cleanedUp: false, + }; + this.hostedPieceRegistry.register(registration); + + return { registration, sourceUrl }; + } + + /** + * Mark the hosted piece as cleaned up and remove the on-disk artifact. Safe + * to call multiple times. + */ + async cleanupHostedPiece(pieceCid: string, pullCheckId: string | null): Promise { + const entry = this.hostedPieceRegistry.resolveAny(pieceCid); + if (entry && !entry.cleanedUp) { + this.hostedPieceRegistry.markCleanedUp(pieceCid); + try { + await this.dataSourceService.cleanupRandomDataset(entry.fileName); + } catch (error) { + this.logger.warn({ + event: "pull_check_cleanup_warn", + message: "Failed to cleanup hosted piece artifact", + pieceCid, + pullCheckId, + error: toStructuredError(error), + }); + } + } + if (pullCheckId) { + const record = await this.pullCheckRepository.findOne({ where: { id: pullCheckId } }); + if (record && record.hostedPieceCleanedUpAt == null) { + record.hostedPieceCleanedUpAt = new Date(); + await this.pullCheckRepository.save(record); + } + } + this.hostedPieceRegistry.forget(pieceCid); + } + + buildLabelsForPullCheck(record: PullCheck): CheckMetricLabels { + const providerInfo = this.walletSdkService.getProviderInfo(record.spAddress); + return buildCheckMetricLabels({ + checkType: "pullCheck", + providerId: providerInfo?.id ?? null, + providerName: providerInfo?.name ?? null, + providerIsApproved: providerInfo?.isApproved ?? 
null, + }); + } + + private getJobsConfig(): IJobsConfig { + return this.configService.get("jobs", { infer: true }); + } + + private resolvePublicBaseUrl(): string { + const appConfig = this.configService.get("app"); + if (appConfig.apiPublicUrl) return appConfig.apiPublicUrl; + return `http://${appConfig.host}:${appConfig.port}`; + } + + private requireSynapseClient(): SynapseViemClient { + const client = this.walletSdkService.getSynapseClient(); + if (client == null) { + throw new Error("Synapse client unavailable: chain integration must be enabled for pull checks"); + } + return client as SynapseViemClient; + } + + private async createSynapseInstance(): Promise { + try { + const { synapse, isSessionKeyMode } = await createSynapseFromConfig(this.blockchainConfig); + if (isSessionKeyMode) { + this.logger.log({ + event: "synapse_session_key_init", + message: "Initializing Synapse with session key", + walletAddress: this.blockchainConfig.walletAddress, + }); + } + return synapse; + } catch (error) { + this.logger.error({ + event: "synapse_init_failed", + message: "Failed to initialize Synapse for deal job", + error: toStructuredError(error), + }); + throw error; + } + } + + /** + * Stream the hosted piece bytes for an active registration. Used by the + * `/api/piece/:pieceCid` controller. Returns null when no active registration + * exists; callers must distinguish 404 from 410 using the registry directly. 
+ */ + openHostedPieceStream( + pieceCid: string, + now: Date = new Date(), + ): { registration: NonNullable>; stream: fs.ReadStream } | null { + const registration = this.hostedPieceRegistry.resolveActive(pieceCid, now); + if (!registration) return null; + const stream = fs.createReadStream(registration.filePath); + return { registration, stream }; + } +} diff --git a/apps/backend/src/pull-check/pull-check.types.ts b/apps/backend/src/pull-check/pull-check.types.ts new file mode 100644 index 00000000..1ccef6fe --- /dev/null +++ b/apps/backend/src/pull-check/pull-check.types.ts @@ -0,0 +1,23 @@ +/** + * In-memory registration describing a hosted-piece source served at + * `/api/piece/:pieceCid` for a single in-flight pull check. + */ +export type HostedPieceRegistration = { + pieceCid: string; + filePath: string; + fileName: string; + byteLength: number; + contentType: string; + pullCheckId: string; + expiresAt: Date; + cleanedUp: boolean; +}; + +/** + * Result of preparing a hosted piece, returned by the service to callers that + * need both the routing identity and the on-disk artifact path. + */ +export type HostedPiecePrepared = { + registration: HostedPieceRegistration; + sourceUrl: string; +}; diff --git a/apps/backend/src/wallet-sdk/wallet-sdk.service.ts b/apps/backend/src/wallet-sdk/wallet-sdk.service.ts index 81c4288b..4eea581b 100644 --- a/apps/backend/src/wallet-sdk/wallet-sdk.service.ts +++ b/apps/backend/src/wallet-sdk/wallet-sdk.service.ts @@ -295,6 +295,18 @@ export class WalletSdkService implements OnModuleInit { return this.providerCache.get(address); } + /** + * Get the underlying Synapse-SDK viem client. + * + * Used by features that need to call low-level Synapse helpers (e.g. `pullPieces` + * from `@filoz/synapse-core/sp`) which require a viem `Client`. + * Returns `null` when chain integration is disabled or the client has not been + * initialized yet. + */ + getSynapseClient(): unknown { + return this._synapseClient ?? 
null; + } + /** * Ensure wallet has sufficient allowances for operations. * Skipped in session key mode, deposits and operator approvals must be diff --git a/apps/backend/src/worker.module.ts b/apps/backend/src/worker.module.ts index 67bbd5da..9e5cdd62 100644 --- a/apps/backend/src/worker.module.ts +++ b/apps/backend/src/worker.module.ts @@ -7,6 +7,7 @@ import { configValidationSchema, loadConfig } from "./config/app.config.js"; import { DatabaseModule } from "./database/database.module.js"; import { JobsModule } from "./jobs/jobs.module.js"; import { MetricsPrometheusModule } from "./metrics-prometheus/metrics-prometheus.module.js"; +import { PullCheckModule } from "./pull-check/pull-check.module.js"; @Module({ imports: [ @@ -20,6 +21,7 @@ import { MetricsPrometheusModule } from "./metrics-prometheus/metrics-prometheus MetricsPrometheusModule, ClickhouseModule, JobsModule, + PullCheckModule, ], }) export class WorkerModule {} From e08929edecf6834a0429ace093c7e7bffd36dad6 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 1 May 2026 01:22:48 +0530 Subject: [PATCH 03/44] fix: double pull check trigger --- apps/backend/src/jobs/jobs.service.ts | 4 ++-- .../src/pull-check/pull-check.service.ts | 22 ++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index 9435b244..94b6df0b 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -694,8 +694,8 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { return "success"; } try { - const triggered = await this.pullCheckService.triggerManualPullCheck(spAddress); - await this.pullCheckService.runPullCheck(triggered.id, abortController.signal, logContext); + const record = await this.pullCheckService.createPullCheckRecord(spAddress); + await this.pullCheckService.runPullCheck(record.id, abortController.signal, logContext); return "success"; } catch 
(error) { if (abortController.signal.aborted) { diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index fba9303b..96b864dd 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -62,11 +62,9 @@ export class PullCheckService { } /** - * Trigger a manual pull check for one provider. Returns immediately with the - * pull-check identifier; the actual pull request and polling run in the - * background. Used by `/api/dev/pull`. + * Create a pending pull-check record after validating provider eligibility. */ - async triggerManualPullCheck(spAddress: string): Promise { + async createPullCheckRecord(spAddress: string): Promise { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); if (!providerInfo) { throw new NotFoundException(`Storage provider not found: ${spAddress}`); @@ -89,7 +87,21 @@ export class PullCheckService { status: PullCheckStatus.PENDING, hostedPieceExpiresAt: new Date(Date.now() + ttlSeconds * 1000), }); - const saved = await this.pullCheckRepository.save(pending); + return this.pullCheckRepository.save(pending); + } + + /** + * Trigger a manual pull check for one provider. Returns immediately with the + * pull-check identifier; the actual pull request and polling run in the + * background. Used by `/api/dev/pull`. + */ + async triggerManualPullCheck(spAddress: string): Promise { + const saved = await this.createPullCheckRecord(spAddress); + // createPullCheckRecord already validated that providerInfo is non-null with an id. 
+ const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + if (!providerInfo || providerInfo.id == null) { + throw new NotFoundException(`Storage provider disappeared during pull-check setup: ${spAddress}`); + } this.logger.log({ event: "pull_check_manual_triggered", From 720e2275cd22df7b6fb82ec90ddd3383e1040f16 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Sat, 2 May 2026 13:53:57 +0530 Subject: [PATCH 04/44] chore: remove pull check from dev-tools --- .../src/dev-tools/dev-tools.controller.ts | 53 -------------- .../backend/src/dev-tools/dev-tools.module.ts | 3 +- .../src/dev-tools/dev-tools.service.ts | 71 ------------------ .../dev-tools/dto/trigger-pull-check.dto.ts | 73 ------------------- 4 files changed, 1 insertion(+), 199 deletions(-) delete mode 100644 apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts diff --git a/apps/backend/src/dev-tools/dev-tools.controller.ts b/apps/backend/src/dev-tools/dev-tools.controller.ts index 10a3baf8..7ae09d0e 100644 --- a/apps/backend/src/dev-tools/dev-tools.controller.ts +++ b/apps/backend/src/dev-tools/dev-tools.controller.ts @@ -2,11 +2,6 @@ import { Controller, Get, Logger, Param, Query, UsePipes, ValidationPipe } from import { ApiOperation, ApiQuery, ApiResponse, ApiTags } from "@nestjs/swagger"; import { DevToolsService } from "./dev-tools.service.js"; import { TriggerDealQueryDto, TriggerDealResponseDto } from "./dto/trigger-deal.dto.js"; -import { - PullCheckStatusResponseDto, - TriggerPullCheckQueryDto, - TriggerPullCheckResponseDto, -} from "./dto/trigger-pull-check.dto.js"; import { TriggerRetrievalQueryDto, TriggerRetrievalResponseDto } from "./dto/trigger-retrieval.dto.js"; @ApiTags("Dev Tools") @@ -126,52 +121,4 @@ export class DevToolsController { }); return this.devToolsService.triggerRetrieval(query.dealId, query.spAddress); } - - @Get("pull") - @ApiOperation({ - summary: "Trigger a manual SP pull check (returns immediately, processing in background)", - }) - @ApiQuery({ - name: 
"spAddress", - required: true, - description: "Storage provider address", - example: "0x1234567890abcdef1234567890abcdef12345678", - }) - @ApiResponse({ - status: 200, - description: "Pull check accepted - use /api/dev/pulls/:pullCheckId to check progress", - type: TriggerPullCheckResponseDto, - }) - @ApiResponse({ status: 400, description: "Storage provider is not eligible for pull checks" }) - @ApiResponse({ status: 404, description: "Storage provider not found" }) - @UsePipes(new ValidationPipe({ transform: true })) - async triggerPullCheck(@Query() query: TriggerPullCheckQueryDto): Promise { - this.logger.log({ - event: "api_request", - message: "GET /api/dev/pull", - endpoint: "/api/dev/pull", - method: "GET", - spAddress: query.spAddress, - }); - return this.devToolsService.triggerPullCheck(query.spAddress); - } - - @Get("pulls/:pullCheckId") - @ApiOperation({ summary: "Get pull-check status by ID" }) - @ApiResponse({ - status: 200, - description: "Pull check status", - type: PullCheckStatusResponseDto, - }) - @ApiResponse({ status: 404, description: "Pull check not found" }) - async getPullCheck(@Param("pullCheckId") pullCheckId: string): Promise { - this.logger.log({ - event: "api_request", - message: "GET /api/dev/pulls/:pullCheckId", - endpoint: "/api/dev/pulls/:pullCheckId", - method: "GET", - pullCheckId, - }); - return this.devToolsService.getPullCheck(pullCheckId); - } } diff --git a/apps/backend/src/dev-tools/dev-tools.module.ts b/apps/backend/src/dev-tools/dev-tools.module.ts index f54dcdae..30db84f6 100644 --- a/apps/backend/src/dev-tools/dev-tools.module.ts +++ b/apps/backend/src/dev-tools/dev-tools.module.ts @@ -2,14 +2,13 @@ import { Module } from "@nestjs/common"; import { TypeOrmModule } from "@nestjs/typeorm"; import { Deal } from "../database/entities/deal.entity.js"; import { DealModule } from "../deal/deal.module.js"; -import { PullCheckModule } from "../pull-check/pull-check.module.js"; import { RetrievalModule } from 
"../retrieval/retrieval.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { DevToolsController } from "./dev-tools.controller.js"; import { DevToolsService } from "./dev-tools.service.js"; @Module({ - imports: [TypeOrmModule.forFeature([Deal]), WalletSdkModule, DealModule, RetrievalModule, PullCheckModule], + imports: [TypeOrmModule.forFeature([Deal]), WalletSdkModule, DealModule, RetrievalModule], controllers: [DevToolsController], providers: [DevToolsService], }) diff --git a/apps/backend/src/dev-tools/dev-tools.service.ts b/apps/backend/src/dev-tools/dev-tools.service.ts index 124b2248..8b08e046 100644 --- a/apps/backend/src/dev-tools/dev-tools.service.ts +++ b/apps/backend/src/dev-tools/dev-tools.service.ts @@ -3,14 +3,11 @@ import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; import { type DealLogContext, toStructuredError } from "../common/logging.js"; import { Deal } from "../database/entities/deal.entity.js"; -import type { PullCheck } from "../database/entities/pull-check.entity.js"; import { DealStatus, RetrievalStatus } from "../database/types.js"; import { DealService } from "../deal/deal.service.js"; -import { PullCheckService } from "../pull-check/pull-check.service.js"; import { RetrievalService } from "../retrieval/retrieval.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { TriggerDealResponseDto } from "./dto/trigger-deal.dto.js"; -import type { PullCheckStatusResponseDto, TriggerPullCheckResponseDto } from "./dto/trigger-pull-check.dto.js"; import type { RetrievalMethodResultDto, TriggerRetrievalResponseDto } from "./dto/trigger-retrieval.dto.js"; @Injectable() @@ -21,7 +18,6 @@ export class DevToolsService { private readonly walletSdkService: WalletSdkService, private readonly dealService: DealService, private readonly retrievalService: RetrievalService, - private readonly pullCheckService: PullCheckService, 
@InjectRepository(Deal) private readonly dealRepository: Repository, ) {} @@ -297,73 +293,6 @@ export class DevToolsService { }; } - /** - * Trigger a manual SP pull check. Returns immediately with the pull-check ID; - * the actual pull request and polling run in the background, isolated from - * the existing direct-upload data-storage check. - */ - async triggerPullCheck(spAddress: string): Promise { - this.logger.log({ - event: "pull_check_trigger_requested", - message: "Triggering manual pull check for storage provider", - spAddress, - }); - - const record = await this.pullCheckService.triggerManualPullCheck(spAddress); - return this.pullCheckToTriggerDto(record); - } - - /** - * Get current pull-check state by id. Surfaces SP-reported pull status, - * verification status, and computed latencies so operators can review - * outcomes without inspecting database rows directly. - */ - async getPullCheck(pullCheckId: string): Promise { - const record = await this.pullCheckService.getPullCheck(pullCheckId); - return this.pullCheckToStatusDto(record); - } - - private pullCheckToTriggerDto(record: PullCheck): TriggerPullCheckResponseDto { - return { - id: record.id, - spAddress: record.spAddress, - pieceCid: record.pieceCid, - status: record.status, - sourceUrl: record.sourceUrl, - createdAt: record.createdAt, - }; - } - - private pullCheckToStatusDto(record: PullCheck): PullCheckStatusResponseDto { - const requestStartedAt = record.requestStartedAt ?? undefined; - const completedAt = record.completedAt ?? undefined; - const requestCompletedAt = record.requestCompletedAt ?? undefined; - const requestLatencyMs = - record.requestStartedAt && record.requestCompletedAt - ? record.requestCompletedAt.getTime() - record.requestStartedAt.getTime() - : undefined; - const completionLatencyMs = - record.requestStartedAt && record.completedAt - ? 
record.completedAt.getTime() - record.requestStartedAt.getTime() - : undefined; - void requestCompletedAt; - return { - id: record.id, - spAddress: record.spAddress, - pieceCid: record.pieceCid, - status: record.status, - providerStatus: record.providerStatus ?? undefined, - verificationStatus: record.verificationStatus ?? undefined, - requestLatencyMs, - completionLatencyMs, - failureReason: record.failureReason ?? undefined, - errorMessage: record.errorMessage ?? undefined, - sourceUrl: record.sourceUrl, - requestStartedAt, - completedAt, - }; - } - /** * Find a deal by ID or most recent deal for an SP */ diff --git a/apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts b/apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts deleted file mode 100644 index 8d2bf609..00000000 --- a/apps/backend/src/dev-tools/dto/trigger-pull-check.dto.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { ApiProperty } from "@nestjs/swagger"; -import { IsNotEmpty, IsString } from "class-validator"; - -export class TriggerPullCheckQueryDto { - @ApiProperty({ - description: "Storage provider address to run the pull check against", - example: "0x1234567890abcdef1234567890abcdef12345678", - }) - @IsString() - @IsNotEmpty() - spAddress: string; -} - -export class TriggerPullCheckResponseDto { - @ApiProperty({ description: "Pull check identifier" }) - id: string; - - @ApiProperty({ description: "Storage provider address" }) - spAddress: string; - - @ApiProperty({ description: "Hosted piece CID for this pull check" }) - pieceCid: string; - - @ApiProperty({ description: "Pull-check lifecycle status" }) - status: string; - - @ApiProperty({ description: "Hosted piece source URL the SP must pull from" }) - sourceUrl: string; - - @ApiProperty({ description: "Pull-check creation timestamp" }) - createdAt: Date; -} - -export class PullCheckStatusResponseDto { - @ApiProperty({ description: "Pull check identifier" }) - id: string; - - @ApiProperty({ description: "Storage provider address" }) - 
spAddress: string; - - @ApiProperty({ description: "Hosted piece CID" }) - pieceCid: string; - - @ApiProperty({ description: "Pull-check lifecycle status" }) - status: string; - - @ApiProperty({ description: "Latest provider-reported pull status", required: false }) - providerStatus?: string; - - @ApiProperty({ description: "Verification status, when applicable", required: false }) - verificationStatus?: string; - - @ApiProperty({ description: "Time from request submission to SP acknowledgement (ms)", required: false }) - requestLatencyMs?: number; - - @ApiProperty({ description: "Time from request submission to terminal SP status (ms)", required: false }) - completionLatencyMs?: number; - - @ApiProperty({ description: "Failure reason, when applicable", required: false }) - failureReason?: string; - - @ApiProperty({ description: "Underlying error message, when applicable", required: false }) - errorMessage?: string; - - @ApiProperty({ description: "Hosted piece source URL the SP was asked to pull from" }) - sourceUrl: string; - - @ApiProperty({ description: "Time at which DealBot started the pull request", required: false }) - requestStartedAt?: Date; - - @ApiProperty({ description: "Time at which DealBot reached a terminal pull state", required: false }) - completedAt?: Date; -} From a98e8e0bb2ddc3592f736e2cf167416ea0107109 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Sat, 2 May 2026 13:57:29 +0530 Subject: [PATCH 05/44] remove pull_checks table from db --- apps/backend/src/database/database.module.ts | 9 +- .../database/entities/pull-check.entity.ts | 81 -------- .../1780000000000-CreatePullChecks.ts | 77 ------- apps/backend/src/database/types.ts | 17 -- apps/backend/src/jobs/jobs.service.ts | 3 +- .../src/pull-check/hosted-piece.registry.ts | 1 - .../src/pull-check/pull-check.module.ts | 4 +- .../src/pull-check/pull-check.service.ts | 190 ++---------------- .../src/pull-check/pull-check.types.ts | 1 - 9 files changed, 25 insertions(+), 358 deletions(-) 
delete mode 100644 apps/backend/src/database/entities/pull-check.entity.ts delete mode 100644 apps/backend/src/database/migrations/1780000000000-CreatePullChecks.ts diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index 2e8ddf72..9249c3a9 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -10,7 +10,6 @@ import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config. import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; -import { PullCheck } from "./entities/pull-check.entity.js"; import { Retrieval } from "./entities/retrieval.entity.js"; import { StorageProvider } from "./entities/storage-provider.entity.js"; @@ -50,7 +49,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { - await queryRunner.query(` - DO $$ - BEGIN - IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'pull_checks_status_enum') THEN - CREATE TYPE "pull_checks_status_enum" AS ENUM ( - 'pending', - 'requesting', - 'polling', - 'verifying', - 'success', - 'failed', - 'timed_out' - ); - END IF; - IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'pull_checks_verification_status_enum') THEN - CREATE TYPE "pull_checks_verification_status_enum" AS ENUM ( - 'pending', - 'passed', - 'failed', - 'skipped' - ); - END IF; - END$$; - `); - - await queryRunner.query(` - CREATE TABLE IF NOT EXISTS pull_checks ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - sp_address VARCHAR NOT NULL, - piece_cid VARCHAR NOT NULL, - source_url TEXT NOT NULL, - request_id VARCHAR NULL, - status pull_checks_status_enum NOT NULL DEFAULT 'pending', - provider_status VARCHAR NULL, - failure_reason TEXT NULL, - request_started_at TIMESTAMPTZ NULL, - request_completed_at TIMESTAMPTZ NULL, - completed_at TIMESTAMPTZ 
NULL, - verification_status pull_checks_verification_status_enum NULL, - verification_completed_at TIMESTAMPTZ NULL, - verification_message TEXT NULL, - hosted_piece_expires_at TIMESTAMPTZ NOT NULL, - hosted_piece_cleaned_up_at TIMESTAMPTZ NULL, - error_code VARCHAR NULL, - error_message TEXT NULL, - retry_count INTEGER NOT NULL DEFAULT 0, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() - ) - `); - - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS idx_pull_checks_sp_address ON pull_checks (sp_address) - `); - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS idx_pull_checks_status ON pull_checks (status) - `); - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS idx_pull_checks_created_at ON pull_checks (created_at) - `); - } - - public async down(queryRunner: QueryRunner): Promise { - await queryRunner.query(`DROP INDEX IF EXISTS idx_pull_checks_created_at`); - await queryRunner.query(`DROP INDEX IF EXISTS idx_pull_checks_status`); - await queryRunner.query(`DROP INDEX IF EXISTS idx_pull_checks_sp_address`); - await queryRunner.query(`DROP TABLE IF EXISTS pull_checks`); - await queryRunner.query(`DROP TYPE IF EXISTS pull_checks_verification_status_enum`); - await queryRunner.query(`DROP TYPE IF EXISTS pull_checks_status_enum`); - } -} diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index b7f9559c..46fd5d28 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,23 +28,6 @@ export enum IpniStatus { FAILED = "failed", } -export enum PullCheckStatus { - PENDING = "pending", - REQUESTING = "requesting", - POLLING = "polling", - VERIFYING = "verifying", - SUCCESS = "success", - FAILED = "failed", - TIMED_OUT = "timed_out", -} - -export enum PullVerificationStatus { - PENDING = "pending", - PASSED = "passed", - FAILED = "failed", - SKIPPED = "skipped", -} - /** * Metadata schema for deal storage and retrieval */ diff --git 
a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index 94b6df0b..b929405e 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -694,8 +694,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { return "success"; } try { - const record = await this.pullCheckService.createPullCheckRecord(spAddress); - await this.pullCheckService.runPullCheck(record.id, abortController.signal, logContext); + await this.pullCheckService.runPullCheck(spAddress, abortController.signal, logContext); return "success"; } catch (error) { if (abortController.signal.aborted) { diff --git a/apps/backend/src/pull-check/hosted-piece.registry.ts b/apps/backend/src/pull-check/hosted-piece.registry.ts index aa4ca9e0..1adc729c 100644 --- a/apps/backend/src/pull-check/hosted-piece.registry.ts +++ b/apps/backend/src/pull-check/hosted-piece.registry.ts @@ -19,7 +19,6 @@ export class HostedPieceRegistry { event: "hosted_piece_registered", message: "Registered hosted piece source", pieceCid: registration.pieceCid, - pullCheckId: registration.pullCheckId, expiresAt: registration.expiresAt.toISOString(), byteLength: registration.byteLength, }); diff --git a/apps/backend/src/pull-check/pull-check.module.ts b/apps/backend/src/pull-check/pull-check.module.ts index 74972846..ba317cbb 100644 --- a/apps/backend/src/pull-check/pull-check.module.ts +++ b/apps/backend/src/pull-check/pull-check.module.ts @@ -1,7 +1,5 @@ import { Module } from "@nestjs/common"; -import { TypeOrmModule } from "@nestjs/typeorm"; import { DatabaseModule } from "../database/database.module.js"; -import { PullCheck } from "../database/entities/pull-check.entity.js"; import { DataSourceModule } from "../dataSource/dataSource.module.js"; import { DealModule } from "../deal/deal.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; @@ -10,7 +8,7 @@ import { PieceSourceController } from "./piece-source.controller.js"; 
import { PullCheckService } from "./pull-check.service.js"; @Module({ - imports: [DatabaseModule, TypeOrmModule.forFeature([PullCheck]), WalletSdkModule, DataSourceModule, DealModule], + imports: [DatabaseModule, WalletSdkModule, DataSourceModule, DealModule], controllers: [PieceSourceController], providers: [PullCheckService, HostedPieceRegistry], exports: [PullCheckService, HostedPieceRegistry], diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 96b864dd..0085b9e8 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -4,25 +4,18 @@ import { calculate, parse as parsePieceCid } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; import { getDataSet } from "@filoz/synapse-core/warm-storage"; import { METADATA_KEYS, Synapse } from "@filoz/synapse-sdk"; -import { BadRequestException, Injectable, Logger, NotFoundException } from "@nestjs/common"; +import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; -import { InjectRepository } from "@nestjs/typeorm"; -import type { Repository } from "typeorm"; import type { Account, Address, Chain, Client, Transport } from "viem"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; import { createSynapseFromConfig } from "../common/synapse-factory.js"; import type { IAppConfig, IBlockchainConfig, IConfig, IDatasetConfig, IJobsConfig } from "../config/app.config.js"; -import { PullCheck } from "../database/entities/pull-check.entity.js"; -import { PullCheckStatus, PullVerificationStatus } from "../database/types.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; import { DealService } from "../deal/deal.service.js"; -import { - buildCheckMetricLabels, - type CheckMetricLabels, - classifyFailureStatus, -} from 
"../metrics-prometheus/check-metric-labels.js"; +import { buildCheckMetricLabels, classifyFailureStatus } from "../metrics-prometheus/check-metric-labels.js"; import { PullCheckCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import { PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; import { HostedPieceRegistry } from "./hosted-piece.registry.js"; import type { HostedPiecePrepared } from "./pull-check.types.js"; @@ -36,8 +29,6 @@ export class PullCheckService { constructor( private readonly configService: ConfigService, - @InjectRepository(PullCheck) - private readonly pullCheckRepository: Repository, private readonly walletSdkService: WalletSdkService, private readonly dataSourceService: DataSourceService, private readonly hostedPieceRegistry: HostedPieceRegistry, @@ -64,98 +55,22 @@ export class PullCheckService { /** * Create a pending pull-check record after validating provider eligibility. 
*/ - async createPullCheckRecord(spAddress: string): Promise { + validateProviderInfo(spAddress: string): PDPProviderEx { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); if (!providerInfo) { - throw new NotFoundException(`Storage provider not found: ${spAddress}`); + throw new Error(`Storage provider not found: ${spAddress}`); } if (!providerInfo.isActive) { - throw new BadRequestException(`Storage provider is not active: ${spAddress}`); + throw new Error(`Storage provider is not active: ${spAddress}`); } if (providerInfo.id == null) { - throw new BadRequestException(`Storage provider is missing providerId: ${spAddress}`); + throw new Error(`Storage provider is missing providerId: ${spAddress}`); } if (!providerInfo.pdp.serviceURL) { - throw new BadRequestException(`Storage provider is missing serviceURL: ${spAddress}`); + throw new Error(`Storage provider is missing serviceURL: ${spAddress}`); } - const ttlSeconds = this.getJobsConfig().pullCheckHostedPieceTtlSeconds; - const pending = this.pullCheckRepository.create({ - spAddress, - pieceCid: "pending", - sourceUrl: "pending", - status: PullCheckStatus.PENDING, - hostedPieceExpiresAt: new Date(Date.now() + ttlSeconds * 1000), - }); - return this.pullCheckRepository.save(pending); - } - - /** - * Trigger a manual pull check for one provider. Returns immediately with the - * pull-check identifier; the actual pull request and polling run in the - * background. Used by `/api/dev/pull`. - */ - async triggerManualPullCheck(spAddress: string): Promise { - const saved = await this.createPullCheckRecord(spAddress); - // createPullCheckRecord already validated that providerInfo is non-null with an id. 
- const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - if (!providerInfo || providerInfo.id == null) { - throw new NotFoundException(`Storage provider disappeared during pull-check setup: ${spAddress}`); - } - - this.logger.log({ - event: "pull_check_manual_triggered", - message: "Manual pull check requested", - pullCheckId: saved.id, - spAddress, - providerId: providerInfo.id, - providerName: providerInfo.name, - }); - - const logContext: ProviderJobContext = { - jobId: "dev_tools_manual_pull", - providerAddress: spAddress, - providerId: providerInfo.id, - providerName: providerInfo.name, - }; - - // Fire-and-forget: orchestrate in the background so the API responds fast. - void this.runPullCheckSafe(saved.id, undefined, logContext); - - return saved; - } - - /** - * Resolve a pull check by id. - */ - async getPullCheck(pullCheckId: string): Promise { - const record = await this.pullCheckRepository.findOne({ where: { id: pullCheckId } }); - if (!record) { - throw new NotFoundException(`Pull check not found: ${pullCheckId}`); - } - return record; - } - - /** - * Wraps `runPullCheck` so callers (manual trigger, scheduler) get consistent - * background error handling without unhandled promise rejections. - */ - async runPullCheckSafe( - pullCheckId: string, - signal: AbortSignal | undefined, - logContext: ProviderJobContext, - ): Promise { - try { - await this.runPullCheck(pullCheckId, signal, logContext); - } catch (error) { - this.logger.error({ - ...logContext, - pullCheckId, - event: "pull_check_unhandled_error", - message: "Unhandled pull check execution error", - error: toStructuredError(error), - }); - } + return providerInfo; } /** @@ -163,19 +78,11 @@ export class PullCheckService { * prepare hosted piece -> submit pull -> poll terminal status -> verify. 
*/ async runPullCheck( - pullCheckId: string, + spAddress: string, signal: AbortSignal | undefined, logContext: ProviderJobContext, - ): Promise { - let record = await this.getPullCheck(pullCheckId); - const providerInfo = this.walletSdkService.getProviderInfo(record.spAddress); - if (!providerInfo || providerInfo.id == null || !providerInfo.pdp.serviceURL) { - record.status = PullCheckStatus.FAILED; - record.failureReason = "provider_not_eligible"; - record.errorMessage = `Provider ${record.spAddress} not eligible for pull check`; - record.completedAt = new Date(); - return this.pullCheckRepository.save(record); - } + ): Promise { + const providerInfo = this.validateProviderInfo(spAddress); const labels = buildCheckMetricLabels({ checkType: "pullCheck", @@ -190,13 +97,7 @@ export class PullCheckService { try { signal?.throwIfAborted(); - prepared = await this.prepareHostedPiece(record); - record.pieceCid = prepared.registration.pieceCid; - record.sourceUrl = prepared.sourceUrl; - record.hostedPieceExpiresAt = prepared.registration.expiresAt; - record.status = PullCheckStatus.REQUESTING; - record.requestStartedAt = new Date(); - record = await this.pullCheckRepository.save(record); + prepared = await this.prepareHostedPiece(); const synapseClient = this.requireSynapseClient(); const synapse = this.sharedSynapse ?? (await this.createSynapseInstance()); @@ -206,7 +107,8 @@ export class PullCheckService { }); const dataSetId = storage.dataSetId; const clientDataSetId = dataSetId ? 
(await getDataSet(synapseClient, { dataSetId }))?.clientDataSetId : undefined; - const pieceCidParsed = parsePieceCid(record.pieceCid); + const pieceCidStr = prepared.registration.pieceCid; + const pieceCidParsed = parsePieceCid(pieceCidStr); const payee = providerInfo.payee as Address; const serviceURL = providerInfo.pdp.serviceURL; @@ -224,10 +126,6 @@ export class PullCheckService { requestSubmittedAt = new Date(); const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); const requestCompletedAt = new Date(); - record.requestCompletedAt = requestCompletedAt; - record.providerStatus = pullResponse.status; - record.status = PullCheckStatus.POLLING; - record = await this.pullCheckRepository.save(record); this.pullCheckMetrics.observeRequestLatencyMs( labels, @@ -236,10 +134,9 @@ export class PullCheckService { this.pullCheckMetrics.recordProviderStatus(labels, pullResponse.status); this.logger.log({ ...logContext, - pullCheckId, event: "pull_check_request_submitted", message: "Pull request submitted to provider", - pieceCid: record.pieceCid, + pieceCid: pieceCidStr, providerStatus: pullResponse.status, }); @@ -252,7 +149,6 @@ export class PullCheckService { this.pullCheckMetrics.recordProviderStatus(labels, response.status); this.logger.debug({ ...logContext, - pullCheckId, event: "pull_check_status_observed", message: "Observed pull status", providerStatus: response.status, @@ -272,30 +168,15 @@ export class PullCheckService { const allComplete = pieceResults.every((p: { status: string }) => p.status === "complete"); const completedAt = new Date(); - record.providerStatus = finalResponse.status; - record.completedAt = completedAt; if (allComplete) { - record.status = PullCheckStatus.VERIFYING; - // First-slice verification: rely on the SP's own confirmation of the parked - // piece. Future iterations should add an explicit byte/proof check. 
- record.verificationStatus = PullVerificationStatus.SKIPPED; - record.verificationCompletedAt = completedAt; - record.verificationMessage = "First-slice verification deferred: SP terminal status accepted"; - record.status = PullCheckStatus.SUCCESS; this.pullCheckMetrics.recordStatus(labels, "success"); } else { - record.status = PullCheckStatus.FAILED; - record.failureReason = "provider_reported_failure"; - record.verificationStatus = PullVerificationStatus.FAILED; - record.verificationCompletedAt = completedAt; this.pullCheckMetrics.recordStatus(labels, "failure.other"); } this.pullCheckMetrics.observeCompletionLatencyMs(labels, completedAt.getTime() - requestSubmittedAt.getTime()); - record = await this.pullCheckRepository.save(record); - const commitResult = await storage.commit({ pieces: pullPiecesOptions.pieces.map((pullPiece) => ({ pieceCid: pullPiece.pieceCid, @@ -310,36 +191,22 @@ export class PullCheckService { message: "Pull check commit result", commitResult, }); - - return record; } catch (error) { const failureClass = classifyFailureStatus(error); const completedAt = new Date(); - record.completedAt = completedAt; - if (failureClass === "failure.timedout") { - record.status = PullCheckStatus.TIMED_OUT; - record.failureReason = "pull_check_timeout"; - } else { - record.status = PullCheckStatus.FAILED; - record.failureReason = "pull_check_error"; - } - record.errorMessage = error instanceof Error ? error.message : String(error); - record.errorCode = (error as { code?: string }).code ?? 
null; this.pullCheckMetrics.recordStatus(labels, failureClass); if (requestSubmittedAt) { this.pullCheckMetrics.observeCompletionLatencyMs(labels, completedAt.getTime() - requestSubmittedAt.getTime()); } this.logger.error({ ...logContext, - pullCheckId, event: "pull_check_failed", message: "Pull check failed", error: toStructuredError(error), }); - return this.pullCheckRepository.save(record); } finally { if (prepared) { - await this.cleanupHostedPiece(prepared.registration.pieceCid, pullCheckId); + await this.cleanupHostedPiece(prepared.registration.pieceCid); } } } @@ -348,7 +215,7 @@ export class PullCheckService { * Generate a synthetic test piece, compute its piece CID, register it for * `/api/piece/:pieceCid` serving, and return the source URL plus registration. */ - async prepareHostedPiece(record: PullCheck): Promise { + async prepareHostedPiece(): Promise { const jobsConfig = this.getJobsConfig(); const datasetConfig = this.configService.get("dataset"); const targetSize = jobsConfig.pullCheckPieceSizeBytes; @@ -369,7 +236,6 @@ export class PullCheckService { fileName: dataFile.name, byteLength: dataFile.size, contentType: "application/octet-stream", - pullCheckId: record.id, expiresAt, cleanedUp: false, }; @@ -382,7 +248,7 @@ export class PullCheckService { * Mark the hosted piece as cleaned up and remove the on-disk artifact. Safe * to call multiple times. 
*/ - async cleanupHostedPiece(pieceCid: string, pullCheckId: string | null): Promise { + async cleanupHostedPiece(pieceCid: string): Promise { const entry = this.hostedPieceRegistry.resolveAny(pieceCid); if (entry && !entry.cleanedUp) { this.hostedPieceRegistry.markCleanedUp(pieceCid); @@ -393,31 +259,13 @@ export class PullCheckService { event: "pull_check_cleanup_warn", message: "Failed to cleanup hosted piece artifact", pieceCid, - pullCheckId, error: toStructuredError(error), }); } } - if (pullCheckId) { - const record = await this.pullCheckRepository.findOne({ where: { id: pullCheckId } }); - if (record && record.hostedPieceCleanedUpAt == null) { - record.hostedPieceCleanedUpAt = new Date(); - await this.pullCheckRepository.save(record); - } - } this.hostedPieceRegistry.forget(pieceCid); } - buildLabelsForPullCheck(record: PullCheck): CheckMetricLabels { - const providerInfo = this.walletSdkService.getProviderInfo(record.spAddress); - return buildCheckMetricLabels({ - checkType: "pullCheck", - providerId: providerInfo?.id ?? null, - providerName: providerInfo?.name ?? null, - providerIsApproved: providerInfo?.isApproved ?? 
null, - }); - } - private getJobsConfig(): IJobsConfig { return this.configService.get("jobs", { infer: true }); } diff --git a/apps/backend/src/pull-check/pull-check.types.ts b/apps/backend/src/pull-check/pull-check.types.ts index 1ccef6fe..dd802e27 100644 --- a/apps/backend/src/pull-check/pull-check.types.ts +++ b/apps/backend/src/pull-check/pull-check.types.ts @@ -8,7 +8,6 @@ export type HostedPieceRegistration = { fileName: string; byteLength: number; contentType: string; - pullCheckId: string; expiresAt: Date; cleanedUp: boolean; }; From a3c7aec3c0a8d4f59018c0b2e50dda97023eca6c Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Sun, 3 May 2026 01:16:45 +0530 Subject: [PATCH 06/44] refactor: add piece validation and improve logging --- .../src/pull-check/hosted-piece.registry.ts | 4 +- .../src/pull-check/pull-check.module.ts | 3 +- .../src/pull-check/pull-check.service.ts | 195 +++++++++++------- 3 files changed, 119 insertions(+), 83 deletions(-) diff --git a/apps/backend/src/pull-check/hosted-piece.registry.ts b/apps/backend/src/pull-check/hosted-piece.registry.ts index 1adc729c..dbf66f1a 100644 --- a/apps/backend/src/pull-check/hosted-piece.registry.ts +++ b/apps/backend/src/pull-check/hosted-piece.registry.ts @@ -15,7 +15,7 @@ export class HostedPieceRegistry { register(registration: HostedPieceRegistration): void { this.entries.set(registration.pieceCid, registration); - this.logger.log({ + this.logger.debug({ event: "hosted_piece_registered", message: "Registered hosted piece source", pieceCid: registration.pieceCid, @@ -48,7 +48,7 @@ export class HostedPieceRegistry { const entry = this.entries.get(pieceCid); if (!entry) return; entry.cleanedUp = true; - this.logger.log({ + this.logger.debug({ event: "hosted_piece_cleaned_up", message: "Marked hosted piece source as cleaned up", pieceCid, diff --git a/apps/backend/src/pull-check/pull-check.module.ts b/apps/backend/src/pull-check/pull-check.module.ts index ba317cbb..fd7b2d56 100644 --- 
a/apps/backend/src/pull-check/pull-check.module.ts +++ b/apps/backend/src/pull-check/pull-check.module.ts @@ -2,13 +2,14 @@ import { Module } from "@nestjs/common"; import { DatabaseModule } from "../database/database.module.js"; import { DataSourceModule } from "../dataSource/dataSource.module.js"; import { DealModule } from "../deal/deal.module.js"; +import { HttpClientModule } from "../http-client/http-client.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { HostedPieceRegistry } from "./hosted-piece.registry.js"; import { PieceSourceController } from "./piece-source.controller.js"; import { PullCheckService } from "./pull-check.service.js"; @Module({ - imports: [DatabaseModule, WalletSdkModule, DataSourceModule, DealModule], + imports: [DatabaseModule, WalletSdkModule, DataSourceModule, DealModule, HttpClientModule], controllers: [PieceSourceController], providers: [PullCheckService, HostedPieceRegistry], exports: [PullCheckService, HostedPieceRegistry], diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 0085b9e8..d0ac5d26 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -3,7 +3,7 @@ import * as path from "node:path"; import { calculate, parse as parsePieceCid } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; import { getDataSet } from "@filoz/synapse-core/warm-storage"; -import { METADATA_KEYS, Synapse } from "@filoz/synapse-sdk"; +import { Synapse } from "@filoz/synapse-sdk"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import type { Account, Address, Chain, Client, Transport } from "viem"; @@ -12,6 +12,7 @@ import { createSynapseFromConfig } from "../common/synapse-factory.js"; import type { IAppConfig, IBlockchainConfig, IConfig, IDatasetConfig, IJobsConfig } from 
"../config/app.config.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; import { DealService } from "../deal/deal.service.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; import { buildCheckMetricLabels, classifyFailureStatus } from "../metrics-prometheus/check-metric-labels.js"; import { PullCheckCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; @@ -34,16 +35,17 @@ export class PullCheckService { private readonly hostedPieceRegistry: HostedPieceRegistry, private readonly pullCheckMetrics: PullCheckCheckMetrics, private readonly dealService: DealService, + private readonly httpClientService: HttpClientService, ) { this.blockchainConfig = this.configService.get("blockchain", { infer: true }); } async onModuleInit() { - this.logger.log({ - event: "synapse_initialization", - message: "Creating shared Synapse instance", - }); this.sharedSynapse = await this.createSynapseInstance(); + this.logger.debug({ + event: "pull_check_synapse_ready", + message: "Pull-check Synapse instance initialized", + }); } async onModuleDestroy(): Promise { @@ -53,7 +55,9 @@ export class PullCheckService { } /** - * Create a pending pull-check record after validating provider eligibility. + * Resolve and validate provider eligibility for a pull check. Throws when + * the provider is unknown, inactive, missing a numeric provider id, or + * missing a PDP serviceURL. Returns the enriched provider info on success. */ validateProviderInfo(spAddress: string): PDPProviderEx { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); @@ -75,7 +79,16 @@ export class PullCheckService { /** * Drive one pull check through its full lifecycle: - * prepare hosted piece -> submit pull -> poll terminal status -> verify. 
+ * prepare hosted piece -> submit pull -> poll terminal SP status + * -> commit on dataset -> direct `/piece/:cid` validation -> cleanup. + * + * Failure metric + cleanup are owned here; failure logging is owned by the + * caller (jobs handler) so we do not double-log. Errors are re-thrown so the + * scheduler can distinguish `aborted` vs `failed` job outcomes. + * + * NOTE: Pull-check committed pieces are not tracked in the `deal` table, so + * `piece_cleanup` will not garbage-collect them. They will accrue on the SP + * unless explicitly removed. */ async runPullCheck( spAddress: string, @@ -83,7 +96,6 @@ export class PullCheckService { logContext: ProviderJobContext, ): Promise { const providerInfo = this.validateProviderInfo(spAddress); - const labels = buildCheckMetricLabels({ checkType: "pullCheck", providerId: providerInfo.id, @@ -98,6 +110,8 @@ export class PullCheckService { try { signal?.throwIfAborted(); prepared = await this.prepareHostedPiece(); + const pieceCidStr = prepared.registration.pieceCid; + const pieceCidParsed = parsePieceCid(pieceCidStr); const synapseClient = this.requireSynapseClient(); const synapse = this.sharedSynapse ?? (await this.createSynapseInstance()); @@ -105,34 +119,27 @@ export class PullCheckService { providerId: providerInfo.id, metadata: this.dealService.getBaseDataSetMetadata(), }); + + // Resolve pull options for either the existing-dataset or new-dataset SP + // pull pathway. `pullPieces` requires both dataSetId and clientDataSetId + // when targeting an existing dataset; if either is unavailable we treat + // the request as new-dataset and rely on the signed CreateDataSetAndAddPieces. const dataSetId = storage.dataSetId; const clientDataSetId = dataSetId ? 
(await getDataSet(synapseClient, { dataSetId }))?.clientDataSetId : undefined; - const pieceCidStr = prepared.registration.pieceCid; - const pieceCidParsed = parsePieceCid(pieceCidStr); const payee = providerInfo.payee as Address; const serviceURL = providerInfo.pdp.serviceURL; - const pullPiecesOptions = { serviceURL, - pieces: [ - { - pieceCid: pieceCidParsed, - sourceUrl: prepared.sourceUrl, - }, - ], + pieces: [{ pieceCid: pieceCidParsed, sourceUrl: prepared.sourceUrl }], ...(dataSetId && clientDataSetId ? { dataSetId, clientDataSetId } : { payee }), signal, }; + requestSubmittedAt = new Date(); const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); - const requestCompletedAt = new Date(); - - this.pullCheckMetrics.observeRequestLatencyMs( - labels, - requestCompletedAt.getTime() - requestSubmittedAt.getTime(), - ); - this.pullCheckMetrics.recordProviderStatus(labels, pullResponse.status); - this.logger.log({ + signal?.throwIfAborted(); + this.pullCheckMetrics.observeRequestLatencyMs(labels, Date.now() - requestSubmittedAt.getTime()); + this.logger.debug({ ...logContext, event: "pull_check_request_submitted", message: "Pull request submitted to provider", @@ -141,69 +148,58 @@ export class PullCheckService { }); const jobsConfig = this.getJobsConfig(); - const waitForPullPiecesOptions = { + // `waitForPullPieces` polls the SP repeatedly until a terminal status is + // reported. Intentionally no `onStatus` hook: `pullCheckProviderStatus` + // is a counter and we only want to increment it once per check, at the + // terminal SP status (below). Per-poll increments would inflate the + // counter by the number of polls and break its rate-based semantics. 
+ const finalResponse = await waitForPullPieces(synapseClient, { ...pullPiecesOptions, timeout: jobsConfig.pullCheckJobTimeoutSeconds * 1000, pollInterval: jobsConfig.pullCheckPollIntervalSeconds * 1000, - onStatus: (response) => { - this.pullCheckMetrics.recordProviderStatus(labels, response.status); - this.logger.debug({ - ...logContext, - event: "pull_check_status_observed", - message: "Observed pull status", - providerStatus: response.status, - }); - }, - }; - const finalResponse = await waitForPullPieces(synapseClient, waitForPullPiecesOptions); - - const pieceResults = finalResponse.pieces.map((piece: { pieceCid: string; status: string }) => { - const pieceCid = pullPiecesOptions.pieces.find((p) => p.toString() === piece.pieceCid); - return { - pieceCid: pieceCid?.pieceCid || piece.pieceCid, - status: piece.status === "complete" ? ("complete" as const) : ("failed" as const), - }; }); + signal?.throwIfAborted(); + this.pullCheckMetrics.observeCompletionLatencyMs(labels, Date.now() - requestSubmittedAt.getTime()); + // Record the SP-reported terminal pull status (one increment per check) + // regardless of outcome so both `complete` and `failed` are observable. 
+ this.pullCheckMetrics.recordProviderStatus(labels, finalResponse.status); - const allComplete = pieceResults.every((p: { status: string }) => p.status === "complete"); - - const completedAt = new Date(); - - if (allComplete) { - this.pullCheckMetrics.recordStatus(labels, "success"); - } else { - this.pullCheckMetrics.recordStatus(labels, "failure.other"); + if (finalResponse.status !== "complete") { + throw new Error(`Storage provider failed to pull piece: status=${finalResponse.status}`); } - this.pullCheckMetrics.observeCompletionLatencyMs(labels, completedAt.getTime() - requestSubmittedAt.getTime()); - + // `pullPieces` already signed AddPieces / CreateDataSetAndAddPieces, but + // SDK convention is to also call `storage.commit` so the on-chain add is + // confirmed and the dataset state is observable to the client. We omit + // pieceMetadata: `IPFS_ROOT_CID` is meaningless for synthetic pull-check + // pieces and would corrupt downstream IPNI advertising. const commitResult = await storage.commit({ - pieces: pullPiecesOptions.pieces.map((pullPiece) => ({ - pieceCid: pullPiece.pieceCid, - pieceMetadata: { - [METADATA_KEYS.IPFS_ROOT_CID]: pullPiece.pieceCid.toString(), - }, - })), + pieces: pullPiecesOptions.pieces.map((p) => ({ pieceCid: p.pieceCid })), }); - - this.logger.log({ - event: "pull_check_commit_result", - message: "Pull check commit result", - commitResult, + signal?.throwIfAborted(); + this.logger.debug({ + ...logContext, + event: "pull_check_committed", + message: "Pull-check piece committed to dataset", + pieceCid: pieceCidStr, + dataSetId: commitResult.dataSetId.toString(), + pieceIds: commitResult.pieceIds.map((id) => id.toString()), + txHash: commitResult.txHash, }); + + const pieceValidated = await this.validateByDirectPieceFetch(providerInfo, pieceCidStr, logContext, signal); + signal?.throwIfAborted(); + if (!pieceValidated) { + throw new Error("Pull-check piece validation failed: SP did not serve the expected bytes"); + } + + 
this.pullCheckMetrics.recordStatus(labels, "success"); } catch (error) { - const failureClass = classifyFailureStatus(error); - const completedAt = new Date(); - this.pullCheckMetrics.recordStatus(labels, failureClass); + this.pullCheckMetrics.recordStatus(labels, classifyFailureStatus(error)); if (requestSubmittedAt) { - this.pullCheckMetrics.observeCompletionLatencyMs(labels, completedAt.getTime() - requestSubmittedAt.getTime()); + this.pullCheckMetrics.observeCompletionLatencyMs(labels, Date.now() - requestSubmittedAt.getTime()); } - this.logger.error({ - ...logContext, - event: "pull_check_failed", - message: "Pull check failed", - error: toStructuredError(error), - }); + throw error; } finally { if (prepared) { await this.cleanupHostedPiece(prepared.registration.pieceCid); @@ -211,6 +207,45 @@ export class PullCheckService { } } + /** + * Validate that the SP serves the just-pulled piece end-to-end by fetching + * `/piece/:pieceCid` from its PDP service URL and recomputing the piece CID + * over the response body. Returns `false` (rather than throwing) so the + * caller can record a domain-specific failure status; abort signals still + * propagate as throws. + */ + async validateByDirectPieceFetch( + providerInfo: PDPProviderEx, + pieceCid: string, + logContext: ProviderJobContext, + signal?: AbortSignal, + ): Promise { + signal?.throwIfAborted(); + const pieceFetchUrl = this.constructPieceFetchUrl(providerInfo.pdp.serviceURL, pieceCid); + try { + const response = await this.httpClientService.requestWithMetrics(pieceFetchUrl, { signal }); + const calculatedPieceCid = calculate(response.data); + return calculatedPieceCid.toString() === pieceCid; + } catch (error) { + // Re-throw aborts so the caller's lifecycle handles cancellation rather + // than treating it as a validation failure. 
+ if (signal?.aborted) throw error; + this.logger.warn({ + ...logContext, + event: "pull_check_direct_piece_fetch_failed", + message: "Direct piece fetch failed during pull-check validation", + pieceCid, + pieceFetchUrl, + error: toStructuredError(error), + }); + return false; + } + } + + private constructPieceFetchUrl(baseUrl: string, pieceCid: string): string { + return `${baseUrl.replace(/\/$/, "")}/piece/${pieceCid}`; + } + /** * Generate a synthetic test piece, compute its piece CID, register it for * `/api/piece/:pieceCid` serving, and return the source URL plus registration. @@ -288,17 +323,17 @@ export class PullCheckService { try { const { synapse, isSessionKeyMode } = await createSynapseFromConfig(this.blockchainConfig); if (isSessionKeyMode) { - this.logger.log({ - event: "synapse_session_key_init", - message: "Initializing Synapse with session key", + this.logger.debug({ + event: "pull_check_synapse_session_key_init", + message: "Pull-check Synapse initialized with session key", walletAddress: this.blockchainConfig.walletAddress, }); } return synapse; } catch (error) { this.logger.error({ - event: "synapse_init_failed", - message: "Failed to initialize Synapse for deal job", + event: "pull_check_synapse_init_failed", + message: "Failed to initialize Synapse for pull-check service", error: toStructuredError(error), }); throw error; From 2875737f546a267942ecd40eefe055370f16067c Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Sun, 3 May 2026 01:30:43 +0530 Subject: [PATCH 07/44] fix: logging --- .../src/pull-check/pull-check.service.ts | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index d0ac5d26..06f435b6 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -42,7 +42,7 @@ export class PullCheckService { async onModuleInit() { this.sharedSynapse 
= await this.createSynapseInstance(); - this.logger.debug({ + this.logger.log({ event: "pull_check_synapse_ready", message: "Pull-check Synapse instance initialized", }); @@ -138,13 +138,15 @@ export class PullCheckService { requestSubmittedAt = new Date(); const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); signal?.throwIfAborted(); - this.pullCheckMetrics.observeRequestLatencyMs(labels, Date.now() - requestSubmittedAt.getTime()); - this.logger.debug({ + const requestLatencyMs = Date.now() - requestSubmittedAt.getTime(); + this.pullCheckMetrics.observeRequestLatencyMs(labels, requestLatencyMs); + this.logger.log({ ...logContext, - event: "pull_check_request_submitted", + event: "pull_request_submitted", message: "Pull request submitted to provider", pieceCid: pieceCidStr, providerStatus: pullResponse.status, + requestLatencyMs, }); const jobsConfig = this.getJobsConfig(); @@ -159,7 +161,8 @@ export class PullCheckService { pollInterval: jobsConfig.pullCheckPollIntervalSeconds * 1000, }); signal?.throwIfAborted(); - this.pullCheckMetrics.observeCompletionLatencyMs(labels, Date.now() - requestSubmittedAt.getTime()); + const completionLatencyMs = Date.now() - requestSubmittedAt.getTime(); + this.pullCheckMetrics.observeCompletionLatencyMs(labels, completionLatencyMs); // Record the SP-reported terminal pull status (one increment per check) // regardless of outcome so both `complete` and `failed` are observable. 
this.pullCheckMetrics.recordProviderStatus(labels, finalResponse.status); @@ -177,15 +180,6 @@ export class PullCheckService { pieces: pullPiecesOptions.pieces.map((p) => ({ pieceCid: p.pieceCid })), }); signal?.throwIfAborted(); - this.logger.debug({ - ...logContext, - event: "pull_check_committed", - message: "Pull-check piece committed to dataset", - pieceCid: pieceCidStr, - dataSetId: commitResult.dataSetId.toString(), - pieceIds: commitResult.pieceIds.map((id) => id.toString()), - txHash: commitResult.txHash, - }); const pieceValidated = await this.validateByDirectPieceFetch(providerInfo, pieceCidStr, logContext, signal); signal?.throwIfAborted(); @@ -194,6 +188,17 @@ export class PullCheckService { } this.pullCheckMetrics.recordStatus(labels, "success"); + this.logger.log({ + ...logContext, + event: "pull_check_completed", + message: "Pull check completed", + pieceCid: pieceCidStr, + dataSetId: commitResult.dataSetId.toString(), + pieceIds: commitResult.pieceIds.map((id) => id.toString()), + txHash: commitResult.txHash, + requestLatencyMs, + completionLatencyMs, + }); } catch (error) { this.pullCheckMetrics.recordStatus(labels, classifyFailureStatus(error)); if (requestSubmittedAt) { @@ -323,7 +328,7 @@ export class PullCheckService { try { const { synapse, isSessionKeyMode } = await createSynapseFromConfig(this.blockchainConfig); if (isSessionKeyMode) { - this.logger.debug({ + this.logger.log({ event: "pull_check_synapse_session_key_init", message: "Pull-check Synapse initialized with session key", walletAddress: this.blockchainConfig.walletAddress, From 07d21b16b7dfd3f025bee7be30803422c5df851d Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 4 May 2026 13:33:11 +0530 Subject: [PATCH 08/44] fix: prometheus metric --- apps/backend/src/pull-check/pull-check.service.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 
06f435b6..ed973220 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -102,7 +102,6 @@ export class PullCheckService { providerName: providerInfo.name, providerIsApproved: providerInfo.isApproved, }); - this.pullCheckMetrics.recordStatus(labels, "pending"); let prepared: HostedPiecePrepared | null = null; let requestSubmittedAt: Date | null = null; @@ -145,7 +144,7 @@ export class PullCheckService { event: "pull_request_submitted", message: "Pull request submitted to provider", pieceCid: pieceCidStr, - providerStatus: pullResponse.status, + pullProviderStatus: pullResponse.status, requestLatencyMs, }); From 9e5723578f6668f09f88d5ccf886eb69a86c4cf2 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 4 May 2026 15:17:50 +0530 Subject: [PATCH 09/44] feat: add more prometheus metrics --- .../check-metrics.service.ts | 12 +++++++ .../metrics-prometheus.module.ts | 14 +++++++- .../src/pull-check/hosted-piece.registry.ts | 22 ++++++++++++ .../src/pull-check/piece-source.controller.ts | 4 +++ .../src/pull-check/pull-check.service.ts | 36 +++++++++++-------- .../src/pull-check/pull-check.types.ts | 2 ++ 6 files changed, 74 insertions(+), 16 deletions(-) diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 7f5df0e1..929d2263 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -260,6 +260,10 @@ export class PullCheckCheckMetrics { private readonly pullCheckStatusCounter: Counter, @InjectMetric("pullCheckProviderStatus") private readonly pullCheckProviderStatusCounter: Counter, + @InjectMetric("pullCheckFirstByteMs") + private readonly pullCheckFirstByteMs: Histogram, + @InjectMetric("pullCheckThroughputBps") + private readonly pullCheckThroughputBps: Histogram, ) {} observeRequestLatencyMs(labels: CheckMetricLabels, value: 
number | null | undefined): void { @@ -277,4 +281,12 @@ export class PullCheckCheckMetrics { recordProviderStatus(labels: CheckMetricLabels, value: string): void { this.pullCheckProviderStatusCounter.inc({ ...labels, value }); } + + observeFirstByteMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullCheckFirstByteMs, labels, value); + } + + observeThroughputBps(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullCheckThroughputBps, labels, value); + } } diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index d276aafb..1273b279 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -212,7 +212,7 @@ const metricProviders = [ }), makeCounterProvider({ name: "pullCheckStatus", - help: "Pull-check terminal status counts (success | failure.timedout | failure.other | pending)", + help: "Pull-check terminal status counts (success | failure.timedout | failure.other)", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), makeCounterProvider({ @@ -220,6 +220,18 @@ const metricProviders = [ help: "Raw SP-reported pull statuses observed by DealBot during polling", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), + makeHistogramProvider({ + name: "pullCheckFirstByteMs", + help: "Time from pullPieces submission to the SP reading the first byte of the hosted-piece stream (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000], + }), + makeHistogramProvider({ + name: "pullCheckThroughputBps", + help: "Pull-check throughput approximated as pieceSize / completionLatency in bytes per second", + 
labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: throughputBuckets, + }), // Data Retention Metrics makeCounterProvider({ name: "dataSetChallengeStatus", diff --git a/apps/backend/src/pull-check/hosted-piece.registry.ts b/apps/backend/src/pull-check/hosted-piece.registry.ts index dbf66f1a..202ecf80 100644 --- a/apps/backend/src/pull-check/hosted-piece.registry.ts +++ b/apps/backend/src/pull-check/hosted-piece.registry.ts @@ -55,6 +55,28 @@ export class HostedPieceRegistry { }); } + /** + * Record the wall-clock time at which the `pullPieces` request was sent to + * the SP. Idempotent: only the first call wins so that retried checks against + * the same hosted piece do not skew first-byte measurements. + */ + markPullSubmitted(pieceCid: string, at: Date): void { + const entry = this.entries.get(pieceCid); + if (!entry || entry.pullSubmittedAt) return; + entry.pullSubmittedAt = at; + } + + /** + * Record the wall-clock time at which the SP read the first byte of the + * hosted-piece stream. Idempotent: only the first read wins so that an SP + * issuing retries after a failed connection does not overwrite the timestamp. 
+ */ + markFirstByte(pieceCid: string, at: Date): void { + const entry = this.entries.get(pieceCid); + if (!entry || entry.firstByteAt) return; + entry.firstByteAt = at; + } + forget(pieceCid: string): void { this.entries.delete(pieceCid); } diff --git a/apps/backend/src/pull-check/piece-source.controller.ts b/apps/backend/src/pull-check/piece-source.controller.ts index aaf29758..92596af4 100644 --- a/apps/backend/src/pull-check/piece-source.controller.ts +++ b/apps/backend/src/pull-check/piece-source.controller.ts @@ -74,6 +74,10 @@ export class PieceSourceController { } res.destroy(error); }); + // Capture the first-byte timestamp before piping + stream.once("data", () => { + this.hostedPieceRegistry.markFirstByte(pieceCid, new Date()); + }); stream.pipe(res); } } diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index ed973220..738fe4c9 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -82,10 +82,6 @@ export class PullCheckService { * prepare hosted piece -> submit pull -> poll terminal SP status * -> commit on dataset -> direct `/piece/:cid` validation -> cleanup. * - * Failure metric + cleanup are owned here; failure logging is owned by the - * caller (jobs handler) so we do not double-log. Errors are re-thrown so the - * scheduler can distinguish `aborted` vs `failed` job outcomes. - * * NOTE: Pull-check committed pieces are not tracked in the `deal` table, so * `piece_cleanup` will not garbage-collect them. They will accrue on the SP * unless explicitly removed. 
@@ -135,6 +131,7 @@ export class PullCheckService { }; requestSubmittedAt = new Date(); + this.hostedPieceRegistry.markPullSubmitted(pieceCidStr, requestSubmittedAt); const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); signal?.throwIfAborted(); const requestLatencyMs = Date.now() - requestSubmittedAt.getTime(); @@ -149,11 +146,7 @@ export class PullCheckService { }); const jobsConfig = this.getJobsConfig(); - // `waitForPullPieces` polls the SP repeatedly until a terminal status is - // reported. Intentionally no `onStatus` hook: `pullCheckProviderStatus` - // is a counter and we only want to increment it once per check, at the - // terminal SP status (below). Per-poll increments would inflate the - // counter by the number of polls and break its rate-based semantics. + // `waitForPullPieces` polls the SP repeatedly until a terminal pull status is reported const finalResponse = await waitForPullPieces(synapseClient, { ...pullPiecesOptions, timeout: jobsConfig.pullCheckJobTimeoutSeconds * 1000, @@ -163,18 +156,14 @@ export class PullCheckService { const completionLatencyMs = Date.now() - requestSubmittedAt.getTime(); this.pullCheckMetrics.observeCompletionLatencyMs(labels, completionLatencyMs); // Record the SP-reported terminal pull status (one increment per check) - // regardless of outcome so both `complete` and `failed` are observable. this.pullCheckMetrics.recordProviderStatus(labels, finalResponse.status); if (finalResponse.status !== "complete") { throw new Error(`Storage provider failed to pull piece: status=${finalResponse.status}`); } - // `pullPieces` already signed AddPieces / CreateDataSetAndAddPieces, but - // SDK convention is to also call `storage.commit` so the on-chain add is - // confirmed and the dataset state is observable to the client. We omit - // pieceMetadata: `IPFS_ROOT_CID` is meaningless for synthetic pull-check - // pieces and would corrupt downstream IPNI advertising. 
+ // We omit pieceMetadata: `IPFS_ROOT_CID` is meaningless for synthetic + // pull-check pieces and would corrupt downstream IPNI advertising. const commitResult = await storage.commit({ pieces: pullPiecesOptions.pieces.map((p) => ({ pieceCid: p.pieceCid })), }); @@ -186,6 +175,20 @@ export class PullCheckService { throw new Error("Pull-check piece validation failed: SP did not serve the expected bytes"); } + const firstByteEntry = this.hostedPieceRegistry.resolveAny(pieceCidStr); + const firstByteMs = + firstByteEntry?.firstByteAt && firstByteEntry?.pullSubmittedAt + ? firstByteEntry.firstByteAt.getTime() - firstByteEntry.pullSubmittedAt.getTime() + : null; + if (firstByteMs != null) { + this.pullCheckMetrics.observeFirstByteMs(labels, firstByteMs); + } + // Throughput approximated as pieceSize / completionLatency. This is an + // upper-bound on actual transfer time because completionLatency includes + // SP-side scheduling/queuing and our polling cadence. + const throughputBps = Math.round((prepared.registration.byteLength * 1000) / Math.max(completionLatencyMs, 1)); + this.pullCheckMetrics.observeThroughputBps(labels, throughputBps); + this.pullCheckMetrics.recordStatus(labels, "success"); this.logger.log({ ...logContext, @@ -197,6 +200,9 @@ export class PullCheckService { txHash: commitResult.txHash, requestLatencyMs, completionLatencyMs, + firstByteMs, + throughputBps, + pieceSizeBytes: prepared.registration.byteLength, }); } catch (error) { this.pullCheckMetrics.recordStatus(labels, classifyFailureStatus(error)); diff --git a/apps/backend/src/pull-check/pull-check.types.ts b/apps/backend/src/pull-check/pull-check.types.ts index dd802e27..c8caf447 100644 --- a/apps/backend/src/pull-check/pull-check.types.ts +++ b/apps/backend/src/pull-check/pull-check.types.ts @@ -10,6 +10,8 @@ export type HostedPieceRegistration = { contentType: string; expiresAt: Date; cleanedUp: boolean; + pullSubmittedAt?: Date; + firstByteAt?: Date; }; /** From 
f9b0e8d72d012fa92ed483f751212728f89e4e85 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 4 May 2026 23:31:39 +0530 Subject: [PATCH 10/44] add tests --- .../pull-check/hosted-piece.registry.spec.ts | 162 ++++++ .../piece-source.controller.spec.ts | 173 +++++++ .../src/pull-check/pull-check.service.spec.ts | 488 ++++++++++++++++++ 3 files changed, 823 insertions(+) create mode 100644 apps/backend/src/pull-check/hosted-piece.registry.spec.ts create mode 100644 apps/backend/src/pull-check/piece-source.controller.spec.ts create mode 100644 apps/backend/src/pull-check/pull-check.service.spec.ts diff --git a/apps/backend/src/pull-check/hosted-piece.registry.spec.ts b/apps/backend/src/pull-check/hosted-piece.registry.spec.ts new file mode 100644 index 00000000..a8273227 --- /dev/null +++ b/apps/backend/src/pull-check/hosted-piece.registry.spec.ts @@ -0,0 +1,162 @@ +import { describe, expect, it } from "vitest"; +import { HostedPieceRegistry } from "./hosted-piece.registry.js"; +import type { HostedPieceRegistration } from "./pull-check.types.js"; + +function makeRegistration(overrides: Partial = {}): HostedPieceRegistration { + return { + pieceCid: "bafk-test", + filePath: "/tmp/datasets/test.bin", + fileName: "test.bin", + byteLength: 1024, + contentType: "application/octet-stream", + expiresAt: new Date(Date.now() + 60_000), + cleanedUp: false, + ...overrides, + }; +} + +describe("HostedPieceRegistry", () => { + describe("register / resolveActive / resolveAny", () => { + it("registers a piece and resolves it by CID", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration(); + + registry.register(registration); + + expect(registry.resolveActive(registration.pieceCid)).toBe(registration); + expect(registry.resolveAny(registration.pieceCid)).toBe(registration); + }); + + it("resolveActive returns null for unknown pieceCid", () => { + const registry = new HostedPieceRegistry(); + 
expect(registry.resolveActive("missing")).toBeNull(); + expect(registry.resolveAny("missing")).toBeNull(); + }); + + it("resolveActive returns null when the registration has been cleaned up", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration({ cleanedUp: true }); + registry.register(registration); + + expect(registry.resolveActive(registration.pieceCid)).toBeNull(); + // resolveAny still surfaces the cleaned-up entry so the controller can + // distinguish 410 Gone from 404 Not Found. + expect(registry.resolveAny(registration.pieceCid)).toBe(registration); + }); + + it("resolveActive returns null when the registration has expired", () => { + const registry = new HostedPieceRegistry(); + const expired = makeRegistration({ expiresAt: new Date(2000, 0, 1) }); + registry.register(expired); + + expect(registry.resolveActive(expired.pieceCid)).toBeNull(); + expect(registry.resolveAny(expired.pieceCid)).toBe(expired); + }); + + it("resolveActive treats expiresAt boundary as expired", () => { + const registry = new HostedPieceRegistry(); + const now = new Date("2030-01-01T00:00:00Z"); + const registration = makeRegistration({ expiresAt: now }); + registry.register(registration); + + expect(registry.resolveActive(registration.pieceCid, now)).toBeNull(); + }); + }); + + describe("markCleanedUp", () => { + it("marks the registration as cleaned up so resolveActive returns null", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration(); + registry.register(registration); + + registry.markCleanedUp(registration.pieceCid); + + expect(registration.cleanedUp).toBe(true); + expect(registry.resolveActive(registration.pieceCid)).toBeNull(); + }); + + it("is a no-op for unknown pieceCid", () => { + const registry = new HostedPieceRegistry(); + expect(() => registry.markCleanedUp("missing")).not.toThrow(); + }); + }); + + describe("markPullSubmitted", () => { + it("stamps the pullSubmittedAt timestamp on a 
registered piece", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration(); + registry.register(registration); + const submittedAt = new Date("2030-01-01T00:00:00Z"); + + registry.markPullSubmitted(registration.pieceCid, submittedAt); + + expect(registration.pullSubmittedAt).toBe(submittedAt); + }); + + it("is idempotent: only the first call wins so SP retries do not skew measurements", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration(); + registry.register(registration); + const first = new Date("2030-01-01T00:00:00Z"); + const second = new Date("2030-01-01T00:00:01Z"); + + registry.markPullSubmitted(registration.pieceCid, first); + registry.markPullSubmitted(registration.pieceCid, second); + + expect(registration.pullSubmittedAt).toBe(first); + }); + + it("is a no-op for unknown pieceCid", () => { + const registry = new HostedPieceRegistry(); + expect(() => registry.markPullSubmitted("missing", new Date())).not.toThrow(); + }); + }); + + describe("markFirstByte", () => { + it("stamps the firstByteAt timestamp on a registered piece", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration(); + registry.register(registration); + const firstByteAt = new Date("2030-01-01T00:00:00.500Z"); + + registry.markFirstByte(registration.pieceCid, firstByteAt); + + expect(registration.firstByteAt).toBe(firstByteAt); + }); + + it("is idempotent: only the first SP read wins", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration(); + registry.register(registration); + const first = new Date("2030-01-01T00:00:00.500Z"); + const second = new Date("2030-01-01T00:00:01.000Z"); + + registry.markFirstByte(registration.pieceCid, first); + registry.markFirstByte(registration.pieceCid, second); + + expect(registration.firstByteAt).toBe(first); + }); + + it("is a no-op for unknown pieceCid", () => { + const registry = new 
HostedPieceRegistry(); + expect(() => registry.markFirstByte("missing", new Date())).not.toThrow(); + }); + }); + + describe("forget", () => { + it("removes the registration entirely", () => { + const registry = new HostedPieceRegistry(); + const registration = makeRegistration(); + registry.register(registration); + + registry.forget(registration.pieceCid); + + expect(registry.resolveAny(registration.pieceCid)).toBeNull(); + }); + + it("is a no-op for unknown pieceCid", () => { + const registry = new HostedPieceRegistry(); + expect(() => registry.forget("missing")).not.toThrow(); + }); + }); +}); diff --git a/apps/backend/src/pull-check/piece-source.controller.spec.ts b/apps/backend/src/pull-check/piece-source.controller.spec.ts new file mode 100644 index 00000000..bebfe106 --- /dev/null +++ b/apps/backend/src/pull-check/piece-source.controller.spec.ts @@ -0,0 +1,173 @@ +import { Readable, Writable } from "node:stream"; +import { Test } from "@nestjs/testing"; +import type { Response } from "express"; +import { describe, expect, it, vi } from "vitest"; +import { HostedPieceRegistry } from "./hosted-piece.registry.js"; +import { PieceSourceController } from "./piece-source.controller.js"; +import { PullCheckService } from "./pull-check.service.js"; +import type { HostedPieceRegistration } from "./pull-check.types.js"; + +function makeRegistration(overrides: Partial = {}): HostedPieceRegistration { + return { + pieceCid: "bafk-test", + filePath: "/tmp/test.bin", + fileName: "test.bin", + byteLength: 4, + contentType: "application/octet-stream", + expiresAt: new Date(Date.now() + 60_000), + cleanedUp: false, + ...overrides, + }; +} + +/** + * Fake express `Response` that is also a `Writable`, so `stream.pipe(res)` + * works without a real HTTP layer. The controller only calls `setHeader`, + * `status`, `send`, and `destroy`; we spy on those and let pipe write into + * the sink to verify the body. 
+ */ +type FakeResponse = Writable & { + headersSent: boolean; + chunks: Buffer[]; + setHeader: ReturnType; + status: ReturnType; + send: ReturnType; +}; + +function makeResponse(): FakeResponse { + const chunks: Buffer[] = []; + const sink = new Writable({ + write(chunk, _encoding, cb) { + chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + cb(); + }, + }) as FakeResponse; + sink.headersSent = false; + sink.chunks = chunks; + sink.setHeader = vi.fn(); + sink.status = vi.fn().mockReturnValue(sink); + sink.send = vi.fn().mockReturnValue(sink); + return sink; +} + +function asResponse(res: FakeResponse): Response { + return res as unknown as Response; +} + +async function setup(opts: { + opened?: ReturnType; + knownEntry?: HostedPieceRegistration | null; +}) { + const pullCheckService = { + openHostedPieceStream: vi.fn().mockReturnValue(opts.opened ?? null), + }; + const hostedPieceRegistry = { + resolveAny: vi.fn().mockReturnValue(opts.knownEntry ?? null), + markFirstByte: vi.fn(), + }; + + const module = await Test.createTestingModule({ + controllers: [PieceSourceController], + providers: [ + { provide: PullCheckService, useValue: pullCheckService }, + { provide: HostedPieceRegistry, useValue: hostedPieceRegistry }, + ], + }).compile(); + + const controller = module.get(PieceSourceController); + return { controller, pullCheckService, hostedPieceRegistry }; +} + +describe("PieceSourceController", () => { + it("returns 404 when pieceCid is missing or empty", async () => { + const { controller } = await setup({}); + const res = makeResponse(); + + // servePiece throws a NestJS NotFoundException synchronously; it is not async. 
+ expect(() => controller.servePiece("", asResponse(res))).toThrow(/pieceCid is required/); + expect(() => controller.servePiece(" ", asResponse(res))).toThrow(/pieceCid is required/); + }); + + it("returns 404 when no registration exists for the pieceCid", async () => { + const { controller, pullCheckService, hostedPieceRegistry } = await setup({}); + const res = makeResponse(); + + controller.servePiece("bafk-unknown", asResponse(res)); + + expect(pullCheckService.openHostedPieceStream).toHaveBeenCalledWith("bafk-unknown"); + expect(hostedPieceRegistry.resolveAny).toHaveBeenCalledWith("bafk-unknown"); + expect(res.status).toHaveBeenCalledWith(404); + expect(res.send).toHaveBeenCalledWith("Hosted piece source not found"); + }); + + it("returns 410 when the registration exists but is no longer active", async () => { + const cleaned = makeRegistration({ cleanedUp: true }); + const { controller } = await setup({ opened: null, knownEntry: cleaned }); + const res = makeResponse(); + + controller.servePiece(cleaned.pieceCid, asResponse(res)); + + expect(res.status).toHaveBeenCalledWith(410); + expect(res.send).toHaveBeenCalledWith("Hosted piece source has expired or been cleaned up"); + }); + + it("streams the piece, sets headers, and marks first byte on the first chunk", async () => { + const registration = makeRegistration(); + const stream = Readable.from([Buffer.from("ABCD")]); + const { controller, hostedPieceRegistry } = await setup({ + opened: { registration, stream } as ReturnType, + }); + const res = makeResponse(); + const pipeSpy = vi.spyOn(stream, "pipe"); + + controller.servePiece(registration.pieceCid, asResponse(res)); + + expect(res.setHeader).toHaveBeenCalledWith("Content-Type", "application/octet-stream"); + expect(res.setHeader).toHaveBeenCalledWith("Content-Length", "4"); + expect(res.setHeader).toHaveBeenCalledWith("Cache-Control", "no-store"); + expect(res.setHeader).toHaveBeenCalledWith("X-Pull-Check-Piece-CID", registration.pieceCid); + 
expect(pipeSpy).toHaveBeenCalledTimes(1); + + // Wait for the stream to fully drain into our fake Writable sink. + await new Promise((resolve) => res.once("finish", resolve)); + + expect(hostedPieceRegistry.markFirstByte).toHaveBeenCalledTimes(1); + expect(hostedPieceRegistry.markFirstByte).toHaveBeenCalledWith(registration.pieceCid, expect.any(Date)); + expect(Buffer.concat(res.chunks).toString()).toBe("ABCD"); + }); + + it("sends a 500 response when the stream errors before headers are sent", () => { + const registration = makeRegistration(); + const stream = new Readable({ read() {} }); + const opened = { registration, stream } as ReturnType; + const res = makeResponse(); + + return setup({ opened }).then(({ controller }) => { + controller.servePiece(registration.pieceCid, asResponse(res)); + + stream.destroy(new Error("boom")); + stream.emit("error", new Error("boom")); + + expect(res.status).toHaveBeenCalledWith(500); + expect(res.send).toHaveBeenCalledWith("Failed to stream hosted piece"); + }); + }); + + it("destroys the response when the stream errors after headers are sent", async () => { + const registration = makeRegistration(); + const stream = new Readable({ read() {} }); + const opened = { registration, stream } as ReturnType; + const res = makeResponse(); + res.headersSent = true; + // Mock the real destroy to keep Writable from re-emitting the error as an + // unhandled event; we only need to assert the controller forwarded it. 
+ const destroySpy = vi.spyOn(res, "destroy").mockImplementation(() => res); + + const { controller } = await setup({ opened }); + controller.servePiece(registration.pieceCid, asResponse(res)); + const error = new Error("late-boom"); + stream.emit("error", error); + + expect(destroySpy).toHaveBeenCalledWith(error); + }); +}); diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts new file mode 100644 index 00000000..ac1b8d4f --- /dev/null +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -0,0 +1,488 @@ +import type { Synapse } from "@filoz/synapse-sdk"; +import { ConfigService } from "@nestjs/config"; +import { Test, type TestingModule } from "@nestjs/testing"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig } from "../config/app.config.js"; +import { DataSourceService } from "../dataSource/dataSource.service.js"; +import { DealService } from "../deal/deal.service.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; +import { PullCheckCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; +import { HostedPieceRegistry } from "./hosted-piece.registry.js"; +import { PullCheckService } from "./pull-check.service.js"; + +// `@filoz/synapse-core/piece` is mocked so that piece CIDs are deterministic +// strings rather than real CID objects, keeping the tests fast and isolated +// from the SDK's internal hashing. 
+vi.mock("@filoz/synapse-core/piece", () => ({ + parse: vi.fn((s: string) => ({ __parsed: s, toString: () => s })), + calculate: vi.fn(() => ({ toString: () => "bafk-test-piece" })), +})); + +vi.mock("@filoz/synapse-core/sp", () => ({ + pullPieces: vi.fn(), + waitForPullPieces: vi.fn(), +})); + +vi.mock("@filoz/synapse-core/warm-storage", () => ({ + getDataSet: vi.fn(), +})); + +// `createSynapseFromConfig` is invoked from `onModuleInit`; the tests do not +// run module init, but the import must resolve. +vi.mock("../common/synapse-factory.js", () => ({ + createSynapseFromConfig: vi.fn(), +})); + +import { calculate } from "@filoz/synapse-core/piece"; +import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; +import { getDataSet } from "@filoz/synapse-core/warm-storage"; + +function makeProvider(overrides: Partial = {}): PDPProviderEx { + return { + id: 42n, + name: "test-sp", + payee: "0xpayee", + isActive: true, + isApproved: true, + pdp: { + serviceURL: "https://sp.example/", + }, + ...overrides, + } as unknown as PDPProviderEx; +} + +describe("PullCheckService", () => { + let module: TestingModule; + let service: PullCheckService; + let walletSdkServiceMock: { getProviderInfo: ReturnType; getSynapseClient: ReturnType }; + let dataSourceServiceMock: { + generateRandomDataset: ReturnType; + cleanupRandomDataset: ReturnType; + }; + let registryMock: { + register: ReturnType; + resolveAny: ReturnType; + resolveActive: ReturnType; + markCleanedUp: ReturnType; + markPullSubmitted: ReturnType; + markFirstByte: ReturnType; + forget: ReturnType; + }; + let dealServiceMock: { getBaseDataSetMetadata: ReturnType }; + let httpClientServiceMock: { requestWithMetrics: ReturnType }; + let metricsMock: { + observeRequestLatencyMs: ReturnType; + observeCompletionLatencyMs: ReturnType; + recordStatus: ReturnType; + recordProviderStatus: ReturnType; + observeFirstByteMs: ReturnType; + observeThroughputBps: ReturnType; + }; + let configValues: Partial; + + 
beforeEach(async () => { + walletSdkServiceMock = { + getProviderInfo: vi.fn().mockReturnValue(makeProvider()), + getSynapseClient: vi.fn().mockReturnValue({}), + }; + dataSourceServiceMock = { + generateRandomDataset: vi.fn(), + cleanupRandomDataset: vi.fn(), + }; + registryMock = { + register: vi.fn(), + resolveAny: vi.fn().mockReturnValue(null), + resolveActive: vi.fn().mockReturnValue(null), + markCleanedUp: vi.fn(), + markPullSubmitted: vi.fn(), + markFirstByte: vi.fn(), + forget: vi.fn(), + }; + dealServiceMock = { + getBaseDataSetMetadata: vi.fn().mockReturnValue({}), + }; + httpClientServiceMock = { + requestWithMetrics: vi.fn(), + }; + metricsMock = { + observeRequestLatencyMs: vi.fn(), + observeCompletionLatencyMs: vi.fn(), + recordStatus: vi.fn(), + recordProviderStatus: vi.fn(), + observeFirstByteMs: vi.fn(), + observeThroughputBps: vi.fn(), + }; + + configValues = { + app: { host: "localhost", port: 3000, apiPublicUrl: "https://dealbot.example" } as IConfig["app"], + blockchain: { network: "calibration", walletAddress: "0xwallet" } as IConfig["blockchain"], + jobs: { + pullCheckJobTimeoutSeconds: 300, + pullCheckPollIntervalSeconds: 5, + pullCheckPieceSizeBytes: 1024, + pullCheckHostedPieceTtlSeconds: 600, + } as IConfig["jobs"], + dataset: { localDatasetsPath: "/tmp/datasets" } as IConfig["dataset"], + }; + + const configServiceMock = { + get: vi.fn((key: keyof IConfig) => configValues[key]), + }; + + module = await Test.createTestingModule({ + providers: [ + PullCheckService, + { provide: ConfigService, useValue: configServiceMock }, + { provide: WalletSdkService, useValue: walletSdkServiceMock }, + { provide: DataSourceService, useValue: dataSourceServiceMock }, + { provide: HostedPieceRegistry, useValue: registryMock }, + { provide: PullCheckCheckMetrics, useValue: metricsMock }, + { provide: DealService, useValue: dealServiceMock }, + { provide: HttpClientService, useValue: httpClientServiceMock }, + ], + }).compile(); + + service = 
module.get(PullCheckService); + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + describe("validateProviderInfo", () => { + it("returns the provider info on the happy path", () => { + const provider = makeProvider(); + walletSdkServiceMock.getProviderInfo.mockReturnValue(provider); + + expect(service.validateProviderInfo("0xsp")).toBe(provider); + }); + + it("throws when the provider is unknown", () => { + walletSdkServiceMock.getProviderInfo.mockReturnValue(undefined); + expect(() => service.validateProviderInfo("0xsp")).toThrow(/not found/); + }); + + it("throws when the provider is inactive", () => { + walletSdkServiceMock.getProviderInfo.mockReturnValue(makeProvider({ isActive: false })); + expect(() => service.validateProviderInfo("0xsp")).toThrow(/not active/); + }); + + it("throws when the provider is missing a numeric id", () => { + walletSdkServiceMock.getProviderInfo.mockReturnValue(makeProvider({ id: undefined as unknown as bigint })); + expect(() => service.validateProviderInfo("0xsp")).toThrow(/missing providerId/); + }); + + it("throws when the provider is missing a PDP serviceURL", () => { + walletSdkServiceMock.getProviderInfo.mockReturnValue( + makeProvider({ pdp: { serviceURL: "" } as PDPProviderEx["pdp"] }), + ); + expect(() => service.validateProviderInfo("0xsp")).toThrow(/missing serviceURL/); + }); + }); + + describe("prepareHostedPiece", () => { + it("generates a dataset, computes the piece CID, and registers the hosted piece", async () => { + dataSourceServiceMock.generateRandomDataset.mockResolvedValue({ + name: "test.bin", + data: Buffer.from("hello"), + size: 5, + }); + + const prepared = await service.prepareHostedPiece(); + + expect(dataSourceServiceMock.generateRandomDataset).toHaveBeenCalledWith(1024, 1024); + expect(calculate).toHaveBeenCalledTimes(1); + expect(prepared.registration.pieceCid).toBe("bafk-test-piece"); + expect(prepared.registration.fileName).toBe("test.bin"); + 
expect(prepared.registration.byteLength).toBe(5); + expect(prepared.sourceUrl).toBe("https://dealbot.example/api/piece/bafk-test-piece"); + expect(registryMock.register).toHaveBeenCalledWith(prepared.registration); + }); + + it("falls back to host:port when apiPublicUrl is not configured", async () => { + configValues.app = { host: "localhost", port: 3000 } as IConfig["app"]; + dataSourceServiceMock.generateRandomDataset.mockResolvedValue({ + name: "test.bin", + data: Buffer.from("hello"), + size: 5, + }); + + const prepared = await service.prepareHostedPiece(); + expect(prepared.sourceUrl).toBe("http://localhost:3000/api/piece/bafk-test-piece"); + }); + }); + + describe("cleanupHostedPiece", () => { + const baseEntry = { + pieceCid: "bafk-test-piece", + filePath: "/tmp/datasets/test.bin", + fileName: "test.bin", + byteLength: 5, + contentType: "application/octet-stream", + expiresAt: new Date(Date.now() + 60_000), + cleanedUp: false, + }; + + it("marks the registration cleaned up and removes the file", async () => { + registryMock.resolveAny.mockReturnValue({ ...baseEntry }); + + await service.cleanupHostedPiece(baseEntry.pieceCid); + + expect(registryMock.markCleanedUp).toHaveBeenCalledWith(baseEntry.pieceCid); + expect(dataSourceServiceMock.cleanupRandomDataset).toHaveBeenCalledWith(baseEntry.fileName); + expect(registryMock.forget).toHaveBeenCalledWith(baseEntry.pieceCid); + }); + + it("skips file cleanup when the registration is already cleaned up", async () => { + registryMock.resolveAny.mockReturnValue({ ...baseEntry, cleanedUp: true }); + + await service.cleanupHostedPiece(baseEntry.pieceCid); + + expect(registryMock.markCleanedUp).not.toHaveBeenCalled(); + expect(dataSourceServiceMock.cleanupRandomDataset).not.toHaveBeenCalled(); + expect(registryMock.forget).toHaveBeenCalledWith(baseEntry.pieceCid); + }); + + it("forgets the entry even when no registration exists", async () => { + registryMock.resolveAny.mockReturnValue(null); + + await 
service.cleanupHostedPiece("missing"); + + expect(registryMock.markCleanedUp).not.toHaveBeenCalled(); + expect(dataSourceServiceMock.cleanupRandomDataset).not.toHaveBeenCalled(); + expect(registryMock.forget).toHaveBeenCalledWith("missing"); + }); + + it("does not propagate cleanup errors so callers can rely on it in finally", async () => { + registryMock.resolveAny.mockReturnValue({ ...baseEntry }); + dataSourceServiceMock.cleanupRandomDataset.mockRejectedValue(new Error("disk full")); + + await expect(service.cleanupHostedPiece(baseEntry.pieceCid)).resolves.toBeUndefined(); + expect(registryMock.forget).toHaveBeenCalledWith(baseEntry.pieceCid); + }); + }); + + describe("validateByDirectPieceFetch", () => { + const provider = makeProvider(); + const logContext = { jobId: "job-1", providerAddress: "0xsp", providerId: 42n, providerName: "test-sp" }; + + it("returns true when the recomputed CID matches", async () => { + httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); + vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-test-piece" } as ReturnType<typeof calculate>); + + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); + expect(ok).toBe(true); + expect(httpClientServiceMock.requestWithMetrics).toHaveBeenCalledWith( + "https://sp.example/piece/bafk-test-piece", + expect.any(Object), + ); + }); + + it("returns false when the recomputed CID does not match", async () => { + httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); + vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-different" } as ReturnType<typeof calculate>); + + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); + expect(ok).toBe(false); + }); + + it("returns false on transport errors (caller branches on the boolean to record a domain failure)", async () => { + httpClientServiceMock.requestWithMetrics.mockRejectedValue(new 
Error("ECONNRESET")); + + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); + expect(ok).toBe(false); + }); + + it("re-throws when the abort signal fires so cancellation is not masked as validation failure", async () => { + const abort = new AbortController(); + httpClientServiceMock.requestWithMetrics.mockImplementation(async () => { + abort.abort(); + throw new Error("aborted"); + }); + + await expect( + service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext, abort.signal), + ).rejects.toThrow(); + }); + + it("strips a trailing slash from the SP serviceURL when constructing the fetch URL", async () => { + httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); + vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-test-piece" } as ReturnType<typeof calculate>); + + await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); + expect(httpClientServiceMock.requestWithMetrics).toHaveBeenCalledWith( + "https://sp.example/piece/bafk-test-piece", + expect.any(Object), + ); + }); + }); + + describe("runPullCheck", () => { + const logContext = { jobId: "job-1", providerAddress: "0xsp", providerId: 42n, providerName: "test-sp" }; + + function arrangeHappyPath() { + // Pre-stage a registration that prepareHostedPiece will install. 
+ const registration = { + pieceCid: "bafk-test-piece", + filePath: "/tmp/datasets/test.bin", + fileName: "test.bin", + byteLength: 1024, + contentType: "application/octet-stream", + expiresAt: new Date(Date.now() + 60_000), + cleanedUp: false, + pullSubmittedAt: new Date("2030-01-01T00:00:00Z"), + firstByteAt: new Date("2030-01-01T00:00:00.250Z"), + }; + dataSourceServiceMock.generateRandomDataset.mockResolvedValue({ + name: registration.fileName, + data: Buffer.alloc(registration.byteLength), + size: registration.byteLength, + }); + // After cleanup the resolveAny call returns the entry; before that the + // run reads it once to compute first-byte latency. Same shape suffices. + registryMock.resolveAny.mockReturnValue(registration); + + // Mock the synapse storage context returned by `synapse.storage.createContext`. + const commitResult = { + dataSetId: 7n, + pieceIds: [11n, 12n], + txHash: "0xtx", + }; + const storage = { + dataSetId: 7n, + commit: vi.fn().mockResolvedValue(commitResult), + }; + const sharedSynapse = { + storage: { createContext: vi.fn().mockResolvedValue(storage) }, + } as unknown as Synapse; + // The service caches sharedSynapse in onModuleInit; emulate that here. + (service as unknown as { sharedSynapse: Synapse }).sharedSynapse = sharedSynapse; + + vi.mocked(getDataSet).mockResolvedValue({ clientDataSetId: 99n } as unknown as Awaited< + ReturnType<typeof getDataSet> + >); + vi.mocked(pullPieces).mockResolvedValue({ status: "pending" } as unknown as Awaited< + ReturnType<typeof pullPieces> + >); + vi.mocked(waitForPullPieces).mockResolvedValue({ + status: "complete", + pieces: [{ pieceCid: "bafk-test-piece", status: "complete" }], + } as unknown as Awaited<ReturnType<typeof waitForPullPieces>>); + + // Direct-fetch validation succeeds. 
+ httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); + vi.mocked(calculate).mockReturnValue({ toString: () => "bafk-test-piece" } as ReturnType<typeof calculate>); + + return { registration, storage, commitResult }; + } + + it("runs the full lifecycle, observes all metrics, and records success", async () => { + const { registration, storage } = arrangeHappyPath(); + + await service.runPullCheck("0xsp", undefined, logContext); + + // Submit timestamp is stamped on the registration. + expect(registryMock.markPullSubmitted).toHaveBeenCalledWith(registration.pieceCid, expect.any(Date)); + // Latency histograms observed at least once each. + expect(metricsMock.observeRequestLatencyMs).toHaveBeenCalledTimes(1); + expect(metricsMock.observeCompletionLatencyMs).toHaveBeenCalledTimes(1); + // Terminal SP status recorded exactly once. + expect(metricsMock.recordProviderStatus).toHaveBeenCalledTimes(1); + expect(metricsMock.recordProviderStatus).toHaveBeenCalledWith(expect.any(Object), "complete"); + // Commit was invoked with no per-piece metadata. + expect(storage.commit).toHaveBeenCalledWith({ + pieces: [{ pieceCid: expect.any(Object) }], + }); + // First-byte and throughput observed since the registration carries + // pullSubmittedAt + firstByteAt and the path completed. + expect(metricsMock.observeFirstByteMs).toHaveBeenCalledTimes(1); + const firstByteMs = metricsMock.observeFirstByteMs.mock.calls[0][1] as number; + expect(firstByteMs).toBe(250); + expect(metricsMock.observeThroughputBps).toHaveBeenCalledTimes(1); + // Terminal aggregate status is success. + expect(metricsMock.recordStatus).toHaveBeenCalledWith(expect.any(Object), "success"); + // Cleanup ran exactly once. 
+ expect(registryMock.markCleanedUp).toHaveBeenCalledWith(registration.pieceCid); + expect(registryMock.forget).toHaveBeenCalledWith(registration.pieceCid); + }); + + it("does not observe firstByte when the SP never read from /api/piece (cached pull)", async () => { + const { registration } = arrangeHappyPath(); + // Simulate a cached pull: SP never fetched from us. + registryMock.resolveAny.mockReturnValue({ ...registration, firstByteAt: undefined }); + + await service.runPullCheck("0xsp", undefined, logContext); + + expect(metricsMock.observeFirstByteMs).not.toHaveBeenCalled(); + expect(metricsMock.observeThroughputBps).toHaveBeenCalledTimes(1); + expect(metricsMock.recordStatus).toHaveBeenCalledWith(expect.any(Object), "success"); + }); + + it("re-throws and records failure.other when the SP terminal status is not 'complete'", async () => { + arrangeHappyPath(); + vi.mocked(waitForPullPieces).mockResolvedValue({ + status: "failed", + pieces: [], + } as unknown as Awaited<ReturnType<typeof waitForPullPieces>>); + + await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow( + /Storage provider failed to pull piece/, + ); + + expect(metricsMock.recordProviderStatus).toHaveBeenCalledWith(expect.any(Object), "failed"); + expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.other"); + // Cleanup still runs in the finally block. + expect(registryMock.forget).toHaveBeenCalled(); + }); + + it("classifies timeouts as failure.timedout", async () => { + arrangeHappyPath(); + vi.mocked(waitForPullPieces).mockRejectedValue(new Error("polling timed out after 300s")); + + await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(); + expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.timedout"); + }); + + it("re-throws and runs cleanup when the validation step fails", async () => { + arrangeHappyPath(); + // Force validation mismatch by returning a different recomputed CID. 
+ vi.mocked(calculate) + .mockReturnValueOnce({ toString: () => "bafk-test-piece" } as ReturnType<typeof calculate>) // prepareHostedPiece + .mockReturnValueOnce({ toString: () => "bafk-mismatch" } as ReturnType<typeof calculate>); // validateByDirectPieceFetch + + await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(/validation failed/); + expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.other"); + expect(registryMock.forget).toHaveBeenCalled(); + }); + + it("re-throws when the abort signal fires before any work runs", async () => { + arrangeHappyPath(); + const controller = new AbortController(); + controller.abort(new Error("Pull check job timeout (300s) for 0xsp")); + + await expect(service.runPullCheck("0xsp", controller.signal, logContext)).rejects.toThrow(); + // No SP-side calls were issued. + expect(pullPieces).not.toHaveBeenCalled(); + expect(waitForPullPieces).not.toHaveBeenCalled(); + // Failure is classified as timed out (abort message contains "timeout"). 
+ expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.timedout"); + }); + + it("re-throws when the synapse client is unavailable", async () => { + arrangeHappyPath(); + walletSdkServiceMock.getSynapseClient.mockReturnValue(null); + + await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(/Synapse client unavailable/); + expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.other"); + }); + }); + + describe("openHostedPieceStream", () => { + it("returns null when no active registration exists", () => { + registryMock.resolveActive.mockReturnValue(null); + expect(service.openHostedPieceStream("missing")).toBeNull(); + }); + }); +}); From 079bc604d45d872a227fdf79e5d7454b1a8e25f5 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Tue, 5 May 2026 12:39:35 +0530 Subject: [PATCH 11/44] add docs --- docs/checks/README.md | 3 +- docs/checks/events-and-metrics.md | 44 ++++- ...-configuration-and-approval-methodology.md | 17 +- docs/checks/pull-check.md | 165 ++++++++++++++++++ docs/environment-variables.md | 118 ++++++++++++- docs/jobs.md | 9 +- docs/runbooks/jobs.md | 19 +- 7 files changed, 362 insertions(+), 13 deletions(-) create mode 100644 docs/checks/pull-check.md diff --git a/docs/checks/README.md b/docs/checks/README.md index 7563d06d..e7316484 100644 --- a/docs/checks/README.md +++ b/docs/checks/README.md @@ -5,6 +5,7 @@ The files are: - [data-storage.md](./data-storage.md): Defines the "data storage check" and how it is calculated. - [retrievals.md](./retrievals.md): Defines the "retrieval check" and how it is calculated. - [data-retention.md](./data-retention.md): Defines the "data retention check" and how it is calculated. +- [pull-check.md](./pull-check.md): Defines the "pull check" and how it is calculated. - [events-and-metrics.md](./events-and-metrics.md): Defines the events and metrics that are used to assess SP performance. 
@@ -14,7 +15,7 @@ DealBot creates synthetic traffic for SPs in the onchain SP registry and monitor ## Terminology ### Check -A "check" refers to a task type that dealbot performs on a SP. We currently have [Data Storage](./data-storage.md) and [Retrieval](./retrievals.md) checks. +A "check" refers to a task type that dealbot performs on a SP. We currently have [Data Storage](./data-storage.md), [Retrieval](./retrievals.md), [Data Retention](./data-retention.md), and [Pull](./pull-check.md) checks. ### Deal This is synonym for "Data Storage Check". This is covered in the [data-storage.md](./data-storage.md). diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 45c5423e..af1beae1 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -55,13 +55,49 @@ sequenceDiagram | `ipfsRetrievalLastByteReceived` | Last byte received from `/ipfs/{rootCid}`. | Data Storage, Retrieval |**TBD** | [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts) | | `ipfsRetrievalIntegrityChecked` | Retrieved content matches expected CID. | Data Storage, Retrieval | **TBD** | [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts) | +## Pull Check Event Model + +Below are the events for a [Pull Check](./pull-check.md). Pull checks reverse the data flow of the [Data Storage check](./data-storage.md): instead of dealbot uploading bytes, it asks the SP to pull bytes from a temporary `/api/piece/{pieceCid}` URL. 
+ +### Pull Check Event Timeline + +```mermaid +sequenceDiagram + autonumber + participant Dealbot + participant SP as PDP Storage Provider + participant RPC as Chain RPC Provider + + Dealbot->>Dealbot: hostedPieceRegistered + Dealbot->>SP: pullRequestSubmitted (pullPieces) + SP-->>Dealbot: pullRequestAcknowledged + SP-->>Dealbot: hostedPieceFirstByteRead + Dealbot->>SP: pullStatusPolled (waitForPullPieces, repeated) + SP-->>Dealbot: pullTerminalStatusReported + Dealbot->>RPC: pullCheckCommitted (storage.commit) + Dealbot->>SP: directPieceFetchStarted (/piece/{cid}) + SP-->>Dealbot: directPieceFetchCompleted + Dealbot-->>Dealbot: pullCheckIntegrityChecked +``` + +### Pull Check Event List + +| Event | Definition | Implemented | Source of truth | +|------|------------|:------:|-----------------| +| `pullRequestSubmitted` | Dealbot calls `pullPieces` against the SP for the registered piece CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestAcknowledged` | SP returns from `pullPieces` (success or non-terminal-failure). | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `hostedPieceFirstByteRead` | SP reads the first byte of `/api/piece/{pieceCid}` from dealbot. Recorded once per registration. | Yes | [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) | +| `pullTerminalStatusReported` | SP reports a terminal pull status (`complete`, `failed`, ...) via `waitForPullPieces`. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckCommitted` | Synapse `storage.commit()` succeeds for the pulled piece. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckIntegrityChecked` | Direct `/piece/{pieceCid}` fetch from the SP returns bytes whose recomputed pieceCid matches the expected CID. 
| Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | + ## Metrics * Many of the metrics below are derived from the [events above](#event-list). * They are exported via Prometheus. * All Prometheus/OpenTelemetry metrics have label/attributes for: - `network=calibration|mainnet` - - `checkType=dataStorage|retrieval|dataRetention|dataSetCreation` — attribute metrics to a particular check/job + - `checkType=dataStorage|retrieval|dataRetention|dataSetCreation|pullCheck` — attribute metrics to a particular check/job - `providerId` — attribute metrics to a particular SP - `providerName` — human-readable name of the SP (defaults to `"unknown"` when not available) - `providerStatus=approved|unapproved` — attribute metrics to only approved SPs for example @@ -87,6 +123,10 @@ sequenceDiagram | `dataStorageCheckMs` | Data Storage | [`uploadToSpStart`](#uploadToSpStart) | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Data Storage check | | | `retrievalCheckMs` | Retrieval | Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | +| `pullCheckRequestLatencyMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`pullRequestAcknowledged`](#pullRequestAcknowledged) | Time from `pullPieces` submission to SP request acknowledgement. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckCompletionLatencyMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`pullTerminalStatusReported`](#pullTerminalStatusReported) | Time from `pullPieces` submission to terminal SP pull status. 
Observed once on success and once on failure. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckFirstByteMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`hostedPieceFirstByteRead`](#hostedPieceFirstByteRead) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) | +| `pullCheckThroughputBps` | Pull | n/a | n/a | `(pieceSizeBytes / pullCheckCompletionLatencyMs) * 1000`. Lower bound on the actual transfer rate, because `pullCheckCompletionLatencyMs` includes SP-side scheduling and dealbot's polling cadence. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ### Status Count Related Metrics @@ -106,6 +146,8 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). 
| `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `pullCheckStatus` | Pull | When the [Pull Check](./pull-check.md) terminates (success after commit + direct piece validation, or any failure). Recorded exactly once per check. | `success`, `failure.timedout`, `failure.other`. Failure classification follows [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) (timeout-keyed errors → `failure.timedout`, everything else → `failure.other`). | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullPieces`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side commit/validation failures. 
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## ClickHouse Tables diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 6da7c92f..b33b444e 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -1,6 +1,6 @@ ## Purpose -This document outlines how dealbot is configured for production by the Filecoin Onchain Cloud working group, particularly for determining which SPs to approve for the "official" Filecoin Warm Storage Service contracts on calibration and mainnet. A reader, especially an SP seeking to be approved for paid FOC storage deals by default, should be able to read this document and understand how they are evaluated. While [data-storage.md](./data-storage.md), [retrievals.md](./retrievals.md), and [events-and-metrics.md](./events-and-metrics.md) discuss the dealbot checks and metrics in general, this document provides the context and background for how they are configured and used in production. +This document outlines how dealbot is configured for production by the Filecoin Onchain Cloud working group, particularly for determining which SPs to approve for the "official" Filecoin Warm Storage Service contracts on calibration and mainnet. A reader, especially an SP seeking to be approved for paid FOC storage deals by default, should be able to read this document and understand how they are evaluated. While [data-storage.md](./data-storage.md), [retrievals.md](./retrievals.md), [pull-check.md](./pull-check.md), and [events-and-metrics.md](./events-and-metrics.md) discuss the dealbot checks and metrics in general, this document provides the context and background for how they are configured and used in production. 
## Approval Acceptance Criteria @@ -60,6 +60,18 @@ Relevant parameters include: This minimum observed success rate threshold count is for having 95% confidence that the success rate is greater than 95%. See [How are data storage and retrieval check statistics/thresholds calculated?](#how-are-data-storage-and-retrieval-check-statisticsthresholds-calculated) for more details. +### Pull Check + +The [Pull Check](./pull-check.md) is **not** currently an approval criterion. It runs in production to collect operational data on the SP pull-to-park pathway (`pullPieces` request latency, terminal status mix, first-byte latency, throughput) but the resulting Prometheus metrics are not yet folded into an approval threshold. They may inform a future criterion once enough baseline data has accumulated. + +Pull check is enabled with the following parameters in production: + +| Parameter | Value | Notes | +|-----------|-------|-------| +| [`PULL_CHECKS_PER_SP_PER_HOUR`](../environment-variables.md#pull_checks_per_sp_per_hour) | 1 | 24 per day | +| [`PULL_CHECK_PIECE_SIZE_BYTES`](../environment-variables.md#pull_check_piece_size_bytes) | 10485760 | 10 MiB synthetic test piece per check | +| [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) | 360 | 6 minute end-to-end ceiling | + ## SP Maintenance Window Dealbot provides two 20 minute windows per day where it doesn't run "checks" so that SPs can plan their maintenance activities without having their dealbot scores impacted: @@ -78,12 +90,15 @@ With the current configuration, Dealbot will add this much synthetic load on SPs - 15 datasets, requiring 5 challenges per day per dataset. The dataset floor price that is paid by Dealbot to the SP covers the cost of the challenges. - 4x10MB pieces being uploaded per hour. 
- 8x10MB pieces being downloaded per hour (the newly created pieces as part of the Data Storage checks and random existing pieces as part of the Retrieval checks) +- 1x10MB piece being pulled by the SP per hour as part of the [Pull Check](./pull-check.md), plus a 10MB direct piece-fetch from the SP for validation. Over the course of a day this means: * 75 proof challenges * 960 MB of SP download bandwidth in support of adding new pieces * 960 MB of disk space for the pieces. Piece cleanup removes the oldest pieces once total stored data per SP exceeds a configurable threshold (see [`MAX_DATASET_STORAGE_SIZE_BYTES`](../environment-variables.md#max_dataset_storage_size_bytes)). * 1,920 MB of SP upload bandwidth in support of retrievals +* 240 MB of additional SP download bandwidth for pull-check pulls and 240 MB of additional SP upload bandwidth for pull-check validation re-fetches +* 240 MB of additional SP disk space for committed pull-check pieces. **Pull-check pieces are not garbage-collected by [piece cleanup](../environment-variables.md#piece-cleanup)** (they are not tracked in the `deals` table); they accrue on the SP unless removed manually. ## Appendix diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md new file mode 100644 index 00000000..48bf5a42 --- /dev/null +++ b/docs/checks/pull-check.md @@ -0,0 +1,165 @@ +# Pull Check + +This document is the **source of truth** for how dealbot's Pull check works. + +Source code links throughout this document point to the current implementation. + +For event and metric definitions used by the dashboard, see [Dealbot Events & Metrics](./events-and-metrics.md). 
+ +## Overview + +A "pull check" exercises the **storage provider pull-to-park pathway**: dealbot publishes a temporary piece at `/api/piece/{pieceCid}`, asks the SP to fetch (pull) and park it via the Synapse `pullPieces` API, waits for a terminal SP pull status, commits the pulled piece on-chain, and finally re-fetches the piece from the SP to verify byte-for-byte integrity. + +The pull check answers a different question than the [Data Storage check](./data-storage.md): instead of *uploading* bytes to the SP, it asks the SP to *pull* bytes from a public URL. This validates an SP's outbound HTTP fetcher, the pull request lifecycle, and the resulting on-chain commit and retrieval surface. + +A successful pull check requires all [assertions in the table below](#what-gets-asserted) to pass. Failure occurs if any step fails or the job exceeds its max allowed time. Operational timeouts exist to prevent jobs from running indefinitely, but they are not quality assertions. + +> **Where results live:** Pull check results are exported to Prometheus and structured logs only. They are **not** persisted in Postgres or written to ClickHouse. Committed pull-check pieces are also **not** tracked in the `deals` table, so the [Piece Cleanup](../environment-variables.md#piece-cleanup) job will not garbage-collect them; they will accrue on the SP unless explicitly removed. + +## What Gets Asserted + +Each pull check asserts the following for every SP: + +| # | Assertion | How It's Checked | Retries | Relevant Metric for Setting a Max Duration | Implemented? 
| +|---|-----------|------------------|:---:|--------------------------------------------|:---:| +| 1 | SP accepts the pull request | `pullPieces` returns without error and reports a non-terminal-failure status | 0 | [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) | Yes | +| 2 | SP reaches a terminal `complete` pull status | `waitForPullPieces` polls the SP until a terminal status is reported | Polling with delay until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | +| 3 | SP records the piece on-chain | Synapse `storage.commit()` succeeds for the pulled piece | n/a | n/a (bounded by job timeout) | Yes | +| 4 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | +| 5 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | + +## Pull Check Lifecycle + +The dealbot scheduler triggers pull check jobs at a configurable rate (`PULL_CHECKS_PER_SP_PER_HOUR`). + +```mermaid +flowchart TD + Generate["Generate random piece + register hosted source
at /api/piece/{pieceCid}"] + Generate --> Submit["Submit pullPieces request to SP"] + Submit --> Poll["Poll SP via waitForPullPieces
until terminal pull status"] + Poll -->|complete| Commit["Synapse storage.commit() the pulled piece"] + Poll -->|other terminal status| Fail["Mark pull check failed"] + Commit --> Validate["Direct /piece/{pieceCid} fetch from SP
+ recompute pieceCid"] + Validate -->|matches| Success["Mark pull check successful"] + Validate -->|mismatch or fetch error| Fail + Success --> Cleanup + Fail --> Cleanup["Forget hosted piece + delete local artifact"] +``` + +### 1. Prepare the hosted piece + +Dealbot generates a random binary file, computes its piece CID, and registers it in an in-memory `HostedPieceRegistry`. The registration carries a TTL controlled by `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` so the source remains available for the entire pull window. + +The source URL handed to the SP is built from the dealbot `app.apiPublicUrl` config (set via `DEALBOT_API_PUBLIC_URL`). When `DEALBOT_API_PUBLIC_URL` is unset, dealbot falls back to `http://{DEALBOT_HOST}:{DEALBOT_PORT}`, which is only reachable in single-host or `localhost` setups. + +- **File format:** `random-{timestamp}-{uniqueId}.bin` +- **Default size:** `PULL_CHECK_PIECE_SIZE_BYTES` (default 10 MiB) +- **Source URL:** `{apiPublicUrl}/api/piece/{pieceCid}` + +Source: [`pull-check.service.ts` (`prepareHostedPiece`)](../../apps/backend/src/pull-check/pull-check.service.ts), [`hosted-piece.registry.ts`](../../apps/backend/src/pull-check/hosted-piece.registry.ts) + +### 2. Submit the pull request + +Dealbot calls `pullPieces` from `@filoz/synapse-core/sp` with the pieceCid, the source URL, and either the SP's existing `dataSetId`/`clientDataSetId` or the SP `payee` for new-dataset flows. The submission timestamp is stamped on the registration so it can later be subtracted from the first-byte event. + +Source: [`pull-check.service.ts` (`runPullCheck`)](../../apps/backend/src/pull-check/pull-check.service.ts) + +### 3. Wait for terminal SP pull status + +`waitForPullPieces` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. 
Dealbot increments the [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. + +When the SP fetches `/api/piece/{pieceCid}` for the first time, the controller stamps a first-byte timestamp on the registration. This is the basis for [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs). + +Source: [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) + +### 4. Commit the piece on-chain + +When the terminal pull status is `complete`, dealbot calls `synapse.storage.commit({ pieces: [{ pieceCid }] })`. Pull-check pieces are committed without `pieceMetadata` because the synthetic content has no meaningful IPFS root CID and including a synthetic one would corrupt downstream IPNI advertising. + +A failure here marks the pull check as `failure.other` and aborts before validation. + +### 5. Direct piece-fetch validation + +After commit, dealbot fetches `{serviceURL}/piece/{pieceCid}` from the SP, re-computes the piece CID over the response body, and compares it against the expected CID. A mismatch fails the pull check with `failure.other`. A network or HTTP error during validation also fails the check (transport errors are intentionally not retried). + +Aborts (job timeout) propagate as throws and are classified as `failure.timedout` rather than as a validation failure. + +Source: [`pull-check.service.ts` (`validateByDirectPieceFetch`)](../../apps/backend/src/pull-check/pull-check.service.ts) + +### 6. Cleanup + +Whether the pull check succeeds or fails, the `finally` block: + +1. Marks the registration as cleaned up (so subsequent `/api/piece/{pieceCid}` requests return HTTP 410 Gone instead of 200). +2. Removes the on-disk dataset artifact via `DataSourceService.cleanupRandomDataset`. +3. Forgets the registration entry so the controller returns HTTP 404 Not Found for any later requests. 
+ +Cleanup errors are logged at WARN level but do not propagate, so a transient cleanup failure cannot mask a successful pull check. + +## Pull Check Status Progression + +A pull check has a single terminal status, recorded once per check via [`pullCheckStatus`](./events-and-metrics.md#pullCheckStatus): + +| Overall Status | Meaning | +|--------|---------| +| `success` | All five [assertions](#what-gets-asserted) passed within the job timeout. | +| `failure.timedout` | The job was aborted because it exceeded `PULL_CHECK_JOB_TIMEOUT_SECONDS`, or the underlying error message indicates a timeout. | +| `failure.other` | Any other failure: SP rejected the pull request, SP reached a non-`complete` terminal status, commit failed, or direct piece validation failed. | + +Failures are classified by inspecting the error message; see [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) for the exact rule. + +In addition to the overall status, dealbot records the **raw SP-reported terminal pull status** via [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) (for example `complete`, `failed`, `not_found`). This separates "SP said it failed" from "dealbot's downstream commit/validation failed" in dashboards. + +## HTTP API + +The dealbot API exposes one endpoint dedicated to pull checks: + +| Method | Path | Description | +|--------|------|-------------| +| `GET` | `/api/piece/{pieceCid}` | Streams the temporary hosted piece bytes for an in-flight pull check. Returns `200` with the bytes when an active registration exists, `410 Gone` when the registration has been cleaned up or expired, and `404 Not Found` when no registration exists. | + +The endpoint is registered on the same `/api` prefix as the other dealbot HTTP endpoints. It is intentionally unauthenticated because SPs must be able to pull from it during a check; access is bounded by the per-piece TTL. 
+ +Source: [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) + +## Metrics Recorded + +Metric definitions (including Prometheus metrics) live in [Dealbot Events & Metrics](./events-and-metrics.md). The metrics emitted by a pull check are: + +- [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) +- [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) +- [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs) (only when the SP actually pulled from `/api/piece/{pieceCid}`) +- [`pullCheckThroughputBps`](./events-and-metrics.md#pullCheckThroughputBps) +- [`pullCheckStatus`](./events-and-metrics.md#pullCheckStatus) +- [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) + +## Configuration + +Key environment variables that control pull check behavior: + +| Variable | Description | +|----------|-------------| +| `DEALBOT_API_PUBLIC_URL` | Public base URL used to construct the hosted-piece source URL handed to SPs. Required for any deployment where SPs cannot reach `DEALBOT_HOST:DEALBOT_PORT` directly. | +| `PULL_CHECKS_PER_SP_PER_HOUR` | Per-SP pull check rate. | +| `PULL_CHECK_JOB_TIMEOUT_SECONDS` | Max end-to-end pull check job runtime before forced abort. | +| `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` | TTL of the temporary hosted piece source served at `/api/piece/{pieceCid}`. | +| `PULL_CHECK_POLL_INTERVAL_SECONDS` | Polling interval used while waiting for a terminal SP pull status. | +| `PULL_CHECK_PIECE_SIZE_BYTES` | Size of the synthetic test piece dealbot generates per pull check. | + +Source: [`apps/backend/src/config/app.config.ts`](../../apps/backend/src/config/app.config.ts) + +See also: [`docs/environment-variables.md`](../environment-variables.md) for the source-of-truth configuration reference. + +## FAQ + +### Why isn't the pull-check piece tracked in the `deals` table? 
+ +Pull checks are intentionally isolated from the data-storage flow: they don't pass through `DealService.createDeal` and don't allocate a `Deal` entity. This keeps the pull-check signal independent of the data-storage success rate. The trade-off is that the [Piece Cleanup](../environment-variables.md#piece-cleanup) job will not garbage-collect committed pull-check pieces, so SPs accumulate them over time until removed manually. + +### Why does a "cached pull" not record `pullCheckFirstByteMs`? + +If an SP previously pulled the same piece CID and serves the new pull request from a local cache, it will never fetch `/api/piece/{pieceCid}`, so dealbot has no first-byte timestamp from which to subtract the submission timestamp. In that case dealbot skips the histogram observation rather than emitting a misleading zero. Cached pulls are uncommon today because each pull check generates a fresh random piece, but the registry's first-byte capture is **idempotent**, so retried pulls during a single check do not skew measurements either. + +### Why don't we set `pieceMetadata` on the commit? + +`IPFS_ROOT_CID` is meaningless for synthetic pull-check pieces; setting it would announce a fake provider record to IPNI and corrupt downstream discoverability for unrelated content. We pass only `{ pieceCid }` to `storage.commit()`. 
diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 359d86da..342b2163 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -6,7 +6,7 @@ This document provides a comprehensive guide to all environment variables used b | Category | Variables | | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [Application](#application-configuration) | `NODE_ENV`, `DEALBOT_PORT`, `DEALBOT_HOST`, `DEALBOT_RUN_MODE`, `DEALBOT_METRICS_PORT`, `DEALBOT_METRICS_HOST`, `DEALBOT_ALLOWED_ORIGINS`, `ENABLE_DEV_MODE` | +| [Application](#application-configuration) | `NODE_ENV`, `DEALBOT_PORT`, `DEALBOT_HOST`, `DEALBOT_API_PUBLIC_URL`, `DEALBOT_RUN_MODE`, `DEALBOT_METRICS_PORT`, `DEALBOT_METRICS_HOST`, `DEALBOT_ALLOWED_ORIGINS`, `ENABLE_DEV_MODE` | | [Database](#database-configuration) | `DATABASE_HOST`, `DATABASE_PORT`, `DATABASE_POOL_MAX`, `DATABASE_USER`, `DATABASE_PASSWORD`, `DATABASE_NAME` | | [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `PDP_SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | @@ -16,6 +16,7 @@ This document provides a comprehensive guide to all environment variables used b | [ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | | [Piece Cleanup](#piece-cleanup) | `MAX_DATASET_STORAGE_SIZE_BYTES`, `TARGET_DATASET_STORAGE_SIZE_BYTES`, `JOB_PIECE_CLEANUP_PER_SP_PER_HOUR`, `MAX_PIECE_CLEANUP_RUNTIME_SECONDS` | 
+| [Pull Check](#pull-check) | `PULL_CHECKS_PER_SP_PER_HOUR`, `PULL_CHECK_JOB_TIMEOUT_SECONDS`, `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS`, `PULL_CHECK_POLL_INTERVAL_SECONDS`, `PULL_CHECK_PIECE_SIZE_BYTES` | | [SP Blocklist](#sp-blocklist-configuration) | `BLOCKED_SP_IDS`, `BLOCKED_SP_ADDRESSES` | | [Prometheus Metrics](#prometheus-metrics-configuration) | `PROMETHEUS_WALLET_BALANCE_TTL_SECONDS`, `PROMETHEUS_WALLET_BALANCE_ERROR_COOLDOWN_SECONDS` | | [Web Frontend](#web-frontend) | `VITE_API_BASE_URL`, `VITE_PLAUSIBLE_DATA_DOMAIN`, `DEALBOT_API_BASE_URL` | @@ -140,6 +141,29 @@ DEALBOT_HOST=0.0.0.0 --- +### `DEALBOT_API_PUBLIC_URL` + +- **Type**: `string` (URL) +- **Required**: No (but required for [Pull Check](./checks/pull-check.md) when SPs cannot reach `DEALBOT_HOST:DEALBOT_PORT` directly) +- **Default**: Empty (falls back to `http://${DEALBOT_HOST}:${DEALBOT_PORT}`) + +**Role**: Public base URL for the Dealbot HTTP API. Used to construct the hosted-piece source URL handed to a storage provider during a pull check (`{DEALBOT_API_PUBLIC_URL}/api/piece/{pieceCid}`). + +**When to update**: + +- Set to the public URL of your Dealbot deployment whenever pull checks are enabled and SPs cannot reach the bind address directly (the typical production case) +- Leave unset for local development where SPs reach Dealbot on `localhost` + +**Example**: + +```bash +DEALBOT_API_PUBLIC_URL=https://dealbot.filoz.org +``` + +**Notes**: Trailing slashes are stripped at load time. The value is also trimmed and treated as empty when blank. + +--- + ### `DEALBOT_ALLOWED_ORIGINS` - **Type**: `string` (comma-separated URLs) @@ -907,6 +931,98 @@ Only used when `DEALBOT_JOBS_MODE=pgboss`. --- +## Pull Check + +These variables control the [Pull Check](./checks/pull-check.md), which validates the SP pull-to-park pathway. Pull checks are scheduled per SP and exercised through the `sp.work` queue alongside deal, retrieval, and piece-cleanup jobs. 
+ +### `PULL_CHECKS_PER_SP_PER_HOUR` + +- **Type**: `number` +- **Required**: No +- **Default**: `1` +- **Minimum**: `0.001` +- **Maximum**: `20` + +**Role**: Target number of pull checks per storage provider per hour. The rate is converted to an interval internally (for example `1` = every 3600s, `0.5` = every 7200s). + +**Notes**: Fractional values are supported. Pull checks are independent of `DEALS_PER_SP_PER_HOUR` and `RETRIEVALS_PER_SP_PER_HOUR`. + +**Example**: + +```bash +# Twice per day +PULL_CHECKS_PER_SP_PER_HOUR=0.083 +``` + +--- + +### `PULL_CHECK_JOB_TIMEOUT_SECONDS` + +- **Type**: `number` (seconds) +- **Required**: No +- **Default**: `360` (6 minutes) +- **Minimum**: `60` +- **Enforced**: Yes (config validation) + +**Role**: Maximum runtime for a pull-check job before forced abort via `AbortController`. Bounds the polling window for terminal SP pull status, the on-chain commit, and the direct `/piece/{pieceCid}` re-fetch combined. + +**When to update**: + +- Increase if SPs are slow to reach a terminal pull status (large piece sizes or busy SPs) +- Decrease to fail-fast on stuck jobs + +--- + +### `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` + +- **Type**: `number` (seconds) +- **Required**: No +- **Default**: `900` (15 minutes) +- **Minimum**: `60` + +**Role**: Time-to-live for the temporary hosted piece source served at `/api/piece/{pieceCid}` during an in-flight pull check. After the TTL elapses or the job calls cleanup, the controller responds with HTTP `410 Gone` for that pieceCid. 
+ +**When to update**: + +- Should be at least `PULL_CHECK_JOB_TIMEOUT_SECONDS` plus generous margin for the SP to make its first read; the default 15 minutes provides ~9 minutes of headroom over the 6-minute job timeout default +- Increase only when intentionally allowing SPs to retry pulls long after the dealbot job has aborted + +--- + +### `PULL_CHECK_POLL_INTERVAL_SECONDS` + +- **Type**: `number` (seconds) +- **Required**: No +- **Default**: `10` +- **Minimum**: `1` + +**Role**: Polling interval used by `waitForPullPieces` while waiting for the SP to report a terminal pull status (`complete` or `failed`). + +**When to update**: + +- Decrease for faster terminal-status detection at the cost of more SP-side load +- Increase to be gentler on SPs at the cost of slower pull-check throughput + +--- + +### `PULL_CHECK_PIECE_SIZE_BYTES` + +- **Type**: `number` (integer, bytes) +- **Required**: No +- **Default**: `10485760` (10 MiB) +- **Minimum**: `1024` + +**Role**: Size of the synthetic random piece dealbot generates per pull check. The same byte length is used to compute [`pullCheckThroughputBps`](./checks/events-and-metrics.md#pullCheckThroughputBps). + +**When to update**: + +- Decrease for quicker, lower-bandwidth pull tests +- Increase to stress-test the SP's outbound fetch throughput + +**Note**: Pull-check pieces are committed on-chain but **not** tracked in the `deals` table, so they are not garbage-collected by [Piece Cleanup](#piece-cleanup). Larger pieces accrue on the SP unless removed manually. + +--- + ## Dataset Configuration ### `DEALBOT_LOCAL_DATASETS_PATH` diff --git a/docs/jobs.md b/docs/jobs.md index 6114fac8..df002617 100644 --- a/docs/jobs.md +++ b/docs/jobs.md @@ -6,7 +6,7 @@ This doc explains what a "job" is in dealbot, how jobs are defined, how they're - `job_schedule_state` is the primary schedule entity with one row per `` plus global rows with an empty `sp_address`. 
- The dealbot scheduler loop polls for due `job_schedule_state` rows, enqueues corresponding pg-boss jobs, and advances `job_schedule_state.next_run_at`. -- Deal/retrieval jobs share the `sp.work` queue with `policy=singleton` and `singletonKey=spAddress` to enforce one active job per SP while allowing backlog. +- All per-SP jobs (`deal`, `retrieval`, `piece_cleanup`, `pull_check`) share the `sp.work` queue with `policy=singleton` and `singletonKey=spAddress` to enforce one active job per SP while allowing backlog. - Dealbot workers poll pg-boss queues via [`boss.work()`](https://github.com/timgit/pg-boss/blob/master/docs/api/workers.md) and run the corresponding handlers. ## Entities and Terminology @@ -15,7 +15,7 @@ This doc explains what a "job" is in dealbot, how jobs are defined, how they're | --- | --- | --- | | `job_schedule_state` | One per `<job_type, sp_address>` plus global rows | Schedule state owned by dealbot. | | Storage provider (SP) | One per SP in registry | Filtered by `USE_ONLY_APPROVED_PROVIDERS` when enabled. | -| Job type | `deal`, `retrieval`, `data_set_creation`, `piece_cleanup`, `providers_refresh`, `data_retention_poll` | `deal` corresponds to "data storage check" externally; we keep `deal` in code/DB for compatibility. | +| Job type | `deal`, `retrieval`, `data_set_creation`, `piece_cleanup`, `pull_check`, `providers_refresh`, `data_retention_poll` | `deal` corresponds to "data storage check" externally; we keep `deal` in code/DB for compatibility. | | pg-boss queue | `sp.work`, `providers.refresh`, `data.retention.poll` | `sp.work` is a singleton queue. | | Dealbot scheduler | One per process (when enabled) | Runs the scheduling loop. | | Dealbot worker process | One Node.js process with `DEALBOT_RUN_MODE=worker` or `both` | Hosts pg-boss workers. 
| @@ -35,6 +35,7 @@ This doc explains what a "job" is in dealbot, how jobs are defined, how they're | `deal` | `sp.work` | [`JobsService.handleDealJob`](../apps/backend/src/jobs/jobs.service.ts) | `{ jobType: 'deal', spAddress, intervalSeconds }` | | `retrieval` | `sp.work` | [`JobsService.handleRetrievalJob`](../apps/backend/src/jobs/jobs.service.ts) | `{ jobType: 'retrieval', spAddress, intervalSeconds }` | | `piece_cleanup` | `sp.work` | [`JobsService.handlePieceCleanupJob`](../apps/backend/src/jobs/jobs.service.ts) | `{ jobType: 'piece_cleanup', spAddress, intervalSeconds }` | +| `pull_check` | `sp.work` | [`JobsService.handlePullCheckJob`](../apps/backend/src/jobs/jobs.service.ts) | `{ jobType: 'pull_check', spAddress, intervalSeconds }` | `sp.work` is created with `policy=singleton`, and jobs set `singletonKey=spAddress` so only one active job per SP can run at a time. @@ -136,7 +137,7 @@ Use these formulas to reason about whether the system can keep up and how much b Per-SP capacity (one job per SP at a time): -- Per-SP execution-minutes per hour = `(deals_per_sp_per_hour * deal_max_minutes) + (retrievals_per_sp_per_hour * retrieval_max_minutes) + (piece_cleanup_per_sp_per_hour * cleanup_max_minutes)` +- Per-SP execution-minutes per hour = `(deals_per_sp_per_hour * deal_max_minutes) + (retrievals_per_sp_per_hour * retrieval_max_minutes) + (piece_cleanup_per_sp_per_hour * cleanup_max_minutes) + (pull_checks_per_sp_per_hour * pull_check_max_minutes)` - If per-SP execution-minutes per hour > 60, that SP can never catch up (backlog grows), even if we had infinite dealbot workers. - If per-SP execution-minutes per hour <= 60, backlog should eventually drain, assuming there are enough dealbot workers (headroom = `60 - per-SP execution-minutes per hour`). 
@@ -189,7 +190,7 @@ See the "Jobs (pg-boss)" section in `docs/environment-variables.md` for full def - [`DEALBOT_PGBOSS_SCHEDULER_ENABLED`](./environment-variables.md#dealbot_pgboss_scheduler_enabled) - [`DEALBOT_RUN_MODE`](./environment-variables.md#dealbot_run_mode) -- [`DEALS_PER_SP_PER_HOUR`](./environment-variables.md#deals_per_sp_per_hour), [`RETRIEVALS_PER_SP_PER_HOUR`](./environment-variables.md#retrievals_per_sp_per_hour), [`JOB_PIECE_CLEANUP_PER_SP_PER_HOUR`](./environment-variables.md#job_piece_cleanup_per_sp_per_hour) +- [`DEALS_PER_SP_PER_HOUR`](./environment-variables.md#deals_per_sp_per_hour), [`RETRIEVALS_PER_SP_PER_HOUR`](./environment-variables.md#retrievals_per_sp_per_hour), [`JOB_PIECE_CLEANUP_PER_SP_PER_HOUR`](./environment-variables.md#job_piece_cleanup_per_sp_per_hour), [`PULL_CHECKS_PER_SP_PER_HOUR`](./environment-variables.md#pull_checks_per_sp_per_hour) - [`JOB_SCHEDULER_POLL_SECONDS`](./environment-variables.md#job_scheduler_poll_seconds), [`JOB_WORKER_POLL_SECONDS`](./environment-variables.md#job_worker_poll_seconds) - [`JOB_CATCHUP_MAX_ENQUEUE`](./environment-variables.md#job_catchup_max_enqueue) - [`JOB_SCHEDULE_PHASE_SECONDS`](./environment-variables.md#job_schedule_phase_seconds) diff --git a/docs/runbooks/jobs.md b/docs/runbooks/jobs.md index d7fead32..d37d85cb 100644 --- a/docs/runbooks/jobs.md +++ b/docs/runbooks/jobs.md @@ -11,10 +11,10 @@ For routine daily maintenance windows, prefer `DEALBOT_MAINTENANCE_WINDOWS_UTC` `DEALBOT_MAINTENANCE_WINDOW_MINUTES`, which skip deal/retrieval checks automatically in both cron and pg-boss modes. 
```sql --- Pause all deal and retrieval jobs +-- Pause all per-SP jobs UPDATE job_schedule_state SET paused = true, updated_at = NOW() -WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup'); +WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup', 'pull_check'); ``` To pause a single provider: @@ -22,7 +22,7 @@ To pause a single provider: ```sql UPDATE job_schedule_state SET paused = true, updated_at = NOW() -WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup') +WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup', 'pull_check') AND sp_address = '<sp_address>'; ``` @@ -31,7 +31,7 @@ WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup') ```sql UPDATE job_schedule_state SET paused = false, next_run_at = NOW(), updated_at = NOW() -WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup'); +WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup', 'pull_check'); ``` To resume a single provider: @@ -39,7 +39,7 @@ To resume a single provider: ```sql UPDATE job_schedule_state SET paused = false, next_run_at = NOW(), updated_at = NOW() -WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup') +WHERE job_type IN ('deal', 'retrieval', 'piece_cleanup', 'pull_check') AND sp_address = '<sp_address>'; ``` @@ -87,6 +87,15 @@ WHERE job_type = 'piece_cleanup' AND sp_address = '<sp_address>'; ``` +Run a pull check for a specific SP: + +```sql +UPDATE job_schedule_state +SET paused = false, next_run_at = NOW(), updated_at = NOW() +WHERE job_type = 'pull_check' + AND sp_address = '<sp_address>'; +``` + - Offsets (`*_START_OFFSET_SECONDS`) are ignored in pg-boss mode. - Job schedules are rate-based (per hour) and persist across restarts. - Paused schedules remain paused until explicitly resumed. Pausing is strictly for manual/admin use. 
From 5f4e729f9608f19e4c83896afd31472a4d81c977 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 6 May 2026 10:00:44 +0530 Subject: [PATCH 12/44] refactor: remove commit from pull check --- .../src/pull-check/pull-check.module.ts | 3 +- .../src/pull-check/pull-check.service.ts | 67 +------------------ docs/checks/events-and-metrics.md | 6 +- docs/checks/pull-check.md | 29 +++----- docs/environment-variables.md | 2 +- 5 files changed, 16 insertions(+), 91 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.module.ts b/apps/backend/src/pull-check/pull-check.module.ts index fd7b2d56..d2881735 100644 --- a/apps/backend/src/pull-check/pull-check.module.ts +++ b/apps/backend/src/pull-check/pull-check.module.ts @@ -1,7 +1,6 @@ import { Module } from "@nestjs/common"; import { DatabaseModule } from "../database/database.module.js"; import { DataSourceModule } from "../dataSource/dataSource.module.js"; -import { DealModule } from "../deal/deal.module.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { HostedPieceRegistry } from "./hosted-piece.registry.js"; @@ -9,7 +8,7 @@ import { PieceSourceController } from "./piece-source.controller.js"; import { PullCheckService } from "./pull-check.service.js"; @Module({ - imports: [DatabaseModule, WalletSdkModule, DataSourceModule, DealModule, HttpClientModule], + imports: [DatabaseModule, WalletSdkModule, DataSourceModule, HttpClientModule], controllers: [PieceSourceController], providers: [PullCheckService, HostedPieceRegistry], exports: [PullCheckService, HostedPieceRegistry], diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 738fe4c9..4b2ed5c8 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -2,16 +2,12 @@ import * as fs from "node:fs"; import * as path from 
"node:path"; import { calculate, parse as parsePieceCid } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; -import { getDataSet } from "@filoz/synapse-core/warm-storage"; -import { Synapse } from "@filoz/synapse-sdk"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import type { Account, Address, Chain, Client, Transport } from "viem"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; -import { createSynapseFromConfig } from "../common/synapse-factory.js"; -import type { IAppConfig, IBlockchainConfig, IConfig, IDatasetConfig, IJobsConfig } from "../config/app.config.js"; +import type { IAppConfig, IConfig, IDatasetConfig, IJobsConfig } from "../config/app.config.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; -import { DealService } from "../deal/deal.service.js"; import { HttpClientService } from "../http-client/http-client.service.js"; import { buildCheckMetricLabels, classifyFailureStatus } from "../metrics-prometheus/check-metric-labels.js"; import { PullCheckCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; @@ -25,8 +21,6 @@ type SynapseViemClient = Client; @Injectable() export class PullCheckService { private readonly logger = new Logger(PullCheckService.name); - private readonly blockchainConfig: IBlockchainConfig; - private sharedSynapse?: Synapse; constructor( private readonly configService: ConfigService, @@ -34,25 +28,8 @@ export class PullCheckService { private readonly dataSourceService: DataSourceService, private readonly hostedPieceRegistry: HostedPieceRegistry, private readonly pullCheckMetrics: PullCheckCheckMetrics, - private readonly dealService: DealService, private readonly httpClientService: HttpClientService, - ) { - this.blockchainConfig = this.configService.get("blockchain", { infer: true }); - } - - async onModuleInit() { - this.sharedSynapse = await 
this.createSynapseInstance(); - this.logger.log({ - event: "pull_check_synapse_ready", - message: "Pull-check Synapse instance initialized", - }); - } - - async onModuleDestroy(): Promise { - if (this.sharedSynapse) { - this.sharedSynapse = undefined; - } - } + ) {} /** * Resolve and validate provider eligibility for a pull check. Throws when @@ -109,24 +86,17 @@ export class PullCheckService { const pieceCidParsed = parsePieceCid(pieceCidStr); const synapseClient = this.requireSynapseClient(); - const synapse = this.sharedSynapse ?? (await this.createSynapseInstance()); - const storage = await synapse.storage.createContext({ - providerId: providerInfo.id, - metadata: this.dealService.getBaseDataSetMetadata(), - }); // Resolve pull options for either the existing-dataset or new-dataset SP // pull pathway. `pullPieces` requires both dataSetId and clientDataSetId // when targeting an existing dataset; if either is unavailable we treat // the request as new-dataset and rely on the signed CreateDataSetAndAddPieces. - const dataSetId = storage.dataSetId; - const clientDataSetId = dataSetId ? (await getDataSet(synapseClient, { dataSetId }))?.clientDataSetId : undefined; const payee = providerInfo.payee as Address; const serviceURL = providerInfo.pdp.serviceURL; const pullPiecesOptions = { serviceURL, pieces: [{ pieceCid: pieceCidParsed, sourceUrl: prepared.sourceUrl }], - ...(dataSetId && clientDataSetId ? { dataSetId, clientDataSetId } : { payee }), + payee, signal, }; @@ -162,13 +132,6 @@ export class PullCheckService { throw new Error(`Storage provider failed to pull piece: status=${finalResponse.status}`); } - // We omit pieceMetadata: `IPFS_ROOT_CID` is meaningless for synthetic - // pull-check pieces and would corrupt downstream IPNI advertising. 
- const commitResult = await storage.commit({ - pieces: pullPiecesOptions.pieces.map((p) => ({ pieceCid: p.pieceCid })), - }); - signal?.throwIfAborted(); - const pieceValidated = await this.validateByDirectPieceFetch(providerInfo, pieceCidStr, logContext, signal); signal?.throwIfAborted(); if (!pieceValidated) { @@ -195,9 +158,6 @@ export class PullCheckService { event: "pull_check_completed", message: "Pull check completed", pieceCid: pieceCidStr, - dataSetId: commitResult.dataSetId.toString(), - pieceIds: commitResult.pieceIds.map((id) => id.toString()), - txHash: commitResult.txHash, requestLatencyMs, completionLatencyMs, firstByteMs, @@ -329,27 +289,6 @@ export class PullCheckService { return client as SynapseViemClient; } - private async createSynapseInstance(): Promise { - try { - const { synapse, isSessionKeyMode } = await createSynapseFromConfig(this.blockchainConfig); - if (isSessionKeyMode) { - this.logger.log({ - event: "pull_check_synapse_session_key_init", - message: "Pull-check Synapse initialized with session key", - walletAddress: this.blockchainConfig.walletAddress, - }); - } - return synapse; - } catch (error) { - this.logger.error({ - event: "pull_check_synapse_init_failed", - message: "Failed to initialize Synapse for pull-check service", - error: toStructuredError(error), - }); - throw error; - } - } - /** * Stream the hosted piece bytes for an active registration. Used by the * `/api/piece/:pieceCid` controller. 
Returns null when no active registration diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index af1beae1..2b3c90e5 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -74,7 +74,6 @@ sequenceDiagram SP-->>Dealbot: hostedPieceFirstByteRead Dealbot->>SP: pullStatusPolled (waitForPullPieces, repeated) SP-->>Dealbot: pullTerminalStatusReported - Dealbot->>RPC: pullCheckCommitted (storage.commit) Dealbot->>SP: directPieceFetchStarted (/piece/{cid}) SP-->>Dealbot: directPieceFetchCompleted Dealbot-->>Dealbot: pullCheckIntegrityChecked @@ -88,7 +87,6 @@ sequenceDiagram | `pullRequestAcknowledged` | SP returns from `pullPieces` (success or non-terminal-failure). | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `hostedPieceFirstByteRead` | SP reads the first byte of `/api/piece/{pieceCid}` from dealbot. Recorded once per registration. | Yes | [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) | | `pullTerminalStatusReported` | SP reports a terminal pull status (`complete`, `failed`, ...) via `waitForPullPieces`. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckCommitted` | Synapse `storage.commit()` succeeds for the pulled piece. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullCheckIntegrityChecked` | Direct `/piece/{pieceCid}` fetch from the SP returns bytes whose recomputed pieceCid matches the expected CID. 
| Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## Metrics @@ -146,8 +144,8 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `pullCheckStatus` | Pull | When the [Pull Check](./pull-check.md) terminates (success after commit + direct piece validation, or any failure). Recorded exactly once per check. | `success`, `failure.timedout`, `failure.other`. Failure classification follows [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) (timeout-keyed errors → `failure.timedout`, everything else → `failure.other`). | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullPieces`. Recorded exactly once per check (intermediate poll statuses are not counted). 
| Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side commit/validation failures. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckStatus` | Pull | When the [Pull Check](./pull-check.md) terminates (success after direct piece validation, or any failure). Recorded exactly once per check. | `success`, `failure.timedout`, `failure.other`. Failure classification follows [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) (timeout-keyed errors → `failure.timedout`, everything else → `failure.other`). | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullPieces`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side validation failures. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## ClickHouse Tables diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index 48bf5a42..0b9fded4 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -8,13 +8,13 @@ For event and metric definitions used by the dashboard, see [Dealbot Events & Me ## Overview -A "pull check" exercises the **storage provider pull-to-park pathway**: dealbot publishes a temporary piece at `/api/piece/{pieceCid}`, asks the SP to fetch (pull) and park it via the Synapse `pullPieces` API, waits for a terminal SP pull status, commits the pulled piece on-chain, and finally re-fetches the piece from the SP to verify byte-for-byte integrity. 
+A "pull check" exercises the **storage provider pull-to-park pathway**: dealbot publishes a temporary piece at `/api/piece/{pieceCid}`, asks the SP to fetch (pull) and park it via the Synapse `pullPieces` API, waits for a terminal SP pull status, and finally re-fetches the piece from the SP to verify byte-for-byte integrity. -The pull check answers a different question than the [Data Storage check](./data-storage.md): instead of *uploading* bytes to the SP, it asks the SP to *pull* bytes from a public URL. This validates an SP's outbound HTTP fetcher, the pull request lifecycle, and the resulting on-chain commit and retrieval surface. +The pull check answers a different question than the [Data Storage check](./data-storage.md): instead of *uploading* bytes to the SP, it asks the SP to *pull* bytes from a public URL. This validates an SP's outbound HTTP fetcher, the pull request lifecycle, and the resulting retrieval surface. A successful pull check requires all [assertions in the table below](#what-gets-asserted) to pass. Failure occurs if any step fails or the job exceeds its max allowed time. Operational timeouts exist to prevent jobs from running indefinitely, but they are not quality assertions. -> **Where results live:** Pull check results are exported to Prometheus and structured logs only. They are **not** persisted in Postgres or written to ClickHouse. Committed pull-check pieces are also **not** tracked in the `deals` table, so the [Piece Cleanup](../environment-variables.md#piece-cleanup) job will not garbage-collect them; they will accrue on the SP unless explicitly removed. +> **Where results live:** Pull check results are exported to Prometheus and structured logs only. They are **not** persisted in Postgres or written to ClickHouse.
## What Gets Asserted @@ -24,7 +24,6 @@ Each pull check asserts the following for every SP: |---|-----------|------------------|:---:|--------------------------------------------|:---:| | 1 | SP accepts the pull request | `pullPieces` returns without error and reports a non-terminal-failure status | 0 | [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) | Yes | | 2 | SP reaches a terminal `complete` pull status | `waitForPullPieces` polls the SP until a terminal status is reported | Polling with delay until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | -| 3 | SP records the piece on-chain | Synapse `storage.commit()` succeeds for the pulled piece | n/a | n/a (bounded by job timeout) | Yes | | 4 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | | 5 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | @@ -37,9 +36,8 @@ flowchart TD Generate["Generate random piece + register hosted source
at /api/piece/{pieceCid}"] Generate --> Submit["Submit pullPieces request to SP"] Submit --> Poll["Poll SP via waitForPullPieces
until terminal pull status"] - Poll -->|complete| Commit["Synapse storage.commit() the pulled piece"] + Poll -->|complete| Validate["Direct /piece/{pieceCid} fetch from SP
+ recompute pieceCid"] Poll -->|other terminal status| Fail["Mark pull check failed"] - Commit --> Validate["Direct /piece/{pieceCid} fetch from SP
+ recompute pieceCid"] Validate -->|matches| Success["Mark pull check successful"] Validate -->|mismatch or fetch error| Fail Success --> Cleanup @@ -72,13 +70,7 @@ When the SP fetches `/api/piece/{pieceCid}` for the first time, the controller s Source: [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) -### 4. Commit the piece on-chain - -When the terminal pull status is `complete`, dealbot calls `synapse.storage.commit({ pieces: [{ pieceCid }] })`. Pull-check pieces are committed without `pieceMetadata` because the synthetic content has no meaningful IPFS root CID and including a synthetic one would corrupt downstream IPNI advertising. - -A failure here marks the pull check as `failure.other` and aborts before validation. - -### 5. Direct piece-fetch validation +### 4. Direct piece-fetch validation After commit, dealbot fetches `{serviceURL}/piece/{pieceCid}` from the SP, re-computes the piece CID over the response body, and compares it against the expected CID. A mismatch fails the pull check with `failure.other`. A network or HTTP error during validation also fails the check (transport errors are intentionally not retried). @@ -86,7 +78,7 @@ Aborts (job timeout) propagate as throws and are classified as `failure.timedout Source: [`pull-check.service.ts` (`validateByDirectPieceFetch`)](../../apps/backend/src/pull-check/pull-check.service.ts) -### 6. Cleanup +### 5. Cleanup Whether the pull check succeeds or fails, the `finally` block: @@ -104,11 +96,11 @@ A pull check has a single terminal status, recorded once per check via [`pullChe |--------|---------| | `success` | All five [assertions](#what-gets-asserted) passed within the job timeout. | | `failure.timedout` | The job was aborted because it exceeded `PULL_CHECK_JOB_TIMEOUT_SECONDS`, or the underlying error message indicates a timeout. 
| -| `failure.other` | Any other failure: SP rejected the pull request, SP reached a non-`complete` terminal status, commit failed, or direct piece validation failed. | +| `failure.other` | Any other failure: SP rejected the pull request, SP reached a non-`complete` terminal status, or direct piece validation failed. | Failures are classified by inspecting the error message; see [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) for the exact rule. -In addition to the overall status, dealbot records the **raw SP-reported terminal pull status** via [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) (for example `complete`, `failed`, `not_found`). This separates "SP said it failed" from "dealbot's downstream commit/validation failed" in dashboards. +In addition to the overall status, dealbot records the **raw SP-reported terminal pull status** via [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) (for example `complete`, `failed`, `not_found`). This separates "SP said it failed" from "dealbot's downstream validation failed" in dashboards. ## HTTP API @@ -154,12 +146,9 @@ See also: [`docs/environment-variables.md`](../environment-variables.md) for the ### Why isn't the pull-check piece tracked in the `deals` table? -Pull checks are intentionally isolated from the data-storage flow: they don't pass through `DealService.createDeal`, don't allocate a `Deal` entity. This keeps the pull-check signal independent of the data-storage success rate. The trade-off is that the [Piece Cleanup](../environment-variables.md#piece-cleanup) job will not garbage-collect committed pull-check pieces, so SPs accumulate them over time until removed manually. +Pull checks are intentionally isolated from the data-storage flow: they don't pass through `DealService.createDeal`, don't allocate a `Deal` entity. This keeps the pull-check signal independent of the data-storage success rate. 
### Why does a "cached pull" not record `pullCheckFirstByteMs`? If an SP previously pulled the same piece CID and serves the new pull request from a local cache, it will never fetch `/api/piece/{pieceCid}`, so dealbot has no first-byte timestamp to subtract. In that case dealbot skips the histogram observation rather than emit a misleading zero. Cached pulls are uncommon today because each pull check generates a fresh random piece, but the registry's first-byte capture is **idempotent** so retried pulls during a single check do not skew measurements either. -### Why don't we set `pieceMetadata` on the commit? - -`IPFS_ROOT_CID` is meaningless for synthetic pull-check pieces; setting it would announce a fake provider record to IPNI and corrupt downstream discoverability for unrelated content. We pass only `{ pieceCid }` to `storage.commit()`. diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 342b2163..22a5e83a 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -964,7 +964,7 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 - **Minimum**: `60` - **Enforced**: Yes (config validation) -**Role**: Maximum runtime for a pull-check job before forced abort via `AbortController`. Bounds the polling window for terminal SP pull status, the on-chain commit, and the direct `/piece/{pieceCid}` re-fetch combined. +**Role**: Maximum runtime for a pull-check job before forced abort via `AbortController`. Bounds the polling window for terminal SP pull status and the direct `/piece/{pieceCid}` re-fetch combined. 
**When to update**: From e0a4cb7afa7572c30799cbdd406bb17781e2c0de Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 6 May 2026 10:04:40 +0530 Subject: [PATCH 13/44] fix tests after commit removal --- .../src/pull-check/pull-check.service.spec.ts | 29 ++----------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index ac1b8d4f..67a9c5e5 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -1,4 +1,3 @@ -import type { Synapse } from "@filoz/synapse-sdk"; import { ConfigService } from "@nestjs/config"; import { Test, type TestingModule } from "@nestjs/testing"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; @@ -37,7 +36,6 @@ vi.mock("../common/synapse-factory.js", () => ({ import { calculate } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; -import { getDataSet } from "@filoz/synapse-core/warm-storage"; function makeProvider(overrides: Partial = {}): PDPProviderEx { return { @@ -344,25 +342,6 @@ describe("PullCheckService", () => { // run reads it once to compute first-byte latency. Same shape suffices. registryMock.resolveAny.mockReturnValue(registration); - // Mock the synapse storage context returned by `synapse.storage.createContext`. - const commitResult = { - dataSetId: 7n, - pieceIds: [11n, 12n], - txHash: "0xtx", - }; - const storage = { - dataSetId: 7n, - commit: vi.fn().mockResolvedValue(commitResult), - }; - const sharedSynapse = { - storage: { createContext: vi.fn().mockResolvedValue(storage) }, - } as unknown as Synapse; - // The service caches sharedSynapse in onModuleInit; emulate that here. 
- (service as unknown as { sharedSynapse: Synapse }).sharedSynapse = sharedSynapse; - - vi.mocked(getDataSet).mockResolvedValue({ clientDataSetId: 99n } as unknown as Awaited< - ReturnType - >); vi.mocked(pullPieces).mockResolvedValue({ status: "pending" } as unknown as Awaited< ReturnType >); @@ -375,11 +354,11 @@ describe("PullCheckService", () => { httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); vi.mocked(calculate).mockReturnValue({ toString: () => "bafk-test-piece" } as ReturnType); - return { registration, storage, commitResult }; + return { registration }; } it("runs the full lifecycle, observes all metrics, and records success", async () => { - const { registration, storage } = arrangeHappyPath(); + const { registration } = arrangeHappyPath(); await service.runPullCheck("0xsp", undefined, logContext); @@ -391,10 +370,6 @@ describe("PullCheckService", () => { // Terminal SP status recorded exactly once. expect(metricsMock.recordProviderStatus).toHaveBeenCalledTimes(1); expect(metricsMock.recordProviderStatus).toHaveBeenCalledWith(expect.any(Object), "complete"); - // Commit was invoked with no per-piece metadata. - expect(storage.commit).toHaveBeenCalledWith({ - pieces: [{ pieceCid: expect.any(Object) }], - }); // First-byte and throughput observed since the registration carries // pullSubmittedAt + firstByteAt and the path completed. 
expect(metricsMock.observeFirstByteMs).toHaveBeenCalledTimes(1); From 5b5e2b43f7f05508896cdffefa54b5ee79253a2e Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 6 May 2026 10:06:15 +0530 Subject: [PATCH 14/44] chore: remove unused mock --- apps/backend/src/pull-check/pull-check.service.spec.ts | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index 67a9c5e5..ad098b96 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -24,16 +24,6 @@ vi.mock("@filoz/synapse-core/sp", () => ({ waitForPullPieces: vi.fn(), })); -vi.mock("@filoz/synapse-core/warm-storage", () => ({ - getDataSet: vi.fn(), -})); - -// `createSynapseFromConfig` is invoked from `onModuleInit`; the tests do not -// run module init, but the import must resolve. -vi.mock("../common/synapse-factory.js", () => ({ - createSynapseFromConfig: vi.fn(), -})); - import { calculate } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; From 2c8e57b6767d8b20d0b5e52b98310bd73c8c3d98 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 6 May 2026 10:14:27 +0530 Subject: [PATCH 15/44] fix: observe pull check completion latency once --- apps/backend/src/pull-check/pull-check.service.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 4b2ed5c8..de724190 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -166,9 +166,6 @@ export class PullCheckService { }); } catch (error) { this.pullCheckMetrics.recordStatus(labels, classifyFailureStatus(error)); - if (requestSubmittedAt) { - this.pullCheckMetrics.observeCompletionLatencyMs(labels, Date.now() - requestSubmittedAt.getTime()); - } throw 
error; } finally { if (prepared) { From ded926753437922fe95533abec2ef80d279c9d26 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 6 May 2026 10:15:32 +0530 Subject: [PATCH 16/44] fix: metric help text --- .../backend/src/metrics-prometheus/metrics-prometheus.module.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 1273b279..240b4201 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -217,7 +217,7 @@ const metricProviders = [ }), makeCounterProvider({ name: "pullCheckProviderStatus", - help: "Raw SP-reported pull statuses observed by DealBot during polling", + help: "Terminal SP-reported pull status recorded once per check (intermediate polling statuses are not counted)", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), makeHistogramProvider({ From 4267cae2d6b48d1678ac76dd287a2911a2a67e77 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 6 May 2026 10:25:33 +0530 Subject: [PATCH 17/44] fix: job calculations --- apps/backend/.env.example | 2 +- apps/backend/src/config/app.config.ts | 2 +- docs/environment-variables.md | 2 +- docs/jobs.md | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/backend/.env.example b/apps/backend/.env.example index a9943dd1..9b3c753b 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -67,7 +67,7 @@ IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IP # Pull Check Configuration PULL_CHECKS_PER_SP_PER_HOUR=1 # SP pull-pathway checks scheduled per provider per hour -PULL_CHECK_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for pull-check jobs +PULL_CHECK_JOB_TIMEOUT_SECONDS=300 # 5m: Max runtime for pull-check jobs PULL_CHECK_HOSTED_PIECE_TTL_SECONDS=900 # 15m: 
Hosted piece source TTL exposed at /api/piece/:pieceCid PULL_CHECK_POLL_INTERVAL_SECONDS=10 # SP pull status polling interval PULL_CHECK_PIECE_SIZE_BYTES=10485760 # 10 MiB synthetic test piece size per pull check diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index c1c5d490..e2aee2c0 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -97,7 +97,7 @@ export const configValidationSchema = Joi.object({ // Pull Check PULL_CHECKS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), - PULL_CHECK_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6m max runtime for pull check jobs + PULL_CHECK_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5m max runtime for pull check jobs PULL_CHECK_HOSTED_PIECE_TTL_SECONDS: Joi.number().min(60).default(900), // 15m hosted piece TTL PULL_CHECK_POLL_INTERVAL_SECONDS: Joi.number().min(1).default(10), PULL_CHECK_PIECE_SIZE_BYTES: Joi.number() diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 22a5e83a..b4820fad 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -960,7 +960,7 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 - **Type**: `number` (seconds) - **Required**: No -- **Default**: `360` (6 minutes) +- **Default**: `300` (5 minutes) - **Minimum**: `60` - **Enforced**: Yes (config validation) diff --git a/docs/jobs.md b/docs/jobs.md index df002617..0d380c0e 100644 --- a/docs/jobs.md +++ b/docs/jobs.md @@ -148,11 +148,11 @@ Cluster capacity (worker pool bound): - Worker concurrency (jobs at once) = `dealbot_worker_processes * PG_BOSS_LOCAL_CONCURRENCY` - Max sustainable SP count ≈ `(dealbot_worker_processes * PG_BOSS_LOCAL_CONCURRENCY * 60) / per_sp_execution_minutes_per_hour` -Example (18 SPs, 4 deals/hr @ 5m, 6 retrievals/hr @ 2m, 5 dealbot workers, `PG_BOSS_LOCAL_CONCURRENCY=20`): +Example (18 SPs, 4 deals/hr @ 5m, 6 retrievals/hr @ 2m, 1 / 24 piece cleanup/hr @ 5m, 1 
pull check/hr @ 5m, 5 dealbot workers, `PG_BOSS_LOCAL_CONCURRENCY=20`): -- Per-SP execution-minutes per hour = `4*5m + 6*2m = 32 execution-min/hr` (OK; 28 execution-min/hr headroom) +- Per-SP execution-minutes per hour = `4*5m + 6*2m + 1/24*5m + 1*5m = 38 execution-min/hr` (OK; 22 execution-min/hr headroom) - Worker capacity minutes per hour = `5 * 20 * 60 = 6000 execution-min/hr` -- Max sustainable SP count ≈ `6000 / 32 = 187 SPs` +- Max sustainable SP count ≈ `6000 / 38 = 157 SPs` ## Staggering Multiple Deployments From 200dbe9e843ae53401959897c2170320ec6b0fda Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Thu, 7 May 2026 14:52:18 +0530 Subject: [PATCH 18/44] chore: revert back to waitForPullStatus --- .../src/pull-check/pull-check.service.spec.ts | 16 ++++++++-------- .../backend/src/pull-check/pull-check.service.ts | 10 +++++----- docs/checks/events-and-metrics.md | 6 +++--- docs/checks/pull-check.md | 6 +++--- docs/environment-variables.md | 2 +- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index ad098b96..ce420711 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -21,11 +21,11 @@ vi.mock("@filoz/synapse-core/piece", () => ({ vi.mock("@filoz/synapse-core/sp", () => ({ pullPieces: vi.fn(), - waitForPullPieces: vi.fn(), + waitForPullStatus: vi.fn(), })); import { calculate } from "@filoz/synapse-core/piece"; -import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; +import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; function makeProvider(overrides: Partial = {}): PDPProviderEx { return { @@ -335,10 +335,10 @@ describe("PullCheckService", () => { vi.mocked(pullPieces).mockResolvedValue({ status: "pending" } as unknown as Awaited< ReturnType >); - vi.mocked(waitForPullPieces).mockResolvedValue({ + 
vi.mocked(waitForPullStatus).mockResolvedValue({ status: "complete", pieces: [{ pieceCid: "bafk-test-piece", status: "complete" }], - } as unknown as Awaited>); + } as unknown as Awaited>); // Direct-fetch validation succeeds. httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); @@ -387,10 +387,10 @@ describe("PullCheckService", () => { it("re-throws and records failure.other when the SP terminal status is not 'complete'", async () => { arrangeHappyPath(); - vi.mocked(waitForPullPieces).mockResolvedValue({ + vi.mocked(waitForPullStatus).mockResolvedValue({ status: "failed", pieces: [], - } as unknown as Awaited>); + } as unknown as Awaited>); await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow( /Storage provider failed to pull piece/, @@ -404,7 +404,7 @@ describe("PullCheckService", () => { it("classifies timeouts as failure.timedout", async () => { arrangeHappyPath(); - vi.mocked(waitForPullPieces).mockRejectedValue(new Error("polling timed out after 300s")); + vi.mocked(waitForPullStatus).mockRejectedValue(new Error("polling timed out after 300s")); await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(); expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.timedout"); @@ -430,7 +430,7 @@ describe("PullCheckService", () => { await expect(service.runPullCheck("0xsp", controller.signal, logContext)).rejects.toThrow(); // No SP-side calls were issued. expect(pullPieces).not.toHaveBeenCalled(); - expect(waitForPullPieces).not.toHaveBeenCalled(); + expect(waitForPullStatus).not.toHaveBeenCalled(); // Failure is classified as timed out (abort message contains "timeout"). 
expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.timedout"); }); diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index de724190..2503bb20 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -1,9 +1,9 @@ -import * as fs from "node:fs"; -import * as path from "node:path"; import { calculate, parse as parsePieceCid } from "@filoz/synapse-core/piece"; -import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; +import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; +import * as fs from "node:fs"; +import * as path from "node:path"; import type { Account, Address, Chain, Client, Transport } from "viem"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; import type { IAppConfig, IConfig, IDatasetConfig, IJobsConfig } from "../config/app.config.js"; @@ -116,8 +116,8 @@ export class PullCheckService { }); const jobsConfig = this.getJobsConfig(); - // `waitForPullPieces` polls the SP repeatedly until a terminal pull status is reported - const finalResponse = await waitForPullPieces(synapseClient, { + // `waitForPullStatus` polls the SP repeatedly until a terminal pull status is reported + const finalResponse = await waitForPullStatus(synapseClient, { ...pullPiecesOptions, timeout: jobsConfig.pullCheckJobTimeoutSeconds * 1000, pollInterval: jobsConfig.pullCheckPollIntervalSeconds * 1000, diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index b5f1504b..def047b5 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -72,7 +72,7 @@ sequenceDiagram Dealbot->>SP: pullRequestSubmitted (pullPieces) SP-->>Dealbot: pullRequestAcknowledged SP-->>Dealbot: hostedPieceFirstByteRead - 
Dealbot->>SP: pullStatusPolled (waitForPullPieces, repeated) + Dealbot->>SP: pullStatusPolled (waitForPullStatus, repeated) SP-->>Dealbot: pullTerminalStatusReported Dealbot->>SP: directPieceFetchStarted (/piece/{cid}) SP-->>Dealbot: directPieceFetchCompleted @@ -86,7 +86,7 @@ sequenceDiagram | `pullRequestSubmitted` | Dealbot calls `pullPieces` against the SP for the registered piece CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullRequestAcknowledged` | SP returns from `pullPieces` (success or non-terminal-failure). | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `hostedPieceFirstByteRead` | SP reads the first byte of `/api/piece/{pieceCid}` from dealbot. Recorded once per registration. | Yes | [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) | -| `pullTerminalStatusReported` | SP reports a terminal pull status (`complete`, `failed`, ...) via `waitForPullPieces`. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullTerminalStatusReported` | SP reports a terminal pull status (`complete`, `failed`, ...) via `waitForPullStatus`. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullCheckIntegrityChecked` | Direct `/piece/{pieceCid}` fetch from the SP returns bytes whose recomputed pieceCid matches the expected CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## Metrics @@ -145,7 +145,7 @@ sequenceDiagram | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). 
| `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pullCheckStatus` | Pull | When the [Pull Check](./pull-check.md) terminates (success after direct piece validation, or any failure). Recorded exactly once per check. | `success`, `failure.timedout`, `failure.other`. Failure classification follows [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) (timeout-keyed errors → `failure.timedout`, everything else → `failure.other`). | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullPieces`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side validation failures. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullStatus`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side validation failures. 
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## ClickHouse Tables diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index 0b9fded4..c3278f57 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -23,7 +23,7 @@ Each pull check asserts the following for every SP: | # | Assertion | How It's Checked | Retries | Relevant Metric for Setting a Max Duration | Implemented? | |---|-----------|------------------|:---:|--------------------------------------------|:---:| | 1 | SP accepts the pull request | `pullPieces` returns without error and reports a non-terminal-failure status | 0 | [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) | Yes | -| 2 | SP reaches a terminal `complete` pull status | `waitForPullPieces` polls the SP until a terminal status is reported | Polling with delay until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | +| 2 | SP reaches a terminal `complete` pull status | `waitForPullStatus` polls the SP until a terminal status is reported | Polling with delay until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | | 4 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | | 5 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | @@ -35,7 +35,7 @@ The dealbot scheduler triggers pull check jobs at a configurable rate (`PULL_CHE flowchart TD Generate["Generate random piece + register hosted source
at /api/piece/{pieceCid}"] Generate --> Submit["Submit pullPieces request to SP"] - Submit --> Poll["Poll SP via waitForPullPieces
until terminal pull status"] + Submit --> Poll["Poll SP via waitForPullStatus
until terminal pull status"] Poll -->|complete| Validate["Direct /piece/{pieceCid} fetch from SP
+ recompute pieceCid"] Poll -->|other terminal status| Fail["Mark pull check failed"] Validate -->|matches| Success["Mark pull check successful"] @@ -64,7 +64,7 @@ Source: [`pull-check.service.ts` (`runPullCheck`)](../../apps/backend/src/pull-c ### 3. Wait for terminal SP pull status -`waitForPullPieces` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. +`waitForPullStatus` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. When the SP fetches `/api/piece/{pieceCid}` for the first time, the controller stamps a first-byte timestamp on the registration. This is the basis for [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs). diff --git a/docs/environment-variables.md b/docs/environment-variables.md index b4820fad..a53411a2 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -996,7 +996,7 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 - **Default**: `10` - **Minimum**: `1` -**Role**: Polling interval used by `waitForPullPieces` while waiting for the SP to report a terminal pull status (`complete` or `failed`). +**Role**: Polling interval used by `waitForPullStatus` while waiting for the SP to report a terminal pull status (`complete` or `failed`). 
**When to update**: From f02e4b472e5c274e66a6aa619d566afba837a795 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Thu, 7 May 2026 14:53:54 +0530 Subject: [PATCH 19/44] chore: fix lint --- apps/backend/src/pull-check/pull-check.service.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 2503bb20..a411f136 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -1,9 +1,9 @@ +import * as fs from "node:fs"; +import * as path from "node:path"; import { calculate, parse as parsePieceCid } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; -import * as fs from "node:fs"; -import * as path from "node:path"; import type { Account, Address, Chain, Client, Transport } from "viem"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; import type { IAppConfig, IConfig, IDatasetConfig, IJobsConfig } from "../config/app.config.js"; From 540f03de477c9cdab3095b7998ea639ab5574e88 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 8 May 2026 14:22:19 +0530 Subject: [PATCH 20/44] feat: add deterministic random data generation and pull piece persistence --- .../src/dataSource/dataSource.service.spec.ts | 65 +++++++ .../src/dataSource/dataSource.service.ts | 140 ++++++++++++++ apps/backend/src/database/database.module.ts | 9 +- .../database/entities/pull-piece.entity.ts | 39 ++++ .../1776300000000-CreatePullPieces.ts | 28 +++ .../pull-check/hosted-piece.registry.spec.ts | 162 ---------------- .../src/pull-check/hosted-piece.registry.ts | 83 --------- .../piece-source.controller.spec.ts | 173 ------------------ .../src/pull-check/pull-check.module.ts | 12 +- .../src/pull-check/pull-check.service.spec.ts | 
168 +++++++---------- .../src/pull-check/pull-check.service.ts | 93 ++++------ .../src/pull-check/pull-check.types.ts | 13 +- ...controller.ts => pull-piece.controller.ts} | 38 ++-- .../src/pull-check/pull-piece.repository.ts | 122 ++++++++++++ 14 files changed, 533 insertions(+), 612 deletions(-) create mode 100644 apps/backend/src/database/entities/pull-piece.entity.ts create mode 100644 apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts delete mode 100644 apps/backend/src/pull-check/hosted-piece.registry.spec.ts delete mode 100644 apps/backend/src/pull-check/hosted-piece.registry.ts delete mode 100644 apps/backend/src/pull-check/piece-source.controller.spec.ts rename apps/backend/src/pull-check/{piece-source.controller.ts => pull-piece.controller.ts} (57%) create mode 100644 apps/backend/src/pull-check/pull-piece.repository.ts diff --git a/apps/backend/src/dataSource/dataSource.service.spec.ts b/apps/backend/src/dataSource/dataSource.service.spec.ts index 8e8848c1..c4cf0f46 100644 --- a/apps/backend/src/dataSource/dataSource.service.spec.ts +++ b/apps/backend/src/dataSource/dataSource.service.spec.ts @@ -206,4 +206,69 @@ describe("DataSourceService", () => { await expect(fs.promises.access(regularFilePath).then(() => true)).resolves.toBe(true); }); }); + + describe("Deterministic Generation", () => { + const providerAddress = "0x1234567890123456789012345678901234567890"; + const key = "test-key"; + const bytesNeeded = 1024; + + describe("generateBytes", () => { + it("should generate deterministic bytes", () => { + const bytes1 = service.generateBytes({ providerAddress, key, bytesNeeded }); + const bytes2 = service.generateBytes({ providerAddress, key, bytesNeeded }); + + expect(bytes1).toHaveLength(bytesNeeded); + expect(bytes1).toEqual(bytes2); + }); + + it("should generate different bytes for different keys", () => { + const bytes1 = service.generateBytes({ providerAddress, key: "key1", bytesNeeded }); + const bytes2 = 
service.generateBytes({ providerAddress, key: "key2", bytesNeeded }); + + expect(bytes1).not.toEqual(bytes2); + }); + + it("should generate different bytes for different provider addresses", () => { + const bytes1 = service.generateBytes({ providerAddress: "0x1", key, bytesNeeded }); + const bytes2 = service.generateBytes({ providerAddress: "0x2", key, bytesNeeded }); + + expect(bytes1).not.toEqual(bytes2); + }); + + it("should generate same prefix for different bytesNeeded if size is fixed", () => { + const bytes1 = service.generateBytes({ providerAddress, key, bytesNeeded: 512, size: 1024 }); + const bytes2 = service.generateBytes({ providerAddress, key, bytesNeeded: 1024, size: 1024 }); + + expect(bytes1).toEqual(bytes2.subarray(0, 512)); + }); + }); + + describe("generateBytesStream", () => { + it("should generate same bytes as generateBytes", async () => { + const expected = service.generateBytes({ providerAddress, key, bytesNeeded }); + const stream = service.generateBytesStream({ providerAddress, key, bytesNeeded }); + + const chunks: Buffer[] = []; + for await (const chunk of stream) { + chunks.push(chunk); + } + const result = Buffer.concat(chunks); + + expect(result).toHaveLength(bytesNeeded); + expect(result).toEqual(expected); + }); + + it("should handle larger streams", async () => { + const largeSize = 128 * 1024; // 128 KB + const stream = service.generateBytesStream({ providerAddress, key, bytesNeeded: largeSize }); + + let totalLength = 0; + for await (const chunk of stream) { + totalLength += chunk.length; + } + + expect(totalLength).toBe(largeSize); + }); + }); + }); }); diff --git a/apps/backend/src/dataSource/dataSource.service.ts b/apps/backend/src/dataSource/dataSource.service.ts index b03c7400..33918a4e 100644 --- a/apps/backend/src/dataSource/dataSource.service.ts +++ b/apps/backend/src/dataSource/dataSource.service.ts @@ -1,4 +1,5 @@ import * as crypto from "node:crypto"; +import { Readable } from "node:stream"; import { Injectable, Logger 
} from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import * as fs from "fs"; @@ -8,6 +9,27 @@ import { writeWithBackpressure } from "../common/stream-utils.js"; import type { DataFile } from "../common/types.js"; import type { IConfig, IDatasetConfig } from "../config/app.config.js"; +export interface DeterministicBytesOptions { + /** Arbitrary namespace/key to scope the output (e.g. "nonce", "seed:round-1") */ + key: string; + /** Number of pseudo-random bytes to generate */ + bytesNeeded: number; + /** Optional: provider address or any additional entropy source */ + providerAddress?: string; + /** Optional: total size of the piece (used for key derivation to ensure same CID) */ + size?: number; +} + +export interface DeterministicBytesResult { + bytes: Buffer; + derivedKey: Buffer; +} + +const AES_KEY_LENGTH = 32; // AES-256 +const AES_IV_LENGTH = 16; // AES-CTR IV +const UINT64_BUFFER_LENGTH = 8; +const MAX_BYTES = 10 * 1024 * 1024; // 10 MiB — default pull-check piece size + @Injectable() export class DataSourceService { private readonly logger: Logger; @@ -180,4 +202,122 @@ export class DataSourceService { } } } + + // Deterministic Random data generation + /** + * Generates a deterministic pseudo-random byte buffer from the provided seeds. + * + * Algorithm: + * 1. Serialize all seed components to binary (BigUInt64BE for numeric values). + * 2. SHA-256 hash the combined seed → 32-byte AES key. + * 3. AES-256-CTR encrypt a zero-filled buffer with a static IV. + * The keystream itself is the pseudo-random output. + * + * Properties: + * - Deterministic: same inputs always produce the same output. + * - Non-invertible: output does not reveal the key or seeds (SHA-256 pre-image resistance). + * - Streamable: AES-CTR is block-aligned; with a fixed `size`, different `bytesNeeded` values + * produce prefixes of the same stream (`size` defaults to `bytesNeeded` and is part of the derived key).
+ */ + generateBytes(options: DeterministicBytesOptions): Buffer { + const { key, bytesNeeded, providerAddress = "", size = bytesNeeded } = options; + + this.validateOptions(options); + + const derivedKey = this.deriveKey(providerAddress, size, key); + const bytes = this.extractKeystream(derivedKey, bytesNeeded); + + return bytes; + } + + /** + * Returns a Readable stream of deterministic pseudo-random bytes. + */ + generateBytesStream(options: DeterministicBytesOptions): Readable { + const { key, bytesNeeded, providerAddress = "", size = bytesNeeded } = options; + + this.validateOptions({ ...options, bytesNeeded: 1 }); // Just validate basic options + + const derivedKey = this.deriveKey(providerAddress, size, key); + const staticIV = Buffer.alloc(AES_IV_LENGTH, 0); + const cipher = crypto.createCipheriv("aes-256-ctr", derivedKey, staticIV); + + let remaining = bytesNeeded; + const CHUNK_SIZE = 64 * 1024; // 64 KB chunks + + return new Readable({ + read() { + if (remaining <= 0) { + this.push(null); + return; + } + + const toRead = Math.min(remaining, CHUNK_SIZE); + const zeroes = Buffer.alloc(toRead, 0); + const chunk = cipher.update(zeroes); + remaining -= toRead; + + this.push(chunk); + + if (remaining <= 0) { + const final = cipher.final(); + if (final.length > 0) { + this.push(final); + } + this.push(null); + } + }, + }); + } + + private validateOptions(options: DeterministicBytesOptions): void { + const { key, bytesNeeded } = options; + + if (!key || typeof key !== "string" || key.trim().length === 0) { + throw new Error("DeterministicRandom: `key` must be a non-empty string."); + } + + if (!Number.isInteger(bytesNeeded) || bytesNeeded <= 0) { + throw new Error("DeterministicRandom: `bytesNeeded` must be a positive integer."); + } + + if (bytesNeeded > MAX_BYTES) { + throw new Error( + `DeterministicRandom: \`bytesNeeded\` exceeds maximum allowed size of ${MAX_BYTES} bytes. 
` + + `Split large requests into chunks.`, + ); + } + + const { size = 0 } = options; + if (!Number.isInteger(size) || size < 0) { + throw new Error("DeterministicRandom: `size` must be a non-negative integer."); + } + } + + private deriveKey(providerAddress: string, size: number, key: string): Buffer { + // Encode `size` as a fixed-width big-endian uint64 so that + // size=1 and size=10 produce distinct keys (no string-concat ambiguity). + const sizeBuffer = Buffer.alloc(UINT64_BUFFER_LENGTH); + sizeBuffer.writeBigUInt64BE(BigInt(size)); + + const seedPayload = Buffer.concat([Buffer.from(providerAddress, "utf8"), sizeBuffer, Buffer.from(key, "utf8")]); + + return crypto.createHash("sha256").update(seedPayload).digest(); + } + + private extractKeystream(derivedKey: Buffer, bytesNeeded: number): Buffer { + if (derivedKey.length !== AES_KEY_LENGTH) { + // Defensive — SHA-256 always returns 32 bytes; guard against future refactors. + throw new Error(`DeterministicRandom: derived key must be ${AES_KEY_LENGTH} bytes.`); + } + + // Static IV is intentional here: the key is freshly derived per input set, + // so IV reuse across different calls does not compromise security. + const staticIV = Buffer.alloc(AES_IV_LENGTH, 0); + const cipher = crypto.createCipheriv("aes-256-ctr", derivedKey, staticIV); + + // Encrypting zeroes extracts the raw AES-CTR keystream — our random output. + const zeroes = Buffer.alloc(bytesNeeded, 0); + return cipher.update(zeroes); + } } diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index 9249c3a9..18f67b3c 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -10,6 +10,7 @@ import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config. 
import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; +import { PullPiece } from "./entities/pull-piece.entity.js"; import { Retrieval } from "./entities/retrieval.entity.js"; import { StorageProvider } from "./entities/storage-provider.entity.js"; @@ -49,7 +50,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { + await queryRunner.query(` + CREATE TABLE IF NOT EXISTS pull_pieces ( + piece_cid TEXT PRIMARY KEY, + provider_address TEXT NOT NULL, + key TEXT NOT NULL, + size INT NOT NULL, + expires_at TIMESTAMPTZ NOT NULL, + cleaned_up BOOLEAN NOT NULL DEFAULT FALSE, + pull_submitted_at TIMESTAMPTZ, + first_byte_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ) + `); + await queryRunner.query(` + CREATE INDEX IF NOT EXISTS idx_pull_pieces_expires_at ON pull_pieces (expires_at) + `); + } + + public async down(queryRunner: QueryRunner): Promise { + await queryRunner.query(`DROP TABLE IF EXISTS pull_pieces`); + } +} diff --git a/apps/backend/src/pull-check/hosted-piece.registry.spec.ts b/apps/backend/src/pull-check/hosted-piece.registry.spec.ts deleted file mode 100644 index a8273227..00000000 --- a/apps/backend/src/pull-check/hosted-piece.registry.spec.ts +++ /dev/null @@ -1,162 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { HostedPieceRegistry } from "./hosted-piece.registry.js"; -import type { HostedPieceRegistration } from "./pull-check.types.js"; - -function makeRegistration(overrides: Partial = {}): HostedPieceRegistration { - return { - pieceCid: "bafk-test", - filePath: "/tmp/datasets/test.bin", - fileName: "test.bin", - byteLength: 1024, - contentType: "application/octet-stream", - expiresAt: new Date(Date.now() + 60_000), - cleanedUp: false, - ...overrides, - }; -} - -describe("HostedPieceRegistry", () => { - describe("register / resolveActive / 
resolveAny", () => { - it("registers a piece and resolves it by CID", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration(); - - registry.register(registration); - - expect(registry.resolveActive(registration.pieceCid)).toBe(registration); - expect(registry.resolveAny(registration.pieceCid)).toBe(registration); - }); - - it("resolveActive returns null for unknown pieceCid", () => { - const registry = new HostedPieceRegistry(); - expect(registry.resolveActive("missing")).toBeNull(); - expect(registry.resolveAny("missing")).toBeNull(); - }); - - it("resolveActive returns null when the registration has been cleaned up", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration({ cleanedUp: true }); - registry.register(registration); - - expect(registry.resolveActive(registration.pieceCid)).toBeNull(); - // resolveAny still surfaces the cleaned-up entry so the controller can - // distinguish 410 Gone from 404 Not Found. 
- expect(registry.resolveAny(registration.pieceCid)).toBe(registration); - }); - - it("resolveActive returns null when the registration has expired", () => { - const registry = new HostedPieceRegistry(); - const expired = makeRegistration({ expiresAt: new Date(2000, 0, 1) }); - registry.register(expired); - - expect(registry.resolveActive(expired.pieceCid)).toBeNull(); - expect(registry.resolveAny(expired.pieceCid)).toBe(expired); - }); - - it("resolveActive treats expiresAt boundary as expired", () => { - const registry = new HostedPieceRegistry(); - const now = new Date("2030-01-01T00:00:00Z"); - const registration = makeRegistration({ expiresAt: now }); - registry.register(registration); - - expect(registry.resolveActive(registration.pieceCid, now)).toBeNull(); - }); - }); - - describe("markCleanedUp", () => { - it("marks the registration as cleaned up so resolveActive returns null", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration(); - registry.register(registration); - - registry.markCleanedUp(registration.pieceCid); - - expect(registration.cleanedUp).toBe(true); - expect(registry.resolveActive(registration.pieceCid)).toBeNull(); - }); - - it("is a no-op for unknown pieceCid", () => { - const registry = new HostedPieceRegistry(); - expect(() => registry.markCleanedUp("missing")).not.toThrow(); - }); - }); - - describe("markPullSubmitted", () => { - it("stamps the pullSubmittedAt timestamp on a registered piece", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration(); - registry.register(registration); - const submittedAt = new Date("2030-01-01T00:00:00Z"); - - registry.markPullSubmitted(registration.pieceCid, submittedAt); - - expect(registration.pullSubmittedAt).toBe(submittedAt); - }); - - it("is idempotent: only the first call wins so SP retries do not skew measurements", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration(); - 
registry.register(registration); - const first = new Date("2030-01-01T00:00:00Z"); - const second = new Date("2030-01-01T00:00:01Z"); - - registry.markPullSubmitted(registration.pieceCid, first); - registry.markPullSubmitted(registration.pieceCid, second); - - expect(registration.pullSubmittedAt).toBe(first); - }); - - it("is a no-op for unknown pieceCid", () => { - const registry = new HostedPieceRegistry(); - expect(() => registry.markPullSubmitted("missing", new Date())).not.toThrow(); - }); - }); - - describe("markFirstByte", () => { - it("stamps the firstByteAt timestamp on a registered piece", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration(); - registry.register(registration); - const firstByteAt = new Date("2030-01-01T00:00:00.500Z"); - - registry.markFirstByte(registration.pieceCid, firstByteAt); - - expect(registration.firstByteAt).toBe(firstByteAt); - }); - - it("is idempotent: only the first SP read wins", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration(); - registry.register(registration); - const first = new Date("2030-01-01T00:00:00.500Z"); - const second = new Date("2030-01-01T00:00:01.000Z"); - - registry.markFirstByte(registration.pieceCid, first); - registry.markFirstByte(registration.pieceCid, second); - - expect(registration.firstByteAt).toBe(first); - }); - - it("is a no-op for unknown pieceCid", () => { - const registry = new HostedPieceRegistry(); - expect(() => registry.markFirstByte("missing", new Date())).not.toThrow(); - }); - }); - - describe("forget", () => { - it("removes the registration entirely", () => { - const registry = new HostedPieceRegistry(); - const registration = makeRegistration(); - registry.register(registration); - - registry.forget(registration.pieceCid); - - expect(registry.resolveAny(registration.pieceCid)).toBeNull(); - }); - - it("is a no-op for unknown pieceCid", () => { - const registry = new HostedPieceRegistry(); - 
expect(() => registry.forget("missing")).not.toThrow(); - }); - }); -}); diff --git a/apps/backend/src/pull-check/hosted-piece.registry.ts b/apps/backend/src/pull-check/hosted-piece.registry.ts deleted file mode 100644 index 202ecf80..00000000 --- a/apps/backend/src/pull-check/hosted-piece.registry.ts +++ /dev/null @@ -1,83 +0,0 @@ -import { Injectable, Logger } from "@nestjs/common"; -import type { HostedPieceRegistration } from "./pull-check.types.js"; - -/** - * In-memory registry of hosted piece sources backing pull-check requests. - * - * The first slice keeps this in process memory because there is one DealBot - * API process serving `/api/piece/:pieceCid` and pull checks are bounded by - * the configured hosted-piece TTL. - */ -@Injectable() -export class HostedPieceRegistry { - private readonly logger = new Logger(HostedPieceRegistry.name); - private readonly entries = new Map(); - - register(registration: HostedPieceRegistration): void { - this.entries.set(registration.pieceCid, registration); - this.logger.debug({ - event: "hosted_piece_registered", - message: "Registered hosted piece source", - pieceCid: registration.pieceCid, - expiresAt: registration.expiresAt.toISOString(), - byteLength: registration.byteLength, - }); - } - - /** - * Resolve a hosted piece by CID. Returns null when the entry is missing, - * already cleaned up, or has expired. - */ - resolveActive(pieceCid: string, now: Date = new Date()): HostedPieceRegistration | null { - const entry = this.entries.get(pieceCid); - if (!entry) return null; - if (entry.cleanedUp) return null; - if (entry.expiresAt.getTime() <= now.getTime()) return null; - return entry; - } - - /** - * Resolve a hosted piece by CID even when expired/cleaned-up. Used by the - * controller to differentiate a 410 Gone from a 404 Not Found. - */ - resolveAny(pieceCid: string): HostedPieceRegistration | null { - return this.entries.get(pieceCid) ?? 
null; - } - - markCleanedUp(pieceCid: string): void { - const entry = this.entries.get(pieceCid); - if (!entry) return; - entry.cleanedUp = true; - this.logger.debug({ - event: "hosted_piece_cleaned_up", - message: "Marked hosted piece source as cleaned up", - pieceCid, - }); - } - - /** - * Record the wall-clock time at which the `pullPieces` request was sent to - * the SP. Idempotent: only the first call wins so that retried checks against - * the same hosted piece do not skew first-byte measurements. - */ - markPullSubmitted(pieceCid: string, at: Date): void { - const entry = this.entries.get(pieceCid); - if (!entry || entry.pullSubmittedAt) return; - entry.pullSubmittedAt = at; - } - - /** - * Record the wall-clock time at which the SP read the first byte of the - * hosted-piece stream. Idempotent: only the first read wins so that an SP - * issuing retries after a failed connection does not overwrite the timestamp. - */ - markFirstByte(pieceCid: string, at: Date): void { - const entry = this.entries.get(pieceCid); - if (!entry || entry.firstByteAt) return; - entry.firstByteAt = at; - } - - forget(pieceCid: string): void { - this.entries.delete(pieceCid); - } -} diff --git a/apps/backend/src/pull-check/piece-source.controller.spec.ts b/apps/backend/src/pull-check/piece-source.controller.spec.ts deleted file mode 100644 index bebfe106..00000000 --- a/apps/backend/src/pull-check/piece-source.controller.spec.ts +++ /dev/null @@ -1,173 +0,0 @@ -import { Readable, Writable } from "node:stream"; -import { Test } from "@nestjs/testing"; -import type { Response } from "express"; -import { describe, expect, it, vi } from "vitest"; -import { HostedPieceRegistry } from "./hosted-piece.registry.js"; -import { PieceSourceController } from "./piece-source.controller.js"; -import { PullCheckService } from "./pull-check.service.js"; -import type { HostedPieceRegistration } from "./pull-check.types.js"; - -function makeRegistration(overrides: Partial = {}): 
HostedPieceRegistration { - return { - pieceCid: "bafk-test", - filePath: "/tmp/test.bin", - fileName: "test.bin", - byteLength: 4, - contentType: "application/octet-stream", - expiresAt: new Date(Date.now() + 60_000), - cleanedUp: false, - ...overrides, - }; -} - -/** - * Fake express `Response` that is also a `Writable`, so `stream.pipe(res)` - * works without a real HTTP layer. The controller only calls `setHeader`, - * `status`, `send`, and `destroy`; we spy on those and let pipe write into - * the sink to verify the body. - */ -type FakeResponse = Writable & { - headersSent: boolean; - chunks: Buffer[]; - setHeader: ReturnType; - status: ReturnType; - send: ReturnType; -}; - -function makeResponse(): FakeResponse { - const chunks: Buffer[] = []; - const sink = new Writable({ - write(chunk, _encoding, cb) { - chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); - cb(); - }, - }) as FakeResponse; - sink.headersSent = false; - sink.chunks = chunks; - sink.setHeader = vi.fn(); - sink.status = vi.fn().mockReturnValue(sink); - sink.send = vi.fn().mockReturnValue(sink); - return sink; -} - -function asResponse(res: FakeResponse): Response { - return res as unknown as Response; -} - -async function setup(opts: { - opened?: ReturnType; - knownEntry?: HostedPieceRegistration | null; -}) { - const pullCheckService = { - openHostedPieceStream: vi.fn().mockReturnValue(opts.opened ?? null), - }; - const hostedPieceRegistry = { - resolveAny: vi.fn().mockReturnValue(opts.knownEntry ?? 
null), - markFirstByte: vi.fn(), - }; - - const module = await Test.createTestingModule({ - controllers: [PieceSourceController], - providers: [ - { provide: PullCheckService, useValue: pullCheckService }, - { provide: HostedPieceRegistry, useValue: hostedPieceRegistry }, - ], - }).compile(); - - const controller = module.get(PieceSourceController); - return { controller, pullCheckService, hostedPieceRegistry }; -} - -describe("PieceSourceController", () => { - it("returns 404 when pieceCid is missing or empty", async () => { - const { controller } = await setup({}); - const res = makeResponse(); - - // servePiece throws a NestJS NotFoundException synchronously; it is not async. - expect(() => controller.servePiece("", asResponse(res))).toThrow(/pieceCid is required/); - expect(() => controller.servePiece(" ", asResponse(res))).toThrow(/pieceCid is required/); - }); - - it("returns 404 when no registration exists for the pieceCid", async () => { - const { controller, pullCheckService, hostedPieceRegistry } = await setup({}); - const res = makeResponse(); - - controller.servePiece("bafk-unknown", asResponse(res)); - - expect(pullCheckService.openHostedPieceStream).toHaveBeenCalledWith("bafk-unknown"); - expect(hostedPieceRegistry.resolveAny).toHaveBeenCalledWith("bafk-unknown"); - expect(res.status).toHaveBeenCalledWith(404); - expect(res.send).toHaveBeenCalledWith("Hosted piece source not found"); - }); - - it("returns 410 when the registration exists but is no longer active", async () => { - const cleaned = makeRegistration({ cleanedUp: true }); - const { controller } = await setup({ opened: null, knownEntry: cleaned }); - const res = makeResponse(); - - controller.servePiece(cleaned.pieceCid, asResponse(res)); - - expect(res.status).toHaveBeenCalledWith(410); - expect(res.send).toHaveBeenCalledWith("Hosted piece source has expired or been cleaned up"); - }); - - it("streams the piece, sets headers, and marks first byte on the first chunk", async () => { - const 
registration = makeRegistration(); - const stream = Readable.from([Buffer.from("ABCD")]); - const { controller, hostedPieceRegistry } = await setup({ - opened: { registration, stream } as ReturnType, - }); - const res = makeResponse(); - const pipeSpy = vi.spyOn(stream, "pipe"); - - controller.servePiece(registration.pieceCid, asResponse(res)); - - expect(res.setHeader).toHaveBeenCalledWith("Content-Type", "application/octet-stream"); - expect(res.setHeader).toHaveBeenCalledWith("Content-Length", "4"); - expect(res.setHeader).toHaveBeenCalledWith("Cache-Control", "no-store"); - expect(res.setHeader).toHaveBeenCalledWith("X-Pull-Check-Piece-CID", registration.pieceCid); - expect(pipeSpy).toHaveBeenCalledTimes(1); - - // Wait for the stream to fully drain into our fake Writable sink. - await new Promise((resolve) => res.once("finish", resolve)); - - expect(hostedPieceRegistry.markFirstByte).toHaveBeenCalledTimes(1); - expect(hostedPieceRegistry.markFirstByte).toHaveBeenCalledWith(registration.pieceCid, expect.any(Date)); - expect(Buffer.concat(res.chunks).toString()).toBe("ABCD"); - }); - - it("sends a 500 response when the stream errors before headers are sent", () => { - const registration = makeRegistration(); - const stream = new Readable({ read() {} }); - const opened = { registration, stream } as ReturnType; - const res = makeResponse(); - - return setup({ opened }).then(({ controller }) => { - controller.servePiece(registration.pieceCid, asResponse(res)); - - stream.destroy(new Error("boom")); - stream.emit("error", new Error("boom")); - - expect(res.status).toHaveBeenCalledWith(500); - expect(res.send).toHaveBeenCalledWith("Failed to stream hosted piece"); - }); - }); - - it("destroys the response when the stream errors after headers are sent", async () => { - const registration = makeRegistration(); - const stream = new Readable({ read() {} }); - const opened = { registration, stream } as ReturnType; - const res = makeResponse(); - res.headersSent = true; - 
// Mock the real destroy to keep Writable from re-emitting the error as an - // unhandled event; we only need to assert the controller forwarded it. - const destroySpy = vi.spyOn(res, "destroy").mockImplementation(() => res); - - const { controller } = await setup({ opened }); - controller.servePiece(registration.pieceCid, asResponse(res)); - const error = new Error("late-boom"); - stream.emit("error", error); - - expect(destroySpy).toHaveBeenCalledWith(error); - }); -}); diff --git a/apps/backend/src/pull-check/pull-check.module.ts b/apps/backend/src/pull-check/pull-check.module.ts index d2881735..45574158 100644 --- a/apps/backend/src/pull-check/pull-check.module.ts +++ b/apps/backend/src/pull-check/pull-check.module.ts @@ -1,16 +1,18 @@ import { Module } from "@nestjs/common"; +import { TypeOrmModule } from "@nestjs/typeorm"; import { DatabaseModule } from "../database/database.module.js"; +import { PullPiece } from "../database/entities/pull-piece.entity.js"; import { DataSourceModule } from "../dataSource/dataSource.module.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; -import { HostedPieceRegistry } from "./hosted-piece.registry.js"; -import { PieceSourceController } from "./piece-source.controller.js"; import { PullCheckService } from "./pull-check.service.js"; +import { PieceSourceController } from "./pull-piece.controller.js"; +import { PullPieceRepository } from "./pull-piece.repository.js"; @Module({ - imports: [DatabaseModule, WalletSdkModule, DataSourceModule, HttpClientModule], + imports: [DatabaseModule, TypeOrmModule.forFeature([PullPiece]), WalletSdkModule, DataSourceModule, HttpClientModule], controllers: [PieceSourceController], - providers: [PullCheckService, HostedPieceRegistry], - exports: [PullCheckService, HostedPieceRegistry], + providers: [PullCheckService, PullPieceRepository], + exports: [PullCheckService, PullPieceRepository], }) export 
class PullCheckModule {} diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index ce420711..8e504413 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -1,15 +1,15 @@ +import { Readable } from "node:stream"; import { ConfigService } from "@nestjs/config"; import { Test, type TestingModule } from "@nestjs/testing"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { IConfig } from "../config/app.config.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; -import { DealService } from "../deal/deal.service.js"; import { HttpClientService } from "../http-client/http-client.service.js"; import { PullCheckCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; -import { HostedPieceRegistry } from "./hosted-piece.registry.js"; import { PullCheckService } from "./pull-check.service.js"; +import { PullPieceRepository } from "./pull-piece.repository.js"; // `@filoz/synapse-core/piece` is mocked so that piece CIDs are deterministic // strings rather than real CID objects, keeping the tests fast and isolated @@ -17,6 +17,7 @@ import { PullCheckService } from "./pull-check.service.js"; vi.mock("@filoz/synapse-core/piece", () => ({ parse: vi.fn((s: string) => ({ __parsed: s, toString: () => s })), calculate: vi.fn(() => ({ toString: () => "bafk-test-piece" })), + calculateFromIterable: vi.fn().mockResolvedValue("bafk-test-piece"), })); vi.mock("@filoz/synapse-core/sp", () => ({ @@ -48,6 +49,8 @@ describe("PullCheckService", () => { let dataSourceServiceMock: { generateRandomDataset: ReturnType; cleanupRandomDataset: ReturnType; + generateBytes: ReturnType; + generateBytesStream: ReturnType; }; let registryMock: 
{ register: ReturnType; @@ -58,7 +61,6 @@ describe("PullCheckService", () => { markFirstByte: ReturnType; forget: ReturnType; }; - let dealServiceMock: { getBaseDataSetMetadata: ReturnType }; let httpClientServiceMock: { requestWithMetrics: ReturnType }; let metricsMock: { observeRequestLatencyMs: ReturnType; @@ -78,18 +80,17 @@ describe("PullCheckService", () => { dataSourceServiceMock = { generateRandomDataset: vi.fn(), cleanupRandomDataset: vi.fn(), + generateBytes: vi.fn().mockReturnValue(Buffer.alloc(10)), + generateBytesStream: vi.fn().mockReturnValue(Readable.from([Buffer.alloc(10)])), }; registryMock = { - register: vi.fn(), - resolveAny: vi.fn().mockReturnValue(null), - resolveActive: vi.fn().mockReturnValue(null), - markCleanedUp: vi.fn(), - markPullSubmitted: vi.fn(), - markFirstByte: vi.fn(), - forget: vi.fn(), - }; - dealServiceMock = { - getBaseDataSetMetadata: vi.fn().mockReturnValue({}), + register: vi.fn().mockResolvedValue(undefined), + resolveAny: vi.fn().mockResolvedValue(null), + resolveActive: vi.fn().mockResolvedValue(null), + markCleanedUp: vi.fn().mockResolvedValue(undefined), + markPullSubmitted: vi.fn().mockResolvedValue(undefined), + markFirstByte: vi.fn().mockResolvedValue(undefined), + forget: vi.fn().mockResolvedValue(undefined), }; httpClientServiceMock = { requestWithMetrics: vi.fn(), @@ -125,9 +126,8 @@ describe("PullCheckService", () => { { provide: ConfigService, useValue: configServiceMock }, { provide: WalletSdkService, useValue: walletSdkServiceMock }, { provide: DataSourceService, useValue: dataSourceServiceMock }, - { provide: HostedPieceRegistry, useValue: registryMock }, + { provide: PullPieceRepository, useValue: registryMock }, { provide: PullCheckCheckMetrics, useValue: metricsMock }, - { provide: DealService, useValue: dealServiceMock }, { provide: HttpClientService, useValue: httpClientServiceMock }, ], }).compile(); @@ -170,88 +170,29 @@ describe("PullCheckService", () => { }); }); - describe("prepareHostedPiece", () 
=> { - it("generates a dataset, computes the piece CID, and registers the hosted piece", async () => { - dataSourceServiceMock.generateRandomDataset.mockResolvedValue({ - name: "test.bin", - data: Buffer.from("hello"), - size: 5, - }); - - const prepared = await service.prepareHostedPiece(); + describe("preparePullPiece", () => { + it("generates deterministic bytes, computes the piece CID, and registers the pull piece", async () => { + const prepared = await service.preparePullPiece("0xsp"); - expect(dataSourceServiceMock.generateRandomDataset).toHaveBeenCalledWith(1024, 1024); - expect(calculate).toHaveBeenCalledTimes(1); + expect(dataSourceServiceMock.generateBytesStream).toHaveBeenCalledWith({ + providerAddress: "0xsp", + key: expect.any(String), + bytesNeeded: 1024, + }); expect(prepared.registration.pieceCid).toBe("bafk-test-piece"); - expect(prepared.registration.fileName).toBe("test.bin"); - expect(prepared.registration.byteLength).toBe(5); + expect(prepared.registration.size).toBe(1024); expect(prepared.sourceUrl).toBe("https://dealbot.example/api/piece/bafk-test-piece"); expect(registryMock.register).toHaveBeenCalledWith(prepared.registration); }); it("falls back to host:port when apiPublicUrl is not configured", async () => { configValues.app = { host: "localhost", port: 3000 } as IConfig["app"]; - dataSourceServiceMock.generateRandomDataset.mockResolvedValue({ - name: "test.bin", - data: Buffer.from("hello"), - size: 5, - }); - const prepared = await service.prepareHostedPiece(); + const prepared = await service.preparePullPiece("0xsp"); expect(prepared.sourceUrl).toBe("http://localhost:3000/api/piece/bafk-test-piece"); }); }); - describe("cleanupHostedPiece", () => { - const baseEntry = { - pieceCid: "bafk-test-piece", - filePath: "/tmp/datasets/test.bin", - fileName: "test.bin", - byteLength: 5, - contentType: "application/octet-stream", - expiresAt: new Date(Date.now() + 60_000), - cleanedUp: false, - }; - - it("marks the registration cleaned up and 
removes the file", async () => { - registryMock.resolveAny.mockReturnValue({ ...baseEntry }); - - await service.cleanupHostedPiece(baseEntry.pieceCid); - - expect(registryMock.markCleanedUp).toHaveBeenCalledWith(baseEntry.pieceCid); - expect(dataSourceServiceMock.cleanupRandomDataset).toHaveBeenCalledWith(baseEntry.fileName); - expect(registryMock.forget).toHaveBeenCalledWith(baseEntry.pieceCid); - }); - - it("skips file cleanup when the registration is already cleaned up", async () => { - registryMock.resolveAny.mockReturnValue({ ...baseEntry, cleanedUp: true }); - - await service.cleanupHostedPiece(baseEntry.pieceCid); - - expect(registryMock.markCleanedUp).not.toHaveBeenCalled(); - expect(dataSourceServiceMock.cleanupRandomDataset).not.toHaveBeenCalled(); - expect(registryMock.forget).toHaveBeenCalledWith(baseEntry.pieceCid); - }); - - it("forgets the entry even when no registration exists", async () => { - registryMock.resolveAny.mockReturnValue(null); - - await service.cleanupHostedPiece("missing"); - - expect(registryMock.markCleanedUp).not.toHaveBeenCalled(); - expect(dataSourceServiceMock.cleanupRandomDataset).not.toHaveBeenCalled(); - expect(registryMock.forget).toHaveBeenCalledWith("missing"); - }); - - it("does not propagate cleanup errors so callers can rely on it in finally", async () => { - registryMock.resolveAny.mockReturnValue({ ...baseEntry }); - dataSourceServiceMock.cleanupRandomDataset.mockRejectedValue(new Error("disk full")); - - await expect(service.cleanupHostedPiece(baseEntry.pieceCid)).resolves.toBeUndefined(); - expect(registryMock.forget).toHaveBeenCalledWith(baseEntry.pieceCid); - }); - }); - describe("validateByDirectPieceFetch", () => { const provider = makeProvider(); const logContext = { jobId: "job-1", providerAddress: "0xsp", providerId: 42n, providerName: "test-sp" }; @@ -311,26 +252,21 @@ describe("PullCheckService", () => { const logContext = { jobId: "job-1", providerAddress: "0xsp", providerId: 42n, providerName: "test-sp" 
}; function arrangeHappyPath() { - // Pre-stage a registration that prepareHostedPiece will install. + // Pre-stage a registration that preparePullPiece will install. const registration = { pieceCid: "bafk-test-piece", - filePath: "/tmp/datasets/test.bin", - fileName: "test.bin", - byteLength: 1024, - contentType: "application/octet-stream", + providerAddress: "0xsp", + key: "test-key", + size: 1024, expiresAt: new Date(Date.now() + 60_000), cleanedUp: false, pullSubmittedAt: new Date("2030-01-01T00:00:00Z"), firstByteAt: new Date("2030-01-01T00:00:00.250Z"), }; - dataSourceServiceMock.generateRandomDataset.mockResolvedValue({ - name: registration.fileName, - data: Buffer.alloc(registration.byteLength), - size: registration.byteLength, - }); + // After cleanup the resolveAny call returns the entry; before that the // run reads it once to compute first-byte latency. Same shape suffices. - registryMock.resolveAny.mockReturnValue(registration); + registryMock.resolveAny.mockResolvedValue(registration); vi.mocked(pullPieces).mockResolvedValue({ status: "pending" } as unknown as Awaited< ReturnType @@ -368,15 +304,15 @@ describe("PullCheckService", () => { expect(metricsMock.observeThroughputBps).toHaveBeenCalledTimes(1); // Terminal aggregate status is success. expect(metricsMock.recordStatus).toHaveBeenCalledWith(expect.any(Object), "success"); - // Cleanup ran exactly once. - expect(registryMock.markCleanedUp).toHaveBeenCalledWith(registration.pieceCid); + + // Cleanup ran (forget called) expect(registryMock.forget).toHaveBeenCalledWith(registration.pieceCid); }); it("does not observe firstByte when the SP never read from /api/piece (cached pull)", async () => { const { registration } = arrangeHappyPath(); // Simulate a cached pull: SP never fetched from us. 
- registryMock.resolveAny.mockReturnValue({ ...registration, firstByteAt: undefined }); + registryMock.resolveAny.mockResolvedValue({ ...registration, firstByteAt: undefined }); await service.runPullCheck("0xsp", undefined, logContext); @@ -413,9 +349,9 @@ describe("PullCheckService", () => { it("re-throws and runs cleanup when the validation step fails", async () => { arrangeHappyPath(); // Force validation mismatch by returning a different recomputed CID. - vi.mocked(calculate) - .mockReturnValueOnce({ toString: () => "bafk-test-piece" } as ReturnType) // prepareHostedPiece - .mockReturnValueOnce({ toString: () => "bafk-mismatch" } as ReturnType); // validateByDirectPieceFetch + // preparePullPiece no longer calls calculate, it uses createPieceCIDStream. + // So the first call to calculate will be from validateByDirectPieceFetch. + vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-mismatch" } as ReturnType); await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(/validation failed/); expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.other"); @@ -444,10 +380,32 @@ describe("PullCheckService", () => { }); }); - describe("openHostedPieceStream", () => { - it("returns null when no active registration exists", () => { - registryMock.resolveActive.mockReturnValue(null); - expect(service.openHostedPieceStream("missing")).toBeNull(); + describe("openPullPieceStream", () => { + it("returns null when no active registration exists", async () => { + registryMock.resolveActive.mockResolvedValue(null); + expect(await service.openPullPieceStream("missing")).toBeNull(); + }); + + it("returns a stream when active registration exists", async () => { + const registration = { + pieceCid: "bafk-test-piece", + providerAddress: "0xsp", + key: "test-key", + size: 1024, + expiresAt: new Date(Date.now() + 60_000), + cleanedUp: false, + }; + registryMock.resolveActive.mockResolvedValue(registration); + + const 
result = await service.openPullPieceStream("bafk-test-piece"); + expect(result).not.toBeNull(); + expect(result?.registration).toEqual(registration); + expect(result?.stream).toBeDefined(); + expect(dataSourceServiceMock.generateBytesStream).toHaveBeenCalledWith({ + providerAddress: "0xsp", + key: "test-key", + bytesNeeded: 1024, + }); }); }); }); diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index a411f136..cc537485 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -1,20 +1,20 @@ -import * as fs from "node:fs"; -import * as path from "node:path"; -import { calculate, parse as parsePieceCid } from "@filoz/synapse-core/piece"; +import * as crypto from "node:crypto"; +import { Readable } from "node:stream"; +import { calculate, calculateFromIterable, parse as parsePieceCid } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import type { Account, Address, Chain, Client, Transport } from "viem"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; -import type { IAppConfig, IConfig, IDatasetConfig, IJobsConfig } from "../config/app.config.js"; +import type { IAppConfig, IConfig, IJobsConfig } from "../config/app.config.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; import { HttpClientService } from "../http-client/http-client.service.js"; import { buildCheckMetricLabels, classifyFailureStatus } from "../metrics-prometheus/check-metric-labels.js"; import { PullCheckCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; -import { HostedPieceRegistry } from 
"./hosted-piece.registry.js"; -import type { HostedPiecePrepared } from "./pull-check.types.js"; +import type { PullPiecePrepared, PullPieceRegistration } from "./pull-check.types.js"; +import { PullPieceRepository } from "./pull-piece.repository.js"; type SynapseViemClient = Client; @@ -26,7 +26,7 @@ export class PullCheckService { private readonly configService: ConfigService, private readonly walletSdkService: WalletSdkService, private readonly dataSourceService: DataSourceService, - private readonly hostedPieceRegistry: HostedPieceRegistry, + private readonly pullPieceRepository: PullPieceRepository, private readonly pullCheckMetrics: PullCheckCheckMetrics, private readonly httpClientService: HttpClientService, ) {} @@ -56,7 +56,7 @@ export class PullCheckService { /** * Drive one pull check through its full lifecycle: - * prepare hosted piece -> submit pull -> poll terminal SP status + * prepare pull piece -> submit pull -> poll terminal SP status * -> commit on dataset -> direct `/piece/:cid` validation -> cleanup. 
* * NOTE: Pull-check committed pieces are not tracked in the `deal` table, so @@ -76,12 +76,12 @@ export class PullCheckService { providerIsApproved: providerInfo.isApproved, }); - let prepared: HostedPiecePrepared | null = null; + let prepared: PullPiecePrepared | null = null; let requestSubmittedAt: Date | null = null; try { signal?.throwIfAborted(); - prepared = await this.prepareHostedPiece(); + prepared = await this.preparePullPiece(spAddress); const pieceCidStr = prepared.registration.pieceCid; const pieceCidParsed = parsePieceCid(pieceCidStr); @@ -101,7 +101,7 @@ export class PullCheckService { }; requestSubmittedAt = new Date(); - this.hostedPieceRegistry.markPullSubmitted(pieceCidStr, requestSubmittedAt); + await this.pullPieceRepository.markPullSubmitted(pieceCidStr, requestSubmittedAt); const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); signal?.throwIfAborted(); const requestLatencyMs = Date.now() - requestSubmittedAt.getTime(); @@ -138,7 +138,7 @@ export class PullCheckService { throw new Error("Pull-check piece validation failed: SP did not serve the expected bytes"); } - const firstByteEntry = this.hostedPieceRegistry.resolveAny(pieceCidStr); + const firstByteEntry = await this.pullPieceRepository.resolveAny(pieceCidStr); const firstByteMs = firstByteEntry?.firstByteAt && firstByteEntry?.pullSubmittedAt ? firstByteEntry.firstByteAt.getTime() - firstByteEntry.pullSubmittedAt.getTime() @@ -149,7 +149,7 @@ export class PullCheckService { // Throughput approximated as pieceSize / completionLatency. This is an // upper-bound on actual transfer time because completionLatency includes // SP-side scheduling/queuing and our polling cadence. 
- const throughputBps = Math.round((prepared.registration.byteLength * 1000) / Math.max(completionLatencyMs, 1)); + const throughputBps = Math.round((prepared.registration.size * 1000) / Math.max(completionLatencyMs, 1)); this.pullCheckMetrics.observeThroughputBps(labels, throughputBps); this.pullCheckMetrics.recordStatus(labels, "success"); @@ -162,14 +162,14 @@ export class PullCheckService { completionLatencyMs, firstByteMs, throughputBps, - pieceSizeBytes: prepared.registration.byteLength, + pieceSizeBytes: prepared.registration.size, }); } catch (error) { this.pullCheckMetrics.recordStatus(labels, classifyFailureStatus(error)); throw error; } finally { if (prepared) { - await this.cleanupHostedPiece(prepared.registration.pieceCid); + await this.pullPieceRepository.forget(prepared.registration.pieceCid); } } } @@ -217,16 +217,18 @@ export class PullCheckService { * Generate a synthetic test piece, compute its piece CID, register it for * `/api/piece/:pieceCid` serving, and return the source URL plus registration. */ - async prepareHostedPiece(): Promise { + async preparePullPiece(providerAddress: string): Promise { const jobsConfig = this.getJobsConfig(); - const datasetConfig = this.configService.get("dataset"); const targetSize = jobsConfig.pullCheckPieceSizeBytes; + const key = crypto.randomBytes(16).toString("hex"); - const dataFile = await this.dataSourceService.generateRandomDataset(targetSize, targetSize); - const filePath = path.join(datasetConfig.localDatasetsPath, dataFile.name); - const dataBytes = - dataFile.data instanceof Uint8Array ? 
dataFile.data : new Uint8Array(dataFile.data as ArrayBufferLike); - const pieceCid = calculate(dataBytes); + const dataStream = this.dataSourceService.generateBytesStream({ + providerAddress, + key, + bytesNeeded: targetSize, + }); + + const pieceCid = await calculateFromIterable(dataStream); const pieceCidStr = pieceCid.toString(); const baseUrl = this.resolvePublicBaseUrl(); const sourceUrl = `${baseUrl}/api/piece/${pieceCidStr}`; @@ -234,40 +236,17 @@ export class PullCheckService { const registration = { pieceCid: pieceCidStr, - filePath, - fileName: dataFile.name, - byteLength: dataFile.size, - contentType: "application/octet-stream", + providerAddress, + key, + size: targetSize, expiresAt, cleanedUp: false, }; - this.hostedPieceRegistry.register(registration); + await this.pullPieceRepository.register(registration); return { registration, sourceUrl }; } - /** - * Mark the hosted piece as cleaned up and remove the on-disk artifact. Safe - * to call multiple times. - */ - async cleanupHostedPiece(pieceCid: string): Promise { - const entry = this.hostedPieceRegistry.resolveAny(pieceCid); - if (entry && !entry.cleanedUp) { - this.hostedPieceRegistry.markCleanedUp(pieceCid); - try { - await this.dataSourceService.cleanupRandomDataset(entry.fileName); - } catch (error) { - this.logger.warn({ - event: "pull_check_cleanup_warn", - message: "Failed to cleanup hosted piece artifact", - pieceCid, - error: toStructuredError(error), - }); - } - } - this.hostedPieceRegistry.forget(pieceCid); - } - private getJobsConfig(): IJobsConfig { return this.configService.get("jobs", { infer: true }); } @@ -287,17 +266,23 @@ export class PullCheckService { } /** - * Stream the hosted piece bytes for an active registration. Used by the + * Stream the pull piece bytes for an active registration. Used by the * `/api/piece/:pieceCid` controller. Returns null when no active registration * exists; callers must distinguish 404 from 410 using the registry directly. 
*/ - openHostedPieceStream( + async openPullPieceStream( pieceCid: string, now: Date = new Date(), - ): { registration: NonNullable>; stream: fs.ReadStream } | null { - const registration = this.hostedPieceRegistry.resolveActive(pieceCid, now); + ): Promise<{ registration: PullPieceRegistration; stream: Readable } | null> { + const registration = await this.pullPieceRepository.resolveActive(pieceCid, now); if (!registration) return null; - const stream = fs.createReadStream(registration.filePath); + + const stream = this.dataSourceService.generateBytesStream({ + providerAddress: registration.providerAddress, + key: registration.key, + bytesNeeded: registration.size, + }); + return { registration, stream }; } } diff --git a/apps/backend/src/pull-check/pull-check.types.ts b/apps/backend/src/pull-check/pull-check.types.ts index c8caf447..2a8a27e9 100644 --- a/apps/backend/src/pull-check/pull-check.types.ts +++ b/apps/backend/src/pull-check/pull-check.types.ts @@ -2,12 +2,11 @@ * In-memory registration describing a hosted-piece source served at * `/api/piece/:pieceCid` for a single in-flight pull check. */ -export type HostedPieceRegistration = { +export type PullPieceRegistration = { pieceCid: string; - filePath: string; - fileName: string; - byteLength: number; - contentType: string; + providerAddress: string; + key: string; + size: number; expiresAt: Date; cleanedUp: boolean; pullSubmittedAt?: Date; @@ -18,7 +17,7 @@ export type HostedPieceRegistration = { * Result of preparing a hosted piece, returned by the service to callers that * need both the routing identity and the on-disk artifact path. 
*/ -export type HostedPiecePrepared = { - registration: HostedPieceRegistration; +export type PullPiecePrepared = { + registration: PullPieceRegistration; sourceUrl: string; }; diff --git a/apps/backend/src/pull-check/piece-source.controller.ts b/apps/backend/src/pull-check/pull-piece.controller.ts similarity index 57% rename from apps/backend/src/pull-check/piece-source.controller.ts rename to apps/backend/src/pull-check/pull-piece.controller.ts index 92596af4..456eae2a 100644 --- a/apps/backend/src/pull-check/piece-source.controller.ts +++ b/apps/backend/src/pull-check/pull-piece.controller.ts @@ -1,11 +1,11 @@ import { Controller, Get, Logger, NotFoundException, Param, Res } from "@nestjs/common"; import { ApiOperation, ApiResponse, ApiTags } from "@nestjs/swagger"; import type { Response } from "express"; -import { HostedPieceRegistry } from "./hosted-piece.registry.js"; import { PullCheckService } from "./pull-check.service.js"; +import { PullPieceRepository } from "./pull-piece.repository.js"; /** - * Serves the temporary hosted-piece bytes that a storage provider must fetch + * Serves the temporary pull-piece bytes that a storage provider must fetch * during a pull check. Bound to the same `/api/*` prefix as other DealBot HTTP * endpoints. The path component must end with `/piece/{pieceCid}` so that * SP-side pull workers can address the resource directly. 
@@ -17,66 +17,66 @@ export class PieceSourceController { constructor( private readonly pullCheckService: PullCheckService, - private readonly hostedPieceRegistry: HostedPieceRegistry, + private readonly pullPieceRepository: PullPieceRepository, ) {} @Get("piece/:pieceCid") @ApiOperation({ - summary: "Stream a temporary hosted piece for an in-flight SP pull check", + summary: "Stream a temporary pull piece for an in-flight SP pull check", }) @ApiResponse({ status: 200, description: "Raw piece bytes streamed to the caller" }) - @ApiResponse({ status: 404, description: "No active hosted piece exists for this pieceCid" }) - @ApiResponse({ status: 410, description: "Hosted piece existed but has expired or been cleaned up" }) - servePiece(@Param("pieceCid") pieceCid: string, @Res() res: Response): void { + @ApiResponse({ status: 404, description: "No active pull piece exists for this pieceCid" }) + @ApiResponse({ status: 410, description: "Pull piece existed but has expired or been cleaned up" }) + async servePiece(@Param("pieceCid") pieceCid: string, @Res() res: Response): Promise { if (!pieceCid || pieceCid.trim().length === 0) { throw new NotFoundException("pieceCid is required"); } - const opened = this.pullCheckService.openHostedPieceStream(pieceCid); + const opened = await this.pullCheckService.openPullPieceStream(pieceCid); if (!opened) { - const known = this.hostedPieceRegistry.resolveAny(pieceCid); + const known = await this.pullPieceRepository.resolveAny(pieceCid); if (known) { this.logger.warn({ event: "pull_check_piece_gone", - message: "Hosted piece source no longer active", + message: "Pull piece source no longer active", pieceCid, cleanedUp: known.cleanedUp, expiresAt: known.expiresAt.toISOString(), }); - res.status(410).send("Hosted piece source has expired or been cleaned up"); + res.status(410).send("Pull piece source has expired or been cleaned up"); return; } this.logger.warn({ event: "pull_check_piece_unknown", - message: "Hosted piece source not 
found", + message: "Pull piece source not found", pieceCid, }); - res.status(404).send("Hosted piece source not found"); + res.status(404).send("Pull piece source not found"); return; } const { registration, stream } = opened; - res.setHeader("Content-Type", registration.contentType); - res.setHeader("Content-Length", registration.byteLength.toString()); + res.setHeader("Content-Type", "application/octet-stream"); + res.setHeader("Content-Length", registration.size.toString()); res.setHeader("Cache-Control", "no-store"); res.setHeader("X-Pull-Check-Piece-CID", registration.pieceCid); stream.on("error", (error) => { this.logger.error({ event: "pull_check_piece_stream_error", - message: "Failed to stream hosted piece", + message: "Failed to stream pull piece", pieceCid, error: error.message, }); if (!res.headersSent) { - res.status(500).send("Failed to stream hosted piece"); + res.status(500).send("Failed to stream pull piece"); return; } res.destroy(error); }); - // Capture the first-byte timestamp before piping + // Capture the first-byte timestamp before piping (fire-and-forget DB write) stream.once("data", () => { - this.hostedPieceRegistry.markFirstByte(pieceCid, new Date()); + void this.pullPieceRepository.markFirstByte(pieceCid, new Date()); }); stream.pipe(res); } diff --git a/apps/backend/src/pull-check/pull-piece.repository.ts b/apps/backend/src/pull-check/pull-piece.repository.ts new file mode 100644 index 00000000..bc028af5 --- /dev/null +++ b/apps/backend/src/pull-check/pull-piece.repository.ts @@ -0,0 +1,122 @@ +import { Injectable, Logger } from "@nestjs/common"; +import { InjectRepository } from "@nestjs/typeorm"; +import type { Repository } from "typeorm"; +import { PullPiece } from "../database/entities/pull-piece.entity.js"; +import type { PullPieceRegistration } from "./pull-check.types.js"; + +/** + * Postgres-backed registry of hosted piece sources backing pull-check requests. 
+ * + * Persisting to the `hosted_pieces` table allows the API pod(s) to resolve + * registrations created by a separate worker pod in split-process deployments. + */ +@Injectable() +export class PullPieceRepository { + private readonly logger = new Logger(PullPieceRepository.name); + + constructor( + @InjectRepository(PullPiece) + private readonly repo: Repository, + ) {} + + async register(registration: PullPieceRegistration): Promise { + await this.repo.upsert( + { + pieceCid: registration.pieceCid, + providerAddress: registration.providerAddress, + key: registration.key, + size: registration.size, + expiresAt: registration.expiresAt, + cleanedUp: false, + pullSubmittedAt: null, + firstByteAt: null, + }, + ["pieceCid"], + ); + this.logger.debug({ + event: "hosted_piece_registered", + message: "Registered hosted piece source", + pieceCid: registration.pieceCid, + expiresAt: registration.expiresAt.toISOString(), + size: `${registration.size} bytes`, + }); + } + + /** + * Resolve a hosted piece by CID. Returns null when the entry is missing, + * already cleaned up, or has expired. + */ + async resolveActive(pieceCid: string, now: Date = new Date()): Promise { + const row = await this.repo.findOneBy({ pieceCid }); + if (!row) return null; + if (row.cleanedUp) return null; + if (row.expiresAt.getTime() <= now.getTime()) return null; + return this.toRegistration(row); + } + + /** + * Resolve a hosted piece by CID even when expired/cleaned-up. Used by the + * controller to differentiate a 410 Gone from a 404 Not Found. + */ + async resolveAny(pieceCid: string): Promise { + const row = await this.repo.findOneBy({ pieceCid }); + return row ? 
this.toRegistration(row) : null; + } + + async markCleanedUp(pieceCid: string): Promise { + const result = await this.repo.update({ pieceCid, cleanedUp: false }, { cleanedUp: true }); + if (result.affected && result.affected > 0) { + this.logger.debug({ + event: "hosted_piece_cleaned_up", + message: "Marked hosted piece source as cleaned up", + pieceCid, + }); + } + } + + /** + * Record the wall-clock time at which the `pullPieces` request was sent to + * the SP. Idempotent: only the first call wins so that retried checks against + * the same hosted piece do not skew first-byte measurements. + */ + async markPullSubmitted(pieceCid: string, at: Date): Promise { + // Only set when currently null (idempotent first-write-wins) + await this.repo + .createQueryBuilder() + .update(PullPiece) + .set({ pullSubmittedAt: at }) + .where("piece_cid = :pieceCid AND pull_submitted_at IS NULL", { pieceCid }) + .execute(); + } + + /** + * Record the wall-clock time at which the SP read the first byte of the + * hosted-piece stream. Idempotent: only the first read wins so that an SP + * issuing retries after a failed connection does not overwrite the timestamp. + */ + async markFirstByte(pieceCid: string, at: Date): Promise { + await this.repo + .createQueryBuilder() + .update(PullPiece) + .set({ firstByteAt: at }) + .where("piece_cid = :pieceCid AND first_byte_at IS NULL", { pieceCid }) + .execute(); + } + + async forget(pieceCid: string): Promise { + await this.repo.delete({ pieceCid }); + } + + private toRegistration(row: PullPiece): PullPieceRegistration { + return { + pieceCid: row.pieceCid, + providerAddress: row.providerAddress, + key: row.key, + size: row.size, + expiresAt: row.expiresAt, + cleanedUp: row.cleanedUp, + pullSubmittedAt: row.pullSubmittedAt ?? undefined, + firstByteAt: row.firstByteAt ?? 
undefined, + }; + } +} From 229bc7a75266801619a308ea44d779cff4f9a52d Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 8 May 2026 16:29:06 +0530 Subject: [PATCH 21/44] chore: cleanup --- apps/backend/src/dataSource/dataSource.service.ts | 6 ------ 1 file changed, 6 deletions(-) diff --git a/apps/backend/src/dataSource/dataSource.service.ts b/apps/backend/src/dataSource/dataSource.service.ts index 33918a4e..f7dc5184 100644 --- a/apps/backend/src/dataSource/dataSource.service.ts +++ b/apps/backend/src/dataSource/dataSource.service.ts @@ -20,11 +20,6 @@ export interface DeterministicBytesOptions { size?: number; } -export interface DeterministicBytesResult { - bytes: Buffer; - derivedKey: Buffer; -} - const AES_KEY_LENGTH = 32; // AES-256 const AES_IV_LENGTH = 16; // AES-CTR IV const UINT64_BUFFER_LENGTH = 8; @@ -203,7 +198,6 @@ export class DataSourceService { } } - // Deterministic Random data generation /** * Generates a deterministic pseudo-random byte buffer from the provided seeds. 
* From a0e13b8c04cc9f4d0318c73ce56bd6244f60a8ef Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 8 May 2026 17:09:46 +0530 Subject: [PATCH 22/44] chore: cleanup --- .../src/database/entities/pull-piece.entity.ts | 3 --- .../migrations/1776300000000-CreatePullPieces.ts | 1 - .../src/pull-check/pull-check.service.spec.ts | 8 -------- apps/backend/src/pull-check/pull-check.service.ts | 1 - apps/backend/src/pull-check/pull-check.types.ts | 1 - .../src/pull-check/pull-piece.controller.ts | 1 - .../src/pull-check/pull-piece.repository.ts | 14 -------------- 7 files changed, 29 deletions(-) diff --git a/apps/backend/src/database/entities/pull-piece.entity.ts b/apps/backend/src/database/entities/pull-piece.entity.ts index 738c3360..cb204443 100644 --- a/apps/backend/src/database/entities/pull-piece.entity.ts +++ b/apps/backend/src/database/entities/pull-piece.entity.ts @@ -25,9 +25,6 @@ export class PullPiece { @Column({ name: "expires_at", type: "timestamptz" }) expiresAt!: Date; - @Column({ name: "cleaned_up", type: "boolean", default: false }) - cleanedUp!: boolean; - @Column({ name: "pull_submitted_at", type: "timestamptz", nullable: true }) pullSubmittedAt: Date | null; diff --git a/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts b/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts index dd2fc8ed..ccded36d 100644 --- a/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts +++ b/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts @@ -11,7 +11,6 @@ export class CreatePullPieces1776300000000 implements MigrationInterface { key TEXT NOT NULL, size INT NOT NULL, expires_at TIMESTAMPTZ NOT NULL, - cleaned_up BOOLEAN NOT NULL DEFAULT FALSE, pull_submitted_at TIMESTAMPTZ, first_byte_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index 8e504413..3c0f8c5b 
100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -47,16 +47,12 @@ describe("PullCheckService", () => { let service: PullCheckService; let walletSdkServiceMock: { getProviderInfo: ReturnType; getSynapseClient: ReturnType }; let dataSourceServiceMock: { - generateRandomDataset: ReturnType; - cleanupRandomDataset: ReturnType; - generateBytes: ReturnType; generateBytesStream: ReturnType; }; let registryMock: { register: ReturnType; resolveAny: ReturnType; resolveActive: ReturnType; - markCleanedUp: ReturnType; markPullSubmitted: ReturnType; markFirstByte: ReturnType; forget: ReturnType; @@ -78,16 +74,12 @@ describe("PullCheckService", () => { getSynapseClient: vi.fn().mockReturnValue({}), }; dataSourceServiceMock = { - generateRandomDataset: vi.fn(), - cleanupRandomDataset: vi.fn(), - generateBytes: vi.fn().mockReturnValue(Buffer.alloc(10)), generateBytesStream: vi.fn().mockReturnValue(Readable.from([Buffer.alloc(10)])), }; registryMock = { register: vi.fn().mockResolvedValue(undefined), resolveAny: vi.fn().mockResolvedValue(null), resolveActive: vi.fn().mockResolvedValue(null), - markCleanedUp: vi.fn().mockResolvedValue(undefined), markPullSubmitted: vi.fn().mockResolvedValue(undefined), markFirstByte: vi.fn().mockResolvedValue(undefined), forget: vi.fn().mockResolvedValue(undefined), diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index cc537485..3bb896df 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -240,7 +240,6 @@ export class PullCheckService { key, size: targetSize, expiresAt, - cleanedUp: false, }; await this.pullPieceRepository.register(registration); diff --git a/apps/backend/src/pull-check/pull-check.types.ts b/apps/backend/src/pull-check/pull-check.types.ts index 2a8a27e9..236ec5e7 100644 --- 
a/apps/backend/src/pull-check/pull-check.types.ts +++ b/apps/backend/src/pull-check/pull-check.types.ts @@ -8,7 +8,6 @@ export type PullPieceRegistration = { key: string; size: number; expiresAt: Date; - cleanedUp: boolean; pullSubmittedAt?: Date; firstByteAt?: Date; }; diff --git a/apps/backend/src/pull-check/pull-piece.controller.ts b/apps/backend/src/pull-check/pull-piece.controller.ts index 456eae2a..90bd9adc 100644 --- a/apps/backend/src/pull-check/pull-piece.controller.ts +++ b/apps/backend/src/pull-check/pull-piece.controller.ts @@ -40,7 +40,6 @@ export class PieceSourceController { event: "pull_check_piece_gone", message: "Pull piece source no longer active", pieceCid, - cleanedUp: known.cleanedUp, expiresAt: known.expiresAt.toISOString(), }); res.status(410).send("Pull piece source has expired or been cleaned up"); diff --git a/apps/backend/src/pull-check/pull-piece.repository.ts b/apps/backend/src/pull-check/pull-piece.repository.ts index bc028af5..902a5256 100644 --- a/apps/backend/src/pull-check/pull-piece.repository.ts +++ b/apps/backend/src/pull-check/pull-piece.repository.ts @@ -27,7 +27,6 @@ export class PullPieceRepository { key: registration.key, size: registration.size, expiresAt: registration.expiresAt, - cleanedUp: false, pullSubmittedAt: null, firstByteAt: null, }, @@ -49,7 +48,6 @@ export class PullPieceRepository { async resolveActive(pieceCid: string, now: Date = new Date()): Promise { const row = await this.repo.findOneBy({ pieceCid }); if (!row) return null; - if (row.cleanedUp) return null; if (row.expiresAt.getTime() <= now.getTime()) return null; return this.toRegistration(row); } @@ -63,17 +61,6 @@ export class PullPieceRepository { return row ? 
this.toRegistration(row) : null; } - async markCleanedUp(pieceCid: string): Promise { - const result = await this.repo.update({ pieceCid, cleanedUp: false }, { cleanedUp: true }); - if (result.affected && result.affected > 0) { - this.logger.debug({ - event: "hosted_piece_cleaned_up", - message: "Marked hosted piece source as cleaned up", - pieceCid, - }); - } - } - /** * Record the wall-clock time at which the `pullPieces` request was sent to * the SP. Idempotent: only the first call wins so that retried checks against @@ -114,7 +101,6 @@ export class PullPieceRepository { key: row.key, size: row.size, expiresAt: row.expiresAt, - cleanedUp: row.cleanedUp, pullSubmittedAt: row.pullSubmittedAt ?? undefined, firstByteAt: row.firstByteAt ?? undefined, }; From 4f6d2d036b6df2f6bc2ba248b4ceef006ee9da03 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 8 May 2026 17:10:34 +0530 Subject: [PATCH 23/44] docs: update to latest --- docs/checks/events-and-metrics.md | 4 ++-- docs/checks/pull-check.md | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index def047b5..4b67f922 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -85,7 +85,7 @@ sequenceDiagram |------|------------|:------:|-----------------| | `pullRequestSubmitted` | Dealbot calls `pullPieces` against the SP for the registered piece CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullRequestAcknowledged` | SP returns from `pullPieces` (success or non-terminal-failure). | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `hostedPieceFirstByteRead` | SP reads the first byte of `/api/piece/{pieceCid}` from dealbot. Recorded once per registration. 
| Yes | [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) | +| `hostedPieceFirstByteRead` | SP reads the first byte of `/api/piece/{pieceCid}` from dealbot. Recorded once per registration. | Yes | [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | | `pullTerminalStatusReported` | SP reports a terminal pull status (`complete`, `failed`, ...) via `waitForPullStatus`. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullCheckIntegrityChecked` | Direct `/piece/{pieceCid}` fetch from the SP returns bytes whose recomputed pieceCid matches the expected CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | @@ -123,7 +123,7 @@ sequenceDiagram | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `pullCheckRequestLatencyMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`pullRequestAcknowledged`](#pullRequestAcknowledged) | Time from `pullPieces` submission to SP request acknowledgement. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullCheckCompletionLatencyMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`pullTerminalStatusReported`](#pullTerminalStatusReported) | Time from `pullPieces` submission to terminal SP pull status. Observed once on success and once on failure. 
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckFirstByteMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`hostedPieceFirstByteRead`](#hostedPieceFirstByteRead) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) | +| `pullCheckFirstByteMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`hostedPieceFirstByteRead`](#hostedPieceFirstByteRead) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | | `pullCheckThroughputBps` | Pull | n/a | n/a | `(pieceSizeBytes / pullCheckCompletionLatencyMs) * 1000`. Upper-bound on actual transfer rate because `pullCheckCompletionLatencyMs` includes SP-side scheduling and dealbot's polling cadence. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index c3278f57..c582f94c 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -33,7 +33,7 @@ The dealbot scheduler triggers pull check jobs at a configurable rate (`PULL_CHE ```mermaid flowchart TD - Generate["Generate random piece + register hosted source
at /api/piece/{pieceCid}"] + Generate["Compute PieceCID + register hosted source in Postgres
at /api/piece/{pieceCid}"] Generate --> Submit["Submit pullPieces request to SP"] Submit --> Poll["Poll SP via waitForPullStatus
until terminal pull status"] Poll -->|complete| Validate["Direct /piece/{pieceCid} fetch from SP
+ recompute pieceCid"] @@ -41,20 +41,24 @@ flowchart TD Validate -->|matches| Success["Mark pull check successful"] Validate -->|mismatch or fetch error| Fail Success --> Cleanup - Fail --> Cleanup["Forget hosted piece + delete local artifact"] + Fail --> Cleanup["Forget hosted piece registration"] ``` ### 1. Prepare the hosted piece -Dealbot generates a random binary file, computes its piece CID, and registers it in an in-memory `HostedPieceRegistry`. The registration carries a TTL controlled by `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` so the source remains available for the entire pull window. +Dealbot computes a deterministic PieceCID for a synthetic test piece and registers it in the Postgres `pull_pieces` table. The registration carries a TTL controlled by `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` so the source remains available for the entire pull window. + +By persisting registrations to Postgres instead of in-memory, the hosted source can be resolved by any API pod in a multi-pod deployment, even if the pull check was initiated by a different worker pod. + +The synthetic data is **not** stored on disk. Instead, dealbot uses a deterministic pseudo-random generator (AES-256-CTR) to stream the same bytes whenever the SP fetches the piece or dealbot needs to re-compute the CID for validation. The source URL handed to the SP is built from the dealbot `app.apiPublicUrl` config (set via `DEALBOT_API_PUBLIC_URL`). When `DEALBOT_API_PUBLIC_URL` is unset, dealbot falls back to `http://{DEALBOT_HOST}:{DEALBOT_PORT}`, which is only reachable in single-host or `localhost` setups. 
-- **File format:** `random-{timestamp}-{uniqueId}.bin` +- **Data format:** Deterministic pseudo-random bytes - **Default size:** `PULL_CHECK_PIECE_SIZE_BYTES` (default 10 MiB) - **Source URL:** `{apiPublicUrl}/api/piece/{pieceCid}` -Source: [`pull-check.service.ts` (`prepareHostedPiece`)](../../apps/backend/src/pull-check/pull-check.service.ts), [`hosted-piece.registry.ts`](../../apps/backend/src/pull-check/hosted-piece.registry.ts) +Source: [`pull-check.service.ts` (`preparePullPiece`)](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.repository.ts`](../../apps/backend/src/pull-check/pull-piece.repository.ts) ### 2. Submit the pull request @@ -68,7 +72,7 @@ Source: [`pull-check.service.ts` (`runPullCheck`)](../../apps/backend/src/pull-c When the SP fetches `/api/piece/{pieceCid}` for the first time, the controller stamps a first-byte timestamp on the registration. This is the basis for [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs). -Source: [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) +Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) ### 4. Direct piece-fetch validation @@ -83,10 +87,9 @@ Source: [`pull-check.service.ts` (`validateByDirectPieceFetch`)](../../apps/back Whether the pull check succeeds or fails, the `finally` block: 1. Marks the registration as cleaned up (so subsequent `/api/piece/{pieceCid}` requests return HTTP 410 Gone instead of 200). -2. Removes the on-disk dataset artifact via `DataSourceService.cleanupRandomDataset`. -3. Forgets the registration entry so the controller returns HTTP 404 Not Found for any later requests. +2. Forgets the registration entry so the controller returns HTTP 404 Not Found for any later requests. -Cleanup errors are logged at WARN level but do not propagate, so a transient cleanup failure cannot mask a successful pull check. 
+Source: [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) ## Pull Check Status Progression @@ -112,7 +115,7 @@ The dealbot API exposes one endpoint dedicated to pull checks: The endpoint is registered on the same `/api` prefix as the other dealbot HTTP endpoints. It is intentionally unauthenticated because SPs must be able to pull from it during a check; access is bounded by the per-piece TTL. -Source: [`piece-source.controller.ts`](../../apps/backend/src/pull-check/piece-source.controller.ts) +Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) ## Metrics Recorded From c3d87a6394d23b45e29711688297c799a505fc3d Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 8 May 2026 17:33:03 +0530 Subject: [PATCH 24/44] chore: remove MAX_BYTES limit --- .../backend/src/dataSource/dataSource.service.spec.ts | 2 +- apps/backend/src/dataSource/dataSource.service.ts | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/apps/backend/src/dataSource/dataSource.service.spec.ts b/apps/backend/src/dataSource/dataSource.service.spec.ts index c4cf0f46..0c850509 100644 --- a/apps/backend/src/dataSource/dataSource.service.spec.ts +++ b/apps/backend/src/dataSource/dataSource.service.spec.ts @@ -1,8 +1,8 @@ import { ConfigService } from "@nestjs/config"; import * as fs from "fs"; import * as path from "path"; -import { IConfig } from "src/config/app.config.js"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { IConfig } from "../config/app.config.js"; import { DataSourceService } from "./dataSource.service.js"; describe("DataSourceService", () => { diff --git a/apps/backend/src/dataSource/dataSource.service.ts b/apps/backend/src/dataSource/dataSource.service.ts index f7dc5184..e1845e03 100644 --- a/apps/backend/src/dataSource/dataSource.service.ts +++ b/apps/backend/src/dataSource/dataSource.service.ts @@ -23,7 +23,6 @@ export interface 
DeterministicBytesOptions { const AES_KEY_LENGTH = 32; // AES-256 const AES_IV_LENGTH = 16; // AES-CTR IV const UINT64_BUFFER_LENGTH = 8; -const MAX_BYTES = 10 * 1024 * 1024; // 10 MiB — default pull-check piece size @Injectable() export class DataSourceService { @@ -265,7 +264,7 @@ export class DataSourceService { } private validateOptions(options: DeterministicBytesOptions): void { - const { key, bytesNeeded } = options; + const { key, bytesNeeded, size = bytesNeeded } = options; if (!key || typeof key !== "string" || key.trim().length === 0) { throw new Error("DeterministicRandom: `key` must be a non-empty string."); @@ -275,14 +274,6 @@ export class DataSourceService { throw new Error("DeterministicRandom: `bytesNeeded` must be a positive integer."); } - if (bytesNeeded > MAX_BYTES) { - throw new Error( - `DeterministicRandom: \`bytesNeeded\` exceeds maximum allowed size of ${MAX_BYTES} bytes. ` + - `Split large requests into chunks.`, - ); - } - - const { size = 0 } = options; if (!Number.isInteger(size) || size < 0) { throw new Error("DeterministicRandom: `size` must be a non-negative integer."); } From 23d5ee7c4c08da798ec09df19f8277bbf5dd467b Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 8 May 2026 19:26:45 +0530 Subject: [PATCH 25/44] chore: address pr comments --- apps/backend/.env.example | 1 - apps/backend/src/config/app.config.ts | 9 +------- .../src/dataSource/dataSource.service.ts | 2 +- .../database/entities/pull-piece.entity.ts | 6 +---- .../1776300000000-CreatePullPieces.ts | 4 ---- .../src/pull-check/pull-check.service.spec.ts | 17 ++++++-------- .../src/pull-check/pull-check.service.ts | 7 ++---- .../src/pull-check/pull-check.types.ts | 1 - .../src/pull-check/pull-piece.controller.ts | 23 +++++-------------- .../src/pull-check/pull-piece.repository.ts | 21 +++-------------- docs/checks/pull-check.md | 4 ++-- docs/environment-variables.md | 18 +-------------- 12 files changed, 24 insertions(+), 89 deletions(-) diff --git 
a/apps/backend/.env.example b/apps/backend/.env.example index 9b3c753b..97e1baed 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -68,7 +68,6 @@ IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IP # Pull Check Configuration PULL_CHECKS_PER_SP_PER_HOUR=1 # SP pull-pathway checks scheduled per provider per hour PULL_CHECK_JOB_TIMEOUT_SECONDS=300 # 5m: Max runtime for pull-check jobs -PULL_CHECK_HOSTED_PIECE_TTL_SECONDS=900 # 15m: Hosted piece source TTL exposed at /api/piece/:pieceCid PULL_CHECK_POLL_INTERVAL_SECONDS=10 # SP pull status polling interval PULL_CHECK_PIECE_SIZE_BYTES=10485760 # 10 MiB synthetic test piece size per pull check DEALBOT_PGBOSS_POOL_MAX=1 diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index e2aee2c0..1c93d218 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -98,7 +98,6 @@ export const configValidationSchema = Joi.object({ // Pull Check PULL_CHECKS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), PULL_CHECK_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5m max runtime for pull check jobs - PULL_CHECK_HOSTED_PIECE_TTL_SECONDS: Joi.number().min(60).default(900), // 15m hosted piece TTL PULL_CHECK_POLL_INTERVAL_SECONDS: Joi.number().min(1).default(10), PULL_CHECK_PIECE_SIZE_BYTES: Joi.number() .integer() @@ -308,11 +307,6 @@ export interface IJobsConfig { * Bounds the polling window for terminal SP pull status. */ pullCheckJobTimeoutSeconds: number; - /** - * Time-to-live (seconds) for the temporary hosted piece source served at - * `/api/piece/:pieceCid` while a pull check is in flight. - */ - pullCheckHostedPieceTtlSeconds: number; /** * Polling interval (seconds) used while waiting for a terminal SP pull status. 
*/ @@ -455,8 +449,7 @@ export function loadConfig(): IConfig { pieceCleanupPerSpPerHour: Number.parseFloat(process.env.JOB_PIECE_CLEANUP_PER_SP_PER_HOUR || String(1 / 24)), maxPieceCleanupRuntimeSeconds: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), pullChecksPerSpPerHour: Number.parseFloat(process.env.PULL_CHECKS_PER_SP_PER_HOUR || "1"), - pullCheckJobTimeoutSeconds: Number.parseInt(process.env.PULL_CHECK_JOB_TIMEOUT_SECONDS || "360", 10), - pullCheckHostedPieceTtlSeconds: Number.parseInt(process.env.PULL_CHECK_HOSTED_PIECE_TTL_SECONDS || "900", 10), + pullCheckJobTimeoutSeconds: Number.parseInt(process.env.PULL_CHECK_JOB_TIMEOUT_SECONDS || "300", 10), pullCheckPollIntervalSeconds: Number.parseInt(process.env.PULL_CHECK_POLL_INTERVAL_SECONDS || "10", 10), pullCheckPieceSizeBytes: Number.parseInt(process.env.PULL_CHECK_PIECE_SIZE_BYTES || String(10 * 1024 * 1024), 10), }, diff --git a/apps/backend/src/dataSource/dataSource.service.ts b/apps/backend/src/dataSource/dataSource.service.ts index e1845e03..87fb3964 100644 --- a/apps/backend/src/dataSource/dataSource.service.ts +++ b/apps/backend/src/dataSource/dataSource.service.ts @@ -229,7 +229,7 @@ export class DataSourceService { generateBytesStream(options: DeterministicBytesOptions): Readable { const { key, bytesNeeded, providerAddress = "", size = bytesNeeded } = options; - this.validateOptions({ ...options, bytesNeeded: 1 }); // Just validate basic options + this.validateOptions(options); const derivedKey = this.deriveKey(providerAddress, size, key); const staticIV = Buffer.alloc(AES_IV_LENGTH, 0); diff --git a/apps/backend/src/database/entities/pull-piece.entity.ts b/apps/backend/src/database/entities/pull-piece.entity.ts index cb204443..a6122899 100644 --- a/apps/backend/src/database/entities/pull-piece.entity.ts +++ b/apps/backend/src/database/entities/pull-piece.entity.ts @@ -1,4 +1,4 @@ -import { Column, CreateDateColumn, Entity, Index, PrimaryColumn } from "typeorm"; +import { 
Column, CreateDateColumn, Entity, PrimaryColumn } from "typeorm"; /** * Persisted registration of a temporary pull piece served at @@ -8,7 +8,6 @@ import { Column, CreateDateColumn, Entity, Index, PrimaryColumn } from "typeorm" * resolve registrations created by a separate worker pod. */ @Entity("pull_pieces") -@Index("idx_pull_pieces_expires_at", ["expiresAt"]) export class PullPiece { @PrimaryColumn({ name: "piece_cid", type: "text" }) pieceCid!: string; @@ -22,9 +21,6 @@ export class PullPiece { @Column({ name: "size", type: "int" }) size!: number; - @Column({ name: "expires_at", type: "timestamptz" }) - expiresAt!: Date; - @Column({ name: "pull_submitted_at", type: "timestamptz", nullable: true }) pullSubmittedAt: Date | null; diff --git a/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts b/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts index ccded36d..24b64c0e 100644 --- a/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts +++ b/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts @@ -10,15 +10,11 @@ export class CreatePullPieces1776300000000 implements MigrationInterface { provider_address TEXT NOT NULL, key TEXT NOT NULL, size INT NOT NULL, - expires_at TIMESTAMPTZ NOT NULL, pull_submitted_at TIMESTAMPTZ, first_byte_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ) `); - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS idx_pull_pieces_expires_at ON pull_pieces (expires_at) - `); } public async down(queryRunner: QueryRunner): Promise { diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index 3c0f8c5b..c24de2d9 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -51,8 +51,7 @@ describe("PullCheckService", () => { }; let registryMock: { register: ReturnType; - resolveAny: ReturnType; - resolveActive: 
ReturnType; + resolve: ReturnType; markPullSubmitted: ReturnType; markFirstByte: ReturnType; forget: ReturnType; @@ -78,8 +77,7 @@ describe("PullCheckService", () => { }; registryMock = { register: vi.fn().mockResolvedValue(undefined), - resolveAny: vi.fn().mockResolvedValue(null), - resolveActive: vi.fn().mockResolvedValue(null), + resolve: vi.fn().mockResolvedValue(null), markPullSubmitted: vi.fn().mockResolvedValue(undefined), markFirstByte: vi.fn().mockResolvedValue(undefined), forget: vi.fn().mockResolvedValue(undefined), @@ -103,7 +101,6 @@ describe("PullCheckService", () => { pullCheckJobTimeoutSeconds: 300, pullCheckPollIntervalSeconds: 5, pullCheckPieceSizeBytes: 1024, - pullCheckHostedPieceTtlSeconds: 600, } as IConfig["jobs"], dataset: { localDatasetsPath: "/tmp/datasets" } as IConfig["dataset"], }; @@ -258,7 +255,7 @@ describe("PullCheckService", () => { // After cleanup the resolveAny call returns the entry; before that the // run reads it once to compute first-byte latency. Same shape suffices. - registryMock.resolveAny.mockResolvedValue(registration); + registryMock.resolve.mockResolvedValue(registration); vi.mocked(pullPieces).mockResolvedValue({ status: "pending" } as unknown as Awaited< ReturnType @@ -304,7 +301,7 @@ describe("PullCheckService", () => { it("does not observe firstByte when the SP never read from /api/piece (cached pull)", async () => { const { registration } = arrangeHappyPath(); // Simulate a cached pull: SP never fetched from us. 
- registryMock.resolveAny.mockResolvedValue({ ...registration, firstByteAt: undefined }); + registryMock.resolve.mockResolvedValue({ ...registration, firstByteAt: undefined }); await service.runPullCheck("0xsp", undefined, logContext); @@ -373,8 +370,8 @@ describe("PullCheckService", () => { }); describe("openPullPieceStream", () => { - it("returns null when no active registration exists", async () => { - registryMock.resolveActive.mockResolvedValue(null); + it("returns null when no registration exists", async () => { + registryMock.resolve.mockResolvedValue(null); expect(await service.openPullPieceStream("missing")).toBeNull(); }); @@ -387,7 +384,7 @@ describe("PullCheckService", () => { expiresAt: new Date(Date.now() + 60_000), cleanedUp: false, }; - registryMock.resolveActive.mockResolvedValue(registration); + registryMock.resolve.mockResolvedValue(registration); const result = await service.openPullPieceStream("bafk-test-piece"); expect(result).not.toBeNull(); diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 3bb896df..109f383e 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -138,7 +138,7 @@ export class PullCheckService { throw new Error("Pull-check piece validation failed: SP did not serve the expected bytes"); } - const firstByteEntry = await this.pullPieceRepository.resolveAny(pieceCidStr); + const firstByteEntry = await this.pullPieceRepository.resolve(pieceCidStr); const firstByteMs = firstByteEntry?.firstByteAt && firstByteEntry?.pullSubmittedAt ? 
firstByteEntry.firstByteAt.getTime() - firstByteEntry.pullSubmittedAt.getTime() @@ -232,14 +232,12 @@ export class PullCheckService { const pieceCidStr = pieceCid.toString(); const baseUrl = this.resolvePublicBaseUrl(); const sourceUrl = `${baseUrl}/api/piece/${pieceCidStr}`; - const expiresAt = new Date(Date.now() + jobsConfig.pullCheckHostedPieceTtlSeconds * 1000); const registration = { pieceCid: pieceCidStr, providerAddress, key, size: targetSize, - expiresAt, }; await this.pullPieceRepository.register(registration); @@ -271,9 +269,8 @@ export class PullCheckService { */ async openPullPieceStream( pieceCid: string, - now: Date = new Date(), ): Promise<{ registration: PullPieceRegistration; stream: Readable } | null> { - const registration = await this.pullPieceRepository.resolveActive(pieceCid, now); + const registration = await this.pullPieceRepository.resolve(pieceCid); if (!registration) return null; const stream = this.dataSourceService.generateBytesStream({ diff --git a/apps/backend/src/pull-check/pull-check.types.ts b/apps/backend/src/pull-check/pull-check.types.ts index 236ec5e7..93617a30 100644 --- a/apps/backend/src/pull-check/pull-check.types.ts +++ b/apps/backend/src/pull-check/pull-check.types.ts @@ -7,7 +7,6 @@ export type PullPieceRegistration = { providerAddress: string; key: string; size: number; - expiresAt: Date; pullSubmittedAt?: Date; firstByteAt?: Date; }; diff --git a/apps/backend/src/pull-check/pull-piece.controller.ts b/apps/backend/src/pull-check/pull-piece.controller.ts index 90bd9adc..2540dfa4 100644 --- a/apps/backend/src/pull-check/pull-piece.controller.ts +++ b/apps/backend/src/pull-check/pull-piece.controller.ts @@ -1,6 +1,7 @@ import { Controller, Get, Logger, NotFoundException, Param, Res } from "@nestjs/common"; import { ApiOperation, ApiResponse, ApiTags } from "@nestjs/swagger"; import type { Response } from "express"; +import { PassThrough } from "node:stream"; import { PullCheckService } from "./pull-check.service.js"; 
import { PullPieceRepository } from "./pull-piece.repository.js"; @@ -21,12 +22,8 @@ export class PieceSourceController { ) {} @Get("piece/:pieceCid") - @ApiOperation({ - summary: "Stream a temporary pull piece for an in-flight SP pull check", - }) @ApiResponse({ status: 200, description: "Raw piece bytes streamed to the caller" }) @ApiResponse({ status: 404, description: "No active pull piece exists for this pieceCid" }) - @ApiResponse({ status: 410, description: "Pull piece existed but has expired or been cleaned up" }) async servePiece(@Param("pieceCid") pieceCid: string, @Res() res: Response): Promise { if (!pieceCid || pieceCid.trim().length === 0) { throw new NotFoundException("pieceCid is required"); @@ -34,17 +31,6 @@ export class PieceSourceController { const opened = await this.pullCheckService.openPullPieceStream(pieceCid); if (!opened) { - const known = await this.pullPieceRepository.resolveAny(pieceCid); - if (known) { - this.logger.warn({ - event: "pull_check_piece_gone", - message: "Pull piece source no longer active", - pieceCid, - expiresAt: known.expiresAt.toISOString(), - }); - res.status(410).send("Pull piece source has expired or been cleaned up"); - return; - } this.logger.warn({ event: "pull_check_piece_unknown", message: "Pull piece source not found", @@ -73,10 +59,13 @@ export class PieceSourceController { } res.destroy(error); }); + + const pt = new PassThrough(); // Capture the first-byte timestamp before piping (fire-and-forget DB write) - stream.once("data", () => { + pt.once("data", () => { void this.pullPieceRepository.markFirstByte(pieceCid, new Date()); }); - stream.pipe(res); + + stream.pipe(pt).pipe(res); } } diff --git a/apps/backend/src/pull-check/pull-piece.repository.ts b/apps/backend/src/pull-check/pull-piece.repository.ts index 902a5256..1af385be 100644 --- a/apps/backend/src/pull-check/pull-piece.repository.ts +++ b/apps/backend/src/pull-check/pull-piece.repository.ts @@ -7,7 +7,7 @@ import type { PullPieceRegistration } 
from "./pull-check.types.js"; /** * Postgres-backed registry of hosted piece sources backing pull-check requests. * - * Persisting to the `hosted_pieces` table allows the API pod(s) to resolve + * Persisting to the `pull_pieces` table allows the API pod(s) to resolve * registrations created by a separate worker pod in split-process deployments. */ @Injectable() @@ -26,7 +26,6 @@ export class PullPieceRepository { providerAddress: registration.providerAddress, key: registration.key, size: registration.size, - expiresAt: registration.expiresAt, pullSubmittedAt: null, firstByteAt: null, }, @@ -36,27 +35,14 @@ export class PullPieceRepository { event: "hosted_piece_registered", message: "Registered hosted piece source", pieceCid: registration.pieceCid, - expiresAt: registration.expiresAt.toISOString(), size: `${registration.size} bytes`, }); } /** - * Resolve a hosted piece by CID. Returns null when the entry is missing, - * already cleaned up, or has expired. + * Resolve a hosted piece by CID. */ - async resolveActive(pieceCid: string, now: Date = new Date()): Promise { - const row = await this.repo.findOneBy({ pieceCid }); - if (!row) return null; - if (row.expiresAt.getTime() <= now.getTime()) return null; - return this.toRegistration(row); - } - - /** - * Resolve a hosted piece by CID even when expired/cleaned-up. Used by the - * controller to differentiate a 410 Gone from a 404 Not Found. - */ - async resolveAny(pieceCid: string): Promise { + async resolve(pieceCid: string): Promise { const row = await this.repo.findOneBy({ pieceCid }); return row ? this.toRegistration(row) : null; } @@ -100,7 +86,6 @@ export class PullPieceRepository { providerAddress: row.providerAddress, key: row.key, size: row.size, - expiresAt: row.expiresAt, pullSubmittedAt: row.pullSubmittedAt ?? undefined, firstByteAt: row.firstByteAt ?? 
undefined, }; diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index c582f94c..e60e37ec 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -24,8 +24,8 @@ Each pull check asserts the following for every SP: |---|-----------|------------------|:---:|--------------------------------------------|:---:| | 1 | SP accepts the pull request | `pullPieces` returns without error and reports a non-terminal-failure status | 0 | [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) | Yes | | 2 | SP reaches a terminal `complete` pull status | `waitForPullStatus` polls the SP until a terminal status is reported | Polling with delay until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | -| 4 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | -| 5 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | +| 3 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | +| 4 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | ## Pull Check Lifecycle diff --git a/docs/environment-variables.md b/docs/environment-variables.md index a53411a2..e57a73b2 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -16,7 +16,7 @@ This document provides a comprehensive guide to all environment variables used b | 
[ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | | [Piece Cleanup](#piece-cleanup) | `MAX_DATASET_STORAGE_SIZE_BYTES`, `TARGET_DATASET_STORAGE_SIZE_BYTES`, `JOB_PIECE_CLEANUP_PER_SP_PER_HOUR`, `MAX_PIECE_CLEANUP_RUNTIME_SECONDS` | -| [Pull Check](#pull-check) | `PULL_CHECKS_PER_SP_PER_HOUR`, `PULL_CHECK_JOB_TIMEOUT_SECONDS`, `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS`, `PULL_CHECK_POLL_INTERVAL_SECONDS`, `PULL_CHECK_PIECE_SIZE_BYTES` | +| [Pull Check](#pull-check) | `PULL_CHECKS_PER_SP_PER_HOUR`, `PULL_CHECK_JOB_TIMEOUT_SECONDS`, `PULL_CHECK_POLL_INTERVAL_SECONDS`, `PULL_CHECK_PIECE_SIZE_BYTES` | | [SP Blocklist](#sp-blocklist-configuration) | `BLOCKED_SP_IDS`, `BLOCKED_SP_ADDRESSES` | | [Prometheus Metrics](#prometheus-metrics-configuration) | `PROMETHEUS_WALLET_BALANCE_TTL_SECONDS`, `PROMETHEUS_WALLET_BALANCE_ERROR_COOLDOWN_SECONDS` | | [Web Frontend](#web-frontend) | `VITE_API_BASE_URL`, `VITE_PLAUSIBLE_DATA_DOMAIN`, `DEALBOT_API_BASE_URL` | @@ -973,22 +973,6 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 --- -### `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` - -- **Type**: `number` (seconds) -- **Required**: No -- **Default**: `900` (15 minutes) -- **Minimum**: `60` - -**Role**: Time-to-live for the temporary hosted piece source served at `/api/piece/{pieceCid}` during an in-flight pull check. After the TTL elapses or the job calls cleanup, the controller responds with HTTP `410 Gone` for that pieceCid. 
- -**When to update**: - -- Should be at least `PULL_CHECK_JOB_TIMEOUT_SECONDS` plus generous margin for the SP to make its first read; the default 15 minutes provides ~9 minutes of headroom over the 6-minute job timeout default -- Increase only when intentionally allowing SPs to retry pulls long after the dealbot job has aborted - ---- - ### `PULL_CHECK_POLL_INTERVAL_SECONDS` - **Type**: `number` (seconds) From 30f2379123114c3428a124d8a572d747580cd27c Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Fri, 8 May 2026 19:32:18 +0530 Subject: [PATCH 26/44] chore: fix lint plus docs --- apps/backend/src/pull-check/pull-piece.controller.ts | 4 ++-- docs/checks/pull-check.md | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/apps/backend/src/pull-check/pull-piece.controller.ts b/apps/backend/src/pull-check/pull-piece.controller.ts index 2540dfa4..021e6e3d 100644 --- a/apps/backend/src/pull-check/pull-piece.controller.ts +++ b/apps/backend/src/pull-check/pull-piece.controller.ts @@ -1,7 +1,7 @@ +import { PassThrough } from "node:stream"; import { Controller, Get, Logger, NotFoundException, Param, Res } from "@nestjs/common"; -import { ApiOperation, ApiResponse, ApiTags } from "@nestjs/swagger"; +import { ApiResponse, ApiTags } from "@nestjs/swagger"; import type { Response } from "express"; -import { PassThrough } from "node:stream"; import { PullCheckService } from "./pull-check.service.js"; import { PullPieceRepository } from "./pull-piece.repository.js"; diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index e60e37ec..a806e516 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -84,10 +84,8 @@ Source: [`pull-check.service.ts` (`validateByDirectPieceFetch`)](../../apps/back ### 5. Cleanup -Whether the pull check succeeds or fails, the `finally` block: - -1. Marks the registration as cleaned up (so subsequent `/api/piece/{pieceCid}` requests return HTTP 410 Gone instead of 200). -2. 
Forgets the registration entry so the controller returns HTTP 404 Not Found for any later requests. +Whether the pull check succeeds or fails, the `finally` block removes the registration entry. +After cleanup, subsequent `/api/piece/{pieceCid}` requests return HTTP 404 Not Found. Source: [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) From b395c98e8ec80b63fe7a4976e6933f6d6e0c7b1a Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Sat, 9 May 2026 09:51:25 +0530 Subject: [PATCH 27/44] doc: remove ttl leftovers --- docs/checks/pull-check.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index a806e516..b63132d4 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -46,7 +46,7 @@ flowchart TD ### 1. Prepare the hosted piece -Dealbot computes a deterministic PieceCID for a synthetic test piece and registers it in the Postgres `pull_pieces` table. The registration carries a TTL controlled by `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` so the source remains available for the entire pull window. +Dealbot computes a deterministic PieceCID for a synthetic test piece and registers it in the Postgres `pull_pieces` table. By persisting registrations to Postgres instead of in-memory, the hosted source can be resolved by any API pod in a multi-pod deployment, even if the pull check was initiated by a different worker pod. @@ -109,9 +109,9 @@ The dealbot API exposes one endpoint dedicated to pull checks: | Method | Path | Description | |--------|------|-------------| -| `GET` | `/api/piece/{pieceCid}` | Streams the temporary hosted piece bytes for an in-flight pull check. Returns `200` with the bytes when an active registration exists, `410 Gone` when the registration has been cleaned up or expired, and `404 Not Found` when no registration exists. 
| +| `GET` | `/api/piece/{pieceCid}` | Streams the temporary hosted piece bytes for an in-flight pull check. Returns `200` with the bytes when an active registration exists, and `404 Not Found` when no registration exists. | -The endpoint is registered on the same `/api` prefix as the other dealbot HTTP endpoints. It is intentionally unauthenticated because SPs must be able to pull from it during a check; access is bounded by the per-piece TTL. +The endpoint is registered on the same `/api` prefix as the other dealbot HTTP endpoints. It is intentionally unauthenticated because SPs must be able to pull from it during a check; access is bounded by the job lifecycle. Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) @@ -135,7 +135,6 @@ Key environment variables that control pull check behavior: | `DEALBOT_API_PUBLIC_URL` | Public base URL used to construct the hosted-piece source URL handed to SPs. Required for any deployment where SPs cannot reach `DEALBOT_HOST:DEALBOT_PORT` directly. | | `PULL_CHECKS_PER_SP_PER_HOUR` | Per-SP pull check rate. | | `PULL_CHECK_JOB_TIMEOUT_SECONDS` | Max end-to-end pull check job runtime before forced abort. | -| `PULL_CHECK_HOSTED_PIECE_TTL_SECONDS` | TTL of the temporary hosted piece source served at `/api/piece/{pieceCid}`. | | `PULL_CHECK_POLL_INTERVAL_SECONDS` | Polling interval used while waiting for a terminal SP pull status. | | `PULL_CHECK_PIECE_SIZE_BYTES` | Size of the synthetic test piece dealbot generates per pull check. 
| From b9873cddffb23c8af5334f3b5164a427f585a681 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Sat, 9 May 2026 09:52:27 +0530 Subject: [PATCH 28/44] refactor: worker doesn't expose /api/piece/:pieceCid --- apps/backend/src/pull-check/pull-check.module.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/pull-check/pull-check.module.ts b/apps/backend/src/pull-check/pull-check.module.ts index 45574158..892e9d5e 100644 --- a/apps/backend/src/pull-check/pull-check.module.ts +++ b/apps/backend/src/pull-check/pull-check.module.ts @@ -9,9 +9,12 @@ import { PullCheckService } from "./pull-check.service.js"; import { PieceSourceController } from "./pull-piece.controller.js"; import { PullPieceRepository } from "./pull-piece.repository.js"; +const runMode = process.env.DEALBOT_RUN_MODE?.toLowerCase() || "both"; +const isWorkerOnly = runMode === "worker"; + @Module({ imports: [DatabaseModule, TypeOrmModule.forFeature([PullPiece]), WalletSdkModule, DataSourceModule, HttpClientModule], - controllers: [PieceSourceController], + controllers: isWorkerOnly ? 
[] : [PieceSourceController], providers: [PullCheckService, PullPieceRepository], exports: [PullCheckService, PullPieceRepository], }) From 1a420f6f90a7b1fe05cf656678aa1ca34ea9ea0a Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Sat, 9 May 2026 13:21:47 +0530 Subject: [PATCH 29/44] refactor: fire and forget pull piece deletion --- apps/backend/src/pull-check/pull-check.service.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 109f383e..8fe45c33 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -169,7 +169,16 @@ export class PullCheckService { throw error; } finally { if (prepared) { - await this.pullPieceRepository.forget(prepared.registration.pieceCid); + const pieceCid = prepared.registration.pieceCid; + this.pullPieceRepository.forget(pieceCid).catch((error) => { + this.logger.warn({ + ...logContext, + event: "pull_check_piece_forget_failed", + message: "Failed to delete pull piece after job completion", + pieceCid, + error: toStructuredError(error), + }); + }); } } } From da5e7b98efcb398ad90187df6868ed6caa0562d9 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 10:44:22 +0530 Subject: [PATCH 30/44] chore: address doc comments --- apps/backend/src/pull-check/pull-piece.controller.ts | 1 + docs/checks/pull-check.md | 2 +- docs/environment-variables.md | 2 -- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/backend/src/pull-check/pull-piece.controller.ts b/apps/backend/src/pull-check/pull-piece.controller.ts index 021e6e3d..fa7ecc2f 100644 --- a/apps/backend/src/pull-check/pull-piece.controller.ts +++ b/apps/backend/src/pull-check/pull-piece.controller.ts @@ -45,6 +45,7 @@ export class PieceSourceController { res.setHeader("Content-Length", registration.size.toString()); res.setHeader("Cache-Control", "no-store"); 
res.setHeader("X-Pull-Check-Piece-CID", registration.pieceCid); + res.setHeader("Accept-ranges", "none"); stream.on("error", (error) => { this.logger.error({ diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index b63132d4..bb9f78d1 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -95,7 +95,7 @@ A pull check has a single terminal status, recorded once per check via [`pullChe | Overall Status | Meaning | |--------|---------| -| `success` | All five [assertions](#what-gets-asserted) passed within the job timeout. | +| `success` | All [assertions](#what-gets-asserted) passed within the job timeout. | | `failure.timedout` | The job was aborted because it exceeded `PULL_CHECK_JOB_TIMEOUT_SECONDS`, or the underlying error message indicates a timeout. | | `failure.other` | Any other failure: SP rejected the pull request, SP reached a non-`complete` terminal status, or direct piece validation failed. | diff --git a/docs/environment-variables.md b/docs/environment-variables.md index e57a73b2..16cefcfe 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -1003,8 +1003,6 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 - Decrease for quicker, lower-bandwidth pull tests - Increase to stress-test the SP's outbound fetch throughput -**Note**: Pull-check pieces are committed on-chain but **not** tracked in the `deals` table, so they are not garbage-collected by [Piece Cleanup](#piece-cleanup). Larger pieces accrue on the SP unless removed manually. 
- --- ## Dataset Configuration From 2d462c0b0cddb0d882101b026a35db0eae8e6f73 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 11:22:27 +0530 Subject: [PATCH 31/44] refactor: move pull-piece config to dedicated section and add stream limits --- apps/backend/.env.example | 4 +- apps/backend/src/config/app.config.ts | 74 ++++++++++++------- apps/backend/src/jobs/jobs.service.spec.ts | 1 + apps/backend/src/jobs/jobs.service.ts | 5 +- .../src/pull-check/pull-check.service.spec.ts | 4 +- .../src/pull-check/pull-check.service.ts | 15 ++-- docs/environment-variables.md | 48 +++++++++++- 7 files changed, 110 insertions(+), 41 deletions(-) diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 97e1baed..e614e6f0 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -68,8 +68,10 @@ IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IP # Pull Check Configuration PULL_CHECKS_PER_SP_PER_HOUR=1 # SP pull-pathway checks scheduled per provider per hour PULL_CHECK_JOB_TIMEOUT_SECONDS=300 # 5m: Max runtime for pull-check jobs -PULL_CHECK_POLL_INTERVAL_SECONDS=10 # SP pull status polling interval +PULL_CHECK_POLL_INTERVAL_SECONDS=2 # SP pull status polling interval PULL_CHECK_PIECE_SIZE_BYTES=10485760 # 10 MiB synthetic test piece size per pull check +PULL_PIECE_MAX_CONCURRENT_STREAMS=50 # Max concurrent streams across all pieces (DoS protection) +PULL_PIECE_MAX_STREAMS_PER_CID=3 # Max concurrent streams per pieceCid (prevents spam of single piece) DEALBOT_PGBOSS_POOL_MAX=1 DEALBOT_PGBOSS_SCHEDULER_ENABLED=true diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index 1c93d218..9ddd99d6 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -98,11 +98,13 @@ export const configValidationSchema = Joi.object({ // Pull Check PULL_CHECKS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), PULL_CHECK_JOB_TIMEOUT_SECONDS: 
Joi.number().min(60).default(300), // 5m max runtime for pull check jobs - PULL_CHECK_POLL_INTERVAL_SECONDS: Joi.number().min(1).default(10), + PULL_CHECK_POLL_INTERVAL_SECONDS: Joi.number().min(1).default(2), PULL_CHECK_PIECE_SIZE_BYTES: Joi.number() .integer() .min(1024) .default(10 * 1024 * 1024), // 10 MiB + PULL_PIECE_MAX_CONCURRENT_STREAMS: Joi.number().integer().min(1).default(50), // Max concurrent streams across all pieces + PULL_PIECE_MAX_STREAMS_PER_CID: Joi.number().integer().min(1).default(3), // Max concurrent streams per pieceCid // Piece Cleanup MAX_DATASET_STORAGE_SIZE_BYTES: Joi.number() @@ -294,27 +296,6 @@ export interface IJobsConfig { * Only used when `DEALBOT_JOBS_MODE=pgboss`. */ maxPieceCleanupRuntimeSeconds: number; - /** - * Target number of pull checks per storage provider per hour. - * - * Pull checks validate the SP pull-to-park pathway by serving a temporary piece URL - * from DealBot and asking the SP to pull and park it. Independent of `deal` and `retrieval`. - */ - pullChecksPerSpPerHour: number; - /** - * Maximum runtime (seconds) for pull-check jobs before forced abort. - * - * Bounds the polling window for terminal SP pull status. - */ - pullCheckJobTimeoutSeconds: number; - /** - * Polling interval (seconds) used while waiting for a terminal SP pull status. - */ - pullCheckPollIntervalSeconds: number; - /** - * Size (bytes) of the synthetic test piece DealBot generates per pull check. - */ - pullCheckPieceSizeBytes: number; } export interface IDatasetConfig { @@ -358,6 +339,42 @@ export interface IClickhouseConfig { maxBufferSize: number; } +export interface IPullPieceConfig { + /** + * Target number of pull checks per storage provider per hour. + * + * Pull checks validate the SP pull-to-park pathway by serving a temporary piece URL + * from DealBot and asking the SP to pull and park it. Independent of `deal` and `retrieval`. 
+ */ + pullChecksPerSpPerHour: number; + /** + * Maximum runtime (seconds) for pull-check jobs before forced abort. + * + * Bounds the polling window for terminal SP pull status. + */ + pullCheckJobTimeoutSeconds: number; + /** + * Polling interval (seconds) used while waiting for a terminal SP pull status. + */ + pullCheckPollIntervalSeconds: number; + /** + * Size (bytes) of the synthetic test piece DealBot generates per pull check. + */ + pullCheckPieceSizeBytes: number; + /** + * Maximum number of concurrent piece streams across all pieceCids. + * + * Prevents DoS by limiting total server-wide streaming load. + */ + maxConcurrentStreams: number; + /** + * Maximum number of concurrent streams per pieceCid. + * + * Prevents attackers from opening many connections to the same piece. + */ + maxStreamsPerCid: number; +} + export interface IConfig { app: IAppConfig; database: IDatabaseConfig; @@ -370,6 +387,7 @@ export interface IConfig { clickhouse: IClickhouseConfig; pieceCleanup: IPieceCleanupConfig; spBlocklists: ISpBlocklistConfig; + pullPiece: IPullPieceConfig; } export function loadConfig(): IConfig { @@ -448,10 +466,6 @@ export function loadConfig(): IConfig { dataSetCreationJobTimeoutSeconds: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), pieceCleanupPerSpPerHour: Number.parseFloat(process.env.JOB_PIECE_CLEANUP_PER_SP_PER_HOUR || String(1 / 24)), maxPieceCleanupRuntimeSeconds: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), - pullChecksPerSpPerHour: Number.parseFloat(process.env.PULL_CHECKS_PER_SP_PER_HOUR || "1"), - pullCheckJobTimeoutSeconds: Number.parseInt(process.env.PULL_CHECK_JOB_TIMEOUT_SECONDS || "300", 10), - pullCheckPollIntervalSeconds: Number.parseInt(process.env.PULL_CHECK_POLL_INTERVAL_SECONDS || "10", 10), - pullCheckPieceSizeBytes: Number.parseInt(process.env.PULL_CHECK_PIECE_SIZE_BYTES || String(10 * 1024 * 1024), 10), }, dataset: { localDatasetsPath: 
process.env.DEALBOT_LOCAL_DATASETS_PATH || DEFAULT_LOCAL_DATASETS_PATH, @@ -501,5 +515,13 @@ export function loadConfig(): IConfig { ids: parseIdList(process.env.BLOCKED_SP_IDS), addresses: parseAddressList(process.env.BLOCKED_SP_ADDRESSES), }, + pullPiece: { + pullChecksPerSpPerHour: Number.parseFloat(process.env.PULL_CHECKS_PER_SP_PER_HOUR || "1"), + pullCheckJobTimeoutSeconds: Number.parseInt(process.env.PULL_CHECK_JOB_TIMEOUT_SECONDS || "300", 10), + pullCheckPollIntervalSeconds: Number.parseInt(process.env.PULL_CHECK_POLL_INTERVAL_SECONDS || "2", 10), + pullCheckPieceSizeBytes: Number.parseInt(process.env.PULL_CHECK_PIECE_SIZE_BYTES || String(10 * 1024 * 1024), 10), + maxConcurrentStreams: Number.parseInt(process.env.PULL_PIECE_MAX_CONCURRENT_STREAMS || "50", 10), + maxStreamsPerCid: Number.parseInt(process.env.PULL_PIECE_MAX_STREAMS_PER_CID || "3", 10), + }, }; } diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index e9adc6ac..55d319b5 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -134,6 +134,7 @@ describe("JobsService schedule rows", () => { pieceCleanupPerSpPerHour: 1, maxPieceCleanupRuntimeSeconds: 300, } as IConfig["jobs"], + pullPiece: { pullChecksPerSpPerHour: 1, pullCheckJobTimeoutSeconds: 300 } as IConfig["pullPiece"], database: { host: "localhost", port: 5432, diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index b929405e..a687ea0e 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -674,7 +674,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { } const abortController = new AbortController(); - const timeoutSeconds = this.configService.get("jobs").pullCheckJobTimeoutSeconds; + const timeoutSeconds = this.configService.get("pullPiece", { infer: true }).pullCheckJobTimeoutSeconds; const timeoutMs = Math.max(60000, 
timeoutSeconds * 1000); const effectiveTimeoutSeconds = Math.round(timeoutMs / 1000); const abortReason = new Error(`Pull check job timeout (${effectiveTimeoutSeconds}s) for ${spAddress}`); @@ -985,12 +985,13 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { } { const jobsConfig = this.configService.get("jobs", { infer: true }); const scheduling = this.configService.get("scheduling", { infer: true }); + const pullPieceConfig = this.configService.get("pullPiece", { infer: true }); const dealsPerHour = jobsConfig.dealsPerSpPerHour; const retrievalsPerHour = jobsConfig.retrievalsPerSpPerHour; const dataSetCreationsPerHour = jobsConfig.dataSetCreationsPerSpPerHour; const pieceCleanupPerHour = jobsConfig.pieceCleanupPerSpPerHour; - const pullChecksPerHour = jobsConfig.pullChecksPerSpPerHour; + const pullChecksPerHour = pullPieceConfig.pullChecksPerSpPerHour; const dealIntervalSeconds = Math.max(1, Math.round(3600 / dealsPerHour)); const retrievalIntervalSeconds = Math.max(1, Math.round(3600 / retrievalsPerHour)); diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index c24de2d9..08cc1b2e 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -97,11 +97,11 @@ describe("PullCheckService", () => { configValues = { app: { host: "localhost", port: 3000, apiPublicUrl: "https://dealbot.example" } as IConfig["app"], blockchain: { network: "calibration", walletAddress: "0xwallet" } as IConfig["blockchain"], - jobs: { + pullPiece: { pullCheckJobTimeoutSeconds: 300, pullCheckPollIntervalSeconds: 5, pullCheckPieceSizeBytes: 1024, - } as IConfig["jobs"], + } as IConfig["pullPiece"], dataset: { localDatasetsPath: "/tmp/datasets" } as IConfig["dataset"], }; diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 8fe45c33..1e4e11dc 100644 --- 
a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -6,7 +6,7 @@ import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import type { Account, Address, Chain, Client, Transport } from "viem"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; -import type { IAppConfig, IConfig, IJobsConfig } from "../config/app.config.js"; +import type { IAppConfig, IConfig, IPullPieceConfig } from "../config/app.config.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; import { HttpClientService } from "../http-client/http-client.service.js"; import { buildCheckMetricLabels, classifyFailureStatus } from "../metrics-prometheus/check-metric-labels.js"; @@ -115,12 +115,12 @@ export class PullCheckService { requestLatencyMs, }); - const jobsConfig = this.getJobsConfig(); + const pullPieceConfig = this.getPullPieceConfig(); // `waitForPullStatus` polls the SP repeatedly until a terminal pull status is reported const finalResponse = await waitForPullStatus(synapseClient, { ...pullPiecesOptions, - timeout: jobsConfig.pullCheckJobTimeoutSeconds * 1000, - pollInterval: jobsConfig.pullCheckPollIntervalSeconds * 1000, + timeout: pullPieceConfig.pullCheckJobTimeoutSeconds * 1000, + pollInterval: pullPieceConfig.pullCheckPollIntervalSeconds * 1000, }); signal?.throwIfAborted(); const completionLatencyMs = Date.now() - requestSubmittedAt.getTime(); @@ -227,8 +227,7 @@ export class PullCheckService { * `/api/piece/:pieceCid` serving, and return the source URL plus registration. 
*/ async preparePullPiece(providerAddress: string): Promise { - const jobsConfig = this.getJobsConfig(); - const targetSize = jobsConfig.pullCheckPieceSizeBytes; + const targetSize = this.getPullPieceConfig().pullCheckPieceSizeBytes; const key = crypto.randomBytes(16).toString("hex"); const dataStream = this.dataSourceService.generateBytesStream({ @@ -253,8 +252,8 @@ export class PullCheckService { return { registration, sourceUrl }; } - private getJobsConfig(): IJobsConfig { - return this.configService.get("jobs", { infer: true }); + private getPullPieceConfig(): IPullPieceConfig { + return this.configService.get("pullPiece", { infer: true }); } private resolvePublicBaseUrl(): string { diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 16cefcfe..e12783d3 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -16,7 +16,7 @@ This document provides a comprehensive guide to all environment variables used b | [ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | | [Piece Cleanup](#piece-cleanup) | `MAX_DATASET_STORAGE_SIZE_BYTES`, `TARGET_DATASET_STORAGE_SIZE_BYTES`, `JOB_PIECE_CLEANUP_PER_SP_PER_HOUR`, `MAX_PIECE_CLEANUP_RUNTIME_SECONDS` | -| [Pull Check](#pull-check) | `PULL_CHECKS_PER_SP_PER_HOUR`, `PULL_CHECK_JOB_TIMEOUT_SECONDS`, `PULL_CHECK_POLL_INTERVAL_SECONDS`, `PULL_CHECK_PIECE_SIZE_BYTES` | +| [Pull Check](#pull-check) | `PULL_CHECKS_PER_SP_PER_HOUR`, `PULL_CHECK_JOB_TIMEOUT_SECONDS`, `PULL_CHECK_POLL_INTERVAL_SECONDS`, `PULL_CHECK_PIECE_SIZE_BYTES`, `PULL_PIECE_MAX_CONCURRENT_STREAMS`, `PULL_PIECE_MAX_STREAMS_PER_CID` | | [SP Blocklist](#sp-blocklist-configuration) | `BLOCKED_SP_IDS`, `BLOCKED_SP_ADDRESSES` | | [Prometheus 
Metrics](#prometheus-metrics-configuration) | `PROMETHEUS_WALLET_BALANCE_TTL_SECONDS`, `PROMETHEUS_WALLET_BALANCE_ERROR_COOLDOWN_SECONDS` | | [Web Frontend](#web-frontend) | `VITE_API_BASE_URL`, `VITE_PLAUSIBLE_DATA_DOMAIN`, `DEALBOT_API_BASE_URL` | @@ -977,7 +977,7 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 - **Type**: `number` (seconds) - **Required**: No -- **Default**: `10` +- **Default**: `2` - **Minimum**: `1` **Role**: Polling interval used by `waitForPullStatus` while waiting for the SP to report a terminal pull status (`complete` or `failed`). @@ -1005,6 +1005,50 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 --- +### `PULL_PIECE_MAX_CONCURRENT_STREAMS` + +- **Type**: `number` (integer) +- **Required**: No +- **Default**: `50` +- **Minimum**: `1` + +**Role**: Maximum number of concurrent piece streams allowed across all pieces being served at any given time. This is a process-wide cap shared by all in-flight piece requests. + +**When to update**: + +- Decrease to reduce load on the Dealbot HTTP server under heavy SP demand +- Increase if many SPs are simultaneously fetching pieces and stream exhaustion is observed + +**Example**: + +```bash +PULL_PIECE_MAX_CONCURRENT_STREAMS=50 +``` + +--- + +### `PULL_PIECE_MAX_STREAMS_PER_CID` + +- **Type**: `number` (integer) +- **Required**: No +- **Default**: `3` +- **Minimum**: `1` + +**Role**: Maximum number of concurrent piece streams allowed per individual `pieceCid`. Prevents a single piece from consuming the entire `PULL_PIECE_MAX_CONCURRENT_STREAMS` budget. 
+ +**When to update**: + +- Decrease to spread stream capacity more evenly across pieces +- Increase if a single large piece must be fetched concurrently by multiple SPs + +**Example**: + +```bash +PULL_PIECE_MAX_STREAMS_PER_CID=3 +``` + +--- + ## Dataset Configuration ### `DEALBOT_LOCAL_DATASETS_PATH` From 93d617941251c526dad327e8feee92c8c970c2ca Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 11:43:10 +0530 Subject: [PATCH 32/44] feat: add rate limiting and stream tracking --- apps/backend/package.json | 3 +- apps/backend/src/jobs/jobs.service.spec.ts | 9 +- .../src/pull-check/pull-check.module.ts | 19 +- .../src/pull-check/pull-check.service.spec.ts | 5 +- .../pull-piece-stream-tracker.service.spec.ts | 184 ++++++++++++++++++ .../pull-piece-stream-tracker.service.ts | 159 +++++++++++++++ .../pull-check/pull-piece-throttler.guard.ts | 10 + .../src/pull-check/pull-piece.controller.ts | 100 ++++++---- pnpm-lock.yaml | 16 ++ 9 files changed, 467 insertions(+), 38 deletions(-) create mode 100644 apps/backend/src/pull-check/pull-piece-stream-tracker.service.spec.ts create mode 100644 apps/backend/src/pull-check/pull-piece-stream-tracker.service.ts create mode 100644 apps/backend/src/pull-check/pull-piece-throttler.guard.ts diff --git a/apps/backend/package.json b/apps/backend/package.json index 8fac937a..0fcb0e2a 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -41,6 +41,7 @@ "@nestjs/platform-express": "^11.1.13", "@nestjs/schedule": "^6.1.1", "@nestjs/swagger": "^11.2.6", + "@nestjs/throttler": "^6.5.0", "@nestjs/typeorm": "^11.0.0", "@willsoto/nestjs-prometheus": "^6.0.2", "any-signal": "^4.2.0", @@ -55,8 +56,8 @@ "multiformats": "^13.4.2", "nestjs-pino": "^4.6.1", "pg": "^8.18.0", - "pino": "^10.3.1", "pg-boss": "^12.9.0", + "pino": "^10.3.1", "pino-http": "^11.0.0", "prom-client": "^15.1.3", "reflect-metadata": "^0.2.2", diff --git a/apps/backend/src/jobs/jobs.service.spec.ts 
b/apps/backend/src/jobs/jobs.service.spec.ts index 55d319b5..c02c671e 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -134,7 +134,14 @@ describe("JobsService schedule rows", () => { pieceCleanupPerSpPerHour: 1, maxPieceCleanupRuntimeSeconds: 300, } as IConfig["jobs"], - pullPiece: { pullChecksPerSpPerHour: 1, pullCheckJobTimeoutSeconds: 300 } as IConfig["pullPiece"], + pullPiece: { + pullChecksPerSpPerHour: 1, + pullCheckJobTimeoutSeconds: 300, + pullCheckPollIntervalSeconds: 2, + pullCheckPieceSizeBytes: 10 * 1024 * 1024, + maxConcurrentStreams: 50, + maxStreamsPerCid: 3, + }, database: { host: "localhost", port: 5432, diff --git a/apps/backend/src/pull-check/pull-check.module.ts b/apps/backend/src/pull-check/pull-check.module.ts index 892e9d5e..04d00237 100644 --- a/apps/backend/src/pull-check/pull-check.module.ts +++ b/apps/backend/src/pull-check/pull-check.module.ts @@ -1,4 +1,5 @@ import { Module } from "@nestjs/common"; +import { ThrottlerModule } from "@nestjs/throttler"; import { TypeOrmModule } from "@nestjs/typeorm"; import { DatabaseModule } from "../database/database.module.js"; import { PullPiece } from "../database/entities/pull-piece.entity.js"; @@ -8,14 +9,28 @@ import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { PullCheckService } from "./pull-check.service.js"; import { PieceSourceController } from "./pull-piece.controller.js"; import { PullPieceRepository } from "./pull-piece.repository.js"; +import { PullPieceStreamTracker } from "./pull-piece-stream-tracker.service.js"; const runMode = process.env.DEALBOT_RUN_MODE?.toLowerCase() || "both"; const isWorkerOnly = runMode === "worker"; @Module({ - imports: [DatabaseModule, TypeOrmModule.forFeature([PullPiece]), WalletSdkModule, DataSourceModule, HttpClientModule], + imports: [ + ThrottlerModule.forRoot([ + { + name: "pull-piece", + ttl: 60_000, // 1 minute window + limit: 10, // 10 requests per IP per window + }, 
+ ]), + TypeOrmModule.forFeature([PullPiece]), + DatabaseModule, + WalletSdkModule, + DataSourceModule, + HttpClientModule, + ], controllers: isWorkerOnly ? [] : [PieceSourceController], - providers: [PullCheckService, PullPieceRepository], + providers: [PullCheckService, PullPieceRepository, PullPieceStreamTracker], exports: [PullCheckService, PullPieceRepository], }) export class PullCheckModule {} diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index 08cc1b2e..ee6d09e6 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -98,10 +98,13 @@ describe("PullCheckService", () => { app: { host: "localhost", port: 3000, apiPublicUrl: "https://dealbot.example" } as IConfig["app"], blockchain: { network: "calibration", walletAddress: "0xwallet" } as IConfig["blockchain"], pullPiece: { + pullChecksPerSpPerHour: 1, pullCheckJobTimeoutSeconds: 300, pullCheckPollIntervalSeconds: 5, pullCheckPieceSizeBytes: 1024, - } as IConfig["pullPiece"], + maxConcurrentStreams: 50, + maxStreamsPerCid: 3, + }, dataset: { localDatasetsPath: "/tmp/datasets" } as IConfig["dataset"], }; diff --git a/apps/backend/src/pull-check/pull-piece-stream-tracker.service.spec.ts b/apps/backend/src/pull-check/pull-piece-stream-tracker.service.spec.ts new file mode 100644 index 00000000..1a43460a --- /dev/null +++ b/apps/backend/src/pull-check/pull-piece-stream-tracker.service.spec.ts @@ -0,0 +1,184 @@ +import { PassThrough } from "node:stream"; +import { ServiceUnavailableException } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig, IPullPieceConfig } from "../config/app.config.js"; +import { PullPieceStreamTracker } from "./pull-piece-stream-tracker.service.js"; + +/** Helper to wait for stream cleanup events to process */ +async function 
waitForCleanup(): Promise { + // Wait for multiple event loop cycles to ensure cleanup handlers execute + for (let i = 0; i < 5; i++) { + await new Promise((resolve) => setImmediate(resolve)); + } +} + +describe("PullPieceStreamTracker", () => { + let tracker: PullPieceStreamTracker; + let mockConfigService: ConfigService; + + const defaultConfig: IPullPieceConfig = { + pullChecksPerSpPerHour: 1, + pullCheckJobTimeoutSeconds: 300, + pullCheckPollIntervalSeconds: 2, + pullCheckPieceSizeBytes: 10 * 1024 * 1024, // 10 MB + maxConcurrentStreams: 50, + maxStreamsPerCid: 3, + }; + + beforeEach(() => { + mockConfigService = { + get: vi.fn().mockReturnValue(defaultConfig), + } as unknown as ConfigService; + + tracker = new PullPieceStreamTracker(mockConfigService); + }); + + it("should allow starting a stream when under limits", () => { + expect(() => tracker.reserveStream("baga1")).not.toThrow(); + }); + + it("should throw when global concurrent stream limit is reached", () => { + // Override config with low limit + mockConfigService.get = vi.fn().mockReturnValue({ + maxConcurrentStreams: 2, + maxStreamsPerCid: 3, + }); + + const stream1 = new PassThrough(); + const stream2 = new PassThrough(); + + tracker.reserveStream("piece1"); + tracker.registerStream("piece1", stream1); + + tracker.reserveStream("piece2"); + tracker.registerStream("piece2", stream2); + + // Third stream should fail + expect(() => tracker.reserveStream("piece3")).toThrow(ServiceUnavailableException); + expect(() => tracker.reserveStream("piece3")).toThrow("Server is at capacity"); + + // Clean up + stream1.destroy(); + stream2.destroy(); + }); + + it("should throw when per-pieceCid stream limit is reached", () => { + // Override config with low per-cid limit + mockConfigService.get = vi.fn().mockReturnValue({ + maxConcurrentStreams: 10, + maxStreamsPerCid: 2, + }); + + const stream1 = new PassThrough(); + const stream2 = new PassThrough(); + + tracker.reserveStream("piece1"); + 
tracker.registerStream("piece1", stream1); + + tracker.reserveStream("piece1"); + tracker.registerStream("piece1", stream2); + + // Third stream for same piece should fail + expect(() => tracker.reserveStream("piece1")).toThrow(ServiceUnavailableException); + expect(() => tracker.reserveStream("piece1")).toThrow("Too many concurrent requests for this piece"); + + // Clean up + stream1.destroy(); + stream2.destroy(); + }); + + it("should unregister stream when stream ends", async () => { + const stream = new PassThrough(); + + tracker.reserveStream("piece1"); + tracker.registerStream("piece1", stream); + + let stats = tracker.getStats(); + expect(stats.activeStreams).toBe(1); + expect(stats.uniquePieceCids).toBe(1); + + // Destroy the stream to trigger cleanup (end() alone doesn't work for unread PassThrough) + stream.destroy(); + + // Give event loop time to process cleanup + await waitForCleanup(); + + stats = tracker.getStats(); + expect(stats.activeStreams).toBe(0); + expect(stats.uniquePieceCids).toBe(0); + }); + + it("should unregister stream when stream errors", async () => { + const stream = new PassThrough(); + + tracker.reserveStream("piece1"); + tracker.registerStream("piece1", stream); + + let stats = tracker.getStats(); + expect(stats.activeStreams).toBe(1); + + // Trigger error + stream.destroy(new Error("Test error")); + + // Give event loop time to process cleanup + await waitForCleanup(); + + stats = tracker.getStats(); + expect(stats.activeStreams).toBe(0); + }); + + it("should track multiple pieces independently", () => { + const stream1 = new PassThrough(); + const stream2 = new PassThrough(); + const stream3 = new PassThrough(); + + tracker.reserveStream("piece1"); + tracker.registerStream("piece1", stream1); + + tracker.reserveStream("piece2"); + tracker.registerStream("piece2", stream2); + + tracker.reserveStream("piece3"); + tracker.registerStream("piece3", stream3); + + const stats = tracker.getStats(); + expect(stats.activeStreams).toBe(3); 
+ expect(stats.uniquePieceCids).toBe(3); + + // Clean up + stream1.destroy(); + stream2.destroy(); + stream3.destroy(); + }); + + it("should allow new streams after previous ones complete", async () => { + // Override config with low limit + mockConfigService.get = vi.fn().mockReturnValue({ + maxConcurrentStreams: 2, + maxStreamsPerCid: 2, + }); + + const stream1 = new PassThrough(); + const stream2 = new PassThrough(); + + tracker.reserveStream("piece1"); + tracker.registerStream("piece1", stream1); + + tracker.reserveStream("piece2"); + tracker.registerStream("piece2", stream2); + + // Would fail now + expect(() => tracker.reserveStream("piece3")).toThrow(); + + // Destroy one stream to free up capacity + stream1.destroy(); + await waitForCleanup(); + + // Should succeed now + expect(() => tracker.reserveStream("piece3")).not.toThrow(); + + // Clean up + stream2.destroy(); + }); +}); diff --git a/apps/backend/src/pull-check/pull-piece-stream-tracker.service.ts b/apps/backend/src/pull-check/pull-piece-stream-tracker.service.ts new file mode 100644 index 00000000..16c7f332 --- /dev/null +++ b/apps/backend/src/pull-check/pull-piece-stream-tracker.service.ts @@ -0,0 +1,159 @@ +import type { Readable } from "node:stream"; +import { Injectable, Logger, ServiceUnavailableException } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import type { IConfig, IPullPieceConfig } from "../config/app.config.js"; + +/** + * Tracks active pull-piece streams to enforce global and per-pieceCid concurrency limits. + * Prevents DoS attacks where nefarious actors spam the `/api/piece/:pieceCid` endpoint + * with concurrent requests to overwhelm the server. 
+ */ +@Injectable() +export class PullPieceStreamTracker { + private readonly logger = new Logger(PullPieceStreamTracker.name); + + /** Total count of active streams across all pieceCids */ + private activeStreamCount = 0; + + /** Map of pieceCid -> count of active streams for that piece */ + private readonly streamsByPieceCid = new Map(); + + /** Weak set to track which streams have been cleaned up (prevents duplicate cleanup) */ + private readonly cleanedUpStreams = new WeakSet(); + + constructor(private readonly configService: ConfigService) {} + + /** + * Check limits and atomically reserve a stream slot for the given pieceCid. + * Throws ServiceUnavailableException if limits are exceeded. + * On success the slot is incremented immediately; call releaseReservation if + * the stream never materialises (e.g. piece not found, upstream error). + */ + reserveStream(pieceCid: string): void { + const config = this.getPullPieceConfig(); + + // Check global concurrent stream limit + if (this.activeStreamCount >= config.maxConcurrentStreams) { + this.logger.warn({ + event: "pull_piece_stream_limit_global", + message: "Global concurrent stream limit reached", + activeStreams: this.activeStreamCount, + maxConcurrentStreams: config.maxConcurrentStreams, + pieceCid, + }); + throw new ServiceUnavailableException("Server is at capacity. Please retry later."); + } + + // Check per-pieceCid concurrent stream limit + const currentStreamsForCid = this.streamsByPieceCid.get(pieceCid) ?? 0; + if (currentStreamsForCid >= config.maxStreamsPerCid) { + this.logger.warn({ + event: "pull_piece_stream_limit_per_cid", + message: "Per-pieceCid concurrent stream limit reached", + pieceCid, + activeStreamsForCid: currentStreamsForCid, + maxStreamsPerCid: config.maxStreamsPerCid, + }); + throw new ServiceUnavailableException("Too many concurrent requests for this piece. 
Please retry later."); + } + + // Reserve the slot atomically so concurrent requests see the updated count + this.activeStreamCount++; + this.streamsByPieceCid.set(pieceCid, currentStreamsForCid + 1); + } + + /** + * Release a previously reserved slot without an associated stream. + * Call this when reserveStream succeeded but the stream never materialised + * (piece not found, upstream error, etc.). + */ + releaseReservation(pieceCid: string): void { + if (this.activeStreamCount > 0) { + this.activeStreamCount--; + } + const currentCount = this.streamsByPieceCid.get(pieceCid); + if (currentCount != null && currentCount > 0) { + const newCount = currentCount - 1; + if (newCount === 0) { + this.streamsByPieceCid.delete(pieceCid); + } else { + this.streamsByPieceCid.set(pieceCid, newCount); + } + } + } + + /** + * Attach cleanup handlers to a stream whose slot was already reserved by reserveStream. + * Call this immediately after creating the stream and before piping. + */ + registerStream(pieceCid: string, stream: Readable): void { + // Slot was already incremented by reserveStream; just log and attach handlers. + this.logger.debug({ + event: "pull_piece_stream_registered", + pieceCid, + activeStreams: this.activeStreamCount, + activeStreamsForCid: this.streamsByPieceCid.get(pieceCid) ?? 0, + }); + + // Attach cleanup handler to all stream termination events + // Use a single cleanup function that guards against duplicate calls + const cleanup = () => { + this.unregisterStream(pieceCid, stream); + }; + + // Clean up on any stream termination event (streams can emit multiple events) + stream.once("end", cleanup); + stream.once("error", cleanup); + stream.once("close", cleanup); + } + + /** + * Unregister a stream when it completes, errors, or closes. + * This is called automatically by the stream event handlers. + * Guards against duplicate cleanup using a WeakSet. 
+ */ + private unregisterStream(pieceCid: string, stream: Readable): void { + // Prevent duplicate cleanup if this stream was already cleaned up + if (this.cleanedUpStreams.has(stream)) { + return; + } + this.cleanedUpStreams.add(stream); + + // Decrement global counter + if (this.activeStreamCount > 0) { + this.activeStreamCount--; + } + + // Decrement per-pieceCid counter + const currentCount = this.streamsByPieceCid.get(pieceCid); + if (currentCount != null && currentCount > 0) { + const newCount = currentCount - 1; + if (newCount === 0) { + this.streamsByPieceCid.delete(pieceCid); + } else { + this.streamsByPieceCid.set(pieceCid, newCount); + } + } + + this.logger.debug({ + event: "pull_piece_stream_unregistered", + pieceCid, + activeStreams: this.activeStreamCount, + activeStreamsForCid: this.streamsByPieceCid.get(pieceCid) ?? 0, + }); + } + + /** + * Get current stream statistics for observability. + */ + getStats(): { activeStreams: number; uniquePieceCids: number } { + return { + activeStreams: this.activeStreamCount, + uniquePieceCids: this.streamsByPieceCid.size, + }; + } + + private getPullPieceConfig(): IPullPieceConfig { + return this.configService.get("pullPiece", { infer: true }); + } +} diff --git a/apps/backend/src/pull-check/pull-piece-throttler.guard.ts b/apps/backend/src/pull-check/pull-piece-throttler.guard.ts new file mode 100644 index 00000000..d804a91f --- /dev/null +++ b/apps/backend/src/pull-check/pull-piece-throttler.guard.ts @@ -0,0 +1,10 @@ +import { ExecutionContext, Injectable } from "@nestjs/common"; +import { ThrottlerGuard } from "@nestjs/throttler"; + +@Injectable() +export class PullPieceThrottlerGuard extends ThrottlerGuard { + protected async throwThrottlingException(context: ExecutionContext): Promise { + const res = context.switchToHttp().getResponse(); + res.status(429).setHeader("Retry-After", "60").send("Too many requests"); + } +} diff --git a/apps/backend/src/pull-check/pull-piece.controller.ts 
b/apps/backend/src/pull-check/pull-piece.controller.ts index fa7ecc2f..fd70889b 100644 --- a/apps/backend/src/pull-check/pull-piece.controller.ts +++ b/apps/backend/src/pull-check/pull-piece.controller.ts @@ -1,9 +1,12 @@ import { PassThrough } from "node:stream"; -import { Controller, Get, Logger, NotFoundException, Param, Res } from "@nestjs/common"; +import { asPieceCID } from "@filoz/synapse-core/piece"; +import { Controller, Get, Logger, NotFoundException, Param, Res, UseGuards } from "@nestjs/common"; import { ApiResponse, ApiTags } from "@nestjs/swagger"; import type { Response } from "express"; import { PullCheckService } from "./pull-check.service.js"; import { PullPieceRepository } from "./pull-piece.repository.js"; +import { PullPieceStreamTracker } from "./pull-piece-stream-tracker.service.js"; +import { PullPieceThrottlerGuard } from "./pull-piece-throttler.guard.js"; /** * Serves the temporary pull-piece bytes that a storage provider must fetch @@ -19,54 +22,85 @@ export class PieceSourceController { constructor( private readonly pullCheckService: PullCheckService, private readonly pullPieceRepository: PullPieceRepository, + private readonly streamTracker: PullPieceStreamTracker, ) {} @Get("piece/:pieceCid") + @UseGuards(PullPieceThrottlerGuard) @ApiResponse({ status: 200, description: "Raw piece bytes streamed to the caller" }) @ApiResponse({ status: 404, description: "No active pull piece exists for this pieceCid" }) + @ApiResponse({ status: 503, description: "Server is at capacity or too many concurrent requests for this piece" }) async servePiece(@Param("pieceCid") pieceCid: string, @Res() res: Response): Promise { if (!pieceCid || pieceCid.trim().length === 0) { throw new NotFoundException("pieceCid is required"); } - const opened = await this.pullCheckService.openPullPieceStream(pieceCid); - if (!opened) { - this.logger.warn({ - event: "pull_check_piece_unknown", - message: "Pull piece source not found", - pieceCid, - }); - 
res.status(404).send("Pull piece source not found"); - return; + if (!asPieceCID(pieceCid)) { + throw new NotFoundException("pieceCid is invalid"); } - const { registration, stream } = opened; - res.setHeader("Content-Type", "application/octet-stream"); - res.setHeader("Content-Length", registration.size.toString()); - res.setHeader("Cache-Control", "no-store"); - res.setHeader("X-Pull-Check-Piece-CID", registration.pieceCid); - res.setHeader("Accept-ranges", "none"); + // Reserve a slot atomically; throws 503 immediately if limits are already exceeded. + this.streamTracker.reserveStream(pieceCid); - stream.on("error", (error) => { - this.logger.error({ - event: "pull_check_piece_stream_error", - message: "Failed to stream pull piece", - pieceCid, - error: error.message, - }); - if (!res.headersSent) { - res.status(500).send("Failed to stream pull piece"); + // If the stream never materialises (piece not found or error), the reservation must + // be released. streamAttached tracks whether registerStream took ownership. 
+ let streamAttached = false; + try { + const opened = await this.pullCheckService.openPullPieceStream(pieceCid); + if (!opened) { + this.logger.warn({ + event: "pull_check_piece_unknown", + message: "Pull piece source not found", + pieceCid, + }); + res.status(404).send("Pull piece source not found"); return; } - res.destroy(error); - }); - const pt = new PassThrough(); - // Capture the first-byte timestamp before piping (fire-and-forget DB write) - pt.once("data", () => { - void this.pullPieceRepository.markFirstByte(pieceCid, new Date()); - }); + const { registration, stream } = opened; + + // Attach cleanup handlers (slot already counted by reserveStream) + this.streamTracker.registerStream(pieceCid, stream); + streamAttached = true; + + res.setHeader("Content-Type", "application/octet-stream"); + res.setHeader("Content-Length", registration.size.toString()); + res.setHeader("Cache-Control", "no-store"); + res.setHeader("X-Pull-Check-Piece-CID", registration.pieceCid); + res.setHeader("Accept-ranges", "none"); + + stream.on("error", (error) => { + this.logger.error({ + event: "pull_check_piece_stream_error", + message: "Failed to stream pull piece", + pieceCid, + error: error.message, + }); + if (!res.headersSent) { + res.status(500).send("Failed to stream pull piece"); + return; + } + res.destroy(error); + }); + + // If the SP client disconnects mid-transfer, destroy the source generator + // immediately so the stream counter is released and memory is freed rather + // than waiting for the synthetic generator to exhaust all bytesNeeded. 
+ res.on("close", () => { + stream.destroy(); + }); + + const pt = new PassThrough(); + // Capture the first-byte timestamp before piping (fire-and-forget DB write) + pt.once("data", () => { + void this.pullPieceRepository.markFirstByte(pieceCid, new Date()); + }); - stream.pipe(pt).pipe(res); + stream.pipe(pt).pipe(res); + } finally { + if (!streamAttached) { + this.streamTracker.releaseReservation(pieceCid); + } + } } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 4beb4487..6a51fdc5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -77,6 +77,9 @@ importers: '@nestjs/swagger': specifier: ^11.2.6 version: 11.2.6(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/core@11.1.13)(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2) + '@nestjs/throttler': + specifier: ^6.5.0 + version: 6.5.0(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/core@11.1.13)(reflect-metadata@0.2.2) '@nestjs/typeorm': specifier: ^11.0.0 version: 11.0.0(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/core@11.1.13)(reflect-metadata@0.2.2)(rxjs@7.8.2)(typeorm@0.3.28(pg@8.18.0)(ts-node@10.9.2(@swc/core@1.15.11)(@types/node@25.2.3)(typescript@5.9.3))) @@ -1423,6 +1426,13 @@ packages: '@nestjs/platform-express': optional: true + '@nestjs/throttler@6.5.0': + resolution: {integrity: sha512-9j0ZRfH0QE1qyrj9JjIRDz5gQLPqq9yVC2nHsrosDVAfI5HHw08/aUAWx9DZLSdQf4HDkmhTTEGLrRFHENvchQ==} + peerDependencies: + '@nestjs/common': ^7.0.0 || ^8.0.0 || ^9.0.0 || ^10.0.0 || ^11.0.0 + '@nestjs/core': ^7.0.0 || ^8.0.0 || ^9.0.0 || ^10.0.0 || ^11.0.0 + reflect-metadata: ^0.1.13 || ^0.2.0 + '@nestjs/typeorm@11.0.0': resolution: {integrity: sha512-SOeUQl70Lb2OfhGkvnh4KXWlsd+zA08RuuQgT7kKbzivngxzSo1Oc7Usu5VxCxACQC9wc2l9esOHILSJeK7rJA==} peerDependencies: @@ -8639,6 +8649,12 @@ snapshots: 
optionalDependencies: '@nestjs/platform-express': 11.1.13(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/core@11.1.13) + '@nestjs/throttler@6.5.0(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/core@11.1.13)(reflect-metadata@0.2.2)': + dependencies: + '@nestjs/common': 11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2) + '@nestjs/core': 11.1.13(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/platform-express@11.1.13)(reflect-metadata@0.2.2)(rxjs@7.8.2) + reflect-metadata: 0.2.2 + '@nestjs/typeorm@11.0.0(@nestjs/common@11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2))(@nestjs/core@11.1.13)(reflect-metadata@0.2.2)(rxjs@7.8.2)(typeorm@0.3.28(pg@8.18.0)(ts-node@10.9.2(@swc/core@1.15.11)(@types/node@25.2.3)(typescript@5.9.3)))': dependencies: '@nestjs/common': 11.1.13(class-transformer@0.5.1)(class-validator@0.14.3)(reflect-metadata@0.2.2)(rxjs@7.8.2) From 863cd285dfbce9e1c58ef1f7f9855e7ddfc79eca Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 14:07:43 +0530 Subject: [PATCH 33/44] chore: fix docs --- docs/checks/events-and-metrics.md | 26 +++++++++++++------------- docs/checks/pull-check.md | 31 +++++++++++++++++-------------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 4b67f922..45f8c82e 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -69,25 +69,25 @@ sequenceDiagram participant RPC as Chain RPC Provider Dealbot->>Dealbot: hostedPieceRegistered - Dealbot->>SP: pullRequestSubmitted (pullPieces) - SP-->>Dealbot: pullRequestAcknowledged - SP-->>Dealbot: hostedPieceFirstByteRead + Dealbot->>SP: pullRequestSubmittedToSp 
(pullPieces) + SP-->>Dealbot: pullRequestAcknowledgedBySp + SP-->>Dealbot: pullRequestStartedBySp Dealbot->>SP: pullStatusPolled (waitForPullStatus, repeated) - SP-->>Dealbot: pullTerminalStatusReported + SP-->>Dealbot: pullRequestIsTerminal Dealbot->>SP: directPieceFetchStarted (/piece/{cid}) SP-->>Dealbot: directPieceFetchCompleted - Dealbot-->>Dealbot: pullCheckIntegrityChecked + Dealbot-->>Dealbot: pullRequestIntegrityChecked ``` ### Pull Check Event List | Event | Definition | Implemented | Source of truth | |------|------------|:------:|-----------------| -| `pullRequestSubmitted` | Dealbot calls `pullPieces` against the SP for the registered piece CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullRequestAcknowledged` | SP returns from `pullPieces` (success or non-terminal-failure). | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `hostedPieceFirstByteRead` | SP reads the first byte of `/api/piece/{pieceCid}` from dealbot. Recorded once per registration. | Yes | [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | -| `pullTerminalStatusReported` | SP reports a terminal pull status (`complete`, `failed`, ...) via `waitForPullStatus`. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckIntegrityChecked` | Direct `/piece/{pieceCid}` fetch from the SP returns bytes whose recomputed pieceCid matches the expected CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestSubmittedToSp` | Dealbot calls Synapse `pullPieces` (`POST /pdp/piece/pull`) against the SP for the registered piece CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestAcknowledgedBySp` | SP returns from `pullPieces` (success or non-terminal-failure).
| Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestStartedBySp` | Dealbot receives the SP's request for `/api/piece/{pieceCid}`. Recorded once per registration. | Yes | [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | +| `pullRequestIsTerminal` | Dealbot determines the pull request is in a terminal pull status (`complete`, `failed`, ...) via `waitForPullStatus` or the polling operation has timed out. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestIntegrityChecked` | Dealbot performs direct `/piece/{pieceCid}` retrieval from the SP and confirms the bytes match the pieceCid. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## Metrics @@ -121,9 +121,9 @@ sequenceDiagram | `dataStorageCheckMs` | Data Storage | [`uploadToSpStart`](#uploadToSpStart) | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Data Storage check | | | `retrievalCheckMs` | Retrieval | Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | -| `pullCheckRequestLatencyMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`pullRequestAcknowledged`](#pullRequestAcknowledged) | Time from `pullPieces` submission to SP request acknowledgement.
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckCompletionLatencyMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`pullTerminalStatusReported`](#pullTerminalStatusReported) | Time from `pullPieces` submission to terminal SP pull status. Observed once on success and once on failure. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckFirstByteMs` | Pull | [`pullRequestSubmitted`](#pullRequestSubmitted) | [`hostedPieceFirstByteRead`](#hostedPieceFirstByteRead) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | +| `pullCheckRequestLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestAcknowledgedBySp`](#pullRequestAcknowledgedBySp) | Time from `pullPieces` submission to SP request acknowledgement. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckCompletionLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestIsTerminal`](#pullRequestIsTerminal) | Time from `pullPieces` submission to terminal SP pull status. Observed once on success and once on failure. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullCheckFirstByteMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestStartedBySp`](#pullRequestStartedBySp) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. 
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | | `pullCheckThroughputBps` | Pull | n/a | n/a | `(pieceSizeBytes / pullCheckCompletionLatencyMs) * 1000`. Upper-bound on actual transfer rate because `pullCheckCompletionLatencyMs` includes SP-side scheduling and dealbot's polling cadence. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index bb9f78d1..a7c7f05a 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -10,7 +10,7 @@ For event and metric definitions used by the dashboard, see [Dealbot Events & Me A "pull check" exercises the **storage provider pull-to-park pathway**: dealbot publishes a temporary piece at `/api/piece/{pieceCid}`, asks the SP to fetch (pull) and park it via the Synapse `pullPieces` API, waits for a terminal SP pull status, and finally re-fetches the piece from the SP to verify byte-for-byte integrity. -The pull check answers a different question than the [Data Storage check](./data-storage.md): instead of *uploading* bytes to the SP, it asks the SP to *pull* bytes from a public URL. This validates an SP's outbound HTTP fetcher, the pull request lifecycle, and retrieval surface. +The pull check answers a different question than the [Data Storage check](./data-storage.md): instead of *uploading* bytes to the SP, it asks the SP to *pull* bytes from a public URL. This validates an SP's outbound HTTP fetcher, the pull request lifecycle, and `/piece` retrieval surface. (`/piece` retrieval is not covered by the [Data Storage check](./data-storage.md).) A successful pull check requires all [assertions in the table below](#what-gets-asserted) to pass. Failure occurs if any step fails or the job exceeds its max allowed time. 
Operational timeouts exist to prevent jobs from running indefinitely, but they are not quality assertions. @@ -22,8 +22,8 @@ Each pull check asserts the following for every SP: | # | Assertion | How It's Checked | Retries | Relevant Metric for Setting a Max Duration | Implemented? | |---|-----------|------------------|:---:|--------------------------------------------|:---:| -| 1 | SP accepts the pull request | `pullPieces` returns without error and reports a non-terminal-failure status | 0 | [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) | Yes | -| 2 | SP reaches a terminal `complete` pull status | `waitForPullStatus` polls the SP until a terminal status is reported | Polling with delay until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | +| 1 | SP accepts the pull request | Synapse `pullPieces` (i.e., initial call to SP `POST /pdp/piece/pull`) returns without error and reports a non-terminal-failure status | 0 | [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) | Yes | +| 2 | SP reaches a terminal `complete` pull status | Synapse `waitForPullStatus` polls the SP (using `POST /pdp/piece/pull`) until a terminal status is reported | Polling will continue until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) is reached | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | | 3 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | | 4 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | @@ -35,9 +35,10 
@@ The dealbot scheduler triggers pull check jobs at a configurable rate (`PULL_CHE flowchart TD Generate["Compute PieceCID + register hosted source in Postgres
at /api/piece/{pieceCid}"] Generate --> Submit["Submit pullPieces request to SP"] - Submit --> Poll["Poll SP via waitForPullStatus
until terminal pull status"] + Submit --> |SP responds with HTTP 200|Poll["Poll SP via waitForPullStatus
until terminal pull status"] + Submit --> |SP doesn't respond with HTTP 200| Fail["Mark pull check failed"] Poll -->|complete| Validate["Direct /piece/{pieceCid} fetch from SP
+ recompute pieceCid"] - Poll -->|other terminal status| Fail["Mark pull check failed"] + Poll -->|other terminal status| Fail Validate -->|matches| Success["Mark pull check successful"] Validate -->|mismatch or fetch error| Fail Success --> Cleanup @@ -46,11 +47,11 @@ flowchart TD ### 1. Prepare the hosted piece -Dealbot computes a deterministic PieceCID for a synthetic test piece and registers it in the Postgres `pull_pieces` table. +Dealbot computes a deterministic PieceCID for a synthetic test piece. The synthetic data is **not** stored on disk. Instead, dealbot uses a deterministic pseudo-random generator (AES-256-CTR) to generate it as needed. -By persisting registrations to Postgres instead of in-memory, the hosted source can be resolved by any API pod in a multi-pod deployment, even if the pull check was initiated by a different worker pod. +The pieceCid and the synthetic piece's random seed are registered in the Postgres `pull_pieces` table. The registration persists for the duration of the pull check job. -The synthetic data is **not** stored on disk. Instead, dealbot uses a deterministic pseudo-random generator (AES-256-CTR) to stream the same bytes whenever the SP fetches the piece or dealbot needs to re-compute the CID for validation. +By persisting registrations to Postgres instead of in-memory, the hosted source can be resolved by any API pod in a multi-pod deployment, even if the pull check was initiated by a different worker pod. Dealbot regenerates the same piece and streams it whenever the SP fetches the piece or dealbot needs to re-compute the CID for validation. The source URL handed to the SP is built from the dealbot `app.apiPublicUrl` config (set via `DEALBOT_API_PUBLIC_URL`). When `DEALBOT_API_PUBLIC_URL` is unset, dealbot falls back to `http://{DEALBOT_HOST}:{DEALBOT_PORT}`, which is only reachable in single-host or `localhost` setups. @@ -62,21 +63,21 @@ Source: [`pull-check.service.ts` (`preparePullPiece`)](../../apps/backend/src/pu ### 2.
Submit the pull request -Dealbot calls `pullPieces` from `@filoz/synapse-core/sp` with the pieceCid, the source URL, and either the SP's existing `dataSetId`/`clientDataSetId` or the SP `payee` for new-dataset flows. The submission timestamp is stamped on the registration so it can later be subtracted from the first-byte event. +Dealbot calls `pullPieces` from `@filoz/synapse-core/sp` with the pieceCid, the source URL, and the SP `payee`. The submission timestamp is stamped on the registration so it can later be subtracted from the first-byte event. Source: [`pull-check.service.ts` (`runPullCheck`)](../../apps/backend/src/pull-check/pull-check.service.ts) ### 3. Wait for terminal SP pull status -`waitForPullStatus` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. +When dealbot receives the SP request for `/api/piece/{pieceCid}` for the first time, dealbot stamps a first-byte timestamp on the registration. This is the basis for [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs). -When the SP fetches `/api/piece/{pieceCid}` for the first time, the controller stamps a first-byte timestamp on the registration. This is the basis for [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs). +In parallel, dealbot`waitForPullStatus` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. 
Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) ### 4. Direct piece-fetch validation -After commit, dealbot fetches `{serviceURL}/piece/{pieceCid}` from the SP, re-computes the piece CID over the response body, and compares it against the expected CID. A mismatch fails the pull check with `failure.other`. A network or HTTP error during validation also fails the check (transport errors are intentionally not retried). +After `waitForPullStatus`, dealbot fetches `{serviceURL}/piece/{pieceCid}` from the SP, re-computes the piece CID over the response body, and compares it against the expected CID. A mismatch fails the pull check with `failure.other`. A network or HTTP error during validation also fails the check (transport errors are intentionally not retried). Aborts (job timeout) propagate as throws and are classified as `failure.timedout` rather than as a validation failure. @@ -84,7 +85,7 @@ Source: [`pull-check.service.ts` (`validateByDirectPieceFetch`)](../../apps/back ### 5. Cleanup -Whether the pull check succeeds or fails, the `finally` block removes the registration entry. +Whether the pull check succeeds or fails, the `finally` block removes the registration entry from the Postgres database. After cleanup, subsequent `/api/piece/{pieceCid}` requests return HTTP 404 Not Found. Source: [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) @@ -120,8 +121,8 @@ Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piec Metric definitions (including Prometheus metrics) live in [Dealbot Events & Metrics](./events-and-metrics.md).
The metrics emitted by a pull check are: - [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) -- [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) - [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs) (only when the SP actually pulled from `/api/piece/{pieceCid}`) +- [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) - [`pullCheckThroughputBps`](./events-and-metrics.md#pullCheckThroughputBps) - [`pullCheckStatus`](./events-and-metrics.md#pullCheckStatus) - [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) @@ -137,6 +138,8 @@ Key environment variables that control pull check behavior: | `PULL_CHECK_JOB_TIMEOUT_SECONDS` | Max end-to-end pull check job runtime before forced abort. | | `PULL_CHECK_POLL_INTERVAL_SECONDS` | Polling interval used while waiting for a terminal SP pull status. | | `PULL_CHECK_PIECE_SIZE_BYTES` | Size of the synthetic test piece dealbot generates per pull check. | +| `PULL_PIECE_MAX_CONCURRENT_STREAMS` | Process-wide cap on concurrent `/api/piece/{pieceCid}` streams across all pieces. | +| `PULL_PIECE_MAX_STREAMS_PER_CID` | Per-pieceCid cap on concurrent streams; prevents a single piece from exhausting the global budget. 
| Source: [`apps/backend/src/config/app.config.ts`](../../apps/backend/src/config/app.config.ts) From df544765bc04cedd65963ea88ce1d2c6124bf131 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 14:17:25 +0530 Subject: [PATCH 34/44] chore: fix faqs --- docs/checks/pull-check.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index a7c7f05a..6ce20441 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -151,7 +151,6 @@ See also: [`docs/environment-variables.md`](../environment-variables.md) for the Pull checks are intentionally isolated from the data-storage flow: they don't pass through `DealService.createDeal`, don't allocate a `Deal` entity. This keeps the pull-check signal independent of the data-storage success rate. -### Why does a "cached pull" not record `pullCheckFirstByteMs`? - -If an SP previously pulled the same piece CID and serves the new pull request from a local cache, it will never fetch `/api/piece/{pieceCid}`, so dealbot has no first-byte timestamp to subtract. In that case dealbot skips the histogram observation rather than emit a misleading zero. Cached pulls are uncommon today because each pull check generates a fresh random piece, but the registry's first-byte capture is **idempotent** so retried pulls during a single check do not skew measurements either. +### Do pull checks verify IPNI indexing and IPFS retrieval? +No, not currently. We assume that an SP which performs IPNI indexing and IPFS retrieval correctly in data storage checks also does so for pieces it pulls.
From 445a0d6c63b5b215fbcb50d6f3c5615fa471bff9 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 14:53:46 +0530 Subject: [PATCH 35/44] refactor: rename metrics and arrange --- .../check-metrics.service.ts | 42 +++++++++---------- .../metrics-prometheus.module.ts | 30 ++++++------- .../src/pull-check/pull-check.service.spec.ts | 20 ++++----- .../src/pull-check/pull-check.service.ts | 8 ++-- docs/checks/events-and-metrics.md | 10 ++--- docs/checks/pull-check.md | 22 +++++----- docs/environment-variables.md | 2 +- 7 files changed, 67 insertions(+), 67 deletions(-) diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 929d2263..a981dded 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -252,41 +252,41 @@ export class DataSetCreationCheckMetrics { @Injectable() export class PullCheckCheckMetrics { constructor( - @InjectMetric("pullCheckRequestLatencyMs") - private readonly pullCheckRequestLatencyMs: Histogram, - @InjectMetric("pullCheckCompletionLatencyMs") - private readonly pullCheckCompletionLatencyMs: Histogram, + @InjectMetric("pullRequestAcknowledgementLatencyMs") + private readonly pullRequestAcknowledgementLatencyMs: Histogram, + @InjectMetric("pullRequestStartedMs") + private readonly pullRequestStartedMs: Histogram, + @InjectMetric("pullRequestCompletionLatencyMs") + private readonly pullRequestCompletionLatencyMs: Histogram, + @InjectMetric("pullRequestProviderStatus") + private readonly pullRequestProviderStatusCounter: Counter, + @InjectMetric("pullRequestThroughputBps") + private readonly pullRequestThroughputBps: Histogram, @InjectMetric("pullCheckStatus") private readonly pullCheckStatusCounter: Counter, - @InjectMetric("pullCheckProviderStatus") - private readonly pullCheckProviderStatusCounter: Counter, - @InjectMetric("pullCheckFirstByteMs") - private 
readonly pullCheckFirstByteMs: Histogram, - @InjectMetric("pullCheckThroughputBps") - private readonly pullCheckThroughputBps: Histogram, ) {} - observeRequestLatencyMs(labels: CheckMetricLabels, value: number | null | undefined): void { - observePositive(this.pullCheckRequestLatencyMs, labels, value); + observeAcknowledgementLatencyMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullRequestAcknowledgementLatencyMs, labels, value); } - observeCompletionLatencyMs(labels: CheckMetricLabels, value: number | null | undefined): void { - observePositive(this.pullCheckCompletionLatencyMs, labels, value); + observeStartedMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullRequestStartedMs, labels, value); } - recordStatus(labels: CheckMetricLabels, value: string): void { - this.pullCheckStatusCounter.inc({ ...labels, value }); + observeCompletionLatencyMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullRequestCompletionLatencyMs, labels, value); } recordProviderStatus(labels: CheckMetricLabels, value: string): void { - this.pullCheckProviderStatusCounter.inc({ ...labels, value }); + this.pullRequestProviderStatusCounter.inc({ ...labels, value }); } - observeFirstByteMs(labels: CheckMetricLabels, value: number | null | undefined): void { - observePositive(this.pullCheckFirstByteMs, labels, value); + observeThroughputBps(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.pullRequestThroughputBps, labels, value); } - observeThroughputBps(labels: CheckMetricLabels, value: number | null | undefined): void { - observePositive(this.pullCheckThroughputBps, labels, value); + recordStatus(labels: CheckMetricLabels, value: string): void { + this.pullCheckStatusCounter.inc({ ...labels, value }); } } diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts 
b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 240b4201..710f9469 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -199,39 +199,39 @@ const metricProviders = [ }), // Pull check metrics (docs/checks/pull-check.md) makeHistogramProvider({ - name: "pullCheckRequestLatencyMs", + name: "pullRequestAcknowledgementLatencyMs", help: "Time from pull request submission to SP request acknowledgement (ms)", labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, buckets: [10, 50, 100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000], }), makeHistogramProvider({ - name: "pullCheckCompletionLatencyMs", + name: "pullRequestStartedMs", + help: "Time from pullPieces submission to the SP reading the first byte of the hosted-piece stream (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000], + }), + makeHistogramProvider({ + name: "pullRequestCompletionLatencyMs", help: "Time from pull request submission to terminal SP pull status (ms)", labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, buckets: [100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000, 600000], }), makeCounterProvider({ - name: "pullCheckStatus", - help: "Pull-check terminal status counts (success | failure.timedout | failure.other)", - labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, - }), - makeCounterProvider({ - name: "pullCheckProviderStatus", + name: "pullRequestProviderStatus", help: "Terminal SP-reported pull status recorded once per check (intermediate polling statuses are not counted)", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), makeHistogramProvider({ - name: "pullCheckFirstByteMs", - 
help: "Time from pullPieces submission to the SP reading the first byte of the hosted-piece stream (ms)", - labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, - buckets: [10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000], - }), - makeHistogramProvider({ - name: "pullCheckThroughputBps", + name: "pullRequestThroughputBps", help: "Pull-check throughput approximated as pieceSize / completionLatency in bytes per second", labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, buckets: throughputBuckets, }), + makeCounterProvider({ + name: "pullCheckStatus", + help: "Pull-check terminal status counts (success | failure.timedout | failure.other)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), // Data Retention Metrics makeCounterProvider({ name: "dataSetChallengeStatus", diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index ee6d09e6..ec221f3e 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -58,12 +58,12 @@ describe("PullCheckService", () => { }; let httpClientServiceMock: { requestWithMetrics: ReturnType }; let metricsMock: { - observeRequestLatencyMs: ReturnType; + observeAcknowledgementLatencyMs: ReturnType; + observeStartedMs: ReturnType; observeCompletionLatencyMs: ReturnType; - recordStatus: ReturnType; recordProviderStatus: ReturnType; - observeFirstByteMs: ReturnType; observeThroughputBps: ReturnType; + recordStatus: ReturnType; }; let configValues: Partial; @@ -86,12 +86,12 @@ describe("PullCheckService", () => { requestWithMetrics: vi.fn(), }; metricsMock = { - observeRequestLatencyMs: vi.fn(), + observeAcknowledgementLatencyMs: vi.fn(), + observeStartedMs: vi.fn(), observeCompletionLatencyMs: vi.fn(), - recordStatus: vi.fn(), recordProviderStatus: vi.fn(), - 
observeFirstByteMs: vi.fn(), observeThroughputBps: vi.fn(), + recordStatus: vi.fn(), }; configValues = { @@ -283,15 +283,15 @@ describe("PullCheckService", () => { // Submit timestamp is stamped on the registration. expect(registryMock.markPullSubmitted).toHaveBeenCalledWith(registration.pieceCid, expect.any(Date)); // Latency histograms observed at least once each. - expect(metricsMock.observeRequestLatencyMs).toHaveBeenCalledTimes(1); + expect(metricsMock.observeAcknowledgementLatencyMs).toHaveBeenCalledTimes(1); expect(metricsMock.observeCompletionLatencyMs).toHaveBeenCalledTimes(1); // Terminal SP status recorded exactly once. expect(metricsMock.recordProviderStatus).toHaveBeenCalledTimes(1); expect(metricsMock.recordProviderStatus).toHaveBeenCalledWith(expect.any(Object), "complete"); // First-byte and throughput observed since the registration carries // pullSubmittedAt + firstByteAt and the path completed. - expect(metricsMock.observeFirstByteMs).toHaveBeenCalledTimes(1); - const firstByteMs = metricsMock.observeFirstByteMs.mock.calls[0][1] as number; + expect(metricsMock.observeStartedMs).toHaveBeenCalledTimes(1); + const firstByteMs = metricsMock.observeStartedMs.mock.calls[0][1] as number; expect(firstByteMs).toBe(250); expect(metricsMock.observeThroughputBps).toHaveBeenCalledTimes(1); // Terminal aggregate status is success. 
@@ -308,7 +308,7 @@ describe("PullCheckService", () => { await service.runPullCheck("0xsp", undefined, logContext); - expect(metricsMock.observeFirstByteMs).not.toHaveBeenCalled(); + expect(metricsMock.observeStartedMs).not.toHaveBeenCalled(); expect(metricsMock.observeThroughputBps).toHaveBeenCalledTimes(1); expect(metricsMock.recordStatus).toHaveBeenCalledWith(expect.any(Object), "success"); }); diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 1e4e11dc..ab1202ca 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -105,11 +105,11 @@ export class PullCheckService { const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); signal?.throwIfAborted(); const requestLatencyMs = Date.now() - requestSubmittedAt.getTime(); - this.pullCheckMetrics.observeRequestLatencyMs(labels, requestLatencyMs); + this.pullCheckMetrics.observeAcknowledgementLatencyMs(labels, requestLatencyMs); this.logger.log({ ...logContext, - event: "pull_request_submitted", - message: "Pull request submitted to provider", + event: "pull_request_acknowledged", + message: "Pull request acknowledged by provider", pieceCid: pieceCidStr, pullProviderStatus: pullResponse.status, requestLatencyMs, @@ -144,7 +144,7 @@ export class PullCheckService { ? firstByteEntry.firstByteAt.getTime() - firstByteEntry.pullSubmittedAt.getTime() : null; if (firstByteMs != null) { - this.pullCheckMetrics.observeFirstByteMs(labels, firstByteMs); + this.pullCheckMetrics.observeStartedMs(labels, firstByteMs); } // Throughput approximated as pieceSize / completionLatency. 
This is an // upper-bound on actual transfer time because completionLatency includes diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 45f8c82e..a21aaef7 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -121,10 +121,10 @@ sequenceDiagram | `dataStorageCheckMs` | Data Storage | [`uploadToSpStart`](#uploadToSpStart) | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Data Storage check | | | `retrievalCheckMs` | Retrieval | Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | -| `pullCheckRequestLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestAcknowledgedBySp`](#pullRequestAcknowledgedBySp) | Time from `pullPieces` submission to SP request acknowledgement. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckCompletionLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestIsTerminal`](#pullRequestIsTerminal) | Time from `pullPieces` submission to terminal SP pull status. Observed once on success and once on failure. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckFirstByteMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestStartedBySp`](#pullRequestStartedBySp) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. 
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | -| `pullCheckThroughputBps` | Pull | n/a | n/a | `(pieceSizeBytes / pullCheckCompletionLatencyMs) * 1000`. Upper-bound on actual transfer rate because `pullCheckCompletionLatencyMs` includes SP-side scheduling and dealbot's polling cadence. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestAcknowledgementLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestAcknowledgedBySp`](#pullRequestAcknowledgedBySp) | Time from `pullPieces` submission to SP request acknowledgement. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestStartedMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestStartedBySp`](#pullRequestStartedBySp) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | +| `pullRequestCompletionLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestIsTerminal`](#pullRequestIsTerminal) | Time from `pullPieces` submission to terminal SP pull status. Observed once on success and once on failure. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestThroughputBps` | Pull | n/a | n/a | `(pieceSizeBytes / pullRequestCompletionLatencyMs) * 1000`. Upper-bound on actual transfer rate because `pullRequestCompletionLatencyMs` includes SP-side scheduling and dealbot's polling cadence. 
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ### Status Count Related Metrics @@ -144,8 +144,8 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `pullRequestProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullStatus`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side validation failures. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullCheckStatus` | Pull | When the [Pull Check](./pull-check.md) terminates (success after direct piece validation, or any failure). Recorded exactly once per check. | `success`, `failure.timedout`, `failure.other`. 
Failure classification follows [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) (timeout-keyed errors → `failure.timedout`, everything else → `failure.other`). | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullCheckProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullStatus`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side validation failures. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## ClickHouse Tables diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index 6ce20441..87d86243 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -22,10 +22,10 @@ Each pull check asserts the following for every SP: | # | Assertion | How It's Checked | Retries | Relevant Metric for Setting a Max Duration | Implemented? 
| |---|-----------|------------------|:---:|--------------------------------------------|:---:| -| 1 | SP accepts the pull request | Synapse `pullPieces` (i.e., initial call to SP `POST /pdp/piece/pull`) returns without error and reports a non-terminal-failure status | 0 | [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) | Yes | -| 2 | SP reaches a terminal `complete` pull status | Synapse `waitForPullStatus` polls the SP (using `POST /pdp/piece/pull`) until a terminal status is reported | Polling will continue until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) is reached | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | +| 1 | SP accepts the pull request | Synapse `pullPieces` (i.e., initial call to SP `POST /pdp/piece/pull`) returns without error and reports a non-terminal-failure status | 0 | [`pullRequestAcknowledgementLatencyMs`](./events-and-metrics.md#pullRequestAcknowledgementLatencyMs) | Yes | +| 2 | SP reaches a terminal `complete` pull status | Synapse `waitForPullStatus` polls the SP (using `POST /pdp/piece/pull`) until a terminal status is reported | Polling will continue until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) is reached | [`pullRequestCompletionLatencyMs`](./events-and-metrics.md#pullRequestCompletionLatencyMs) | Yes | | 3 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | -| 4 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) | Yes | +| 4 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | 
[`pullRequestCompletionLatencyMs`](./events-and-metrics.md#pullRequestCompletionLatencyMs) | Yes | ## Pull Check Lifecycle @@ -69,9 +69,9 @@ Source: [`pull-check.service.ts` (`runPullCheck`)](../../apps/backend/src/pull-c ### 3. Wait for terminal SP pull status -When dealbot receives the SP request for `/api/piece/{pieceCid}` for the first time, dealbot stamps a first-byte timestamp on the registration. This is the basis for [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs). +When dealbot receives the SP request for `/api/piece/{pieceCid}` for the first time, dealbot stamps a first-byte timestamp on the registration. This is the basis for [`pullRequestStartedMs`](./events-and-metrics.md#pullRequestStartedMs). -In parallel, dealbot`waitForPullStatus` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. +In parallel, dealbot's `waitForPullStatus` call polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullRequestProviderStatus`](./events-and-metrics.md#pullRequestProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) @@ -102,7 +102,7 @@ A pull check has a single terminal status, recorded once per check via [`pullChe Failures are classified by inspecting the error message; see [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) for the exact rule.
-In addition to the overall status, dealbot records the **raw SP-reported terminal pull status** via [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) (for example `complete`, `failed`, `not_found`). This separates "SP said it failed" from "dealbot's downstream validation failed" in dashboards. +In addition to the overall status, dealbot records the **raw SP-reported terminal pull status** via [`pullRequestProviderStatus`](./events-and-metrics.md#pullRequestProviderStatus) (for example `complete`, `failed`, `not_found`). This separates "SP said it failed" from "dealbot's downstream validation failed" in dashboards. ## HTTP API @@ -120,12 +120,12 @@ Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piec Metric definitions (including Prometheus metrics) live in [Dealbot Events & Metrics](./events-and-metrics.md). The metrics emitted by a pull check are: -- [`pullCheckRequestLatencyMs`](./events-and-metrics.md#pullCheckRequestLatencyMs) -- [`pullCheckFirstByteMs`](./events-and-metrics.md#pullCheckFirstByteMs) (only when the SP actually pulled from `/api/piece/{pieceCid}`) -- [`pullCheckCompletionLatencyMs`](./events-and-metrics.md#pullCheckCompletionLatencyMs) -- [`pullCheckThroughputBps`](./events-and-metrics.md#pullCheckThroughputBps) +- [`pullRequestAcknowledgementLatencyMs`](./events-and-metrics.md#pullRequestAcknowledgementLatencyMs) +- [`pullRequestStartedMs`](./events-and-metrics.md#pullRequestStartedMs) (only when the SP actually pulled from `/api/piece/{pieceCid}`) +- [`pullRequestCompletionLatencyMs`](./events-and-metrics.md#pullRequestCompletionLatencyMs) +- [`pullRequestProviderStatus`](./events-and-metrics.md#pullRequestProviderStatus) +- [`pullRequestThroughputBps`](./events-and-metrics.md#pullRequestThroughputBps) - [`pullCheckStatus`](./events-and-metrics.md#pullCheckStatus) -- [`pullCheckProviderStatus`](./events-and-metrics.md#pullCheckProviderStatus) ## Configuration diff --git 
a/docs/environment-variables.md b/docs/environment-variables.md index e12783d3..925276fb 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -996,7 +996,7 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 - **Default**: `10485760` (10 MiB) - **Minimum**: `1024` -**Role**: Size of the synthetic random piece dealbot generates per pull check. The same byte length is used to compute [`pullCheckThroughputBps`](./checks/events-and-metrics.md#pullCheckThroughputBps). +**Role**: Size of the synthetic random piece dealbot generates per pull check. The same byte length is used to compute [`pullRequestThroughputBps`](./checks/events-and-metrics.md#pullRequestThroughputBps). **When to update**: From c164add8473041bb837dce887409d58ae66b3740 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 15:35:29 +0530 Subject: [PATCH 36/44] feat: add pull_checks table to clickhouse --- .../src/clickhouse/clickhouse.schema.ts | 23 ++++++++++ .../src/pull-check/pull-check.service.spec.ts | 16 +++++++ .../src/pull-check/pull-check.service.ts | 42 +++++++++++++++---- 3 files changed, 74 insertions(+), 7 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 897d8c92..d3d4aee1 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -78,5 +78,28 @@ export function buildMigrations(database: string): string[] { PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + + `CREATE TABLE IF NOT EXISTS ${database}.pull_checks +( + timestamp DateTime64(3, 'UTC'), -- when the pull check terminated + probe_location LowCardinality(String), -- dealbot location + sp_address String, -- storage provider address + sp_id Nullable(UInt64), -- storage provider numeric id + sp_name Nullable(String), -- storage provider name + + piece_cid Nullable(String), -- piece 
CID of the synthetic test piece; null if preparation failed + piece_size_bytes Nullable(UInt64), -- size of the synthetic piece in bytes; null if preparation failed + + status LowCardinality(String), -- 'success' | 'failure.timedout' | 'failure.other' + provider_status LowCardinality(Nullable(String)), -- raw SP-reported terminal pull status (e.g. 'complete', 'failed'); null if the request was never acknowledged + + acknowledgement_latency_ms Nullable(Float64), -- time from pullPieces submission to SP acknowledgement (ms) + completion_latency_ms Nullable(Float64), -- time from pullPieces submission to terminal SP pull status (ms) + first_byte_ms Nullable(Float64), -- time from pullPieces submission to SP reading first byte of hosted piece (ms); null when SP served from cache or check failed before first byte + throughput_bps Nullable(Float64) -- approx bytes/sec = piece_size_bytes / completion_latency_ms * 1000; null on failure +) ENGINE MergeTree() + PRIMARY KEY (probe_location, sp_address, timestamp) + PARTITION BY toStartOfMonth(timestamp) + TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, ]; } diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index ec221f3e..d6280bda 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -2,6 +2,7 @@ import { Readable } from "node:stream"; import { ConfigService } from "@nestjs/config"; import { Test, type TestingModule } from "@nestjs/testing"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import type { IConfig } from "../config/app.config.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; import { HttpClientService } from "../http-client/http-client.service.js"; @@ -65,6 +66,7 @@ describe("PullCheckService", () => { observeThroughputBps: ReturnType; 
recordStatus: ReturnType; }; + let clickhouseServiceMock: { insert: ReturnType; probeLocation: string }; let configValues: Partial; beforeEach(async () => { @@ -93,6 +95,7 @@ describe("PullCheckService", () => { observeThroughputBps: vi.fn(), recordStatus: vi.fn(), }; + clickhouseServiceMock = { insert: vi.fn(), probeLocation: "test" }; configValues = { app: { host: "localhost", port: 3000, apiPublicUrl: "https://dealbot.example" } as IConfig["app"], @@ -121,6 +124,7 @@ describe("PullCheckService", () => { { provide: PullPieceRepository, useValue: registryMock }, { provide: PullCheckCheckMetrics, useValue: metricsMock }, { provide: HttpClientService, useValue: httpClientServiceMock }, + { provide: ClickhouseService, useValue: clickhouseServiceMock }, ], }).compile(); @@ -299,6 +303,18 @@ describe("PullCheckService", () => { // Cleanup ran (forget called) expect(registryMock.forget).toHaveBeenCalledWith(registration.pieceCid); + // ClickHouse row written with the check result. + expect(clickhouseServiceMock.insert).toHaveBeenCalledWith( + "pull_checks", + expect.objectContaining({ + probe_location: "test", + sp_address: "0xsp", + piece_cid: "bafk-test-piece", + piece_size_bytes: 1024, + status: "success", + provider_status: "complete", + }), + ); }); it("does not observe firstByte when the SP never read from /api/piece (cached pull)", async () => { diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index ab1202ca..8acc95f7 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -5,6 +5,7 @@ import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import type { Account, Address, Chain, Client, Transport } from "viem"; +import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, 
toStructuredError } from "../common/logging.js"; import type { IAppConfig, IConfig, IPullPieceConfig } from "../config/app.config.js"; import { DataSourceService } from "../dataSource/dataSource.service.js"; @@ -29,6 +30,7 @@ export class PullCheckService { private readonly pullPieceRepository: PullPieceRepository, private readonly pullCheckMetrics: PullCheckCheckMetrics, private readonly httpClientService: HttpClientService, + private readonly clickhouseService: ClickhouseService, ) {} /** @@ -78,6 +80,12 @@ export class PullCheckService { let prepared: PullPiecePrepared | null = null; let requestSubmittedAt: Date | null = null; + let requestLatencyMs: number | null = null; + let completionLatencyMs: number | null = null; + let firstByteMs: number | null = null; + let throughputBps: number | null = null; + let finalProviderStatus: string | null = null; + let checkStatus: string | null = null; try { signal?.throwIfAborted(); @@ -104,7 +112,7 @@ export class PullCheckService { await this.pullPieceRepository.markPullSubmitted(pieceCidStr, requestSubmittedAt); const pullResponse = await pullPieces(synapseClient, pullPiecesOptions); signal?.throwIfAborted(); - const requestLatencyMs = Date.now() - requestSubmittedAt.getTime(); + requestLatencyMs = Date.now() - requestSubmittedAt.getTime(); this.pullCheckMetrics.observeAcknowledgementLatencyMs(labels, requestLatencyMs); this.logger.log({ ...logContext, @@ -123,10 +131,11 @@ export class PullCheckService { pollInterval: pullPieceConfig.pullCheckPollIntervalSeconds * 1000, }); signal?.throwIfAborted(); - const completionLatencyMs = Date.now() - requestSubmittedAt.getTime(); + completionLatencyMs = Date.now() - requestSubmittedAt.getTime(); this.pullCheckMetrics.observeCompletionLatencyMs(labels, completionLatencyMs); // Record the SP-reported terminal pull status (one increment per check) - this.pullCheckMetrics.recordProviderStatus(labels, finalResponse.status); + finalProviderStatus = finalResponse.status; + 
this.pullCheckMetrics.recordProviderStatus(labels, finalProviderStatus); if (finalResponse.status !== "complete") { throw new Error(`Storage provider failed to pull piece: status=${finalResponse.status}`); @@ -139,7 +148,7 @@ export class PullCheckService { } const firstByteEntry = await this.pullPieceRepository.resolve(pieceCidStr); - const firstByteMs = + firstByteMs = firstByteEntry?.firstByteAt && firstByteEntry?.pullSubmittedAt ? firstByteEntry.firstByteAt.getTime() - firstByteEntry.pullSubmittedAt.getTime() : null; @@ -149,10 +158,11 @@ export class PullCheckService { // Throughput approximated as pieceSize / completionLatency. This is an // upper-bound on actual transfer time because completionLatency includes // SP-side scheduling/queuing and our polling cadence. - const throughputBps = Math.round((prepared.registration.size * 1000) / Math.max(completionLatencyMs, 1)); + throughputBps = Math.round((prepared.registration.size * 1000) / Math.max(completionLatencyMs ?? 1, 1)); this.pullCheckMetrics.observeThroughputBps(labels, throughputBps); - this.pullCheckMetrics.recordStatus(labels, "success"); + checkStatus = "success"; + this.pullCheckMetrics.recordStatus(labels, checkStatus); this.logger.log({ ...logContext, event: "pull_check_completed", @@ -165,7 +175,8 @@ export class PullCheckService { pieceSizeBytes: prepared.registration.size, }); } catch (error) { - this.pullCheckMetrics.recordStatus(labels, classifyFailureStatus(error)); + checkStatus = classifyFailureStatus(error); + this.pullCheckMetrics.recordStatus(labels, checkStatus); throw error; } finally { if (prepared) { @@ -180,6 +191,23 @@ export class PullCheckService { }); }); } + if (checkStatus !== null) { + this.clickhouseService.insert("pull_checks", { + timestamp: Date.now(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: providerInfo.id != null ? String(providerInfo.id) : null, + sp_name: providerInfo.name ?? 
null, + piece_cid: prepared?.registration.pieceCid ?? null, + piece_size_bytes: prepared?.registration.size ?? null, + status: checkStatus, + provider_status: finalProviderStatus, + acknowledgement_latency_ms: requestLatencyMs, + completion_latency_ms: completionLatencyMs, + first_byte_ms: firstByteMs, + throughput_bps: throughputBps, + }); + } } } From b1de0e594bdef70984e98f8a9a038d78a785e673 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 19:39:08 +0530 Subject: [PATCH 37/44] doc: remove pull check results storage note --- docs/checks/pull-check.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index 87d86243..896f9d75 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -14,8 +14,6 @@ The pull check answers a different question than the [Data Storage check](./data A successful pull check requires all [assertions in the table below](#what-gets-asserted) to pass. Failure occurs if any step fails or the job exceeds its max allowed time. Operational timeouts exist to prevent jobs from running indefinitely, but they are not quality assertions. -> **Where results live:** Pull check results are exported to Prometheus and structured logs only. They are **not** persisted in Postgres or written to ClickHouse. 
- ## What Gets Asserted Each pull check asserts the following for every SP: From 9b0513b57e02cf774bb339b13e6947bf1eead35f Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 20:59:52 +0530 Subject: [PATCH 38/44] chore: update first_byte_ms comment --- apps/backend/src/clickhouse/clickhouse.schema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index d3d4aee1..c727f8d6 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -95,7 +95,7 @@ export function buildMigrations(database: string): string[] { acknowledgement_latency_ms Nullable(Float64), -- time from pullPieces submission to SP acknowledgement (ms) completion_latency_ms Nullable(Float64), -- time from pullPieces submission to terminal SP pull status (ms) - first_byte_ms Nullable(Float64), -- time from pullPieces submission to SP reading first byte of hosted piece (ms); null when SP served from cache or check failed before first byte + first_byte_ms Nullable(Float64), -- time from pullPieces submission to SP reading first byte of hosted piece (ms); null when check failed before first byte throughput_bps Nullable(Float64) -- approx bytes/sec = piece_size_bytes / completion_latency_ms * 1000; null on failure ) ENGINE MergeTree() PRIMARY KEY (probe_location, sp_address, timestamp) From 34a92a605a47bcbd382e6acd5f25ce374078dcd4 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 21:10:52 +0530 Subject: [PATCH 39/44] doc: clarify metric description --- docs/checks/events-and-metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index a21aaef7..51d404d6 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -122,7 +122,7 @@ sequenceDiagram | `retrievalCheckMs` | Retrieval | 
Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `pullRequestAcknowledgementLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestAcknowledgedBySp`](#pullRequestAcknowledgedBySp) | Time from `pullPieces` submission to SP request acknowledgement. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | -| `pullRequestStartedMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestStartedBySp`](#pullRequestStartedBySp) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP serves the pull from a local cache and never fetches from dealbot. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | +| `pullRequestStartedMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestStartedBySp`](#pullRequestStartedBySp) | Time from `pullPieces` submission to the SP reading the first byte of `/api/piece/{pieceCid}`. Skipped (no observation) when the SP never fetches from dealbot. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts), [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | | `pullRequestCompletionLatencyMs` | Pull | [`pullRequestSubmittedToSp`](#pullRequestSubmittedToSp) | [`pullRequestIsTerminal`](#pullRequestIsTerminal) | Time from `pullPieces` submission to terminal SP pull status. Observed once on success and once on failure. 
| [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullRequestThroughputBps` | Pull | n/a | n/a | `(pieceSizeBytes / pullRequestCompletionLatencyMs) * 1000`. Upper-bound on actual transfer rate because `pullRequestCompletionLatencyMs` includes SP-side scheduling and dealbot's polling cadence. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | From 02f7ab9cdbd5f7d76a04f876af2985542962d1bb Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Mon, 11 May 2026 23:52:54 +0530 Subject: [PATCH 40/44] refactor: remove custom throttler guard --- .../src/pull-check/pull-piece-throttler.guard.ts | 10 ---------- apps/backend/src/pull-check/pull-piece.controller.ts | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 apps/backend/src/pull-check/pull-piece-throttler.guard.ts diff --git a/apps/backend/src/pull-check/pull-piece-throttler.guard.ts b/apps/backend/src/pull-check/pull-piece-throttler.guard.ts deleted file mode 100644 index d804a91f..00000000 --- a/apps/backend/src/pull-check/pull-piece-throttler.guard.ts +++ /dev/null @@ -1,10 +0,0 @@ -import { ExecutionContext, Injectable } from "@nestjs/common"; -import { ThrottlerGuard } from "@nestjs/throttler"; - -@Injectable() -export class PullPieceThrottlerGuard extends ThrottlerGuard { - protected async throwThrottlingException(context: ExecutionContext): Promise { - const res = context.switchToHttp().getResponse(); - res.status(429).setHeader("Retry-After", "60").send("Too many requests"); - } -} diff --git a/apps/backend/src/pull-check/pull-piece.controller.ts b/apps/backend/src/pull-check/pull-piece.controller.ts index fd70889b..93185c52 100644 --- a/apps/backend/src/pull-check/pull-piece.controller.ts +++ b/apps/backend/src/pull-check/pull-piece.controller.ts @@ -2,11 +2,11 @@ import { PassThrough } from "node:stream"; import { asPieceCID } from "@filoz/synapse-core/piece"; import { Controller, Get, Logger, NotFoundException, 
Param, Res, UseGuards } from "@nestjs/common"; import { ApiResponse, ApiTags } from "@nestjs/swagger"; +import { ThrottlerGuard } from "@nestjs/throttler"; import type { Response } from "express"; import { PullCheckService } from "./pull-check.service.js"; import { PullPieceRepository } from "./pull-piece.repository.js"; import { PullPieceStreamTracker } from "./pull-piece-stream-tracker.service.js"; -import { PullPieceThrottlerGuard } from "./pull-piece-throttler.guard.js"; /** * Serves the temporary pull-piece bytes that a storage provider must fetch @@ -26,7 +26,7 @@ export class PieceSourceController { ) {} @Get("piece/:pieceCid") - @UseGuards(PullPieceThrottlerGuard) + @UseGuards(ThrottlerGuard) @ApiResponse({ status: 200, description: "Raw piece bytes streamed to the caller" }) @ApiResponse({ status: 404, description: "No active pull piece exists for this pieceCid" }) @ApiResponse({ status: 503, description: "Server is at capacity or too many concurrent requests for this piece" }) From 65e8a3a26bc9ac28b571f355b1dfca0d3387a3f2 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Tue, 12 May 2026 11:19:13 +0530 Subject: [PATCH 41/44] feat: add expiresAt field to pull piece entity --- apps/backend/src/database/entities/pull-piece.entity.ts | 6 +++++- .../database/migrations/1776300000000-CreatePullPieces.ts | 3 ++- apps/backend/src/pull-check/pull-check.service.spec.ts | 1 + apps/backend/src/pull-check/pull-check.service.ts | 4 +++- apps/backend/src/pull-check/pull-check.types.ts | 1 + apps/backend/src/pull-check/pull-piece.repository.ts | 2 ++ 6 files changed, 14 insertions(+), 3 deletions(-) diff --git a/apps/backend/src/database/entities/pull-piece.entity.ts b/apps/backend/src/database/entities/pull-piece.entity.ts index a6122899..35c186a3 100644 --- a/apps/backend/src/database/entities/pull-piece.entity.ts +++ b/apps/backend/src/database/entities/pull-piece.entity.ts @@ -1,4 +1,4 @@ -import { Column, CreateDateColumn, Entity, PrimaryColumn } from "typeorm"; 
+import { Column, CreateDateColumn, Entity, Index, PrimaryColumn } from "typeorm"; /** * Persisted registration of a temporary pull piece served at @@ -27,6 +27,10 @@ export class PullPiece { @Column({ name: "first_byte_at", type: "timestamptz", nullable: true }) firstByteAt: Date | null; + @Index() + @Column({ name: "expires_at", type: "timestamptz" }) + expiresAt!: Date; + @CreateDateColumn({ name: "created_at", type: "timestamptz" }) createdAt!: Date; } diff --git a/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts b/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts index 24b64c0e..931e448c 100644 --- a/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts +++ b/apps/backend/src/database/migrations/1776300000000-CreatePullPieces.ts @@ -12,7 +12,8 @@ export class CreatePullPieces1776300000000 implements MigrationInterface { size INT NOT NULL, pull_submitted_at TIMESTAMPTZ, first_byte_at TIMESTAMPTZ, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + expires_at TIMESTAMPTZ NOT NULL ) `); } diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index ec221f3e..b1f2cd2e 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -173,6 +173,7 @@ describe("PullCheckService", () => { }); expect(prepared.registration.pieceCid).toBe("bafk-test-piece"); expect(prepared.registration.size).toBe(1024); + expect(prepared.registration.expiresAt).toBeInstanceOf(Date); expect(prepared.sourceUrl).toBe("https://dealbot.example/api/piece/bafk-test-piece"); expect(registryMock.register).toHaveBeenCalledWith(prepared.registration); }); diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index ab1202ca..bc720648 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ 
b/apps/backend/src/pull-check/pull-check.service.ts @@ -227,7 +227,8 @@ export class PullCheckService { * `/api/piece/:pieceCid` serving, and return the source URL plus registration. */ async preparePullPiece(providerAddress: string): Promise { - const targetSize = this.getPullPieceConfig().pullCheckPieceSizeBytes; + const pullPieceConfig = this.getPullPieceConfig(); + const targetSize = pullPieceConfig.pullCheckPieceSizeBytes; const key = crypto.randomBytes(16).toString("hex"); const dataStream = this.dataSourceService.generateBytesStream({ @@ -246,6 +247,7 @@ export class PullCheckService { providerAddress, key, size: targetSize, + expiresAt: new Date(Date.now() + pullPieceConfig.pullCheckJobTimeoutSeconds * 2 * 1000), }; await this.pullPieceRepository.register(registration); diff --git a/apps/backend/src/pull-check/pull-check.types.ts b/apps/backend/src/pull-check/pull-check.types.ts index 93617a30..9afffe6d 100644 --- a/apps/backend/src/pull-check/pull-check.types.ts +++ b/apps/backend/src/pull-check/pull-check.types.ts @@ -9,6 +9,7 @@ export type PullPieceRegistration = { size: number; pullSubmittedAt?: Date; firstByteAt?: Date; + expiresAt: Date; }; /** diff --git a/apps/backend/src/pull-check/pull-piece.repository.ts b/apps/backend/src/pull-check/pull-piece.repository.ts index 1af385be..4df61af2 100644 --- a/apps/backend/src/pull-check/pull-piece.repository.ts +++ b/apps/backend/src/pull-check/pull-piece.repository.ts @@ -28,6 +28,7 @@ export class PullPieceRepository { size: registration.size, pullSubmittedAt: null, firstByteAt: null, + expiresAt: registration.expiresAt, }, ["pieceCid"], ); @@ -88,6 +89,7 @@ export class PullPieceRepository { size: row.size, pullSubmittedAt: row.pullSubmittedAt ?? undefined, firstByteAt: row.firstByteAt ?? 
undefined, + expiresAt: row.expiresAt, }; } } From 1ba5cd63843d9b27df053fb9617a59c1d7e5d386 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Tue, 12 May 2026 11:54:43 +0530 Subject: [PATCH 42/44] refactor: stream piece validation and add size checks --- .../src/http-client/http-client.service.ts | 47 +++++++++++ .../src/pull-check/pull-check.service.spec.ts | 78 +++++++++++++------ .../src/pull-check/pull-check.service.ts | 55 +++++++++++-- 3 files changed, 152 insertions(+), 28 deletions(-) diff --git a/apps/backend/src/http-client/http-client.service.ts b/apps/backend/src/http-client/http-client.service.ts index 48e10e5c..47e9e4ee 100644 --- a/apps/backend/src/http-client/http-client.service.ts +++ b/apps/backend/src/http-client/http-client.service.ts @@ -1,3 +1,4 @@ +import { Readable } from "node:stream"; import { HttpService } from "@nestjs/axios"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; @@ -256,6 +257,52 @@ export class HttpClientService { return Buffer.from(JSON.stringify(data)); } + /** + * Make a streaming GET request via undici and return the response without + * buffering the body. The caller is responsible for consuming or destroying + * the returned `body` stream to free the underlying connection. + * + * Timeouts are decoupled across phases: `headersTimeout` bounds the + * connect/headers phase (short), while `bodyTimeout` bounds inactivity on + * the body stream (longer). 
+ */ + async requestStream( + url: string, + options: { signal?: AbortSignal } = {}, + ): Promise<{ statusCode: number; headers: Record; body: Readable }> { + try { + const response = await undiciRequest(url, { + method: "GET", + headers: { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" }, + signal: options.signal, + headersTimeout: this.connectTimeoutMs, + bodyTimeout: this.http1TimeoutMs, + }); + return { + statusCode: response.statusCode, + headers: response.headers as Record, + body: response.body, + }; + } catch (error) { + // Translate undici phase-specific timeouts into descriptive errors so + // callers can distinguish them from caller-initiated aborts in logs. + const code = (error as { code?: string } | null)?.code; + let normalized: unknown = error; + if (code === "UND_ERR_HEADERS_TIMEOUT") { + normalized = new Error(`Streaming request headers timed out after ${this.connectTimeoutMs}ms: ${url}`); + } else if (code === "UND_ERR_BODY_TIMEOUT") { + normalized = new Error(`Streaming request body timed out after ${this.http1TimeoutMs}ms: ${url}`); + } + this.logger.warn({ + event: "stream_request_failed", + message: "Streaming request failed", + url, + error: toStructuredError(normalized), + }); + throw normalized; + } + } + private buildHttp2Signals(parentSignal?: AbortSignal): { signal: AbortSignal; connectTimeoutSignal: AbortSignal; diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index b1f2cd2e..14fd50df 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -25,7 +25,7 @@ vi.mock("@filoz/synapse-core/sp", () => ({ waitForPullStatus: vi.fn(), })); -import { calculate } from "@filoz/synapse-core/piece"; +import { calculateFromIterable } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; function makeProvider(overrides: 
Partial = {}): PDPProviderEx { @@ -56,7 +56,7 @@ describe("PullCheckService", () => { markFirstByte: ReturnType; forget: ReturnType; }; - let httpClientServiceMock: { requestWithMetrics: ReturnType }; + let httpClientServiceMock: { requestWithMetrics: ReturnType; requestStream: ReturnType }; let metricsMock: { observeAcknowledgementLatencyMs: ReturnType; observeStartedMs: ReturnType; @@ -84,6 +84,7 @@ describe("PullCheckService", () => { }; httpClientServiceMock = { requestWithMetrics: vi.fn(), + requestStream: vi.fn(), }; metricsMock = { observeAcknowledgementLatencyMs: vi.fn(), @@ -190,51 +191,76 @@ describe("PullCheckService", () => { const provider = makeProvider(); const logContext = { jobId: "job-1", providerAddress: "0xsp", providerId: 42n, providerName: "test-sp" }; + function makeStreamResponse( + overrides: { statusCode?: number; headers?: Record; cidResult?: string } = {}, + ) { + const { statusCode = 200, headers = {}, cidResult = "bafk-test-piece" } = overrides; + httpClientServiceMock.requestStream.mockResolvedValue({ + statusCode, + headers: { "content-length": "1024", ...headers }, + body: Readable.from([Buffer.from("payload")]), + }); + if (cidResult !== "bafk-test-piece") { + vi.mocked(calculateFromIterable).mockResolvedValueOnce(cidResult as any); + } + } + it("returns true when the recomputed CID matches", async () => { - httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); - vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-test-piece" } as ReturnType); + makeStreamResponse(); - const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", 1024, logContext); expect(ok).toBe(true); - expect(httpClientServiceMock.requestWithMetrics).toHaveBeenCalledWith( + expect(httpClientServiceMock.requestStream).toHaveBeenCalledWith( "https://sp.example/piece/bafk-test-piece", 
expect.any(Object), ); }); it("returns false when the recomputed CID does not match", async () => { - httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); - vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-different" } as ReturnType); + makeStreamResponse({ cidResult: "bafk-different" }); + + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", 1024, logContext); + expect(ok).toBe(false); + }); - const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); + it("returns false when the SP returns a non-2xx status", async () => { + makeStreamResponse({ statusCode: 404 }); + + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", 1024, logContext); + expect(ok).toBe(false); + }); + + it("returns false when Content-Length does not match expected piece size", async () => { + makeStreamResponse({ headers: { "content-length": "9999" } }); + + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", 1024, logContext); expect(ok).toBe(false); }); it("returns false on transport errors (caller branches on the boolean to record a domain failure)", async () => { - httpClientServiceMock.requestWithMetrics.mockRejectedValue(new Error("ECONNRESET")); + httpClientServiceMock.requestStream.mockRejectedValue(new Error("ECONNRESET")); - const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); + const ok = await service.validateByDirectPieceFetch(provider, "bafk-test-piece", 1024, logContext); expect(ok).toBe(false); }); it("re-throws when the abort signal fires so cancellation is not masked as validation failure", async () => { const abort = new AbortController(); - httpClientServiceMock.requestWithMetrics.mockImplementation(async () => { + httpClientServiceMock.requestStream.mockImplementation(async () => { abort.abort(); throw new Error("aborted"); }); await expect( - 
service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext, abort.signal), + service.validateByDirectPieceFetch(provider, "bafk-test-piece", 1024, logContext, abort.signal), ).rejects.toThrow(); }); it("strips a trailing slash from the SP serviceURL when constructing the fetch URL", async () => { - httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); - vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-test-piece" } as ReturnType); + makeStreamResponse(); - await service.validateByDirectPieceFetch(provider, "bafk-test-piece", logContext); - expect(httpClientServiceMock.requestWithMetrics).toHaveBeenCalledWith( + await service.validateByDirectPieceFetch(provider, "bafk-test-piece", 1024, logContext); + expect(httpClientServiceMock.requestStream).toHaveBeenCalledWith( "https://sp.example/piece/bafk-test-piece", expect.any(Object), ); @@ -270,8 +296,11 @@ describe("PullCheckService", () => { } as unknown as Awaited>); // Direct-fetch validation succeeds. - httpClientServiceMock.requestWithMetrics.mockResolvedValue({ data: Buffer.from("payload") }); - vi.mocked(calculate).mockReturnValue({ toString: () => "bafk-test-piece" } as ReturnType); + httpClientServiceMock.requestStream.mockResolvedValue({ + statusCode: 200, + headers: { "content-length": "1024" }, + body: Readable.from([Buffer.from("payload")]), + }); return { registration }; } @@ -341,10 +370,13 @@ describe("PullCheckService", () => { it("re-throws and runs cleanup when the validation step fails", async () => { arrangeHappyPath(); - // Force validation mismatch by returning a different recomputed CID. - // preparePullPiece no longer calls calculate, it uses createPieceCIDStream. - // So the first call to calculate will be from validateByDirectPieceFetch. - vi.mocked(calculate).mockReturnValueOnce({ toString: () => "bafk-mismatch" } as ReturnType); + // Force validation mismatch. 
Both `preparePullPiece` and + // `validateByDirectPieceFetch` call `calculateFromIterable`, so chain + // two one-shot mocks: the first satisfies prepare with the canonical + // CID, the second makes the direct-fetch recompute disagree. + vi.mocked(calculateFromIterable) + .mockResolvedValueOnce("bafk-test-piece" as any) + .mockResolvedValueOnce("bafk-mismatch" as any); await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(/validation failed/); expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.other"); diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index bc720648..0343037b 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -1,6 +1,6 @@ import * as crypto from "node:crypto"; import { Readable } from "node:stream"; -import { calculate, calculateFromIterable, parse as parsePieceCid } from "@filoz/synapse-core/piece"; +import { calculateFromIterable, parse as parsePieceCid } from "@filoz/synapse-core/piece"; import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; @@ -132,7 +132,13 @@ export class PullCheckService { throw new Error(`Storage provider failed to pull piece: status=${finalResponse.status}`); } - const pieceValidated = await this.validateByDirectPieceFetch(providerInfo, pieceCidStr, logContext, signal); + const pieceValidated = await this.validateByDirectPieceFetch( + providerInfo, + pieceCidStr, + prepared.registration.size, + logContext, + signal, + ); signal?.throwIfAborted(); if (!pieceValidated) { throw new Error("Pull-check piece validation failed: SP did not serve the expected bytes"); @@ -193,15 +199,54 @@ export class PullCheckService { async validateByDirectPieceFetch( providerInfo: PDPProviderEx, pieceCid: string, + expectedSize: 
number, logContext: ProviderJobContext, signal?: AbortSignal, ): Promise { signal?.throwIfAborted(); const pieceFetchUrl = this.constructPieceFetchUrl(providerInfo.pdp.serviceURL, pieceCid); try { - const response = await this.httpClientService.requestWithMetrics(pieceFetchUrl, { signal }); - const calculatedPieceCid = calculate(response.data); - return calculatedPieceCid.toString() === pieceCid; + const response = await this.httpClientService.requestStream(pieceFetchUrl, { signal }); + + if (response.statusCode < 200 || response.statusCode >= 300) { + response.body.destroy(); + this.logger.warn({ + ...logContext, + event: "pull_check_direct_piece_fetch_failed", + message: "Direct piece fetch returned non-2xx status", + pieceCid, + pieceFetchUrl, + statusCode: response.statusCode, + }); + return false; + } + + const rawContentLength = response.headers["content-length"]; + const contentLengthHeader = Array.isArray(rawContentLength) ? rawContentLength[0] : rawContentLength; + if (contentLengthHeader !== undefined) { + const reportedSize = parseInt(contentLengthHeader, 10); + if (!Number.isNaN(reportedSize) && reportedSize !== expectedSize) { + response.body.destroy(); + this.logger.warn({ + ...logContext, + event: "pull_check_direct_piece_size_mismatch", + message: "Content-Length header does not match expected piece size", + pieceCid, + expectedSize, + reportedSize, + }); + return false; + } + } + + try { + const calculatedPieceCid = await calculateFromIterable(response.body); + return calculatedPieceCid.toString() === pieceCid; + } finally { + // Guarantee the underlying socket is released if `calculateFromIterable` + // throws partway (e.g. invalid framing) without fully draining the body. + if (!response.body.destroyed) response.body.destroy(); + } } catch (error) { // Re-throw aborts so the caller's lifecycle handles cancellation rather // than treating it as a validation failure. 
From 629be23b8e7996d1788a9d9fa94f1cd753398539 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 13 May 2026 12:40:12 +0530 Subject: [PATCH 43/44] chore: update waitForPullStatus -> waitForPullPieces --- .../src/pull-check/pull-check.service.spec.ts | 16 ++++++++-------- .../backend/src/pull-check/pull-check.service.ts | 6 +++--- docs/checks/events-and-metrics.md | 6 +++--- docs/checks/pull-check.md | 8 ++++---- docs/environment-variables.md | 2 +- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index 14fd50df..b0c2e719 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -22,11 +22,11 @@ vi.mock("@filoz/synapse-core/piece", () => ({ vi.mock("@filoz/synapse-core/sp", () => ({ pullPieces: vi.fn(), - waitForPullStatus: vi.fn(), + waitForPullPieces: vi.fn(), })); import { calculateFromIterable } from "@filoz/synapse-core/piece"; -import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; +import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; function makeProvider(overrides: Partial = {}): PDPProviderEx { return { @@ -290,10 +290,10 @@ describe("PullCheckService", () => { vi.mocked(pullPieces).mockResolvedValue({ status: "pending" } as unknown as Awaited< ReturnType >); - vi.mocked(waitForPullStatus).mockResolvedValue({ + vi.mocked(waitForPullPieces).mockResolvedValue({ status: "complete", pieces: [{ pieceCid: "bafk-test-piece", status: "complete" }], - } as unknown as Awaited>); + } as unknown as Awaited>); // Direct-fetch validation succeeds. 
httpClientServiceMock.requestStream.mockResolvedValue({ @@ -345,10 +345,10 @@ describe("PullCheckService", () => { it("re-throws and records failure.other when the SP terminal status is not 'complete'", async () => { arrangeHappyPath(); - vi.mocked(waitForPullStatus).mockResolvedValue({ + vi.mocked(waitForPullPieces).mockResolvedValue({ status: "failed", pieces: [], - } as unknown as Awaited>); + } as unknown as Awaited>); await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow( /Storage provider failed to pull piece/, @@ -362,7 +362,7 @@ describe("PullCheckService", () => { it("classifies timeouts as failure.timedout", async () => { arrangeHappyPath(); - vi.mocked(waitForPullStatus).mockRejectedValue(new Error("polling timed out after 300s")); + vi.mocked(waitForPullPieces).mockRejectedValue(new Error("polling timed out after 300s")); await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(); expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.timedout"); @@ -391,7 +391,7 @@ describe("PullCheckService", () => { await expect(service.runPullCheck("0xsp", controller.signal, logContext)).rejects.toThrow(); // No SP-side calls were issued. expect(pullPieces).not.toHaveBeenCalled(); - expect(waitForPullStatus).not.toHaveBeenCalled(); + expect(waitForPullPieces).not.toHaveBeenCalled(); // Failure is classified as timed out (abort message contains "timeout"). 
expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.timedout"); }); diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 949fb7c5..8a42e797 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -1,7 +1,7 @@ import * as crypto from "node:crypto"; import { Readable } from "node:stream"; import { calculateFromIterable, parse as parsePieceCid } from "@filoz/synapse-core/piece"; -import { pullPieces, waitForPullStatus } from "@filoz/synapse-core/sp"; +import { pullPieces, waitForPullPieces } from "@filoz/synapse-core/sp"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import type { Account, Address, Chain, Client, Transport } from "viem"; @@ -116,8 +116,8 @@ export class PullCheckService { }); const pullPieceConfig = this.getPullPieceConfig(); - // `waitForPullStatus` polls the SP repeatedly until a terminal pull status is reported - const finalResponse = await waitForPullStatus(synapseClient, { + // `waitForPullPieces` polls the SP repeatedly until a terminal pull status is reported + const finalResponse = await waitForPullPieces(synapseClient, { ...pullPiecesOptions, timeout: pullPieceConfig.pullCheckJobTimeoutSeconds * 1000, pollInterval: pullPieceConfig.pullCheckPollIntervalSeconds * 1000, diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index dfd8099a..da1d2468 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -72,7 +72,7 @@ sequenceDiagram Dealbot->>SP: pullRequestSubmittedToSp (pullPieces) SP-->>Dealbot: pullRequestAcknowledgedBySp SP-->>Dealbot: pullRequestStartedBySp - Dealbot->>SP: pullStatusPolled (waitForPullStatus, repeated) + Dealbot->>SP: pullStatusPolled (waitForPullPieces, repeated) SP-->>Dealbot: pullRequestIsTerminal Dealbot->>SP: directPieceFetchStarted 
(/piece/{cid}) SP-->>Dealbot: directPieceFetchCompleted @@ -86,7 +86,7 @@ sequenceDiagram | `pullRequestSubmittedToSp` | Dealbot calls Synapse pullPiece (`POST /pdp/piece/pull`) against the SP for the registered piece CID. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullRequestAcknowledgedBySp` | SP returns from `pullPieces` (success or non-terminal-failure). | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullRequestStartedBySp` | Dealbot receives SP request for `/api/piece/{pieceCid}` from dealbot. Recorded once per registration. | Yes | [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) | -| `pullRequestIsTerminal` | Dealbot determines the pull request is in terminal pull status (`complete`, `failed`, ...) via `waitForPullStatus` or the polling operation has timed out. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestIsTerminal` | Dealbot determines the pull request is in terminal pull status (`complete`, `failed`, ...) via `waitForPullPieces` or the polling operation has timed out. Intermediate poll statuses are not counted. | Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullRequestIntegrityChecked` | Dealbot performs direct `/piece/{pieceCid}` retrieval from the SP and confirms the bytes match the pieceCid. 
| Yes | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## Metrics @@ -144,7 +144,7 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `pullRequestProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullStatus`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. Use this to separate SP-side pull failures from dealbot-side validation failures. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | +| `pullRequestProviderStatus` | Pull | When the SP reports a terminal pull status via `waitForPullPieces`. Recorded exactly once per check (intermediate poll statuses are not counted). | Raw SP-reported pull status, for example `complete`, `failed`, `not_found`. 
Use this to separate SP-side pull failures from dealbot-side validation failures. | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | | `pullCheckStatus` | Pull | When the [Pull Check](./pull-check.md) terminates (success after direct piece validation, or any failure). Recorded exactly once per check. | `success`, `failure.timedout`, `failure.other`. Failure classification follows [`classifyFailureStatus`](../../apps/backend/src/metrics-prometheus/check-metric-labels.ts) (timeout-keyed errors → `failure.timedout`, everything else → `failure.other`). | [`pull-check.service.ts`](../../apps/backend/src/pull-check/pull-check.service.ts) | ## ClickHouse Tables diff --git a/docs/checks/pull-check.md b/docs/checks/pull-check.md index 87d86243..56c57fc7 100644 --- a/docs/checks/pull-check.md +++ b/docs/checks/pull-check.md @@ -23,7 +23,7 @@ Each pull check asserts the following for every SP: | # | Assertion | How It's Checked | Retries | Relevant Metric for Setting a Max Duration | Implemented? 
| |---|-----------|------------------|:---:|--------------------------------------------|:---:| | 1 | SP accepts the pull request | Synapse `pullPieces` (i.e., initial call to SP `POST /pdp/piece/pull`) returns without error and reports a non-terminal-failure status | 0 | [`pullRequestAcknowledgementLatencyMs`](./events-and-metrics.md#pullRequestAcknowledgementLatencyMs) | Yes | -| 2 | SP reaches a terminal `complete` pull status | Synapse `waitForPullStatus` polls the SP (using `POST /pdp/piece/pull`) until a terminal status is reported | Polling will continue until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) is reached | [`pullRequestCompletionLatencyMs`](./events-and-metrics.md#pullRequestCompletionLatencyMs) | Yes | +| 2 | SP reaches a terminal `complete` pull status | Synapse `waitForPullPieces` polls the SP (using `POST /pdp/piece/pull`) until a terminal status is reported | Polling will continue until [`PULL_CHECK_JOB_TIMEOUT_SECONDS`](../environment-variables.md#pull_check_job_timeout_seconds) is reached | [`pullRequestCompletionLatencyMs`](./events-and-metrics.md#pullRequestCompletionLatencyMs) | Yes | | 3 | SP serves the pulled piece via `/piece/{pieceCid}` | Re-fetch the bytes from the SP's PDP service URL and re-compute the piece CID | 0 | n/a (bounded by job timeout) | Yes | | 4 | All checks pass | Pull check is not marked successful until all assertions pass within the job timeout | n/a | [`pullRequestCompletionLatencyMs`](./events-and-metrics.md#pullRequestCompletionLatencyMs) | Yes | @@ -35,7 +35,7 @@ The dealbot scheduler triggers pull check jobs at a configurable rate (`PULL_CHE flowchart TD Generate["Compute PieceCID + register hosted source in Postgres
at /api/piece/{pieceCid}"] Generate --> Submit["Submit pullPieces request to SP"] - Submit --> |SP responds with HTTP 200|Poll["Poll SP via waitForPullStatus
until terminal pull status"] + Submit --> |SP responds with HTTP 200|Poll["Poll SP via waitForPullPieces
until terminal pull status"] Submit --> |SP doesn't respond with HTTP 200| Fail["Mark pull check failed"] Poll -->|complete| Validate["Direct /piece/{pieceCid} fetch from SP
+ recompute pieceCid"] Poll -->|other terminal status| Fail @@ -71,13 +71,13 @@ Source: [`pull-check.service.ts` (`runPullCheck`)](../../apps/backend/src/pull-c When dealbot receives the SP request for `/api/piece/{pieceCid}` for the first time, dealbot stamps a first-byte timestamp on the registration. This is the basis for [`pullRequestStartedMs`](./events-and-metrics.md#pullRequestStartedMs). -In parallel, dealbot`waitForPullStatus` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullRequestProviderStatus`](./events-and-metrics.md#pullRequestProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. +In parallel, dealbot`waitForPullPieces` polls the SP at `PULL_CHECK_POLL_INTERVAL_SECONDS` until the SP reports a terminal status (`complete` or `failed`) or the job timeout fires. Dealbot increments the [`pullRequestProviderStatus`](./events-and-metrics.md#pullRequestProviderStatus) counter exactly once with the **terminal** status; intermediate poll statuses are not counted. Source: [`pull-piece.controller.ts`](../../apps/backend/src/pull-check/pull-piece.controller.ts) ### 4. Direct piece-fetch validation -After `waitForPullStatus`, dealbot fetches `{serviceURL}/piece/{pieceCid}` from the SP, re-computes the piece CID over the response body, and compares it against the expected CID. A mismatch fails the pull check with `failure.other`. A network or HTTP error during validation also fails the check (transport errors are intentionally not retried). +After `waitForPullPieces`, dealbot fetches `{serviceURL}/piece/{pieceCid}` from the SP, re-computes the piece CID over the response body, and compares it against the expected CID. A mismatch fails the pull check with `failure.other`. A network or HTTP error during validation also fails the check (transport errors are intentionally not retried). 
Aborts (job timeout) propagate as throws and are classified as `failure.timedout` rather than as a validation failure. diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 925276fb..95596713 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -980,7 +980,7 @@ PULL_CHECKS_PER_SP_PER_HOUR=0.083 - **Default**: `2` - **Minimum**: `1` -**Role**: Polling interval used by `waitForPullStatus` while waiting for the SP to report a terminal pull status (`complete` or `failed`). +**Role**: Polling interval used by `waitForPullPieces` while waiting for the SP to report a terminal pull status (`complete` or `failed`). **When to update**: From f6b93c6c7355850fb5b95913dd4e31cfd03eb253 Mon Sep 17 00:00:00 2001 From: silent-cipher Date: Wed, 13 May 2026 12:49:36 +0530 Subject: [PATCH 44/44] chore: address pr comments --- .../src/clickhouse/clickhouse.schema.ts | 2 +- .../src/pull-check/pull-check.service.spec.ts | 36 +++++++++++++++++++ .../src/pull-check/pull-check.service.ts | 22 ++++++------ 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 2c760eb3..7afbab57 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -91,7 +91,7 @@ export function buildMigrations(database: string): string[] { piece_size_bytes Nullable(UInt64), -- size of the synthetic piece in bytes; null if preparation failed status LowCardinality(String), -- 'success' | 'failure.timedout' | 'failure.other' - provider_status LowCardinality(Nullable(String)), -- raw SP-reported terminal pull status (e.g. 'complete', 'failed'); null if the request was never acknowledged + provider_status LowCardinality(Nullable(String)), -- raw SP-reported terminal pull status (e.g. 
'complete', 'failed'); null if the request was never acknowledged or if waiting for pull status errored or timed out acknowledgement_latency_ms Nullable(Float64), -- time from pullPieces submission to SP acknowledgement (ms) completion_latency_ms Nullable(Float64), -- time from pullPieces submission to terminal SP pull status (ms) diff --git a/apps/backend/src/pull-check/pull-check.service.spec.ts b/apps/backend/src/pull-check/pull-check.service.spec.ts index 1ad47d75..d5e392ba 100644 --- a/apps/backend/src/pull-check/pull-check.service.spec.ts +++ b/apps/backend/src/pull-check/pull-check.service.spec.ts @@ -374,6 +374,15 @@ describe("PullCheckService", () => { expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.other"); // Cleanup still runs in the finally block. expect(registryMock.forget).toHaveBeenCalled(); + // ClickHouse row written with the failure outcome. + expect(clickhouseServiceMock.insert).toHaveBeenCalledWith( + "pull_checks", + expect.objectContaining({ + sp_address: "0xsp", + status: "failure.other", + provider_status: "failed", + }), + ); }); it("classifies timeouts as failure.timedout", async () => { @@ -382,6 +391,13 @@ describe("PullCheckService", () => { await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(); expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.timedout"); + expect(clickhouseServiceMock.insert).toHaveBeenCalledWith( + "pull_checks", + expect.objectContaining({ + sp_address: "0xsp", + status: "failure.timedout", + }), + ); }); it("re-throws and runs cleanup when the validation step fails", async () => { @@ -419,6 +435,26 @@ describe("PullCheckService", () => { await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(/Synapse client unavailable/); expect(metricsMock.recordStatus).toHaveBeenLastCalledWith(expect.any(Object), "failure.other"); }); + + it("writes a ClickHouse row with null sp fields when the 
provider is unknown", async () => { + walletSdkServiceMock.getProviderInfo.mockReturnValue(undefined); + + await expect(service.runPullCheck("0xsp", undefined, logContext)).rejects.toThrow(/not found/); + // No metrics recorded (labels could not be built). + expect(metricsMock.recordStatus).not.toHaveBeenCalled(); + // ClickHouse row still written: sp_address is always available, + // sp_id and sp_name are null since providerInfo was never resolved. + expect(clickhouseServiceMock.insert).toHaveBeenCalledWith( + "pull_checks", + expect.objectContaining({ + sp_address: "0xsp", + sp_id: null, + sp_name: null, + piece_cid: null, + status: "failure.other", + }), + ); + }); }); describe("openPullPieceStream", () => { diff --git a/apps/backend/src/pull-check/pull-check.service.ts b/apps/backend/src/pull-check/pull-check.service.ts index 1d2f773c..b8e01b3d 100644 --- a/apps/backend/src/pull-check/pull-check.service.ts +++ b/apps/backend/src/pull-check/pull-check.service.ts @@ -70,13 +70,8 @@ export class PullCheckService { signal: AbortSignal | undefined, logContext: ProviderJobContext, ): Promise { - const providerInfo = this.validateProviderInfo(spAddress); - const labels = buildCheckMetricLabels({ - checkType: "pullCheck", - providerId: providerInfo.id, - providerName: providerInfo.name, - providerIsApproved: providerInfo.isApproved, - }); + let providerInfo: PDPProviderEx | null = null; + let labels: ReturnType | null = null; let prepared: PullPiecePrepared | null = null; let requestSubmittedAt: Date | null = null; @@ -88,6 +83,13 @@ export class PullCheckService { let checkStatus: string | null = null; try { + providerInfo = this.validateProviderInfo(spAddress); + labels = buildCheckMetricLabels({ + checkType: "pullCheck", + providerId: providerInfo.id, + providerName: providerInfo.name, + providerIsApproved: providerInfo.isApproved, + }); signal?.throwIfAborted(); prepared = await this.preparePullPiece(spAddress); const pieceCidStr = prepared.registration.pieceCid; 
@@ -182,7 +184,7 @@ export class PullCheckService { }); } catch (error) { checkStatus = classifyFailureStatus(error); - this.pullCheckMetrics.recordStatus(labels, checkStatus); + if (labels !== null) this.pullCheckMetrics.recordStatus(labels, checkStatus); throw error; } finally { if (prepared) { @@ -202,8 +204,8 @@ export class PullCheckService { timestamp: Date.now(), probe_location: this.clickhouseService.probeLocation, sp_address: spAddress, - sp_id: providerInfo.id != null ? String(providerInfo.id) : null, - sp_name: providerInfo.name ?? null, + sp_id: providerInfo?.id != null ? String(providerInfo.id) : null, + sp_name: providerInfo?.name ?? null, piece_cid: prepared?.registration.pieceCid ?? null, piece_size_bytes: prepared?.registration.size ?? null, status: checkStatus,