From e2b22d2cf9eca6701a8dda454aeafaf45d1bc0f1 Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 21:11:20 +0000 Subject: [PATCH 1/4] initial structure for series --- content/build/guides/meta.json | 3 +- .../verifiable-ai/immutable-trust-layer.mdx | 47 +++++++++++++++++ content/build/guides/verifiable-ai/index.mdx | 52 +++++++++++++++++++ content/build/guides/verifiable-ai/meta.json | 9 ++++ .../verifiable-ai/signed-model-registry.mdx | 39 ++++++++++++++ .../verifiable-ai/verifiable-dataset.mdx | 37 +++++++++++++ 6 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 content/build/guides/verifiable-ai/immutable-trust-layer.mdx create mode 100644 content/build/guides/verifiable-ai/index.mdx create mode 100644 content/build/guides/verifiable-ai/meta.json create mode 100644 content/build/guides/verifiable-ai/signed-model-registry.mdx create mode 100644 content/build/guides/verifiable-ai/verifiable-dataset.mdx diff --git a/content/build/guides/meta.json b/content/build/guides/meta.json index 5eb8092fa..1d52d2ebe 100644 --- a/content/build/guides/meta.json +++ b/content/build/guides/meta.json @@ -9,6 +9,7 @@ "crossmint-nft-minting-app", "working-with-arns", "using-turbo-in-a-browser", - "storing-nfts" + "storing-nfts", + "verifiable-ai" ] } diff --git a/content/build/guides/verifiable-ai/immutable-trust-layer.mdx b/content/build/guides/verifiable-ai/immutable-trust-layer.mdx new file mode 100644 index 000000000..652d7c3c1 --- /dev/null +++ b/content/build/guides/verifiable-ai/immutable-trust-layer.mdx @@ -0,0 +1,47 @@ +--- +title: "The Immutable Trust Layer" +description: "Implement a Lambda Architecture for AI logging that creates tamper-proof audit trails for algorithmic liability" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Output and Liability Verification for AI Systems + +Learn how to implement a Lambda Architecture for AI logging that streams encrypted evidence and builds analytics indices for complete algorithmic accountability. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) +- Completed [The Verifiable Dataset](/build/guides/verifiable-ai/verifiable-dataset) guide +- Completed [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) guide + +## Overview + +This guide covers: + +- Implementing the Speed Layer for real-time evidence streaming +- Building the Batch Layer with Parquet indices for analytics +- Creating tamper-proof audit trails +- Encrypting sensitive AI outputs +- Querying historical AI decisions +- Establishing algorithmic liability frameworks + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Summary + +By completing this guide series, you've built a complete verifiable AI infrastructure with: + +- **Verifiable Datasets** ensuring input integrity +- **Signed Model Registry** preventing process drift +- **Immutable Trust Layer** providing output accountability + +Your AI systems now have cryptographic proof of their decisions, creating a foundation for enterprise trust and regulatory compliance. 
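The full Trust Layer walkthrough is still to come, but as a preview of the Speed Layer described in the overview above, a minimal sketch might look like the following. It is illustrative only: the `EvidenceRecord` shape and the `streamEvidence` function name are assumptions, the AES-256-GCM envelope is one possible encryption choice, and the upload call simply mirrors the Turbo pattern used in the dataset guide.

```typescript
import { TurboFactory } from '@ardrive/turbo-sdk';
import * as crypto from 'crypto';
import * as fs from 'fs';

// Hypothetical evidence record shape, for illustration only
interface EvidenceRecord {
  modelTxId: string;       // Arweave ID of the signed model version used
  datasetAnchorId: string; // Arweave ID of the dataset anchor it was trained on
  input: unknown;
  output: unknown;
  timestamp: number;
}

// Sketch of the Speed Layer: encrypt one AI decision and stream it to Arweave
export async function streamEvidence(record: EvidenceRecord, aesKey: Buffer) {
  // aesKey must be 32 bytes for AES-256-GCM
  const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8'));
  const turbo = TurboFactory.authenticated({ privateKey: jwk, token: 'arweave' });

  // Encrypt the payload so only key holders can read the sensitive output
  const iv = crypto.randomBytes(12);
  const cipher = crypto.createCipheriv('aes-256-gcm', aesKey, iv);
  const ciphertext = Buffer.concat([
    cipher.update(JSON.stringify(record), 'utf8'),
    cipher.final(),
  ]);
  // Store IV + auth tag alongside the ciphertext so it can be decrypted later
  const payload = Buffer.concat([iv, cipher.getAuthTag(), ciphertext]);

  // Upload the sealed evidence; tags make it discoverable by the Batch Layer
  const upload = await turbo.uploadFile({
    fileStreamFactory: () => payload,
    fileSizeFactory: () => payload.byteLength,
    dataItemOpts: {
      tags: [
        { name: 'Content-Type', value: 'application/octet-stream' },
        { name: 'Type', value: 'AI-Evidence' },
        { name: 'Model-Tx', value: record.modelTxId },
      ],
    },
  });

  console.log(`Evidence sealed: ar://${upload.id}`);
  return upload.id;
}
```

Because each record is signed and timestamped at upload, the resulting trail cannot be quietly rewritten after an incident, which is the property the liability framework in this guide builds on.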
diff --git a/content/build/guides/verifiable-ai/index.mdx b/content/build/guides/verifiable-ai/index.mdx new file mode 100644 index 000000000..b0a03421f --- /dev/null +++ b/content/build/guides/verifiable-ai/index.mdx @@ -0,0 +1,52 @@ +--- +title: "Verifiable AI with AR.IO Network" +description: "Build production-grade verifiable AI systems with immutable data provenance, signed model registries, and tamper-proof audit trails" +--- + +import { + Database, + Shield, + FileCheck, + Lock, +} from "lucide-react"; + +## From Black Box to Glass Box: The Verifiable AI Stack + +The challenge with Enterprise AI is not just performance, but **provenance**. Standard cloud storage is mutable, making it difficult to prove exactly which dataset trained a model or what precise state an AI agent was in during a specific incident. + +To solve **Algorithmic Liability**, AI systems require an immutable root of trust. + +AR.IO Network facilitates this by enabling a **"Glass Box"** architecture: + +- **Verifiable Datasets**: Prove the integrity of training data, whether it lives on S3 or directly on Arweave. +- **Signed Model Registries**: Prevent model drift by verifying weights against on-chain proofs before inference starts. +- **The Trust Layer**: A "Lambda Architecture" for logging that streams encrypted evidence for liability (Speed Layer) and builds Parquet indices for analytics (Batch Layer). + +## What You'll Learn + +In this guide series, you will build a production-grade **Verifiable AI Stack** using TypeScript and the Turbo SDK. + + + } + /> + } + /> + } + /> + + +Each guide builds on the last, creating a complete verifiable AI infrastructure by the end of the series. + +Let's get started. diff --git a/content/build/guides/verifiable-ai/meta.json b/content/build/guides/verifiable-ai/meta.json new file mode 100644 index 000000000..28b5e63c9 --- /dev/null +++ b/content/build/guides/verifiable-ai/meta.json @@ -0,0 +1,9 @@ +{ + "title": "Verifiable AI", + "defaultOpen": false, + "pages": [ + "verifiable-dataset", + "signed-model-registry", + "immutable-trust-layer" + ] +} diff --git a/content/build/guides/verifiable-ai/signed-model-registry.mdx b/content/build/guides/verifiable-ai/signed-model-registry.mdx new file mode 100644 index 000000000..9331a6396 --- /dev/null +++ b/content/build/guides/verifiable-ai/signed-model-registry.mdx @@ -0,0 +1,39 @@ +--- +title: "The Signed Model Registry" +description: "Build a registry that prevents model drift by verifying weights against on-chain proofs before inference starts" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Process Verification for AI Models + +Learn how to build a signed model registry that prevents model drift by verifying weights against on-chain proofs before inference starts. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. 
See [Turbo Credits guide](/build/upload/turbo-credits) +- Completed [The Verifiable Dataset](/build/guides/verifiable-ai/verifiable-dataset) guide + +## Overview + +This guide covers: + +- Creating cryptographic signatures for model weights +- Storing model metadata on Arweave +- Verifying model integrity before inference +- Implementing a model registry service +- Preventing model drift and tampering + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Next Steps + +After completing this guide, proceed to [The Immutable Trust Layer](/build/guides/verifiable-ai/immutable-trust-layer) to learn how to create tamper-proof audit trails. diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx new file mode 100644 index 000000000..ea56b0495 --- /dev/null +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -0,0 +1,37 @@ +--- +title: "The Verifiable Dataset" +description: "Create tamper-proof datasets with cryptographic proofs, ensuring data integrity from S3 to Arweave for AI training" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Input Verification for AI Systems + +Learn how to create verifiable datasets that prove the integrity of training data, whether stored on S3 or directly on Arweave. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) + +## Overview + +This guide covers: + +- Creating cryptographic proofs for datasets +- Uploading dataset manifests to Arweave +- Verifying dataset integrity +- Integrating with existing S3 workflows + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Next Steps + +After completing this guide, proceed to [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) to learn how to prevent model drift. From cfd271c96c1341862a8bf35b572c721e65de7e63 Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 21:19:46 +0000 Subject: [PATCH 2/4] initial draft for dataset article --- .../verifiable-ai/verifiable-dataset.mdx | 266 +++++++++++++++++- 1 file changed, 255 insertions(+), 11 deletions(-) diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx index ea56b0495..0bf88b541 100644 --- a/content/build/guides/verifiable-ai/verifiable-dataset.mdx +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -6,32 +6,276 @@ description: "Create tamper-proof datasets with cryptographic proofs, ensuring d import { Callout } from "fumadocs-ui/components/callout"; import { Steps, Step } from "fumadocs-ui/components/steps"; -## Input Verification for AI Systems +Training data is the foundation of any AI model. To ensure provenance, you must be able to prove exactly what data was used to train a specific model version. -Learn how to create verifiable datasets that prove the integrity of training data, whether stored on S3 or directly on Arweave. +We provide two patterns depending on your data size and requirements: + +- **The Holographic Anchor**: Best for massive data (TB/PB) stored on S3. 
+- **The Native Data Lake**: Best for high-value data (under 1TB) stored directly on Arweave with a Parquet index. ## Prerequisites Before starting, ensure you have: - **Node.js** (v18 or higher) -- **TypeScript** knowledge - **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) - **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) +- **TypeScript** knowledge + +### Install Dependencies + +```bash +npm install @ardrive/turbo-sdk parquets arweave dotenv +npm install --save-dev @types/node +``` + +## Pattern A: The Holographic Anchor (Off-Chain) + +Use this when your dataset is petabyte-scale or must reside in a specific jurisdiction (GDPR). + +With this pattern, we do not upload the actual file. We upload a **cryptographic fingerprint**. + + + +### Generate Cryptographic Proof + +Create a file `anchor-dataset.ts` to generate a SHA-256 hash of your dataset: + +```typescript +import { TurboFactory } from '@ardrive/turbo-sdk'; +import * as fs from 'fs'; +import * as crypto from 'crypto'; +import { pipeline } from 'stream/promises'; + +const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); +const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' +}); + +export async function createHolographicAnchor(filePath: string, s3Url: string) { + console.log("1. Generating Cryptographic Proof..."); + + // Hash stream (Efficient for large files, low RAM usage) + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(filePath); + await pipeline(fileStream, hash); + const fingerprint = hash.digest('hex'); + + // 2. Prepare the Anchor Payload + const anchor = { + type: 'dataset_anchor', + storage: 's3', + url: s3Url, + sha256: fingerprint, // The mathematical truth + size: fs.statSync(filePath).size, + timestamp: Date.now() + }; + + // 3. Upload Metadata Only + const upload = await turbo.uploadFile({ + fileStreamFactory: () => Buffer.from(JSON.stringify(anchor)), + fileSizeFactory: () => Buffer.byteLength(JSON.stringify(anchor)), + dataItemOpts: { + tags: [ + { name: 'Content-Type', value: 'application/json' }, + { name: 'Type', value: 'Dataset-Anchor' } + ] + } + }); + + console.log(`āš“ Holographic Anchor Minted: ar://${upload.id}`); + return upload.id; +} +``` + + +This approach is memory-efficient for large files. The stream hashing means you can verify petabyte-scale datasets without loading them entirely into RAM. + + + + +### Use the Anchor + +The anchor transaction ID serves as an immutable proof that: +1. A specific dataset existed at a specific time +2. The dataset had a specific SHA-256 hash +3. The dataset was stored at a specific S3 URL + +Anyone can verify the dataset hasn't changed by re-hashing the S3 file and comparing it to the on-chain fingerprint. + +```typescript +// Verify a dataset matches its anchor +async function verifyDataset(anchorId: string, localFilePath: string) { + // 1. Fetch the anchor from Arweave + const anchorData = await fetch(`https://ar-io.net/${anchorId}`); + const anchor = await anchorData.json(); + + // 2. Hash the local file + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(localFilePath); + await pipeline(fileStream, hash); + const localFingerprint = hash.digest('hex'); + + // 3. Compare + if (localFingerprint === anchor.sha256) { + console.log('āœ… Dataset verified! Matches on-chain anchor.'); + return true; + } else { + console.log('āŒ Dataset verification failed! 
File has been modified.'); + return false; + } +} +``` + + + + +**Important**: The Holographic Anchor proves a dataset existed with a specific hash, but doesn't make the data itself permanent. For true permanence, use Pattern B. + -## Overview +## Pattern B: The Native Data Lake (On-Chain) -This guide covers: +Use this for fine-tuning sets, RAG Knowledge Bases, or benchmarks where you want both the data and its index permanently stored. -- Creating cryptographic proofs for datasets -- Uploading dataset manifests to Arweave -- Verifying dataset integrity -- Integrating with existing S3 workflows +We upload the raw files to Arweave and generate a Parquet Index. This allows training scripts to filter data (e.g., "Give me only train split images") without downloading the entire dataset manifest. + + + +### Upload Files and Build Index + +Create a file `upload-native-lake.ts`: + +```typescript +import { TurboFactory } from '@ardrive/turbo-sdk'; +import parquets from 'parquets'; +import * as fs from 'fs'; +import * as path from 'path'; + +// Schema: We verify NOT just the ID, but the content metadata too +const schema = new parquets.ParquetSchema({ + filename: { type: 'UTF8' }, + tx_id: { type: 'UTF8' }, // The Arweave Pointer + byte_size: { type: 'INT64' }, + dataset_split: { type: 'UTF8' }, // 'train' vs 'test' + label: { type: 'UTF8' } // e.g. 'pneumonia' +}); + +export async function uploadDatasetWithIndex(baseDir: string) { + const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); + const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' + }); + + const indexRows = []; + const files = fs.readdirSync(baseDir); + + console.log(`šŸš€ Processing ${files.length} files...`); + + // 1. Upload Files + for (const file of files) { + const filePath = path.join(baseDir, file); + const size = fs.statSync(filePath).size; + + // Logic to determine label/split from filename + const isTrain = file.startsWith('train'); + const label = file.includes('cat') ? 'cat' : 'dog'; + + const upload = await turbo.uploadFile({ + fileStreamFactory: () => fs.createReadStream(filePath), + fileSizeFactory: () => size, + dataItemOpts: { tags: [{ name: 'Content-Type', value: 'image/jpeg' }] } + }); + + // Add to Index (Don't just list it, describe it) + indexRows.push({ + filename: file, + tx_id: upload.id, + byte_size: size, + dataset_split: isTrain ? 'train' : 'test', + label: label + }); + + console.log(` āœ“ Uploaded: ${file}`); + } + + // 2. Write Parquet Index + const indexFile = 'dataset_manifest.parquet'; + const writer = await parquets.ParquetWriter.openFile(schema, indexFile); + for (const row of indexRows) await writer.appendRow(row); + await writer.close(); + + // 3. Upload the Index + const manifestUpload = await turbo.uploadFile({ + fileStreamFactory: () => fs.createReadStream(indexFile), + fileSizeFactory: () => fs.statSync(indexFile).size, + dataItemOpts: { + tags: [ + { name: 'Type', value: 'Dataset-Parquet-Manifest' }, + { name: 'Content-Type', value: 'application/octet-stream' } + ] + } + }); + + console.log(`\nšŸŽ‰ Data Lake Created!`); + console.log(`šŸ‘‰ Index ID: ar://${manifestUpload.id}`); + + return manifestUpload.id; +} +``` + + + +### Query the Index + +Training scripts can now query the Parquet index to fetch specific subsets: + +```typescript +import parquets from 'parquets'; + +async function getTrainingImages(manifestId: string) { + // 1. 
Download the Parquet index + const indexData = await fetch(`https://ar-io.net/${manifestId}`); + const buffer = await indexData.arrayBuffer(); + + // 2. Query for training split + const reader = await parquets.ParquetReader.openBuffer(Buffer.from(buffer)); + const cursor = reader.getCursor(); + + const trainingImages = []; + let record = null; + + while (record = await cursor.next()) { + if (record.dataset_split === 'train') { + trainingImages.push({ + url: `ar://${record.tx_id}`, + label: record.label, + size: record.byte_size + }); + } + } + + await reader.close(); + return trainingImages; +} +``` -Content for this guide is coming soon. Check back later for the complete walkthrough. +**Performance Tip**: The Parquet format allows efficient columnar queries, meaning you can filter millions of records without loading the entire dataset into memory. + + + +## Summary + +You now have two patterns for creating verifiable datasets: + +1. **Holographic Anchor**: For massive datasets that must stay on S3, create an immutable cryptographic fingerprint on Arweave. +2. **Native Data Lake**: For smaller, high-value datasets, store both the data and a queryable Parquet index permanently on Arweave. + +Both patterns provide cryptographic proof of exactly what data was used to train your AI models, solving the provenance problem for Enterprise AI. ## Next Steps -After completing this guide, proceed to [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) to learn how to prevent model drift. +Now that you have verifiable datasets, proceed to [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) to learn how to prevent model drift by verifying weights against on-chain proofs. From 5ddfd5180cbd301de18e72e16ae9539d2e769b93 Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 21:49:21 +0000 Subject: [PATCH 3/4] next draft before adding arns --- .../verifiable-ai/verifiable-dataset.mdx | 172 ++++++++++++++++-- 1 file changed, 153 insertions(+), 19 deletions(-) diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx index 0bf88b541..f82f8fb4b 100644 --- a/content/build/guides/verifiable-ai/verifiable-dataset.mdx +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -5,6 +5,7 @@ description: "Create tamper-proof datasets with cryptographic proofs, ensuring d import { Callout } from "fumadocs-ui/components/callout"; import { Steps, Step } from "fumadocs-ui/components/steps"; +import { Tabs, Tab } from "fumadocs-ui/components/tabs"; Training data is the foundation of any AI model. To ensure provenance, you must be able to prove exactly what data was used to train a specific model version. 
@@ -25,7 +26,7 @@ Before starting, ensure you have: ### Install Dependencies ```bash -npm install @ardrive/turbo-sdk parquets arweave dotenv +npm install @ardrive/turbo-sdk parquetjs @ar.io/wayfinder-core @ar.io/sdk npm install --save-dev @types/node ``` @@ -45,21 +46,26 @@ Create a file `anchor-dataset.ts` to generate a SHA-256 hash of your dataset: import { TurboFactory } from '@ardrive/turbo-sdk'; import * as fs from 'fs'; import * as crypto from 'crypto'; -import { pipeline } from 'stream/promises'; - -const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); -const turbo = TurboFactory.authenticated({ - privateKey: jwk, - token: 'arweave' -}); export async function createHolographicAnchor(filePath: string, s3Url: string) { + // Setup Turbo client + const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); + const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' + }); + console.log("1. Generating Cryptographic Proof..."); // Hash stream (Efficient for large files, low RAM usage) const hash = crypto.createHash('sha256'); const fileStream = fs.createReadStream(filePath); - await pipeline(fileStream, hash); + + // Stream the file through the hash + for await (const chunk of fileStream) { + hash.update(chunk); + } + const fingerprint = hash.digest('hex'); // 2. Prepare the Anchor Payload @@ -104,17 +110,27 @@ The anchor transaction ID serves as an immutable proof that: Anyone can verify the dataset hasn't changed by re-hashing the S3 file and comparing it to the on-chain fingerprint. + + + ```typescript -// Verify a dataset matches its anchor +import * as crypto from 'crypto'; +import * as fs from 'fs'; + +// Simple approach - faster to implement but single point of failure async function verifyDataset(anchorId: string, localFilePath: string) { // 1. Fetch the anchor from Arweave - const anchorData = await fetch(`https://ar-io.net/${anchorId}`); + const anchorData = await fetch(`https://arweave.net/${anchorId}`); const anchor = await anchorData.json(); // 2. Hash the local file const hash = crypto.createHash('sha256'); const fileStream = fs.createReadStream(localFilePath); - await pipeline(fileStream, hash); + + for await (const chunk of fileStream) { + hash.update(chunk); + } + const localFingerprint = hash.digest('hex'); // 3. Compare @@ -127,6 +143,63 @@ async function verifyDataset(anchorId: string, localFilePath: string) { } } ``` + + + + + +```typescript +import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPingRoutingStrategy, NetworkGatewaysProvider } from '@ar.io/wayfinder-core'; +import { ARIO } from '@ar.io/sdk'; +import * as crypto from 'crypto'; +import * as fs from 'fs'; + +// Production approach - preferred gateway with network fallback for resilience +// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data +async function verifyDataset(anchorId: string, localFilePath: string) { + // 1. Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways + const wayfinder = createWayfinderClient({ + ario: ARIO.mainnet(), + routingStrategy: new PreferredWithFallbackRoutingStrategy({ + preferredGateway: 'https://arweave.net', + fallbackStrategy: new FastestPingRoutingStrategy({ + timeoutMs: 1000, + gatewaysProvider: new NetworkGatewaysProvider({ + ario: ARIO.mainnet(), + sortBy: 'operatorStake', + limit: 10, + }), + }), + }), + }); + + // 2. 
Fetch the anchor from Arweave via Wayfinder + const anchorData = await wayfinder.request(`ar://${anchorId}`); + const anchor = await anchorData.json(); + + // 3. Hash the local file + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(localFilePath); + + for await (const chunk of fileStream) { + hash.update(chunk); + } + + const localFingerprint = hash.digest('hex'); + + // 4. Compare + if (localFingerprint === anchor.sha256) { + console.log('āœ… Dataset verified! Matches on-chain anchor.'); + return true; + } else { + console.log('āŒ Dataset verification failed! File has been modified.'); + return false; + } +} +``` + + + @@ -148,12 +221,12 @@ Create a file `upload-native-lake.ts`: ```typescript import { TurboFactory } from '@ardrive/turbo-sdk'; -import parquets from 'parquets'; +import * as parquet from 'parquetjs'; import * as fs from 'fs'; import * as path from 'path'; // Schema: We verify NOT just the ID, but the content metadata too -const schema = new parquets.ParquetSchema({ +const schema = new parquet.ParquetSchema({ filename: { type: 'UTF8' }, tx_id: { type: 'UTF8' }, // The Arweave Pointer byte_size: { type: 'INT64' }, @@ -178,7 +251,7 @@ export async function uploadDatasetWithIndex(baseDir: string) { const filePath = path.join(baseDir, file); const size = fs.statSync(filePath).size; - // Logic to determine label/split from filename + // Example logic to determine label/split from filename - customize for your dataset const isTrain = file.startsWith('train'); const label = file.includes('cat') ? 'cat' : 'dog'; @@ -202,7 +275,7 @@ export async function uploadDatasetWithIndex(baseDir: string) { // 2. Write Parquet Index const indexFile = 'dataset_manifest.parquet'; - const writer = await parquets.ParquetWriter.openFile(schema, indexFile); + const writer = await parquet.ParquetWriter.openFile(schema, indexFile); for (const row of indexRows) await writer.appendRow(row); await writer.close(); @@ -231,16 +304,20 @@ export async function uploadDatasetWithIndex(baseDir: string) { Training scripts can now query the Parquet index to fetch specific subsets: + + + ```typescript -import parquets from 'parquets'; +import * as parquet from 'parquetjs'; +// Simple approach - faster to implement but single point of failure async function getTrainingImages(manifestId: string) { // 1. Download the Parquet index - const indexData = await fetch(`https://ar-io.net/${manifestId}`); + const indexData = await fetch(`https://arweave.net/${manifestId}`); const buffer = await indexData.arrayBuffer(); // 2. Query for training split - const reader = await parquets.ParquetReader.openBuffer(Buffer.from(buffer)); + const reader = await parquet.ParquetReader.openBuffer(Buffer.from(buffer)); const cursor = reader.getCursor(); const trainingImages = []; @@ -261,6 +338,63 @@ async function getTrainingImages(manifestId: string) { } ``` + + + + +```typescript +import * as parquet from 'parquetjs'; +import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPingRoutingStrategy, NetworkGatewaysProvider } from '@ar.io/wayfinder-core'; +import { ARIO } from '@ar.io/sdk'; + +// Production approach - keeps your training pipeline operational even during gateway outages +// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data +async function getTrainingImages(manifestId: string) { + // 1. 
Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways + const wayfinder = createWayfinderClient({ + ario: ARIO.mainnet(), + routingStrategy: new PreferredWithFallbackRoutingStrategy({ + preferredGateway: 'https://arweave.net', + fallbackStrategy: new FastestPingRoutingStrategy({ + timeoutMs: 1000, + gatewaysProvider: new NetworkGatewaysProvider({ + ario: ARIO.mainnet(), + sortBy: 'operatorStake', + limit: 10, + }), + }), + }), + }); + + // 2. Download the Parquet index via Wayfinder + const indexData = await wayfinder.request(`ar://${manifestId}`); + const buffer = await indexData.arrayBuffer(); + + // 3. Query for training split + const reader = await parquet.ParquetReader.openBuffer(Buffer.from(buffer)); + const cursor = reader.getCursor(); + + const trainingImages = []; + let record = null; + + while (record = await cursor.next()) { + if (record.dataset_split === 'train') { + trainingImages.push({ + url: `ar://${record.tx_id}`, + label: record.label, + size: record.byte_size + }); + } + } + + await reader.close(); + return trainingImages; +} +``` + + + + **Performance Tip**: The Parquet format allows efficient columnar queries, meaning you can filter millions of records without loading the entire dataset into memory. From 94e1dca9e774f6ef42ac2d113a597e19e8bf137f Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 22:18:55 +0000 Subject: [PATCH 4/4] final version of verifiable ai dataset article --- .../verifiable-ai/verifiable-dataset.mdx | 128 ++++++++++++++++-- 1 file changed, 116 insertions(+), 12 deletions(-) diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx index f82f8fb4b..ebe387cc1 100644 --- a/content/build/guides/verifiable-ai/verifiable-dataset.mdx +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -101,7 +101,54 @@ This approach is memory-efficient for large files. The stream hashing means you -### Use the Anchor +### Addressing data with ArNS + +Instead of hardcoding anchor IDs, use ArNS to create a stable reference that always points to the latest anchor version and maintains a permanent version history. + +```typescript +import { ARIO, ANT } from '@ar.io/sdk'; + +async function associateAnchorWithArNS(anchorId: string, arnsName: string, version: string) { + // 1. Get the ANT contract for your ArNS name + // (Assumes you've already purchased/leased the ArNS name via https://arns.app) + const ario = ARIO.mainnet(); + const records = await ario.getArNSRecord({ name: arnsName }); + + if (!records) { + throw new Error(`ArNS name "${arnsName}" not found. Purchase it at https://arns.app first.`); + } + + // 2. Connect to the ANT contract + const ant = ANT.init({ + processId: records.processId, + signer: jwk, // Your Arweave wallet + }); + + // 3. Set the @ record to point to latest version + await ant.setRecord({ + undername: '@', + transactionId: anchorId, + ttlSeconds: 3600, // 1 hour cache + }); + + // 4. Set a versioned undername to permanently reference this version + await ant.setRecord({ + undername: version, // e.g., 'v1', 'v2', '2024-12' + transactionId: anchorId, + ttlSeconds: 3600, + }); + + console.log(` Latest: ar://${arnsName} → ${anchorId}`); + console.log(` Version: ar://${version}_${arnsName} → ${anchorId}`); + + return anchorId; +} +``` + + + + +### Verify the Dataset The anchor transaction ID serves as an immutable proof that: 1. 
A specific dataset existed at a specific time @@ -118,9 +165,12 @@ import * as crypto from 'crypto'; import * as fs from 'fs'; // Simple approach - faster to implement but single point of failure -async function verifyDataset(anchorId: string, localFilePath: string) { +// Query the latest version or a specific version with ArNS +async function verifyDataset(identifier: string, localFilePath: string) { // 1. Fetch the anchor from Arweave - const anchorData = await fetch(`https://arweave.net/${anchorId}`); + // Use ArNS name for latest version: ar://dataset-anchor + // Use versioned undername for specific version: ar://v1_dataset-anchor + const anchorData = await fetch(`https://arweave.net/${identifier}`); const anchor = await anchorData.json(); // 2. Hash the local file @@ -155,8 +205,8 @@ import * as crypto from 'crypto'; import * as fs from 'fs'; // Production approach - preferred gateway with network fallback for resilience -// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data -async function verifyDataset(anchorId: string, localFilePath: string) { +// Query the latest version or a specific version with ArNS +async function verifyDataset(identifier: string, localFilePath: string) { // 1. Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways const wayfinder = createWayfinderClient({ ario: ARIO.mainnet(), @@ -174,7 +224,9 @@ async function verifyDataset(anchorId: string, localFilePath: string) { }); // 2. Fetch the anchor from Arweave via Wayfinder - const anchorData = await wayfinder.request(`ar://${anchorId}`); + // Use ArNS name for latest version: ar://dataset-anchor + // Use versioned undername for specific version: ar://v1_dataset-anchor + const anchorData = await wayfinder.request(`ar://${identifier}`); const anchor = await anchorData.json(); // 3. Hash the local file @@ -299,6 +351,53 @@ export async function uploadDatasetWithIndex(baseDir: string) { ``` + +### Addressing data with ArNS + +Instead of hardcoding manifest IDs, use ArNS to create a stable reference that always points to the latest dataset version and maintains a permanent version history. + +```typescript +import { ARIO, ANT } from '@ar.io/sdk'; + +async function associateDatasetWithArNS(manifestId: string, arnsName: string, version: string) { + // 1. Get the ANT contract for your ArNS name + // (Assumes you've already purchased/leased the ArNS name via https://arns.app) + const ario = ARIO.mainnet(); + const records = await ario.getArNSRecord({ name: arnsName }); + + if (!records) { + throw new Error(`ArNS name "${arnsName}" not found. Purchase it at https://arns.app first.`); + } + + // 2. Connect to the ANT contract + const ant = ANT.init({ + processId: records.processId, + signer: jwk, // Your Arweave wallet + }); + + // 3. Set the @ record to point to latest version + await ant.setRecord({ + undername: '@', + transactionId: manifestId, + ttlSeconds: 3600, // 1 hour cache + }); + + // 4. 
Set a versioned undername to permanently reference this version + await ant.setRecord({ + undername: version, // e.g., 'v1', 'v2', '2024-12' + transactionId: manifestId, + ttlSeconds: 3600, + }); + + console.log(` Latest: ar://${arnsName} → ${manifestId}`); + console.log(` Version: ar://${version}_${arnsName} → ${manifestId}`); + + return manifestId; +} +``` + + + ### Query the Index @@ -311,9 +410,12 @@ Training scripts can now query the Parquet index to fetch specific subsets: import * as parquet from 'parquetjs'; // Simple approach - faster to implement but single point of failure -async function getTrainingImages(manifestId: string) { +// Query the latest version or a specific version with ArNS +async function getTrainingImages(identifier: string) { // 1. Download the Parquet index - const indexData = await fetch(`https://arweave.net/${manifestId}`); + // Use ArNS name for latest version: ar://medical-imaging + // Use versioned undername for specific version: ar://v1_medical-imaging + const indexData = await fetch(`https://arweave.net/${identifier}`); const buffer = await indexData.arrayBuffer(); // 2. Query for training split @@ -348,8 +450,8 @@ import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPin import { ARIO } from '@ar.io/sdk'; // Production approach - keeps your training pipeline operational even during gateway outages -// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data -async function getTrainingImages(manifestId: string) { +// Query the latest version or a specific version with ArNS +async function getTrainingImages(identifier: string) { // 1. Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways const wayfinder = createWayfinderClient({ ario: ARIO.mainnet(), @@ -366,8 +468,10 @@ async function getTrainingImages(manifestId: string) { }), }); - // 2. Download the Parquet index via Wayfinder - const indexData = await wayfinder.request(`ar://${manifestId}`); + // 2. Download the Parquet index + // Use ArNS name for latest version: ar://medical-imaging + // Use versioned undername for specific version: ar://v1_medical-imaging + const indexData = await wayfinder.request(`ar://${identifier}`); const buffer = await indexData.arrayBuffer(); // 3. Query for training split