From e2b22d2cf9eca6701a8dda454aeafaf45d1bc0f1 Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 21:11:20 +0000 Subject: [PATCH 1/4] initial structure for series --- content/build/guides/meta.json | 3 +- .../verifiable-ai/immutable-trust-layer.mdx | 47 +++++++++++++++++ content/build/guides/verifiable-ai/index.mdx | 52 +++++++++++++++++++ content/build/guides/verifiable-ai/meta.json | 9 ++++ .../verifiable-ai/signed-model-registry.mdx | 39 ++++++++++++++ .../verifiable-ai/verifiable-dataset.mdx | 37 +++++++++++++ 6 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 content/build/guides/verifiable-ai/immutable-trust-layer.mdx create mode 100644 content/build/guides/verifiable-ai/index.mdx create mode 100644 content/build/guides/verifiable-ai/meta.json create mode 100644 content/build/guides/verifiable-ai/signed-model-registry.mdx create mode 100644 content/build/guides/verifiable-ai/verifiable-dataset.mdx diff --git a/content/build/guides/meta.json b/content/build/guides/meta.json index 5eb8092fa..1d52d2ebe 100644 --- a/content/build/guides/meta.json +++ b/content/build/guides/meta.json @@ -9,6 +9,7 @@ "crossmint-nft-minting-app", "working-with-arns", "using-turbo-in-a-browser", - "storing-nfts" + "storing-nfts", + "verifiable-ai" ] } diff --git a/content/build/guides/verifiable-ai/immutable-trust-layer.mdx b/content/build/guides/verifiable-ai/immutable-trust-layer.mdx new file mode 100644 index 000000000..652d7c3c1 --- /dev/null +++ b/content/build/guides/verifiable-ai/immutable-trust-layer.mdx @@ -0,0 +1,47 @@ +--- +title: "The Immutable Trust Layer" +description: "Implement a Lambda Architecture for AI logging that creates tamper-proof audit trails for algorithmic liability" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Output and Liability Verification for AI Systems + +Learn how to implement a Lambda Architecture for AI logging that streams encrypted evidence and builds analytics indices for complete algorithmic accountability. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) +- Completed [The Verifiable Dataset](/build/guides/verifiable-ai/verifiable-dataset) guide +- Completed [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) guide + +## Overview + +This guide covers: + +- Implementing the Speed Layer for real-time evidence streaming +- Building the Batch Layer with Parquet indices for analytics +- Creating tamper-proof audit trails +- Encrypting sensitive AI outputs +- Querying historical AI decisions +- Establishing algorithmic liability frameworks + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Summary + +By completing this guide series, you've built a complete verifiable AI infrastructure with: + +- **Verifiable Datasets** ensuring input integrity +- **Signed Model Registry** preventing process drift +- **Immutable Trust Layer** providing output accountability + +Your AI systems now have cryptographic proof of their decisions, creating a foundation for enterprise trust and regulatory compliance. 
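The full Trust Layer walkthrough is still to come, but as a preview of the Speed Layer described in the overview above, a minimal sketch might look like the following. It is illustrative only: the `EvidenceRecord` shape and the `streamEvidence` function name are assumptions, the AES-256-GCM envelope is one possible encryption choice, and the upload call simply mirrors the Turbo pattern used in the dataset guide.

```typescript
import { TurboFactory } from '@ardrive/turbo-sdk';
import * as crypto from 'crypto';
import * as fs from 'fs';

// Hypothetical evidence record shape, for illustration only
interface EvidenceRecord {
  modelTxId: string;       // Arweave ID of the signed model version used
  datasetAnchorId: string; // Arweave ID of the dataset anchor it was trained on
  input: unknown;
  output: unknown;
  timestamp: number;
}

// Sketch of the Speed Layer: encrypt one AI decision and stream it to Arweave
export async function streamEvidence(record: EvidenceRecord, aesKey: Buffer) {
  // aesKey must be 32 bytes for AES-256-GCM
  const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8'));
  const turbo = TurboFactory.authenticated({ privateKey: jwk, token: 'arweave' });

  // Encrypt the payload so only key holders can read the sensitive output
  const iv = crypto.randomBytes(12);
  const cipher = crypto.createCipheriv('aes-256-gcm', aesKey, iv);
  const ciphertext = Buffer.concat([
    cipher.update(JSON.stringify(record), 'utf8'),
    cipher.final(),
  ]);
  // Store IV + auth tag alongside the ciphertext so it can be decrypted later
  const payload = Buffer.concat([iv, cipher.getAuthTag(), ciphertext]);

  // Upload the sealed evidence; tags make it discoverable by the Batch Layer
  const upload = await turbo.uploadFile({
    fileStreamFactory: () => payload,
    fileSizeFactory: () => payload.byteLength,
    dataItemOpts: {
      tags: [
        { name: 'Content-Type', value: 'application/octet-stream' },
        { name: 'Type', value: 'AI-Evidence' },
        { name: 'Model-Tx', value: record.modelTxId },
      ],
    },
  });

  console.log(`Evidence sealed: ar://${upload.id}`);
  return upload.id;
}
```

Because each record is signed and timestamped at upload, the resulting trail cannot be quietly rewritten after an incident, which is the property the liability framework in this guide builds on.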
diff --git a/content/build/guides/verifiable-ai/index.mdx b/content/build/guides/verifiable-ai/index.mdx new file mode 100644 index 000000000..b0a03421f --- /dev/null +++ b/content/build/guides/verifiable-ai/index.mdx @@ -0,0 +1,52 @@ +--- +title: "Verifiable AI with AR.IO Network" +description: "Build production-grade verifiable AI systems with immutable data provenance, signed model registries, and tamper-proof audit trails" +--- + +import { + Database, + Shield, + FileCheck, + Lock, +} from "lucide-react"; + +## From Black Box to Glass Box: The Verifiable AI Stack + +The challenge with Enterprise AI is not just performance, but **provenance**. Standard cloud storage is mutable, making it difficult to prove exactly which dataset trained a model or what precise state an AI agent was in during a specific incident. + +To solve **Algorithmic Liability**, AI systems require an immutable root of trust. + +AR.IO Network facilitates this by enabling a **"Glass Box"** architecture: + +- **Verifiable Datasets**: Prove the integrity of training data, whether it lives on S3 or directly on Arweave. +- **Signed Model Registries**: Prevent model drift by verifying weights against on-chain proofs before inference starts. +- **The Trust Layer**: A "Lambda Architecture" for logging that streams encrypted evidence for liability (Speed Layer) and builds Parquet indices for analytics (Batch Layer). + +## What You'll Learn + +In this guide series, you will build a production-grade **Verifiable AI Stack** using TypeScript and the Turbo SDK. + + + } + /> + } + /> + } + /> + + +Each guide builds on the last, creating a complete verifiable AI infrastructure by the end of the series. + +Let's get started. diff --git a/content/build/guides/verifiable-ai/meta.json b/content/build/guides/verifiable-ai/meta.json new file mode 100644 index 000000000..28b5e63c9 --- /dev/null +++ b/content/build/guides/verifiable-ai/meta.json @@ -0,0 +1,9 @@ +{ + "title": "Verifiable AI", + "defaultOpen": false, + "pages": [ + "verifiable-dataset", + "signed-model-registry", + "immutable-trust-layer" + ] +} diff --git a/content/build/guides/verifiable-ai/signed-model-registry.mdx b/content/build/guides/verifiable-ai/signed-model-registry.mdx new file mode 100644 index 000000000..9331a6396 --- /dev/null +++ b/content/build/guides/verifiable-ai/signed-model-registry.mdx @@ -0,0 +1,39 @@ +--- +title: "The Signed Model Registry" +description: "Build a registry that prevents model drift by verifying weights against on-chain proofs before inference starts" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Process Verification for AI Models + +Learn how to build a signed model registry that prevents model drift by verifying weights against on-chain proofs before inference starts. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. 
See [Turbo Credits guide](/build/upload/turbo-credits) +- Completed [The Verifiable Dataset](/build/guides/verifiable-ai/verifiable-dataset) guide + +## Overview + +This guide covers: + +- Creating cryptographic signatures for model weights +- Storing model metadata on Arweave +- Verifying model integrity before inference +- Implementing a model registry service +- Preventing model drift and tampering + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Next Steps + +After completing this guide, proceed to [The Immutable Trust Layer](/build/guides/verifiable-ai/immutable-trust-layer) to learn how to create tamper-proof audit trails. diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx new file mode 100644 index 000000000..ea56b0495 --- /dev/null +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -0,0 +1,37 @@ +--- +title: "The Verifiable Dataset" +description: "Create tamper-proof datasets with cryptographic proofs, ensuring data integrity from S3 to Arweave for AI training" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Input Verification for AI Systems + +Learn how to create verifiable datasets that prove the integrity of training data, whether stored on S3 or directly on Arweave. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) + +## Overview + +This guide covers: + +- Creating cryptographic proofs for datasets +- Uploading dataset manifests to Arweave +- Verifying dataset integrity +- Integrating with existing S3 workflows + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Next Steps + +After completing this guide, proceed to [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) to learn how to prevent model drift. From cfd271c96c1341862a8bf35b572c721e65de7e63 Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 21:19:46 +0000 Subject: [PATCH 2/4] initial draft for dataset article --- .../verifiable-ai/verifiable-dataset.mdx | 266 +++++++++++++++++- 1 file changed, 255 insertions(+), 11 deletions(-) diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx index ea56b0495..0bf88b541 100644 --- a/content/build/guides/verifiable-ai/verifiable-dataset.mdx +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -6,32 +6,276 @@ description: "Create tamper-proof datasets with cryptographic proofs, ensuring d import { Callout } from "fumadocs-ui/components/callout"; import { Steps, Step } from "fumadocs-ui/components/steps"; -## Input Verification for AI Systems +Training data is the foundation of any AI model. To ensure provenance, you must be able to prove exactly what data was used to train a specific model version. -Learn how to create verifiable datasets that prove the integrity of training data, whether stored on S3 or directly on Arweave. +We provide two patterns depending on your data size and requirements: + +- **The Holographic Anchor**: Best for massive data (TB/PB) stored on S3. 
+- **The Native Data Lake**: Best for high-value data (under 1TB) stored directly on Arweave with a Parquet index. ## Prerequisites Before starting, ensure you have: - **Node.js** (v18 or higher) -- **TypeScript** knowledge - **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) - **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) +- **TypeScript** knowledge + +### Install Dependencies + +```bash +npm install @ardrive/turbo-sdk parquets arweave dotenv +npm install --save-dev @types/node +``` + +## Pattern A: The Holographic Anchor (Off-Chain) + +Use this when your dataset is petabyte-scale or must reside in a specific jurisdiction (GDPR). + +With this pattern, we do not upload the actual file. We upload a **cryptographic fingerprint**. + + + +### Generate Cryptographic Proof + +Create a file `anchor-dataset.ts` to generate a SHA-256 hash of your dataset: + +```typescript +import { TurboFactory } from '@ardrive/turbo-sdk'; +import * as fs from 'fs'; +import * as crypto from 'crypto'; +import { pipeline } from 'stream/promises'; + +const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); +const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' +}); + +export async function createHolographicAnchor(filePath: string, s3Url: string) { + console.log("1. Generating Cryptographic Proof..."); + + // Hash stream (Efficient for large files, low RAM usage) + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(filePath); + await pipeline(fileStream, hash); + const fingerprint = hash.digest('hex'); + + // 2. Prepare the Anchor Payload + const anchor = { + type: 'dataset_anchor', + storage: 's3', + url: s3Url, + sha256: fingerprint, // The mathematical truth + size: fs.statSync(filePath).size, + timestamp: Date.now() + }; + + // 3. Upload Metadata Only + const upload = await turbo.uploadFile({ + fileStreamFactory: () => Buffer.from(JSON.stringify(anchor)), + fileSizeFactory: () => Buffer.byteLength(JSON.stringify(anchor)), + dataItemOpts: { + tags: [ + { name: 'Content-Type', value: 'application/json' }, + { name: 'Type', value: 'Dataset-Anchor' } + ] + } + }); + + console.log(`āš“ Holographic Anchor Minted: ar://${upload.id}`); + return upload.id; +} +``` + + +This approach is memory-efficient for large files. The stream hashing means you can verify petabyte-scale datasets without loading them entirely into RAM. + + + + +### Use the Anchor + +The anchor transaction ID serves as an immutable proof that: +1. A specific dataset existed at a specific time +2. The dataset had a specific SHA-256 hash +3. The dataset was stored at a specific S3 URL + +Anyone can verify the dataset hasn't changed by re-hashing the S3 file and comparing it to the on-chain fingerprint. + +```typescript +// Verify a dataset matches its anchor +async function verifyDataset(anchorId: string, localFilePath: string) { + // 1. Fetch the anchor from Arweave + const anchorData = await fetch(`https://ar-io.net/${anchorId}`); + const anchor = await anchorData.json(); + + // 2. Hash the local file + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(localFilePath); + await pipeline(fileStream, hash); + const localFingerprint = hash.digest('hex'); + + // 3. Compare + if (localFingerprint === anchor.sha256) { + console.log('āœ… Dataset verified! Matches on-chain anchor.'); + return true; + } else { + console.log('āŒ Dataset verification failed! 
File has been modified.'); + return false; + } +} +``` + + + + +**Important**: The Holographic Anchor proves a dataset existed with a specific hash, but doesn't make the data itself permanent. For true permanence, use Pattern B. + -## Overview +## Pattern B: The Native Data Lake (On-Chain) -This guide covers: +Use this for fine-tuning sets, RAG Knowledge Bases, or benchmarks where you want both the data and its index permanently stored. -- Creating cryptographic proofs for datasets -- Uploading dataset manifests to Arweave -- Verifying dataset integrity -- Integrating with existing S3 workflows +We upload the raw files to Arweave and generate a Parquet Index. This allows training scripts to filter data (e.g., "Give me only train split images") without downloading the entire dataset manifest. + + + +### Upload Files and Build Index + +Create a file `upload-native-lake.ts`: + +```typescript +import { TurboFactory } from '@ardrive/turbo-sdk'; +import parquets from 'parquets'; +import * as fs from 'fs'; +import * as path from 'path'; + +// Schema: We verify NOT just the ID, but the content metadata too +const schema = new parquets.ParquetSchema({ + filename: { type: 'UTF8' }, + tx_id: { type: 'UTF8' }, // The Arweave Pointer + byte_size: { type: 'INT64' }, + dataset_split: { type: 'UTF8' }, // 'train' vs 'test' + label: { type: 'UTF8' } // e.g. 'pneumonia' +}); + +export async function uploadDatasetWithIndex(baseDir: string) { + const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); + const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' + }); + + const indexRows = []; + const files = fs.readdirSync(baseDir); + + console.log(`šŸš€ Processing ${files.length} files...`); + + // 1. Upload Files + for (const file of files) { + const filePath = path.join(baseDir, file); + const size = fs.statSync(filePath).size; + + // Logic to determine label/split from filename + const isTrain = file.startsWith('train'); + const label = file.includes('cat') ? 'cat' : 'dog'; + + const upload = await turbo.uploadFile({ + fileStreamFactory: () => fs.createReadStream(filePath), + fileSizeFactory: () => size, + dataItemOpts: { tags: [{ name: 'Content-Type', value: 'image/jpeg' }] } + }); + + // Add to Index (Don't just list it, describe it) + indexRows.push({ + filename: file, + tx_id: upload.id, + byte_size: size, + dataset_split: isTrain ? 'train' : 'test', + label: label + }); + + console.log(` āœ“ Uploaded: ${file}`); + } + + // 2. Write Parquet Index + const indexFile = 'dataset_manifest.parquet'; + const writer = await parquets.ParquetWriter.openFile(schema, indexFile); + for (const row of indexRows) await writer.appendRow(row); + await writer.close(); + + // 3. Upload the Index + const manifestUpload = await turbo.uploadFile({ + fileStreamFactory: () => fs.createReadStream(indexFile), + fileSizeFactory: () => fs.statSync(indexFile).size, + dataItemOpts: { + tags: [ + { name: 'Type', value: 'Dataset-Parquet-Manifest' }, + { name: 'Content-Type', value: 'application/octet-stream' } + ] + } + }); + + console.log(`\nšŸŽ‰ Data Lake Created!`); + console.log(`šŸ‘‰ Index ID: ar://${manifestUpload.id}`); + + return manifestUpload.id; +} +``` + + + +### Query the Index + +Training scripts can now query the Parquet index to fetch specific subsets: + +```typescript +import parquets from 'parquets'; + +async function getTrainingImages(manifestId: string) { + // 1. 
Download the Parquet index + const indexData = await fetch(`https://ar-io.net/${manifestId}`); + const buffer = await indexData.arrayBuffer(); + + // 2. Query for training split + const reader = await parquets.ParquetReader.openBuffer(Buffer.from(buffer)); + const cursor = reader.getCursor(); + + const trainingImages = []; + let record = null; + + while (record = await cursor.next()) { + if (record.dataset_split === 'train') { + trainingImages.push({ + url: `ar://${record.tx_id}`, + label: record.label, + size: record.byte_size + }); + } + } + + await reader.close(); + return trainingImages; +} +``` -Content for this guide is coming soon. Check back later for the complete walkthrough. +**Performance Tip**: The Parquet format allows efficient columnar queries, meaning you can filter millions of records without loading the entire dataset into memory. + + + +## Summary + +You now have two patterns for creating verifiable datasets: + +1. **Holographic Anchor**: For massive datasets that must stay on S3, create an immutable cryptographic fingerprint on Arweave. +2. **Native Data Lake**: For smaller, high-value datasets, store both the data and a queryable Parquet index permanently on Arweave. + +Both patterns provide cryptographic proof of exactly what data was used to train your AI models, solving the provenance problem for Enterprise AI. ## Next Steps -After completing this guide, proceed to [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) to learn how to prevent model drift. +Now that you have verifiable datasets, proceed to [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) to learn how to prevent model drift by verifying weights against on-chain proofs. From 5ddfd5180cbd301de18e72e16ae9539d2e769b93 Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 21:49:21 +0000 Subject: [PATCH 3/4] next draft before adding arns --- .../verifiable-ai/verifiable-dataset.mdx | 172 ++++++++++++++++-- 1 file changed, 153 insertions(+), 19 deletions(-) diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx index 0bf88b541..f82f8fb4b 100644 --- a/content/build/guides/verifiable-ai/verifiable-dataset.mdx +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -5,6 +5,7 @@ description: "Create tamper-proof datasets with cryptographic proofs, ensuring d import { Callout } from "fumadocs-ui/components/callout"; import { Steps, Step } from "fumadocs-ui/components/steps"; +import { Tabs, Tab } from "fumadocs-ui/components/tabs"; Training data is the foundation of any AI model. To ensure provenance, you must be able to prove exactly what data was used to train a specific model version. 
@@ -25,7 +26,7 @@ Before starting, ensure you have: ### Install Dependencies ```bash -npm install @ardrive/turbo-sdk parquets arweave dotenv +npm install @ardrive/turbo-sdk parquetjs @ar.io/wayfinder-core @ar.io/sdk npm install --save-dev @types/node ``` @@ -45,21 +46,26 @@ Create a file `anchor-dataset.ts` to generate a SHA-256 hash of your dataset: import { TurboFactory } from '@ardrive/turbo-sdk'; import * as fs from 'fs'; import * as crypto from 'crypto'; -import { pipeline } from 'stream/promises'; - -const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); -const turbo = TurboFactory.authenticated({ - privateKey: jwk, - token: 'arweave' -}); export async function createHolographicAnchor(filePath: string, s3Url: string) { + // Setup Turbo client + const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); + const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' + }); + console.log("1. Generating Cryptographic Proof..."); // Hash stream (Efficient for large files, low RAM usage) const hash = crypto.createHash('sha256'); const fileStream = fs.createReadStream(filePath); - await pipeline(fileStream, hash); + + // Stream the file through the hash + for await (const chunk of fileStream) { + hash.update(chunk); + } + const fingerprint = hash.digest('hex'); // 2. Prepare the Anchor Payload @@ -104,17 +110,27 @@ The anchor transaction ID serves as an immutable proof that: Anyone can verify the dataset hasn't changed by re-hashing the S3 file and comparing it to the on-chain fingerprint. + + + ```typescript -// Verify a dataset matches its anchor +import * as crypto from 'crypto'; +import * as fs from 'fs'; + +// Simple approach - faster to implement but single point of failure async function verifyDataset(anchorId: string, localFilePath: string) { // 1. Fetch the anchor from Arweave - const anchorData = await fetch(`https://ar-io.net/${anchorId}`); + const anchorData = await fetch(`https://arweave.net/${anchorId}`); const anchor = await anchorData.json(); // 2. Hash the local file const hash = crypto.createHash('sha256'); const fileStream = fs.createReadStream(localFilePath); - await pipeline(fileStream, hash); + + for await (const chunk of fileStream) { + hash.update(chunk); + } + const localFingerprint = hash.digest('hex'); // 3. Compare @@ -127,6 +143,63 @@ async function verifyDataset(anchorId: string, localFilePath: string) { } } ``` + + + + + +```typescript +import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPingRoutingStrategy, NetworkGatewaysProvider } from '@ar.io/wayfinder-core'; +import { ARIO } from '@ar.io/sdk'; +import * as crypto from 'crypto'; +import * as fs from 'fs'; + +// Production approach - preferred gateway with network fallback for resilience +// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data +async function verifyDataset(anchorId: string, localFilePath: string) { + // 1. Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways + const wayfinder = createWayfinderClient({ + ario: ARIO.mainnet(), + routingStrategy: new PreferredWithFallbackRoutingStrategy({ + preferredGateway: 'https://arweave.net', + fallbackStrategy: new FastestPingRoutingStrategy({ + timeoutMs: 1000, + gatewaysProvider: new NetworkGatewaysProvider({ + ario: ARIO.mainnet(), + sortBy: 'operatorStake', + limit: 10, + }), + }), + }), + }); + + // 2. 
Fetch the anchor from Arweave via Wayfinder + const anchorData = await wayfinder.request(`ar://${anchorId}`); + const anchor = await anchorData.json(); + + // 3. Hash the local file + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(localFilePath); + + for await (const chunk of fileStream) { + hash.update(chunk); + } + + const localFingerprint = hash.digest('hex'); + + // 4. Compare + if (localFingerprint === anchor.sha256) { + console.log('āœ… Dataset verified! Matches on-chain anchor.'); + return true; + } else { + console.log('āŒ Dataset verification failed! File has been modified.'); + return false; + } +} +``` + + + @@ -148,12 +221,12 @@ Create a file `upload-native-lake.ts`: ```typescript import { TurboFactory } from '@ardrive/turbo-sdk'; -import parquets from 'parquets'; +import * as parquet from 'parquetjs'; import * as fs from 'fs'; import * as path from 'path'; // Schema: We verify NOT just the ID, but the content metadata too -const schema = new parquets.ParquetSchema({ +const schema = new parquet.ParquetSchema({ filename: { type: 'UTF8' }, tx_id: { type: 'UTF8' }, // The Arweave Pointer byte_size: { type: 'INT64' }, @@ -178,7 +251,7 @@ export async function uploadDatasetWithIndex(baseDir: string) { const filePath = path.join(baseDir, file); const size = fs.statSync(filePath).size; - // Logic to determine label/split from filename + // Example logic to determine label/split from filename - customize for your dataset const isTrain = file.startsWith('train'); const label = file.includes('cat') ? 'cat' : 'dog'; @@ -202,7 +275,7 @@ export async function uploadDatasetWithIndex(baseDir: string) { // 2. Write Parquet Index const indexFile = 'dataset_manifest.parquet'; - const writer = await parquets.ParquetWriter.openFile(schema, indexFile); + const writer = await parquet.ParquetWriter.openFile(schema, indexFile); for (const row of indexRows) await writer.appendRow(row); await writer.close(); @@ -231,16 +304,20 @@ export async function uploadDatasetWithIndex(baseDir: string) { Training scripts can now query the Parquet index to fetch specific subsets: + + + ```typescript -import parquets from 'parquets'; +import * as parquet from 'parquetjs'; +// Simple approach - faster to implement but single point of failure async function getTrainingImages(manifestId: string) { // 1. Download the Parquet index - const indexData = await fetch(`https://ar-io.net/${manifestId}`); + const indexData = await fetch(`https://arweave.net/${manifestId}`); const buffer = await indexData.arrayBuffer(); // 2. Query for training split - const reader = await parquets.ParquetReader.openBuffer(Buffer.from(buffer)); + const reader = await parquet.ParquetReader.openBuffer(Buffer.from(buffer)); const cursor = reader.getCursor(); const trainingImages = []; @@ -261,6 +338,63 @@ async function getTrainingImages(manifestId: string) { } ``` + + + + +```typescript +import * as parquet from 'parquetjs'; +import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPingRoutingStrategy, NetworkGatewaysProvider } from '@ar.io/wayfinder-core'; +import { ARIO } from '@ar.io/sdk'; + +// Production approach - keeps your training pipeline operational even during gateway outages +// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data +async function getTrainingImages(manifestId: string) { + // 1. 
Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways + const wayfinder = createWayfinderClient({ + ario: ARIO.mainnet(), + routingStrategy: new PreferredWithFallbackRoutingStrategy({ + preferredGateway: 'https://arweave.net', + fallbackStrategy: new FastestPingRoutingStrategy({ + timeoutMs: 1000, + gatewaysProvider: new NetworkGatewaysProvider({ + ario: ARIO.mainnet(), + sortBy: 'operatorStake', + limit: 10, + }), + }), + }), + }); + + // 2. Download the Parquet index via Wayfinder + const indexData = await wayfinder.request(`ar://${manifestId}`); + const buffer = await indexData.arrayBuffer(); + + // 3. Query for training split + const reader = await parquet.ParquetReader.openBuffer(Buffer.from(buffer)); + const cursor = reader.getCursor(); + + const trainingImages = []; + let record = null; + + while (record = await cursor.next()) { + if (record.dataset_split === 'train') { + trainingImages.push({ + url: `ar://${record.tx_id}`, + label: record.label, + size: record.byte_size + }); + } + } + + await reader.close(); + return trainingImages; +} +``` + + + + **Performance Tip**: The Parquet format allows efficient columnar queries, meaning you can filter millions of records without loading the entire dataset into memory. From 94e1dca9e774f6ef42ac2d113a597e19e8bf137f Mon Sep 17 00:00:00 2001 From: William Kempster Date: Tue, 25 Nov 2025 22:18:55 +0000 Subject: [PATCH 4/4] final version of verifiable ai dataset article --- .../verifiable-ai/verifiable-dataset.mdx | 128 ++++++++++++++++-- 1 file changed, 116 insertions(+), 12 deletions(-) diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx index f82f8fb4b..ebe387cc1 100644 --- a/content/build/guides/verifiable-ai/verifiable-dataset.mdx +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -101,7 +101,54 @@ This approach is memory-efficient for large files. The stream hashing means you -### Use the Anchor +### Addressing data with ArNS + +Instead of hardcoding anchor IDs, use ArNS to create a stable reference that always points to the latest anchor version and maintains a permanent version history. + +```typescript +import { ARIO, ANT } from '@ar.io/sdk'; + +async function associateAnchorWithArNS(anchorId: string, arnsName: string, version: string) { + // 1. Get the ANT contract for your ArNS name + // (Assumes you've already purchased/leased the ArNS name via https://arns.app) + const ario = ARIO.mainnet(); + const records = await ario.getArNSRecord({ name: arnsName }); + + if (!records) { + throw new Error(`ArNS name "${arnsName}" not found. Purchase it at https://arns.app first.`); + } + + // 2. Connect to the ANT contract + const ant = ANT.init({ + processId: records.processId, + signer: jwk, // Your Arweave wallet + }); + + // 3. Set the @ record to point to latest version + await ant.setRecord({ + undername: '@', + transactionId: anchorId, + ttlSeconds: 3600, // 1 hour cache + }); + + // 4. Set a versioned undername to permanently reference this version + await ant.setRecord({ + undername: version, // e.g., 'v1', 'v2', '2024-12' + transactionId: anchorId, + ttlSeconds: 3600, + }); + + console.log(` Latest: ar://${arnsName} → ${anchorId}`); + console.log(` Version: ar://${version}_${arnsName} → ${anchorId}`); + + return anchorId; +} +``` + + + + +### Verify the Dataset The anchor transaction ID serves as an immutable proof that: 1. 
A specific dataset existed at a specific time @@ -118,9 +165,12 @@ import * as crypto from 'crypto'; import * as fs from 'fs'; // Simple approach - faster to implement but single point of failure -async function verifyDataset(anchorId: string, localFilePath: string) { +// Query the latest version or a specific version with ArNS +async function verifyDataset(identifier: string, localFilePath: string) { // 1. Fetch the anchor from Arweave - const anchorData = await fetch(`https://arweave.net/${anchorId}`); + // Use ArNS name for latest version: ar://dataset-anchor + // Use versioned undername for specific version: ar://v1_dataset-anchor + const anchorData = await fetch(`https://arweave.net/${identifier}`); const anchor = await anchorData.json(); // 2. Hash the local file @@ -155,8 +205,8 @@ import * as crypto from 'crypto'; import * as fs from 'fs'; // Production approach - preferred gateway with network fallback for resilience -// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data -async function verifyDataset(anchorId: string, localFilePath: string) { +// Query the latest version or a specific version with ArNS +async function verifyDataset(identifier: string, localFilePath: string) { // 1. Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways const wayfinder = createWayfinderClient({ ario: ARIO.mainnet(), @@ -174,7 +224,9 @@ async function verifyDataset(anchorId: string, localFilePath: string) { }); // 2. Fetch the anchor from Arweave via Wayfinder - const anchorData = await wayfinder.request(`ar://${anchorId}`); + // Use ArNS name for latest version: ar://dataset-anchor + // Use versioned undername for specific version: ar://v1_dataset-anchor + const anchorData = await wayfinder.request(`ar://${identifier}`); const anchor = await anchorData.json(); // 3. Hash the local file @@ -299,6 +351,53 @@ export async function uploadDatasetWithIndex(baseDir: string) { ``` + +### Addressing data with ArNS + +Instead of hardcoding manifest IDs, use ArNS to create a stable reference that always points to the latest dataset version and maintains a permanent version history. + +```typescript +import { ARIO, ANT } from '@ar.io/sdk'; + +async function associateDatasetWithArNS(manifestId: string, arnsName: string, version: string) { + // 1. Get the ANT contract for your ArNS name + // (Assumes you've already purchased/leased the ArNS name via https://arns.app) + const ario = ARIO.mainnet(); + const records = await ario.getArNSRecord({ name: arnsName }); + + if (!records) { + throw new Error(`ArNS name "${arnsName}" not found. Purchase it at https://arns.app first.`); + } + + // 2. Connect to the ANT contract + const ant = ANT.init({ + processId: records.processId, + signer: jwk, // Your Arweave wallet + }); + + // 3. Set the @ record to point to latest version + await ant.setRecord({ + undername: '@', + transactionId: manifestId, + ttlSeconds: 3600, // 1 hour cache + }); + + // 4. 
Set a versioned undername to permanently reference this version + await ant.setRecord({ + undername: version, // e.g., 'v1', 'v2', '2024-12' + transactionId: manifestId, + ttlSeconds: 3600, + }); + + console.log(` Latest: ar://${arnsName} → ${manifestId}`); + console.log(` Version: ar://${version}_${arnsName} → ${manifestId}`); + + return manifestId; +} +``` + + + ### Query the Index @@ -311,9 +410,12 @@ Training scripts can now query the Parquet index to fetch specific subsets: import * as parquet from 'parquetjs'; // Simple approach - faster to implement but single point of failure -async function getTrainingImages(manifestId: string) { +// Query the latest version or a specific version with ArNS +async function getTrainingImages(identifier: string) { // 1. Download the Parquet index - const indexData = await fetch(`https://arweave.net/${manifestId}`); + // Use ArNS name for latest version: ar://medical-imaging + // Use versioned undername for specific version: ar://v1_medical-imaging + const indexData = await fetch(`https://arweave.net/${identifier}`); const buffer = await indexData.arrayBuffer(); // 2. Query for training split @@ -348,8 +450,8 @@ import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPin import { ARIO } from '@ar.io/sdk'; // Production approach - keeps your training pipeline operational even during gateway outages -// Replace 'https://arweave.net' with your own gateway if you run one optimized for your data -async function getTrainingImages(manifestId: string) { +// Query the latest version or a specific version with ArNS +async function getTrainingImages(identifier: string) { // 1. Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways const wayfinder = createWayfinderClient({ ario: ARIO.mainnet(), @@ -366,8 +468,10 @@ async function getTrainingImages(manifestId: string) { }), }); - // 2. Download the Parquet index via Wayfinder - const indexData = await wayfinder.request(`ar://${manifestId}`); + // 2. Download the Parquet index + // Use ArNS name for latest version: ar://medical-imaging + // Use versioned undername for specific version: ar://v1_medical-imaging + const indexData = await wayfinder.request(`ar://${identifier}`); const buffer = await indexData.arrayBuffer(); // 3. Query for training split