Skip to content

Commit 80a895b

Browse files
emdashbot[bot] and ascorbic authored
fix: chunk SEO IN clause to stay within D1 SQL variable limit (#422)
* fix: chunk byline IN clauses to stay within D1 SQL variable limit Fixes #219. hydrateEntryBylines builds unbounded IN (?, ?, …) clauses that exceed Cloudflare D1's bound-parameter limit on large collections. Adds a chunks() utility and applies it defense-in-depth at the repository level: getContentBylinesMany, findByUserIds, and getAuthorIds now batch IDs in groups of 50. * chore: add changeset for byline chunking fix * fix: deduplicate content IDs before chunking and add integration tests Deduplicates contentIds in getContentBylinesMany to prevent duplicate credits when the same ID appears across chunk boundaries. Adds tests for the duplication edge case and an end-to-end getBylinesForEntries test spanning both explicit and inferred byline paths. * fix: chunk SEO IN clause to stay within D1 SQL variable limit SeoRepository.getMany builds a WHERE content_id IN (?, ?, ...) clause alongside a collection = ? filter. On Cloudflare D1, which caps bound parameters at 100 per query, passing 100 content ids produces 101 parameters and trips the limit: D1_ERROR: too many SQL variables at offset 369: SQLITE_ERROR This is the same root cause as the byline hydration fix in the sibling commit, but on a different repository that wasn't covered there. SeoRepository.getMany is called from handleContentList before hydrateBylinesMany, so on any collection with has_seo = 1 and >= 99 items, it's the first function to fail on the admin content list endpoint. Apply the same chunking pattern using the shared chunks() helper and SQL_BATCH_SIZE constant. Deduplicate contentIds before chunking for consistency with the byline fix. Pre-fill result with defaults so the two-pass resolve-then-fill-missing logic collapses to a single pass. 
Adds unit tests covering: - input size larger than SQL_BATCH_SIZE, real ids spread across chunks - all-missing ids get defaults - duplicate input ids resolve cleanly without duplicate rows Repro of the underlying D1 limit for the record: wrangler d1 execute <db> --remote --command \ "SELECT 1 WHERE 'x' = ? AND 1 IN (?,?,...x100)" -> too many SQL variables at offset 231: SQLITE_ERROR [code: 7500] * style: format --------- Co-authored-by: emdashbot[bot] <emdashbot[bot]@users.noreply.github.com> Co-authored-by: Matt Kane <mkane@cloudflare.com>
1 parent a13c4ec commit 80a895b

3 files changed

Lines changed: 140 additions & 17 deletions

File tree

.changeset/brave-seals-hydrate.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
---
"emdash": patch
---

Fixes SEO hydration exceeding D1 SQL variable limit on large collections by chunking the `content_id IN (...)` clause in `SeoRepository.getMany`.

packages/core/src/database/repositories/seo.ts

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { sql, type Kysely } from "kysely";
22

3+
import { chunks, SQL_BATCH_SIZE } from "../../utils/chunks.js";
34
import type { Database } from "../types.js";
45
import type { ContentSeo, ContentSeoInput } from "./types.js";
56

@@ -61,37 +62,40 @@ export class SeoRepository {
6162
}
6263

6364
/**
64-
* Get SEO data for multiple content items in a single query.
65+
* Get SEO data for multiple content items.
6566
* Returns a Map keyed by content_id. Items without SEO rows get defaults.
67+
*
68+
* Chunks the `content_id IN (…)` clause so the total bound-parameter count
69+
* per statement (ids + the `collection = ?` filter) stays within Cloudflare
70+
* D1's 100-variable limit regardless of how many content items are passed.
6671
*/
6772
async getMany(collection: string, contentIds: string[]): Promise<Map<string, ContentSeo>> {
6873
const result = new Map<string, ContentSeo>();
6974

7075
if (contentIds.length === 0) return result;
7176

72-
// Batch query — single SELECT with IN clause
73-
const rows = await this.db
74-
.selectFrom("_emdash_seo")
75-
.selectAll()
76-
.where("collection", "=", collection)
77-
.where("content_id", "in", contentIds)
78-
.execute();
79-
80-
// Index fetched rows by content_id
81-
const rowMap = new Map(rows.map((r) => [r.content_id, r]));
82-
77+
// Pre-fill with defaults so every input id has an entry even if no row exists.
8378
for (const id of contentIds) {
84-
const row = rowMap.get(id);
85-
if (row) {
86-
result.set(id, {
79+
result.set(id, { ...SEO_DEFAULTS });
80+
}
81+
82+
const uniqueContentIds = [...new Set(contentIds)];
83+
for (const chunk of chunks(uniqueContentIds, SQL_BATCH_SIZE)) {
84+
const rows = await this.db
85+
.selectFrom("_emdash_seo")
86+
.selectAll()
87+
.where("collection", "=", collection)
88+
.where("content_id", "in", chunk)
89+
.execute();
90+
91+
for (const row of rows) {
92+
result.set(row.content_id, {
8793
title: row.seo_title ?? null,
8894
description: row.seo_description ?? null,
8995
image: row.seo_image ?? null,
9096
canonical: row.seo_canonical ?? null,
9197
noIndex: row.seo_no_index === 1,
9298
});
93-
} else {
94-
result.set(id, { ...SEO_DEFAULTS });
9599
}
96100
}
97101

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import type { Kysely } from "kysely";
2+
import { describe, it, expect, beforeEach, afterEach } from "vitest";
3+
4+
import { ContentRepository } from "../../../../src/database/repositories/content.js";
5+
import { SeoRepository } from "../../../../src/database/repositories/seo.js";
6+
import type { Database } from "../../../../src/database/types.js";
7+
import { SQL_BATCH_SIZE } from "../../../../src/utils/chunks.js";
8+
import { setupTestDatabaseWithCollections, teardownTestDatabase } from "../../../utils/test-db.js";
9+
10+
describe("SeoRepository", () => {
11+
let db: Kysely<Database>;
12+
let seoRepo: SeoRepository;
13+
let contentRepo: ContentRepository;
14+
15+
beforeEach(async () => {
16+
db = await setupTestDatabaseWithCollections();
17+
// Enable SEO on the post collection — createCollection defaults has_seo to 0.
18+
await db
19+
.updateTable("_emdash_collections")
20+
.set({ has_seo: 1 })
21+
.where("slug", "=", "post")
22+
.execute();
23+
seoRepo = new SeoRepository(db);
24+
contentRepo = new ContentRepository(db);
25+
});
26+
27+
afterEach(async () => {
28+
await teardownTestDatabase(db);
29+
});
30+
31+
it("getMany handles more IDs than SQL_BATCH_SIZE", async () => {
32+
// Create a few real content entries with SEO rows
33+
const realIds: string[] = [];
34+
for (let i = 0; i < 3; i++) {
35+
const content = await contentRepo.create({
36+
type: "post",
37+
slug: `seo-batch-post-${i}`,
38+
data: { title: `SEO Batch Post ${i}` },
39+
});
40+
await seoRepo.upsert("post", content.id, {
41+
title: `SEO Title ${i}`,
42+
description: `SEO Description ${i}`,
43+
});
44+
realIds.push(content.id);
45+
}
46+
47+
// Build an ID list larger than SQL_BATCH_SIZE with real IDs spread across chunks
48+
const ids: string[] = [];
49+
for (let i = 0; i < SQL_BATCH_SIZE + 10; i++) {
50+
ids.push(`fake-id-${i}`);
51+
}
52+
ids[0] = realIds[0]!;
53+
ids[SQL_BATCH_SIZE - 1] = realIds[1]!;
54+
ids[SQL_BATCH_SIZE + 5] = realIds[2]!;
55+
56+
const result = await seoRepo.getMany("post", ids);
57+
58+
// All input IDs should be present in the result Map
59+
expect(result.size).toBe(ids.length);
60+
61+
// Real IDs should have their SEO data resolved
62+
expect(result.get(realIds[0]!)?.title).toBe("SEO Title 0");
63+
expect(result.get(realIds[1]!)?.title).toBe("SEO Title 1");
64+
expect(result.get(realIds[2]!)?.title).toBe("SEO Title 2");
65+
66+
// Fake IDs should get default values
67+
expect(result.get("fake-id-5")?.title).toBeNull();
68+
expect(result.get("fake-id-5")?.description).toBeNull();
69+
expect(result.get("fake-id-5")?.noIndex).toBe(false);
70+
});
71+
72+
it("getMany returns defaults for every input id when no rows exist", async () => {
73+
const ids: string[] = [];
74+
for (let i = 0; i < SQL_BATCH_SIZE + 10; i++) {
75+
ids.push(`missing-id-${i}`);
76+
}
77+
78+
const result = await seoRepo.getMany("post", ids);
79+
80+
expect(result.size).toBe(ids.length);
81+
for (const id of ids) {
82+
const entry = result.get(id);
83+
expect(entry).toBeDefined();
84+
expect(entry?.title).toBeNull();
85+
expect(entry?.description).toBeNull();
86+
expect(entry?.image).toBeNull();
87+
expect(entry?.canonical).toBeNull();
88+
expect(entry?.noIndex).toBe(false);
89+
}
90+
});
91+
92+
it("getMany deduplicates repeated content IDs without duplicate rows", async () => {
93+
const content = await contentRepo.create({
94+
type: "post",
95+
slug: "seo-duplicate-post",
96+
data: { title: "SEO Duplicate" },
97+
});
98+
await seoRepo.upsert("post", content.id, {
99+
title: "Duplicate SEO",
100+
});
101+
102+
const ids: string[] = [];
103+
for (let i = 0; i < SQL_BATCH_SIZE + 10; i++) {
104+
ids.push(`fake-id-${i}`);
105+
}
106+
ids[0] = content.id;
107+
ids[SQL_BATCH_SIZE + 5] = content.id;
108+
109+
const result = await seoRepo.getMany("post", ids);
110+
111+
// The real entry should resolve to its SEO row regardless of the duplicate input
112+
expect(result.get(content.id)?.title).toBe("Duplicate SEO");
113+
});
114+
});

0 commit comments

Comments
 (0)