Skip to content

Commit 80a895b

Browse files
emdashbot[bot] and ascorbic authored
fix: chunk SEO IN clause to stay within D1 SQL variable limit (#422)
* fix: chunk byline IN clauses to stay within D1 SQL variable limit Fixes #219. hydrateEntryBylines builds unbounded IN (?, ?, …) clauses that exceed Cloudflare D1's bound-parameter limit on large collections. Adds a chunks() utility and applies it defense-in-depth at the repository level: getContentBylinesMany, findByUserIds, and getAuthorIds now batch IDs in groups of 50. * chore: add changeset for byline chunking fix * fix: deduplicate content IDs before chunking and add integration tests Deduplicates contentIds in getContentBylinesMany to prevent duplicate credits when the same ID appears across chunk boundaries. Adds tests for the duplication edge case and an end-to-end getBylinesForEntries test spanning both explicit and inferred byline paths. * fix: chunk SEO IN clause to stay within D1 SQL variable limit SeoRepository.getMany builds a WHERE content_id IN (?, ?, ...) clause alongside a collection = ? filter. On Cloudflare D1, which caps bound parameters at 100 per query, passing 100 content ids produces 101 parameters and trips the limit: D1_ERROR: too many SQL variables at offset 369: SQLITE_ERROR This is the same root cause as the byline hydration fix in the sibling commit, but on a different repository that wasn't covered there. SeoRepository.getMany is called from handleContentList before hydrateBylinesMany, so on any collection with has_seo = 1 and >= 99 items, it's the first function to fail on the admin content list endpoint. Apply the same chunking pattern using the shared chunks() helper and SQL_BATCH_SIZE constant. Deduplicate contentIds before chunking for consistency with the byline fix. Pre-fill result with defaults so the two-pass resolve-then-fill-missing logic collapses to a single pass. 
Adds unit tests covering: - input size larger than SQL_BATCH_SIZE, real ids spread across chunks - all-missing ids get defaults - duplicate input ids resolve cleanly without duplicate rows Repro of the underlying D1 limit for the record: wrangler d1 execute <db> --remote --command \ "SELECT 1 WHERE 'x' = ? AND 1 IN (?,?,...x100)" -> too many SQL variables at offset 231: SQLITE_ERROR [code: 7500] * style: format --------- Co-authored-by: emdashbot[bot] <emdashbot[bot]@users.noreply.github.com> Co-authored-by: Matt Kane <mkane@cloudflare.com>
1 parent a13c4ec commit 80a895b

3 files changed

Lines changed: 140 additions & 17 deletions

File tree

.changeset/brave-seals-hydrate.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
---
"emdash": patch
---

Fixes SEO hydration exceeding D1 SQL variable limit on large collections by chunking the `content_id IN (...)` clause in `SeoRepository.getMany`.

packages/core/src/database/repositories/seo.ts

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { sql, type Kysely } from "kysely";
22

3+
import { chunks, SQL_BATCH_SIZE } from "../../utils/chunks.js";
34
import type { Database } from "../types.js";
45
import type { ContentSeo, ContentSeoInput } from "./types.js";
56

@@ -61,37 +62,40 @@ export class SeoRepository {
6162
}
6263

6364
/**
64-
* Get SEO data for multiple content items in a single query.
65+
* Get SEO data for multiple content items.
6566
* Returns a Map keyed by content_id. Items without SEO rows get defaults.
67+
*
68+
* Chunks the `content_id IN (…)` clause so the total bound-parameter count
69+
* per statement (ids + the `collection = ?` filter) stays within Cloudflare
70+
* D1's 100-variable limit regardless of how many content items are passed.
6671
*/
6772
async getMany(collection: string, contentIds: string[]): Promise<Map<string, ContentSeo>> {
6873
const result = new Map<string, ContentSeo>();
6974

7075
if (contentIds.length === 0) return result;
7176

72-
// Batch query — single SELECT with IN clause
73-
const rows = await this.db
74-
.selectFrom("_emdash_seo")
75-
.selectAll()
76-
.where("collection", "=", collection)
77-
.where("content_id", "in", contentIds)
78-
.execute();
79-
80-
// Index fetched rows by content_id
81-
const rowMap = new Map(rows.map((r) => [r.content_id, r]));
82-
77+
// Pre-fill with defaults so every input id has an entry even if no row exists.
8378
for (const id of contentIds) {
84-
const row = rowMap.get(id);
85-
if (row) {
86-
result.set(id, {
79+
result.set(id, { ...SEO_DEFAULTS });
80+
}
81+
82+
const uniqueContentIds = [...new Set(contentIds)];
83+
for (const chunk of chunks(uniqueContentIds, SQL_BATCH_SIZE)) {
84+
const rows = await this.db
85+
.selectFrom("_emdash_seo")
86+
.selectAll()
87+
.where("collection", "=", collection)
88+
.where("content_id", "in", chunk)
89+
.execute();
90+
91+
for (const row of rows) {
92+
result.set(row.content_id, {
8793
title: row.seo_title ?? null,
8894
description: row.seo_description ?? null,
8995
image: row.seo_image ?? null,
9096
canonical: row.seo_canonical ?? null,
9197
noIndex: row.seo_no_index === 1,
9298
});
93-
} else {
94-
result.set(id, { ...SEO_DEFAULTS });
9599
}
96100
}
97101

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import type { Kysely } from "kysely";
2+
import { describe, it, expect, beforeEach, afterEach } from "vitest";
3+
4+
import { ContentRepository } from "../../../../src/database/repositories/content.js";
5+
import { SeoRepository } from "../../../../src/database/repositories/seo.js";
6+
import type { Database } from "../../../../src/database/types.js";
7+
import { SQL_BATCH_SIZE } from "../../../../src/utils/chunks.js";
8+
import { setupTestDatabaseWithCollections, teardownTestDatabase } from "../../../utils/test-db.js";
9+
10+
describe("SeoRepository", () => {
11+
let db: Kysely<Database>;
12+
let seoRepo: SeoRepository;
13+
let contentRepo: ContentRepository;
14+
15+
beforeEach(async () => {
16+
db = await setupTestDatabaseWithCollections();
17+
// Enable SEO on the post collection — createCollection defaults has_seo to 0.
18+
await db
19+
.updateTable("_emdash_collections")
20+
.set({ has_seo: 1 })
21+
.where("slug", "=", "post")
22+
.execute();
23+
seoRepo = new SeoRepository(db);
24+
contentRepo = new ContentRepository(db);
25+
});
26+
27+
afterEach(async () => {
28+
await teardownTestDatabase(db);
29+
});
30+
31+
it("getMany handles more IDs than SQL_BATCH_SIZE", async () => {
32+
// Create a few real content entries with SEO rows
33+
const realIds: string[] = [];
34+
for (let i = 0; i < 3; i++) {
35+
const content = await contentRepo.create({
36+
type: "post",
37+
slug: `seo-batch-post-${i}`,
38+
data: { title: `SEO Batch Post ${i}` },
39+
});
40+
await seoRepo.upsert("post", content.id, {
41+
title: `SEO Title ${i}`,
42+
description: `SEO Description ${i}`,
43+
});
44+
realIds.push(content.id);
45+
}
46+
47+
// Build an ID list larger than SQL_BATCH_SIZE with real IDs spread across chunks
48+
const ids: string[] = [];
49+
for (let i = 0; i < SQL_BATCH_SIZE + 10; i++) {
50+
ids.push(`fake-id-${i}`);
51+
}
52+
ids[0] = realIds[0]!;
53+
ids[SQL_BATCH_SIZE - 1] = realIds[1]!;
54+
ids[SQL_BATCH_SIZE + 5] = realIds[2]!;
55+
56+
const result = await seoRepo.getMany("post", ids);
57+
58+
// All input IDs should be present in the result Map
59+
expect(result.size).toBe(ids.length);
60+
61+
// Real IDs should have their SEO data resolved
62+
expect(result.get(realIds[0]!)?.title).toBe("SEO Title 0");
63+
expect(result.get(realIds[1]!)?.title).toBe("SEO Title 1");
64+
expect(result.get(realIds[2]!)?.title).toBe("SEO Title 2");
65+
66+
// Fake IDs should get default values
67+
expect(result.get("fake-id-5")?.title).toBeNull();
68+
expect(result.get("fake-id-5")?.description).toBeNull();
69+
expect(result.get("fake-id-5")?.noIndex).toBe(false);
70+
});
71+
72+
it("getMany returns defaults for every input id when no rows exist", async () => {
73+
const ids: string[] = [];
74+
for (let i = 0; i < SQL_BATCH_SIZE + 10; i++) {
75+
ids.push(`missing-id-${i}`);
76+
}
77+
78+
const result = await seoRepo.getMany("post", ids);
79+
80+
expect(result.size).toBe(ids.length);
81+
for (const id of ids) {
82+
const entry = result.get(id);
83+
expect(entry).toBeDefined();
84+
expect(entry?.title).toBeNull();
85+
expect(entry?.description).toBeNull();
86+
expect(entry?.image).toBeNull();
87+
expect(entry?.canonical).toBeNull();
88+
expect(entry?.noIndex).toBe(false);
89+
}
90+
});
91+
92+
it("getMany deduplicates repeated content IDs without duplicate rows", async () => {
93+
const content = await contentRepo.create({
94+
type: "post",
95+
slug: "seo-duplicate-post",
96+
data: { title: "SEO Duplicate" },
97+
});
98+
await seoRepo.upsert("post", content.id, {
99+
title: "Duplicate SEO",
100+
});
101+
102+
const ids: string[] = [];
103+
for (let i = 0; i < SQL_BATCH_SIZE + 10; i++) {
104+
ids.push(`fake-id-${i}`);
105+
}
106+
ids[0] = content.id;
107+
ids[SQL_BATCH_SIZE + 5] = content.id;
108+
109+
const result = await seoRepo.getMany("post", ids);
110+
111+
// The real entry should resolve to its SEO row regardless of the duplicate input
112+
expect(result.get(content.id)?.title).toBe("Duplicate SEO");
113+
});
114+
});

0 commit comments

Comments
 (0)