From cec8cee0df585398d20df2c803d545c882fc6bdc Mon Sep 17 00:00:00 2001 From: Max Netterberg Date: Thu, 26 Feb 2026 14:49:51 +0100 Subject: [PATCH 1/2] fix: handle off-by-one xref subsection start in malformed PDFs Some PDFs have a malformed xref table where the subsection header reports the starting object number as 1 instead of 0, while the entries clearly start at object 0 (free list head with gen 65535). This shifted all object resolutions by one, causing wrong page count and infinite loops in extractPages. - Detect and correct the off-by-one in xref subsection parsing - Add cycle detection to ObjectCopier.getInheritedAttribute as defense-in-depth - Add unit test for xref correction and integration test with fixtures --- fixtures/malformed/xref-off-by-one.pdf | Bin 0 -> 1104 bytes src/api/pdf.test.ts | 17 ++++++++++ src/document/object-copier.ts | 9 ++++++ src/parser/xref-parser.test.ts | 41 +++++++++++++++++++++++++ src/parser/xref-parser.ts | 25 +++++++++++++-- 5 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 fixtures/malformed/xref-off-by-one.pdf diff --git a/fixtures/malformed/xref-off-by-one.pdf b/fixtures/malformed/xref-off-by-one.pdf new file mode 100644 index 0000000000000000000000000000000000000000..53feea2337ffda2d64be949986a56b00ad34c681 GIT binary patch literal 1104 zcmbu9!Eb^v6vprT72m~VJ8ZNd0*T?ElPuYmEy&I6P^CJIgN4?`?XTY}y2&73v8a=&sfH=k1H3(;^1J1%3P#im^CyFne=;8?d^C6{!CIJ4N_z`*ld4@(?1 z@-+j+Q41;Umc|3hvZXmbQEh2%*!A@K^TD{)nIktWPjSC$t literal 0 HcmV?d00001 diff --git a/src/api/pdf.test.ts b/src/api/pdf.test.ts index 242e788..1ce1634 100644 --- a/src/api/pdf.test.ts +++ b/src/api/pdf.test.ts @@ -702,6 +702,23 @@ describe("PDF", () => { expect(extracted.getPageCount()).toBe(0); }); + + it("handles PDF with off-by-one xref subsection start", async () => { + // Some malformed PDFs have the xref subsection header saying "1 N" + // instead of "0 N", shifting all object numbers by one. This caused + // wrong page count and infinite loop in extractPages due to objects + // resolving to wrong offsets (e.g., Pages root resolving as a Page + // with a self-referencing Parent). + const bytes = await loadFixture("malformed", "xref-off-by-one.pdf"); + const pdf = await PDF.load(bytes); + + expect(pdf.getPageCount()).toBe(3); + expect(pdf.getPages()[2].width).toBe(300); + expect(pdf.getPages()[2].height).toBe(400); + + const extracted = await pdf.extractPages([0, 1, 2]); + expect(extracted.getPageCount()).toBe(3); + }); }); describe("embedPage and drawPage", () => { diff --git a/src/document/object-copier.ts b/src/document/object-copier.ts index 7e5d719..378c209 100644 --- a/src/document/object-copier.ts +++ b/src/document/object-copier.ts @@ -469,6 +469,7 @@ export class ObjectCopier { */ private getInheritedAttribute(page: PdfDict, key: string): PdfObject | null { let current: PdfDict | null = page; + const visited = new Set(); while (current) { const value = current.get(key); @@ -483,6 +484,14 @@ export class ObjectCopier { break; } + const refKey = `${parentRef.objectNumber}:${parentRef.generation}`; + + if (visited.has(refKey)) { + break; + } + + visited.add(refKey); + const parent = this.source.getObject(parentRef); current = parent instanceof PdfDict ? parent : null; } diff --git a/src/parser/xref-parser.test.ts b/src/parser/xref-parser.test.ts index eb75dc7..c323b58 100644 --- a/src/parser/xref-parser.test.ts +++ b/src/parser/xref-parser.test.ts @@ -180,6 +180,47 @@ trailer expect(result.entries.size).toBe(2); }); + + it("corrects off-by-one subsection start when free list head is at wrong position", () => { + // Some malformed PDFs report firstObjNum=1 when entries actually start at 0. + // The free list head (gen 65535, type f) is always object 0. + const p = parser(`xref +1 4 +0000000000 65535 f +0000000015 00000 n +0000000074 00000 n +0000000120 00000 n +trailer +<< /Size 4 /Root 1 0 R >> +`); + const result = p.parseTable(); + + expect(result.entries.size).toBe(4); + + // Entry should be corrected to object 0 (not 1) + const entry0 = result.entries.get(0); + expect(entry0).toBeDefined(); + expect(entry0!.type).toBe("free"); + if (entry0!.type === "free") { + expect(entry0!.generation).toBe(65535); + } + + // Object 1 should be at offset 15 + const entry1 = result.entries.get(1); + expect(entry1).toBeDefined(); + expect(entry1!.type).toBe("uncompressed"); + if (entry1!.type === "uncompressed") { + expect(entry1!.offset).toBe(15); + } + + // Object 3 should be at offset 120 + const entry3 = result.entries.get(3); + expect(entry3).toBeDefined(); + expect(entry3!.type).toBe("uncompressed"); + if (entry3!.type === "uncompressed") { + expect(entry3!.offset).toBe(120); + } + }); }); describe("trailer parsing", () => { diff --git a/src/parser/xref-parser.ts b/src/parser/xref-parser.ts index 8c92a84..50dedba 100644 --- a/src/parser/xref-parser.ts +++ b/src/parser/xref-parser.ts @@ -353,10 +353,29 @@ export class XRefParser { this.skipWhitespaceFromCurrent(); // Read entries + const parsedEntries: XRefEntry[] = []; + for (let i = 0; i < count; i++) { - const objNum = firstObjNum + i; - const entry = this.parseEntry(); - entries.set(objNum, entry); + parsedEntries.push(this.parseEntry()); + } + + // Detect off-by-one in subsection start: some malformed PDFs report + // firstObjNum=1 when the entries actually start at object 0. + // The free list head (offset 0, generation 65535, type free) is always + // object 0, so if we see it at a non-zero starting position, correct it. + let correctedFirstObjNum = firstObjNum; + + if ( + firstObjNum > 0 && + parsedEntries.length > 0 && + parsedEntries[0].type === "free" && + parsedEntries[0].generation === 65535 + ) { + correctedFirstObjNum = 0; + } + + for (let i = 0; i < parsedEntries.length; i++) { + entries.set(correctedFirstObjNum + i, parsedEntries[i]); } } From 85f94ae041344ed23a00e14d302c65081d0b3c31 Mon Sep 17 00:00:00 2001 From: Lucas Smith Date: Sat, 28 Feb 2026 21:23:49 +1100 Subject: [PATCH 2/2] fix: tighten off-by-one detection --- src/parser/xref-parser.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/parser/xref-parser.ts b/src/parser/xref-parser.ts index 50dedba..83e1f9c 100644 --- a/src/parser/xref-parser.ts +++ b/src/parser/xref-parser.ts @@ -361,17 +361,20 @@ export class XRefParser { // Detect off-by-one in subsection start: some malformed PDFs report // firstObjNum=1 when the entries actually start at object 0. - // The free list head (offset 0, generation 65535, type free) is always - // object 0, so if we see it at a non-zero starting position, correct it. + // The free list head (generation 65535, type free) is always object 0, + // so if we see it at position 1, correct it. (Same fix as pdf.js #3248/#7229) let correctedFirstObjNum = firstObjNum; if ( - firstObjNum > 0 && + firstObjNum === 1 && parsedEntries.length > 0 && parsedEntries[0].type === "free" && parsedEntries[0].generation === 65535 ) { correctedFirstObjNum = 0; + console.warn( + "XRef: corrected subsection start from 1 to 0 (free list head at wrong position)", + ); } for (let i = 0; i < parsedEntries.length; i++) {