diff --git a/fixtures/malformed/xref-off-by-one.pdf b/fixtures/malformed/xref-off-by-one.pdf new file mode 100644 index 0000000..53feea2 Binary files /dev/null and b/fixtures/malformed/xref-off-by-one.pdf differ diff --git a/src/api/pdf.test.ts b/src/api/pdf.test.ts index 242e788..1ce1634 100644 --- a/src/api/pdf.test.ts +++ b/src/api/pdf.test.ts @@ -702,6 +702,23 @@ describe("PDF", () => { expect(extracted.getPageCount()).toBe(0); }); + + it("handles PDF with off-by-one xref subsection start", async () => { + // Some malformed PDFs have the xref subsection header saying "1 N" + // instead of "0 N", shifting all object numbers by one. This caused a + // wrong page count and an infinite loop in extractPages due to objects + // resolving to wrong offsets (e.g., Pages root resolving as a Page + // with a self-referencing Parent). + const bytes = await loadFixture("malformed", "xref-off-by-one.pdf"); + const pdf = await PDF.load(bytes); + + expect(pdf.getPageCount()).toBe(3); + expect(pdf.getPages()[2].width).toBe(300); + expect(pdf.getPages()[2].height).toBe(400); + + const extracted = await pdf.extractPages([0, 1, 2]); + expect(extracted.getPageCount()).toBe(3); + }); }); describe("embedPage and drawPage", () => { diff --git a/src/document/object-copier.ts b/src/document/object-copier.ts index 7e5d719..378c209 100644 --- a/src/document/object-copier.ts +++ b/src/document/object-copier.ts @@ -469,6 +469,7 @@ export class ObjectCopier { */ private getInheritedAttribute(page: PdfDict, key: string): PdfObject | null { let current: PdfDict | null = page; + const visited = new Set<string>(); while (current) { const value = current.get(key); @@ -483,6 +484,14 @@ export class ObjectCopier { break; } + const refKey = `${parentRef.objectNumber}:${parentRef.generation}`; + + if (visited.has(refKey)) { + break; + } + + visited.add(refKey); + const parent = this.source.getObject(parentRef); current = parent instanceof PdfDict ? 
parent : null; } diff --git a/src/parser/xref-parser.test.ts b/src/parser/xref-parser.test.ts index eb75dc7..c323b58 100644 --- a/src/parser/xref-parser.test.ts +++ b/src/parser/xref-parser.test.ts @@ -180,6 +180,47 @@ trailer expect(result.entries.size).toBe(2); }); + + it("corrects off-by-one subsection start when free list head is at wrong position", () => { + // Some malformed PDFs report firstObjNum=1 when entries actually start at 0. + // The free list head (gen 65535, type f) is always object 0. + const p = parser(`xref +1 4 +0000000000 65535 f +0000000015 00000 n +0000000074 00000 n +0000000120 00000 n +trailer +<< /Size 4 /Root 1 0 R >> +`); + const result = p.parseTable(); + + expect(result.entries.size).toBe(4); + + // Entry should be corrected to object 0 (not 1) + const entry0 = result.entries.get(0); + expect(entry0).toBeDefined(); + expect(entry0!.type).toBe("free"); + if (entry0!.type === "free") { + expect(entry0!.generation).toBe(65535); + } + + // Object 1 should be at offset 15 + const entry1 = result.entries.get(1); + expect(entry1).toBeDefined(); + expect(entry1!.type).toBe("uncompressed"); + if (entry1!.type === "uncompressed") { + expect(entry1!.offset).toBe(15); + } + + // Object 3 should be at offset 120 + const entry3 = result.entries.get(3); + expect(entry3).toBeDefined(); + expect(entry3!.type).toBe("uncompressed"); + if (entry3!.type === "uncompressed") { + expect(entry3!.offset).toBe(120); + } + }); }); describe("trailer parsing", () => { diff --git a/src/parser/xref-parser.ts b/src/parser/xref-parser.ts index 8c92a84..83e1f9c 100644 --- a/src/parser/xref-parser.ts +++ b/src/parser/xref-parser.ts @@ -353,10 +353,32 @@ export class XRefParser { this.skipWhitespaceFromCurrent(); // Read entries + const parsedEntries: XRefEntry[] = []; + for (let i = 0; i < count; i++) { - const objNum = firstObjNum + i; - const entry = this.parseEntry(); - entries.set(objNum, entry); + parsedEntries.push(this.parseEntry()); + } + + // Detect off-by-one 
in subsection start: some malformed PDFs report + // firstObjNum=1 when the entries actually start at object 0. + // The free list head (generation 65535, type free) is always object 0, + // so if it is the first entry of a subsection declared to start at 1, renumber from 0. (Same fix as pdf.js #3248/#7229) + let correctedFirstObjNum = firstObjNum; + + if ( + firstObjNum === 1 && + parsedEntries.length > 0 && + parsedEntries[0].type === "free" && + parsedEntries[0].generation === 65535 + ) { + correctedFirstObjNum = 0; + console.warn( + "XRef: corrected subsection start from 1 to 0 (free list head at wrong position)", + ); + } + + for (let i = 0; i < parsedEntries.length; i++) { + entries.set(correctedFirstObjNum + i, parsedEntries[i]); } }