From ec1caa7168812c74a5d711bef38fd57cc44396b9 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 20 Jan 2026 10:01:42 +0100 Subject: [PATCH 1/6] Extract function for readability --- src/archivist/fetcher/fullDomFetcher.js | 111 ++++++++++++++---------- 1 file changed, 66 insertions(+), 45 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index ee05e3925..b9dca70bb 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -8,9 +8,6 @@ let browser; export default async function fetch(url, cssSelectors, config) { puppeteer.use(stealthPlugin({ locale: config.language })); - let context; - let page; - let client; let response; const selectors = [].concat(cssSelectors); @@ -18,24 +15,20 @@ export default async function fetch(url, cssSelectors, config) { throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".'); } + let context; + let page; + let client; + try { context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache) page = await context.newPage(); + client = await page.createCDPSession(); + + await configurePage(page, client, config); - await page.setViewport({ width: 1920, height: 1080 }); // Set a realistic viewport size to avoid detection based on default Puppeteer dimensions (800x600) - await page.setDefaultNavigationTimeout(config.navigationTimeout); - await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); - // Use CDP to ensure the browser language is set correctly (most reliable method, see https://zirkelc.dev/posts/puppeteer-language-experiment) - client = await page.createCDPSession(); - await client.send('Network.setUserAgentOverride', { - userAgent: await browser.userAgent(), - acceptLanguage: config.language, - }); - if 
(browser.proxyCredentials?.username && browser.proxyCredentials?.password) { - await page.authenticate(browser.proxyCredentials); } response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout. @@ -46,31 +39,11 @@ export default async function fetch(url, cssSelectors, config) { const statusCode = response.status(); - if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) { + if (!isValidHttpStatus(statusCode)) { throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`); } - const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => - page.waitForFunction( - cssSelector => { - const element = document.querySelector(cssSelector); // eslint-disable-line no-undef - - return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading - }, - { timeout: config.waitForElementsTimeout }, - selector, - )); - - // We expect all elements to be present on the page… - await Promise.all(waitForSelectorsPromises).catch(error => { - if (error.name == 'TimeoutError') { - // however, if they are not, this is not considered as an error since selectors may be out of date - // and the whole content of the page should still be returned. 
- return; - } - - throw error; - }); + await waitForSelectors(page, selectors, config.waitForElementsTimeout); return { mimeType: 'text/html', @@ -82,15 +55,7 @@ export default async function fetch(url, cssSelectors, config) { } throw new Error(error.message); } finally { - if (client) { - await client.detach(); - } - if (page) { - await page.close(); - } - if (context) { - await context.close(); // Close the isolated context to free resources and ensure complete cleanup - } + await cleanupPage(client, page, context); } } @@ -151,3 +116,59 @@ export async function stopHeadlessBrowser() { await browser.close(); browser = null; } + +function isValidHttpStatus(status) { + return (status >= 200 && status < 300) || status === 304; +} + +async function configurePage(page, client, config) { + await page.setViewport({ width: 1920, height: 1080 }); // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600) + await page.setDefaultNavigationTimeout(config.navigationTimeout); + await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); + + // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment) + await client.send('Network.setUserAgentOverride', { + userAgent: await browser.userAgent(), + acceptLanguage: config.language, + }); + + if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) { + await page.authenticate(browser.proxyCredentials); + } +} + +async function waitForSelectors(page, selectors, timeout) { + const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => + page.waitForFunction( + cssSelector => { + const element = document.querySelector(cssSelector); // eslint-disable-line no-undef + + return element?.textContent.trim().length; // Ensures element exists and has non-empty text + }, + { timeout }, + selector, + )); + + // We expect all elements to be present on the page… + await 
Promise.all(waitForSelectorsPromises).catch(error => { + if (error.name == 'TimeoutError') { + // however, if they are not, this is not considered as an error since selectors may be out of date + // and the whole content of the page should still be returned. + return; + } + + throw error; + }); +} + +async function cleanupPage(client, page, context) { + if (client) { + await client.detach().catch(() => {}); + } + if (page) { + await page.close().catch(() => {}); + } + if (context) { + await context.close().catch(() => {}); // Close the isolated context to free resources and ensure complete cleanup + } +} From c589f6c4174490cc3493fb1e8a1d0e3fcb5a9889 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 20 Jan 2026 10:26:39 +0100 Subject: [PATCH 2/6] Add PDF file support in full DOM fetcher --- src/archivist/fetcher/fullDomFetcher.js | 81 +++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 4 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index b9dca70bb..0d8c73c6a 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -8,9 +8,6 @@ let browser; export default async function fetch(url, cssSelectors, config) { puppeteer.use(stealthPlugin({ locale: config.language })); - let response; - const selectors = [].concat(cssSelectors); - if (!browser) { throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".'); } @@ -26,12 +23,44 @@ export default async function fetch(url, cssSelectors, config) { await configurePage(page, client, config); + const selectors = [].concat(cssSelectors).filter(Boolean); + + let pdf = {}; + let handled = null; + + if (!selectors.length) { // CSS selectors are specified only for HTML content and omitted when fetching a PDF + ({ pdf, handled } = setupPdfInterception(client)); + } + + let response; + let navigationAborted = false; + + try { + response = await 
page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout. + } catch (error) { + if (!error.message.includes('net::ERR_ABORTED')) { + throw error; + } + navigationAborted = true; // Chrome sometimes aborts navigation for PDFs + if (handled) { // Wait for PDF interception to complete (null if interception is not enabled) + await handled; + } + } + // Return PDF if intercepted (aborted navigation or loaded in Chrome's PDF viewer) + if (pdf.content) { + return { mimeType: 'application/pdf', content: pdf.content }; } - response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout. 
+ if (navigationAborted) { + if (pdf.status) { + throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`); + } + + throw new Error(`Navigation aborted when trying to fetch '${url}'`); + } if (!response) { throw new Error(`Response is empty when trying to fetch '${url}'`); @@ -53,6 +82,7 @@ export default async function fetch(url, cssSelectors, config) { if (error.name === 'TimeoutError') { throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`); } + throw new Error(error.message); } finally { await cleanupPage(client, page, context); @@ -137,6 +167,49 @@ async function configurePage(page, client, config) { } } +function setupPdfInterception(client) { + const pdf = { content: null, status: null }; + const { promise: handled, resolve: onHandled } = Promise.withResolvers(); + + client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }); // Intercept all responses before Chrome processes them, allowing to capture PDF content before it's handled by the PDF viewer + + client.on('Fetch.requestPaused', async ({ requestId, resourceType, responseHeaders, responseStatusCode }) => { + try { + const contentType = responseHeaders?.find(header => header.name.toLowerCase() === 'content-type')?.value; + + if (!contentType?.includes('application/pdf')) { + return; + } + + pdf.status = responseStatusCode; + + if (!isValidHttpStatus(responseStatusCode)) { + return; + } + + try { + const { body, base64Encoded } = await client.send('Fetch.getResponseBody', { requestId }); + + pdf.content = Buffer.from(body, base64Encoded ? 
'base64' : 'utf8'); + } catch { + // Response body may be unavailable due to network error or connection interruption + } + } finally { + try { + await client.send('Fetch.continueResponse', { requestId }); + } catch { + // Client may have been closed by cleanupPage() in fetch() while this async callback was still running + } + + if (resourceType === 'Document') { // Signal that the main navigation request has been processed + onHandled(); + } + } + }); + + return { pdf, handled }; +} + async function waitForSelectors(page, selectors, timeout) { const waitForSelectorsPromises = selectors.filter(Boolean).map(selector => page.waitForFunction( From c620ba4b2796b01d11ec12028aeb63f95e89c579 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 20 Jan 2026 15:03:27 +0100 Subject: [PATCH 3/6] Improve code readability --- src/archivist/fetcher/fullDomFetcher.js | 29 +++++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 0d8c73c6a..db83e89e3 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -38,27 +38,32 @@ export default async function fetch(url, cssSelectors, config) { try { response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout. } catch (error) { - if (!error.message.includes('net::ERR_ABORTED')) { + if (error.message.includes('net::ERR_ABORTED')) { + // Chrome may sometimes abort navigation for files such as PDFs. + // Do not throw for now; wait for the PDF interception handler to finish processing the response. 
+ navigationAborted = true; + } else { throw error; } + } - navigationAborted = true; // Chrome sometimes aborts navigation for PDFs + // PDF interception handling + if (handled) { + await handled; // Wait for the interception callback to finish processing the response - if (handled) { // Wait for PDF interception to complete (null if interception is not enabled) - await handled; + if (pdf.content) { + return { + mimeType: 'application/pdf', + content: pdf.content, + }; } - } - - // Return PDF if intercepted (aborted navigation or loaded in Chrome's PDF viewer) - if (pdf.content) { - return { mimeType: 'application/pdf', content: pdf.content }; - } - if (navigationAborted) { - if (pdf.status) { + if (pdf.status) { // Status captured by CDP interception throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`); } + } + if (navigationAborted) { throw new Error(`Navigation aborted when trying to fetch '${url}'`); } From 7a2436b514e8ff8f1dcb6eb40ee3c5439257ceb5 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 20 Jan 2026 14:28:16 +0100 Subject: [PATCH 4/6] Add test --- src/archivist/fetcher/fullDomFetcher.test.js | 32 ++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 2ebd6ea67..3737eb4fa 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -1,10 +1,15 @@ +import fs from 'fs'; import http from 'http'; +import path from 'path'; +import { fileURLToPath } from 'url'; import { expect, use } from 'chai'; import chaiAsPromised from 'chai-as-promised'; import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js'; +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + const SERVER_PORT = 8977; use(chaiAsPromised); @@ -16,6 +21,7 @@ describe('Full DOM Fetcher', function () { this.timeout(60000); let temporaryServer; + let 
expectedPDFContent; before(async () => { await launchHeadlessBrowser(); @@ -27,6 +33,10 @@ describe('Full DOM Fetcher', function () { if (request.url === '/delayed-content') { response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML); } + if (request.url === '/terms.pdf') { + expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf')); + response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent); + } return response.end(); }).listen(SERVER_PORT); @@ -85,5 +95,27 @@ describe('Full DOM Fetcher', function () { await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`); }); }); + + context('when URL targets a PDF file', () => { + let content; + let mimeType; + const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`; + + before(async () => { + ({ content, mimeType } = await fetch(pdfUrl, [], config)); + }); + + it('returns a buffer for PDF content', () => { + expect(content).to.be.an.instanceOf(Buffer); + }); + + it('returns the correct MIME type', () => { + expect(mimeType).to.equal('application/pdf'); + }); + + it('returns the PDF file content', () => { + expect(content.equals(expectedPDFContent)).to.be.true; + }); + }); }); }); From 8d2c062468e8ad1ef3b2ff366fe6ca0df626ab93 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 20 Jan 2026 14:32:40 +0100 Subject: [PATCH 5/6] Add changelog entry --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99d7aac92..11ede7c19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. 
This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [minor] + +> Development of this release was supported by [Reset Tech](https://www.reset.tech). + +### Added + +- Add PDF file support in full DOM fetcher + ## 10.4.0 - 2026-01-19 > Development of this release was supported by [Reset Tech](https://www.reset.tech). From be5af6cec45b09dfd4ea33e448c9f05c54f2d0f8 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Tue, 20 Jan 2026 15:24:57 +0100 Subject: [PATCH 6/6] Avoid using Promise.withResolvers Promise.withResolvers was introduced in Node 22 --- src/archivist/fetcher/fullDomFetcher.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index db83e89e3..bef0e6afc 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -174,7 +174,8 @@ async function configurePage(page, client, config) { function setupPdfInterception(client) { const pdf = { content: null, status: null }; - const { promise: handled, resolve: onHandled } = Promise.withResolvers(); + let onHandled; + const handled = new Promise(resolve => { onHandled = resolve; }); client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }); // Intercept all responses before Chrome processes them, allowing to capture PDF content before it's handled by the PDF viewer