From ec1caa7168812c74a5d711bef38fd57cc44396b9 Mon Sep 17 00:00:00 2001
From: Nicolas Dupont <npg.dupont@gmail.com>
Date: Tue, 20 Jan 2026 10:01:42 +0100
Subject: [PATCH 1/6] Extract helper functions for readability

---
 src/archivist/fetcher/fullDomFetcher.js | 111 ++++++++++++++----------
 1 file changed, 66 insertions(+), 45 deletions(-)

diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js
index ee05e3925..b9dca70bb 100644
--- a/src/archivist/fetcher/fullDomFetcher.js
+++ b/src/archivist/fetcher/fullDomFetcher.js
@@ -8,9 +8,6 @@ let browser;
 export default async function fetch(url, cssSelectors, config) {
   puppeteer.use(stealthPlugin({ locale: config.language }));
 
-  let context;
-  let page;
-  let client;
   let response;
   const selectors = [].concat(cssSelectors);
 
@@ -18,24 +15,20 @@ export default async function fetch(url, cssSelectors, config) {
     throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
   }
 
+  let context;
+  let page;
+  let client;
+
   try {
     context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache)
     page = await context.newPage();
+    client = await page.createCDPSession();
+
+    await configurePage(page, client, config);
 
-    await page.setViewport({ width: 1920, height: 1080 }); // Set a realistic viewport size to avoid detection based on default Puppeteer dimensions (800x600)
-    await page.setDefaultNavigationTimeout(config.navigationTimeout);
-    await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
 
-    // Use CDP to ensure the browser language is set correctly (most reliable method, see https://zirkelc.dev/posts/puppeteer-language-experiment)
-    client = await page.createCDPSession();
 
-    await client.send('Network.setUserAgentOverride', {
-      userAgent: await browser.userAgent(),
-      acceptLanguage: config.language,
-    });
 
-    if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
-      await page.authenticate(browser.proxyCredentials);
     }
 
     response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
@@ -46,31 +39,11 @@ export default async function fetch(url, cssSelectors, config) {
 
     const statusCode = response.status();
 
-    if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
+    if (!isValidHttpStatus(statusCode)) {
       throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
     }
 
-    const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
-      page.waitForFunction(
-        cssSelector => {
-          const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
-
-          return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
-        },
-        { timeout: config.waitForElementsTimeout },
-        selector,
-      ));
-
-    // We expect all elements to be present on the page…
-    await Promise.all(waitForSelectorsPromises).catch(error => {
-      if (error.name == 'TimeoutError') {
-        // however, if they are not, this is not considered as an error since selectors may be out of date
-        // and the whole content of the page should still be returned.
-        return;
-      }
-
-      throw error;
-    });
+    await waitForSelectors(page, selectors, config.waitForElementsTimeout);
 
     return {
       mimeType: 'text/html',
@@ -82,15 +55,7 @@ export default async function fetch(url, cssSelectors, config) {
     }
     throw new Error(error.message);
   } finally {
-    if (client) {
-      await client.detach();
-    }
-    if (page) {
-      await page.close();
-    }
-    if (context) {
-      await context.close(); // Close the isolated context to free resources and ensure complete cleanup
-    }
+    await cleanupPage(client, page, context);
   }
 }
 
@@ -151,3 +116,59 @@ export async function stopHeadlessBrowser() {
   await browser.close();
   browser = null;
 }
+
+function isValidHttpStatus(status) {
+  return (status >= 200 && status < 300) || status === 304;
+}
+
+async function configurePage(page, client, config) {
+  await page.setViewport({ width: 1920, height: 1080 }); // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600)
+  await page.setDefaultNavigationTimeout(config.navigationTimeout);
+  await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
+
+  // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment)
+  await client.send('Network.setUserAgentOverride', {
+    userAgent: await browser.userAgent(),
+    acceptLanguage: config.language,
+  });
+
+  if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
+    await page.authenticate(browser.proxyCredentials);
+  }
+}
+
+async function waitForSelectors(page, selectors, timeout) {
+  const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
+    page.waitForFunction(
+      cssSelector => {
+        const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
+
+        return element?.textContent.trim().length; // Ensures element exists and has non-empty text
+      },
+      { timeout },
+      selector,
+    ));
+
+  // We expect all elements to be present on the page…
+  await Promise.all(waitForSelectorsPromises).catch(error => {
+    if (error.name == 'TimeoutError') {
+      // however, if they are not, this is not considered as an error since selectors may be out of date
+      // and the whole content of the page should still be returned.
+      return;
+    }
+
+    throw error;
+  });
+}
+
+async function cleanupPage(client, page, context) {
+  if (client) {
+    await client.detach().catch(() => {});
+  }
+  if (page) {
+    await page.close().catch(() => {});
+  }
+  if (context) {
+    await context.close().catch(() => {}); // Close the isolated context to free resources and ensure complete cleanup
+  }
+}

From c589f6c4174490cc3493fb1e8a1d0e3fcb5a9889 Mon Sep 17 00:00:00 2001
From: Nicolas Dupont <npg.dupont@gmail.com>
Date: Tue, 20 Jan 2026 10:26:39 +0100
Subject: [PATCH 2/6] Add PDF file support in full DOM fetcher

---
 src/archivist/fetcher/fullDomFetcher.js | 81 +++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 4 deletions(-)

diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js
index b9dca70bb..0d8c73c6a 100644
--- a/src/archivist/fetcher/fullDomFetcher.js
+++ b/src/archivist/fetcher/fullDomFetcher.js
@@ -8,9 +8,6 @@ let browser;
 export default async function fetch(url, cssSelectors, config) {
   puppeteer.use(stealthPlugin({ locale: config.language }));
 
-  let response;
-  const selectors = [].concat(cssSelectors);
-
   if (!browser) {
     throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
   }
@@ -26,12 +23,44 @@ export default async function fetch(url, cssSelectors, config) {
 
     await configurePage(page, client, config);
 
+    const selectors = [].concat(cssSelectors).filter(Boolean);
+
+    let pdf = {};
+    let handled = null;
+
+    if (!selectors.length) { // CSS selectors are specified only for HTML content and omitted when fetching a PDF
+      ({ pdf, handled } = setupPdfInterception(client));
+    }
+
+    let response;
+    let navigationAborted = false;
+
+    try {
+      response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
+    } catch (error) {
+      if (!error.message.includes('net::ERR_ABORTED')) {
+        throw error;
+      }
 
+      navigationAborted = true; // Chrome sometimes aborts navigation for PDFs
 
+      if (handled) { // Wait for PDF interception to complete (null if interception is not enabled)
+        await handled;
+      }
+    }
 
+    // Return PDF if intercepted (aborted navigation or loaded in Chrome's PDF viewer)
+    if (pdf.content) {
+      return { mimeType: 'application/pdf', content: pdf.content };
     }
 
-    response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
+    if (navigationAborted) {
+      if (pdf.status) {
+        throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`);
+      }
+
+      throw new Error(`Navigation aborted when trying to fetch '${url}'`);
+    }
 
     if (!response) {
       throw new Error(`Response is empty when trying to fetch '${url}'`);
@@ -53,6 +82,7 @@ export default async function fetch(url, cssSelectors, config) {
     if (error.name === 'TimeoutError') {
       throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
     }
+
     throw new Error(error.message);
   } finally {
     await cleanupPage(client, page, context);
@@ -137,6 +167,49 @@ async function configurePage(page, client, config) {
   }
 }
 
+function setupPdfInterception(client) {
+  const pdf = { content: null, status: null };
+  const { promise: handled, resolve: onHandled } = Promise.withResolvers();
+
+  client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }); // Intercept all responses before Chrome processes them, allowing to capture PDF content before it's handled by the PDF viewer
+
+  client.on('Fetch.requestPaused', async ({ requestId, resourceType, responseHeaders, responseStatusCode }) => {
+    try {
+      const contentType = responseHeaders?.find(header => header.name.toLowerCase() === 'content-type')?.value;
+
+      if (!contentType?.includes('application/pdf')) {
+        return;
+      }
+
+      pdf.status = responseStatusCode;
+
+      if (!isValidHttpStatus(responseStatusCode)) {
+        return;
+      }
+
+      try {
+        const { body, base64Encoded } = await client.send('Fetch.getResponseBody', { requestId });
+
+        pdf.content = Buffer.from(body, base64Encoded ? 'base64' : 'utf8');
+      } catch {
+        // Response body may be unavailable due to network error or connection interruption
+      }
+    } finally {
+      try {
+        await client.send('Fetch.continueResponse', { requestId });
+      } catch {
+        // Client may have been closed by cleanupPage() in fetch() while this async callback was still running
+      }
+
+      if (resourceType === 'Document') { // Signal that the main navigation request has been processed
+        onHandled();
+      }
+    }
+  });
+
+  return { pdf, handled };
+}
+
 async function waitForSelectors(page, selectors, timeout) {
   const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
     page.waitForFunction(

From c620ba4b2796b01d11ec12028aeb63f95e89c579 Mon Sep 17 00:00:00 2001
From: Nicolas Dupont <npg.dupont@gmail.com>
Date: Tue, 20 Jan 2026 15:03:27 +0100
Subject: [PATCH 3/6] Improve code readability

---
 src/archivist/fetcher/fullDomFetcher.js | 29 +++++++++++++++----------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js
index 0d8c73c6a..db83e89e3 100644
--- a/src/archivist/fetcher/fullDomFetcher.js
+++ b/src/archivist/fetcher/fullDomFetcher.js
@@ -38,27 +38,32 @@ export default async function fetch(url, cssSelectors, config) {
     try {
       response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
     } catch (error) {
-      if (!error.message.includes('net::ERR_ABORTED')) {
+      if (error.message.includes('net::ERR_ABORTED')) {
+        // Chrome may sometimes abort navigation for files such as PDFs.
+        // Do not throw for now; wait for the PDF interception handler to finish processing the response.
+        navigationAborted = true;
+      } else {
         throw error;
       }
+    }
 
-      navigationAborted = true; // Chrome sometimes aborts navigation for PDFs
+    // PDF interception handling
+    if (handled) {
+      await handled; // Wait for the interception callback to finish processing the response
 
-      if (handled) { // Wait for PDF interception to complete (null if interception is not enabled)
-        await handled;
+      if (pdf.content) {
+        return {
+          mimeType: 'application/pdf',
+          content: pdf.content,
+        };
       }
-    }
-
-    // Return PDF if intercepted (aborted navigation or loaded in Chrome's PDF viewer)
-    if (pdf.content) {
-      return { mimeType: 'application/pdf', content: pdf.content };
-    }
 
-    if (navigationAborted) {
-      if (pdf.status) {
+      if (pdf.status) { // Status captured by CDP interception
         throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`);
       }
+    }
 
+    if (navigationAborted) {
       throw new Error(`Navigation aborted when trying to fetch '${url}'`);
     }
 

From 7a2436b514e8ff8f1dcb6eb40ee3c5439257ceb5 Mon Sep 17 00:00:00 2001
From: Nicolas Dupont <npg.dupont@gmail.com>
Date: Tue, 20 Jan 2026 14:28:16 +0100
Subject: [PATCH 4/6] Add test for PDF fetching in full DOM fetcher

---
 src/archivist/fetcher/fullDomFetcher.test.js | 32 ++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js
index 2ebd6ea67..3737eb4fa 100644
--- a/src/archivist/fetcher/fullDomFetcher.test.js
+++ b/src/archivist/fetcher/fullDomFetcher.test.js
@@ -1,10 +1,15 @@
+import fs from 'fs';
 import http from 'http';
+import path from 'path';
+import { fileURLToPath } from 'url';
 
 import { expect, use } from 'chai';
 import chaiAsPromised from 'chai-as-promised';
 
 import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
 
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
 const SERVER_PORT = 8977;
 
 use(chaiAsPromised);
@@ -16,6 +21,7 @@ describe('Full DOM Fetcher', function () {
   this.timeout(60000);
 
   let temporaryServer;
+  let expectedPDFContent;
 
   before(async () => {
     await launchHeadlessBrowser();
@@ -27,6 +33,10 @@ describe('Full DOM Fetcher', function () {
       if (request.url === '/delayed-content') {
         response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
       }
+      if (request.url === '/terms.pdf') {
+        expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
+        response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
+      }
 
       return response.end();
     }).listen(SERVER_PORT);
@@ -85,5 +95,27 @@ describe('Full DOM Fetcher', function () {
         await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
       });
     });
+
+    context('when URL targets a PDF file', () => {
+      let content;
+      let mimeType;
+      const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;
+
+      before(async () => {
+        ({ content, mimeType } = await fetch(pdfUrl, [], config));
+      });
+
+      it('returns a buffer for PDF content', () => {
+        expect(content).to.be.an.instanceOf(Buffer);
+      });
+
+      it('returns the correct MIME type', () => {
+        expect(mimeType).to.equal('application/pdf');
+      });
+
+      it('returns the PDF file content', () => {
+        expect(content.equals(expectedPDFContent)).to.be.true;
+      });
+    });
   });
 });

From 8d2c062468e8ad1ef3b2ff366fe6ca0df626ab93 Mon Sep 17 00:00:00 2001
From: Nicolas Dupont <npg.dupont@gmail.com>
Date: Tue, 20 Jan 2026 14:32:40 +0100
Subject: [PATCH 5/6] Add changelog entry

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 99d7aac92..11ede7c19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased [minor]
+
+> Development of this release was supported by [Reset Tech](https://www.reset.tech).
+
+### Added
+
+- Add PDF file support in full DOM fetcher
+
 ## 10.4.0 - 2026-01-19
 
 > Development of this release was supported by [Reset Tech](https://www.reset.tech).

From be5af6cec45b09dfd4ea33e448c9f05c54f2d0f8 Mon Sep 17 00:00:00 2001
From: Nicolas Dupont <npg.dupont@gmail.com>
Date: Tue, 20 Jan 2026 15:24:57 +0100
Subject: [PATCH 6/6] Avoid using Promise.withResolvers

Promise.withResolvers was introduced in Node 22
---
 src/archivist/fetcher/fullDomFetcher.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js
index db83e89e3..bef0e6afc 100644
--- a/src/archivist/fetcher/fullDomFetcher.js
+++ b/src/archivist/fetcher/fullDomFetcher.js
@@ -174,7 +174,8 @@ async function configurePage(page, client, config) {
 
 function setupPdfInterception(client) {
   const pdf = { content: null, status: null };
-  const { promise: handled, resolve: onHandled } = Promise.withResolvers();
+  let onHandled;
+  const handled = new Promise(resolve => { onHandled = resolve; });
 
   client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }); // Intercept all responses before Chrome processes them, allowing to capture PDF content before it's handled by the PDF viewer