Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased [minor]

> Development of this release was supported by [Reset Tech](https://www.reset.tech).

### Added

- Add PDF file support in full DOM fetcher

## 10.4.0 - 2026-01-19

> Development of this release was supported by [Reset Tech](https://www.reset.tech).
Expand Down
198 changes: 149 additions & 49 deletions src/archivist/fetcher/fullDomFetcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,69 +8,76 @@ let browser;
export default async function fetch(url, cssSelectors, config) {
puppeteer.use(stealthPlugin({ locale: config.language }));

let context;
let page;
let client;
let response;
const selectors = [].concat(cssSelectors);

if (!browser) {
throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
}

let context;
let page;
let client;

try {
context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache)
page = await context.newPage();
client = await page.createCDPSession();

await page.setViewport({ width: 1920, height: 1080 }); // Set a realistic viewport size to avoid detection based on default Puppeteer dimensions (800x600)
await page.setDefaultNavigationTimeout(config.navigationTimeout);
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
await configurePage(page, client, config);

// Use CDP to ensure the browser language is set correctly (most reliable method, see https://zirkelc.dev/posts/puppeteer-language-experiment)
client = await page.createCDPSession();
const selectors = [].concat(cssSelectors).filter(Boolean);

await client.send('Network.setUserAgentOverride', {
userAgent: await browser.userAgent(),
acceptLanguage: config.language,
});
let pdf = {};
let handled = null;

if (!selectors.length) { // CSS selectors are specified only for HTML content and omitted when fetching a PDF
({ pdf, handled } = setupPdfInterception(client));
}

if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
await page.authenticate(browser.proxyCredentials);
let response;
let navigationAborted = false;

try {
response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
} catch (error) {
if (error.message.includes('net::ERR_ABORTED')) {
// Chrome may sometimes abort navigation for files such as PDFs.
// Do not throw for now; wait for the PDF interception handler to finish processing the response.
navigationAborted = true;
} else {
throw error;
}
}

response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
// PDF interception handling
if (handled) {
await handled; // Wait for the interception callback to finish processing the response

if (pdf.content) {
return {
mimeType: 'application/pdf',
content: pdf.content,
};
}

if (pdf.status) { // Status captured by CDP interception
throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`);
}
}

if (navigationAborted) {
throw new Error(`Navigation aborted when trying to fetch '${url}'`);
}

if (!response) {
throw new Error(`Response is empty when trying to fetch '${url}'`);
}

const statusCode = response.status();

if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
if (!isValidHttpStatus(statusCode)) {
throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
}

const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
page.waitForFunction(
cssSelector => {
const element = document.querySelector(cssSelector); // eslint-disable-line no-undef

return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
},
{ timeout: config.waitForElementsTimeout },
selector,
));

// We expect all elements to be present on the page…
await Promise.all(waitForSelectorsPromises).catch(error => {
if (error.name == 'TimeoutError') {
// however, if they are not, this is not considered as an error since selectors may be out of date
// and the whole content of the page should still be returned.
return;
}

throw error;
});
await waitForSelectors(page, selectors, config.waitForElementsTimeout);

return {
mimeType: 'text/html',
Expand All @@ -80,17 +87,10 @@ export default async function fetch(url, cssSelectors, config) {
if (error.name === 'TimeoutError') {
throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
}

throw new Error(error.message);
} finally {
if (client) {
await client.detach();
}
if (page) {
await page.close();
}
if (context) {
await context.close(); // Close the isolated context to free resources and ensure complete cleanup
}
await cleanupPage(client, page, context);
}
}

Expand Down Expand Up @@ -151,3 +151,103 @@ export async function stopHeadlessBrowser() {
await browser.close();
browser = null;
}

/**
 * Tells whether an HTTP status code is accepted as a successful fetch.
 *
 * @param {number} status - HTTP status code to check.
 * @returns {boolean} `true` for any 2xx code and for 304, `false` otherwise.
 */
function isValidHttpStatus(status) {
  if (status === 304) {
    return true;
  }

  return status >= 200 && status < 300;
}

/**
 * Applies the per-fetch settings to a page before navigation: viewport size,
 * navigation timeout, language headers, user agent override and, when the
 * module-level `browser` carries proxy credentials, proxy authentication.
 *
 * @param {object} page - Page exposing `setViewport`, `setDefaultNavigationTimeout`, `setExtraHTTPHeaders` and `authenticate`.
 * @param {object} client - CDP session of the page, used through `send`.
 * @param {object} config - Fetch configuration; `navigationTimeout` and `language` are read.
 * @returns {Promise<void>}
 */
async function configurePage(page, client, config) {
  // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600)
  const viewport = { width: 1920, height: 1080 };

  await page.setViewport(viewport);
  await page.setDefaultNavigationTimeout(config.navigationTimeout);
  await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });

  // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment)
  const userAgent = await browser.userAgent();

  await client.send('Network.setUserAgentOverride', { userAgent, acceptLanguage: config.language });

  // Proxy authentication is only attempted when both a username and a password are set
  const { proxyCredentials } = browser;

  if (!proxyCredentials?.username || !proxyCredentials?.password) {
    return;
  }

  await page.authenticate(browser.proxyCredentials);
}

/**
 * Enables CDP response interception on the given session so that the body of a
 * PDF response can be captured before the browser processes it.
 *
 * The function is synchronous: it registers the listener, fires `Fetch.enable`
 * without waiting for it, and returns immediately.
 *
 * @param {object} client - CDP session exposing `send(method, params)` and `on(event, listener)`.
 * @returns {{ pdf: { content: (Buffer|null), status: (number|null) }, handled: Promise<void> }}
 *   `pdf` is filled in by the interception callback: `status` for any response
 *   whose content type contains `application/pdf`, `content` only when that
 *   status is valid and the body could be read. `handled` resolves once a paused
 *   response of resource type `Document` has been continued, or when
 *   interception could not be enabled at all.
 */
function setupPdfInterception(client) {
  const pdf = { content: null, status: null };
  let onHandled;
  const handled = new Promise(resolve => { onHandled = resolve; });

  // Register the listener first so the handler is in place before interception is enabled
  client.on('Fetch.requestPaused', async ({ requestId, resourceType, responseHeaders, responseStatusCode }) => {
    try {
      const contentType = responseHeaders?.find(header => header.name.toLowerCase() === 'content-type')?.value;

      if (!contentType?.includes('application/pdf')) {
        return;
      }

      pdf.status = responseStatusCode;

      if (!isValidHttpStatus(responseStatusCode)) {
        return;
      }

      try {
        const { body, base64Encoded } = await client.send('Fetch.getResponseBody', { requestId });

        pdf.content = Buffer.from(body, base64Encoded ? 'base64' : 'utf8');
      } catch {
        // Response body may be unavailable due to network error or connection interruption
      }
    } finally {
      // Always let the paused response go through, whatever was captured, so the navigation is not left blocked
      try {
        await client.send('Fetch.continueResponse', { requestId });
      } catch {
        // Client may have been closed by cleanupPage() in fetch() while this async callback was still running
      }

      // NOTE(review): a redirect response for the main document presumably also arrives here as 'Document' — confirm that signalling on it is intended
      if (resourceType === 'Document') { // Signal that the main navigation request has been processed
        onHandled();
      }
    }
  });

  // Intercept all responses at the `Response` stage so that PDF content can be captured before it is handled by the PDF viewer
  client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }).catch(() => {
    // Without interception no `Fetch.requestPaused` event will ever fire: resolve `handled` so that
    // the caller does not wait forever and falls back to the outcome of the regular navigation,
    // instead of leaving this rejection unhandled.
    onHandled();
  });

  return { pdf, handled };
}

/**
 * Waits until every given CSS selector matches an element with non-empty text.
 *
 * A timeout is tolerated: selectors may be out of date and the caller is still
 * expected to return the whole content of the page.
 *
 * @param {object} page - Page exposing `waitForFunction(fn, options, ...args)`.
 * @param {Array<string>} selectors - CSS selectors to wait for; falsy entries are skipped.
 * @param {number} timeout - Value passed as the `timeout` option of each wait.
 * @returns {Promise<void>}
 * @throws Any error other than a `TimeoutError` raised while waiting.
 */
async function waitForSelectors(page, selectors, timeout) {
  const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
    page.waitForFunction(
      cssSelector => {
        const element = document.querySelector(cssSelector); // eslint-disable-line no-undef

        return element?.textContent.trim().length; // Ensures element exists and has non-empty text
      },
      { timeout },
      selector,
    ));

  try {
    // We expect all elements to be present on the page…
    await Promise.all(waitForSelectorsPromises);
  } catch (error) {
    if (error.name === 'TimeoutError') {
      // however, if they are not, this is not considered as an error since selectors may be out of date
      // and the whole content of the page should still be returned.
      return;
    }

    throw error;
  }
}

/**
 * Releases the resources created for a fetch, one after the other: detaches the
 * CDP session, closes the page, then closes the isolated browser context.
 * Missing resources are skipped and a rejected teardown step is ignored, so
 * the remaining steps still run.
 *
 * @param {object} [client] - CDP session to detach.
 * @param {object} [page] - Page to close.
 * @param {object} [context] - Browser context to close.
 * @returns {Promise<void>}
 */
async function cleanupPage(client, page, context) {
  const ignoreFailure = () => {};
  const teardownSteps = [
    [ client, session => session.detach() ],
    [ page, openedPage => openedPage.close() ],
    [ context, isolatedContext => isolatedContext.close() ], // Close the isolated context to free resources and ensure complete cleanup
  ];

  for (const [ resource, release ] of teardownSteps) {
    if (!resource) {
      continue;
    }

    await release(resource).catch(ignoreFailure);
  }
}
32 changes: 32 additions & 0 deletions src/archivist/fetcher/fullDomFetcher.test.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import fs from 'fs';
import http from 'http';
import path from 'path';
import { fileURLToPath } from 'url';

import { expect, use } from 'chai';
import chaiAsPromised from 'chai-as-promised';

import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const SERVER_PORT = 8977;

use(chaiAsPromised);
Expand All @@ -16,6 +21,7 @@ describe('Full DOM Fetcher', function () {
this.timeout(60000);

let temporaryServer;
let expectedPDFContent;

before(async () => {
await launchHeadlessBrowser();
Expand All @@ -27,6 +33,10 @@ describe('Full DOM Fetcher', function () {
if (request.url === '/delayed-content') {
response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
}
if (request.url === '/terms.pdf') {
expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
}

return response.end();
}).listen(SERVER_PORT);
Expand Down Expand Up @@ -85,5 +95,27 @@ describe('Full DOM Fetcher', function () {
await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
});
});

context('when URL targets a PDF file', () => {
let content;
let mimeType;
const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;

before(async () => {
({ content, mimeType } = await fetch(pdfUrl, [], config));
});

it('returns a buffer for PDF content', () => {
expect(content).to.be.an.instanceOf(Buffer);
});

it('returns the correct MIME type', () => {
expect(mimeType).to.equal('application/pdf');
});

it('returns the PDF file content', () => {
expect(content.equals(expectedPDFContent)).to.be.true;
});
});
});
});
Loading