From dac2454695b7d7a92de0a5946aa2e71e993ba047 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Sat, 27 Dec 2025 11:56:26 +0100 Subject: [PATCH 1/4] Use isolated browser context Instead of manual cookie clearing --- src/archivist/fetcher/fullDomFetcher.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 9ad8c63d4..1524361f7 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -8,6 +8,7 @@ puppeteer.use(stealthPlugin()); let browser; export default async function fetch(url, cssSelectors, config) { + let context; let page; let response; const selectors = [].concat(cssSelectors); @@ -17,16 +18,12 @@ export default async function fetch(url, cssSelectors, config) { } try { - page = await browser.newPage(); + context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache) + page = await context.newPage(); await page.setDefaultNavigationTimeout(config.navigationTimeout); await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); - await page.setCacheEnabled(false); // Disable cache to ensure fresh content on each fetch and prevent stale data from previous requests - const client = await page.target().createCDPSession(); - - await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs - if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) { await page.authenticate(browser.proxyCredentials); } @@ -78,6 +75,9 @@ export default async function fetch(url, cssSelectors, config) { if (page) { await page.close(); } + if (context) { + await context.close(); // Close the isolated context to free resources and ensure complete cleanup + } } } From 
1152b4cb7f8bfe8ae654f19f6cf15cdd03f468b5 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Sat, 27 Dec 2025 11:58:50 +0100 Subject: [PATCH 2/4] Set realistic viewport size Avoid detection based on default Puppeteer dimensions (800x600) --- src/archivist/fetcher/fullDomFetcher.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 1524361f7..85858238c 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -21,6 +21,7 @@ export default async function fetch(url, cssSelectors, config) { context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache) page = await context.newPage(); + await page.setViewport({ width: 1920, height: 1080 }); // Set a realistic viewport size to avoid detection based on default Puppeteer dimensions (800x600) await page.setDefaultNavigationTimeout(config.navigationTimeout); await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); From 732f32c7200aff97de828964a3f1a3272e31fc17 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Sat, 27 Dec 2025 12:21:46 +0100 Subject: [PATCH 3/4] Add Bot detection evasion section to contributing Clarify architecture rationale: HTML-Only fetcher needs no evasion mechanisms due to automatic Full DOM fallback. Emphasize determinism as core design principle for isolation strategy. --- CONTRIBUTING.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e8751de10..76d5b9e1e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,6 +15,7 @@ First of all, thanks for taking the time to contribute! 
🎉👍 - [Namespaces](#namespaces) - [Practices](#practices) - [Errors handling](#errors-handling) + - [Bot detection evasion](#bot-detection-evasion) - [List a new contributor](#list-a-new-contributor-in-the-open-terms-archive-website) - - - @@ -226,6 +227,25 @@ In contrast, we consider errors from the `recorder` module as fatal, and we cras This section is highly inspired, and in part extracted, from [this error handling guide](https://console.joyent.com/node-js/production/design/errors). +### Bot detection evasion + +The fetching system uses a two-tier strategy with automatic fallback: + +1. **HTML-Only fetcher**: Lightweight HTTP client based on `node-fetch` with no bot detection evasion mechanisms +2. **Full DOM fetcher**: Headless browser based on `puppeteer-extra` with stealth plugin + +By default, the system first attempts an HTML-Only fetch. If bot blocking is detected (HTTP 403, 406, 502, or ECONNRESET), it automatically falls back to Full DOM. Use of the Full DOM fetcher can be forced via `executeClientScripts: true` in the service declaration. + +The HTML-Only fetcher **intentionally has no evasion mechanisms**. Since fallback to Full DOM is automatic and fallback cases are rare, investing in evasion techniques for the HTML-Only fetcher is not necessary. + +The Full DOM fetcher uses: + +- **Stealth plugin** (`puppeteer-extra-plugin-stealth`) with default configuration to mask browser automation markers +- **Realistic viewport** of 1920x1080 pixels +- **Isolated browser context** for each request, ensuring complete isolation (cookies, storage, cache) + +This design **prioritizes determinism** (same URL → same content) over simulating a persistent user session. This choice is essential for the archiving use case: the content retrieved must be consistent and reproducible, regardless of previous fetch operations. 
+ ## List a new contributor in the Open Terms Archive website We acknowledge the efforts of our contributors by listing them on our [website](https://opentermsarchive.org) and this is made possible by the use of the [All Contributors bot](https://allcontributors.org/docs/en/bot/overview). From 95d9b8ea24945b0828e28769aa7cd34525ddc94c Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 8 Jan 2026 16:30:10 +0100 Subject: [PATCH 4/4] Add changelog entry --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdd86097e..514432399 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [minor] + +> Development of this release was made on a volunteer basis by [@Ndpnt](http://github.com/ndpnt). + +### Changed + +- Set realistic viewport size for full DOM fetcher to avoid detection based on default Puppeteer dimensions +- Improve isolation between fetches by using isolated browser contexts + ## 10.2.0 - 2026-01-08 _Full changeset and discussions: [#1219](https://github.com/OpenTermsArchive/engine/pull/1219)._