diff --git a/CHANGELOG.md b/CHANGELOG.md index fdd86097e..514432399 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [minor] + +> Development of this release was made on a volunteer basis by [@Ndpnt](http://github.com/ndpnt). + +### Changed + +- Set realistic viewport size for full DOM fetcher to avoid detection based on default Puppeteer dimensions +- Improve isolation between fetches by using isolated browser contexts + ## 10.2.0 - 2026-01-08 _Full changeset and discussions: [#1219](https://github.com/OpenTermsArchive/engine/pull/1219)._ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e8751de10..76d5b9e1e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,6 +15,7 @@ First of all, thanks for taking the time to contribute! 🎉👍 - [Namespaces](#namespaces) - [Practices](#practices) - [Errors handling](#errors-handling) + - [Bot detection evasion](#bot-detection-evasion) - [List a new contributor](#list-a-new-contributor-in-the-open-terms-archive-website) - - - @@ -226,6 +227,25 @@ In contrast, we consider errors from the `recorder` module as fatal, and we cras This section is highly inspired, and in part extracted, from [this error handling guide](https://console.joyent.com/node-js/production/design/errors). +### Bot detection evasion + +The fetching system uses a two-tier strategy with automatic fallback: + +1. **HTML-Only fetcher**: Lightweight HTTP client based on `node-fetch` with no bot detection evasion mechanisms +2. **Full DOM fetcher**: Headless browser based on `puppeteer-extra` with stealth plugin + +By default, the system first attempts an HTML-Only fetch. 
If bot blocking is detected (HTTP 403, 406, 502, or ECONNRESET), it automatically falls back to Full DOM. Use of the Full DOM fetcher can be forced via `executeClientScripts: true` in the service declaration. + +The HTML-Only fetcher **intentionally has no evasion mechanisms**. Since fallback to Full DOM is automatic and fallback cases are rare, investing in evasion techniques for the HTML-Only fetcher is not necessary. + +The Full DOM fetcher uses: + +- **Stealth plugin** (`puppeteer-extra-plugin-stealth`) with default configuration to mask browser automation markers +- **Realistic viewport** of 1920x1080 pixels +- **Isolated browser context** for each request, ensuring complete isolation (cookies, storage, cache) + +This design **prioritizes determinism** (same URL → same content) over simulating a persistent user session. This choice is essential for the archiving use case: the content retrieved must be consistent and reproducible, regardless of previous fetch operations. + ## List a new contributor in the Open Terms Archive website We acknowledge the efforts of our contributors by listing them on our [website](https://opentermsarchive.org) and this is made possible by the use of the [All Contributors bot](https://allcontributors.org/docs/en/bot/overview). 
diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 9ad8c63d4..85858238c 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -8,6 +8,7 @@ puppeteer.use(stealthPlugin()); let browser; export default async function fetch(url, cssSelectors, config) { + let context; let page; let response; const selectors = [].concat(cssSelectors); @@ -17,16 +18,13 @@ export default async function fetch(url, cssSelectors, config) { } try { - page = await browser.newPage(); + context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache) + page = await context.newPage(); + await page.setViewport({ width: 1920, height: 1080 }); // Set a realistic viewport size to avoid detection based on default Puppeteer dimensions (800x600) await page.setDefaultNavigationTimeout(config.navigationTimeout); await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); - await page.setCacheEnabled(false); // Disable cache to ensure fresh content on each fetch and prevent stale data from previous requests - const client = await page.target().createCDPSession(); - - await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs - if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) { await page.authenticate(browser.proxyCredentials); } @@ -78,6 +76,9 @@ export default async function fetch(url, cssSelectors, config) { if (page) { await page.close(); } + if (context) { + await context.close(); // Close the isolated context to free resources and ensure complete cleanup + } } }