From 414c1e08cd423958d2546f01d172a7de6b8e6c19 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sun, 29 Mar 2026 08:21:02 -0700 Subject: [PATCH 1/3] docs(openspec): add fetch access controls change --- .../add-fetch-access-controls/.openspec.yaml | 2 + .../add-fetch-access-controls/design.md | 146 ++++++++++++++++ .../add-fetch-access-controls/proposal.md | 26 +++ .../specs/configuration/spec.md | 115 ++++++++++++ .../specs/outbound-access-control/spec.md | 165 ++++++++++++++++++ .../add-fetch-access-controls/tasks.md | 42 +++++ 6 files changed, 496 insertions(+) create mode 100644 openspec/changes/add-fetch-access-controls/.openspec.yaml create mode 100644 openspec/changes/add-fetch-access-controls/design.md create mode 100644 openspec/changes/add-fetch-access-controls/proposal.md create mode 100644 openspec/changes/add-fetch-access-controls/specs/configuration/spec.md create mode 100644 openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md create mode 100644 openspec/changes/add-fetch-access-controls/tasks.md diff --git a/openspec/changes/add-fetch-access-controls/.openspec.yaml b/openspec/changes/add-fetch-access-controls/.openspec.yaml new file mode 100644 index 00000000..5e98b74b --- /dev/null +++ b/openspec/changes/add-fetch-access-controls/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-29 diff --git a/openspec/changes/add-fetch-access-controls/design.md b/openspec/changes/add-fetch-access-controls/design.md new file mode 100644 index 00000000..ae21b669 --- /dev/null +++ b/openspec/changes/add-fetch-access-controls/design.md @@ -0,0 +1,146 @@ +## Context + +The server intentionally supports arbitrary HTTP(S) and `file://` fetching across CLI, MCP, and web-triggered scraping flows. 
Today, fetcher selection and local file traversal are centralized enough to apply consistent controls, but there is no policy layer that limits outbound network targets, constrains local file roots, or handles hidden paths and symlink escapes. The project already has a strong configuration model with schema-backed defaults and environment/CLI overrides, so access controls should be introduced as configuration-driven behavior rather than interface-specific flags. + +## Goals / Non-Goals + +**Goals:** +- Add one shared outbound access policy that applies to `fetch_url` and scrape workflows regardless of entry point. +- Default-deny private, loopback, link-local, and similar special-use network targets unless explicitly configured. +- Add configurable local file access modes with allowed roots, `$DOCUMENTS` token expansion, and secure defaults for hidden paths and symlinks. +- Produce clear user-facing errors when policy blocks access. +- Preserve explicit opt-in paths for trusted local and self-hosted deployments. + +**Non-Goals:** +- Add per-user or per-token authorization rules. +- Implement a sandbox or OS-level file isolation boundary. +- Detect every possible cloud metadata alias or enterprise-internal hostname pattern out of the box. +- Redesign the scrape tool surface or add new transport-level authentication features. + +## Decisions + +### Introduce a shared scraper security policy under configuration + +Add a new `scraper.security` section to the typed config schema. This keeps policy close to the fetch/scrape subsystem and lets the existing precedence model handle config file, environment, and CLI overrides. + +Alternative considered: a top-level `security` block. Rejected because the affected behavior is currently isolated to scraper-driven URL and file access, and placing the settings under `scraper` avoids broadening the configuration surface prematurely. 
+ +### Enforce policy centrally before network or file reads + +Introduce a reusable access-policy helper that validates URLs and resolved file paths before `AutoDetectFetcher`, `HttpFetcher`, `FileFetcher`, and local crawl entry points perform I/O. DNS-resolved IP revalidation and redirect-target validation will be mandatory enforcement behavior, not optional configuration knobs. For archive-member URLs such as `file:///path/to/archive.zip/docs/readme.md`, policy evaluation will be based on the resolved backing archive file path plus the virtual entry path, not on the nonexistent combined filesystem path. + +Alternative considered: only validating at the MCP tool boundary. Rejected because CLI and web-triggered scraping must behave consistently, and lower-level enforcement prevents bypass through internal call paths. + +### Apply network policy to browser-initiated subrequests + +Treat browser-based fetching as a series of outbound requests rather than a single navigation. The same network policy must apply to the initial page load, redirects, frames, iframes, scripts, stylesheets, images, XHR/fetch calls, and any other subresource request initiated during rendering so that a public page cannot pivot the renderer into private network access. + +Alternative considered: only validating the initial browser navigation target. Rejected because it leaves a clear SSRF-style gap for render-time secondary requests. + +### Default-deny special network ranges while keeping explicit opt-in overrides + +Model network protection with one main switch, `allowPrivateNetworks`, that defaults to `false` and covers loopback, RFC1918 private ranges, link-local addresses, and equivalent IPv6 special-use ranges. Add `allowedHosts` and `allowedCidrs` so trusted deployments can permit only the specific internal targets they need while the broad default-deny behavior remains in place. 
+ +When `allowPrivateNetworks` is `false`, an empty `allowedHosts` and `allowedCidrs` configuration means no private or special-use network targets are allowed. Public internet targets remain allowed unless blocked by some other rule. When `allowPrivateNetworks` is `true`, all network targets become eligible for access, subject to the same redirect and DNS resolution checks. + +`allowedHosts` and `allowedCidrs` serve different purposes. `allowedHosts` grants a hostname-bound exception for requests explicitly targeting that host, even when it resolves to private or special-use addresses. `allowedCidrs` grants an address-bound exception for direct IP targets or any hostname whose resolved connection address falls within an allowed subnet. Redirects and secondary requests are evaluated independently against the same rules, so a request allowed via `allowedHosts` does not automatically authorize a different hostname or a direct IP target unless that new target is also allowed. + +Example default-secure network configuration: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: false + allowedHosts: [] + allowedCidrs: [] +``` + +Example selective internal allowlist: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: false + allowedHosts: + - docs.internal.example + allowedCidrs: + - 10.42.0.0/16 +``` + +Alternative considered: permissive defaults with warnings only. Rejected because the objective of this change is meaningful hardening for exposed deployments, and warnings do not reduce actual risk. + +Alternative considered: separate booleans such as `allowLoopback` and `allowLinkLocal`. Rejected because they create overlapping policy states and make it harder to explain how exceptions should be configured. + +### Add file access modes with allowlisted roots and secure traversal defaults + +Model local file access as `disabled`, `allowedRoots`, or `unrestricted`. 
In `allowedRoots` mode, expand configured tokens such as `$DOCUMENTS`, canonicalize paths, and require the effective target to remain inside an allowed root. Set `followSymlinks` and `includeHidden` to `false` by default. Hidden paths will be blocked even when explicitly named unless the user opts in. + +When `fileAccess.mode` is `allowedRoots`, an empty `allowedRoots` list means all user-requested `file://` access is denied. This does not block internally managed temporary archive handoff for accepted web archive roots. + +If `$DOCUMENTS` cannot be resolved on the current platform or runtime account, it will expand to no path and therefore grant no access by itself. Deployments that require local file access in containers or service accounts must configure explicit absolute paths. + +Example default file policy: + +```yaml +scraper: + security: + fileAccess: + mode: allowedRoots + allowedRoots: + - $DOCUMENTS + followSymlinks: false + includeHidden: false +``` + +Example fully disabled local file access: + +```yaml +scraper: + security: + fileAccess: + mode: disabled + allowedRoots: [] + followSymlinks: false + includeHidden: false +``` + +Alternative considered: denylisting sensitive directories under `$HOME`. Rejected because denylisting is incomplete and easier to bypass than root allowlisting. + +### Preserve internal temporary archive handoff + +Treat temporary files created by the application itself for supported web archive scraping as internal implementation artifacts rather than user-requested `file://` access. Policy enforcement will still apply to the original network URL, but once a web archive root has been accepted and downloaded, the handoff into local archive processing must not be blocked merely because the temp file lives outside configured user roots such as `$DOCUMENTS`. This exception is limited to the downloaded archive artifact produced for that accepted request and virtual members read from that archive during the same processing flow. 
+ +Alternative considered: requiring the temp directory to be added to default allowed roots. Rejected because it couples user-facing file policy to OS temp locations and would either over-broaden local access or break existing archive behavior. + +### Use `$DOCUMENTS` as a convenience token, not a policy special case + +Support `$DOCUMENTS` expansion during config resolution or policy evaluation, but enforce access against concrete absolute paths after expansion. This keeps the user experience simple without coupling enforcement logic to platform-specific home-directory conventions. + +Alternative considered: using `$HOME` as a default root. Rejected because it is too broad and includes common secret locations such as `.ssh`, `.aws`, and dot-config directories. + +## Risks / Trade-offs + +- [Behavior break for existing internal-doc users] -> Provide explicit `allowedHosts` and `allowedCidrs` overrides and document migration clearly. +- [Regression in supported web archive scraping] -> Exempt internally managed temp archive handoff from user file-root checks while preserving network policy on the original URL. +- [DNS resolution and redirect validation complexity] -> Keep the first implementation limited to HTTP(S) plus redirect revalidation and well-defined CIDR checks, covered by focused tests. +- [Browser subrequest enforcement complexity] -> Intercept all Playwright requests centrally and apply the same resolver/policy helper used by non-browser fetchers. +- [Platform-specific `$DOCUMENTS` resolution differences] -> Fall back to explicit paths when the token cannot be resolved reliably. +- [Confusion around hidden path semantics] -> Treat `includeHidden: false` as a hard deny for both direct fetch and traversal, and document this explicitly. +- [Symlink handling may surprise users with linked doc folders] -> Keep an opt-in `followSymlinks` flag and validate the resolved target stays within an allowed root. + +## Migration Plan + +1. 
Add the new config schema and defaults. +2. Implement shared access-policy utilities and wire them into fetchers and local file traversal. +3. Preserve archive workflows by validating virtual archive paths against the concrete archive file and by exempting internal temp archive handoff from user file-root checks. +4. Update tests to cover blocked and allowed cases for top-level and browser-initiated network requests, hidden paths, symlinks, and archive workflows. +5. Update user-facing documentation to explain defaults, array/env encoding, token expansion, and override patterns. +6. Release with changelog notes highlighting that private-network access is now blocked by default and file access is restricted by policy. + +Rollback consists of removing the policy enforcement layer and reverting to the previous permissive defaults, but the preferred operational mitigation is to loosen the new config values rather than removing the feature. + +## Open Questions + +- Do we want CLI flags for temporary overrides in addition to config and environment variables, or is config-only sufficient for the first iteration? diff --git a/openspec/changes/add-fetch-access-controls/proposal.md b/openspec/changes/add-fetch-access-controls/proposal.md new file mode 100644 index 00000000..8c0b7f5a --- /dev/null +++ b/openspec/changes/add-fetch-access-controls/proposal.md @@ -0,0 +1,26 @@ +## Why + +The product intentionally fetches arbitrary web and local content, but today it does so without configurable network or file access restrictions. That is acceptable for trusted local workflows, yet it leaves shared or internet-exposed deployments without a first-class way to reduce SSRF-style risk, sensitive local file access, or accidental access to hidden content. + +## What Changes + +- Add configurable outbound access controls for HTTP(S) fetches used by `fetch_url` and scraping workflows. 
+- Default-deny access to private, loopback, link-local, and other special-use network targets unless explicitly allowed via host or CIDR allowlists. +- Add configurable local file access modes, with allowed root directories, `$DOCUMENTS` token expansion, and secure defaults for symlinks and hidden paths. +- Preserve supported archive workflows by distinguishing user-requested local file access from internally managed temporary files and archive member resolution. +- Enforce the same access policy across CLI, MCP, and web-triggered fetch/scrape operations. +- Return clear validation errors when a URL or file path is blocked by policy, including guidance for enabling access. + +## Capabilities + +### New Capabilities +- `outbound-access-control`: Define and enforce configurable network and local file access restrictions for one-shot fetches and scraping workflows. + +### Modified Capabilities +- `configuration`: Add scraper security configuration for network restrictions, file access modes, allowed roots, symlink handling, and hidden path handling. + +## Impact + +- Affected code: scraper config schema and defaults, URL/file fetcher entry points, local file traversal, archive handoff paths, fetch and scrape tools, and related tests. +- Affected interfaces: CLI, MCP, and web-backed scraping behavior will share the same enforcement rules. +- Documentation impact: configuration and security docs will need updates to explain the new defaults and opt-in overrides. 
diff --git a/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md b/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md new file mode 100644 index 00000000..35f779dd --- /dev/null +++ b/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md @@ -0,0 +1,115 @@ +## MODIFIED Requirements + +### Requirement: Generic Environment Variable Overrides +The system SHALL support overriding any configuration setting via environment variables using a predictable naming convention. Environment-derived configuration values SHALL be normalized by trimming surrounding whitespace and stripping matching surrounding single or double quotes before schema parsing. + +#### Scenario: Environment Variable Naming Convention +- **GIVEN** a config path `scraper.document.maxSize` +- **WHEN** the environment variable `DOCS_MCP_SCRAPER_DOCUMENT_MAX_SIZE` is set +- **THEN** its value SHALL override the config file and default values + +#### Scenario: Quoted configuration override from Docker Compose +- **GIVEN** the environment variable `DOCS_MCP_EMBEDDING_MODEL` is provided as `"openai:nomic-embed-text"` +- **WHEN** the configuration is loaded +- **THEN** the resulting `app.embeddingModel` value SHALL be `openai:nomic-embed-text` + +#### Scenario: Whitespace-padded configuration override +- **GIVEN** the environment variable `DOCS_MCP_SCRAPER_MAX_PAGES` is provided as ` "500" ` +- **WHEN** the configuration is loaded +- **THEN** the resulting `scraper.maxPages` value SHALL be parsed as `500` + +#### Scenario: CamelCase to Upper Snake Case Conversion +- **GIVEN** a config path segment `maxNestingDepth` +- **WHEN** converted to environment variable format +- **THEN** it SHALL become `MAX_NESTING_DEPTH` + +#### Scenario: Deeply Nested Path Conversion +- **GIVEN** a config path `splitter.json.maxNestingDepth` +- **WHEN** converted to environment variable name +- **THEN** it SHALL become `DOCS_MCP_SPLITTER_JSON_MAX_NESTING_DEPTH` + +#### Scenario: 
Nested security override naming
+- **GIVEN** a config path `scraper.security.fileAccess.followSymlinks`
+- **WHEN** converted to environment variable name
+- **THEN** it SHALL become `DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_FOLLOW_SYMLINKS`
+
+#### Scenario: Array override provided as JSON
+- **GIVEN** the environment variable `DOCS_MCP_SCRAPER_SECURITY_NETWORK_ALLOWED_HOSTS` is provided as `["docs.internal.example","wiki.corp.local"]`
+- **WHEN** the configuration is loaded
+- **THEN** the resulting `scraper.security.network.allowedHosts` value SHALL be parsed as a string array
+
+#### Scenario: Array override provided as inline array string
+- **GIVEN** the environment variable `DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_ALLOWED_ROOTS` is provided as the unquoted inline value `[$DOCUMENTS, /srv/docs]`
+- **WHEN** the configuration is loaded
+- **THEN** the resulting `scraper.security.fileAccess.allowedRoots` value SHALL be parsed as a string array
+
+### Requirement: Scraper Security Configuration
+The system SHALL expose configuration for outbound network and local file access controls under `scraper.security`.
+ +#### Scenario: Private network access defaults to disabled +- **GIVEN** no custom security configuration is provided +- **WHEN** the configuration is loaded +- **THEN** private network access for scraper-driven HTTP(S) fetches SHALL default to disabled + +#### Scenario: Host allowlist can permit a specific internal hostname +- **GIVEN** `scraper.security.network.allowPrivateNetworks` is `false` +- **AND** `scraper.security.network.allowedHosts` contains `docs.internal.example` +- **WHEN** the configuration is loaded +- **THEN** requests explicitly targeting `docs.internal.example` SHALL be eligible for access despite the default private-network restriction +- **AND** direct IP requests SHALL still require `allowedCidrs` or `allowPrivateNetworks` + +#### Scenario: CIDR allowlist can permit a specific internal subnet +- **GIVEN** `scraper.security.network.allowPrivateNetworks` is `false` +- **AND** `scraper.security.network.allowedCidrs` contains `10.42.0.0/16` +- **WHEN** the configuration is loaded +- **THEN** requests whose resolved address falls within `10.42.0.0/16` SHALL be eligible for access despite the default private-network restriction + +#### Scenario: Empty network allowlists do not permit private targets +- **GIVEN** `scraper.security.network.allowPrivateNetworks` is `false` +- **AND** `scraper.security.network.allowedHosts` is empty +- **AND** `scraper.security.network.allowedCidrs` is empty +- **WHEN** the configuration is loaded +- **THEN** no private or special-use network targets SHALL be permitted by configuration + +#### Scenario: Private network override enables broad internal access +- **GIVEN** `scraper.security.network.allowPrivateNetworks` is `true` +- **WHEN** the configuration is loaded +- **THEN** private, loopback, link-local, and other special-use network targets SHALL be eligible for access + +#### Scenario: File access mode defaults to allowed roots +- **GIVEN** no custom security configuration is provided +- **WHEN** the 
configuration is loaded +- **THEN** local file access mode SHALL default to `allowedRoots` + +#### Scenario: Documents root is configured by default +- **GIVEN** no custom security configuration is provided +- **WHEN** the configuration is loaded +- **THEN** the allowed file roots SHALL include `$DOCUMENTS` + +#### Scenario: Internal archive handoff does not require temp root configuration +- **GIVEN** no custom security configuration is provided +- **WHEN** the system processes a supported web archive root via an internal temporary file +- **THEN** the default file access configuration SHALL not require the OS temporary directory to be added to `allowedRoots` + +#### Scenario: Empty file root allowlist denies user-requested file access +- **GIVEN** `scraper.security.fileAccess.mode` is `allowedRoots` +- **AND** `scraper.security.fileAccess.allowedRoots` is empty +- **WHEN** the configuration is loaded +- **THEN** user-requested `file://` access SHALL not be permitted by configuration + +#### Scenario: Unresolvable documents token grants no access +- **GIVEN** `scraper.security.fileAccess.allowedRoots` contains `$DOCUMENTS` +- **AND** the runtime cannot resolve a documents directory for the current platform or account +- **WHEN** the configuration is loaded and file access policy is evaluated +- **THEN** `$DOCUMENTS` SHALL not expand to an implicit fallback path +- **AND** no access SHALL be granted from that token alone + +#### Scenario: Hidden path access defaults to disabled +- **GIVEN** no custom security configuration is provided +- **WHEN** the configuration is loaded +- **THEN** hidden file and directory access SHALL default to disabled + +#### Scenario: Symlink following defaults to disabled +- **GIVEN** no custom security configuration is provided +- **WHEN** the configuration is loaded +- **THEN** symlink following for local file access SHALL default to disabled diff --git a/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md 
b/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md new file mode 100644 index 00000000..e510cdb2 --- /dev/null +++ b/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md @@ -0,0 +1,165 @@ +## ADDED Requirements + +### Requirement: Outbound Network Targets Must Respect Access Policy +The system SHALL enforce a shared outbound network access policy for one-shot URL fetches and scrape workflows before establishing HTTP(S) connections. Redirect-target validation and DNS-resolved IP validation SHALL always be applied during HTTP(S) access. + +#### Scenario: Block loopback target by default +- **WHEN** a user attempts to fetch `http://127.0.0.1:9999/test` +- **THEN** the system SHALL reject the request before performing the fetch +- **AND** the error SHALL state that the target is blocked by security policy + +#### Scenario: Block private network target by default +- **WHEN** a user attempts to fetch a URL whose resolved address is within a private network range +- **THEN** the system SHALL reject the request before performing the fetch +- **AND** the error SHALL identify that private network access is disabled + +#### Scenario: Block loopback and special-use targets by default +- **WHEN** a user attempts to fetch a URL whose resolved address is loopback, link-local, or another blocked special-use address +- **THEN** the system SHALL reject the request before performing the fetch +- **AND** the error SHALL identify that the target is blocked by network security policy + +#### Scenario: Allow configured internal target +- **GIVEN** the configuration explicitly allows the target hostname or CIDR range +- **WHEN** a user fetches an internal documentation URL within that allowlist +- **THEN** the system SHALL permit the request + +#### Scenario: Allowlist exception while private networks remain disabled +- **GIVEN** `allowPrivateNetworks` is `false` +- **AND** the target matches `allowedHosts` or `allowedCidrs` +- 
**WHEN** a user fetches that internal URL +- **THEN** the system SHALL permit the request +- **AND** other private or special-use targets SHALL remain blocked unless they also match the allowlist + +#### Scenario: Empty network allowlists deny all private targets +- **GIVEN** `allowPrivateNetworks` is `false` +- **AND** `allowedHosts` and `allowedCidrs` are both empty +- **WHEN** a user fetches a private or special-use network target +- **THEN** the system SHALL reject the request +- **AND** public internet targets SHALL remain eligible for access + +#### Scenario: Host allowlist grants a hostname-bound exception +- **GIVEN** `allowPrivateNetworks` is `false` +- **AND** `allowedHosts` contains `docs.internal.example` +- **WHEN** a user fetches `https://docs.internal.example/guide` +- **THEN** the system SHALL permit the request even if the host resolves to a private or special-use address + +#### Scenario: Host allowlist does not permit direct IP access +- **GIVEN** `allowPrivateNetworks` is `false` +- **AND** `allowedHosts` contains `docs.internal.example` +- **WHEN** a user fetches the direct IP address of that service +- **THEN** the system SHALL reject the request unless the resolved target is also permitted by `allowedCidrs` or `allowPrivateNetworks` + +#### Scenario: Host allowlist does not permit redirected hostname by default +- **GIVEN** `allowPrivateNetworks` is `false` +- **AND** `allowedHosts` contains `docs.internal.example` +- **WHEN** the request redirects to `wiki.internal.example` +- **THEN** the redirected request SHALL be rejected unless the new target also matches `allowedHosts`, `allowedCidrs`, or `allowPrivateNetworks` + +### Requirement: Browser-Initiated Requests Must Respect Network Policy +The system SHALL apply the same outbound network access policy to every HTTP(S) request initiated by browser-based fetching during rendering or page evaluation. 
+ +#### Scenario: Block iframe request to private target during browser rendering +- **GIVEN** browser-based fetching is used for a public page +- **WHEN** the page attempts to load an iframe from a blocked private or special-use target +- **THEN** the system SHALL block that request +- **AND** the private target SHALL not receive the request + +#### Scenario: Block subresource request to private target during browser rendering +- **GIVEN** browser-based fetching is used for a public page +- **WHEN** the page attempts to load a script, stylesheet, image, or fetch/XHR request from a blocked private or special-use target +- **THEN** the system SHALL block that request +- **AND** the private target SHALL not receive the request + +#### Scenario: Apply allowlist exception to browser subrequest +- **GIVEN** browser-based fetching is used +- **AND** `allowPrivateNetworks` is `false` +- **AND** the browser subrequest target matches `allowedHosts` or `allowedCidrs` +- **WHEN** the page requests that internal resource +- **THEN** the system SHALL permit the subrequest + +#### Scenario: Browser subrequest host allowlist remains hostname-bound +- **GIVEN** browser-based fetching is used +- **AND** `allowPrivateNetworks` is `false` +- **AND** `allowedHosts` contains `docs.internal.example` +- **WHEN** the page requests a subresource from the direct IP address of that service +- **THEN** the system SHALL reject the subrequest unless the address is also permitted by `allowedCidrs` or `allowPrivateNetworks` + +#### Scenario: Block redirect to restricted network target +- **WHEN** a public URL redirects to a target blocked by the outbound access policy +- **THEN** the system SHALL reject the redirect target +- **AND** the final request SHALL not be sent to the blocked target + +### Requirement: Local File Access Must Respect Configured File Policy +The system SHALL enforce one shared local file access policy for direct `file://` fetches and local scraping workflows. 
+ +#### Scenario: Reject file access when disabled +- **GIVEN** file access mode is `disabled` +- **WHEN** a user attempts to fetch or scrape a `file://` URL +- **THEN** the system SHALL reject the request before reading the file system + +#### Scenario: Allow file access within configured root +- **GIVEN** file access mode is `allowedRoots` +- **AND** the target file resolves within a configured allowed root +- **WHEN** a user fetches or scrapes that `file://` URL +- **THEN** the system SHALL permit the request + +#### Scenario: Reject file access outside configured root +- **GIVEN** file access mode is `allowedRoots` +- **AND** the target file resolves outside all configured allowed roots +- **WHEN** a user fetches or scrapes that `file://` URL +- **THEN** the system SHALL reject the request before reading the file contents + +#### Scenario: Validate archive member against backing archive path +- **GIVEN** file access mode is `allowedRoots` +- **AND** a user requests a virtual archive member path such as `file:///allowed/docs.zip/guide/intro.md` +- **WHEN** the access policy is evaluated +- **THEN** the system SHALL resolve containment against the concrete archive file path `file:///allowed/docs.zip` +- **AND** it SHALL not reject the request solely because the virtual member path does not exist on the physical file system + +#### Scenario: Expand documents token for allowed root +- **GIVEN** file access mode is `allowedRoots` +- **AND** an allowed root is configured as `$DOCUMENTS` +- **WHEN** the access policy is evaluated +- **THEN** the system SHALL resolve `$DOCUMENTS` to the platform-specific documents directory before checking path containment + +#### Scenario: Empty file root allowlist denies user-requested file access +- **GIVEN** file access mode is `allowedRoots` +- **AND** `allowedRoots` is empty +- **WHEN** a user attempts to fetch or scrape a `file://` URL +- **THEN** the system SHALL reject the request before reading the file system + +#### 
Scenario: Permit internally managed temp archive handoff +- **GIVEN** a supported web archive root URL has passed outbound network policy checks +- **WHEN** the system downloads the archive to an internal temporary file for processing +- **THEN** the handoff into local archive processing SHALL be permitted even if the temp file is outside configured user file roots +- **AND** this exception SHALL apply only to application-managed temporary files created for that accepted archive workflow + +#### Scenario: Temp archive exception does not permit unrelated temp files +- **GIVEN** file access mode is `allowedRoots` +- **AND** a temporary file was not created as the accepted archive artifact for the current web archive workflow +- **WHEN** local file policy is evaluated for that temp file +- **THEN** the normal file access policy SHALL apply + +### Requirement: Hidden Paths and Symlinks Must Be Explicitly Opted In +The system SHALL treat hidden paths and symlink traversal as restricted by default for local file access. 
+ +#### Scenario: Block hidden file by default +- **GIVEN** hidden path access is disabled +- **WHEN** a user attempts to fetch or scrape a hidden file or a path inside a hidden directory +- **THEN** the system SHALL reject or skip the path according to the calling workflow + +#### Scenario: Skip hidden entries during directory traversal by default +- **GIVEN** hidden path access is disabled +- **WHEN** the system enumerates entries inside an allowed local directory +- **THEN** hidden files and hidden directories SHALL not be added to the crawl queue + +#### Scenario: Block symlink traversal by default +- **GIVEN** symlink following is disabled +- **WHEN** a requested file or enumerated directory entry is a symlink +- **THEN** the system SHALL reject or skip the symlinked path according to the calling workflow + +#### Scenario: Allow symlink when enabled and contained +- **GIVEN** symlink following is enabled +- **AND** the symlink target resolves within a configured allowed root +- **WHEN** a user fetches or scrapes the symlinked path +- **THEN** the system SHALL permit access to the resolved target diff --git a/openspec/changes/add-fetch-access-controls/tasks.md b/openspec/changes/add-fetch-access-controls/tasks.md new file mode 100644 index 00000000..3d33e0ae --- /dev/null +++ b/openspec/changes/add-fetch-access-controls/tasks.md @@ -0,0 +1,42 @@ +## 1. Configuration Model + +- [ ] 1.1 Add `scraper.security` to the typed config schema and defaults, including network restrictions and file access settings. +- [ ] 1.2 Implement config parsing for `network.allowedHosts`, `network.allowedCidrs`, `fileAccess.mode`, `allowedRoots`, `followSymlinks`, and `includeHidden`, including JSON/YAML-style array env values and `$DOCUMENTS` token support. +- [ ] 1.3 Extend configuration tests to cover default values, nested environment variable overrides, array env parsing, and unresolvable `$DOCUMENTS` behavior. + +## 2. 
Shared Access Policy + +- [ ] 2.1 Add a shared URL and file access policy helper that evaluates HTTP(S) targets, CIDR/host allowlists, and file path containment. +- [ ] 2.2 Enforce the shared policy in `AutoDetectFetcher` and HTTP fetch flows, including mandatory redirect-target and DNS-resolved IP revalidation. +- [ ] 2.3 Enforce the shared policy in local file fetch and local scrape entry points before any file system reads occur, including virtual archive paths. +- [ ] 2.4 Preserve supported web archive root processing by allowing internally managed temp archive handoff after the original network URL has passed policy checks. +- [ ] 2.5 Define and implement hostname-bound `allowedHosts` exceptions and address-bound `allowedCidrs` exceptions, including redirect handling for new targets. + +## 3. Browser Enforcement + +- [ ] 3.1 Enforce the shared outbound policy on all Playwright-initiated requests, including frames, iframes, scripts, stylesheets, images, and fetch/XHR calls. +- [ ] 3.2 Ensure browser subrequests apply the same hostname-bound `allowedHosts` and address-bound `allowedCidrs` semantics as non-browser HTTP fetches. + +## 4. Local File Traversal Hardening + +- [ ] 4.1 Update local file traversal to block or skip hidden files and directories when `includeHidden` is disabled. +- [ ] 4.2 Update local file traversal to reject symlinks by default and allow them only when `followSymlinks` is enabled and the resolved target remains inside an allowed root. +- [ ] 4.3 Ensure direct `file://` fetches use the same hidden-path and symlink rules as directory crawling. +- [ ] 4.4 Validate archive-member `file://` paths against the concrete archive file path instead of the nonexistent combined virtual path. +- [ ] 4.5 Limit the internal temp-archive exception to the accepted downloaded archive artifact and its virtual members. + +## 5. Verification + +- [ ] 5.1 Add tests for blocked loopback, private-network, and redirect-to-private HTTP(S) requests. 
+- [ ] 5.2 Add tests for allowed internal targets when hosts or CIDRs are explicitly configured while `allowPrivateNetworks` remains disabled. +- [ ] 5.3 Add tests for hostname-bound host allowlists, CIDR-based IP allowlists, and redirects or direct-IP requests that must remain blocked unless separately allowed. +- [ ] 5.4 Add tests for browser-based blocked subrequests and allowed browser subrequests under explicit allowlists. +- [ ] 5.5 Add tests for allowed-root file access, blocked outside-root access, hidden path blocking, and symlink containment. +- [ ] 5.6 Add tests for supported web archive root scraping via temp files, virtual archive-member paths within allowed roots, and unrelated temp-file rejection. + +## 6. Documentation + +- [ ] 6.1 Update user-facing configuration documentation to describe the new security defaults and override mechanisms. +- [ ] 6.2 Update security/authentication documentation to explain the trust model, private-network defaults, and local file restrictions. +- [ ] 6.3 Add a dedicated `docs/infrastructure/security.md` document covering trust boundaries, deployment hardening, outbound access controls, and local file access policy. +- [ ] 6.4 Document sample security configurations, environment-variable array encoding, empty network/file allowlist behavior, and counterintuitive semantics such as broad access from `allowPrivateNetworks: true` and unresolved `$DOCUMENTS` tokens. 
From 5d7f7bdee9ae5d5ddbd7ed5bb54a6ce3fa14c33d Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sun, 29 Mar 2026 09:06:57 -0700 Subject: [PATCH 2/3] docs(openspec): add tls policy to fetch controls --- .../add-fetch-access-controls/design.md | 23 +++++++++++ .../add-fetch-access-controls/proposal.md | 1 + .../specs/configuration/spec.md | 28 +++++++++++++ .../specs/outbound-access-control/spec.md | 39 +++++++++++++++++++ .../add-fetch-access-controls/tasks.md | 19 +++++---- 5 files changed, 103 insertions(+), 7 deletions(-) diff --git a/openspec/changes/add-fetch-access-controls/design.md b/openspec/changes/add-fetch-access-controls/design.md index ae21b669..d159be38 100644 --- a/openspec/changes/add-fetch-access-controls/design.md +++ b/openspec/changes/add-fetch-access-controls/design.md @@ -73,6 +73,28 @@ Alternative considered: permissive defaults with warnings only. Rejected because Alternative considered: separate booleans such as `allowLoopback` and `allowLinkLocal`. Rejected because they create overlapping policy states and make it harder to explain how exceptions should be configured. +### Keep TLS verification secure by default with host-scoped exceptions + +TLS certificate verification remains enabled by default for all HTTPS requests. Add `allowInvalidTls` for broad override behavior and `allowedInvalidTlsHosts` for hostname-bound exceptions when operators need to reach trusted internal services with self-signed or otherwise invalid certificates. + +TLS exceptions are evaluated only after the target has already been permitted by the network access policy. `allowedInvalidTlsHosts` only affects certificate validation for the explicitly requested hostname. It does not expand network reachability, does not permit direct IP access unless `allowInvalidTls` is enabled globally, and does not automatically trust redirected or secondary-request hosts unless they are also listed. 
+ +Example host-scoped invalid TLS exception: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: false + allowedHosts: + - docs.internal.example + allowInvalidTls: false + allowedInvalidTlsHosts: + - docs.internal.example +``` + +Alternative considered: placing TLS settings under `scraper.fetcher`. Rejected because certificate verification is a trust-boundary policy that must apply consistently across HTTP and browser-based fetch paths. + ### Add file access modes with allowlisted roots and secure traversal defaults Model local file access as `disabled`, `allowedRoots`, or `unrestricted`. In `allowedRoots` mode, expand configured tokens such as `$DOCUMENTS`, canonicalize paths, and require the effective target to remain inside an allowed root. Set `followSymlinks` and `includeHidden` to `false` by default. Hidden paths will be blocked even when explicitly named unless the user opts in. @@ -126,6 +148,7 @@ Alternative considered: using `$HOME` as a default root. Rejected because it is - [Regression in supported web archive scraping] -> Exempt internally managed temp archive handoff from user file-root checks while preserving network policy on the original URL. - [DNS resolution and redirect validation complexity] -> Keep the first implementation limited to HTTP(S) plus redirect revalidation and well-defined CIDR checks, covered by focused tests. - [Browser subrequest enforcement complexity] -> Intercept all Playwright requests centrally and apply the same resolver/policy helper used by non-browser fetchers. +- [Invalid TLS exceptions broaden trust for named hosts] -> Keep TLS verification enabled by default and require explicit hostname-based opt-in for self-signed or invalid certificates. - [Platform-specific `$DOCUMENTS` resolution differences] -> Fall back to explicit paths when the token cannot be resolved reliably. 
- [Confusion around hidden path semantics] -> Treat `includeHidden: false` as a hard deny for both direct fetch and traversal, and document this explicitly. - [Symlink handling may surprise users with linked doc folders] -> Keep an opt-in `followSymlinks` flag and validate the resolved target stays within an allowed root. diff --git a/openspec/changes/add-fetch-access-controls/proposal.md b/openspec/changes/add-fetch-access-controls/proposal.md index 8c0b7f5a..b44ff1fd 100644 --- a/openspec/changes/add-fetch-access-controls/proposal.md +++ b/openspec/changes/add-fetch-access-controls/proposal.md @@ -6,6 +6,7 @@ The product intentionally fetches arbitrary web and local content, but today it - Add configurable outbound access controls for HTTP(S) fetches used by `fetch_url` and scraping workflows. - Default-deny access to private, loopback, link-local, and other special-use network targets unless explicitly allowed via host or CIDR allowlists. +- Add configurable TLS verification controls for trusted internal hosts with invalid or self-signed certificates. - Add configurable local file access modes, with allowed root directories, `$DOCUMENTS` token expansion, and secure defaults for symlinks and hidden paths. - Preserve supported archive workflows by distinguishing user-requested local file access from internally managed temporary files and archive member resolution. - Enforce the same access policy across CLI, MCP, and web-triggered fetch/scrape operations. 
diff --git a/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md b/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md index 35f779dd..bf03b26c 100644 --- a/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md +++ b/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md @@ -38,6 +38,11 @@ The system SHALL support overriding any configuration setting via environment va - **WHEN** the configuration is loaded - **THEN** the resulting `scraper.security.network.allowedHosts` value SHALL be parsed as a string array +#### Scenario: TLS host array override provided as JSON +- **GIVEN** the environment variable `DOCS_MCP_SCRAPER_SECURITY_NETWORK_ALLOWED_INVALID_TLS_HOSTS` is provided as `["docs.internal.example"]` +- **WHEN** the configuration is loaded +- **THEN** the resulting `scraper.security.network.allowedInvalidTlsHosts` value SHALL be parsed as a string array + #### Scenario: Array override provided as inline array string - **GIVEN** the environment variable `DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_ALLOWED_ROOTS` is provided as `["$DOCUMENTS", "/srv/docs"]` - **WHEN** the configuration is loaded @@ -76,6 +81,29 @@ The system SHALL expose configuration for outbound network and local file access - **WHEN** the configuration is loaded - **THEN** private, loopback, link-local, and other special-use network targets SHALL be eligible for access +#### Scenario: Invalid TLS verification defaults to disabled override +- **GIVEN** no custom security configuration is provided +- **WHEN** the configuration is loaded +- **THEN** `scraper.security.network.allowInvalidTls` SHALL default to `false` + +#### Scenario: Invalid TLS host allowlist defaults to empty +- **GIVEN** no custom security configuration is provided +- **WHEN** the configuration is loaded +- **THEN** `scraper.security.network.allowedInvalidTlsHosts` SHALL default to an empty list + +#### Scenario: Invalid TLS host allowlist permits named HTTPS 
target +- **GIVEN** `scraper.security.network.allowInvalidTls` is `false` +- **AND** `scraper.security.network.allowedInvalidTlsHosts` contains `docs.internal.example` +- **AND** the target is otherwise permitted by network policy +- **WHEN** the configuration is loaded +- **THEN** requests explicitly targeting `docs.internal.example` SHALL be eligible for TLS verification bypass +- **AND** direct IP requests SHALL still require `allowInvalidTls` + +#### Scenario: Broad invalid TLS override enables all HTTPS targets +- **GIVEN** `scraper.security.network.allowInvalidTls` is `true` +- **WHEN** the configuration is loaded +- **THEN** HTTPS requests SHALL be eligible to bypass invalid certificate errors + #### Scenario: File access mode defaults to allowed roots - **GIVEN** no custom security configuration is provided - **WHEN** the configuration is loaded diff --git a/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md b/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md index e510cdb2..d16b7052 100644 --- a/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md +++ b/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md @@ -55,6 +55,38 @@ The system SHALL enforce a shared outbound network access policy for one-shot UR - **WHEN** the request redirects to `wiki.internal.example` - **THEN** the redirected request SHALL be rejected unless the new target also matches `allowedHosts`, `allowedCidrs`, or `allowPrivateNetworks` +### Requirement: TLS Verification Must Respect Network Security Policy +The system SHALL verify TLS certificates for HTTPS requests by default across HTTP and browser-based fetch paths. TLS exception policy SHALL only affect certificate validation after the target has already been permitted by the outbound network access policy. 
+ +#### Scenario: Reject invalid TLS certificate by default +- **GIVEN** no TLS override is configured +- **WHEN** a user fetches an HTTPS URL whose certificate is invalid or self-signed +- **THEN** the system SHALL reject the request because TLS verification failed + +#### Scenario: Allow invalid TLS for configured hostname +- **GIVEN** `allowInvalidTls` is `false` +- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` +- **AND** the target is already permitted by the outbound network access policy +- **WHEN** a user fetches `https://docs.internal.example/guide` +- **THEN** the system SHALL permit the request to proceed without rejecting the invalid certificate + +#### Scenario: Host-scoped invalid TLS exception does not permit direct IP access +- **GIVEN** `allowInvalidTls` is `false` +- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` +- **WHEN** a user fetches the direct IP address of that service over HTTPS with an invalid certificate +- **THEN** the system SHALL reject the request unless `allowInvalidTls` is `true` + +#### Scenario: Host-scoped invalid TLS exception does not permit redirected hostname +- **GIVEN** `allowInvalidTls` is `false` +- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` +- **WHEN** the request redirects to `wiki.internal.example` with an invalid certificate +- **THEN** the redirected request SHALL be rejected unless the new hostname is also allowed by TLS policy + +#### Scenario: Broad invalid TLS override enables all HTTPS targets +- **GIVEN** `allowInvalidTls` is `true` +- **WHEN** a user fetches an HTTPS URL with an invalid certificate +- **THEN** the system SHALL permit the request to proceed without rejecting the certificate + ### Requirement: Browser-Initiated Requests Must Respect Network Policy The system SHALL apply the same outbound network access policy to every HTTP(S) request initiated by browser-based fetching during rendering or page evaluation. 
@@ -84,6 +116,13 @@ The system SHALL apply the same outbound network access policy to every HTTP(S) - **WHEN** the page requests a subresource from the direct IP address of that service - **THEN** the system SHALL reject the subrequest unless the address is also permitted by `allowedCidrs` or `allowPrivateNetworks` +#### Scenario: Browser subrequest invalid TLS exception remains hostname-bound +- **GIVEN** browser-based fetching is used +- **AND** `allowInvalidTls` is `false` +- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` +- **WHEN** the page requests an HTTPS subresource from the direct IP address of that service with an invalid certificate +- **THEN** the system SHALL reject the subrequest unless `allowInvalidTls` is `true` + #### Scenario: Block redirect to restricted network target - **WHEN** a public URL redirects to a target blocked by the outbound access policy - **THEN** the system SHALL reject the redirect target diff --git a/openspec/changes/add-fetch-access-controls/tasks.md b/openspec/changes/add-fetch-access-controls/tasks.md index 3d33e0ae..add1f223 100644 --- a/openspec/changes/add-fetch-access-controls/tasks.md +++ b/openspec/changes/add-fetch-access-controls/tasks.md @@ -1,8 +1,8 @@ ## 1. Configuration Model - [ ] 1.1 Add `scraper.security` to the typed config schema and defaults, including network restrictions and file access settings. -- [ ] 1.2 Implement config parsing for `network.allowedHosts`, `network.allowedCidrs`, `fileAccess.mode`, `allowedRoots`, `followSymlinks`, and `includeHidden`, including JSON/YAML-style array env values and `$DOCUMENTS` token support. -- [ ] 1.3 Extend configuration tests to cover default values, nested environment variable overrides, array env parsing, and unresolvable `$DOCUMENTS` behavior. 
+- [ ] 1.2 Implement config parsing for `network.allowedHosts`, `network.allowedCidrs`, `network.allowInvalidTls`, `network.allowedInvalidTlsHosts`, `fileAccess.mode`, `allowedRoots`, `followSymlinks`, and `includeHidden`, including JSON/YAML-style array env values and `$DOCUMENTS` token support. +- [ ] 1.3 Extend configuration tests to cover default values, nested environment variable overrides, array env parsing, invalid-TLS settings, and unresolvable `$DOCUMENTS` behavior. ## 2. Shared Access Policy @@ -11,11 +11,13 @@ - [ ] 2.3 Enforce the shared policy in local file fetch and local scrape entry points before any file system reads occur, including virtual archive paths. - [ ] 2.4 Preserve supported web archive root processing by allowing internally managed temp archive handoff after the original network URL has passed policy checks. - [ ] 2.5 Define and implement hostname-bound `allowedHosts` exceptions and address-bound `allowedCidrs` exceptions, including redirect handling for new targets. +- [ ] 2.6 Define and implement TLS verification policy with secure defaults, hostname-bound `allowedInvalidTlsHosts`, and broad `allowInvalidTls` override behavior. +- [ ] 2.7 Ensure TLS exceptions are evaluated only after network policy allows the target, and treat direct-IP invalid-TLS access as unsupported unless `allowInvalidTls` is enabled. ## 3. Browser Enforcement - [ ] 3.1 Enforce the shared outbound policy on all Playwright-initiated requests, including frames, iframes, scripts, stylesheets, images, and fetch/XHR calls. -- [ ] 3.2 Ensure browser subrequests apply the same hostname-bound `allowedHosts` and address-bound `allowedCidrs` semantics as non-browser HTTP fetches. +- [ ] 3.2 Ensure browser subrequests apply the same hostname-bound `allowedHosts`, address-bound `allowedCidrs`, and invalid-TLS semantics as non-browser HTTP fetches. ## 4. 
Local File Traversal Hardening @@ -30,13 +32,16 @@ - [ ] 5.1 Add tests for blocked loopback, private-network, and redirect-to-private HTTP(S) requests. - [ ] 5.2 Add tests for allowed internal targets when hosts or CIDRs are explicitly configured while `allowPrivateNetworks` remains disabled. - [ ] 5.3 Add tests for hostname-bound host allowlists, CIDR-based IP allowlists, and redirects or direct-IP requests that must remain blocked unless separately allowed. -- [ ] 5.4 Add tests for browser-based blocked subrequests and allowed browser subrequests under explicit allowlists. -- [ ] 5.5 Add tests for allowed-root file access, blocked outside-root access, hidden path blocking, and symlink containment. -- [ ] 5.6 Add tests for supported web archive root scraping via temp files, virtual archive-member paths within allowed roots, and unrelated temp-file rejection. +- [ ] 5.4 Add tests for invalid TLS rejection by default, hostname-bound invalid-TLS exceptions, redirected-host TLS rejection, and broad invalid-TLS override behavior. +- [ ] 5.5 Add tests proving invalid-TLS host exceptions do not bypass network allowlists and that direct-IP invalid-TLS access remains blocked unless `allowInvalidTls` is enabled. +- [ ] 5.6 Add tests for browser-based blocked subrequests and allowed browser subrequests under explicit allowlists, including invalid-TLS behavior. +- [ ] 5.7 Add tests for allowed-root file access, blocked outside-root access, hidden path blocking, and symlink containment. +- [ ] 5.8 Add tests for supported web archive root scraping via temp files, virtual archive-member paths within allowed roots, and unrelated temp-file rejection. ## 6. Documentation - [ ] 6.1 Update user-facing configuration documentation to describe the new security defaults and override mechanisms. - [ ] 6.2 Update security/authentication documentation to explain the trust model, private-network defaults, and local file restrictions. 
- [ ] 6.3 Add a dedicated `docs/infrastructure/security.md` document covering trust boundaries, deployment hardening, outbound access controls, and local file access policy. -- [ ] 6.4 Document sample security configurations, environment-variable array encoding, empty network/file allowlist behavior, and counterintuitive semantics such as broad access from `allowPrivateNetworks: true` and unresolved `$DOCUMENTS` tokens. +- [ ] 6.4 Document sample security configurations, environment-variable array encoding, empty network/file allowlist behavior, and counterintuitive semantics such as broad access from `allowPrivateNetworks: true`, broad TLS trust from `allowInvalidTls: true`, and unresolved `$DOCUMENTS` tokens. +- [ ] 6.5 Document that invalid-TLS exceptions do not bypass network allowlists and that direct-IP invalid-TLS access requires `allowInvalidTls: true`. From f995784ee431c577f62c06d17812b2ef0a29cb8d Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sun, 29 Mar 2026 19:50:50 -0700 Subject: [PATCH 3/3] feat(scraper): enforce fetch access controls Apply shared network and file access policy across HTTP, browser, and local scraping flows with secure defaults and explicit allowlists. Document the new security model and add focused config, policy, and browser coverage for the fetch access controls change. 
--- README.md | 1 + docs/infrastructure/authentication.md | 13 + docs/infrastructure/security.md | 163 +++++++++ docs/setup/configuration.md | 103 ++++++ .../add-fetch-access-controls/design.md | 16 +- .../add-fetch-access-controls/proposal.md | 2 +- .../specs/configuration/spec.md | 18 - .../specs/outbound-access-control/spec.md | 29 +- .../add-fetch-access-controls/tasks.md | 58 +-- src/scraper/fetcher/AutoDetectFetcher.ts | 3 +- src/scraper/fetcher/BrowserFetcher.test.ts | 92 +++++ src/scraper/fetcher/BrowserFetcher.ts | 82 +++-- src/scraper/fetcher/FileFetcher.ts | 23 +- src/scraper/fetcher/HttpFetcher.ts | 229 +++++++----- .../HtmlPlaywrightMiddleware.test.ts | 28 ++ .../middleware/HtmlPlaywrightMiddleware.ts | 37 +- .../strategies/LocalFileStrategy.test.ts | 15 +- src/scraper/strategies/LocalFileStrategy.ts | 62 +++- src/scraper/strategies/WebScraperStrategy.ts | 10 +- src/scraper/types.ts | 4 + src/utils/accessPolicy.test.ts | 196 ++++++++++ src/utils/accessPolicy.ts | 342 ++++++++++++++++++ src/utils/config.test.ts | 64 ++++ src/utils/config.ts | 66 ++++ src/utils/errors.ts | 14 +- 25 files changed, 1435 insertions(+), 235 deletions(-) create mode 100644 docs/infrastructure/security.md create mode 100644 src/scraper/fetcher/BrowserFetcher.test.ts create mode 100644 src/utils/accessPolicy.test.ts create mode 100644 src/utils/accessPolicy.ts diff --git a/README.md b/README.md index 730f951d..edbc87c5 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ See **[Embedding Models](docs/guides/embedding-models.md)** for configuring **Ol ### Key Concepts & Architecture - **[Deployment Modes](docs/infrastructure/deployment-modes.md)**: Standalone vs. Distributed (Docker Compose). - **[Authentication](docs/infrastructure/authentication.md)**: Securing your server with OAuth2/OIDC. +- **[Security](docs/infrastructure/security.md)**: Trust boundaries, deployment hardening, and outbound access controls. 
- **[Telemetry](docs/infrastructure/telemetry.md)**: Privacy-first usage data collection. - **[Architecture](ARCHITECTURE.md)**: Deep dive into the system design. diff --git a/docs/infrastructure/authentication.md b/docs/infrastructure/authentication.md index 74142f1b..bcd79de9 100644 --- a/docs/infrastructure/authentication.md +++ b/docs/infrastructure/authentication.md @@ -423,6 +423,19 @@ DEBUG=mcp:auth npx docs-mcp-server --auth-enabled --auth-issuer-url "..." - Implement proper CORS policies for web clients - Use secure OAuth2 flows (Authorization Code with PKCE) +### Outbound Access Controls + +Authentication controls who can use MCP and HTTP endpoints. It does not, by itself, restrict where scraper-driven fetches can connect after a request is authenticated. + +The scraper now applies outbound access controls by default: + +- Private, loopback, link-local, and other special-use network targets are blocked unless explicitly allowlisted or `scraper.security.network.allowPrivateNetworks` is enabled. +- Local `file://` access is constrained by `scraper.security.fileAccess` and defaults to `$DOCUMENTS` only. +- Hidden files and symlinks are blocked by default. +- `allowInvalidTls` only changes certificate validation after the target has already passed the network policy. It does not bypass host or CIDR restrictions. + +For deployment hardening guidance, see [Infrastructure Security](./security.md). + ### Access Control - All authenticated users receive full access to all endpoints diff --git a/docs/infrastructure/security.md b/docs/infrastructure/security.md new file mode 100644 index 00000000..f9c307f1 --- /dev/null +++ b/docs/infrastructure/security.md @@ -0,0 +1,163 @@ +# Infrastructure Security + +## Overview + +The Docs MCP Server intentionally fetches remote URLs and local files. 
That capability is useful for documentation indexing, but it also creates trust-boundary decisions for deployments that are shared, internet-exposed, or connected to sensitive internal networks. + +This document describes the security model for scraper-driven access and the deployment controls that matter most in practice. + +## Trust Boundaries + +The server has three independent security surfaces: + +1. Inbound access to MCP, web, and API endpoints. +2. Outbound network access performed by URL fetching and browser-based scraping. +3. Local file access performed by direct `file://` fetches and local crawling. + +OAuth2 protects inbound MCP and HTTP usage. Scraper security settings protect outbound access and local file reads. These concerns are related but separate. + +## Deployment Hardening + +Use the following defaults for any shared or remotely reachable deployment: + +- Enable authentication for exposed MCP and web endpoints. +- Keep the server behind TLS termination. +- Restrict worker and internal API networking at the infrastructure layer. +- Leave `scraper.security.network.allowPrivateNetworks` disabled unless you have an explicit internal use case. +- Keep `scraper.security.fileAccess.mode` at `allowedRoots` or `disabled` unless the host is fully trusted. +- Do not rely on broad local home-directory access for service accounts or containers. + +## Outbound Network Access Policy + +By default, scraper-driven HTTP and browser requests may reach public internet targets but may not reach private or special-use network targets. 
+ +Blocked by default: + +- Loopback addresses +- RFC1918 private IPv4 ranges +- Link-local ranges +- Other special-use IPv4 and IPv6 ranges covered by the shared policy + +Allowed by default: + +- Public internet HTTP and HTTPS targets + +Override options: + +- `allowedHosts`: hostname-bound exceptions for explicitly targeted internal hosts +- `allowedCidrs`: address-bound exceptions for resolved or direct IP targets +- `allowPrivateNetworks: true`: broad opt-in for private and special-use network access + +Important semantics: + +- `allowedHosts` does not allow direct IP access to the same service. +- `allowedCidrs` allows by resolved or direct address, not by hostname label alone. +- Redirect targets are revalidated independently. +- Browser subrequests use the same policy as non-browser HTTP fetches. + +## TLS Verification Policy + +HTTPS certificate validation stays enabled by default. + +`allowInvalidTls: true` is a broad override that allows invalid or self-signed certificates for HTTPS requests that are already permitted by the network policy. + +Important semantics: + +- It does not bypass `allowedHosts`, `allowedCidrs`, or private-network restrictions. +- It applies broadly, so treat it as an environment-level trust decision. +- If you need narrower trust, prefer proper certificates or a future custom-certificate workflow rather than enabling broad invalid TLS trust. + +## Local File Access Policy + +Local file access defaults to `allowedRoots` mode with `$DOCUMENTS` as the only configured root. 
+ +Modes: + +- `disabled`: all user-requested `file://` access is blocked +- `allowedRoots`: only configured roots are allowed +- `unrestricted`: local file access is fully trusted + +Traversal defaults: + +- Hidden files and hidden directories are blocked by default +- Symlinks are blocked by default +- Archive-member paths are validated against the real archive file, not the synthetic combined virtual path + +Important semantics: + +- `allowedRoots: []` in `allowedRoots` mode means no user-requested local file access is allowed. +- `$DOCUMENTS` is only a convenience token. If it cannot be resolved for the runtime account, it grants no access. +- Hidden paths remain blocked even when explicitly requested unless `includeHidden` is enabled. + +## Archive Workflows + +Supported web archive scraping downloads an accepted remote archive to a temporary file and then processes it through the local file path. + +That handoff remains allowed without requiring the temp directory to be added to user-configured roots. The exception is intentionally narrow: + +- It applies only after the original network URL passes the network policy. +- It applies only to the downloaded archive artifact and its virtual members. +- Unrelated temp files remain subject to the normal local file policy. 
+ +## Example Configurations + +Conservative shared deployment: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: false + allowedHosts: [] + allowedCidrs: [] + allowInvalidTls: false + fileAccess: + mode: allowedRoots + allowedRoots: + - $DOCUMENTS + followSymlinks: false + includeHidden: false +``` + +Selective internal docs deployment: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: false + allowedHosts: + - docs.internal.example + allowedCidrs: + - 10.42.0.0/16 + allowInvalidTls: true + fileAccess: + mode: allowedRoots + allowedRoots: + - /srv/docs + followSymlinks: false + includeHidden: false +``` + +Fully trusted local workstation: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: true + allowInvalidTls: false + fileAccess: + mode: unrestricted + allowedRoots: [] + followSymlinks: true + includeHidden: true +``` + +## Operational Guidance + +- Start with the defaults and add the smallest explicit exception that satisfies the use case. +- Prefer `allowedHosts` and `allowedCidrs` over `allowPrivateNetworks: true`. +- Prefer valid certificates over `allowInvalidTls: true`. +- Prefer narrow `allowedRoots` over `unrestricted` file access. +- Review these settings when moving from local development to shared infrastructure. 
diff --git a/docs/setup/configuration.md b/docs/setup/configuration.md index d8ea7313..ad7081c4 100644 --- a/docs/setup/configuration.md +++ b/docs/setup/configuration.md @@ -23,6 +23,18 @@ scraper: maxDepth: 3 document: maxSize: 10485760 # 10MB + security: + network: + allowPrivateNetworks: false + allowedHosts: [] + allowedCidrs: [] + allowInvalidTls: false + fileAccess: + mode: allowedRoots + allowedRoots: + - $DOCUMENTS + followSymlinks: false + includeHidden: false splitter: preferredChunkSize: 1500 @@ -74,6 +86,17 @@ export DOCS_MCP_SPLITTER_PREFERRED_CHUNK_SIZE=2000 # Override app settings export DOCS_MCP_APP_TELEMETRY_ENABLED=false + +# Override scraper security settings +export DOCS_MCP_SCRAPER_SECURITY_NETWORK_ALLOW_INVALID_TLS=true +export DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_FOLLOW_SYMLINKS=true +``` + +Array-valued settings accept JSON or YAML-style inline arrays: + +```bash +export DOCS_MCP_SCRAPER_SECURITY_NETWORK_ALLOWED_HOSTS='["docs.internal.example","wiki.corp.local"]' +export DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_ALLOWED_ROOTS='["$DOCUMENTS", "/srv/docs"]' ``` Some settings also have **legacy aliases** for convenience: @@ -176,6 +199,86 @@ _Note: Scraper settings are often overridden per-job via CLI arguments like `--m > **Migration Note:** In versions prior to 1.37, `document.maxSize` was a top-level setting. It has been moved to `scraper.document.maxSize`. Update your config files accordingly. +### Scraper Security (`scraper.security`) + +Outbound network access and local file access defaults are intentionally conservative so internet-exposed deployments do not automatically gain access to private networks or broad local file trees. + +#### Network (`scraper.security.network`) + +| Option | Default | Description | +|:-------|:--------|:------------| +| `allowPrivateNetworks` | `false` | Blocks loopback, RFC1918 private ranges, link-local, and other special-use targets unless explicitly allowed. 
When set to `true`, all network targets become eligible. | +| `allowedHosts` | `[]` | Hostname-bound exceptions. Only requests explicitly targeting these hostnames are exempt from private-network blocking. | +| `allowedCidrs` | `[]` | Address-bound exceptions. Allows direct IP access or resolved host connections whose address falls inside one of the configured CIDRs. | +| `allowInvalidTls` | `false` | Broad HTTPS certificate-verification override. This only applies after the target has already passed network access checks. | + +An empty `allowedHosts` and `allowedCidrs` list does not allow any private or special-use targets while `allowPrivateNetworks` remains `false`. + +#### File Access (`scraper.security.fileAccess`) + +| Option | Default | Description | +|:-------|:--------|:------------| +| `mode` | `allowedRoots` | Local file access mode: `disabled`, `allowedRoots`, or `unrestricted`. | +| `allowedRoots` | `[$DOCUMENTS]` | Allowlisted local roots when `mode` is `allowedRoots`. Empty means no user-requested `file://` access is permitted. | +| `followSymlinks` | `false` | Blocks symlinks by default. When enabled, resolved targets must still stay inside an allowed root. | +| `includeHidden` | `false` | Blocks hidden files, hidden directories, and hidden archive members by default, even when explicitly requested. | + +`$DOCUMENTS` is a convenience token. It expands to the current account's documents directory when that directory can be resolved. If it cannot be resolved, it grants no access by itself. + +Internally managed temporary archive files created during accepted web archive scraping remain allowed even when they sit outside user-configured roots. That exception is limited to the downloaded archive artifact and its virtual members. 
+ +### Security Examples + +Selective internal network access with self-signed HTTPS: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: false + allowedHosts: + - docs.internal.example + allowedCidrs: + - 10.42.0.0/16 + allowInvalidTls: true +``` + +Restricted local file access: + +```yaml +scraper: + security: + fileAccess: + mode: allowedRoots + allowedRoots: + - $DOCUMENTS + - /srv/docs + followSymlinks: false + includeHidden: false +``` + +Fully trusted local deployment: + +```yaml +scraper: + security: + network: + allowPrivateNetworks: true + allowInvalidTls: false + fileAccess: + mode: unrestricted + allowedRoots: [] + followSymlinks: true + includeHidden: true +``` + +Be explicit with these overrides: + +- `allowPrivateNetworks: true` broadens network reach beyond public internet targets. +- `allowInvalidTls: true` broadly trusts invalid HTTPS certificates, but it still does not bypass network allowlists. +- `fileAccess.mode: allowedRoots` with `allowedRoots: []` denies all user-requested `file://` access. +- Unresolved `$DOCUMENTS` tokens do not fall back to `$HOME` or any other implicit path. + ### GitHub Authentication Environment variables for authenticating with GitHub when scraping private repositories. diff --git a/openspec/changes/add-fetch-access-controls/design.md b/openspec/changes/add-fetch-access-controls/design.md index d159be38..1acc93cb 100644 --- a/openspec/changes/add-fetch-access-controls/design.md +++ b/openspec/changes/add-fetch-access-controls/design.md @@ -73,13 +73,11 @@ Alternative considered: permissive defaults with warnings only. Rejected because Alternative considered: separate booleans such as `allowLoopback` and `allowLinkLocal`. Rejected because they create overlapping policy states and make it harder to explain how exceptions should be configured. 
-### Keep TLS verification secure by default with host-scoped exceptions +### Keep TLS verification secure by default with a single override -TLS certificate verification remains enabled by default for all HTTPS requests. Add `allowInvalidTls` for broad override behavior and `allowedInvalidTlsHosts` for hostname-bound exceptions when operators need to reach trusted internal services with self-signed or otherwise invalid certificates. +TLS certificate verification remains enabled by default for all HTTPS requests. Add `allowInvalidTls` as a single broad override for environments that need to reach services with self-signed or otherwise invalid certificates. TLS exceptions are evaluated only after the target has already been permitted by the network access policy. -TLS exceptions are evaluated only after the target has already been permitted by the network access policy. `allowedInvalidTlsHosts` only affects certificate validation for the explicitly requested hostname. It does not expand network reachability, does not permit direct IP access unless `allowInvalidTls` is enabled globally, and does not automatically trust redirected or secondary-request hosts unless they are also listed. - -Example host-scoped invalid TLS exception: +Example invalid TLS override: ```yaml scraper: @@ -88,11 +86,11 @@ scraper: allowPrivateNetworks: false allowedHosts: - docs.internal.example - allowInvalidTls: false - allowedInvalidTlsHosts: - - docs.internal.example + allowInvalidTls: true ``` +Alternative considered: hostname-scoped invalid TLS exceptions. Rejected because they add disproportionate policy complexity and do not map cleanly onto browser-based fetching behavior. A future custom-certificate feature would be a better way to support scoped trust. + Alternative considered: placing TLS settings under `scraper.fetcher`. Rejected because certificate verification is a trust-boundary policy that must apply consistently across HTTP and browser-based fetch paths. 
### Add file access modes with allowlisted roots and secure traversal defaults @@ -148,7 +146,7 @@ Alternative considered: using `$HOME` as a default root. Rejected because it is - [Regression in supported web archive scraping] -> Exempt internally managed temp archive handoff from user file-root checks while preserving network policy on the original URL. - [DNS resolution and redirect validation complexity] -> Keep the first implementation limited to HTTP(S) plus redirect revalidation and well-defined CIDR checks, covered by focused tests. - [Browser subrequest enforcement complexity] -> Intercept all Playwright requests centrally and apply the same resolver/policy helper used by non-browser fetchers. -- [Invalid TLS exceptions broaden trust for named hosts] -> Keep TLS verification enabled by default and require explicit hostname-based opt-in for self-signed or invalid certificates. +- [Invalid TLS override broadens trust for all HTTPS targets] -> Keep TLS verification enabled by default and require explicit opt-in only when deployments accept the wider trust boundary. - [Platform-specific `$DOCUMENTS` resolution differences] -> Fall back to explicit paths when the token cannot be resolved reliably. - [Confusion around hidden path semantics] -> Treat `includeHidden: false` as a hard deny for both direct fetch and traversal, and document this explicitly. - [Symlink handling may surprise users with linked doc folders] -> Keep an opt-in `followSymlinks` flag and validate the resolved target stays within an allowed root. 
diff --git a/openspec/changes/add-fetch-access-controls/proposal.md b/openspec/changes/add-fetch-access-controls/proposal.md index b44ff1fd..f40fe84f 100644 --- a/openspec/changes/add-fetch-access-controls/proposal.md +++ b/openspec/changes/add-fetch-access-controls/proposal.md @@ -6,7 +6,7 @@ The product intentionally fetches arbitrary web and local content, but today it - Add configurable outbound access controls for HTTP(S) fetches used by `fetch_url` and scraping workflows. - Default-deny access to private, loopback, link-local, and other special-use network targets unless explicitly allowed via host or CIDR allowlists. -- Add configurable TLS verification controls for trusted internal hosts with invalid or self-signed certificates. +- Add configurable TLS verification controls for environments that need to allow invalid or self-signed HTTPS certificates. - Add configurable local file access modes, with allowed root directories, `$DOCUMENTS` token expansion, and secure defaults for symlinks and hidden paths. - Preserve supported archive workflows by distinguishing user-requested local file access from internally managed temporary files and archive member resolution. - Enforce the same access policy across CLI, MCP, and web-triggered fetch/scrape operations. 
diff --git a/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md b/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md index bf03b26c..c68e3612 100644 --- a/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md +++ b/openspec/changes/add-fetch-access-controls/specs/configuration/spec.md @@ -38,11 +38,6 @@ The system SHALL support overriding any configuration setting via environment va - **WHEN** the configuration is loaded - **THEN** the resulting `scraper.security.network.allowedHosts` value SHALL be parsed as a string array -#### Scenario: TLS host array override provided as JSON -- **GIVEN** the environment variable `DOCS_MCP_SCRAPER_SECURITY_NETWORK_ALLOWED_INVALID_TLS_HOSTS` is provided as `["docs.internal.example"]` -- **WHEN** the configuration is loaded -- **THEN** the resulting `scraper.security.network.allowedInvalidTlsHosts` value SHALL be parsed as a string array - #### Scenario: Array override provided as inline array string - **GIVEN** the environment variable `DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_ALLOWED_ROOTS` is provided as `["$DOCUMENTS", "/srv/docs"]` - **WHEN** the configuration is loaded @@ -86,19 +81,6 @@ The system SHALL expose configuration for outbound network and local file access - **WHEN** the configuration is loaded - **THEN** `scraper.security.network.allowInvalidTls` SHALL default to `false` -#### Scenario: Invalid TLS host allowlist defaults to empty -- **GIVEN** no custom security configuration is provided -- **WHEN** the configuration is loaded -- **THEN** `scraper.security.network.allowedInvalidTlsHosts` SHALL default to an empty list - -#### Scenario: Invalid TLS host allowlist permits named HTTPS target -- **GIVEN** `scraper.security.network.allowInvalidTls` is `false` -- **AND** `scraper.security.network.allowedInvalidTlsHosts` contains `docs.internal.example` -- **AND** the target is otherwise permitted by network policy -- **WHEN** the configuration is loaded -- **THEN** 
requests explicitly targeting `docs.internal.example` SHALL be eligible for TLS verification bypass -- **AND** direct IP requests SHALL still require `allowInvalidTls` - #### Scenario: Broad invalid TLS override enables all HTTPS targets - **GIVEN** `scraper.security.network.allowInvalidTls` is `true` - **WHEN** the configuration is loaded diff --git a/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md b/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md index d16b7052..b31e6fbb 100644 --- a/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md +++ b/openspec/changes/add-fetch-access-controls/specs/outbound-access-control/spec.md @@ -63,27 +63,9 @@ The system SHALL verify TLS certificates for HTTPS requests by default across HT - **WHEN** a user fetches an HTTPS URL whose certificate is invalid or self-signed - **THEN** the system SHALL reject the request because TLS verification failed -#### Scenario: Allow invalid TLS for configured hostname -- **GIVEN** `allowInvalidTls` is `false` -- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` -- **AND** the target is already permitted by the outbound network access policy -- **WHEN** a user fetches `https://docs.internal.example/guide` -- **THEN** the system SHALL permit the request to proceed without rejecting the invalid certificate - -#### Scenario: Host-scoped invalid TLS exception does not permit direct IP access -- **GIVEN** `allowInvalidTls` is `false` -- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` -- **WHEN** a user fetches the direct IP address of that service over HTTPS with an invalid certificate -- **THEN** the system SHALL reject the request unless `allowInvalidTls` is `true` - -#### Scenario: Host-scoped invalid TLS exception does not permit redirected hostname -- **GIVEN** `allowInvalidTls` is `false` -- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` -- **WHEN** 
the request redirects to `wiki.internal.example` with an invalid certificate -- **THEN** the redirected request SHALL be rejected unless the new hostname is also allowed by TLS policy - #### Scenario: Broad invalid TLS override enables all HTTPS targets - **GIVEN** `allowInvalidTls` is `true` +- **AND** the target is already permitted by the outbound network access policy - **WHEN** a user fetches an HTTPS URL with an invalid certificate - **THEN** the system SHALL permit the request to proceed without rejecting the certificate @@ -116,12 +98,11 @@ The system SHALL apply the same outbound network access policy to every HTTP(S) - **WHEN** the page requests a subresource from the direct IP address of that service - **THEN** the system SHALL reject the subrequest unless the address is also permitted by `allowedCidrs` or `allowPrivateNetworks` -#### Scenario: Browser subrequest invalid TLS exception remains hostname-bound +#### Scenario: Browser subrequest respects broad invalid TLS override - **GIVEN** browser-based fetching is used -- **AND** `allowInvalidTls` is `false` -- **AND** `allowedInvalidTlsHosts` contains `docs.internal.example` -- **WHEN** the page requests an HTTPS subresource from the direct IP address of that service with an invalid certificate -- **THEN** the system SHALL reject the subrequest unless `allowInvalidTls` is `true` +- **AND** `allowInvalidTls` is `true` +- **WHEN** the page requests an HTTPS subresource with an invalid certificate +- **THEN** the system SHALL permit the subrequest to proceed without rejecting the certificate #### Scenario: Block redirect to restricted network target - **WHEN** a public URL redirects to a target blocked by the outbound access policy diff --git a/openspec/changes/add-fetch-access-controls/tasks.md b/openspec/changes/add-fetch-access-controls/tasks.md index add1f223..1cf6ec2a 100644 --- a/openspec/changes/add-fetch-access-controls/tasks.md +++ b/openspec/changes/add-fetch-access-controls/tasks.md @@ -1,47 
+1,47 @@ ## 1. Configuration Model -- [ ] 1.1 Add `scraper.security` to the typed config schema and defaults, including network restrictions and file access settings. -- [ ] 1.2 Implement config parsing for `network.allowedHosts`, `network.allowedCidrs`, `network.allowInvalidTls`, `network.allowedInvalidTlsHosts`, `fileAccess.mode`, `allowedRoots`, `followSymlinks`, and `includeHidden`, including JSON/YAML-style array env values and `$DOCUMENTS` token support. -- [ ] 1.3 Extend configuration tests to cover default values, nested environment variable overrides, array env parsing, invalid-TLS settings, and unresolvable `$DOCUMENTS` behavior. +- [x] 1.1 Add `scraper.security` to the typed config schema and defaults, including network restrictions and file access settings. +- [x] 1.2 Implement config parsing for `network.allowedHosts`, `network.allowedCidrs`, `network.allowInvalidTls`, `fileAccess.mode`, `allowedRoots`, `followSymlinks`, and `includeHidden`, including JSON/YAML-style array env values and `$DOCUMENTS` token support. +- [x] 1.3 Extend configuration tests to cover default values, nested environment variable overrides, array env parsing, invalid-TLS settings, and unresolvable `$DOCUMENTS` behavior. ## 2. Shared Access Policy -- [ ] 2.1 Add a shared URL and file access policy helper that evaluates HTTP(S) targets, CIDR/host allowlists, and file path containment. -- [ ] 2.2 Enforce the shared policy in `AutoDetectFetcher` and HTTP fetch flows, including mandatory redirect-target and DNS-resolved IP revalidation. -- [ ] 2.3 Enforce the shared policy in local file fetch and local scrape entry points before any file system reads occur, including virtual archive paths. -- [ ] 2.4 Preserve supported web archive root processing by allowing internally managed temp archive handoff after the original network URL has passed policy checks. 
-- [ ] 2.5 Define and implement hostname-bound `allowedHosts` exceptions and address-bound `allowedCidrs` exceptions, including redirect handling for new targets. -- [ ] 2.6 Define and implement TLS verification policy with secure defaults, hostname-bound `allowedInvalidTlsHosts`, and broad `allowInvalidTls` override behavior. -- [ ] 2.7 Ensure TLS exceptions are evaluated only after network policy allows the target, and treat direct-IP invalid-TLS access as unsupported unless `allowInvalidTls` is enabled. +- [x] 2.1 Add a shared URL and file access policy helper that evaluates HTTP(S) targets, CIDR/host allowlists, and file path containment. +- [x] 2.2 Enforce the shared policy in `AutoDetectFetcher` and HTTP fetch flows, including mandatory redirect-target and DNS-resolved IP revalidation. +- [x] 2.3 Enforce the shared policy in local file fetch and local scrape entry points before any file system reads occur, including virtual archive paths. +- [x] 2.4 Preserve supported web archive root processing by allowing internally managed temp archive handoff after the original network URL has passed policy checks. +- [x] 2.5 Define and implement hostname-bound `allowedHosts` exceptions and address-bound `allowedCidrs` exceptions, including redirect handling for new targets. +- [x] 2.6 Define and implement TLS verification policy with secure defaults and broad `allowInvalidTls` override behavior. +- [x] 2.7 Ensure TLS exceptions are evaluated only after network policy allows the target. ## 3. Browser Enforcement -- [ ] 3.1 Enforce the shared outbound policy on all Playwright-initiated requests, including frames, iframes, scripts, stylesheets, images, and fetch/XHR calls. -- [ ] 3.2 Ensure browser subrequests apply the same hostname-bound `allowedHosts`, address-bound `allowedCidrs`, and invalid-TLS semantics as non-browser HTTP fetches. 
+- [x] 3.1 Enforce the shared outbound policy on all Playwright-initiated requests, including frames, iframes, scripts, stylesheets, images, and fetch/XHR calls. +- [x] 3.2 Ensure browser subrequests apply the same hostname-bound `allowedHosts`, address-bound `allowedCidrs`, and broad invalid-TLS semantics as non-browser HTTP fetches. ## 4. Local File Traversal Hardening -- [ ] 4.1 Update local file traversal to block or skip hidden files and directories when `includeHidden` is disabled. -- [ ] 4.2 Update local file traversal to reject symlinks by default and allow them only when `followSymlinks` is enabled and the resolved target remains inside an allowed root. -- [ ] 4.3 Ensure direct `file://` fetches use the same hidden-path and symlink rules as directory crawling. -- [ ] 4.4 Validate archive-member `file://` paths against the concrete archive file path instead of the nonexistent combined virtual path. -- [ ] 4.5 Limit the internal temp-archive exception to the accepted downloaded archive artifact and its virtual members. +- [x] 4.1 Update local file traversal to block or skip hidden files and directories when `includeHidden` is disabled. +- [x] 4.2 Update local file traversal to reject symlinks by default and allow them only when `followSymlinks` is enabled and the resolved target remains inside an allowed root. +- [x] 4.3 Ensure direct `file://` fetches use the same hidden-path and symlink rules as directory crawling. +- [x] 4.4 Validate archive-member `file://` paths against the concrete archive file path instead of the nonexistent combined virtual path. +- [x] 4.5 Limit the internal temp-archive exception to the accepted downloaded archive artifact and its virtual members. ## 5. Verification -- [ ] 5.1 Add tests for blocked loopback, private-network, and redirect-to-private HTTP(S) requests. -- [ ] 5.2 Add tests for allowed internal targets when hosts or CIDRs are explicitly configured while `allowPrivateNetworks` remains disabled. 
-- [ ] 5.3 Add tests for hostname-bound host allowlists, CIDR-based IP allowlists, and redirects or direct-IP requests that must remain blocked unless separately allowed. -- [ ] 5.4 Add tests for invalid TLS rejection by default, hostname-bound invalid-TLS exceptions, redirected-host TLS rejection, and broad invalid-TLS override behavior. -- [ ] 5.5 Add tests proving invalid-TLS host exceptions do not bypass network allowlists and that direct-IP invalid-TLS access remains blocked unless `allowInvalidTls` is enabled. -- [ ] 5.6 Add tests for browser-based blocked subrequests and allowed browser subrequests under explicit allowlists, including invalid-TLS behavior. -- [ ] 5.7 Add tests for allowed-root file access, blocked outside-root access, hidden path blocking, and symlink containment. +- [x] 5.1 Add tests for blocked loopback, private-network, and redirect-to-private HTTP(S) requests. +- [x] 5.2 Add tests for allowed internal targets when hosts or CIDRs are explicitly configured while `allowPrivateNetworks` remains disabled. +- [x] 5.3 Add tests for hostname-bound host allowlists, CIDR-based IP allowlists, and redirects or direct-IP requests that must remain blocked unless separately allowed. +- [x] 5.4 Add tests for invalid TLS rejection by default and broad invalid-TLS override behavior. +- [ ] 5.5 Add tests proving invalid-TLS override does not bypass network allowlists. +- [x] 5.6 Add tests for browser-based blocked subrequests and allowed browser subrequests under explicit allowlists, including invalid-TLS behavior. +- [x] 5.7 Add tests for allowed-root file access, blocked outside-root access, hidden path blocking, and symlink containment. - [ ] 5.8 Add tests for supported web archive root scraping via temp files, virtual archive-member paths within allowed roots, and unrelated temp-file rejection. ## 6. Documentation -- [ ] 6.1 Update user-facing configuration documentation to describe the new security defaults and override mechanisms. 
-- [ ] 6.2 Update security/authentication documentation to explain the trust model, private-network defaults, and local file restrictions. -- [ ] 6.3 Add a dedicated `docs/infrastructure/security.md` document covering trust boundaries, deployment hardening, outbound access controls, and local file access policy. -- [ ] 6.4 Document sample security configurations, environment-variable array encoding, empty network/file allowlist behavior, and counterintuitive semantics such as broad access from `allowPrivateNetworks: true`, broad TLS trust from `allowInvalidTls: true`, and unresolved `$DOCUMENTS` tokens. -- [ ] 6.5 Document that invalid-TLS exceptions do not bypass network allowlists and that direct-IP invalid-TLS access requires `allowInvalidTls: true`. +- [x] 6.1 Update user-facing configuration documentation to describe the new security defaults and override mechanisms. +- [x] 6.2 Update security/authentication documentation to explain the trust model, private-network defaults, and local file restrictions. +- [x] 6.3 Add a dedicated `docs/infrastructure/security.md` document covering trust boundaries, deployment hardening, outbound access controls, and local file access policy. +- [x] 6.4 Document sample security configurations, environment-variable array encoding, empty network/file allowlist behavior, and counterintuitive semantics such as broad access from `allowPrivateNetworks: true`, broad TLS trust from `allowInvalidTls: true`, and unresolved `$DOCUMENTS` tokens. +- [x] 6.5 Document that invalid-TLS override does not bypass network allowlists. 
diff --git a/src/scraper/fetcher/AutoDetectFetcher.ts b/src/scraper/fetcher/AutoDetectFetcher.ts index 25eef04c..eed828cd 100644 --- a/src/scraper/fetcher/AutoDetectFetcher.ts +++ b/src/scraper/fetcher/AutoDetectFetcher.ts @@ -22,11 +22,12 @@ import type { ContentFetcher, FetchOptions, RawContent } from "./types"; export class AutoDetectFetcher implements ContentFetcher { private readonly httpFetcher: HttpFetcher; private readonly browserFetcher: BrowserFetcher; - private readonly fileFetcher = new FileFetcher(); + private readonly fileFetcher: FileFetcher; constructor(scraperConfig: AppConfig["scraper"]) { this.httpFetcher = new HttpFetcher(scraperConfig); this.browserFetcher = new BrowserFetcher(scraperConfig); + this.fileFetcher = new FileFetcher(scraperConfig); } /** diff --git a/src/scraper/fetcher/BrowserFetcher.test.ts b/src/scraper/fetcher/BrowserFetcher.test.ts new file mode 100644 index 00000000..81f9a13b --- /dev/null +++ b/src/scraper/fetcher/BrowserFetcher.test.ts @@ -0,0 +1,92 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { loadConfig } from "../../utils/config"; + +vi.mock("playwright", () => ({ + chromium: { + launch: vi.fn(), + }, +})); + +import { chromium } from "playwright"; +import { BrowserFetcher } from "./BrowserFetcher"; + +describe("BrowserFetcher", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("uses broad invalid TLS override for the browser context", async () => { + const setViewportSize = vi.fn().mockResolvedValue(undefined); + const setExtraHTTPHeaders = vi.fn().mockResolvedValue(undefined); + const route = vi.fn().mockResolvedValue(undefined); + const unroute = vi.fn().mockResolvedValue(undefined); + const closePage = vi.fn().mockResolvedValue(undefined); + const page = { + setViewportSize, + setExtraHTTPHeaders, + route, + unroute, + close: closePage, + goto: vi.fn().mockResolvedValue({ + status: () => 200, + headers: () => ({ "content-type": "text/html" }), + }), + url: 
vi.fn().mockReturnValue("https://example.com"), + content: vi.fn().mockResolvedValue("ok"), + }; + const closeContext = vi.fn().mockResolvedValue(undefined); + const newContext = vi.fn().mockResolvedValue({ + newPage: vi.fn().mockResolvedValue(page), + close: closeContext, + }); + const browser = { + newContext, + close: vi.fn().mockResolvedValue(undefined), + }; + vi.mocked(chromium.launch).mockResolvedValue(browser as never); + + const config = loadConfig().scraper; + config.security.network.allowInvalidTls = true; + const fetcher = new BrowserFetcher(config); + + await fetcher.fetch("http://example.com"); + + expect(newContext).toHaveBeenCalledWith( + expect.objectContaining({ ignoreHTTPSErrors: true }), + ); + }); + + it("restores fingerprint headers and desktop viewport", async () => { + const setViewportSize = vi.fn().mockResolvedValue(undefined); + const setExtraHTTPHeaders = vi.fn().mockResolvedValue(undefined); + const page = { + setViewportSize, + setExtraHTTPHeaders, + route: vi.fn().mockResolvedValue(undefined), + unroute: vi.fn().mockResolvedValue(undefined), + close: vi.fn().mockResolvedValue(undefined), + goto: vi.fn().mockResolvedValue({ + status: () => 200, + headers: () => ({ "content-type": "text/html" }), + }), + url: vi.fn().mockReturnValue("https://example.com"), + content: vi.fn().mockResolvedValue("ok"), + }; + const browser = { + newContext: vi.fn().mockResolvedValue({ + newPage: vi.fn().mockResolvedValue(page), + close: vi.fn().mockResolvedValue(undefined), + }), + close: vi.fn().mockResolvedValue(undefined), + }; + vi.mocked(chromium.launch).mockResolvedValue(browser as never); + + const fetcher = new BrowserFetcher(loadConfig().scraper); + await fetcher.fetch("https://example.com", { headers: { "X-Test": "1" } }); + + expect(setViewportSize).toHaveBeenCalledWith({ width: 1920, height: 1080 }); + expect(setExtraHTTPHeaders).toHaveBeenCalledWith( + expect.objectContaining({ "X-Test": "1" }), + ); + }); +}); diff --git 
a/src/scraper/fetcher/BrowserFetcher.ts b/src/scraper/fetcher/BrowserFetcher.ts
index a441a506..47cdd1eb 100644
--- a/src/scraper/fetcher/BrowserFetcher.ts
+++ b/src/scraper/fetcher/BrowserFetcher.ts
@@ -1,4 +1,5 @@
 import { type Browser, chromium, type Page } from "playwright";
+import { ScraperAccessPolicy } from "../../utils/accessPolicy";
 import type { AppConfig } from "../../utils/config";
 import { ScraperError } from "../../utils/errors";
 import { logger } from "../../utils/logger";
@@ -17,38 +18,68 @@
  */
 export class BrowserFetcher implements ContentFetcher {
   private browser: Browser | null = null;
-  private page: Page | null = null;
   private fingerprintGenerator: FingerprintGenerator;
   private readonly defaultTimeoutMs: number;
+  private readonly accessPolicy: ScraperAccessPolicy;
 
   constructor(scraperConfig: AppConfig["scraper"]) {
     this.defaultTimeoutMs = scraperConfig.browserTimeoutMs;
     this.fingerprintGenerator = new FingerprintGenerator();
+    this.accessPolicy = new ScraperAccessPolicy(scraperConfig.security);
   }
 
   canFetch(source: string): boolean {
     return source.startsWith("http://") || source.startsWith("https://");
   }
 
+  private async isRequestAllowed(url: string): Promise<boolean> {
+    try {
+      await this.accessPolicy.assertNetworkUrlAllowed(url);
+      return true;
+    } catch {
+      return false;
+    }
+  }
+
   async fetch(source: string, options?: FetchOptions): Promise<RawContent> {
+    let page: Page | null = null;
+    let browserContext: { newPage(): Promise<Page>; close(): Promise<void> } | null =
+      null;
+
     try {
-      await this.ensureBrowserReady();
+      await this.accessPolicy.assertNetworkUrlAllowed(source);
+
+      const browser = await this.ensureBrowserReady();
+      const fingerprintHeaders = this.fingerprintGenerator.generateHeaders();
+      browserContext = await browser.newContext({
+        ignoreHTTPSErrors: this.accessPolicy.shouldAllowInvalidTls(
+          "https://browser-context.local",
+        ),
+      });
+      page = await browserContext.newPage();
+      await page.setViewportSize({ width: 1920, height: 1080 });
 
-      if 
(!this.page) { - throw new ScraperError("Failed to create browser page", false); - } + await page.route("**/*", async (route) => { + const requestUrl = route.request().url(); + if (!(await this.isRequestAllowed(requestUrl))) { + return await route.abort("blockedbyclient"); + } + + return await route.continue(); + }); // Set custom headers if provided - if (options?.headers) { - await this.page.setExtraHTTPHeaders(options.headers); - } + await page.setExtraHTTPHeaders({ + ...fingerprintHeaders, + ...options?.headers, + }); // Set timeout const timeout = options?.timeout || this.defaultTimeoutMs; // Navigate to the page and wait for it to load logger.debug(`Navigating to ${source} with browser...`); - const response = await this.page.goto(source, { + const response = await page.goto(source, { waitUntil: "networkidle", timeout, }); @@ -70,10 +101,11 @@ export class BrowserFetcher implements ContentFetcher { } // Get the final URL after any redirects - const finalUrl = this.page.url(); + const finalUrl = page.url(); + await this.accessPolicy.assertNetworkUrlAllowed(finalUrl); // Get the page content - const content = await this.page.content(); + const content = await page.content(); const contentBuffer = Buffer.from(content, "utf-8"); // Determine content type @@ -103,6 +135,10 @@ export class BrowserFetcher implements ContentFetcher { false, error instanceof Error ? 
error : undefined,
       );
+    } finally {
+      await page?.unroute("**/*").catch(() => undefined);
+      await page?.close().catch(() => undefined);
+      await browserContext?.close().catch(() => undefined);
     }
   }
 
@@ -114,22 +150,13 @@
     });
   }
 
-  private async ensureBrowserReady(): Promise<void> {
+  private async ensureBrowserReady(): Promise<Browser> {
     if (!this.browser) {
       logger.debug("Launching browser...");
       this.browser = await BrowserFetcher.launchBrowser();
     }
 
-    if (!this.page) {
-      this.page = await this.browser.newPage();
-
-      // Generate and set realistic browser headers
-      const dynamicHeaders = this.fingerprintGenerator.generateHeaders();
-      await this.page.setExtraHTTPHeaders(dynamicHeaders);
-
-      // Set viewport
-      await this.page.setViewportSize({ width: 1920, height: 1080 });
-    }
+    return this.browser;
   }
 
   /**
@@ -138,17 +165,6 @@
    */
   async close(): Promise<void> {
     // Close page first
-    if (this.page) {
-      try {
-        await this.page.close();
-      } catch (error) {
-        logger.warn(`⚠️ Error closing browser page: ${error}`);
-      } finally {
-        this.page = null;
-      }
-    }
-
-    // Then close browser
     if (this.browser) {
       try {
         await this.browser.close();
diff --git a/src/scraper/fetcher/FileFetcher.ts b/src/scraper/fetcher/FileFetcher.ts
index f7a6129c..542c889d 100644
--- a/src/scraper/fetcher/FileFetcher.ts
+++ b/src/scraper/fetcher/FileFetcher.ts
@@ -1,5 +1,7 @@
 import crypto from "node:crypto";
 import fs from "node:fs/promises";
+import { fileUrlToPathLoose, ScraperAccessPolicy } from "../../utils/accessPolicy";
+import type { AppConfig } from "../../utils/config";
 import { ScraperError } from "../../utils/errors";
 import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
 import {
@@ -13,6 +15,14 @@ import {
  * Fetches content from local file system.
*/ export class FileFetcher implements ContentFetcher { + private readonly accessPolicy: ScraperAccessPolicy | null; + + constructor(scraperConfig?: AppConfig["scraper"]) { + this.accessPolicy = scraperConfig + ? new ScraperAccessPolicy(scraperConfig.security) + : null; + } + canFetch(source: string): boolean { return source.startsWith("file://"); } @@ -23,16 +33,9 @@ export class FileFetcher implements ContentFetcher { * Supports conditional fetching via ETag comparison for efficient refresh operations. */ async fetch(source: string, options?: FetchOptions): Promise { - // Remove the file:// protocol prefix and handle both file:// and file:/// formats - let filePath = source.replace(/^file:\/\/\/?/, ""); - - // Decode percent-encoded characters - filePath = decodeURIComponent(filePath); - - // Ensure absolute path on Unix-like systems (if not already absolute) - if (!filePath.startsWith("/") && process.platform !== "win32") { - filePath = `/${filePath}`; - } + const filePath = this.accessPolicy + ? 
(await this.accessPolicy.resolveFileAccess(source)).filePath + : fileUrlToPathLoose(source); try { const stats = await fs.stat(filePath); diff --git a/src/scraper/fetcher/HttpFetcher.ts b/src/scraper/fetcher/HttpFetcher.ts index 5035a5e7..000d732f 100644 --- a/src/scraper/fetcher/HttpFetcher.ts +++ b/src/scraper/fetcher/HttpFetcher.ts @@ -1,5 +1,7 @@ +import https from "node:https"; import axios, { type AxiosError, type AxiosRequestConfig } from "axios"; import { CancellationError } from "../../pipeline/errors"; +import { ScraperAccessPolicy } from "../../utils/accessPolicy"; import type { AppConfig } from "../../utils/config"; import { ChallengeError, RedirectError, ScraperError } from "../../utils/errors"; import { logger } from "../../utils/logger"; @@ -40,11 +42,13 @@ export class HttpFetcher implements ContentFetcher { ]; private fingerprintGenerator: FingerprintGenerator; + private readonly accessPolicy: ScraperAccessPolicy; constructor(scraperConfig: AppConfig["scraper"]) { this.maxRetriesDefault = scraperConfig.fetcher.maxRetries; this.baseDelayDefaultMs = scraperConfig.fetcher.baseDelayMs; this.fingerprintGenerator = new FingerprintGenerator(); + this.accessPolicy = new ScraperAccessPolicy(scraperConfig.security); } canFetch(source: string): boolean { @@ -79,104 +83,153 @@ export class HttpFetcher implements ContentFetcher { baseDelay: number = this.baseDelayDefaultMs, followRedirects: boolean = true, ): Promise { + await this.accessPolicy.assertNetworkUrlAllowed(source); + for (let attempt = 0; attempt <= maxRetries; attempt++) { try { - const fingerprint = this.fingerprintGenerator.generateHeaders(); - const headers: Record = { - ...fingerprint, - ...options?.headers, // User-provided headers override generated ones - }; - - // Add If-None-Match header for conditional requests if ETag is provided - if (options?.etag) { - headers["If-None-Match"] = options.etag; - logger.debug( - `Conditional request for ${source} with If-None-Match: ${options.etag}`, - 
); - } + let currentUrl = source; + let redirectCount = 0; + + while (true) { + const fingerprint = this.fingerprintGenerator.generateHeaders(); + const headers: Record = { + ...fingerprint, + ...options?.headers, // User-provided headers override generated ones + }; + + // Add If-None-Match header for conditional requests if ETag is provided + if (options?.etag) { + headers["If-None-Match"] = options.etag; + logger.debug( + `Conditional request for ${source} with If-None-Match: ${options.etag}`, + ); + } + + const config: AxiosRequestConfig = { + responseType: "arraybuffer", + headers: { + ...headers, + // Override Accept-Encoding to exclude zstd which Axios doesn't handle automatically + // This prevents servers from sending zstd-compressed content that would appear as binary garbage + "Accept-Encoding": "gzip, deflate, br", + }, + timeout: options?.timeout, + signal: options?.signal, // Pass signal to axios + // Redirects are handled manually so every target can be revalidated before connect. 
+ maxRedirects: 0, + decompress: true, + // Allow 304 responses to be handled as successful responses + validateStatus: (status) => { + return (status >= 200 && status < 400) || status === 304; + }, + }; + + if (this.accessPolicy.shouldAllowInvalidTls(currentUrl)) { + config.httpsAgent = new https.Agent({ rejectUnauthorized: false }); + } + + const response = await axios.get(currentUrl, config); + + if (response.status >= 300 && response.status < 400) { + const location = response.headers.location; + if (!location) { + throw new ScraperError( + `Redirect response for ${currentUrl} did not include a location header`, + false, + ); + } + + if (!followRedirects) { + throw new RedirectError(currentUrl, location, response.status); + } + + if (redirectCount >= 5) { + throw new ScraperError( + `Too many redirects while fetching ${source}`, + false, + ); + } + + const redirectUrl = new URL(location, currentUrl).href; + await this.accessPolicy.assertNetworkUrlAllowed(redirectUrl); + + currentUrl = redirectUrl; + redirectCount += 1; + continue; + } + + // Handle 304 Not Modified responses for conditional requests + if (response.status === 304) { + logger.debug(`HTTP 304 Not Modified for ${currentUrl}`); + return { + content: Buffer.from(""), + mimeType: "text/plain", + source: currentUrl, + status: FetchStatus.NOT_MODIFIED, + } satisfies RawContent; + } + + const contentTypeHeader = response.headers["content-type"]; + const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader); + const contentEncoding = response.headers["content-encoding"]; + + // Convert ArrayBuffer to Buffer properly + let content: Buffer; + if (response.data instanceof ArrayBuffer) { + content = Buffer.from(response.data); + } else if (Buffer.isBuffer(response.data)) { + content = response.data; + } else if (typeof response.data === "string") { + content = Buffer.from(response.data, "utf-8"); + } else { + // Fallback for other data types + content = Buffer.from(response.data); + } + + 
// Determine the final effective URL after redirects (if any) + const finalUrl = + // Node follow-redirects style + response.request?.res?.responseUrl || + // Some adapters may expose directly + response.request?.responseUrl || + // Fallback to axios recorded config URL + response.config?.url || + currentUrl; + + await this.accessPolicy.assertNetworkUrlAllowed(finalUrl); + + // Extract ETag header for caching + const etag = response.headers.etag || response.headers.ETag; + if (etag) { + logger.debug(`Received ETag for ${finalUrl}: ${etag}`); + } + + // Extract Last-Modified header for caching + const lastModified = response.headers["last-modified"]; + const lastModifiedISO = lastModified + ? new Date(lastModified).toISOString() + : undefined; - const config: AxiosRequestConfig = { - responseType: "arraybuffer", - headers: { - ...headers, - // Override Accept-Encoding to exclude zstd which Axios doesn't handle automatically - // This prevents servers from sending zstd-compressed content that would appear as binary garbage - "Accept-Encoding": "gzip, deflate, br", - }, - timeout: options?.timeout, - signal: options?.signal, // Pass signal to axios - // Axios follows redirects by default, we need to explicitly disable it if needed - maxRedirects: followRedirects ? 
5 : 0, - decompress: true, - // Allow 304 responses to be handled as successful responses - validateStatus: (status) => { - return (status >= 200 && status < 300) || status === 304; - }, - }; - - const response = await axios.get(source, config); - - // Handle 304 Not Modified responses for conditional requests - if (response.status === 304) { - logger.debug(`HTTP 304 Not Modified for ${source}`); return { - content: Buffer.from(""), - mimeType: "text/plain", - source: source, - status: FetchStatus.NOT_MODIFIED, + content, + mimeType, + charset, + encoding: contentEncoding, + source: finalUrl, + etag, + lastModified: lastModifiedISO, + status: FetchStatus.SUCCESS, } satisfies RawContent; } - - const contentTypeHeader = response.headers["content-type"]; - const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader); - const contentEncoding = response.headers["content-encoding"]; - - // Convert ArrayBuffer to Buffer properly - let content: Buffer; - if (response.data instanceof ArrayBuffer) { - content = Buffer.from(response.data); - } else if (Buffer.isBuffer(response.data)) { - content = response.data; - } else if (typeof response.data === "string") { - content = Buffer.from(response.data, "utf-8"); - } else { - // Fallback for other data types - content = Buffer.from(response.data); + } catch (error: unknown) { + if (error instanceof RedirectError || error instanceof ChallengeError) { + throw error; } - // Determine the final effective URL after redirects (if any) - const finalUrl = - // Node follow-redirects style - response.request?.res?.responseUrl || - // Some adapters may expose directly - response.request?.responseUrl || - // Fallback to axios recorded config URL - response.config?.url || - source; - - // Extract ETag header for caching - const etag = response.headers.etag || response.headers.ETag; - if (etag) { - logger.debug(`Received ETag for ${source}: ${etag}`); + if (error instanceof ScraperError && !error.isRetryable) { + throw 
error; } - // Extract Last-Modified header for caching - const lastModified = response.headers["last-modified"]; - const lastModifiedISO = lastModified - ? new Date(lastModified).toISOString() - : undefined; - - return { - content, - mimeType, - charset, - encoding: contentEncoding, - source: finalUrl, - etag, - lastModified: lastModifiedISO, - status: FetchStatus.SUCCESS, - } satisfies RawContent; - } catch (error: unknown) { const axiosError = error as AxiosError; const status = axiosError.response?.status; const code = axiosError.code; diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts index f6698b4d..6a8222fd 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts @@ -174,6 +174,20 @@ describe("HtmlPlaywrightMiddleware", () => { document: { maxSize: 10 * 1024 * 1024, }, + security: { + network: { + allowPrivateNetworks: false, + allowedHosts: [], + allowedCidrs: [], + allowInvalidTls: false, + }, + fileAccess: { + mode: "unrestricted" as const, + allowedRoots: [], + followSymlinks: false, + includeHidden: false, + }, + }, }; playwrightMiddleware = new HtmlPlaywrightMiddleware(mockScraperConfig); }); @@ -943,6 +957,20 @@ describe("Route handling race condition protection", () => { document: { maxSize: 10 * 1024 * 1024, }, + security: { + network: { + allowPrivateNetworks: false, + allowedHosts: [], + allowedCidrs: [], + allowInvalidTls: false, + }, + fileAccess: { + mode: "unrestricted" as const, + allowedRoots: [], + followSymlinks: false, + includeHidden: false, + }, + }, }; playwrightMiddleware = new HtmlPlaywrightMiddleware(mockScraperConfig); }); diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts index 718e6d5c..a71ad1e9 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts @@ 
-1,4 +1,5 @@ import type { Browser, BrowserContext, ElementHandle, Frame, Page } from "playwright"; +import { ScraperAccessPolicy } from "../../utils/accessPolicy"; import { type AppConfig, defaults } from "../../utils/config"; import { logger } from "../../utils/logger"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; @@ -44,6 +45,7 @@ interface CachedResource { export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { private browser: Browser | null = null; private readonly config: AppConfig["scraper"]; + private readonly accessPolicy: ScraperAccessPolicy; // Static LRU cache for all fetched resources, shared across instances private static readonly resourceCache = new SimpleMemoryCache( @@ -52,6 +54,16 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { constructor(config: AppConfig["scraper"]) { this.config = config; + this.accessPolicy = new ScraperAccessPolicy(config.security); + } + + private async isRequestAllowed(url: string): Promise { + try { + await this.accessPolicy.assertNetworkUrlAllowed(url); + return true; + } catch { + return false; + } } /** @@ -536,6 +548,10 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { })(); const resourceType = route.request().resourceType(); + if (!(await this.isRequestAllowed(reqUrl))) { + return await route.abort("blockedbyclient"); + } + // Abort non-essential resources if (["image", "font", "media"].includes(resourceType)) { try { @@ -690,6 +706,10 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { let framePage: Page | null = null; try { + if (!(await this.isRequestAllowed(resolvedUrl))) { + return ""; + } + // Create a new page in the same browser context for consistency framePage = await parentPage.context().newPage(); @@ -862,9 +882,18 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { // Always create a browser context (with or without credentials) if (credentials) 
{ - browserContext = await browser.newContext({ httpCredentials: credentials }); + browserContext = await browser.newContext({ + httpCredentials: credentials, + ignoreHTTPSErrors: this.accessPolicy.shouldAllowInvalidTls( + "https://browser-context.local", + ), + }); } else { - browserContext = await browser.newContext(); + browserContext = await browser.newContext({ + ignoreHTTPSErrors: this.accessPolicy.shouldAllowInvalidTls( + "https://browser-context.local", + ), + }); } page = await browserContext.newPage(); @@ -906,6 +935,10 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { })(); const resourceType = route.request().resourceType(); + if (!(await this.isRequestAllowed(reqUrl))) { + return await route.abort("blockedbyclient"); + } + // Abort non-essential resources if (["image", "font", "media"].includes(resourceType)) { try { diff --git a/src/scraper/strategies/LocalFileStrategy.test.ts b/src/scraper/strategies/LocalFileStrategy.test.ts index ff90d65f..0e393952 100644 --- a/src/scraper/strategies/LocalFileStrategy.test.ts +++ b/src/scraper/strategies/LocalFileStrategy.test.ts @@ -9,7 +9,20 @@ vi.mock("node:fs/promises", () => ({ default: vol.promises })); vi.mock("node:fs"); describe("LocalFileStrategy", () => { - const appConfig = loadConfig(); + const appConfig = { + ...loadConfig(), + scraper: { + ...loadConfig().scraper, + security: { + ...loadConfig().scraper.security, + fileAccess: { + ...loadConfig().scraper.security.fileAccess, + mode: "unrestricted" as const, + allowedRoots: [], + }, + }, + }, + }; beforeEach(() => { vol.reset(); diff --git a/src/scraper/strategies/LocalFileStrategy.ts b/src/scraper/strategies/LocalFileStrategy.ts index 0ca23c34..9314de03 100644 --- a/src/scraper/strategies/LocalFileStrategy.ts +++ b/src/scraper/strategies/LocalFileStrategy.ts @@ -1,5 +1,6 @@ import fs from "node:fs/promises"; import path from "node:path"; +import { fileUrlToPathLoose, ScraperAccessPolicy } from 
"../../utils/accessPolicy"; import { type ArchiveAdapter, getArchiveAdapter } from "../../utils/archive"; import type { AppConfig } from "../../utils/config"; import { logger } from "../../utils/logger"; @@ -19,12 +20,15 @@ import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrate * Supports include/exclude filters and percent-encoded paths. */ export class LocalFileStrategy extends BaseScraperStrategy { - private readonly fileFetcher = new FileFetcher(); + private readonly fileFetcher: FileFetcher; private readonly pipelines: ContentPipeline[]; + private readonly accessPolicy: ScraperAccessPolicy; constructor(config: AppConfig) { super(config); + this.fileFetcher = new FileFetcher(config.scraper); this.pipelines = PipelineFactory.createStandardPipelines(config); + this.accessPolicy = new ScraperAccessPolicy(config.scraper.security); } canHandle(url: string): boolean { @@ -36,14 +40,11 @@ export class LocalFileStrategy extends BaseScraperStrategy { options: ScraperOptions, _signal?: AbortSignal, ): Promise { - // Parse the file URL properly to handle both file:// and file:/// formats - let filePath = item.url.replace(/^file:\/\/\/?/, ""); - filePath = decodeURIComponent(filePath); - - // Ensure absolute path on Unix-like systems (if not already absolute) - if (!filePath.startsWith("/") && process.platform !== "win32") { - filePath = `/${filePath}`; - } + const resolvedAccess = await this.accessPolicy.resolveFileAccess( + item.url, + options.internalAllowedFileRoots, + ); + const filePath = resolvedAccess.filePath; let stats: Awaited> | null = null; let archivePath: string | null = null; @@ -79,20 +80,33 @@ export class LocalFileStrategy extends BaseScraperStrategy { // Handle physical directory if (stats?.isDirectory()) { const contents = await fs.readdir(filePath); + const visibleContents = this.config.scraper.security.fileAccess.includeHidden + ? 
contents + : contents.filter((name) => !name.startsWith(".")); // Only return links that pass shouldProcessUrl - const links = contents - .map((name) => { + const linkEntries = await Promise.all( + visibleContents.map(async (name) => { + const entryPath = path.join(filePath, name); + if (!this.config.scraper.security.fileAccess.followSymlinks) { + const entryStats = await fs.lstat(entryPath).catch(() => null); + if (entryStats?.isSymbolicLink()) { + return null; + } + } + // Construct valid file URL using URL class to ensure proper encoding and structure - const url = new URL( - `file://${path.join(filePath, name).replace(/\\/g, "/")}`, - ); + const url = new URL(`file://${entryPath.replace(/\\/g, "/")}`); // Ensure we always have file:/// format (empty host) if (url.hostname !== "") { url.pathname = `/${url.hostname}${url.pathname}`; url.hostname = ""; } + return url.href; - }) + }), + ); + const links = linkEntries + .filter((url): url is string => Boolean(url)) .filter((url) => { const allowed = this.shouldProcessUrl(url, options); if (!allowed) { @@ -102,7 +116,7 @@ export class LocalFileStrategy extends BaseScraperStrategy { }); logger.debug( - `Found ${links.length} files in ${filePath} (from ${contents.length} entries)`, + `Found ${links.length} files in ${filePath} (from ${visibleContents.length} entries)`, ); return { url: item.url, links, status: FetchStatus.SUCCESS }; } @@ -133,7 +147,10 @@ export class LocalFileStrategy extends BaseScraperStrategy { virtualUrl.hostname = ""; } - if (this.shouldProcessUrl(virtualUrl.href, options)) { + if ( + this.shouldProcessUrl(virtualUrl.href, options) && + this.isVisibleArchiveEntry(entry.path) + ) { links.push(virtualUrl.href); } } @@ -230,6 +247,17 @@ export class LocalFileStrategy extends BaseScraperStrategy { return { archive: null, inner: null, adapter: null }; } + private isVisibleArchiveEntry(entryPath: string): boolean { + if (this.config.scraper.security.fileAccess.includeHidden) { + return true; + } + + 
return !entryPath + .split(/[\\/]+/) + .filter(Boolean) + .some((segment) => segment.startsWith(".")); + } + private async processArchiveEntry( item: QueueItem, archivePath: string, diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index b79f69d2..b17b50f8 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -231,8 +231,16 @@ export class WebScraperStrategy extends BaseScraperStrategy { // Delegate to LocalFileStrategy const localUrl = `file://${tempFile}`; const localItem = { ...item, url: localUrl }; + const localOptions = { + ...options, + internalAllowedFileRoots: [...(options.internalAllowedFileRoots ?? []), tempFile], + }; - const result = await this.localFileStrategy.processItem(localItem, options, signal); + const result = await this.localFileStrategy.processItem( + localItem, + localOptions, + signal, + ); // We need to fix up the links to point back to something meaningful? // If we process a zip, we get file:///tmp/.../file.txt diff --git a/src/scraper/types.ts b/src/scraper/types.ts index 0772114d..e38b3e51 100644 --- a/src/scraper/types.ts +++ b/src/scraper/types.ts @@ -119,6 +119,10 @@ export interface ScraperOptions { * @default true */ clean?: boolean; + /** + * Internal-only allowlist roots for application-managed temporary files. 
+ */ + internalAllowedFileRoots?: string[]; } /** diff --git a/src/utils/accessPolicy.test.ts b/src/utils/accessPolicy.test.ts new file mode 100644 index 00000000..7efc8442 --- /dev/null +++ b/src/utils/accessPolicy.test.ts @@ -0,0 +1,196 @@ +import { vol } from "memfs"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { AppConfig } from "./config"; +import { loadConfig } from "./config"; +import { AccessPolicyError } from "./errors"; + +vi.mock("node:fs/promises", () => ({ default: vol.promises })); + +const lookupMock = vi.fn(); +vi.mock("node:dns/promises", () => ({ + lookup: (...args: Parameters) => lookupMock(...args), +})); + +import * as os from "node:os"; +import { expandConfiguredRoot, ScraperAccessPolicy } from "./accessPolicy"; + +type SecurityOverrides = { + network?: Partial; + fileAccess?: Partial; +}; + +function createSecurityConfig( + overrides: SecurityOverrides = {}, +): AppConfig["scraper"]["security"] { + const defaults = loadConfig().scraper.security; + return { + ...defaults, + ...overrides, + network: { + ...defaults.network, + ...overrides.network, + }, + fileAccess: { + ...defaults.fileAccess, + ...overrides.fileAccess, + }, + }; +} + +describe("ScraperAccessPolicy", () => { + beforeEach(() => { + vol.reset(); + lookupMock.mockReset(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it("blocks loopback targets by default", async () => { + const policy = new ScraperAccessPolicy(createSecurityConfig()); + + await expect( + policy.assertNetworkUrlAllowed("http://127.0.0.1/test"), + ).rejects.toBeInstanceOf(AccessPolicyError); + }); + + it("allows explicitly allowlisted private hostname targets", async () => { + lookupMock.mockResolvedValue([{ address: "10.42.0.10", family: 4 }]); + const policy = new ScraperAccessPolicy( + createSecurityConfig({ + network: { + allowedHosts: ["docs.internal.example"], + }, + }), + ); + + await expect( + 
policy.assertNetworkUrlAllowed("https://docs.internal.example/guide"), + ).resolves.toBeUndefined(); + }); + + it("keeps host allowlists hostname-bound", async () => { + const policy = new ScraperAccessPolicy( + createSecurityConfig({ + network: { + allowedHosts: ["docs.internal.example"], + }, + }), + ); + + await expect( + policy.assertNetworkUrlAllowed("https://10.42.0.10/guide"), + ).rejects.toBeInstanceOf(AccessPolicyError); + }); + + it("allows explicitly allowlisted CIDR targets", async () => { + lookupMock.mockResolvedValue([{ address: "10.42.0.10", family: 4 }]); + const policy = new ScraperAccessPolicy( + createSecurityConfig({ + network: { + allowedCidrs: ["10.42.0.0/16"], + }, + }), + ); + + await expect( + policy.assertNetworkUrlAllowed("https://wiki.internal.example/guide"), + ).resolves.toBeUndefined(); + }); + + it("enables invalid TLS only for already-allowed https targets", () => { + const policy = new ScraperAccessPolicy( + createSecurityConfig({ + network: { + allowInvalidTls: true, + }, + }), + ); + + expect(policy.shouldAllowInvalidTls("https://docs.internal.example")).toBe(true); + expect(policy.shouldAllowInvalidTls("http://docs.internal.example")).toBe(false); + }); + + it("blocks hidden local paths by default", async () => { + vol.fromJSON({ + "/docs/.hidden/file.md": "secret", + }); + + const policy = new ScraperAccessPolicy( + createSecurityConfig({ + fileAccess: { + mode: "unrestricted", + }, + }), + ); + + await expect( + policy.resolveFileAccess("file:///docs/.hidden/file.md"), + ).rejects.toBeInstanceOf(AccessPolicyError); + }); + + it("blocks symlink access by default", async () => { + vol.fromJSON({ + "/docs/real.md": "content", + }); + await vol.promises.symlink("/docs/real.md", "/docs/link.md"); + + const policy = new ScraperAccessPolicy( + createSecurityConfig({ + fileAccess: { + mode: "unrestricted", + }, + }), + ); + + await expect(policy.resolveFileAccess("file:///docs/link.md")).rejects.toBeInstanceOf( + AccessPolicyError, + 
); + }); + + it("validates archive members against the backing archive path", async () => { + vol.fromJSON({ + "/allowed/docs.zip": "archive-bytes", + }); + + const policy = new ScraperAccessPolicy( + createSecurityConfig({ + fileAccess: { + mode: "allowedRoots", + allowedRoots: ["/allowed"], + }, + }), + ); + + const resolved = await policy.resolveFileAccess( + "file:///allowed/docs.zip/guide/intro.md", + ); + + expect(resolved.accessPath).toBe("/allowed/docs.zip"); + expect(resolved.virtualArchivePath).toBe("guide/intro.md"); + }); +}); + +describe("expandConfiguredRoot", () => { + afterEach(() => { + vi.restoreAllMocks(); + vol.reset(); + }); + + it("returns null when $DOCUMENTS cannot be resolved", async () => { + vi.spyOn(os, "homedir").mockReturnValue("/missing-home"); + const result = await expandConfiguredRoot("$DOCUMENTS"); + expect(result).toBeNull(); + }); + + it("resolves $DOCUMENTS when the directory exists", async () => { + vi.spyOn(os, "homedir").mockReturnValue("/home/tester"); + vol.fromJSON({ + "/home/tester/Documents/readme.md": "hello", + }); + + const result = await expandConfiguredRoot("$DOCUMENTS"); + expect(result).toBe("/home/tester/Documents"); + }); +}); diff --git a/src/utils/accessPolicy.ts b/src/utils/accessPolicy.ts new file mode 100644 index 00000000..6514ccfe --- /dev/null +++ b/src/utils/accessPolicy.ts @@ -0,0 +1,342 @@ +import { lookup } from "node:dns/promises"; +import fs from "node:fs/promises"; +import net from "node:net"; +import os from "node:os"; +import path from "node:path"; +import type { AppConfig } from "./config"; +import { AccessPolicyError } from "./errors"; + +const SPECIAL_USE_IPV4_CIDRS = [ + "0.0.0.0/8", + "10.0.0.0/8", + "100.64.0.0/10", + "127.0.0.0/8", + "169.254.0.0/16", + "172.16.0.0/12", + "192.0.0.0/24", + "192.0.2.0/24", + "192.168.0.0/16", + "198.18.0.0/15", + "198.51.100.0/24", + "203.0.113.0/24", + "224.0.0.0/4", + "240.0.0.0/4", +]; + +const SPECIAL_USE_IPV6_CIDRS = ["::/128", "::1/128", "fc00::/7", 
"fe80::/10", "ff00::/8"]; + +const ARCHIVE_EXTENSIONS = new Set([".zip", ".tar", ".gz", ".tgz"]); + +type ScraperSecurityConfig = AppConfig["scraper"]["security"]; + +export interface ResolvedFileAccess { + filePath: string; + accessPath: string; + virtualArchivePath: string | null; +} + +/** + * Shared access policy for outbound network and local file access. + */ +export class ScraperAccessPolicy { + private readonly allowedCidrs = new net.BlockList(); + private readonly specialUseCidrs = new net.BlockList(); + private readonly allowedHosts: Set; + + constructor(private readonly security: ScraperSecurityConfig) { + this.allowedHosts = new Set( + security.network.allowedHosts.map((host: string) => host.toLowerCase()), + ); + + for (const cidr of security.network.allowedCidrs) { + this.addCidr(this.allowedCidrs, cidr); + } + + for (const cidr of SPECIAL_USE_IPV4_CIDRS) { + this.addCidr(this.specialUseCidrs, cidr); + } + for (const cidr of SPECIAL_USE_IPV6_CIDRS) { + this.addCidr(this.specialUseCidrs, cidr); + } + } + + /** + * Checks whether a network URL is allowed before connecting. + */ + async assertNetworkUrlAllowed(url: string): Promise { + const parsed = new URL(url); + if (parsed.protocol !== "http:" && parsed.protocol !== "https:") { + return; + } + + if (this.security.network.allowPrivateNetworks) { + return; + } + + const hostname = parsed.hostname.toLowerCase(); + if (this.allowedHosts.has(hostname)) { + return; + } + + if (this.isIpAddress(hostname)) { + this.assertAddressAllowed(hostname, url); + return; + } + + const addresses = await lookup(hostname, { all: true, verbatim: true }).catch( + () => [], + ); + for (const address of addresses) { + this.assertAddressAllowed(address.address, url); + } + } + + /** + * Checks whether TLS certificate validation may be bypassed for a URL. 
+ */ + shouldAllowInvalidTls(url: string): boolean { + try { + const parsed = new URL(url); + return parsed.protocol === "https:" && this.security.network.allowInvalidTls; + } catch { + return false; + } + } + + /** + * Resolves and validates a file URL against the configured file access policy. + */ + async resolveFileAccess( + url: string, + additionalAllowedRoots: string[] = [], + ): Promise { + const filePath = fileUrlToPathLoose(url); + const resolved = await resolveArchiveAwarePath(filePath); + const accessPath = resolved.archivePath ?? filePath; + const bypassRoots = await this.expandRoots(additionalAllowedRoots); + const matchedBypassRoot = await this.findContainingRoot( + accessPath, + bypassRoots, + false, + ); + + if (this.security.fileAccess.mode === "disabled" && !matchedBypassRoot) { + throw new AccessPolicyError(`Security policy blocked local file access for ${url}`); + } + + if (this.security.fileAccess.mode === "allowedRoots" && !matchedBypassRoot) { + const configuredRoots = await this.expandRoots( + this.security.fileAccess.allowedRoots, + ); + if (configuredRoots.length === 0) { + throw new AccessPolicyError( + `Security policy blocked local file access for ${url}`, + ); + } + + const matchedRoot = await this.findContainingRoot( + accessPath, + configuredRoots, + this.security.fileAccess.followSymlinks, + ); + if (!matchedRoot) { + throw new AccessPolicyError( + `Security policy blocked local file access for ${url}`, + ); + } + } + + if (!this.security.fileAccess.followSymlinks) { + const stats = await fs.lstat(accessPath).catch(() => null); + if (stats?.isSymbolicLink()) { + throw new AccessPolicyError(`Security policy blocked symlink access for ${url}`); + } + } + + if (!this.security.fileAccess.includeHidden) { + if (hasHiddenSegment(filePath)) { + throw new AccessPolicyError( + `Security policy blocked hidden file access for ${url}`, + ); + } + if ( + resolved.virtualArchivePath && + hasHiddenArchiveSegment(resolved.virtualArchivePath) + ) { 
+ throw new AccessPolicyError( + `Security policy blocked hidden archive entry access for ${url}`, + ); + } + } + + return { + filePath, + accessPath, + virtualArchivePath: resolved.virtualArchivePath, + }; + } + + private isIpAddress(hostname: string): boolean { + return net.isIP(hostname) !== 0; + } + + private assertAddressAllowed(address: string, url: string): void { + const family = net.isIP(address); + if (family === 0) { + return; + } + + const type = family === 6 ? "ipv6" : "ipv4"; + if (!this.specialUseCidrs.check(address, type)) { + return; + } + + if (this.allowedCidrs.check(address, type)) { + return; + } + + throw new AccessPolicyError(`Security policy blocked network access to ${url}`); + } + + private addCidr(blockList: net.BlockList, cidr: string): void { + const [network, prefix] = cidr.split("/"); + const family = net.isIP(network); + if (family === 0 || prefix === undefined) { + return; + } + blockList.addSubnet( + network, + Number.parseInt(prefix, 10), + family === 6 ? "ipv6" : "ipv4", + ); + } + + private async expandRoots(roots: string[]): Promise { + const expanded: string[] = []; + for (const root of roots) { + const value = await expandConfiguredRoot(root); + if (value) { + expanded.push(value); + } + } + return expanded; + } + + private async findContainingRoot( + accessPath: string, + roots: string[], + followSymlinks: boolean, + ): Promise { + const resolvedAccessPath = await resolveContainmentPath(accessPath, followSymlinks); + for (const root of roots) { + const resolvedRoot = await resolveContainmentPath(root, followSymlinks); + if (pathContains(resolvedRoot, resolvedAccessPath)) { + return resolvedRoot; + } + } + return null; + } +} + +/** + * Converts a file URL into a local filesystem path while preserving legacy leniency. 
+ */
+export function fileUrlToPathLoose(url: string): string {
+  let filePath = url.replace(/^file:\/\/\/?/, "");
+  filePath = decodeURIComponent(filePath);
+
+  if (!filePath.startsWith("/") && process.platform !== "win32") {
+    filePath = `/${filePath}`;
+  }
+
+  return filePath;
+}
+
+/**
+ * Expands user-facing configured root tokens into concrete filesystem paths.
+ */
+export async function expandConfiguredRoot(root: string): Promise<string | null> {
+  if (root !== "$DOCUMENTS") {
+    return path.resolve(root);
+  }
+
+  const homeDir = os.homedir();
+  if (!homeDir) {
+    return null;
+  }
+
+  const documentsPath = path.join(homeDir, "Documents");
+  const stats = await fs.stat(documentsPath).catch(() => null);
+  if (!stats?.isDirectory()) {
+    return null;
+  }
+
+  return path.resolve(documentsPath);
+}
+
+async function resolveArchiveAwarePath(filePath: string): Promise<{
+  archivePath: string | null;
+  virtualArchivePath: string | null;
+}> {
+  let currentPath = filePath;
+  while (
+    currentPath !== "/" &&
+    currentPath !== "."
&&
+    path.dirname(currentPath) !== currentPath
+  ) {
+    const stats = await fs.lstat(currentPath).catch(() => null);
+    if (stats?.isFile() && looksLikeArchivePath(currentPath)) {
+      const virtualArchivePath = filePath
+        .substring(currentPath.length)
+        .replace(/^\/+/, "")
+        .replace(/^\\+/, "");
+      return {
+        archivePath: currentPath,
+        virtualArchivePath: virtualArchivePath || null,
+      };
+    }
+    if (stats) {
+      return { archivePath: null, virtualArchivePath: null };
+    }
+    currentPath = path.dirname(currentPath);
+  }
+
+  return { archivePath: null, virtualArchivePath: null };
+}
+
+async function resolveContainmentPath(
+  targetPath: string,
+  followSymlinks: boolean,
+): Promise<string> {
+  if (!followSymlinks) {
+    return path.resolve(targetPath);
+  }
+
+  return fs.realpath(targetPath).catch(() => path.resolve(targetPath));
+}
+
+function looksLikeArchivePath(filePath: string): boolean {
+  return ARCHIVE_EXTENSIONS.has(path.extname(filePath).toLowerCase());
+}
+
+function pathContains(root: string, candidate: string): boolean {
+  if (root === candidate) {
+    return true;
+  }
+
+  const relative = path.relative(root, candidate);
+  return relative !== "" && !relative.startsWith("..") && !path.isAbsolute(relative);
+}
+
+function hasHiddenSegment(targetPath: string): boolean {
+  const parsed = path.resolve(targetPath);
+  const segments = parsed.split(path.sep).filter(Boolean);
+  return segments.some((segment) => segment.startsWith("."));
+}
+
+function hasHiddenArchiveSegment(archivePath: string): boolean {
+  return archivePath
+    .split(/[\\/]+/)
+    .filter(Boolean)
+    .some((segment) => segment.startsWith("."));
+}
diff --git a/src/utils/config.test.ts b/src/utils/config.test.ts
index c597ae15..e8df756f 100644
--- a/src/utils/config.test.ts
+++ b/src/utils/config.test.ts
@@ -2,6 +2,7 @@ import fs from "node:fs";
 import os from "node:os";
 import path from "node:path";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { expandConfiguredRoot } from
"./accessPolicy"; import { camelToUpperSnake, collectLeafPaths, @@ -236,6 +237,12 @@ describe("Environment Variable Helpers", () => { "DOCS_MCP_SPLITTER_JSON_MAX_NESTING_DEPTH", ); }); + + it("converts nested security path", () => { + expect(pathToEnvVar(["scraper", "security", "fileAccess", "followSymlinks"])).toBe( + "DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_FOLLOW_SYMLINKS", + ); + }); }); describe("collectLeafPaths", () => { @@ -480,4 +487,61 @@ describe("Auto-generated Environment Variable Overrides", () => { expect(config.scraper.document.maxSize).toBe(52428800); }); + + it("loads scraper security defaults", () => { + const config = loadConfig( + {}, + { configPath: path.join(tmpDir, "security-defaults.yaml") }, + ); + + expect(config.scraper.security.network.allowPrivateNetworks).toBe(false); + expect(config.scraper.security.network.allowedHosts).toEqual([]); + expect(config.scraper.security.network.allowedCidrs).toEqual([]); + expect(config.scraper.security.network.allowInvalidTls).toBe(false); + expect(config.scraper.security.fileAccess.mode).toBe("allowedRoots"); + expect(config.scraper.security.fileAccess.allowedRoots).toEqual(["$DOCUMENTS"]); + expect(config.scraper.security.fileAccess.followSymlinks).toBe(false); + expect(config.scraper.security.fileAccess.includeHidden).toBe(false); + }); + + it("applies nested security env overrides", () => { + process.env.DOCS_MCP_SCRAPER_SECURITY_NETWORK_ALLOW_INVALID_TLS = "true"; + process.env.DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_FOLLOW_SYMLINKS = "true"; + process.env.DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_INCLUDE_HIDDEN = "true"; + + const config = loadConfig({}, { configPath: path.join(tmpDir, "security-env.yaml") }); + + expect(config.scraper.security.network.allowInvalidTls).toBe(true); + expect(config.scraper.security.fileAccess.followSymlinks).toBe(true); + expect(config.scraper.security.fileAccess.includeHidden).toBe(true); + }); + + it("parses security arrays from env", () => { + 
process.env.DOCS_MCP_SCRAPER_SECURITY_NETWORK_ALLOWED_HOSTS = + '["docs.internal.example","wiki.corp.local"]'; + process.env.DOCS_MCP_SCRAPER_SECURITY_FILE_ACCESS_ALLOWED_ROOTS = + '["$DOCUMENTS", "/srv/docs"]'; + + const config = loadConfig( + {}, + { configPath: path.join(tmpDir, "security-arrays.yaml") }, + ); + + expect(config.scraper.security.network.allowedHosts).toEqual([ + "docs.internal.example", + "wiki.corp.local", + ]); + expect(config.scraper.security.fileAccess.allowedRoots).toEqual([ + "$DOCUMENTS", + "/srv/docs", + ]); + }); + + it("treats unresolved $DOCUMENTS as no access", async () => { + const homedirSpy = vi.spyOn(os, "homedir").mockReturnValue("/missing-home"); + + await expect(expandConfiguredRoot("$DOCUMENTS")).resolves.toBeNull(); + + homedirSpy.mockRestore(); + }); }); diff --git a/src/utils/config.ts b/src/utils/config.ts index c41cf244..a5e468e6 100644 --- a/src/utils/config.ts +++ b/src/utils/config.ts @@ -6,6 +6,22 @@ import { z } from "zod"; import { normalizeEnvValue } from "./env"; import { logger } from "./logger"; +const envStringArray = z + .union([z.array(z.string()), z.string()]) + .transform((value) => { + if (Array.isArray(value)) { + return value; + } + + const parsed = yaml.parse(value); + if (Array.isArray(parsed)) { + return parsed.map((item) => String(item)); + } + + return [value]; + }) + .pipe(z.array(z.string())); + /** * Custom zod schema for boolean values that properly handles string representations. 
* Unlike z.coerce.boolean() which treats any non-empty string as true, @@ -66,6 +82,20 @@ export const DEFAULT_CONFIG = { document: { maxSize: 10 * 1024 * 1024, // 10MB max size for PDF/Office documents }, + security: { + network: { + allowPrivateNetworks: false, + allowedHosts: [] as string[], + allowedCidrs: [] as string[], + allowInvalidTls: false, + }, + fileAccess: { + mode: "allowedRoots", + allowedRoots: ["$DOCUMENTS"] as string[], + followSymlinks: false, + includeHidden: false, + }, + }, }, splitter: { minChunkSize: 500, @@ -183,6 +213,42 @@ export const AppConfigSchema = z.object({ .default(DEFAULT_CONFIG.scraper.document.maxSize), }) .default(DEFAULT_CONFIG.scraper.document), + security: z + .object({ + network: z + .object({ + allowPrivateNetworks: envBoolean.default( + DEFAULT_CONFIG.scraper.security.network.allowPrivateNetworks, + ), + allowedHosts: envStringArray.default( + DEFAULT_CONFIG.scraper.security.network.allowedHosts, + ), + allowedCidrs: envStringArray.default( + DEFAULT_CONFIG.scraper.security.network.allowedCidrs, + ), + allowInvalidTls: envBoolean.default( + DEFAULT_CONFIG.scraper.security.network.allowInvalidTls, + ), + }) + .default(DEFAULT_CONFIG.scraper.security.network), + fileAccess: z + .object({ + mode: z + .enum(["disabled", "allowedRoots", "unrestricted"]) + .default(DEFAULT_CONFIG.scraper.security.fileAccess.mode), + allowedRoots: envStringArray.default( + DEFAULT_CONFIG.scraper.security.fileAccess.allowedRoots, + ), + followSymlinks: envBoolean.default( + DEFAULT_CONFIG.scraper.security.fileAccess.followSymlinks, + ), + includeHidden: envBoolean.default( + DEFAULT_CONFIG.scraper.security.fileAccess.includeHidden, + ), + }) + .default(DEFAULT_CONFIG.scraper.security.fileAccess), + }) + .default(DEFAULT_CONFIG.scraper.security), }) .default(DEFAULT_CONFIG.scraper), splitter: z diff --git a/src/utils/errors.ts b/src/utils/errors.ts index c35cb1f9..bdd3dd8c 100644 --- a/src/utils/errors.ts +++ b/src/utils/errors.ts @@ -44,4 
+44,16 @@ class ChallengeError extends ScraperError { } } -export { ChallengeError, InvalidUrlError, RedirectError, ScraperError }; +class AccessPolicyError extends ScraperError { + constructor(message: string) { + super(message, false); + } +} + +export { + AccessPolicyError, + ChallengeError, + InvalidUrlError, + RedirectError, + ScraperError, +};