Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions src/scraper/fetcher/AutoDetectFetcher.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { loadConfig } from "../../utils/config";
import { ChallengeError, TlsCertificateError } from "../../utils/errors";
import { AutoDetectFetcher } from "./AutoDetectFetcher";
import { BrowserFetcher } from "./BrowserFetcher";
import { HttpFetcher } from "./HttpFetcher";
import { FetchStatus } from "./types";

/**
 * Behavioural tests for AutoDetectFetcher's HTTP -> browser fallback.
 *
 * HttpFetcher and BrowserFetcher are stubbed at the prototype level, so no real
 * network or browser work happens; only the routing decision is exercised.
 */
describe("AutoDetectFetcher", () => {
  const scraperConfig = loadConfig().scraper;
  const source = "https://example.com/docs";

  // Canned payload returned by the stubbed BrowserFetcher.
  const browserResult = {
    content: Buffer.from("browser", "utf-8"),
    mimeType: "text/html",
    source,
    status: FetchStatus.SUCCESS,
  };

  /** Makes every HttpFetcher#fetch call reject with `failure`. */
  const rejectHttpFetchWith = (failure: unknown) =>
    vi.spyOn(HttpFetcher.prototype, "fetch").mockRejectedValue(failure);

  /**
   * Fails the HTTP attempt with `failure`, then asserts that the fetch result is
   * the browser payload and that BrowserFetcher received the same arguments.
   */
  const expectBrowserFallback = async (failure: unknown): Promise<void> => {
    rejectHttpFetchWith(failure);
    const browserFetch = vi
      .spyOn(BrowserFetcher.prototype, "fetch")
      .mockResolvedValue(browserResult);

    const outcome = await new AutoDetectFetcher(scraperConfig).fetch(source);

    expect(outcome).toBe(browserResult);
    expect(browserFetch).toHaveBeenCalledWith(source, undefined);
  };

  // Prototype spies are shared state; reset them so tests stay independent.
  beforeEach(() => {
    vi.restoreAllMocks();
  });

  it("should fall back to browser fetcher on TLS certificate errors", async () => {
    await expectBrowserFallback(
      new TlsCertificateError(source, "UNABLE_TO_VERIFY_LEAF_SIGNATURE"),
    );
  });

  it("should fall back to browser fetcher on challenge errors", async () => {
    await expectBrowserFallback(new ChallengeError(source, 403, "cloudflare"));
  });

  it("should rethrow non-fallback errors", async () => {
    const boom = new Error("boom");
    rejectHttpFetchWith(boom);
    // Spy without a stub: the test only needs to observe that it is never invoked.
    const browserFetch = vi.spyOn(BrowserFetcher.prototype, "fetch");

    const pending = new AutoDetectFetcher(scraperConfig).fetch(source);

    await expect(pending).rejects.toThrow(boom);
    expect(browserFetch).not.toHaveBeenCalled();
  });
});
8 changes: 7 additions & 1 deletion src/scraper/fetcher/AutoDetectFetcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
*/

import type { AppConfig } from "../../utils/config";
import { ChallengeError } from "../../utils/errors";
import { ChallengeError, TlsCertificateError } from "../../utils/errors";
import { logger } from "../../utils/logger";
import { BrowserFetcher } from "./BrowserFetcher";
import { FileFetcher } from "./FileFetcher";
Expand Down Expand Up @@ -64,6 +64,12 @@ export class AutoDetectFetcher implements ContentFetcher {
);
return this.browserFetcher.fetch(source, options);
}
if (error instanceof TlsCertificateError) {
logger.info(
`🔄 TLS certificate validation failed for ${source}, falling back to browser fetcher...`,
);
return this.browserFetcher.fetch(source, options);
}
throw error;
}
}
Expand Down
22 changes: 21 additions & 1 deletion src/scraper/fetcher/HttpFetcher.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { CancellationError } from "../../pipeline/errors";
import { loadConfig } from "../../utils/config";
import { RedirectError, ScraperError } from "../../utils/errors";
import { RedirectError, ScraperError, TlsCertificateError } from "../../utils/errors";

vi.mock("axios");

Expand Down Expand Up @@ -322,6 +322,26 @@ describe("HttpFetcher", () => {
expect(result.status).toBe("not_found");
expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries for 404
});

// A certificate validation failure must surface as TlsCertificateError after a
// single request, even though the caller allows two retries.
it("should not retry on TLS certificate validation errors", async () => {
  const httpFetcher = createFetcher();

  // Simulate the low-level failure: a plain Error carrying a TLS `code`.
  const certFailure = new Error("unable to verify the first certificate");
  Object.assign(certFailure, { code: "UNABLE_TO_VERIFY_LEAF_SIGNATURE" });
  mockedAxios.get.mockRejectedValue(certFailure);

  const retryOptions = { maxRetries: 2, retryDelay: 1 };
  const pending = httpFetcher.fetch("https://example.com", retryOptions);

  await expect(pending).rejects.toBeInstanceOf(TlsCertificateError);
  // Exactly one call proves the retry loop was bypassed.
  expect(mockedAxios.get).toHaveBeenCalledTimes(1);
});
});

it("should generate fingerprint headers", async () => {
Expand Down
28 changes: 26 additions & 2 deletions src/scraper/fetcher/HttpFetcher.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import axios, { type AxiosError, type AxiosRequestConfig } from "axios";
import { CancellationError } from "../../pipeline/errors";
import type { AppConfig } from "../../utils/config";
import { ChallengeError, RedirectError, ScraperError } from "../../utils/errors";
import {
ChallengeError,
RedirectError,
ScraperError,
TlsCertificateError,
} from "../../utils/errors";
import { logger } from "../../utils/logger";
import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
import { FingerprintGenerator } from "./FingerprintGenerator";
Expand Down Expand Up @@ -39,6 +44,16 @@ export class HttpFetcher implements ContentFetcher {
"EPERM", // Operation not permitted
];

// Error codes treated as TLS certificate validation failures. The request
// error's `code` is matched against this list by isTlsCertificateError().
// Descriptions follow the Node.js TLS/errors documentation.
private readonly tlsCertificateErrorCodes = [
"CERT_HAS_EXPIRED", // Certificate has expired
"DEPTH_ZERO_SELF_SIGNED_CERT", // Self-signed certificate
"ERR_TLS_CERT_ALTNAME_INVALID", // Hostname/IP does not match the certificate's alt names
"SELF_SIGNED_CERT_IN_CHAIN", // Self-signed certificate in the certificate chain
"UNABLE_TO_GET_ISSUER_CERT", // Unable to get issuer certificate
"UNABLE_TO_GET_ISSUER_CERT_LOCALLY", // Unable to get local issuer certificate
"UNABLE_TO_VERIFY_LEAF_SIGNATURE", // Unable to verify the first certificate
];

private fingerprintGenerator: FingerprintGenerator;

constructor(scraperConfig: AppConfig["scraper"]) {
Expand All @@ -55,6 +70,10 @@ export class HttpFetcher implements ContentFetcher {
return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Reports whether `code` is one of the entries in `tlsCertificateErrorCodes`.
 *
 * @param code - Error code taken from the failed request; may be absent.
 * @returns `true` only for a non-empty code present in the list.
 */
private isTlsCertificateError(code?: string): boolean {
  // A missing or empty code can never match.
  if (!code) {
    return false;
  }
  return this.tlsCertificateErrorCodes.includes(code);
}

async fetch(source: string, options?: FetchOptions): Promise<RawContent> {
const maxRetries = options?.maxRetries ?? this.maxRetriesDefault;
const baseDelay = options?.retryDelay ?? this.baseDelayDefaultMs;
Expand Down Expand Up @@ -180,6 +199,7 @@ export class HttpFetcher implements ContentFetcher {
const axiosError = error as AxiosError;
const status = axiosError.response?.status;
const code = axiosError.code;
const errorCause = error instanceof Error ? error : undefined;

// Handle abort/cancel: do not retry, throw CancellationError
if (options?.signal?.aborted || code === "ERR_CANCELED") {
Expand Down Expand Up @@ -240,6 +260,10 @@ export class HttpFetcher implements ContentFetcher {
}
}

if (this.isTlsCertificateError(code)) {
throw new TlsCertificateError(source, code, errorCause);
}

if (
attempt < maxRetries &&
(status === undefined || this.retryableStatusCodes.includes(status)) &&
Expand All @@ -261,7 +285,7 @@ export class HttpFetcher implements ContentFetcher {
attempt + 1
} attempts: ${axiosError.message ?? "Unknown error"}`,
true,
error instanceof Error ? error : undefined,
errorCause,
);
}
}
Expand Down
24 changes: 23 additions & 1 deletion src/utils/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,26 @@ class ChallengeError extends ScraperError {
}
}

export { ChallengeError, InvalidUrlError, RedirectError, ScraperError };
/**
 * Raised when a request fails TLS certificate validation.
 *
 * Exposes the offending `url` and, when known, the underlying error `code`.
 * The second `super` argument is `false`.
 * NOTE(review): presumably ScraperError's retryable flag — confirm against ScraperError.
 */
class TlsCertificateError extends ScraperError {
  /**
   * @param url - The URL whose certificate could not be validated.
   * @param code - Optional low-level error code; appended to the message in parentheses.
   * @param cause - Optional original error, forwarded to ScraperError.
   */
  constructor(
    public readonly url: string,
    public readonly code?: string,
    cause?: Error,
  ) {
    // The message is assembled inline so `super` remains the first statement,
    // which parameter properties require.
    super(
      `TLS certificate validation failed for ${url}` +
        (code ? ` (${code})` : "") +
        ". The remote site may have an incomplete or untrusted certificate chain.",
      false,
      cause,
    );
  }
}

export {
ChallengeError,
InvalidUrlError,
RedirectError,
ScraperError,
TlsCertificateError,
};
Loading