Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 77 additions & 5 deletions src/scraper/fetcher/HttpFetcher.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ describe("HttpFetcher", () => {
describe("configuration defaults", () => {
it("should use default max retries when not specified", async () => {
const fetcher = createFetcher();
// Mock failure for all attempts - use a retryable error
mockedAxios.get.mockRejectedValue({ response: { status: 500 } });
// Mock failure for all attempts - use a retryable error (503)
mockedAxios.get.mockRejectedValue({ response: { status: 503 } });

await expect(
fetcher.fetch("https://example.com", {
Expand All @@ -149,7 +149,7 @@ describe("HttpFetcher", () => {

it("should respect custom maxRetries option", async () => {
const fetcher = createFetcher();
mockedAxios.get.mockRejectedValue({ response: { status: 500 } });
mockedAxios.get.mockRejectedValue({ response: { status: 503 } });

await expect(
fetcher.fetch("https://example.com", {
Expand Down Expand Up @@ -270,9 +270,9 @@ describe("HttpFetcher", () => {
});

describe("retry logic", () => {
it("should retry on retryable status codes [408, 429, 500, 502, 503, 504, 525]", async () => {
it("should retry on retryable status codes [408, 429, 502, 503, 504, 525]", async () => {
const fetcher = createFetcher();
const retryableStatuses = [408, 429, 500, 502, 503, 504, 525];
const retryableStatuses = [408, 429, 502, 503, 504, 525];

for (const status of retryableStatuses) {
mockedAxios.get.mockReset();
Expand All @@ -292,6 +292,78 @@ describe("HttpFetcher", () => {
}
});

// A persistent 500 must stop after three requests even when maxRetries would permit more.
it("should retry 500 at most 3 times (then fail)", async () => {
  const httpFetcher = createFetcher();
  const serverError = { response: { status: 500 } };
  mockedAxios.get.mockRejectedValue(serverError);

  const pendingFetch = httpFetcher.fetch("https://example.com", {
    maxRetries: 10, // generous budget that other retryable codes could use in full
    retryDelay: 1,
  });
  await expect(pendingFetch).rejects.toThrow(ScraperError);

  // One initial request plus two retries, after which the fetcher gives up.
  expect(mockedAxios.get).toHaveBeenCalledTimes(3);
});

// Covers both recoverable cases named in the title: success on the 2nd attempt
// (one 500 first) and on the 3rd attempt (two 500s first). The 3rd attempt is the
// last one allowed by the 500 cap, so it guards against an off-by-one in that cap.
it("should succeed on 2nd or 3rd attempt for 500", async () => {
  const fetcher = createFetcher();

  for (const failuresBeforeSuccess of [1, 2]) {
    mockedAxios.get.mockReset();
    for (let i = 0; i < failuresBeforeSuccess; i++) {
      mockedAxios.get.mockRejectedValueOnce({ response: { status: 500 } });
    }
    mockedAxios.get.mockResolvedValueOnce({
      data: Buffer.from("success", "utf-8"),
      headers: { "content-type": "text/plain" },
    });

    const result = await fetcher.fetch("https://example.com", {
      maxRetries: 10,
      retryDelay: 1,
    });

    expect(result.content).toEqual(Buffer.from("success", "utf-8"));
    // Every queued failure consumes one call, then the success consumes one more.
    expect(mockedAxios.get).toHaveBeenCalledTimes(failuresBeforeSuccess + 1);
  }
});

// Network error codes that are not on the permanent blocklist get retried:
// one failure followed by a success must produce exactly two requests.
it("should retry on network errors not in blocklist (ETIMEDOUT, ECONNRESET, EAI_AGAIN)", async () => {
  const httpFetcher = createFetcher();
  const expectedBody = Buffer.from("success", "utf-8");

  for (const errorCode of ["ETIMEDOUT", "ECONNRESET", "EAI_AGAIN"]) {
    // Start every code from a clean mock so call counts do not accumulate.
    mockedAxios.get.mockReset();
    mockedAxios.get
      .mockRejectedValueOnce({ code: errorCode })
      .mockResolvedValueOnce({
        data: Buffer.from("success", "utf-8"),
        headers: { "content-type": "text/plain" },
      });

    const response = await httpFetcher.fetch("https://example.com", {
      maxRetries: 1,
      retryDelay: 1,
    });

    expect(response.content).toEqual(expectedBody);
    // The initial request plus a single retry.
    expect(mockedAxios.get).toHaveBeenCalledTimes(2);
  }
});

// Blocklisted network error codes are treated as permanent: the fetch must fail
// after the very first request, even though retries are allowed by the options.
it("should not retry on blocklisted network errors (ECONNREFUSED, ENOTFOUND)", async () => {
  const httpFetcher = createFetcher();

  for (const errorCode of ["ECONNREFUSED", "ENOTFOUND"]) {
    // Start every code from a clean mock so call counts do not accumulate.
    mockedAxios.get.mockReset();
    mockedAxios.get.mockRejectedValue({ code: errorCode });

    const pendingFetch = httpFetcher.fetch("https://example.com", {
      maxRetries: 2,
      retryDelay: 1,
    });
    await expect(pendingFetch).rejects.toThrow(ScraperError);

    // Exactly one request means no retry was attempted.
    expect(mockedAxios.get).toHaveBeenCalledTimes(1);
  }
});

it("should not retry on non-retryable status codes [400, 401, 403, 404, 405, 410]", async () => {
const fetcher = createFetcher();
const nonRetryableStatuses = [400, 401, 403, 405, 410];
Expand Down
48 changes: 34 additions & 14 deletions src/scraper/fetcher/HttpFetcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,30 @@ import {
export class HttpFetcher implements ContentFetcher {
private readonly maxRetriesDefault: number;
private readonly baseDelayDefaultMs: number;
/**
 * HTTP status codes that are retried.
 * 500 is additionally capped at `maxAttemptsFor500` total attempts; the other codes
 * use the full `maxRetries` budget.
 */
private readonly retryableStatusCodes = [
  408, // Request Timeout
  429, // Too Many Requests (rate limiting)
  500, // Internal Server Error (capped at 3 attempts to fail faster when permanent)
  502, // Bad Gateway
  503, // Service Unavailable
  504, // Gateway Timeout
  525, // SSL Handshake Failed (Cloudflare; transient during cert rotation)
];

/**
 * Total number of attempts allowed for an HTTP 500 response (1 initial + 2 retries),
 * so a permanent server error fails faster. `fetch` compares this against
 * `attempt + 1` when deciding whether another retry is allowed.
 */
private readonly maxAttemptsFor500 = 3;

/** Network error codes treated as permanent; every other network error code is retried. */
private readonly nonRetryableErrorCodes = [
  "ENOTFOUND", // DNS resolution failed - domain doesn't exist
  "ECONNREFUSED", // Connection refused - service not running
  "ENOENT", // No such file or directory
  "EACCES", // Permission denied
  "EINVAL", // Invalid argument
  "EMFILE", // Too many open files
  "ENFILE", // File table overflow
  "EPERM", // Operation not permitted
];

private fingerprintGenerator: FingerprintGenerator;
Expand All @@ -55,6 +60,16 @@ export class HttpFetcher implements ContentFetcher {
return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Decides whether a failed request is worth retrying.
 *
 * @param status - HTTP status of the failed response, or `undefined` when no status is available.
 * @param code - Network error code (e.g. "ECONNRESET"), or `undefined` when none is available.
 * @returns `true` when `status` is one of `retryableStatusCodes`; when `status` is
 *   `undefined`, `true` unless `code` is one of `nonRetryableErrorCodes`.
 */
private shouldRetry(status: number | undefined, code: string | undefined): boolean {
  // A known HTTP status decides on its own; the network error code is not consulted.
  if (status !== undefined) {
    return this.retryableStatusCodes.includes(status);
  }
  // No status: retry unless the code is on the permanent blocklist.
  // A missing code maps to "", which is never blocklisted, so it is retried.
  return !this.nonRetryableErrorCodes.includes(code ?? "");
}

async fetch(source: string, options?: FetchOptions): Promise<RawContent> {
const maxRetries = options?.maxRetries ?? this.maxRetriesDefault;
const baseDelay = options?.retryDelay ?? this.baseDelayDefaultMs;
Expand Down Expand Up @@ -240,10 +255,11 @@ export class HttpFetcher implements ContentFetcher {
}
}

const cappedFor500 = status === 500 && attempt + 1 >= this.maxAttemptsFor500;
if (
attempt < maxRetries &&
(status === undefined || this.retryableStatusCodes.includes(status)) &&
!this.nonRetryableErrorCodes.includes(code ?? "")
this.shouldRetry(status, code ?? undefined) &&
!cappedFor500
) {
const delay = baseDelay * 2 ** attempt;
logger.warn(
Expand All @@ -255,7 +271,11 @@ export class HttpFetcher implements ContentFetcher {
continue;
}

// Not a 5xx error or max retries reached
if (attempt < maxRetries && (status !== undefined || code)) {
logger.warn(`Permanent error, not retrying: ${status ?? code} (${source})`);
}

// Permanent error or max retries reached
throw new ScraperError(
`Failed to fetch ${source} after ${
attempt + 1
Expand Down