Skip to content
19 changes: 16 additions & 3 deletions packages/utils/src/internals/robots.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,20 @@ export class RobotsTxtFile {
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
* @param [options] additional options
* @param [options.signal] an AbortSignal to cancel the request
* @param [options.timeoutMillis] timeout in milliseconds for the request
*/
static async find(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
static async find(
url: string,
proxyUrl?: string,
options?: { signal?: AbortSignal; timeoutMillis?: number },
): Promise<RobotsTxtFile> {
const robotsTxtFileUrl = new URL(url);
robotsTxtFileUrl.pathname = '/robots.txt';
robotsTxtFileUrl.search = '';

return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl);
return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl, options);
}

/**
Expand All @@ -55,7 +62,11 @@ export class RobotsTxtFile {
return new RobotsTxtFile(robotsParser(url, content), proxyUrl);
}

protected static async load(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
protected static async load(
url: string,
proxyUrl?: string,
options?: { signal?: AbortSignal; timeoutMillis?: number },
): Promise<RobotsTxtFile> {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}
Expand All @@ -66,6 +77,8 @@ export class RobotsTxtFile {
proxyUrl,
method: 'GET',
responseType: 'text',
signal: options?.signal,
...(options?.timeoutMillis ? { timeout: { request: options.timeoutMillis } } : {}),
});

return new RobotsTxtFile(robotsParser(url.toString(), response.body), proxyUrl);
Expand Down
75 changes: 45 additions & 30 deletions packages/utils/src/internals/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -454,13 +454,42 @@ export async function* discoverValidSitemaps(
* Proxy URL to be used for network requests.
*/
proxyUrl?: string;
/**
* Timeout in milliseconds for the entire `discoverValidSitemaps` call.
* An `AbortController` is created internally and its signal is passed to every HTTP request,
* so the whole discovery operation is cancelled once the timeout elapses.
* Defaults to `60_000` ms (60 seconds) to prevent indefinite hangs.
*/
timeoutMillis?: number;
/**
* An external `AbortSignal` to cancel the entire discovery operation.
* If both `signal` and `timeout` are provided, the operation is cancelled
* when either the signal is aborted or the timeout elapses (whichever comes first).
*/
signal?: AbortSignal;
/**
* Timeout in milliseconds for each individual HTTP request during discovery.
* Defaults to `20000` ms (20 seconds).
*/
requestTimeoutMillis?: number;
} = {},
): AsyncIterable<string> {
const { proxyUrl } = options;
const { proxyUrl, timeoutMillis = 60_000, signal: externalSignal, requestTimeoutMillis = 20_000 } = options;
const controller = new AbortController();

const timeoutHandle = setTimeout(() => controller.abort(), timeoutMillis);
const onExternalAbort = () => controller.abort();
if (externalSignal) {
if (externalSignal.aborted) {
controller.abort();
} else {
externalSignal.addEventListener('abort', onExternalAbort, { once: true });
}
}

const signal = controller.signal;
const { gotScraping } = await import('got-scraping');
const sitemapUrls = new Set<string>();
// Keep each probe bounded so discovery cannot stall indefinitely on a single request.
const DISCOVERY_REQUEST_TIMEOUT_MILLIS = 20_000;

const addSitemapUrl = (url: string): string | undefined => {
const sizeBefore = sitemapUrls.size;
Expand All @@ -474,33 +503,15 @@ export async function* discoverValidSitemaps(
return undefined;
};

const runWithTimeout = async <T>(
promise: Promise<T>,
timeoutMillis: number,
timeoutMessage: string,
): Promise<T> => {
let timeout: ReturnType<typeof setTimeout> | undefined;
const timeoutPromise = new Promise<never>((_, reject) => {
timeout = setTimeout(() => reject(new Error(timeoutMessage)), timeoutMillis);
});

try {
return await Promise.race([promise, timeoutPromise]);
} finally {
if (timeout !== undefined) {
clearTimeout(timeout);
}
}
};

const urlExists = async (url: string) => {
const response = await gotScraping({
url,
method: 'HEAD',
proxyUrl,
timeout: {
request: DISCOVERY_REQUEST_TIMEOUT_MILLIS,
request: requestTimeoutMillis,
},
signal,
});

return response.statusCode >= 200 && response.statusCode < 400;
Expand All @@ -512,11 +523,10 @@ export async function* discoverValidSitemaps(
}

try {
const robotsFile = await runWithTimeout(
RobotsFile.find(domainUrls[0], proxyUrl),
DISCOVERY_REQUEST_TIMEOUT_MILLIS,
`Fetching robots.txt timed out for ${hostname}`,
);
const robotsFile = await RobotsFile.find(domainUrls[0], proxyUrl, {
timeoutMillis: requestTimeoutMillis,
signal,
});
for (const sitemapUrl of robotsFile.getSitemaps()) {
if (addSitemapUrl(sitemapUrl)) {
yield sitemapUrl;
Expand Down Expand Up @@ -568,7 +578,12 @@ export async function* discoverValidSitemaps(
discoverSitemapsForDomainUrls(hostname, domainUrls),
);

for await (const url of mergeAsyncIterables(...iterables)) {
yield url;
try {
for await (const url of mergeAsyncIterables(...iterables)) {
yield url;
}
} finally {
clearTimeout(timeoutHandle);
externalSignal?.removeEventListener('abort', onExternalAbort);
}
}
42 changes: 42 additions & 0 deletions packages/utils/test/robots.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ describe('RobotsTxtFile', () => {
nock('http://not-exists.com')
.persist()
.get('/robots.txt')
.delay(500)
.reply(
200,
[
Expand Down Expand Up @@ -57,6 +58,47 @@ describe('RobotsTxtFile', () => {
]);
});

it('respects user-set timeout', async () => {
const start = +Date.now();
const robots = RobotsTxtFile.find('http://not-exists.com/robots.txt', undefined, { timeoutMillis: 200 });

await expect(robots).rejects.toThrow(/timeout/i);
const end = +Date.now();

expect(end - start).toBeGreaterThanOrEqual(200);
expect(end - start).toBeLessThanOrEqual(500);
});

it('respects AbortSignal parameter', async () => {
const controller = new AbortController();
setTimeout(() => controller.abort(), 200);

const start = +Date.now();
const robots = RobotsTxtFile.find('http://not-exists.com/robots.txt', undefined, { signal: controller.signal });

await expect(robots).rejects.toThrow(/aborted/i);
const end = +Date.now();

expect(end - start).toBeGreaterThanOrEqual(200);
expect(end - start).toBeLessThanOrEqual(500);
});

it('respects AbortSignal parameter and timeout together', async () => {
const controller = new AbortController();

const start = +Date.now();
const robots = RobotsTxtFile.find('http://not-exists.com/robots.txt', undefined, {
signal: controller.signal,
timeoutMillis: 200,
});

await expect(robots).rejects.toThrow(/timeout/i);
const end = +Date.now();

expect(end - start).toBeGreaterThanOrEqual(200);
expect(end - start).toBeLessThanOrEqual(500);
});

it('parses allow/deny directives from explicitly provided robots.txt contents', async () => {
const contents = `User-agent: *',
Disallow: *deny_all/
Expand Down
86 changes: 86 additions & 0 deletions packages/utils/test/sitemap.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -588,4 +588,90 @@ describe('discoverValidSitemaps', () => {
'http://domain-b.com/sitemap.txt',
]);
});

it('aborts when timeoutMillis elapses', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml');

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], { timeoutMillis: 100 })) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual([]);
expect(elapsed).toBeLessThan(2_000);
});

it('aborts when external signal is triggered', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml');

const ac = new AbortController();
setTimeout(() => ac.abort(), 100);

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], {
timeoutMillis: 60_000,
signal: ac.signal,
})) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual([]);
expect(elapsed).toBeLessThan(2_000);
});

it('aborts immediately when signal is already aborted', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml');

const ac = new AbortController();
ac.abort();

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], { signal: ac.signal })) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual([]);
expect(elapsed).toBeLessThan(1_000);
});

it('requestTimeoutMillis aborts slow robots.txt without killing the whole discovery', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml')
.head('/sitemap.xml')
.reply(200, '')
.head('/sitemap.txt')
.reply(404, '')
.head('/sitemap_index.xml')
.reply(404, '');

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], {
timeoutMillis: 30_000,
requestTimeoutMillis: 100,
})) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual(['http://slow-site.com/sitemap.xml']);
expect(elapsed).toBeLessThan(2_000);
});
});
Loading