Skip to content
Merged
28 changes: 21 additions & 7 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import {
KeyValueStore,
LogLevel,
mergeCookies,
NavigationSkippedError,
NonRetryableError,
purgeDefaultStorages,
RequestHandlerError,
Expand Down Expand Up @@ -912,13 +913,13 @@ export class BasicCrawler<
try {
await this.basicContextPipeline
.chain(this.contextPipeline)
.call(crawlingContext, (ctx) => this.handleRequest(ctx, source));
.call(crawlingContext, (ctx) => this.handleRequest(ctx, source, request));
} catch (error) {
// ContextPipelineInterruptedError means the request was intentionally skipped
// (e.g., doesn't match enqueue strategy after redirect). Just return gracefully.
if (error instanceof ContextPipelineInterruptedError) {
await this._timeoutAndRetry(
async () => this.requestManager?.markRequestHandled(crawlingContext.request!),
async () => this.requestManager?.markRequestHandled(request),
this.internalTimeoutMillis,
`Marking request ${crawlingContext.request.url} (${crawlingContext.request.id}) as handled timed out after ${
this.internalTimeoutMillis / 1e3
Expand All @@ -937,6 +938,7 @@ export class BasicCrawler<
await this._requestFunctionErrorHandler(
unwrappedError,
crawlingContext as CrawlingContext,
request,
this.requestManager!,
);
crawlingContext.session?.markBad();
Expand Down Expand Up @@ -1838,9 +1840,7 @@ export class BasicCrawler<
}

/** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
protected async handleRequest(crawlingContext: ExtendedContext, requestSource: IRequestManager) {
const { request } = crawlingContext;

protected async handleRequest(crawlingContext: ExtendedContext, requestSource: IRequestManager, request: Request) {
const statisticsId = request.id || request.uniqueKey;
this.stats.startJob(statisticsId);

Expand Down Expand Up @@ -1871,7 +1871,7 @@ export class BasicCrawler<
try {
request.state = RequestState.ERROR_HANDLER;
await addTimeoutToPromise(
async () => this._requestFunctionErrorHandler(err, crawlingContext, requestSource),
async () => this._requestFunctionErrorHandler(err, crawlingContext, request, requestSource),
this.internalTimeoutMillis,
`Handling request failure of ${request.url} (${request.id}) timed out after ${
this.internalTimeoutMillis / 1e3
Expand Down Expand Up @@ -2052,13 +2052,15 @@ export class BasicCrawler<

/**
* Handles errors thrown by user provided requestHandler()
*
* @param request The request object, passed separately to circumvent potential dynamic logic in crawlingContext.request
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fishy, crawlingContext.request and request should be the same reference here, no?

Do crawler subclasses reassign crawlingContext.request? If so, I would start by correcting that. We've had some really annoying issues when a similar thing happened with Session some time ago.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that methods such as `markRequestHandled` destructure the request, which triggers the `loadedUrl` trap. So we just pass the unproxied request in there.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean, we could always

const { common, safe, fields } = request;
const loadedUrl = !request.skipNavigation ? request.loadedUrl : null;

whereever we need this. With the throwing access, we're forcing the users to do this anyway, right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's just that we'd need to touch all kinds of code paths in the request queue/provider hierarchy and I'd rather not, right now.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

forcing the users to do this anyway, right?

yeah, but just the poor souls who decide that they need skipNavigation

*/
protected async _requestFunctionErrorHandler(
error: Error,
crawlingContext: CrawlingContext,
request: Request,
source: IRequestList | IRequestManager,
): Promise<void> {
const { request } = crawlingContext;
request.pushErrorMessage(error);

if (error instanceof CriticalError) {
Expand Down Expand Up @@ -2256,6 +2258,18 @@ export class BasicCrawler<
}

private requestMatchesEnqueueStrategy(request: Request) {
// If `skipNavigation` was used, just return `true`
try {
// eslint-disable-next-line @typescript-eslint/no-unused-expressions
request.loadedUrl;
Comment thread
l2ysho marked this conversation as resolved.
} catch (err) {
if (err instanceof NavigationSkippedError) {
return true;
}

throw err;
}

const { url, loadedUrl } = request;

// eslint-disable-next-line dot-notation -- private access
Expand Down
7 changes: 5 additions & 2 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
enqueueLinks,
EVENT_SESSION_RETIRED,
handleRequestTimeout,
NavigationSkippedError,
RequestState,
resolveBaseUrlForEnqueueLinksFiltering,
SessionError,
Expand Down Expand Up @@ -508,15 +509,17 @@ export abstract class BrowserCrawler<
request: new Proxy(crawlingContext.request, {
get(target, propertyName, receiver) {
if (propertyName === 'loadedUrl') {
throw new Error(
throw new NavigationSkippedError(
'The `request.loadedUrl` property is not available - `skipNavigation` was used',
);
}
return Reflect.get(target, propertyName, receiver);
},
}) as LoadedRequest<Request>,
get response(): Response {
throw new Error('The `response` property is not available - `skipNavigation` was used');
throw new NavigationSkippedError(
'The `response` property is not available - `skipNavigation` was used',
);
},
};
}
Expand Down
53 changes: 40 additions & 13 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@ import type {
RouterRoutes,
SkippedRequestCallback,
} from '@crawlee/http';
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
import {
enqueueLinks,
HttpCrawler,
NavigationSkippedError,
resolveBaseUrlForEnqueueLinksFiltering,
Router,
} from '@crawlee/http';
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { type CheerioRoot, extractUrlsFromCheerio, type RobotsTxtFile } from '@crawlee/utils';
import type { CheerioAPI, CheerioOptions } from 'cheerio';
Expand Down Expand Up @@ -194,19 +200,40 @@ export class CheerioCrawler<
}

/**
 * Parses the downloaded response body with Cheerio.
 *
 * When `skipNavigation` was used on the request, reading
 * `crawlingContext.contentType` / `crawlingContext.body` throws a
 * `NavigationSkippedError` (see the skipped-navigation context in HttpCrawler).
 * In that case we don't fail here - instead we return lazy `body`/`$` getters
 * that rethrow a descriptive `NavigationSkippedError` (with the original error
 * attached as `cause`) only if the user actually accesses those properties.
 */
private async parseContent(crawlingContext: InternalHttpCrawlingContext) {
    try {
        const isXml = crawlingContext.contentType.type.includes('xml');
        // The body may arrive as a Buffer - decode it using the declared encoding.
        const body = Buffer.isBuffer(crawlingContext.body)
            ? crawlingContext.body.toString(crawlingContext.contentType.encoding)
            : crawlingContext.body;
        const dom = parseDocument(body, { decodeEntities: true, xmlMode: isXml });
        const $ = cheerio.load(dom, {
            xml: { decodeEntities: true, xmlMode: isXml },
        } as CheerioOptions);

        return {
            $,
            body,
        };
    } catch (err) {
        if (err instanceof NavigationSkippedError) {
            return {
                get body(): string {
                    throw new NavigationSkippedError(
                        'The `body` property is not available - `skipNavigation` was used',
                        { cause: err },
                    );
                },
                get $(): CheerioAPI {
                    throw new NavigationSkippedError(
                        'The `$` property is not available - `skipNavigation` was used',
                        { cause: err },
                    );
                },
            };
        }

        throw err;
    }
}

private async addHelpers(crawlingContext: InternalHttpCrawlingContext & { $: CheerioAPI }) {
Expand Down
6 changes: 6 additions & 0 deletions packages/core/src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,9 @@ export class ServiceConflictError extends Error {
);
}
}

/**
 * Thrown by crawlers when a navigation-dependent property (e.g. `response`, `body`,
 * `contentType`, `$` or `request.loadedUrl`) is accessed on a request that used
 * `skipNavigation`.
 *
 * Extends {@apilink NonRetryableError}, so a request failing with this error is not retried.
 * Crawler subclasses can catch this error to skip their own navigation-dependent logic
 * (e.g. the enqueue-strategy check catches it and treats the request as matching).
 */
export class NavigationSkippedError extends NonRetryableError {}
20 changes: 18 additions & 2 deletions packages/core/src/request.ts
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,24 @@ class CrawleeRequest<UserData extends Dictionary = Dictionary> {
});
}

/** Tells the crawler processing this request to skip the navigation and process the request directly. */
/**
 * Tells the crawler processing this request to skip the navigation and process the request directly.
 *
 * With this flag set to `true`, the crawling context will not contain the results of the navigation
 * (e.g. `response`, `body`, `contentType`, `$` or `request.loadedUrl`), and accessing any of them
 * throws a {@apilink NavigationSkippedError} at runtime.
 */
get skipNavigation(): boolean {
    // The flag lives in the crawler-internal section of userData; default to `false` when unset.
    const crawleeInternals = this.userData.__crawlee;
    return crawleeInternals?.skipNavigation ?? false;
}

/** Tells the crawler processing this request to skip the navigation and process the request directly. */
/**
* Tells the crawler processing this request to skip the navigation and process the request directly.
*
* When this is set to `true`, the crawling context will not contain the results of the navigation
* (e.g. `response`, `body`, `contentType`, `$` or `request.loadedUrl`).
* Accessing these properties will throw a {@apilink NavigationSkippedError} at runtime.
*/
set skipNavigation(value: boolean) {
if (!this.userData.__crawlee) {
(this.userData as Dictionary).__crawlee = { skipNavigation: value };
Expand Down Expand Up @@ -548,6 +560,10 @@ export interface RequestOptions<UserData extends Dictionary = Dictionary> {
/**
* If set to `true` then the crawler processing this request evaluates
* the `requestHandler` immediately without prior browser navigation.
*
* When enabled, the crawling context will not contain the results of the navigation
* (e.g. `response`, `body`, `contentType`, `$` or `request.loadedUrl`).
* Accessing these properties will throw a {@apilink NavigationSkippedError} at runtime.
* @default false
*/
skipNavigation?: boolean;
Expand Down
36 changes: 28 additions & 8 deletions packages/http-crawler/src/internals/http-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,15 @@ import type {
RouterRoutes,
Session,
} from '@crawlee/basic';
import { BasicCrawler, ContextPipeline, mergeCookies, RequestState, Router, SessionError } from '@crawlee/basic';
import {
BasicCrawler,
ContextPipeline,
mergeCookies,
NavigationSkippedError,
RequestState,
Router,
SessionError,
} from '@crawlee/basic';
import type { LoadedRequest } from '@crawlee/core';
import { ResponseWithUrl } from '@crawlee/http-client';
import type { Awaitable, Dictionary } from '@crawlee/types';
Expand Down Expand Up @@ -392,15 +400,17 @@ export class HttpCrawler<
request: new Proxy(request, {
get(target, propertyName, receiver) {
if (propertyName === 'loadedUrl') {
throw new Error(
throw new NavigationSkippedError(
'The `request.loadedUrl` property is not available - `skipNavigation` was used',
);
}
return Reflect.get(target, propertyName, receiver);
},
}) as LoadedRequest<CrawleeRequest>,
get response(): InternalHttpCrawlingContext['response'] {
throw new Error('The `response` property is not available - `skipNavigation` was used');
throw new NavigationSkippedError(
Comment thread
janbuchar marked this conversation as resolved.
'The `response` property is not available - `skipNavigation` was used',
);
},
};
}
Expand Down Expand Up @@ -440,19 +450,29 @@ export class HttpCrawler<
if (crawlingContext.request.skipNavigation) {
return {
get contentType(): InternalHttpCrawlingContext['contentType'] {
throw new Error('The `contentType` property is not available - `skipNavigation` was used');
throw new NavigationSkippedError(
'The `contentType` property is not available - `skipNavigation` was used',
);
},
get body(): InternalHttpCrawlingContext['body'] {
throw new Error('The `body` property is not available - `skipNavigation` was used');
throw new NavigationSkippedError(
'The `body` property is not available - `skipNavigation` was used',
);
},
get json(): InternalHttpCrawlingContext['json'] {
throw new Error('The `json` property is not available - `skipNavigation` was used');
throw new NavigationSkippedError(
'The `json` property is not available - `skipNavigation` was used',
);
},
get waitForSelector(): InternalHttpCrawlingContext['waitForSelector'] {
throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
throw new NavigationSkippedError(
'The `waitForSelector` method is not available - `skipNavigation` was used',
);
},
get parseWithCheerio(): InternalHttpCrawlingContext['parseWithCheerio'] {
throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
throw new NavigationSkippedError(
'The `parseWithCheerio` method is not available - `skipNavigation` was used',
);
},
};
}
Expand Down
Loading
Loading