-
Notifications
You must be signed in to change notification settings - Fork 1.3k
refactor: use ContextPipeline to initialize BasicCrawler's context idiomatically
#3388
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
57267ac
f3eb103
d878cc3
42a47d4
7ae1ecf
74ed230
29a7b8a
3ca5f40
ac4cd1f
35b769c
d08424b
2564f85
2eecaad
9e74fe7
4f5d471
b744f96
93d952a
b0adeb9
f6c13bb
075af99
c1ba7f1
9c5bb8f
953c191
797003b
f7c82a3
c3f540d
6f9a182
0b461a9
b2e8f86
2e54388
ae18594
80acd8c
edadb76
c54a0d9
affe1c9
5959bd2
873d8d9
dd010a8
92b22f1
b619211
83b1c71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -150,7 +150,7 @@ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary = | |||
| */ | ||||
| parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>; | ||||
|
|
||||
| enqueueLinks(options?: EnqueueLinksOptions): Promise<void>; | ||||
| enqueueLinks(options?: EnqueueLinksOptions): Promise<unknown>; | ||||
| } | ||||
|
|
||||
| interface AdaptiveHook | ||||
|
|
@@ -299,17 +299,11 @@ export class AdaptivePlaywrightCrawler< | |||
|
|
||||
| super({ | ||||
| ...rest, | ||||
| // Pass error handlers to the "main" crawler - we only pluck them from `rest` so that they don't go to the sub crawlers | ||||
| errorHandler, | ||||
| failedRequestHandler, | ||||
| // Same for request handler | ||||
| requestHandler, | ||||
| // The builder intentionally returns null so that it crashes the crawler when it tries to use this instead of one of the two specialized context pipelines | ||||
| // (that would be a logical error in this class) | ||||
| contextPipelineBuilder: () => | ||||
| null as unknown as ContextPipeline<CrawlingContext, AdaptivePlaywrightCrawlerContext>, | ||||
| contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()), | ||||
| }); | ||||
|
|
||||
| this.individualRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000; | ||||
|
|
||||
| this.renderingTypePredictor = | ||||
|
|
@@ -408,6 +402,34 @@ export class AdaptivePlaywrightCrawler< | |||
| return await super._init(); | ||||
| } | ||||
|
|
||||
| protected override buildContextPipeline() { | ||||
| const errorMessage = (prop: string) => | ||||
| `The \`${prop}\` property is not available on the outer context pipeline of AdaptivePlaywrightCrawler - it is provided by the inner (static/browser) pipelines`; | ||||
|
|
||||
| return super.buildContextPipeline().compose({ | ||||
| action: async ({ request }) => ({ | ||||
| get request(): LoadedRequest<Request<Dictionary>> { | ||||
| return request as LoadedRequest<Request<Dictionary>>; | ||||
| }, | ||||
| get response(): Response { | ||||
| throw new Error(errorMessage('response')); | ||||
| }, | ||||
| get page(): Page { | ||||
| throw new Error(errorMessage('page')); | ||||
| }, | ||||
| get querySelector(): AdaptivePlaywrightCrawlerContext['querySelector'] { | ||||
| throw new Error(errorMessage('querySelector')); | ||||
| }, | ||||
| get waitForSelector(): AdaptivePlaywrightCrawlerContext['waitForSelector'] { | ||||
| throw new Error(errorMessage('waitForSelector')); | ||||
| }, | ||||
| get parseWithCheerio(): AdaptivePlaywrightCrawlerContext['parseWithCheerio'] { | ||||
| throw new Error(errorMessage('parseWithCheerio')); | ||||
| }, | ||||
| }), | ||||
| }); | ||||
| } | ||||
|
|
||||
| private async adaptCheerioContext(cheerioContext: CheerioCrawlingContext) { | ||||
| // Capture the original response to avoid infinite recursion when the getter is copied to the context | ||||
| const result = this.resultObjects.get(cheerioContext); | ||||
|
|
@@ -507,28 +529,29 @@ export class AdaptivePlaywrightCrawler< | |||
| pushData: result.pushData, | ||||
| useState: this.allowStorageAccess(useStateFunction), | ||||
| getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), | ||||
| enqueueLinks: async (options: SetRequired<EnqueueLinksOptions, 'urls'>) => { | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this no longer necessary again?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As far as I understand, this gets overwritten by the crawlee/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts Line 486 in dd010a8
Both of these call |
||||
| return await this.enqueueLinks(options, context.request, result); | ||||
| }, | ||||
| log: this.createLogProxy(context.log, logs), | ||||
| registerDeferredCleanup: (cleanup: () => Promise<unknown>) => deferredCleanup.push(cleanup), | ||||
| }; | ||||
|
|
||||
| const subCrawlerContext = { ...context, ...resultBoundContextHelpers }; | ||||
| const subCrawlerContext = Object.defineProperties( | ||||
| {}, | ||||
| Object.getOwnPropertyDescriptors(context), | ||||
| ) as typeof context; | ||||
|
|
||||
| // Mark result-bound helpers as non-configurable so they survive the sub-crawler context pipeline | ||||
| // (which would otherwise override them with the sub-crawler's own versions, losing the result binding). | ||||
| for (const [key, descriptor] of Object.entries(Object.getOwnPropertyDescriptors(resultBoundContextHelpers))) { | ||||
|
barjin marked this conversation as resolved.
|
||||
| Object.defineProperty(subCrawlerContext, key, { ...descriptor, configurable: false }); | ||||
| } | ||||
|
|
||||
| this.resultObjects.set(subCrawlerContext, result); | ||||
|
|
||||
| try { | ||||
| const callAdaptiveRequestHandler = async () => { | ||||
| if (renderingType === 'static') { | ||||
| await this.staticContextPipeline.call( | ||||
| subCrawlerContext, | ||||
| async (finalContext) => await this.requestHandler(finalContext), | ||||
| ); | ||||
| await this.staticContextPipeline.call(subCrawlerContext, this.requestHandler.bind(this)); | ||||
| } else if (renderingType === 'clientOnly') { | ||||
| await this.browserContextPipeline.call( | ||||
| subCrawlerContext, | ||||
| async (finalContext) => await this.requestHandler(finalContext), | ||||
| ); | ||||
| await this.browserContextPipeline.call(subCrawlerContext, this.requestHandler.bind(this)); | ||||
| } | ||||
| }; | ||||
|
|
||||
|
|
||||
Uh oh!
There was an error while loading. Please reload this page.