Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
57267ac
chore: use `ContextPipeline` to initialize `Request` and `Session` id…
barjin Feb 5, 2026
f3eb103
chore: run linter
barjin Feb 5, 2026
d878cc3
chore: fix basic-crawler
barjin Feb 6, 2026
42a47d4
fix: implement `AdaptiveCrawler` tricks with the new context pipeline
barjin Feb 6, 2026
7ae1ecf
fix: restore missing enqueueLinks in adaptive crawler resultBoundCont…
barjin Feb 8, 2026
74ed230
chore: trim verbose JSDoc on buildContextPipeline()
barjin Feb 8, 2026
29a7b8a
refactor: use ContextPipeline<{}, CrawlingContext> for type-safe pipe…
barjin Feb 8, 2026
3ca5f40
chore: remove unnecessary comment in adaptive-playwright-crawler
barjin Feb 8, 2026
ac4cd1f
fix: `AdaptiveCrawler` patches to `BasicCrawler`
barjin Feb 9, 2026
35b769c
chore: apply PR comments
barjin Feb 12, 2026
d08424b
refactor: consistent use of contextPipelineBuilder / buildContextPipe…
barjin Feb 12, 2026
2564f85
chore: apply PR suggestion
barjin Feb 13, 2026
2eecaad
chore: extract `BasicCrawler` pipeline steps to methods
barjin Feb 16, 2026
9e74fe7
chore: make `BasicCrawler` pipeline start with `{ Request }`
barjin Feb 16, 2026
4f5d471
Merge branch 'v4' into chore/more-context-pipeline
barjin Feb 16, 2026
b744f96
chore: type fixes
barjin Feb 16, 2026
93d952a
chore: early return on `null` `request`
barjin Feb 16, 2026
b0adeb9
chore: tighter context builder types
barjin Feb 16, 2026
f6c13bb
chore: review comments
barjin Feb 17, 2026
075af99
chore: skip non-configurable properties in `ContextPipeline`
barjin Feb 17, 2026
c1ba7f1
Merge branch 'v4' into chore/more-context-pipeline
barjin Feb 23, 2026
9c5bb8f
chore: allow symbols in context extensions
barjin Feb 23, 2026
953c191
fix: separate `BasicCrawler` context pipeline runs only once for `Ada…
barjin Feb 23, 2026
797003b
fix: handle pipeline errors correctly with `.chain`
barjin Feb 23, 2026
f7c82a3
chore: run linter
barjin Feb 23, 2026
c3f540d
chore: add unit tests for the .chain() method
barjin Feb 24, 2026
6f9a182
chore: add better docs, tests
barjin Feb 24, 2026
0b461a9
refactor: extract `isAllowedBasedOnRobotsTxtFile` to a pipeline step
barjin Feb 24, 2026
b2e8f86
chore: fix comments about error handling
barjin Feb 24, 2026
2e54388
chore: improve type safety by supplying throwing getters
barjin Feb 26, 2026
ae18594
chore: run formatter
barjin Feb 26, 2026
80acd8c
chore: run formatter one more time
barjin Feb 26, 2026
edadb76
chore: fix failing tests
barjin Feb 26, 2026
c54a0d9
docs: add explanatory commetns
barjin Mar 2, 2026
affe1c9
chore: add tests for non-configurable properties in `ContextPipeline`
barjin Mar 2, 2026
5959bd2
fix: do not share cleanup state between pipeline invocations
barjin Mar 2, 2026
873d8d9
chore: fix optional types in `contextPipelineOptions`
barjin Mar 2, 2026
dd010a8
chore: fix ts build error
barjin Mar 2, 2026
92b22f1
chore: apply PR suggestions
barjin Mar 4, 2026
b619211
Merge branch 'v4' into chore/more-context-pipeline
barjin Mar 4, 2026
83b1c71
chore: pass request source to `handleRequest`
barjin Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
367 changes: 233 additions & 134 deletions packages/basic-crawler/src/internals/basic-crawler.ts

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ export abstract class BrowserCrawler<
});
}

protected buildContextPipeline(): ContextPipeline<
protected override buildContextPipeline(): ContextPipeline<
CrawlingContext,
BrowserCrawlingContext<Page, Response, ProvidedController, Dictionary>
> {
Expand Down
20 changes: 13 additions & 7 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -176,17 +176,23 @@ export class CheerioCrawler<
* All `CheerioCrawler` parameters are passed via an options object.
*/
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext>) {
    // Split the builder off from the remaining options so that a caller-supplied
    // builder is honoured and the default is only used as a fallback.
    const { contextPipelineBuilder, ...rest } = options ?? {};

    super({
        ...rest,
        // Single source of truth for the default pipeline: `buildContextPipeline()`.
        // The parse/helper steps live there, so they are not composed a second time here.
        contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
    });
}

/**
 * Takes the pipeline produced by the parent class and appends two steps, in this order:
 * one that runs `parseContent` and one that runs `addHelpers`.
 */
protected override buildContextPipeline() {
    const inheritedPipeline = super.buildContextPipeline();

    const withParsedContent = inheritedPipeline.compose({
        action: async (crawlingContext) => await this.parseContent(crawlingContext),
    });

    return withParsedContent.compose({
        action: async (crawlingContext) => await this.addHelpers(crawlingContext),
    });
}

private async parseContent(crawlingContext: InternalHttpCrawlingContext) {
const isXml = crawlingContext.contentType.type.includes('xml');
const body = Buffer.isBuffer(crawlingContext.body)
Expand Down
50 changes: 49 additions & 1 deletion packages/core/src/crawlers/context_pipeline.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import type { Awaitable } from '@crawlee/types';

import log from '@apify/log';

import {
ContextPipelineCleanupError,
ContextPipelineInitializationError,
Expand Down Expand Up @@ -56,6 +58,19 @@ export abstract class ContextPipeline<TContextBase, TCrawlingContext extends TCo
middleware: ContextMiddleware<TCrawlingContext, TCrawlingContextExtension>,
): ContextPipeline<TContextBase, TCrawlingContext & TCrawlingContextExtension>;

/**
 * Chains another pipeline onto this one. The other pipeline's base context must match
 * this pipeline's output context. Returns a new pipeline that runs this pipeline's
 * middlewares first, then the other pipeline's middlewares.
 *
 * The returned pipeline keeps this pipeline's base context type (`TContextBase`) and
 * produces the chained pipeline's final context type (`TFinalContext`).
 *
 * @template TFinalContext - The final context type after the chained pipeline's transformations;
 *   it must extend this pipeline's output context (`TCrawlingContext`)
 * @param other - The pipeline to append after this one
 * @returns A new ContextPipeline combining both pipelines' middlewares
 */
abstract chain<TFinalContext extends TCrawlingContext>(
other: ContextPipeline<TCrawlingContext, TFinalContext>,
): ContextPipeline<TContextBase, TFinalContext>;

/**
* Executes the middleware pipeline and passes the final context to a consumer function.
*
Expand Down Expand Up @@ -105,6 +120,21 @@ class ContextPipelineImpl<TContextBase, TCrawlingContext extends TContextBase> e
);
}

/**
 * Appends every middleware of `other` to this pipeline, one `compose()` call per middleware,
 * and returns the resulting pipeline.
 *
 * @param other - The pipeline whose middlewares are appended after this pipeline's own
 * @returns The pipeline obtained after composing all of `other`'s middlewares onto this one
 * @throws TypeError if `other` does not expose the internal `middlewareChain()` iterator,
 *   i.e. it is not a pipeline produced by this implementation
 */
chain<TFinalContext extends TCrawlingContext>(
    other: ContextPipeline<TCrawlingContext, TFinalContext>,
): ContextPipeline<TContextBase, TFinalContext> {
    // `middlewareChain()` is private to this implementation, so it is not visible on the
    // abstract `ContextPipeline` type. Check for it at runtime instead of calling it blindly,
    // so a foreign pipeline fails with an actionable message rather than "is not a function".
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- reaching for an implementation detail
    const listMiddlewares: unknown = (other as any).middlewareChain;
    if (typeof listMiddlewares !== 'function') {
        throw new TypeError(
            'ContextPipeline.chain() can only append pipelines built with ContextPipeline.create() / compose() / chain()',
        );
    }

    // NOTE(review): the reversal presumably undoes a last-to-first iteration order of
    // `middlewareChain()` so that the middlewares are appended in `other`'s own order —
    // confirm against the generator's definition.
    const otherMiddlewares = Array.from(
        listMiddlewares.call(other) as Iterable<ContextMiddleware<any, any>>,
    ).reverse();

    let result: ContextPipeline<TContextBase, any> = this as any;
    for (const middleware of otherMiddlewares) {
        result = result.compose(middleware as any);
    }

    return result as ContextPipeline<TContextBase, TFinalContext>;
}
Comment thread
barjin marked this conversation as resolved.

private *middlewareChain() {
let step: ContextPipelineImpl<TContextBase, TContextBase> | undefined = this as any;

Expand All @@ -129,7 +159,25 @@ class ContextPipelineImpl<TContextBase, TCrawlingContext extends TContextBase> e
for (const { action, cleanup } of middlewares) {
try {
const contextExtension = await action(crawlingContext);
Object.defineProperties(crawlingContext, Object.getOwnPropertyDescriptors(contextExtension));

const extensionNames = [
...Object.getOwnPropertyNames(contextExtension),
...Object.getOwnPropertySymbols(contextExtension),
];

for (const key of extensionNames) {
try {
if (Object.getOwnPropertyDescriptor(crawlingContext, key)?.configurable !== false) {
Object.defineProperty(
crawlingContext,
key,
Object.getOwnPropertyDescriptor(contextExtension, key)!,
);
}
Comment thread
barjin marked this conversation as resolved.
} catch (error: any) {
log.debug(`Context pipeline failed to define property ${key.toString()}:`, error);
Comment thread
barjin marked this conversation as resolved.
}
}

if (cleanup) {
cleanupStack.push(cleanup);
Expand Down
29 changes: 16 additions & 13 deletions packages/http-crawler/src/internals/file-download.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { Transform } from 'node:stream';

import type { BasicCrawlerOptions } from '@crawlee/basic';
import { BasicCrawler, ContextPipeline } from '@crawlee/basic';
import { BasicCrawler } from '@crawlee/basic';
import type { CrawlingContext, LoadedRequest, Request } from '@crawlee/core';
import { ResponseWithUrl } from '@crawlee/http-client';
import type { Dictionary } from '@crawlee/types';
Expand Down Expand Up @@ -162,19 +162,22 @@ export class FileDownload extends BasicCrawler<FileDownloadCrawlingContext> {
constructor(options: BasicCrawlerOptions<FileDownloadCrawlingContext> = {}) {
    // Split the builder off from the remaining options so that a caller-supplied builder
    // is honoured, matching the `contextPipelineBuilder ?? …` fallback used by the other
    // crawler constructors.
    const { contextPipelineBuilder, ...rest } = options;

    super({
        ...rest,
        // The download step itself is added in `buildContextPipeline()`.
        contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
    });
}

/**
 * Takes the pipeline produced by the parent class and appends one step:
 * its action runs `initiateDownload`, and its cleanup releases the response body.
 */
protected override buildContextPipeline() {
    return super.buildContextPipeline().compose({
        action: async (context) => this.initiateDownload(context),
        cleanup: async (context) => {
            if (!context.response.bodyUsed) {
                // Nobody consumed the body — cancel it so the
                // underlying connection can be released.
                await context.response.body?.cancel();
            }

            // Wait for the promise stored on the context under the `kBodyDrained` key.
            // NOTE(review): presumably settles once the body has been fully read or
            // cancelled — confirm where `kBodyDrained` is set.
            await (context as { [kBodyDrained]: Promise<void> })[kBodyDrained];
        },
    });
}

Expand Down
2 changes: 1 addition & 1 deletion packages/http-crawler/src/internals/http-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ export class HttpCrawler<
}
}

protected buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext> {
protected override buildContextPipeline(): ContextPipeline<CrawlingContext, InternalHttpCrawlingContext> {
return ContextPipeline.create<CrawlingContext>()
.compose({
action: this.makeHttpRequest.bind(this),
Expand Down
26 changes: 15 additions & 11 deletions packages/jsdom-crawler/src/internals/jsdom-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,26 +192,30 @@ export class JSDOMCrawler<
protected virtualConsole: VirtualConsole | null = null;

constructor(options: JSDOMCrawlerOptions<ContextExtension, ExtendedContext> = {}) {
    // `runScripts` and `hideInternalConsole` are kept on this instance; the builder is
    // split off so a caller-supplied one is honoured; everything else goes to the parent.
    const { runScripts = false, hideInternalConsole = false, contextPipelineBuilder, ...httpOptions } = options;

    super({
        ...httpOptions,
        // The JSDOM parse/cleanup and helper steps live in `buildContextPipeline()`,
        // so they are not composed a second time here.
        contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
    });

    this.runScripts = runScripts;
    this.hideInternalConsole = hideInternalConsole;
}

/**
 * Takes the pipeline produced by the parent class and appends two steps, in this order:
 * one that runs `parseContent` (with a cleanup that detaches `jsdomErrorHandler` from the
 * virtual console and closes the context's `window`, if any), and one that runs `addHelpers`.
 */
protected override buildContextPipeline() {
    const inheritedPipeline = super.buildContextPipeline();

    const withParsedDom = inheritedPipeline.compose({
        action: async (crawlingContext) => await this.parseContent(crawlingContext),
        cleanup: async (crawlingContext) => {
            this.getVirtualConsole().off('jsdomError', this.jsdomErrorHandler);
            crawlingContext.window?.close();
        },
    });

    return withParsedDom.compose({
        action: async (crawlingContext) => await this.addHelpers(crawlingContext),
    });
}

/**
* Returns the currently used `VirtualConsole` instance. Can be used to listen for the JSDOM's internal console messages.
*
Expand Down
20 changes: 13 additions & 7 deletions packages/linkedom-crawler/src/internals/linkedom-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,17 +167,23 @@ export class LinkeDOMCrawler<
private static parser = new DOMParser();

constructor(options: LinkeDOMCrawlerOptions<ContextExtension, ExtendedContext>) {
    // Split the builder off from the remaining options so that a caller-supplied
    // builder is honoured and the default is only used as a fallback.
    const { contextPipelineBuilder, ...rest } = options;

    super({
        ...rest,
        // The parse/helper steps live in `buildContextPipeline()`,
        // so they are not composed a second time here.
        contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
    });
}

/**
 * Takes the pipeline produced by the parent class and appends two steps, in this order:
 * one that runs `parseContent` and one that runs `addHelpers`.
 */
protected override buildContextPipeline() {
    const inheritedPipeline = super.buildContextPipeline();

    const withParsedDocument = inheritedPipeline.compose({
        action: async (crawlingContext) => this.parseContent(crawlingContext),
    });

    return withParsedDocument.compose({
        action: async (crawlingContext) => this.addHelpers(crawlingContext),
    });
}

private async parseContent(crawlingContext: InternalHttpCrawlingContext) {
const isXml = crawlingContext.contentType.type.includes('xml');
const document = LinkeDOMCrawler.parser.parseFromString(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ export interface AdaptivePlaywrightCrawlerContext<UserData extends Dictionary =
*/
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;

enqueueLinks(options?: EnqueueLinksOptions): Promise<void>;
enqueueLinks(options?: EnqueueLinksOptions): Promise<unknown>;
}

interface AdaptiveHook
Expand Down Expand Up @@ -299,17 +299,11 @@ export class AdaptivePlaywrightCrawler<

super({
...rest,
// Pass error handlers to the "main" crawler - we only pluck them from `rest` so that they don't go to the sub crawlers
errorHandler,
failedRequestHandler,
// Same for request handler
requestHandler,
// The builder intentionally returns null so that it crashes the crawler when it tries to use this instead of one of two the specialized context pipelines
// (that would be a logical error in this class)
contextPipelineBuilder: () =>
null as unknown as ContextPipeline<CrawlingContext, AdaptivePlaywrightCrawlerContext>,
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
});

this.individualRequestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;

this.renderingTypePredictor =
Expand Down Expand Up @@ -408,6 +402,34 @@ export class AdaptivePlaywrightCrawler<
return await super._init();
}

/**
 * Takes the pipeline produced by the parent class and appends one step. That step's extension
 * exposes `request` (typed as a loaded request) and defines getters for `response`, `page`,
 * `querySelector`, `waitForSelector` and `parseWithCheerio` that throw when read.
 */
protected override buildContextPipeline() {
    // Builds the message carried by the error thrown from each unavailable getter.
    function describeMissing(prop: string): string {
        return `The \`${prop}\` property is not available on the outer context pipeline of AdaptivePlaywrightCrawler - it is provided by the inner (static/browser) pipelines`;
    }

    const inheritedPipeline = super.buildContextPipeline();

    return inheritedPipeline.compose({
        action: async (outerContext) => {
            const { request } = outerContext;

            return {
                get request(): LoadedRequest<Request<Dictionary>> {
                    return request as LoadedRequest<Request<Dictionary>>;
                },
                get response(): Response {
                    throw new Error(describeMissing('response'));
                },
                get page(): Page {
                    throw new Error(describeMissing('page'));
                },
                get querySelector(): AdaptivePlaywrightCrawlerContext['querySelector'] {
                    throw new Error(describeMissing('querySelector'));
                },
                get waitForSelector(): AdaptivePlaywrightCrawlerContext['waitForSelector'] {
                    throw new Error(describeMissing('waitForSelector'));
                },
                get parseWithCheerio(): AdaptivePlaywrightCrawlerContext['parseWithCheerio'] {
                    throw new Error(describeMissing('parseWithCheerio'));
                },
            };
        },
    });
}

private async adaptCheerioContext(cheerioContext: CheerioCrawlingContext) {
// Capture the original response to avoid infinite recursion when the getter is copied to the context
const result = this.resultObjects.get(cheerioContext);
Expand Down Expand Up @@ -507,28 +529,29 @@ export class AdaptivePlaywrightCrawler<
pushData: result.pushData,
useState: this.allowStorageAccess(useStateFunction),
getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
enqueueLinks: async (options: SetRequired<EnqueueLinksOptions, 'urls'>) => {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this no longer necessary again?

Copy link
Copy Markdown
Member Author

@barjin barjin Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I understand, this gets overwritten by the adaptCheerioContext / adaptPlaywrightContext implementations in both branches.

enqueueLinks: async (options: EnqueueLinksOptions = {}, timeoutMs = 5000) => {

Both of these call this.enqueueLinks with the right request and result, so this was imo dead code before. For a moment, I thought we might be calling this from the pre/postnavhooks, but these have the limited context type anyway.

return await this.enqueueLinks(options, context.request, result);
},
log: this.createLogProxy(context.log, logs),
registerDeferredCleanup: (cleanup: () => Promise<unknown>) => deferredCleanup.push(cleanup),
};

const subCrawlerContext = { ...context, ...resultBoundContextHelpers };
const subCrawlerContext = Object.defineProperties(
{},
Object.getOwnPropertyDescriptors(context),
) as typeof context;

// Mark result-bound helpers as non-configurable so they survive the sub-crawler context pipeline
// (which would otherwise override them with the sub-crawler's own versions, losing the result binding).
for (const [key, descriptor] of Object.entries(Object.getOwnPropertyDescriptors(resultBoundContextHelpers))) {
Comment thread
barjin marked this conversation as resolved.
Object.defineProperty(subCrawlerContext, key, { ...descriptor, configurable: false });
}

this.resultObjects.set(subCrawlerContext, result);

try {
const callAdaptiveRequestHandler = async () => {
if (renderingType === 'static') {
await this.staticContextPipeline.call(
subCrawlerContext,
async (finalContext) => await this.requestHandler(finalContext),
);
await this.staticContextPipeline.call(subCrawlerContext, this.requestHandler.bind(this));
} else if (renderingType === 'clientOnly') {
await this.browserContextPipeline.call(
subCrawlerContext,
async (finalContext) => await this.requestHandler(finalContext),
);
await this.browserContextPipeline.call(subCrawlerContext, this.requestHandler.bind(this));
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ export class PlaywrightCrawler<
constructor(options: PlaywrightCrawlerOptions<ExtendedContext> = {}) {
ow(options, 'PlaywrightCrawlerOptions', ow.object.exactShape(PlaywrightCrawler.optionsShape));

const { launchContext = {}, headless, ...browserCrawlerOptions } = options;
const { launchContext = {}, headless, contextPipelineBuilder, ...browserCrawlerOptions } = options;

const browserPoolOptions = {
...options.browserPoolOptions,
Expand Down Expand Up @@ -234,11 +234,14 @@ export class PlaywrightCrawler<
...(browserCrawlerOptions as PlaywrightCrawlerOptions<ExtendedContext>),
launchContext,
browserPoolOptions,
contextPipelineBuilder: () =>
this.buildContextPipeline().compose({ action: this.enhanceContext.bind(this) }),
contextPipelineBuilder: contextPipelineBuilder ?? (() => this.buildContextPipeline()),
});
}

/**
 * Takes the pipeline produced by the parent class and appends one step
 * whose action is `enhanceContext`, bound to this crawler instance.
 */
protected override buildContextPipeline() {
    const inheritedPipeline = super.buildContextPipeline();

    return inheritedPipeline.compose({
        action: this.enhanceContext.bind(this),
    });
}

protected override async _navigationHandler(
crawlingContext: PlaywrightCrawlingContext,
gotoOptions: DirectNavigationOptions,
Expand Down
Loading