Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 13 additions & 17 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,19 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected domainAccessedTime: Map<string, number>;
protected maxSessionRotations: number;
protected maxRequestsPerCrawl?: number;
protected handledRequestsCount = 0;

protected get handledRequestsCount(): number {
return this.stats.state.requestsFinished + this.stats.state.requestsFailed;
Copy link
Copy Markdown
Member

@barjin barjin Feb 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Side note: the failed requests are technically also "finished" (word-wise, not in Crawlee), I lobby for renaming this to requestsSucceeded in v4.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

/** @deprecated Setting `handledRequestsCount` directly is no longer supported. The count is now derived from `this.stats`. */
// Throws instead of silently ignoring the write so that legacy subclass code which
// still assigns to `handledRequestsCount` (the pre-getter behavior) fails fast —
// the value is now derived from `this.stats`, so a direct write could never take effect.
protected set handledRequestsCount(_value: number) {
throw new Error(
'Setting `handledRequestsCount` directly is no longer supported. ' +
'The count is now derived from `this.stats.state.requestsFinished` and `this.stats.state.requestsFailed`.',
);
}

protected statusMessageLoggingInterval: number;
protected statusMessageCallback?: StatusMessageCallback;
protected sessionPoolOptions: SessionPoolOptions;
Expand Down Expand Up @@ -983,7 +995,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.requestQueue = await this._getRequestQueue();
this.requestManager = undefined;
await this.initializeRequestManager();
this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
}

this.stats.reset();
Expand Down Expand Up @@ -1348,9 +1359,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this._closeEvents = true;
}

// Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
// so that the caller can get a reference to it before awaiting the promise returned from run()
// (otherwise there would be no way)
this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions, this.config);

if (this.useSessionPool) {
Expand All @@ -1360,7 +1368,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

await this.initializeRequestManager();
await this._loadHandledRequestCount();
}

protected async _runRequestHandler(crawlingContext: Context): Promise<void> {
Expand Down Expand Up @@ -1632,7 +1639,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
isRequestLocked = false; // markRequestHandled succeeded and unlocked the request

this.stats.finishJob(statisticsId, request.retryCount);
this.handledRequestsCount++;

// reclaim session if request finishes successfully
request.state = RequestState.DONE;
Expand Down Expand Up @@ -1863,7 +1869,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
// If we get here, the request is either not retryable
// or failed more than retryCount times and will not be retried anymore.
// Mark the request as failed and do not retry.
this.handledRequestsCount++;
await source.markRequestHandled(request);
this.stats.failJob(request.id || request.uniqueKey, request.retryCount);

Expand Down Expand Up @@ -1952,15 +1957,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return context as LoadedContext<Context>;
}

/**
* Updates handledRequestsCount from possibly stored counts, usually after worker migration.
*/
protected async _loadHandledRequestCount(): Promise<void> {
if (this.requestManager) {
this.handledRequestsCount = await this.requestManager.handledCount();
}
}

protected async _executeHooks<HookLike extends (...args: any[]) => Awaitable<void>>(
hooks: HookLike[],
...args: Parameters<HookLike>
Expand Down
73 changes: 11 additions & 62 deletions test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1095,7 +1095,7 @@ describe('BasicCrawler', () => {
expect(await requestList.isEmpty()).toBe(false);
});

test('should load handledRequestCount from storages', async () => {
test('should derive handledRequestCount from Statistics', async () => {
const requestQueue = new RequestQueue({ id: 'id', client: Configuration.getStorageClient() });
requestQueue.isEmpty = async () => false;
requestQueue.isFinished = async () => false;
Expand All @@ -1104,53 +1104,12 @@ describe('BasicCrawler', () => {
// @ts-expect-error Overriding the method for testing purposes
requestQueue.markRequestHandled = async () => {};

const requestQueueStub = vitest.spyOn(requestQueue, 'handledCount').mockResolvedValue(33);
// Even though the request queue reports 33 handled requests (e.g. from a previous crawler run),
// the crawler should use its own Statistics to track the count and process all 40 requests.
vitest.spyOn(requestQueue, 'handledCount').mockResolvedValue(33);

let count = 0;
let crawler = new BasicCrawler({
requestQueue,
maxConcurrency: 1,
requestHandler: async () => {
await sleep(1);
count++;
},
maxRequestsPerCrawl: 40,
});

await crawler.run();
expect(requestQueueStub).toBeCalled();
expect(count).toBe(7);
vitest.restoreAllMocks();

const sources = Array.from(Array(10).keys(), (x) => x + 1).map((i) => ({ url: `http://example.com/${i}` }));
const sourcesCopy = JSON.parse(JSON.stringify(sources));
let requestList = await RequestList.open({ sources });
const requestListStub = vitest.spyOn(requestList, 'handledCount').mockReturnValue(33);

count = 0;
crawler = new BasicCrawler({
requestList,
maxConcurrency: 1,
requestHandler: async () => {
await sleep(1);
count++;
},
maxRequestsPerCrawl: 40,
});

await crawler.run();
expect(requestListStub).toBeCalled();
expect(count).toBe(7);
vitest.restoreAllMocks();

requestList = await RequestList.open({ sources: sourcesCopy });
const listStub = vitest.spyOn(requestList, 'handledCount').mockReturnValue(20);
const queueStub = vitest.spyOn(requestQueue, 'handledCount').mockResolvedValue(33);
const addRequestStub = vitest.spyOn(requestQueue, 'addRequest').mockReturnValue(Promise.resolve() as any);

count = 0;
crawler = new BasicCrawler({
requestList,
const crawler = new BasicCrawler({
requestQueue,
maxConcurrency: 1,
requestHandler: async () => {
Expand All @@ -1161,12 +1120,8 @@ describe('BasicCrawler', () => {
});

await crawler.run();

expect(queueStub).toBeCalled();
expect(listStub).not.toBeCalled();
expect(addRequestStub).toBeCalledTimes(7);
expect(count).toBe(7);

// The crawler should have processed 40 requests (its own limit), not 7 (40 - 33).
expect(count).toBe(40);
vitest.restoreAllMocks();
});

Expand Down Expand Up @@ -1441,12 +1396,6 @@ describe('BasicCrawler', () => {
failedRequestHandler: async () => {},
});

// @ts-expect-error Accessing private prop
crawler._loadHandledRequestCount = () => {
expect(crawler.sessionPool).toBeDefined();
expect(events.listenerCount(EventType.PERSIST_STATE)).toEqual(1);
};

await crawler.run();
expect(events.listenerCount(EventType.PERSIST_STATE)).toEqual(0);
// @ts-expect-error private symbol
Expand Down Expand Up @@ -1605,7 +1554,7 @@ describe('BasicCrawler', () => {
requestHandler: async () => {},
});

crawler['handledRequestsCount'] = 2; // eslint-disable-line dot-notation
crawler.stats.state.requestsFinished = 2;

// Try to add 6 requests - should only add 3 due to limit
const requestsToAdd = [
Expand Down Expand Up @@ -1638,7 +1587,7 @@ describe('BasicCrawler', () => {
requestHandler: async () => {},
});

crawler['handledRequestsCount'] = 1; // eslint-disable-line dot-notation
crawler.stats.state.requestsFinished = 1;

// First call - should add 2 requests (2 more slots to go)
await crawler.addRequests(['http://example.com/1', 'http://example.com/2']);
Expand Down Expand Up @@ -1687,7 +1636,7 @@ describe('BasicCrawler', () => {
requestHandler: async () => {},
});

crawler['handledRequestsCount'] = 0; // eslint-disable-line dot-notation
crawler.stats.state.requestsFinished = 0;

// Mock robots.txt checking to disallow some URLs
vitest.spyOn(crawler as any, 'isAllowedBasedOnRobotsTxtFile').mockImplementation(async (url) => {
Expand Down Expand Up @@ -1794,7 +1743,7 @@ describe('BasicCrawler', () => {
return;
}

crawler['handledRequestsCount'] = 2; // eslint-disable-line dot-notation
crawler.stats.state.requestsFinished = 2;

await context.enqueueLinks({ urls: requestsToAdd, label: 'not-undefined' });
},
Expand Down