Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions packages/core/src/session_pool/session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,11 @@ export class Session implements ISession {
* Sets a cookie within this session for the specific URL.
*/
setCookie(rawCookie: string, url: string): void {
this.cookieJar.setCookieSync(rawCookie, url);
try {
this.cookieJar.setCookieSync(rawCookie, url);
} catch (e) {
this.log.warning('Could not set cookie.', { url, error: (e as Error).message });
}
}

/**
Expand All @@ -384,7 +388,7 @@ export class Session implements ISession {

// if invalid cookies are provided just log the exception. No need to retry the request automatically.
if (errorMessages.length) {
this.log.debug('Could not set cookies.', { errorMessages });
this.log.warning('Could not set cookies.', { errorMessages });
}
}

Expand Down
1 change: 1 addition & 0 deletions packages/http-client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"access": "public"
},
"dependencies": {
"@apify/log": "^2.5.32",
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only part I'm unsure about is adding @apify/log as a @crawlee/http-client dependency... but well, we gotta log somehow.

I'm open to ideas here.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#3399 should resolve this problem, shouldn't it?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@crawlee/http-client should have no dependencies on other Crawlee packages (definitely not @crawlee/core), as it's used from @crawlee/utils - importing @crawlee/core would create a circular dependency. It's a bit of a predicament :/

"@crawlee/types": "4.0.0",
"tough-cookie": "^6.0.0"
}
Expand Down
29 changes: 25 additions & 4 deletions packages/http-client/src/base-http-client.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import type { BaseHttpClient as BaseHttpClientInterface, SendRequestOptions } from '@crawlee/types';
import { CookieJar } from 'tough-cookie';

import type { Log } from '@apify/log';
import defaultLog from '@apify/log';

export interface CustomFetchOptions {
proxyUrl?: string;
}
Expand All @@ -11,25 +14,43 @@ export interface CustomFetchOptions {
* implement only the low-level network call in `fetch`.
*/
export abstract class BaseHttpClient implements BaseHttpClientInterface {
protected log: Log;

constructor(log?: Log) {
this.log = log ?? defaultLog;
}

/**
* Perform the raw network request and return a single Response without any
* automatic redirect following or special error handling.
*/
protected abstract fetch(input: Request, init?: RequestInit & CustomFetchOptions): Promise<Response>;

private async applyCookies(request: Request, cookieJar: CookieJar): Promise<Request> {
const cookies = (await cookieJar.getCookies(request.url)).map((x) => x.cookieString().trim()).filter(Boolean);
try {
const cookies = (await cookieJar.getCookies(request.url))
.map((x) => x.cookieString().trim())
.filter(Boolean);

if (cookies?.length > 0) {
request.headers.set('cookie', cookies.join('; '));
if (cookies?.length > 0) {
request.headers.set('cookie', cookies.join('; '));
}
} catch (e) {
this.log.warning(`Failed to get cookies for URL "${request.url}": ${(e as Error).message}`);
}
return request;
}

private async setCookies(response: Response, cookieJar: CookieJar): Promise<void> {
const setCookieHeaders = response.headers.getSetCookie();

await Promise.all(setCookieHeaders.map((header) => cookieJar.setCookie(header, response.url)));
for (const header of setCookieHeaders) {
try {
await cookieJar.setCookie(header, response.url);
} catch (e) {
this.log.warning(`Failed to set cookie for URL "${response.url}": ${(e as Error).message}`);
}
}
}

private resolveRequestContext(options?: SendRequestOptions): {
Expand Down
22 changes: 21 additions & 1 deletion test/core/session_pool/session.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { EVENT_SESSION_RETIRED, Session, SessionPool } from '@crawlee/core';
import { EVENT_SESSION_RETIRED, log, Session, SessionPool } from '@crawlee/core';
import { ResponseWithUrl } from '@crawlee/http-client';
import { entries, sleep } from '@crawlee/utils';
import { CookieJar } from 'tough-cookie';
Expand Down Expand Up @@ -207,6 +207,26 @@ describe('Session - testing session behaviour ', () => {
expect(session.getCookieString(url)).toBe('cookie2=your-cookie');
});

test('setCookies will log warning (not throw) on invalid cookies', () => {
const url = 'https://www.example.com';
// domain 'abc.different.domain' does not match the request URL, so tough-cookie rejects it
const cookies = [{ name: 'cookie1', value: 'my-cookie', domain: 'abc.different.domain' }];
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to explain why this cookie is invalid (it's for a different domain than the one that sets it). Also, can we test calling setCookie with a malformed raw cookie string?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot work on this, add a comment and a new test with a malformed cookie string.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 36d82f7. Added a comment explaining that abc.different.domain doesn't match the request URL (causing tough-cookie to reject it), and added a new test that spies on setCookieSync to simulate a throw and verifies setCookie handles it gracefully without re-throwing.


const mockedLog = vitest.mockObject(log, {
spy: true,
});

session = new Session({ sessionPool, log: mockedLog } as any);
session.setCookies(cookies, url);
expect(session.getCookieString(url)).toBe('');
expect(mockedLog.warning).toHaveBeenCalledOnce();
});

test('setCookie does not throw on malformed raw cookie string', () => {
session = new Session({ sessionPool });
expect(() => session.setCookie('garbled!!!@#$%nonsense', 'https://www.example.com')).not.toThrow();
});

test('setCookies works with hostOnly cookies', () => {
const url = 'https://www.example.com';
const cookies = [
Expand Down
18 changes: 18 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ __metadata:
languageName: node
linkType: hard

"@apify/consts@npm:^2.51.0":
version: 2.51.0
resolution: "@apify/consts@npm:2.51.0"
checksum: 10c0/06902272f1ca99ed515d45aca546ba41130dc8719413e1dd3ceed8c5dd7eeea30a9215227ac393403646b7fbef04bd26ef4f8215c5f04e5d42fa71834304071b
languageName: node
linkType: hard

"@apify/datastructures@npm:^2.0.0, @apify/datastructures@npm:^2.0.3":
version: 2.0.3
resolution: "@apify/datastructures@npm:2.0.3"
Expand Down Expand Up @@ -79,6 +86,16 @@ __metadata:
languageName: node
linkType: hard

"@apify/log@npm:^2.5.32":
version: 2.5.32
resolution: "@apify/log@npm:2.5.32"
dependencies:
"@apify/consts": "npm:^2.51.0"
ansi-colors: "npm:^4.1.1"
checksum: 10c0/dbe716561ee9df72a5f1b26881837780f3d7d8afb11676fd20f4e529dfce7997a4cc6e240c5902c27efa2b73beb66a8563d72a911dc4b6e641c92e8507dfdefb
languageName: node
linkType: hard

"@apify/ps-tree@npm:^1.2.0":
version: 1.2.0
resolution: "@apify/ps-tree@npm:1.2.0"
Expand Down Expand Up @@ -668,6 +685,7 @@ __metadata:
version: 0.0.0-use.local
resolution: "@crawlee/http-client@workspace:packages/http-client"
dependencies:
"@apify/log": "npm:^2.5.32"
"@crawlee/types": "npm:4.0.0"
tough-cookie: "npm:^6.0.0"
languageName: unknown
Expand Down