Skip to content

Commit 2fb30c4

Browse files
B4nan and claude committed
fix: constructor options take precedence over env vars in Configuration (#3080)
Changes the configuration priority so that user-provided constructor options take precedence over environment variables, while env vars still override defaults and crawlee.json settings.

New priority: constructor options > env vars > crawlee.json > defaults

This allows users to programmatically override environment variables, e.g., `new Configuration({ headless: false })` now works even if CRAWLEE_HEADLESS=true.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 8cf173e commit 2fb30c4

3 files changed

Lines changed: 237 additions & 12 deletions

File tree

docs/upgrading/upgrading_v4.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,19 @@ The `KeyValueStore.getPublicUrl` method is now asynchronous and reads the public
114114
## `preNavigationHooks` in `HttpCrawler` no longer accepts `gotOptions` object
115115

116116
The `preNavigationHooks` option in `HttpCrawler` subclasses no longer accepts the `gotOptions` object as a second parameter. Modify the `crawlingContext` fields (e.g. `.request`) directly instead.
117+
118+
## Configuration priority change
119+
120+
The priority of configuration options has changed. Previously, environment variables always took precedence over constructor options (listed below from lowest to highest priority):
121+
122+
```text
123+
crawlee.json < constructor options < environment variables
124+
```
125+
126+
Now, constructor options take the highest precedence, allowing you to programmatically override environment variables (listed below from highest to lowest priority):
127+
128+
```text
129+
constructor options > environment variables > crawlee.json > defaults
130+
```
131+
132+
This means that if you have `CRAWLEE_HEADLESS=true` set in your environment, you can now override it by passing `new Configuration({ headless: false })`. Previously, the environment variable would always win.

packages/core/src/configuration.ts

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -204,12 +204,12 @@ export interface ConfigurationOptions {
204204
* const crawler = new BasicCrawler({ ... }, config);
205205
* ```
206206
*
207-
* The configuration provided via environment variables always takes precedence. We can also
208-
* define the `crawlee.json` file in the project root directory which will serve as a baseline,
209-
* so the options provided in constructor will override those. In other words, the precedence is:
207+
* Options explicitly provided in the constructor take the highest precedence, followed by
208+
* environment variables, then the `crawlee.json` file in the project root directory, and finally
209+
* the default values. In other words, the precedence is:
210210
*
211211
* ```text
212-
* crawlee.json < constructor options < environment variables
212+
* constructor options > environment variables > crawlee.json > defaults
213213
* ```
214214
*
215215
* ## Supported Configuration Options
@@ -296,6 +296,7 @@ export class Configuration {
296296
static storage = new AsyncLocalStorage<Configuration>();
297297

298298
protected options!: Map<keyof ConfigurationOptions, ConfigurationOptions[keyof ConfigurationOptions]>;
299+
protected userOptions!: Set<keyof ConfigurationOptions>;
299300
protected services = new Map<string, unknown>();
300301

301302
/** @internal */
@@ -304,7 +305,7 @@ export class Configuration {
304305
public readonly storageManagers = new Map<Constructor, StorageManager>();
305306

306307
/**
307-
* Creates new `Configuration` instance with provided options. Env vars will have precedence over those.
308+
* Creates a new `Configuration` instance with the provided options. Constructor options take precedence over env vars.
308309
*/
309310
constructor(options: ConfigurationOptions = {}) {
310311
this.buildOptions(options);
@@ -324,12 +325,18 @@ export class Configuration {
324325
}
325326

326327
/**
327-
* Returns configured value. First checks the environment variables, then provided configuration,
328+
* Returns configured value. First checks options explicitly provided in the constructor,
329+
* then environment variables, then options from crawlee.json, and finally
328330
* falls back to the `defaultValue` argument if provided, otherwise uses the default value as described
329331
* in the above section.
330332
*/
331333
get<T extends keyof ConfigurationOptions, U extends ConfigurationOptions[T]>(key: T, defaultValue?: U): U {
332-
// prefer env vars, always iterate through the whole map as there might be duplicate env vars for the same option
334+
// 1. Check if user explicitly provided this option (highest priority)
335+
if (this.userOptions.has(key) && this.options.has(key)) {
336+
return this.options.get(key) as U;
337+
}
338+
339+
// 2. Check environment variables (second priority)
333340
let envValue: string | undefined;
334341

335342
for (const [k, v] of entries(Configuration.ENV_MAP)) {
@@ -346,13 +353,13 @@ export class Configuration {
346353
return this._castEnvValue(key, envValue) as U;
347354
}
348355

349-
// check instance level options
356+
// 3. Check options from crawlee.json (third priority)
350357
if (this.options.has(key)) {
351358
return this.options.get(key) as U;
352359
}
353360

354-
// fallback to defaults
355-
return (defaultValue ?? Configuration.DEFAULTS[key as keyof typeof Configuration.DEFAULTS] ?? envValue) as U;
361+
// 4. Fallback to defaults (lowest priority)
362+
return (defaultValue ?? Configuration.DEFAULTS[key as keyof typeof Configuration.DEFAULTS]) as U;
356363
}
357364

358365
protected _castEnvValue(key: keyof ConfigurationOptions, value: number | string | boolean) {
@@ -492,14 +499,18 @@ export class Configuration {
492499
}
493500

494501
protected buildOptions(options: ConfigurationOptions) {
495-
// try to load configuration from crawlee.json as the baseline
502+
// Track which options were explicitly provided by the user
503+
this.userOptions = new Set(Object.keys(options) as (keyof ConfigurationOptions)[]);
504+
505+
// Load crawlee.json as baseline, then merge user options on top
496506
const path = join(process.cwd(), 'crawlee.json');
497507

498508
if (pathExistsSync(path)) {
499509
try {
500510
const file = readFileSync(path);
501511
const optionsFromFileConfig = JSON.parse(file.toString());
502-
Object.assign(options, optionsFromFileConfig);
512+
// Merge file config first, then user options override
513+
options = { ...optionsFromFileConfig, ...options };
503514
} catch {
504515
// ignore
505516
}
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import { unlinkSync,writeFileSync } from 'node:fs';
2+
import { join } from 'node:path';
3+
4+
import { Configuration } from '@crawlee/core';
5+
6+
describe('Configuration priority', () => {
7+
const originalEnv = { ...process.env };
8+
const crawleeJsonPath = join(process.cwd(), 'crawlee.json');
9+
let createdCrawleeJson = false;
10+
11+
beforeEach(() => {
12+
Configuration.resetGlobalState();
13+
});
14+
15+
afterEach(() => {
16+
// Restore original environment
17+
process.env = { ...originalEnv };
18+
Configuration.resetGlobalState();
19+
20+
// Clean up crawlee.json if we created it
21+
if (createdCrawleeJson) {
22+
try {
23+
unlinkSync(crawleeJsonPath);
24+
} catch {
25+
// ignore
26+
}
27+
createdCrawleeJson = false;
28+
}
29+
});
30+
31+
describe('constructor options take precedence over env vars', () => {
32+
test('boolean option: headless', () => {
33+
process.env.CRAWLEE_HEADLESS = 'true';
34+
const config = new Configuration({ headless: false });
35+
36+
expect(config.get('headless')).toBe(false);
37+
});
38+
39+
test('string option: defaultDatasetId', () => {
40+
process.env.CRAWLEE_DEFAULT_DATASET_ID = 'env-dataset';
41+
const config = new Configuration({ defaultDatasetId: 'constructor-dataset' });
42+
43+
expect(config.get('defaultDatasetId')).toBe('constructor-dataset');
44+
});
45+
46+
test('integer option: memoryMbytes', () => {
47+
process.env.CRAWLEE_MEMORY_MBYTES = '1024';
48+
const config = new Configuration({ memoryMbytes: 2048 });
49+
50+
expect(config.get('memoryMbytes')).toBe(2048);
51+
});
52+
53+
test('integer option: persistStateIntervalMillis', () => {
54+
process.env.CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS = '30000';
55+
const config = new Configuration({ persistStateIntervalMillis: 90000 });
56+
57+
expect(config.get('persistStateIntervalMillis')).toBe(90000);
58+
});
59+
});
60+
61+
describe('env vars take precedence over defaults', () => {
62+
test('env var overrides default headless value', () => {
63+
process.env.CRAWLEE_HEADLESS = 'false';
64+
const config = new Configuration();
65+
66+
expect(config.get('headless')).toBe(false);
67+
});
68+
69+
test('env var overrides default persistStateIntervalMillis', () => {
70+
process.env.CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS = '120000';
71+
const config = new Configuration();
72+
73+
expect(config.get('persistStateIntervalMillis')).toBe(120000);
74+
});
75+
});
76+
77+
describe('defaults are used when no other value is provided', () => {
78+
test('uses default headless value', () => {
79+
delete process.env.CRAWLEE_HEADLESS;
80+
const config = new Configuration();
81+
82+
expect(config.get('headless')).toBe(true);
83+
});
84+
85+
test('uses default persistStateIntervalMillis', () => {
86+
delete process.env.CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS;
87+
const config = new Configuration();
88+
89+
expect(config.get('persistStateIntervalMillis')).toBe(60_000);
90+
});
91+
92+
test('uses default defaultDatasetId', () => {
93+
delete process.env.CRAWLEE_DEFAULT_DATASET_ID;
94+
const config = new Configuration();
95+
96+
expect(config.get('defaultDatasetId')).toBe('default');
97+
});
98+
});
99+
100+
describe('env vars are used when constructor option is not provided', () => {
101+
test('uses env var when constructor does not specify the option', () => {
102+
process.env.CRAWLEE_HEADLESS = 'false';
103+
// Constructor provides a different option, not headless
104+
const config = new Configuration({ persistStateIntervalMillis: 90000 });
105+
106+
// headless should come from env var since not in constructor
107+
expect(config.get('headless')).toBe(false);
108+
// persistStateIntervalMillis should come from constructor
109+
expect(config.get('persistStateIntervalMillis')).toBe(90000);
110+
});
111+
});
112+
113+
describe('crawlee.json integration', () => {
114+
test('constructor options override crawlee.json', () => {
115+
writeFileSync(crawleeJsonPath, JSON.stringify({ headless: true, persistStateIntervalMillis: 30000 }));
116+
createdCrawleeJson = true;
117+
118+
const config = new Configuration({ headless: false });
119+
120+
expect(config.get('headless')).toBe(false);
121+
// persistStateIntervalMillis not in constructor, should come from crawlee.json
122+
expect(config.get('persistStateIntervalMillis')).toBe(30000);
123+
});
124+
125+
test('env vars override crawlee.json when constructor option not provided', () => {
126+
writeFileSync(crawleeJsonPath, JSON.stringify({ headless: true }));
127+
createdCrawleeJson = true;
128+
process.env.CRAWLEE_HEADLESS = 'false';
129+
130+
const config = new Configuration();
131+
132+
expect(config.get('headless')).toBe(false);
133+
});
134+
135+
test('crawlee.json values are used when no env var or constructor option', () => {
136+
writeFileSync(crawleeJsonPath, JSON.stringify({ persistStateIntervalMillis: 45000 }));
137+
createdCrawleeJson = true;
138+
delete process.env.CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS;
139+
140+
const config = new Configuration();
141+
142+
expect(config.get('persistStateIntervalMillis')).toBe(45000);
143+
});
144+
145+
test('full priority chain: constructor > env > crawlee.json > defaults', () => {
146+
writeFileSync(crawleeJsonPath, JSON.stringify({
147+
headless: true,
148+
persistStateIntervalMillis: 45000,
149+
defaultDatasetId: 'json-dataset',
150+
inputKey: 'JSON_INPUT',
151+
}));
152+
createdCrawleeJson = true;
153+
154+
process.env.CRAWLEE_HEADLESS = 'false';
155+
process.env.CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS = '30000';
156+
delete process.env.CRAWLEE_DEFAULT_DATASET_ID;
157+
delete process.env.CRAWLEE_INPUT_KEY;
158+
delete process.env.CRAWLEE_PURGE_ON_START;
159+
160+
const config = new Configuration({
161+
headless: true, // Should win over env var 'false'
162+
});
163+
164+
// constructor wins over env var
165+
expect(config.get('headless')).toBe(true);
166+
// env var wins over crawlee.json (no constructor option for this)
167+
expect(config.get('persistStateIntervalMillis')).toBe(30000);
168+
// crawlee.json wins over default (no constructor or env var)
169+
expect(config.get('defaultDatasetId')).toBe('json-dataset');
170+
expect(config.get('inputKey')).toBe('JSON_INPUT');
171+
// default is used (no constructor, env var, or crawlee.json)
172+
expect(config.get('purgeOnStart')).toBe(true);
173+
});
174+
});
175+
176+
describe('edge cases', () => {
177+
test('explicitly setting option to false overrides env var true', () => {
178+
process.env.CRAWLEE_PURGE_ON_START = 'true';
179+
const config = new Configuration({ purgeOnStart: false });
180+
181+
expect(config.get('purgeOnStart')).toBe(false);
182+
});
183+
184+
test('explicitly setting option to 0 overrides env var', () => {
185+
process.env.CRAWLEE_MEMORY_MBYTES = '1024';
186+
const config = new Configuration({ memoryMbytes: 0 });
187+
188+
expect(config.get('memoryMbytes')).toBe(0);
189+
});
190+
191+
test('explicitly setting option to empty string overrides env var', () => {
192+
process.env.CRAWLEE_DEFAULT_DATASET_ID = 'env-dataset';
193+
const config = new Configuration({ defaultDatasetId: '' });
194+
195+
expect(config.get('defaultDatasetId')).toBe('');
196+
});
197+
});
198+
});

0 commit comments

Comments
 (0)