---
# ==============================================================================
# PLUGIN CONFIGURATION
# ==============================================================================
# Specify the plugin to evaluate. Use either plugin.path OR marketplace.path.
#
# path - Path to the plugin directory (absolute or relative to cwd)
# Example: "./my-plugin" or "/Users/you/plugins/my-plugin"
# Required: Yes (unless using marketplace mode)
#
# name - Override the plugin name from plugin.json
# Possible values: Any string, or null to use manifest name
# Default: null
# Single-plugin target. `path` and `name` are children of `plugin` and must be
# indented beneath it; at column 0 they would become root-level keys.
plugin:
  path: "./path/to/plugin"
  name: null
# ==============================================================================
# MARKETPLACE MODE (Alternative to single plugin evaluation)
# ==============================================================================
# Evaluate all plugins in a marketplace directory.
#
# path - Path to marketplace directory containing multiple plugins
# Example: "./my-marketplace" or null to disable marketplace mode
# Default: null
#
# evaluate_all - Evaluate all plugins in the marketplace
# Possible values:
# - true - Evaluate all plugins sequentially
# - false - Marketplace path ignored
# Default: false
# Marketplace mode settings. Keys are nested so that `marketplace.path` does not
# collide with `plugin.path`.
marketplace:
  path: null
  evaluate_all: false
# ==============================================================================
# SCOPE - Component Types to Evaluate
# ==============================================================================
# Control which plugin component types are tested.
#
# skills - Evaluate skill components (SKILL.md files)
# Possible values:
# - true - Generate scenarios for skills based on trigger patterns
# - false - Skip skill evaluation
# Default: true
#
# agents - Evaluate agent components (agent .md files)
# Possible values:
# - true - Test agent triggering based on descriptions
# - false - Skip agent evaluation
# Default: true
#
# commands - Evaluate command components (command .md files)
# Possible values:
# - true - Test direct command invocation and slash command detection
# - false - Skip command evaluation
# Default: true
#
# hooks - Evaluate hook components (hooks.json)
# Possible values:
# - true - Test hook triggering
# - false - Skip hook evaluation
# Default: false
#
# mcp_servers - Evaluate MCP server components (.mcp.json)
# Possible values:
# - true - Test MCP server tool registration (Phase 3 feature)
# - false - Skip MCP evaluation
# Default: false (not yet implemented)
# Component types to evaluate. `scope.mcp_servers` is a boolean toggle and is a
# different key from the root-level `mcp_servers` mapping later in this file.
scope:
  skills: true
  agents: true
  commands: true
  hooks: false
  mcp_servers: false
# ==============================================================================
# STAGE 2: GENERATION - Test Scenario Creation
# ==============================================================================
# Configure how test scenarios are generated using LLMs.
#
# model - Model for scenario generation
# Possible values:
# - "claude-opus-4-5-20251101" - Most creative, highest cost
# - "claude-sonnet-4-5-20250929" - Balanced creativity and cost (recommended)
# - "claude-sonnet-4-20250514" - Previous generation, lower cost
# - "claude-3-5-haiku-20241022" - Fastest, lowest cost
# Default: "claude-sonnet-4-5-20250929"
#
# scenarios_per_component - Base scenarios generated per component
# Possible values: 1-100 (practical limit ~20 for cost)
# - Low (1-3) - Quick smoke test
# - Medium (5-10) - Standard evaluation
# - High (15+) - Comprehensive testing
# Default: 5
#
# diversity - Fraction of the total scenarios that are unique base scenarios (0.0-1.0)
# Possible values:
# - 0.0 - All variations of 1 base scenario (less diverse)
# - 0.5 - Half base scenarios, half variations
# - 1.0 - All unique base scenarios (most diverse, no variations)
# Formula: base_count = total * diversity; variations_per_base = (total - base_count) / base_count
# Default: 0.7
#
# max_tokens - Maximum tokens per generation response
# Possible values: 1000-32000
# - 4000 - ~5 scenarios
# - 8000 - ~10 scenarios (recommended)
# - 16000 - ~20 scenarios
# Default: 8000
#
# reasoning_effort - Extended thinking budget for generation
# Possible values:
# - "none" - No extended thinking (fastest)
# - "low" - ~1K thinking tokens
# - "medium" - ~5K thinking tokens (recommended)
# - "high" - ~20K thinking tokens (slowest, most thorough)
# Default: "medium"
#
# semantic_variations - Generate semantic synonym variations for skills
# Possible values:
# - true - Create paraphrased trigger phrases (tests robustness)
# - false - Use only exact trigger patterns from skill descriptions
# Default: true
#
# requests_per_second - Rate limit API calls to prevent 429 errors (optional)
# Possible values:
# - null - No rate limiting (default)
# - 0.5 - Very conservative (1 request per 2 seconds)
# - 1 - Conservative (1 request per second)
# - 2 - Moderate (recommended for safety)
# Note: LLM-based generation now runs in parallel. If you experience rate
# limit errors, enable this setting to add delays between API calls.
# Stage 2 (scenario generation) settings. All keys are children of `generation`;
# several names (model, max_tokens, reasoning_effort) are reused by other stages,
# so the nesting is what keeps them distinct.
generation:
  model: "claude-sonnet-4-5-20250929"
  scenarios_per_component: 5
  diversity: 0.7
  max_tokens: 8000
  reasoning_effort: "medium"
  semantic_variations: true
  # requests_per_second: 2  # Uncomment to enable rate limiting
# ==============================================================================
# STAGE 3: EXECUTION - Scenario Execution via Agent SDK
# ==============================================================================
# Configure how scenarios are executed against the plugin.
#
# model - Model for executing test scenarios
# Possible values:
# - "claude-opus-4-5-20251101" - Most capable, highest cost
# - "claude-sonnet-4-5-20250929" - Best balance (recommended for complex plugins)
# - "claude-sonnet-4-20250514" - Good performance, lower cost (recommended)
# - "claude-3-5-haiku-20241022" - Fastest, suitable for simple plugins
# Default: "claude-sonnet-4-20250514"
#
# max_turns - Maximum conversation turns per scenario
# Possible values: 1-20
# - 1 - Single exchange (fast, limited testing)
# - 5 - Standard interaction depth (recommended)
# - 10+ - Extended conversations (for complex workflows)
# Default: 5
#
# timeout_ms - Timeout per scenario execution (milliseconds)
# Possible values: 5000-300000 (5s-5min)
# - 30000 - Short timeout (30s)
# - 60000 - Standard timeout (1min, recommended)
# - 120000 - Extended timeout (2min)
# Default: 60000
#
# max_budget_usd - Stop evaluation if total cost exceeds this (USD)
# Possible values: 0.1-1000
# - 1.0 - Small test run
# - 10.0 - Standard evaluation (recommended)
# - 50+ - Comprehensive testing
# Default: 10.0
#
# session_strategy - Session management strategy (replaces session_isolation)
# Possible values:
# - "batched_by_component" - Scenarios testing the same component share
# a session with /clear between them (~80% faster startup) (default)
# - "isolated" - Each scenario runs in fresh session (slower, safest)
# Trade-off: Batched mode reduces overhead but carries minimal cross-contamination
# risk between scenarios testing the same component.
# Default: "batched_by_component"
#
# session_isolation - [DEPRECATED] Use session_strategy instead
# Possible values:
# - true - Each scenario runs in fresh session (maps to "isolated")
# - false - Reuse sessions (ignored when session_strategy is set)
# Default: true
#
# permission_bypass - Automatically approve permission prompts
# Possible values:
# - true - Auto-approve all permissions (required for automation)
# - false - Manual approval (breaks automation, use for untrusted plugins)
# Default: true
#
# ⚠️ SECURITY: When true, the SDK uses allowDangerouslySkipPermissions,
# meaning plugins can execute any tool without confirmation. This is required
# for unattended evaluation but has security implications:
# - Plugins can perform any action permitted by allowed_tools/disallowed_tools
# - Use strict disallowed_tools (minimum: [Write, Edit, Bash])
# - Run in sandboxed/containerized environments for untrusted plugins
# - Set to false only for manual review scenarios (disables automation)
#
# See SECURITY.md for enterprise deployment guidance.
#
# allowed_tools - Whitelist of allowed tools (null = all allowed)
# Possible values:
# - null - Allow all tools
# - ["Read", "Glob", "Grep"] - Only allow specific tools
# Default: null
#
# disallowed_tools - Blacklist of blocked tools
# Possible values:
# - [] - Allow all tools
# - ["Write", "Edit", "Bash"] - Block file modifications (recommended for safety)
# - ["Bash"] - Block only shell commands
# Default: ["Write", "Edit", "Bash"]
#
# num_reps - Repetitions per scenario for statistical significance
# Possible values: 1-10
# - 1 - Single run per scenario (fast)
# - 3 - Triple repetition (detects flakiness)
# - 5+ - High confidence (expensive)
# Default: 1
#
# additional_plugins - Load extra plugins for cross-plugin conflict testing
# Possible values:
# - [] - No additional plugins
# - ["./other-plugin"] - Test alongside other plugin
# - ["plugin1", "plugin2"] - Test with multiple plugins
# Default: []
#
# requests_per_second - Rate limit API calls to prevent 429 errors (optional)
# Possible values:
# - null - No rate limiting (default)
# - 0.5 - Very conservative (1 request per 2 seconds)
# - 1 - Conservative (1 request per second)
# - 2 - Moderate (recommended for safety)
# - 10 - Aggressive (for high-tier API accounts)
# - 100 - Maximum allowed
# Use case: Prevent rate limit errors during high-volume evaluations
# Note: Complements retry logic - proactive vs reactive rate limiting
# Default: null (disabled)
# Stage 3 (scenario execution) settings. All keys are children of `execution`.
execution:
  model: "claude-sonnet-4-20250514"
  max_turns: 5
  timeout_ms: 60000
  max_budget_usd: 10.0
  session_strategy: "batched_by_component"
  # With permission_bypass enabled, keep the disallowed_tools list strict
  # (see the SECURITY note in the option documentation for this section).
  permission_bypass: true
  allowed_tools: null
  disallowed_tools:
    - Write
    - Edit
    - Bash
  num_reps: 1
  additional_plugins: []
  # requests_per_second: 2  # Uncomment to enable rate limiting
# ==============================================================================
# STAGE 4: EVALUATION - Detection and Judgment
# ==============================================================================
# Configure how component triggering is detected and judged.
#
# model - Model for LLM judgment (when programmatic detection insufficient)
# Possible values:
# - "claude-opus-4-5-20251101" - Most accurate judgment
# - "claude-sonnet-4-5-20250929" - Balanced accuracy and cost (recommended)
# - "claude-sonnet-4-20250514" - Good judgment, lower cost
# - "claude-3-5-haiku-20241022" - Fast judgment (may miss nuance)
# Default: "claude-sonnet-4-5-20250929"
#
# max_tokens - Maximum tokens for judgment response
# Possible values: 1000-16000
# - 2000 - Brief judgment
# - 4000 - Standard judgment (recommended)
# - 8000+ - Detailed analysis
# Default: 4000
#
# detection_mode - Primary detection strategy
# Possible values:
# - "programmatic_first" - Use tool captures first, LLM judge as fallback (recommended)
# - "llm_only" - Always use LLM judge (slower, more expensive)
# Default: "programmatic_first"
#
# reasoning_effort - Extended thinking budget for judgment
# Possible values:
# - "none" - No extended thinking
# - "low" - ~1K thinking tokens (recommended for speed)
# - "medium" - ~5K thinking tokens
# - "high" - ~20K thinking tokens (most thorough)
# Default: "low"
#
# num_samples - Multi-sample judgment for robustness
# Possible values: 1-10
# - 1 - Single judgment (fast)
# - 3 - Triple sampling (detects inconsistency)
# - 5+ - High confidence (expensive)
# Default: 1
#
# aggregate_method - How to combine multiple sample judgments
# Possible values:
# - "average" - Mean of all scores
# - "median" - Middle value (robust to outliers)
# - "consensus" - Majority vote
# Default: "average"
#
# include_citations - Link judgment highlights to specific message IDs
# Possible values:
# - true - Include evidence links in output
# - false - Judgment only, no citations
# Default: true
# Stage 4 (detection and judgment) settings. All keys are children of
# `evaluation`, which keeps model/max_tokens/reasoning_effort separate from the
# same-named keys under `generation` and `execution`.
evaluation:
  model: "claude-sonnet-4-5-20250929"
  max_tokens: 4000
  detection_mode: "programmatic_first"
  reasoning_effort: "low"
  num_samples: 1
  aggregate_method: "average"
  include_citations: true
# ==============================================================================
# OUTPUT CONFIGURATION
# ==============================================================================
# Control output format and verbosity.
#
# format - Primary output format
# Possible values:
# - "json" - JSON output (recommended for programmatic use)
# - "yaml" - YAML output (human-readable)
# - "junit-xml" - JUnit XML (for CI/CD integration)
# - "tap" - Test Anything Protocol
# Default: "json"
#
# include_cli_summary - Print summary table to console
# Possible values:
# - true - Show summary after evaluation (recommended)
# - false - Output file only, no console summary
# Default: true
#
# junit_test_suite_name - Test suite name for JUnit XML output
# Possible values: Any string
# Default: "cc-plugin-eval"
#
# sanitize_transcripts - Redact PII from saved transcript files
# Possible values:
# - true - Apply PII redaction before saving transcripts
# - false - Save transcripts as-is (default)
# Default: false
#
# sanitize_logs - Redact PII from verbose console output
# Possible values:
# - true - Apply PII redaction to console logs
# - false - Log output as-is (default)
# Default: false
#
# sanitization - Advanced sanitization settings (optional)
# enabled - Master switch for sanitization (default: false for backwards compatibility)
# ⚠️ ENTERPRISE: Enable this for PII-sensitive environments or compliance:
# - GDPR, HIPAA, SOC 2, or similar regulations
# - Plugins that handle personally identifiable information
# - Sharing evaluation results with third parties
# - Storing transcripts in systems without equivalent redaction
#
# custom_patterns - Additional regex patterns to redact
# - pattern: Regex string (will be compiled with 'g' flag)
# - replacement: Replacement string for matches
# Default patterns include: API keys, JWTs, emails, phone numbers, SSNs, credit cards
#
# See SECURITY.md for detailed sanitization configuration and enterprise guidance.
#
# pattern_safety_acknowledged - Bypass ReDoS safety validation (default: false)
# ⚠️ SECURITY WARNING: Custom regex patterns may cause ReDoS attacks.
# When set to true, patterns that trigger safety warnings will be allowed.
# Only use this if you have manually verified your patterns are safe.
#
# Before adding patterns, test them for catastrophic backtracking:
# 1. Test on https://regex101.com with large inputs
# 2. Avoid nested quantifiers: (a+)+, (a*)*, (a|a)*
# 3. Limit nesting depth to 3 or fewer levels
#
# Vulnerable example: "(a+)+b" ← Hangs on "aaaaaaaaaaaX"
# Safe alternative: "a+b" ← Matches same strings efficiently
#
# Resources:
# - https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
# - https://github.com/OWASP/CheatSheetSeries/blob/master/cheatsheets/Regular_Expression_Security_Cheat_Sheet.md
# Output format and sanitization settings. All keys are children of `output`.
output:
  format: "json"
  include_cli_summary: true
  junit_test_suite_name: "cc-plugin-eval"
  sanitize_transcripts: false
  sanitize_logs: false
  # Optional advanced sanitization, shown nested under `output` because it is
  # documented among the output options (NOTE(review): confirm placement against
  # the config schema before uncommenting).
  # sanitization:
  #   pattern_safety_acknowledged: false  # Set to true to bypass ReDoS check
  #   custom_patterns:
  #     - pattern: "SECRET-\\w+"
  #       replacement: "[REDACTED_SECRET]"
# ==============================================================================
# RESUME CONFIGURATION
# ==============================================================================
# Resume an interrupted evaluation run.
#
# run_id - Previous run ID to resume from (see results/{plugin}/{run-id}/)
# Possible values:
# - null - Start new run
# - "20250102-120000-abc123" - Resume specific run
# Default: null
#
# from_stage - Pipeline stage to resume from
# Possible values:
# - null - Auto-detect from state.json
# - "analysis" - Re-run from Stage 1
# - "generation" - Re-run from Stage 2
# - "execution" - Re-run from Stage 3
# - "evaluation" - Re-run from Stage 4
# Default: null
# Resume settings. Both keys are children of `resume`; null values start a new
# run and auto-detect the stage, per the option documentation for this section.
resume:
  run_id: null
  from_stage: null
# ==============================================================================
# FAST MODE - Run Only Failed Scenarios
# ==============================================================================
# Re-run only scenarios that failed in a previous run (regression testing).
#
# enabled - Enable fast mode
# Possible values:
# - true - Only run previously failed scenarios
# - false - Run all scenarios
# Default: false
#
# failed_run_id - Run ID to extract failed scenarios from
# Possible values:
# - null - Use most recent run
# - "20250102-120000-abc123" - Use specific run
# Default: null
# Fast mode (re-run only previously failed scenarios). Keys are nested so that
# `fast_mode.enabled` stays distinct from `conflict_detection.enabled`.
fast_mode:
  enabled: false
  failed_run_id: null
# ==============================================================================
# DRY-RUN & COST ESTIMATION
# ==============================================================================
# dry_run - Stop after scenario generation (no execution)
# Possible values:
# - true - Generate scenarios, show cost estimate, then exit
# - false - Run full pipeline
# Default: false
#
# estimate_costs - Show cost estimate before execution
# Possible values:
# - true - Display estimated cost and prompt for confirmation
# - false - Skip estimation, start execution immediately
# Default: true
# false = run the full pipeline rather than exiting after scenario generation.
dry_run: false
# true = display an estimated cost and prompt for confirmation before execution.
estimate_costs: true
# ==============================================================================
# BATCH PROCESSING (For Large Evaluations)
# ==============================================================================
# Use the Anthropic Message Batches API for cost savings on large runs.
#
# batch_threshold - Use batches when scenario count >= this number
# Possible values: 1-1000
# - 50 - Default threshold (50% cost savings)
# - 100 - Higher threshold (use batches less often)
# - 1 - Always use batches (maximum savings, slower)
# Default: 50
#
# force_synchronous - Force individual API calls (disable batching)
# Possible values:
# - true - Never use batches (faster results, higher cost)
# - false - Use batches when above threshold
# Default: false
#
# poll_interval_ms - How often to check batch status (milliseconds)
# Possible values: 1000-300000 (1s-5min)
# - 10000 - Check every 10s (frequent polling)
# - 30000 - Check every 30s (recommended)
# - 60000 - Check every 1min (patient polling)
# Default: 30000
# Batches are used once the scenario count is >= this number.
batch_threshold: 50
# false = allow batching when the threshold is reached; true = never batch.
force_synchronous: false
# Batch status polling interval in milliseconds (30000 = every 30s).
poll_interval_ms: 30000
# ==============================================================================
# FILE CHANGE REWINDING (For Write/Edit Testing)
# ==============================================================================
# rewind_file_changes - Undo file modifications after each scenario
# Possible values:
# - true - Restore files to pre-scenario state (enables Write/Edit testing)
# - false - Keep file changes (faster, but requires clean state)
# Use case: Test plugins that modify files without polluting workspace
# Default: false
# false = keep file changes made by scenarios; true = restore files to their
# pre-scenario state after each scenario.
rewind_file_changes: false
# ==============================================================================
# MCP SERVER CONFIGURATION
# ==============================================================================
# Configure Model Context Protocol server testing.
#
# skip_auth_required - Skip MCP servers that require OAuth
# Possible values:
# - true - Skip servers needing user authentication (recommended for automation)
# - false - Attempt to load all servers (may hang on OAuth prompts)
# Default: true
#
# connection_timeout_ms - Timeout for MCP server connection (milliseconds)
# Possible values: 1000-60000 (1s-1min)
# - 5000 - Quick timeout (5s)
# - 10000 - Standard timeout (10s, recommended)
# - 30000 - Patient timeout (30s)
# Default: 10000
# MCP server connection settings. This root-level mapping is separate from the
# boolean `scope.mcp_servers` toggle; its keys must be indented beneath it.
mcp_servers:
  skip_auth_required: true
  connection_timeout_ms: 10000
# ==============================================================================
# CONFLICT DETECTION
# ==============================================================================
# Detect when multiple components trigger for the same scenario.
#
# enabled - Enable conflict detection and reporting
# Possible values:
# - true - Track and report conflicts in evaluation output
# - false - Ignore conflicts
# Default: true
#
# cross_plugin - Detect conflicts with additional_plugins
# Possible values:
# - true - Check for conflicts across multiple loaded plugins
# - false - Only detect conflicts within target plugin
# Default: false
# Conflict detection settings. Keys are nested so that
# `conflict_detection.enabled` stays distinct from `fast_mode.enabled`.
conflict_detection:
  enabled: true
  cross_plugin: false
# ==============================================================================
# TUNING - Runtime-Adjustable Parameters
# ==============================================================================
# Fine-tune framework behavior without modifying source code.
# All values have sensible defaults - only override what you need.
#
# TIMEOUTS:
# plugin_load_ms - Maximum time to wait for plugin load (milliseconds)
# Possible values: 5000-120000
# Default: 30000 (30 seconds)
#
# retry_initial_ms - Initial delay before first retry (milliseconds)
# Possible values: 100-10000
# Default: 1000 (1 second)
#
# retry_max_ms - Maximum delay between retries (milliseconds)
# Possible values: 1000-120000
# Default: 30000 (30 seconds)
#
# RETRY:
# max_retries - Maximum retry attempts for transient errors
# Possible values: 0-10
# Default: 3
#
# backoff_multiplier - Exponential backoff multiplier
# Possible values: 1-5
# Default: 2
#
# jitter_factor - Random jitter to prevent thundering herd (0.0-1.0)
# Possible values: 0-1
# Default: 0.1
#
# TOKEN_ESTIMATES (for cost estimation):
# output_per_scenario - Estimated output tokens per scenario generation
# Default: 800
#
# transcript_prompt - Estimated tokens for transcript + evaluation prompt
# Default: 3000
#
# judge_output - Estimated tokens for judge response
# Default: 500
#
# input_per_turn - Estimated input tokens per execution turn
# Default: 500
#
# output_per_turn - Estimated output tokens per execution turn
# Default: 2000
#
# per_skill - Estimated input tokens per skill component
# Default: 600
#
# per_agent - Estimated input tokens per agent component
# Default: 800
#
# per_command - Estimated input tokens per command component
# Default: 300
#
# semantic_gen_max_tokens - Max tokens for semantic variation generation
# Default: 1000
#
# LIMITS:
# transcript_content_length - Max characters per transcript message for display
# Default: 500
#
# prompt_display_length - Max characters for prompt display in verbose output
# Default: 80
#
# progress_bar_width - Width of progress bar in characters
# Default: 20
#
# conflict_domain_part_min - Min length for domain parts in conflict detection
# Default: 4
#
# BATCHING:
# safety_margin - Token safety margin for batch calculations (0.5-1.0)
# Default: 0.75
# Runtime tuning parameters, grouped into the sub-sections named in the option
# documentation for this section (TIMEOUTS, RETRY, TOKEN_ESTIMATES, LIMITS,
# BATCHING). Each group is a nested mapping under `tuning`.
tuning:
  # Durations in milliseconds.
  timeouts:
    plugin_load_ms: 30000
    retry_initial_ms: 1000
    retry_max_ms: 30000
  # Retry policy for transient errors.
  retry:
    max_retries: 3
    backoff_multiplier: 2
    jitter_factor: 0.1
  # Token counts used for cost estimation.
  token_estimates:
    output_per_scenario: 800
    transcript_prompt: 3000
    judge_output: 500
    input_per_turn: 500
    output_per_turn: 2000
    per_skill: 600
    per_agent: 800
    per_command: 300
    semantic_gen_max_tokens: 1000
  # Display and matching limits.
  limits:
    transcript_content_length: 500
    prompt_display_length: 80
    progress_bar_width: 20
    conflict_domain_part_min: 4
  # Token safety margin for batch calculations.
  batching:
    safety_margin: 0.75
# ==============================================================================
# DEBUGGING & PERFORMANCE
# ==============================================================================
# debug - Enable debug-level logging
# Possible values:
# - true - Show debug logs (verbose)
# - false - Show info/warn/error only
# Default: false
#
# verbose - Detailed progress output per scenario
# Possible values:
# - true - Show per-scenario progress, transcripts, tool calls
# - false - Show only summary progress
# Default: false
#
# max_concurrent - Maximum concurrent API calls
# Possible values: 1-50
# - 1 - Sequential (slowest, safest)
# - 10 - Moderate concurrency (recommended)
# - 20+ - High concurrency (faster, risks rate limits)
# Default: 10
# false = show info/warn/error logs only (no debug-level output).
debug: false
# false = show only summary progress (no per-scenario detail).
verbose: false
# Maximum concurrent API calls.
max_concurrent: 10