
Commit b57fa1a (parent: cf33124)

Latest iteration of eval builder with some extra tests on public sources

File tree

20 files changed (+1174, -3)

evals/.env.example (9 additions, 0 deletions)

@@ -10,6 +10,15 @@ GROQ_API_KEY=gsk-your-groq-api-key-here
 # Optional: OpenRouter API key (if using OpenRouter)
 OPENROUTER_API_KEY=your-openrouter-api-key-here
 
+# Optional: Cerebras API key (if using Cerebras models)
+CEREBRAS_API_KEY=your-cerebras-api-key-here
+
+# Optional: Anthropic API key (if using Claude models directly)
+ANTHROPIC_API_KEY=your-anthropic-api-key-here
+
+# Optional: Google API key (if using Gemini models directly)
+GOOGLE_API_KEY=your-google-api-key-here
+
 # Optional: LiteLLM configuration (if using LiteLLM)
 # LiteLLM allows you to use custom model endpoints (Ollama, vLLM, etc.)
 # Set these variables, then update config.yml to use provider: "litellm"
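The example configs below reference these keys as ${VAR} placeholders rather than embedding secrets in the file. As a minimal sketch of how a runner could resolve them, assuming the .env values are exported into the process environment (the expand_env helper is illustrative, not part of this commit):

import os
import re

_PLACEHOLDER = re.compile(r"\$\{([A-Z_][A-Z0-9_]*)\}")

def expand_env(value: str) -> str:
    # Replace each ${VAR} with os.environ["VAR"]; an unset variable
    # raises KeyError, so a bad .env fails fast instead of sending
    # the literal placeholder string as an API key.
    return _PLACEHOLDER.sub(lambda m: os.environ[m.group(1)], value)

# e.g. expand_env("${ANTHROPIC_API_KEY}") -> the exported key string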

evals/config.example.anthropic.yml (59 additions, 0 deletions)

@@ -0,0 +1,59 @@
+# Evaluation Framework Configuration
+# This configuration is shared across all evaluation runner scripts
+# Example configuration for Anthropic Claude models
+
+# API endpoint for the evaluation server
+api_endpoint: "http://localhost:8080"
+
+# Model configurations for running evaluations
+# These models are sent to the agent for processing requests
+
+main_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-sonnet-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+mini_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-haiku-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+nano_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-haiku-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+# Model configuration for judging evaluation responses
+# This model is used locally to assess the quality of agent responses
+
+judge_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-sonnet-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+# Execution settings
+
+execution:
+  # Default number of evaluations to run per script execution
+  default_limit: 20
+
+  # Timeout for API requests (seconds) - set to max for slow custom API
+  timeout: 3600
+
+  # Number of concurrent evaluation requests
+  concurrent_requests: 1
+
+  # Delay between requests (seconds)
+  request_delay: 1
+
+# Reporting settings
+
+reporting:
+  # Directory for storing evaluation reports
+  reports_dir: "reports"
+
+  # Report format
+  format: "csv"
+
+  # Include detailed judge reasoning in reports
+  include_reasoning: true
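All three example configs in this commit share this shape, so one loader can serve every runner script. A minimal sketch, assuming PyYAML is available and reusing the hypothetical expand_env helper above (load_config and MODEL_SECTIONS are illustrative names, not the commit's actual API):

import yaml  # PyYAML, assumed available in the evals environment

MODEL_SECTIONS = ("main_model", "mini_model", "nano_model", "judge_model")

def load_config(path: str = "config.yml") -> dict:
    # Parse the shared YAML config, then expand the ${VAR} API-key
    # placeholders in each model section before use.
    with open(path) as f:
        config = yaml.safe_load(f)
    for section in MODEL_SECTIONS:
        config[section]["api_key"] = expand_env(config[section]["api_key"])
    return config

cfg = load_config("config.example.anthropic.yml")
print(cfg["main_model"]["model_name"])  # claude-3-5-sonnet-20241022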

evals/config.example.cerebras.yml (59 additions, 0 deletions)

@@ -0,0 +1,59 @@
+# Evaluation Framework Configuration
+# This configuration is shared across all evaluation runner scripts
+# Example configuration for Cerebras models
+
+# API endpoint for the evaluation server
+api_endpoint: "http://localhost:8080"
+
+# Model configurations for running evaluations
+# These models are sent to the agent for processing requests
+
+main_model:
+  provider: "cerebras"
+  model_name: "zai-glm-4.6"
+  api_key: "${CEREBRAS_API_KEY}"
+
+mini_model:
+  provider: "cerebras"
+  model_name: "zai-glm-4.6"
+  api_key: "${CEREBRAS_API_KEY}"
+
+nano_model:
+  provider: "cerebras"
+  model_name: "zai-glm-4.6"
+  api_key: "${CEREBRAS_API_KEY}"
+
+# Model configuration for judging evaluation responses
+# This model is used locally to assess the quality of agent responses
+
+judge_model:
+  provider: "cerebras"
+  model_name: "llama3.1-70b"
+  api_key: "${CEREBRAS_API_KEY}"
+
+# Execution settings
+
+execution:
+  # Default number of evaluations to run per script execution
+  default_limit: 20
+
+  # Timeout for API requests (seconds) - set to max for slow custom API
+  timeout: 3600
+
+  # Number of concurrent evaluation requests
+  concurrent_requests: 1
+
+  # Delay between requests (seconds)
+  request_delay: 1
+
+# Reporting settings
+
+reporting:
+  # Directory for storing evaluation reports
+  reports_dir: "reports"
+
+  # Report format
+  format: "csv"
+
+  # Include detailed judge reasoning in reports
+  include_reasoning: true

evals/config.example.google.yml (59 additions, 0 deletions)

@@ -0,0 +1,59 @@
+# Evaluation Framework Configuration
+# This configuration is shared across all evaluation runner scripts
+# Example configuration for Google Gemini models
+
+# API endpoint for the evaluation server
+api_endpoint: "http://localhost:8080"
+
+# Model configurations for running evaluations
+# These models are sent to the agent for processing requests
+
+main_model:
+  provider: "google"
+  model_name: "gemini-2.0-flash-exp"
+  api_key: "${GOOGLE_API_KEY}"
+
+mini_model:
+  provider: "google"
+  model_name: "gemini-1.5-flash"
+  api_key: "${GOOGLE_API_KEY}"
+
+nano_model:
+  provider: "google"
+  model_name: "gemini-1.5-flash"
+  api_key: "${GOOGLE_API_KEY}"
+
+# Model configuration for judging evaluation responses
+# This model is used locally to assess the quality of agent responses
+
+judge_model:
+  provider: "google"
+  model_name: "gemini-2.0-flash-exp"
+  api_key: "${GOOGLE_API_KEY}"
+
+# Execution settings
+
+execution:
+  # Default number of evaluations to run per script execution
+  default_limit: 20
+
+  # Timeout for API requests (seconds) - set to max for slow custom API
+  timeout: 3600
+
+  # Number of concurrent evaluation requests
+  concurrent_requests: 1
+
+  # Delay between requests (seconds)
+  request_delay: 1
+
+# Reporting settings
+
+reporting:
+  # Directory for storing evaluation reports
+  reports_dir: "reports"
+
+  # Report format
+  format: "csv"
+
+  # Include detailed judge reasoning in reports
+  include_reasoning: true
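Switching providers is then just a matter of which example file becomes the live config. A hedged sketch (config.yml as the target filename comes from the .env.example comments; the CLI shape here is hypothetical):

import shutil
import sys

# Promote one of the provider examples (anthropic, cerebras, google)
# to the config.yml that the runner scripts read.
provider = sys.argv[1] if len(sys.argv) > 1 else "anthropic"
shutil.copyfile(f"evals/config.example.{provider}.yml", "evals/config.yml")
print(f"evals/config.yml now uses the {provider} example settings")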
