
Commit b57fa1a (parent: cf33124)

Latest iteration of eval builder with some extra tests on public sources

File tree

20 files changed (+1174, -3)

evals/.env.example (9 additions, 0 deletions)

@@ -10,6 +10,15 @@ GROQ_API_KEY=gsk-your-groq-api-key-here
 # Optional: OpenRouter API key (if using OpenRouter)
 OPENROUTER_API_KEY=your-openrouter-api-key-here
 
+# Optional: Cerebras API key (if using Cerebras models)
+CEREBRAS_API_KEY=your-cerebras-api-key-here
+
+# Optional: Anthropic API key (if using Claude models directly)
+ANTHROPIC_API_KEY=your-anthropic-api-key-here
+
+# Optional: Google API key (if using Gemini models directly)
+GOOGLE_API_KEY=your-google-api-key-here
+
 # Optional: LiteLLM configuration (if using LiteLLM)
 # LiteLLM allows you to use custom model endpoints (Ollama, vLLM, etc.)
 # Set these variables, then update config.yml to use provider: "litellm"
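The example configs below reference these keys as ${VAR} placeholders rather than embedding secrets in the file. As a minimal sketch of how a runner could resolve them, assuming the .env values are exported into the process environment (the expand_env helper is illustrative, not part of this commit):

import os
import re

_PLACEHOLDER = re.compile(r"\$\{([A-Z_][A-Z0-9_]*)\}")

def expand_env(value: str) -> str:
    # Replace each ${VAR} with os.environ["VAR"]; an unset variable
    # raises KeyError, so a bad .env fails fast instead of sending
    # the literal placeholder string as an API key.
    return _PLACEHOLDER.sub(lambda m: os.environ[m.group(1)], value)

# e.g. expand_env("${ANTHROPIC_API_KEY}") -> the exported key string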

evals/config.example.anthropic.yml (59 additions, 0 deletions)

@@ -0,0 +1,59 @@
+# Evaluation Framework Configuration
+# This configuration is shared across all evaluation runner scripts
+# Example configuration for Anthropic Claude models
+
+# API endpoint for the evaluation server
+api_endpoint: "http://localhost:8080"
+
+# Model configurations for running evaluations
+# These models are sent to the agent for processing requests
+
+main_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-sonnet-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+mini_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-haiku-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+nano_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-haiku-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+# Model configuration for judging evaluation responses
+# This model is used locally to assess the quality of agent responses
+
+judge_model:
+  provider: "anthropic"
+  model_name: "claude-3-5-sonnet-20241022"
+  api_key: "${ANTHROPIC_API_KEY}"
+
+# Execution settings
+
+execution:
+  # Default number of evaluations to run per script execution
+  default_limit: 20
+
+  # Timeout for API requests (seconds) - set to max for slow custom API
+  timeout: 3600
+
+  # Number of concurrent evaluation requests
+  concurrent_requests: 1
+
+  # Delay between requests (seconds)
+  request_delay: 1
+
+# Reporting settings
+
+reporting:
+  # Directory for storing evaluation reports
+  reports_dir: "reports"
+
+  # Report format
+  format: "csv"
+
+  # Include detailed judge reasoning in reports
+  include_reasoning: true
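All three example configs in this commit share this shape, so one loader can serve every runner script. A minimal sketch, assuming PyYAML is available and reusing the hypothetical expand_env helper above (load_config and MODEL_SECTIONS are illustrative names, not the commit's actual API):

import yaml  # PyYAML, assumed available in the evals environment

MODEL_SECTIONS = ("main_model", "mini_model", "nano_model", "judge_model")

def load_config(path: str = "config.yml") -> dict:
    # Parse the shared YAML config, then expand the ${VAR} API-key
    # placeholders in each model section before use.
    with open(path) as f:
        config = yaml.safe_load(f)
    for section in MODEL_SECTIONS:
        config[section]["api_key"] = expand_env(config[section]["api_key"])
    return config

cfg = load_config("config.example.anthropic.yml")
print(cfg["main_model"]["model_name"])  # claude-3-5-sonnet-20241022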

evals/config.example.cerebras.yml (59 additions, 0 deletions)

@@ -0,0 +1,59 @@
+# Evaluation Framework Configuration
+# This configuration is shared across all evaluation runner scripts
+# Example configuration for Cerebras models
+
+# API endpoint for the evaluation server
+api_endpoint: "http://localhost:8080"
+
+# Model configurations for running evaluations
+# These models are sent to the agent for processing requests
+
+main_model:
+  provider: "cerebras"
+  model_name: "zai-glm-4.6"
+  api_key: "${CEREBRAS_API_KEY}"
+
+mini_model:
+  provider: "cerebras"
+  model_name: "zai-glm-4.6"
+  api_key: "${CEREBRAS_API_KEY}"
+
+nano_model:
+  provider: "cerebras"
+  model_name: "zai-glm-4.6"
+  api_key: "${CEREBRAS_API_KEY}"
+
+# Model configuration for judging evaluation responses
+# This model is used locally to assess the quality of agent responses
+
+judge_model:
+  provider: "cerebras"
+  model_name: "llama3.1-70b"
+  api_key: "${CEREBRAS_API_KEY}"
+
+# Execution settings
+
+execution:
+  # Default number of evaluations to run per script execution
+  default_limit: 20
+
+  # Timeout for API requests (seconds) - set to max for slow custom API
+  timeout: 3600
+
+  # Number of concurrent evaluation requests
+  concurrent_requests: 1
+
+  # Delay between requests (seconds)
+  request_delay: 1
+
+# Reporting settings
+
+reporting:
+  # Directory for storing evaluation reports
+  reports_dir: "reports"
+
+  # Report format
+  format: "csv"
+
+  # Include detailed judge reasoning in reports
+  include_reasoning: true

evals/config.example.google.yml (59 additions, 0 deletions)

@@ -0,0 +1,59 @@
+# Evaluation Framework Configuration
+# This configuration is shared across all evaluation runner scripts
+# Example configuration for Google Gemini models
+
+# API endpoint for the evaluation server
+api_endpoint: "http://localhost:8080"
+
+# Model configurations for running evaluations
+# These models are sent to the agent for processing requests
+
+main_model:
+  provider: "google"
+  model_name: "gemini-2.0-flash-exp"
+  api_key: "${GOOGLE_API_KEY}"
+
+mini_model:
+  provider: "google"
+  model_name: "gemini-1.5-flash"
+  api_key: "${GOOGLE_API_KEY}"
+
+nano_model:
+  provider: "google"
+  model_name: "gemini-1.5-flash"
+  api_key: "${GOOGLE_API_KEY}"
+
+# Model configuration for judging evaluation responses
+# This model is used locally to assess the quality of agent responses
+
+judge_model:
+  provider: "google"
+  model_name: "gemini-2.0-flash-exp"
+  api_key: "${GOOGLE_API_KEY}"
+
+# Execution settings
+
+execution:
+  # Default number of evaluations to run per script execution
+  default_limit: 20
+
+  # Timeout for API requests (seconds) - set to max for slow custom API
+  timeout: 3600
+
+  # Number of concurrent evaluation requests
+  concurrent_requests: 1
+
+  # Delay between requests (seconds)
+  request_delay: 1
+
+# Reporting settings
+
+reporting:
+  # Directory for storing evaluation reports
+  reports_dir: "reports"
+
+  # Report format
+  format: "csv"
+
+  # Include detailed judge reasoning in reports
+  include_reasoning: true
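Switching providers is then just a matter of which example file becomes the live config. A hedged sketch (config.yml as the target filename comes from the .env.example comments; the CLI shape here is hypothetical):

import shutil
import sys

# Promote one of the provider examples (anthropic, cerebras, google)
# to the config.yml that the runner scripts read.
provider = sys.argv[1] if len(sys.argv) > 1 else "anthropic"
shutil.copyfile(f"evals/config.example.{provider}.yml", "evals/config.yml")
print(f"evals/config.yml now uses the {provider} example settings")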
