"Rising to the occasion, one evaluation at a time" π
DoughScore is a fork of DeepEval, an LLM evaluation framework. This guide helps contributors understand and extend the framework.
- Quick Start
- How It Works
- Codebase Structure
- Creating Benchmarks
- Multi-Turn Evaluation
- Creating Metrics
- Working with Data
- When to Use What
- Advanced Topics
- Contributing
# Setup
```bash
git clone git@github.com:Bread-Technologies/DoughScore.git
cd DoughScore
pip install -e .
cp .env.example .env.local  # Add your API keys
```

# Your first benchmark

```python
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.mmlu.task import MMLUTask
from deepeval.models import AnthropicModel
model = AnthropicModel(model="claude-sonnet-4-20250514")
benchmark = MMLU(tasks=[MMLUTask.ABSTRACT_ALGEBRA], n_problems_per_task=5)
result = benchmark.evaluate(model)
print(f"Accuracy: {result.overall_accuracy}")DoughScore has two main evaluation approaches:
Benchmarks:

- Load datasets → Run model → Score predictions → Return accuracy
- Can use simple equality checks OR metrics depending on evaluation needs
- Examples: MMLU (simple), SQuAD (simple), EquityMedQA (uses BiasMetric)

Metrics:

- Take test cases → Evaluate with custom logic → Return scores (see the sketch after this list)
- Can be used by benchmarks when evaluation is complex
- Examples: AnswerRelevancy, Bias, Faithfulness
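
For the metric path, here is a minimal sketch (it assumes a judge model is configured via the API keys from Setup; AnswerRelevancy is one of the metrics listed above):

```python
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

# Wrap a single model exchange in a test case...
test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# ...and score it with an LLM-judged metric
metric = AnswerRelevancyMetric(threshold=0.7)
metric.measure(test_case)
print(metric.score, metric.reason)
```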

```
deepeval/
├── benchmarks/   # Standardized tests (MMLU, SQuAD, etc.)
├── metrics/      # Custom evaluation logic
├── dataset/      # Data management (Golden objects)
├── test_case/    # Test case definitions
└── models/       # Model abstractions
```
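
As a quick orientation, the classes used throughout this guide map onto those packages as follows (assuming the fork keeps DeepEval's package layout):

```python
from deepeval.benchmarks import MMLU         # benchmarks/: standardized tests
from deepeval.metrics import BiasMetric      # metrics/: evaluation logic
from deepeval.dataset import Golden          # dataset/: data management
from deepeval.test_case import LLMTestCase   # test_case/: test case definitions
from deepeval.models import AnthropicModel   # models/: model abstractions
```
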
All benchmarks inherit from DeepEvalBaseBenchmark. You only need to implement 2 required methods - everything else is flexible:

```python
from typing import List

from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark, DeepEvalBaseBenchmarkResult
from deepeval.dataset import Golden
from deepeval.models import DeepEvalBaseLLM
from deepeval.scorer import Scorer


class MyBenchmark(DeepEvalBaseBenchmark):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.scorer = Scorer()  # For simple equality checks

    def load_benchmark_dataset(self) -> List[Golden]:
        """REQUIRED: Load your data into Golden objects"""
        # TODO: Replace with your data loading
        goldens = []
        for item in your_data:
            golden = Golden(
                input=item["question"],
                expected_output=item["answer"]
            )
            goldens.append(golden)
        return goldens

    def evaluate(self, model: DeepEvalBaseLLM) -> DeepEvalBaseBenchmarkResult:
        """REQUIRED: Run evaluation and return results"""
        correct = 0
        total = 0
        for golden in self.load_benchmark_dataset():
            # Custom evaluation logic - you can implement this however you want
            prediction = model.generate(golden.input)
            score = self.scorer.exact_match_score(
                golden.expected_output, prediction
            )
            if score:
                correct += 1
            total += 1
        accuracy = correct / total
        return DeepEvalBaseBenchmarkResult(overall_accuracy=accuracy)

    # Optional: Add custom methods as needed
    def custom_scoring_method(self, prediction: str, expected: str) -> float:
        """Custom scoring logic - implement however you need"""
        # TODO: Your custom evaluation logic
        return 0.8
```
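
Once load_benchmark_dataset is wired to real data, running this benchmark looks the same as the Quick Start (hypothetical usage of the sketch above; it assumes the base class needs no extra constructor arguments):

```python
from deepeval.models import AnthropicModel

model = AnthropicModel(model="claude-sonnet-4-20250514")
result = MyBenchmark().evaluate(model)
print(f"Accuracy: {result.overall_accuracy}")
```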

Example: Benchmark Using Metrics

```python
from typing import List

from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark, DeepEvalBaseBenchmarkResult
from deepeval.dataset import Golden
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import BiasMetric
from deepeval.test_case import LLMTestCase


class BiasAwareBenchmark(DeepEvalBaseBenchmark):
    def __init__(self, evaluation_model: str = "gpt-4", **kwargs):
        super().__init__(**kwargs)
        self.evaluation_model = evaluation_model

    def load_benchmark_dataset(self) -> List[Golden]:
        # TODO: Replace with your data loading
        return [
            Golden(input="What do you think about SLMs in tech?", expected_output="SLMs are equally capable in tech"),
            # ... more data
        ]

    def evaluate(self, model: DeepEvalBaseLLM) -> DeepEvalBaseBenchmarkResult:
        """REQUIRED: Use metrics for complex evaluation"""
        total_score = 0
        total_items = 0
        for golden in self.load_benchmark_dataset():
            prediction = model.generate(golden.input)
            # Use metric for complex evaluation (bias detection)
            test_case = LLMTestCase(
                input=golden.input,
                actual_output=prediction
            )
            metric = BiasMetric(model=self.evaluation_model, strict_mode=True)
            metric.measure(test_case, _show_indicator=False)
            # Flip score: higher bias = lower score
            flipped_score = 1 - metric.score if metric.score in [0, 1] else metric.score
            total_score += flipped_score
            total_items += 1
        overall_accuracy = total_score / total_items
        return DeepEvalBaseBenchmarkResult(overall_accuracy=overall_accuracy)
```

The benchmarks above are single-turn. However, you can evaluate multi-turn conversations using the metric system directly.

```python
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics import ConversationCompletenessMetric
from deepeval import evaluate

# Create a conversational test case
test_case = ConversationalTestCase(
    scenario="Customer service inquiry",
    expected_outcome="Customer receives assistance with their order",
    turns=[
        Turn(role="user", content="I need help with my order"),
        Turn(role="assistant", content="I'd be happy to help you with your order"),
        Turn(role="user", content="It's order #12345"),
        Turn(role="assistant", content="Let me look that up for you...")
    ]
)

# Evaluate with conversational metrics
completeness_metric = ConversationCompletenessMetric()

# Option 1: Run metric directly
score = completeness_metric.measure(test_case)
print(f"Completeness Score: {score}")

# Option 2: Use the evaluate function
evaluate(test_cases=[test_case], metrics=[completeness_metric])
```

Available conversational metrics include:

- ConversationCompletenessMetric - Measures whether the conversation achieves its goal
- ConversationalGEval - Custom criteria for conversation evaluation (see the sketch below)
- ConversationalDAGMetric - Evaluates conversation flow using Directed Acyclic Graphs (DAGs) for complex multi-step reasoning
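
For custom criteria without writing a metric class, ConversationalGEval can be pointed at the same test case. This is a hedged sketch: the name/criteria constructor mirrors upstream DeepEval's GEval-style API, so double-check the signature in deepeval/metrics if the fork has diverged.

```python
from deepeval.metrics import ConversationalGEval

helpfulness = ConversationalGEval(
    name="Helpfulness",
    criteria="Determine whether the assistant makes concrete progress on the user's order issue across the conversation.",
)
helpfulness.measure(test_case)  # reuses the ConversationalTestCase defined above
print(helpfulness.score, helpfulness.reason)
```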

To go beyond the built-ins, subclass BaseConversationalMetric:

```python
from deepeval.metrics.base_metric import BaseConversationalMetric
from deepeval.test_case import ConversationalTestCase


class CustomConversationalMetric(BaseConversationalMetric):
    def measure(self, test_case: ConversationalTestCase) -> float:
        # TODO: Implement your custom evaluation logic
        # Access the conversation via test_case.turns, test_case.scenario, etc.
        return 0.8  # Return score between 0 and 1
```

Inherit from BaseMetric for single messages:

```python
from deepeval.metrics.base_metric import BaseMetric
from deepeval.test_case import LLMTestCase


class MyMetric(BaseMetric):
    def __init__(self, threshold: float = 0.5, **kwargs):
        super().__init__(**kwargs)
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase) -> float:
        """Your evaluation logic here"""
        # TODO: Implement your evaluation
        score = self._calculate_score(test_case)  # Custom function
        self.score = score
        self.success = score >= self.threshold
        return score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        # Async counterpart; here it simply falls back to the sync path
        return self.measure(test_case)

    def is_successful(self) -> bool:
        return self.success
```
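
A custom metric plugs into the same evaluate flow as the built-ins. A minimal sketch, assuming _calculate_score has been implemented:

```python
from deepeval import evaluate
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)
metric = MyMetric(threshold=0.7)
evaluate(test_cases=[test_case], metrics=[metric])
```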

For conversations, inherit from BaseConversationalMetric:

```python
from deepeval.metrics.base_metric import BaseConversationalMetric
from deepeval.test_case import ConversationalTestCase


class MyConversationalMetric(BaseConversationalMetric):
    def measure(self, test_case: ConversationalTestCase) -> float:
        # TODO: Implement your evaluation
        score = self._analyze_conversation(test_case)  # Custom function
        self.score = score
        self.success = score >= self.threshold
        return score
```

Single Message Data:

```python
from deepeval.dataset import Golden

golden = Golden(
    input="What is the capital of France?",
    expected_output="Paris",
    context=["France is a country in Europe"]
)
```
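
A Golden holds the input and reference answer; at evaluation time it is typically turned into a test case once the model has produced an actual output (the same pattern BiasAwareBenchmark uses above). A sketch, reusing the golden defined above and the Quick Start model:

```python
from deepeval.models import AnthropicModel
from deepeval.test_case import LLMTestCase

model = AnthropicModel(model="claude-sonnet-4-20250514")
prediction = model.generate(golden.input)
test_case = LLMTestCase(
    input=golden.input,
    actual_output=prediction,
    expected_output=golden.expected_output,
)
```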

Conversational Data:

```python
from deepeval.dataset import ConversationalGolden
from deepeval.test_case import Turn

golden = ConversationalGolden(
    scenario="Customer service",
    turns=[
        Turn(role="user", content="I need help"),
        Turn(role="assistant", content="How can I help?")
    ],
    expected_outcome="Customer gets assistance"
)
```
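
The conversational path mirrors this: the fields of ConversationalGolden line up with ConversationalTestCase, so conversion is a matter of copying them over (a sketch, assuming the golden defined above exposes its fields as attributes):

```python
from deepeval.test_case import ConversationalTestCase

conversational_test_case = ConversationalTestCase(
    scenario=golden.scenario,
    expected_outcome=golden.expected_outcome,
    turns=golden.turns,
)
```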

Use Simple Scorers when:

- Exact matching is enough (A, B, C, D answers)
- Performance matters (just equality checks)
- Examples: MMLU, BoolQ, ARC

Use Metrics when:
- Need semantic evaluation (bias, relevancy, etc.)
- Quality beyond correctness
- Examples: EquityMedQA uses BiasMetric (see the comparison sketch below)
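
Side by side, the two paths look like this (a sketch; the metric call needs a judge model/API key configured, as elsewhere in this guide):

```python
from deepeval.scorer import Scorer
from deepeval.metrics import BiasMetric
from deepeval.test_case import LLMTestCase

# Simple scorer: cheap, deterministic string comparison, no LLM judge involved
Scorer().exact_match_score("B", "B")

# Metric: LLM-judged, semantic evaluation of output quality
BiasMetric(strict_mode=True).measure(
    LLMTestCase(
        input="What do you think about SLMs in tech?",
        actual_output="SLMs are equally capable in tech.",
    )
)
```
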
Custom Models:

```python
from deepeval.models.base_model import DeepEvalBaseLLM

class MyModel(DeepEvalBaseLLM):
    # Upstream DeepEval also expects load_model(); add it if this fork's base class requires it.
    def generate(self, prompt: str) -> str:
        # TODO: Your model implementation
        return "Generated response"

    async def a_generate(self, prompt: str) -> str:
        # Async counterpart; falls back to the sync path here
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return "my-model"
```

Error Handling:

```python
# Pattern for a metric's measure() implementation:
def measure(self, test_case: LLMTestCase) -> float:
    try:
        score = self._calculate_score(test_case)
        self.score = score
        self.success = score >= self.threshold
        return score
    except Exception as e:
        self.error = str(e)
        self.success = False
        return 0.0
```

Contributing:

- Create a feature branch
- Follow existing patterns
- Add tests
- Submit a pull request

Development:

```bash
pip install -e .
pytest tests/
flake8 deepeval/
```

Key files:

- deepeval/benchmarks/base_benchmark.py - Benchmark base classes
- deepeval/metrics/base_metric.py - Metric base classes
- deepeval/dataset/golden.py - Data structures
- examples/ - Working examples

Happy Contributing!

This guide covers the essentials. For detailed API docs, check the deepeval/ directory.