Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion autotemp.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,12 @@ def benchmark(self, dataset: List[Dict[str, str]], temperature_string: str, top_
run_text = self.run(prompt, temperature_string, top_p, advanced=advanced, rounds=rounds)
best_output = self._extract_best_output_from_run(run_text)
# We do not have direct overall score; compute via judges again for consistency
score_detail = self.evaluate_output(best_output, temperature=float(self.default_temp or 0.7), top_p=float(top_p))
score_detail = self.evaluate_output(
prompt,
best_output,
temperature=float(self.default_temp or 0.7),
top_p=float(top_p),
)
per_item_scores.append(float(score_detail.get("overall", 0.0)))
if reference:
met = self._compute_external_metrics(best_output, reference)
Expand Down
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

98 changes: 98 additions & 0 deletions tests/test_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import importlib.util
import pathlib
import sys
import types
import unittest


def load_autotemp_module():
    """Import autotemp.py from the repo root with third-party deps stubbed.

    ``openai``, ``dotenv`` and ``gradio`` are replaced in ``sys.modules``
    with minimal fakes so the module under test can be imported without
    network access, API keys, or those packages being installed. The stubs
    stay installed for the remainder of the test session.
    """

    class _FakeInterface:
        def __init__(self, *args, **kwargs):
            pass

        def launch(self):
            return None

    class _FakeSlider:
        def __init__(self, *args, **kwargs):
            pass

    class _FakeCheckbox:
        def __init__(self, *args, **kwargs):
            pass

        def style(self, **kwargs):
            return self

    fakes = {
        "openai": types.SimpleNamespace(
            chat=types.SimpleNamespace(
                completions=types.SimpleNamespace(create=lambda **kwargs: None)
            ),
            api_key=None,
        ),
        "dotenv": types.SimpleNamespace(load_dotenv=lambda: None),
        "gradio": types.SimpleNamespace(
            Interface=_FakeInterface,
            Slider=_FakeSlider,
            Checkbox=_FakeCheckbox,
        ),
    }
    sys.modules.update(fakes)

    # autotemp.py lives one directory above tests/.
    repo_root = pathlib.Path(__file__).resolve().parents[1]
    spec = importlib.util.spec_from_file_location(
        "autotemp_module", repo_root / "autotemp.py"
    )
    assert spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# Load the module under test once at import time so every test case in this
# file shares the same stubbed-out autotemp module instance.
AUTOTEMP_MODULE = load_autotemp_module()
AutoTemp = AUTOTEMP_MODULE.AutoTemp


class FakeAutoTemp(AutoTemp):
    """Test double for AutoTemp that avoids real model calls.

    Provides only the attributes and methods exercised by the benchmark
    test below; model interactions are replaced with canned responses.
    """

    def __init__(self):
        # Deliberately does NOT call super().__init__() — the real
        # constructor presumably needs API configuration (TODO confirm).
        self.default_temp = 0.0
        self.model_version = "dummy-model"
        self.judges = 1
        self.usage_totals = dict.fromkeys(
            ("prompt_tokens", "completion_tokens", "total_tokens"), 0
        )
        # Arguments of the most recent evaluate_output() call, or None.
        self.last_evaluate_call = None

    def run(self, prompt, temperature_string, top_p, advanced=False, rounds=1, exploration_c=1.0):
        # Canned transcript; the line after "Best AutoTemp Output" is the
        # synthetic best answer the benchmark should extract.
        return "Best AutoTemp Output\nsynthetic benchmark answer\n\nJudges: {}"

    def evaluate_output(self, prompt_text, output, temperature, top_p):
        # Record exactly how we were called so the test can assert on it.
        self.last_evaluate_call = (prompt_text, output, temperature, top_p)
        return {"overall": 77.0}

    def estimate_cost_usd(self):
        return 0.0


class BenchmarkRegressionTests(unittest.TestCase):
    """Regression coverage for AutoTemp.benchmark's judge re-scoring path."""

    def test_benchmark_scores_best_output_with_prompt_context(self):
        fake = FakeAutoTemp()
        dataset = [{"prompt": "Explain entropy simply", "reference": ""}]

        summary = fake.benchmark(
            dataset=dataset,
            temperature_string="0.4,0.8",
            top_p=0.9,
            models=["dummy-model"],
            advanced=False,
            rounds=1,
            judges=1,
        )

        # The mean overall score must come from the stubbed judge (77.0).
        self.assertEqual(summary["dummy-model"]["mean_overall"], 77.0)
        # evaluate_output must receive the original prompt, the extracted
        # best answer, the default-temperature fallback (0.7) and the
        # caller's top_p — in that order.
        expected_call = (
            "Explain entropy simply",
            "synthetic benchmark answer",
            0.7,
            0.9,
        )
        self.assertEqual(fake.last_evaluate_call, expected_call)


# Allow running this file directly: `python tests/test_benchmark.py`.
if __name__ == "__main__":
    unittest.main()