From 946fdecb620e088d7cc64c2fc77bea1e23aa36e2 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 11 Sep 2025 10:43:01 -0700 Subject: [PATCH 01/23] try reasoning effort high --- .../test_pydantic_complex_queries_responses.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py b/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py index d94e98fa..5a425647 100644 --- a/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py +++ b/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py @@ -19,8 +19,11 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: model_name = config.completion_params["model"] - model_settings = OpenAIResponsesModelSettings() - model = OpenAIResponsesModel(model_name) + reasoning = config.completion_params["reasoning"] + settings = OpenAIResponsesModelSettings( + openai_reasoning_effort=reasoning, + ) + model = OpenAIResponsesModel(model_name, settings=settings) return setup_agent(model) @@ -33,7 +36,8 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: input_rows=[collect_dataset()], completion_params=[ { - "model": "gpt-4o", + "model": "gpt-5", + "reasoning": "high", }, ], rollout_processor=PydanticAgentRolloutProcessor(agent_factory), From dca946ebcb5dbb02eb548ab17b542dd29ab3c429 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 11 Sep 2025 12:41:50 -0700 Subject: [PATCH 02/23] task 7 --- tests/chinook/dataset/task_7/ground_truth.md | 29 ++++++++++++++++++++ tests/chinook/dataset/task_7/task.txt | 5 ++++ 2 files changed, 34 insertions(+) create mode 100644 tests/chinook/dataset/task_7/ground_truth.md create mode 100644 tests/chinook/dataset/task_7/task.txt diff --git a/tests/chinook/dataset/task_7/ground_truth.md b/tests/chinook/dataset/task_7/ground_truth.md new file mode 100644 index 00000000..4c2c2972 --- /dev/null +++ b/tests/chinook/dataset/task_7/ground_truth.md @@ -0,0 +1,29 @@ +| 
customer_id | first_name | last_name | last_purchase | lifetime_value | distinct_genres | related_tracks_available | customer_status | segment | +| ----------- | ---------- | ----------- | ------------------- | -------------- | --------------- | ------------------------ | --------------- | ------------------ | +| 26 | Richard | Cunningham | 2025-04-05 00:00:00 | 474.62 | 7 | 2727 | Needs Attention | High Value Diverse | +| 57 | Luis | Rojas | 2024-10-14 00:00:00 | 415.62 | 12 | 2962 | At Risk | High Value Diverse | +| 37 | Fynn | Zimmermann | 2025-06-03 00:00:00 | 388.62 | 10 | 2911 | Needs Attention | High Value Diverse | +| 5 | František | Wichterlová | 2025-05-06 00:00:00 | 376.62 | 8 | 2879 | Needs Attention | High Value Diverse | +| 43 | Isabelle | Mercier | 2025-06-06 00:00:00 | 376.62 | 9 | 2057 | Needs Attention | High Value Diverse | +| 28 | Julia | Barnett | 2025-05-19 00:00:00 | 370.62 | 6 | 2587 | Needs Attention | High Value Diverse | +| 17 | Jack | Smith | 2024-07-31 00:00:00 | 352.62 | 10 | 2862 | At Risk | High Value Diverse | +| 34 | João | Fernandes | 2024-10-01 00:00:00 | 352.62 | 10 | 2830 | At Risk | High Value Diverse | +| 15 | Jennifer | Peterson | 2024-12-15 00:00:00 | 343.62 | 8 | 2384 | At Risk | High Value Diverse | +| 51 | Joakim | Johansson | 2024-12-02 00:00:00 | 340.62 | 7 | 2795 | At Risk | High Value Diverse | +| 40 | Dominique | Lefebvre | 2024-08-13 00:00:00 | 336.62 | 7 | 2753 | At Risk | High Value Diverse | +| 19 | Tim | Goyer | 2024-09-13 00:00:00 | 335.62 | 9 | 2924 | At Risk | High Value Diverse | +| 53 | Phil | Hughes | 2025-01-15 00:00:00 | 334.62 | 6 | 2735 | At Risk | High Value Diverse | +| 55 | Mark | Taylor | 2024-08-31 00:00:00 | 334.62 | 6 | 2379 | At Risk | High Value Diverse | +| 2 | Leonie | Köhler | 2024-07-13 00:00:00 | 334.62 | 7 | 2716 | At Risk | High Value Diverse | +| 9 | Kara | Nielsen | 2025-02-02 00:00:00 | 334.62 | 5 | 2592 | At Risk | High Value Diverse | +| 11 | Alexandre | Rocha | 2025-03-18 
00:00:00 | 334.62 | 6 | 2637 | Needs Attention | High Value Diverse | +| 13 | Fernanda | Ramos | 2024-11-01 00:00:00 | 334.62 | 7 | 2476 | At Risk | High Value Diverse | +| 14 | Mark | Philips | 2025-05-11 00:00:00 | 334.62 | 10 | 2889 | Needs Attention | High Value Diverse | +| 30 | Edward | Francis | 2025-01-02 00:00:00 | 334.62 | 6 | 2188 | At Risk | High Value Diverse | +| 32 | Aaron | Mitchell | 2025-02-15 00:00:00 | 334.62 | 7 | 2509 | At Risk | High Value Diverse | +| 36 | Hannah | Schneider | 2024-11-14 00:00:00 | 334.62 | 6 | 2653 | At Risk | High Value Diverse | +| 38 | Niklas | Schröder | 2024-06-30 00:00:00 | 334.62 | 7 | 2795 | At Risk | High Value Diverse | +| 47 | Lucas | Mancini | 2025-03-05 00:00:00 | 334.62 | 6 | 2699 | At Risk | High Value Diverse | +| 49 | Stanisław | Wójcik | 2025-04-18 00:00:00 | 334.62 | 5 | 2381 | Needs Attention | High Value Diverse | +| 52 | Emma | Jones | 2025-06-11 00:00:00 | 334.62 | 5 | 2602 | Needs Attention | High Value Diverse | +| 59 | Puja | Srivastava | 2024-05-30 00:00:00 | 331.66 | 7 | 2763 | At Risk | High Value Diverse | diff --git a/tests/chinook/dataset/task_7/task.txt b/tests/chinook/dataset/task_7/task.txt new file mode 100644 index 00000000..47521597 --- /dev/null +++ b/tests/chinook/dataset/task_7/task.txt @@ -0,0 +1,5 @@ +Find high-value customers who haven't purchased recently and their potential +re-engagement opportunities. Identify customers who haven't purchased in 3+ +months, have lifetime value >$25, and have >10 related tracks available. Include +customer status (At Risk/Needs Attention/Active) and segment (High Value +Diverse/Medium Value/Low Value). 
From fe9ed080067851ced013d272dae0bbf7fc01470a Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 11 Sep 2025 13:14:31 -0700 Subject: [PATCH 03/23] tasks 8-12 --- tests/chinook/dataset/task_11/ground_truth.md | 575 ++++++++++++++++++ tests/chinook/dataset/task_11/task.txt | 3 + tests/chinook/dataset/task_12/ground_truth.md | 52 ++ tests/chinook/dataset/task_12/task.txt | 3 + tests/chinook/dataset/task_8/ground_truth.md | 334 ++++++++++ tests/chinook/dataset/task_8/task.txt | 3 + tests/chinook/dataset/task_9/ground_truth.md | 61 ++ tests/chinook/dataset/task_9/task.txt | 3 + .../chinook/dataset/track_10/ground_truth.md | 349 +++++++++++ tests/chinook/dataset/track_10/task.txt | 3 + 10 files changed, 1386 insertions(+) create mode 100644 tests/chinook/dataset/task_11/ground_truth.md create mode 100644 tests/chinook/dataset/task_11/task.txt create mode 100644 tests/chinook/dataset/task_12/ground_truth.md create mode 100644 tests/chinook/dataset/task_12/task.txt create mode 100644 tests/chinook/dataset/task_8/ground_truth.md create mode 100644 tests/chinook/dataset/task_8/task.txt create mode 100644 tests/chinook/dataset/task_9/ground_truth.md create mode 100644 tests/chinook/dataset/task_9/task.txt create mode 100644 tests/chinook/dataset/track_10/ground_truth.md create mode 100644 tests/chinook/dataset/track_10/task.txt diff --git a/tests/chinook/dataset/task_11/ground_truth.md b/tests/chinook/dataset/task_11/ground_truth.md new file mode 100644 index 00000000..994f8602 --- /dev/null +++ b/tests/chinook/dataset/task_11/ground_truth.md @@ -0,0 +1,575 @@ +| country | genre | month | average_purchase_value | customer_count | annual_avg_purchase_value | seasonal_trend_indicator | +| -------------- | ------------------ | ----- | ---------------------- | -------------- | ------------------------- | ------------------------ | +| Argentina | Alternative & Punk | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Alternative & Punk | 9 | 0.99 | 1 | 
0.99000000000000000000 | Normal | +| Argentina | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Easy Listening | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Jazz | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Latin | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Latin | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Metal | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Rock | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Rock | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Argentina | Soundtrack | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Blues | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Heavy Metal | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Latin | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Metal | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Metal | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Reggae | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Rock | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Rock | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Australia | Rock | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Classical | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Drama | 1 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Austria | Jazz | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Latin | 4 | 0.99 | 1 | 0.99000000000000000000 | 
Normal | +| Austria | Metal | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Pop | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | R&B/Soul | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | R&B/Soul | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Rock | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Rock | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Austria | TV Shows | 1 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Belgium | Alternative & Punk | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | Alternative & Punk | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | Metal | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | R&B/Soul | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | Rock | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | Rock | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Belgium | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Alternative & Punk | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Alternative & Punk | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Alternative & Punk | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Blues | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Blues | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Classical | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Classical | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Hip Hop/Rap | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Latin | 3 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| 
Brazil | Latin | 4 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Latin | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Latin | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Latin | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Latin | 12 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Metal | 1 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Brazil | Metal | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Metal | 9 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Pop | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Pop | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | R&B/Soul | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | R&B/Soul | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Reggae | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Reggae | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Reggae | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 5 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 6 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 11 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Brazil | Rock | 12 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Brazil | Sci Fi & Fantasy | 3 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Brazil | Soundtrack | 8 | 0.99 | 1 | 
0.99000000000000000000 | Normal | +| Brazil | Soundtrack | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Brazil | World | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Alternative & Punk | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Alternative & Punk | 2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Alternative & Punk | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Alternative & Punk | 4 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Canada | Alternative & Punk | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Alternative & Punk | 9 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Blues | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Blues | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Bossa Nova | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Bossa Nova | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Classical | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Classical | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Drama | 3 | 1.99 | 2 | 1.99000000000000000000 | Normal | +| Canada | Electronica/Dance | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Electronica/Dance | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Hip Hop/Rap | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Hip Hop/Rap | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Jazz | 1 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Canada | Jazz | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Jazz | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Jazz | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Jazz | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Latin | 1 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Canada | Latin | 
2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Latin | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Latin | 4 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Latin | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Latin | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Latin | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Latin | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Latin | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Latin | 12 | 0.99 | 4 | 0.99000000000000000000 | Normal | +| Canada | Metal | 1 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Metal | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Metal | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Metal | 6 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Canada | Metal | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Metal | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Metal | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Metal | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | R&B/Soul | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | R&B/Soul | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Reggae | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Reggae | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Rock | 1 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Rock | 2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Rock | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Rock | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Rock | 6 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Canada | Rock | 7 | 0.99 | 5 | 0.99000000000000000000 | Normal | +| Canada 
| Rock | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Rock | 9 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Canada | Rock | 10 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Canada | Rock | 11 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | Rock | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | Rock And Roll | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | TV Shows | 3 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Canada | World | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Canada | World | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Canada | World | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Alternative & Punk | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Blues | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Classical | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Drama | 1 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Chile | Easy Listening | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Latin | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Latin | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Metal | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Metal | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | Sci Fi & Fantasy | 1 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Chile | Science Fiction | 1 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Chile | Soundtrack | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Chile | TV Shows | 1 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Czech Republic | Alternative & Punk | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Alternative & Punk | 6 | 0.99 | 1 | 
0.99000000000000000000 | Normal | +| Czech Republic | Alternative & Punk | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Alternative & Punk | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Blues | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Drama | 9 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Czech Republic | Drama | 11 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Czech Republic | Electronica/Dance | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Jazz | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Latin | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Latin | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Latin | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Metal | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Metal | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Pop | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | R&B/Soul | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Rock | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Czech Republic | Rock | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Czech Republic | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Rock | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Czech Republic | Science Fiction | 11 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Czech Republic | TV Shows | 9 | 1.99 | 1 | 1.9900000000000000 | Normal | +| Czech Republic | TV Shows | 11 | 1.99 | 1 | 1.9900000000000000 | Normal | +| Denmark | Alternative & Punk | 6 | 0.99 | 1 | 
0.99000000000000000000 | Normal | +| Denmark | Latin | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Latin | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Metal | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Pop | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Rock | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Rock | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Denmark | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Jazz | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Latin | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Latin | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Reggae | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Rock | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Finland | Sci Fi & Fantasy | 6 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| France | Alternative | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Alternative & Punk | 1 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Alternative & Punk | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Alternative & Punk | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Alternative & Punk | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Alternative & Punk | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Alternative & Punk | 11 | 0.99 | 4 | 
0.99000000000000000000 | Normal | +| France | Alternative & Punk | 12 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Blues | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Bossa Nova | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Classical | 4 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| France | Classical | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Drama | 6 | 1.99 | 2 | 1.9900000000000000 | Normal | +| France | Drama | 10 | 1.99 | 1 | 1.9900000000000000 | Normal | +| France | Electronica/Dance | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Hip Hop/Rap | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Jazz | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Jazz | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Jazz | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Latin | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Latin | 2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Latin | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Latin | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Metal | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Metal | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Metal | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Metal | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Metal | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Metal | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Pop | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Reggae | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Rock | 1 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Rock | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Rock | 5 | 0.99 | 2 | 0.99000000000000000000 | 
Normal | +| France | Rock | 6 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Rock | 7 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| France | Rock | 8 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| France | Rock | 9 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Rock | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| France | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Rock | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Rock And Roll | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Sci Fi & Fantasy | 6 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| France | Soundtrack | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | Soundtrack | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| France | TV Shows | 10 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Germany | Alternative | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Alternative & Punk | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Alternative & Punk | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Alternative & Punk | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Alternative & Punk | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Alternative & Punk | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Blues | 6 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Blues | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Blues | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Drama | 4 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Germany | Easy Listening | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Heavy Metal | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Jazz | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Jazz | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Latin | 1 | 0.99 | 1 | 0.99000000000000000000 | 
Normal | +| Germany | Latin | 2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Latin | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Latin | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Latin | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Latin | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Metal | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Metal | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Metal | 6 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Metal | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Metal | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Metal | 11 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Pop | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Rock | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Rock | 2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Rock | 6 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| Germany | Rock | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Rock | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Rock | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Germany | Science Fiction | 4 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Germany | Soundtrack | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Soundtrack | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | Soundtrack | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Germany | TV Shows | 4 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Hungary | Alternative & Punk | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Alternative & Punk | 11 | 0.99 | 1 | 
0.99000000000000000000 | Normal | +| Hungary | Blues | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Comedy | 2 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Hungary | Drama | 2 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Hungary | Electronica/Dance | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Latin | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Metal | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Metal | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | R&B/Soul | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Rock | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Rock | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Hungary | Sci Fi & Fantasy | 2 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Hungary | TV Shows | 2 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| India | Alternative & Punk | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Alternative & Punk | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Alternative & Punk | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Blues | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Classical | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Jazz | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Jazz | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Jazz | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Latin | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Latin | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Metal | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Metal | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | R&B/Soul | 7 | 
0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Rock | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Rock | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Rock | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| India | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| India | Sci Fi & Fantasy | 2 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| India | TV Shows | 12 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Ireland | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Blues | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Drama | 4 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Ireland | Jazz | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Jazz | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Latin | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Latin | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | Rock | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Ireland | TV Shows | 4 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Italy | Alternative & Punk | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Alternative & Punk | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Alternative & Punk | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Blues | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Classical | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Latin | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Rock | 3 | 
0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Italy | Rock | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Alternative & Punk | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Alternative & Punk | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Blues | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Latin | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Pop | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Netherlands | Sci Fi & Fantasy | 6 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Norway | Alternative | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Alternative & Punk | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Classical | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Drama | 6 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Norway | Electronica/Dance | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Latin | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Rock | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Norway | World | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Alternative & Punk | 5 | 0.99 | 1 | 
0.99000000000000000000 | Normal | +| Poland | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Blues | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Jazz | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Latin | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Latin | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Rock | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Poland | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Alternative & Punk | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Alternative & Punk | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Alternative & Punk | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Blues | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Electronica/Dance | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Heavy Metal | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Jazz | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Latin | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Latin | 2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| Portugal | Latin | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Latin | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Latin | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Metal | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Metal | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Metal | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Metal | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | 
+| Portugal | Metal | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Pop | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | R&B/Soul | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Rock | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Rock | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Rock | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Portugal | Sci Fi & Fantasy | 10 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Portugal | TV Shows | 10 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Portugal | World | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Alternative & Punk | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Alternative & Punk | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Hip Hop/Rap | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Jazz | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Latin | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Latin | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Latin | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Metal | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Reggae | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Rock | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Rock | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Spain | Rock | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | 
Alternative & Punk | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Alternative & Punk | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Jazz | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Latin | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Latin | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Metal | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Metal | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Metal | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | Rock | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| Sweden | TV Shows | 1 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| Sweden | World | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Alternative & Punk | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Alternative & Punk | 9 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| United Kingdom | Alternative & Punk | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Hip Hop/Rap | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Jazz | 1 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| United Kingdom | Jazz | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Latin | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Latin | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Latin | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| United Kingdom | Latin | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Latin | 11 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| United Kingdom | Metal | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Metal | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| United Kingdom | Metal | 7 | 0.99 | 
1 | 0.99000000000000000000 | Normal | +| United Kingdom | Metal | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Metal | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Pop | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | R&B/Soul | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Reggae | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 6 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | Rock | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| United Kingdom | World | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Alternative | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 3 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 6 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Alternative & Punk | 11 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Alternative & 
Punk | 12 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Blues | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Blues | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Blues | 8 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Blues | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Blues | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Bossa Nova | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Bossa Nova | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Classical | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Classical | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Comedy | 3 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | Comedy | 5 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | Comedy | 9 | 1.99 | 2 | 1.99000000000000000000 | Normal | +| USA | Drama | 8 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | Easy Listening | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Heavy Metal | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Heavy Metal | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Hip Hop/Rap | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Hip Hop/Rap | 9 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Jazz | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Jazz | 2 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Jazz | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Jazz | 4 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Jazz | 6 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Jazz | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Jazz | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Jazz | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Latin | 1 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Latin | 2 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Latin | 3 | 0.99 | 1 | 
0.99000000000000000000 | Normal | +| USA | Latin | 4 | 0.99 | 9 | 0.99000000000000000000 | Normal | +| USA | Latin | 5 | 0.99 | 5 | 0.99000000000000000000 | Normal | +| USA | Latin | 6 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Latin | 7 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Latin | 8 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Latin | 9 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Latin | 10 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Latin | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Latin | 12 | 0.99 | 5 | 0.99000000000000000000 | Normal | +| USA | Metal | 1 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Metal | 3 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Metal | 4 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Metal | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Metal | 6 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Metal | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Metal | 9 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Metal | 10 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Metal | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Metal | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Pop | 7 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Pop | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | R&B/Soul | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | R&B/Soul | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | R&B/Soul | 7 | 0.99 | 4 | 0.99000000000000000000 | Normal | +| USA | Reggae | 5 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Reggae | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Rock | 1 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Rock | 2 | 0.99 | 5 | 0.99000000000000000000 | Normal | +| USA | Rock | 3 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Rock | 4 | 0.99 | 3 | 
0.99000000000000000000 | Normal | +| USA | Rock | 5 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Rock | 6 | 0.99 | 6 | 0.99000000000000000000 | Normal | +| USA | Rock | 8 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Rock | 9 | 0.99 | 2 | 0.99000000000000000000 | Normal | +| USA | Rock | 10 | 0.99 | 3 | 0.99000000000000000000 | Normal | +| USA | Rock | 11 | 0.99 | 4 | 0.99000000000000000000 | Normal | +| USA | Rock | 12 | 0.99 | 7 | 0.99000000000000000000 | Normal | +| USA | Rock And Roll | 1 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Rock And Roll | 11 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Sci Fi & Fantasy | 8 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | Sci Fi & Fantasy | 9 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | Science Fiction | 7 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | Soundtrack | 4 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Soundtrack | 8 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | Soundtrack | 12 | 0.99 | 1 | 0.99000000000000000000 | Normal | +| USA | TV Shows | 3 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | TV Shows | 5 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | TV Shows | 7 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | TV Shows | 8 | 1.99 | 1 | 1.99000000000000000000 | Normal | +| USA | TV Shows | 9 | 1.99 | 3 | 1.99000000000000000000 | Normal | diff --git a/tests/chinook/dataset/task_11/task.txt b/tests/chinook/dataset/task_11/task.txt new file mode 100644 index 00000000..a40fb591 --- /dev/null +++ b/tests/chinook/dataset/task_11/task.txt @@ -0,0 +1,3 @@ +Analyze seasonal purchasing trends by customer country and genre preference. +Show country, genre, month, average purchase value, customer count, and seasonal +trend indicator. 
diff --git a/tests/chinook/dataset/task_12/ground_truth.md b/tests/chinook/dataset/task_12/ground_truth.md new file mode 100644 index 00000000..526ac03c --- /dev/null +++ b/tests/chinook/dataset/task_12/ground_truth.md @@ -0,0 +1,52 @@ +| customer_id | customer_name | genre_diversity_score | avg_track_price | total_spending | diversity_to_spending_ratio | +| ----------- | --------------------- | --------------------- | ---------------------- | -------------- | --------------------------- | +| 57 | Luis Rojas | 12 | 1.2268421052631579 | 46.62 | 0.25740025740025740026 | +| 45 | Ladislav Kovács | 11 | 1.2005263157894737 | 45.62 | 0.24112231477422183253 | +| 14 | Mark Philips | 10 | 0.99000000000000000000 | 37.62 | 0.26581605528973950027 | +| 34 | João Fernandes | 10 | 1.0426315789473684 | 39.62 | 0.25239777889954568400 | +| 3 | François Tremblay | 10 | 1.0426315789473684 | 39.62 | 0.25239777889954568400 | +| 17 | Jack Smith | 10 | 1.0426315789473684 | 39.62 | 0.25239777889954568400 | +| 37 | Fynn Zimmermann | 10 | 1.1478947368421053 | 43.62 | 0.22925263640531866116 | +| 24 | Frank Ralston | 10 | 1.1478947368421053 | 43.62 | 0.22925263640531866116 | +| 27 | Patrick Gray | 9 | 0.99000000000000000000 | 37.62 | 0.23923444976076555024 | +| 23 | John Gordon | 9 | 0.99000000000000000000 | 37.62 | 0.23923444976076555024 | +| 21 | Kathy Chase | 9 | 0.99000000000000000000 | 37.62 | 0.23923444976076555024 | +| 41 | Marc Dubois | 9 | 0.99000000000000000000 | 37.62 | 0.23923444976076555024 | +| 19 | Tim Goyer | 9 | 1.01631578947368421053 | 38.62 | 0.23303987571206628690 | +| 43 | Isabelle Mercier | 9 | 1.0689473684210526 | 40.62 | 0.22156573116691285081 | +| 7 | Astrid Gruber | 9 | 1.1215789473684211 | 42.62 | 0.21116846550915063351 | +| 6 | Helena Holý | 9 | 1.3057894736842105 | 49.62 | 0.18137847642079806530 | +| 54 | Steve Murray | 8 | 0.99000000000000000000 | 37.62 | 0.21265284423179160021 | +| 15 | Jennifer Peterson | 8 | 1.01631578947368421053 | 38.62 | 
0.20714655618850336613 | +| 58 | Manoj Pareek | 8 | 1.01631578947368421053 | 38.62 | 0.20714655618850336613 | +| 1 | Luís Gonçalves | 8 | 1.0426315789473684 | 39.62 | 0.20191822311963654720 | +| 4 | Bjørn Hansen | 8 | 1.0426315789473684 | 39.62 | 0.20191822311963654720 | +| 22 | Heather Leacock | 8 | 1.0426315789473684 | 39.62 | 0.20191822311963654720 | +| 5 | František Wichterlová | 8 | 1.0689473684210526 | 40.62 | 0.19694731659281142294 | +| 25 | Victor Stevens | 8 | 1.1215789473684211 | 42.62 | 0.18770530267480056312 | +| 46 | Hugh O'Reilly | 8 | 1.2005263157894737 | 45.62 | 0.17536168347216133275 | +| 59 | Puja Srivastava | 7 | 1.01777777777777777778 | 36.64 | 0.19104803493449781659 | +| 56 | Diego Gutiérrez | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 50 | Enrique Muñoz | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 32 | Aaron Mitchell | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 10 | Eduardo Martins | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 13 | Fernanda Ramos | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 38 | Niklas Schröder | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 2 | Leonie Köhler | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 16 | Frank Harris | 7 | 0.99000000000000000000 | 37.62 | 0.18607123870281765019 | +| 39 | Camille Bernard | 7 | 1.01631578947368421053 | 38.62 | 0.18125323666494044537 | +| 51 | Joakim Johansson | 7 | 1.01631578947368421053 | 38.62 | 0.18125323666494044537 | +| 40 | Dominique Lefebvre | 7 | 1.01631578947368421053 | 38.62 | 0.18125323666494044537 | +| 20 | Dan Miller | 7 | 1.0426315789473684 | 39.62 | 0.17667844522968197880 | +| 42 | Wyatt Girard | 7 | 1.0426315789473684 | 39.62 | 0.17667844522968197880 | +| 48 | Johannes Van der Berg | 7 | 1.0689473684210526 | 40.62 | 0.17232890201870999508 | +| 44 | Terhi Hämäläinen | 7 | 1.0952631578947368 | 41.62 | 0.16818837097549255166 | +| 
26 | Richard Cunningham | 7 | 1.2531578947368421 | 47.62 | 0.14699706005879882402 | +| 35 | Madalena Sampaio | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | +| 33 | Ellie Sullivan | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | +| 47 | Lucas Mancini | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | +| 36 | Hannah Schneider | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | +| 11 | Alexandre Rocha | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | +| 18 | Michelle Brooks | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | +| 31 | Martha Silk | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | +| 30 | Edward Francis | 6 | 0.99000000000000000000 | 37.62 | 0.15948963317384370016 | diff --git a/tests/chinook/dataset/task_12/task.txt b/tests/chinook/dataset/task_12/task.txt new file mode 100644 index 00000000..844f82d6 --- /dev/null +++ b/tests/chinook/dataset/task_12/task.txt @@ -0,0 +1,3 @@ +Find customers with the most diverse music taste and their spending correlation. +Calculate genre diversity score, average track price, total spending, and +diversity-to-spending ratio. 
diff --git a/tests/chinook/dataset/task_8/ground_truth.md b/tests/chinook/dataset/task_8/ground_truth.md new file mode 100644 index 00000000..6f4c47cc --- /dev/null +++ b/tests/chinook/dataset/task_8/ground_truth.md @@ -0,0 +1,334 @@ +| genre | track | artist | unit_price | total_quantity_sold | total_revenue | popularity_rank | +| ------------------ | ---------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | ---------- | ------------------- | ------------- | --------------- | +| Alternative | Moth | Audioslave | 0.99 | 1 | 0.99 | 1 | +| Alternative | All Night Thing | Temple of the Dog | 0.99 | 1 | 0.99 | 1 | +| Alternative | Slowness | Calexico | 0.99 | 1 | 0.99 | 1 | +| Alternative | Call Me a Dog | Temple of the Dog | 0.99 | 1 | 0.99 | 1 | +| Alternative | Show Me How to Live (Live at the Quart Festival) | Audioslave | 0.99 | 1 | 0.99 | 1 | +| Alternative | Scar On the Sky | Chris Cornell | 0.99 | 1 | 0.99 | 1 | +| Alternative | Until We Fall | Audioslave | 0.99 | 1 | 0.99 | 1 | +| Alternative | Four Walled World | Temple of the Dog | 0.99 | 1 | 0.99 | 1 | +| Alternative | Say Hello 2 Heaven | Temple of the Dog | 0.99 | 1 | 0.99 | 1 | +| Alternative | Sound of a Gun | Audioslave | 0.99 | 1 | 0.99 | 1 | +| Alternative | Billie Jean | Chris Cornell | 0.99 | 1 | 0.99 | 1 | +| Alternative | Safe and Sound | Chris Cornell | 0.99 | 1 | 0.99 | 1 | +| Alternative | Disappearing Act | Chris Cornell | 0.99 | 1 | 0.99 | 1 | +| Alternative | Shape of Things to Come | Audioslave | 0.99 | 1 | 0.99 | 1 | +| Alternative & Punk | Cabeça Dinossauro | Titãs | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Times Like These | Foo Fighters | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Midlife Crisis | Faith No More | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | She Loves Me Not | Faith No More | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Train In Vain | The 
Clash | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Toda Cor | Titãs | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | When I Come Around | Green Day | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Big Wave | Pearl Jam | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | I Believe | R.E.M. | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Voodoo | Body Count | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Country Feedback | R.E.M. Feat. Kate Pearson | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Comida | Titãs | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Ando Meio Desligado | Os Mutantes | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Lugar Nenhum | Titãs | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Gyroscope | The Tea Party | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Bullet With Butterfly Wings | Smashing Pumpkins | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Saturnine | Smashing Pumpkins | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | So Central Rain | R.E.M. | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Heaven Coming Down | The Tea Party | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Give Me Novacaine | Green Day | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Get Up | R.E.M. | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | A Melhor Forma | Titãs | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Last Cup Of Sorrow | Faith No More | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Basket Case | Green Day | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Babylon | The Tea Party | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Real Love | Smashing Pumpkins | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Caffeine | Faith No More | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Easy | Faith No More | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Bowels Of The Devil | Body Count | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Fall On Me | R.E.M. | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Untitled | R.E.M. 
| 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Esporrei Na Manivela | Raimundos | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Holiday | Green Day | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Nice Guys Finish Last | Green Day | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Whatsername | Green Day | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Waiting | Smashing Pumpkins | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Under The Bridge | Red Hot Chili Peppers | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Cherub Rock | Smashing Pumpkins | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | O Pulso | Titãs | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Take It Or Leave It | JET | 0.99 | 2 | 1.98 | 1 | +| Alternative & Punk | Release | The Tea Party | 0.99 | 2 | 1.98 | 1 | +| Blues | Promises | Eric Clapton | 0.99 | 2 | 1.98 | 1 | +| Blues | Travis Walk | Stevie Ray Vaughan & Double Trouble | 0.99 | 2 | 1.98 | 1 | +| Blues | Sunshine Of Your Love | Eric Clapton | 0.99 | 2 | 1.98 | 1 | +| Blues | Midnight From The Inside Out | The Black Crowes | 0.99 | 2 | 1.98 | 1 | +| Blues | Lay Down Sally | Eric Clapton | 0.99 | 2 | 1.98 | 1 | +| Blues | Title Song | The Black Crowes | 0.99 | 2 | 1.98 | 1 | +| Blues | Lonely Stranger | Eric Clapton | 0.99 | 2 | 1.98 | 1 | +| Blues | When My Left Eye Jumps | Buddy Guy | 0.99 | 2 | 1.98 | 1 | +| Bossa Nova | Onde Anda Você | Toquinho & Vinícius | 0.99 | 2 | 1.98 | 1 | +| Classical | Scheherazade, Op. 35: I. The Sea and Sindbad's Ship | Chicago Symphony Orchestra & Fritz Reiner | 0.99 | 2 | 1.98 | 1 | +| Classical | Symphonie Fantastique, Op. 14: V. Songe d'une nuit du sabbat | Michael Tilson Thomas & San Francisco Symphony | 0.99 | 2 | 1.98 | 1 | +| Classical | String Quartet No. 12 in C Minor, D. 703 "Quartettsatz": II. Andante - Allegro assai | Emerson String Quartet | 0.99 | 2 | 1.98 | 1 | +| Classical | Suite No. 3 in D, BWV 1068: III. Gavotte I & II | Academy of St. 
Martin in the Fields, Sir Neville Marriner & Thurston Dart | 0.99 | 2 | 1.98 | 1 | +| Classical | Music for the Funeral of Queen Mary: VI. "Thou Knowest, Lord, the Secrets of Our Hearts" | Equale Brass Ensemble, John Eliot Gardiner & Munich Monteverdi Orchestra and Choir | 0.99 | 2 | 1.98 | 1 | +| Comedy | Phyllis's Wedding | The Office | 1.99 | 2 | 3.98 | 1 | +| Drama | How to Stop an Exploding Man | Heroes | 1.99 | 2 | 3.98 | 1 | +| Drama | The Fix | Heroes | 1.99 | 2 | 3.98 | 1 | +| Easy Listening | I Get A Kick Out Of You | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | What Now My Love | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | New York, New York | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | Bad, Bad Leroy Brown | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | Come Fly With Me | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | It Was A Very Good Year | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | Fly Me To The Moon | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | My Kind Of Town | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | Summer Love | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Easy Listening | Mack The Knife | Frank Sinatra | 0.99 | 1 | 0.99 | 1 | +| Electronica/Dance | Light Years | Jamiroquai | 0.99 | 2 | 1.98 | 1 | +| Heavy Metal | Gates Of Tomorrow | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Drifter | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Another Life | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Dance Of Death | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Die With Your Boots On | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Purgatory | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | The Ides Of March | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Age Of Innocence | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Phantom Of The Opera | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | 
Wildest Dreams | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Face In The Sand | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Heavy Metal | Genghis Khan | Iron Maiden | 0.99 | 1 | 0.99 | 1 | +| Hip Hop/Rap | Rappers Reais | Planet Hemp | 0.99 | 2 | 1.98 | 1 | +| Hip Hop/Rap | Nega Do Cabelo Duro | Planet Hemp | 0.99 | 2 | 1.98 | 1 | +| Jazz | Surrender | Spyro Gyra | 0.99 | 2 | 1.98 | 1 | +| Jazz | Starburst | Spyro Gyra | 0.99 | 2 | 1.98 | 1 | +| Jazz | L'Arc En Ciel De Miles | Incognito | 0.99 | 2 | 1.98 | 1 | +| Jazz | End Of Romanticism | Spyro Gyra | 0.99 | 2 | 1.98 | 1 | +| Jazz | Lemon Drop | Gene Krupa | 0.99 | 2 | 1.98 | 1 | +| Jazz | The Duke | Miles Davis | 0.99 | 2 | 1.98 | 1 | +| Jazz | Don't Take Your Love From Me | Gene Krupa | 0.99 | 2 | 1.98 | 1 | +| Jazz | Lamento De Carnaval | Gilberto Gil | 0.99 | 2 | 1.98 | 1 | +| Jazz | Blue Rythm Fantasy | Gene Krupa | 0.99 | 2 | 1.98 | 1 | +| Jazz | J Squared | Spyro Gyra | 0.99 | 2 | 1.98 | 1 | +| Jazz | Drum Boogie | Gene Krupa | 0.99 | 2 | 1.98 | 1 | +| Jazz | Por Causa De Você | Antônio Carlos Jobim | 0.99 | 2 | 1.98 | 1 | +| Latin | Não Quero Dinheiro | Tim Maia | 0.99 | 2 | 1.98 | 1 | +| Latin | Selvagem | Os Paralamas Do Sucesso | 0.99 | 2 | 1.98 | 1 | +| Latin | Revolta Olodum | Olodum | 0.99 | 2 | 1.98 | 1 | +| Latin | Meu Erro | Os Paralamas Do Sucesso | 0.99 | 2 | 1.98 | 1 | +| Latin | Meditação | Caetano Veloso | 0.99 | 2 | 1.98 | 1 | +| Latin | Falamansa Song | Falamansa | 0.99 | 2 | 1.98 | 1 | +| Latin | Romance Ideal | Os Paralamas Do Sucesso | 0.99 | 2 | 1.98 | 1 | +| Latin | Tarde Em Itapoã | Antônio Carlos Jobim | 0.99 | 2 | 1.98 | 1 | +| Latin | Álibi | Djavan | 0.99 | 2 | 1.98 | 1 | +| Latin | Bem Devagar | Caetano Veloso | 0.99 | 2 | 1.98 | 1 | +| Latin | Meu Caro Amigo | Chico Buarque | 0.99 | 2 | 1.98 | 1 | +| Latin | Geni E O Zepelim | Chico Buarque | 0.99 | 2 | 1.98 | 1 | +| Latin | Lixo Do Mangue | Chico Science & Nação Zumbi | 0.99 | 2 | 1.98 | 1 | +| Latin | Amor De Muito | 
Chico Science & Nação Zumbi | 0.99 | 2 | 1.98 | 1 | +| Latin | Esquinas | Djavan | 0.99 | 2 | 1.98 | 1 | +| Latin | Menestrel Das Alagoas | Milton Nascimento | 0.99 | 2 | 1.98 | 1 | +| Latin | Samba Do Lado | Chico Science & Nação Zumbi | 0.99 | 2 | 1.98 | 1 | +| Latin | Is This Love (Live) | Gilberto Gil | 0.99 | 2 | 1.98 | 1 | +| Latin | Música No Ar | Tim Maia | 0.99 | 2 | 1.98 | 1 | +| Latin | As Rosas Não Falam (Beth Carvalho) | Various Artists | 0.99 | 2 | 1.98 | 1 | +| Latin | Mensagen De Amor (2000) | Os Paralamas Do Sucesso | 0.99 | 2 | 1.98 | 1 | +| Latin | Norwegian Wood | Milton Nascimento | 0.99 | 2 | 1.98 | 1 | +| Latin | Beira Mar | Eric Clapton | 0.99 | 2 | 1.98 | 1 | +| Latin | Tanto Tempo | Antônio Carlos Jobim | 0.99 | 2 | 1.98 | 1 | +| Latin | Tempo Perdido | Legião Urbana | 0.99 | 2 | 1.98 | 1 | +| Latin | Stir It Up (Live) | Gilberto Gil | 0.99 | 2 | 1.98 | 1 | +| Latin | Brasil | Cássia Eller | 0.99 | 2 | 1.98 | 1 | +| Latin | Açai | Djavan | 0.99 | 2 | 1.98 | 1 | +| Latin | Bumbo Da Mangueira | Antônio Carlos Jobim | 0.99 | 2 | 1.98 | 1 | +| Latin | Linha Do Equador | Cláudio Zoli | 0.99 | 2 | 1.98 | 1 | +| Latin | Samba De Orly | Chico Buarque | 0.99 | 2 | 1.98 | 1 | +| Latin | Woman Of The World (Ao Vivo) | Cássia Eller | 0.99 | 2 | 1.98 | 1 | +| Latin | Só Tinha De Ser Com Você | Antônio Carlos Jobim | 0.99 | 2 | 1.98 | 1 | +| Latin | Morena De Angola | Chico Buarque | 0.99 | 2 | 1.98 | 1 | +| Latin | Nebulosa Do Amor | Os Paralamas Do Sucesso | 0.99 | 2 | 1.98 | 1 | +| Latin | Comportamento Geral | Gonzaguinha | 0.99 | 2 | 1.98 | 1 | +| Latin | Dois Pra Lá, Dois Pra Cá | Elis Regina | 0.99 | 2 | 1.98 | 1 | +| Latin | Azul | Djavan | 0.99 | 2 | 1.98 | 1 | +| Latin | A Novidade (Live) | Gilberto Gil | 0.99 | 2 | 1.98 | 1 | +| Latin | Pela Luz dos Olhos Teus (Miúcha e Tom Jobim) | Various Artists | 0.99 | 2 | 1.98 | 1 | +| Latin | Compadre | Tim Maia | 0.99 | 2 | 1.98 | 1 | +| Latin | Saber Amar | Os Paralamas Do Sucesso | 0.99 | 2 | 1.98 | 
1 | +| Latin | Mis Penas Lloraba Yo (Ao Vivo) Soy Gitano (Tangos) | Cássia Eller | 0.99 | 2 | 1.98 | 1 | +| Latin | L'Avventura | Legião Urbana | 0.99 | 2 | 1.98 | 1 | +| Latin | Nossa Gente (Avisa Là) | Olodum | 0.99 | 2 | 1.98 | 1 | +| Latin | Mel | Caetano Veloso | 0.99 | 2 | 1.98 | 1 | +| Metal | Gangland | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | Where Eagles Dare | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | The Unforgiven II | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Can I Play With Madness | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | Motorbreath | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | For Whom The Bell Tolls | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Phantom Lord | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Whiskey In The Jar | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Nothing Else Matters | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Re-Align | Godsmack | 0.99 | 2 | 1.98 | 1 | +| Metal | Breakdown | Guns N' Roses | 0.99 | 2 | 1.98 | 1 | +| Metal | Better Than You | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Moonchild | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | Run Silent Run Deep | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | Snowblind | Black Sabbath | 0.99 | 2 | 1.98 | 1 | +| Metal | Turbo Lover | Judas Priest | 0.99 | 2 | 1.98 | 1 | +| Metal | Welcome Home (Sanitarium) | Apocalyptica | 0.99 | 2 | 1.98 | 1 | +| Metal | Cornucopia | Black Sabbath | 0.99 | 2 | 1.98 | 1 | +| Metal | Leper Messiah | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | The House Jack Built | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Shoot Me Again | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Soldier Side - Intro | System Of A Down | 0.99 | 2 | 1.98 | 1 | +| Metal | Question! 
| System Of A Down | 0.99 | 2 | 1.98 | 1 | +| Metal | Blood Brothers | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | The Prisoner | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | You've Got Another Thing Comin' | Judas Priest | 0.99 | 2 | 1.98 | 1 | +| Metal | Stone Cold Crazy | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Battery | Metallica | 0.99 | 2 | 1.98 | 1 | +| Metal | Children Of The Grave | Ozzy Osbourne | 0.99 | 2 | 1.98 | 1 | +| Metal | Violent Pornography | System Of A Down | 0.99 | 2 | 1.98 | 1 | +| Metal | Aces High | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Metal | Chemical Wedding | Bruce Dickinson | 0.99 | 2 | 1.98 | 1 | +| Metal | The Clairvoyant | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Opera | Die Zauberflöte, K.620: "Der Hölle Rache Kocht in Meinem Herze" | Sir Georg Solti, Sumi Jo & Wiener Philharmoniker | 0.99 | 0 | 0 | 1 | +| Pop | Give Peace a Chance | U2 | 0.99 | 2 | 1.98 | 1 | +| Pop | Beautiful Boy | U2 | 0.99 | 2 | 1.98 | 1 | +| R&B/Soul | Rehab | Amy Winehouse | 0.99 | 2 | 1.98 | 1 | +| R&B/Soul | Hey America | James Brown | 0.99 | 2 | 1.98 | 1 | +| R&B/Soul | Abraham, Martin And John | Marvin Gaye | 0.99 | 2 | 1.98 | 1 | +| R&B/Soul | Save The Children | Marvin Gaye | 0.99 | 2 | 1.98 | 1 | +| Reggae | C'est La Vie | UB40 | 0.99 | 2 | 1.98 | 1 | +| Reggae | A Cor Do Sol | Cidade Negra | 0.99 | 2 | 1.98 | 1 | +| Rock | Two Hearts Beat As One | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | City Of Blinding Lights | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Red Light | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Shock Me | Kiss | 0.99 | 2 | 1.98 | 1 | +| Rock | The Trooper | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | Everlasting Love | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | The Midnight Special | Creedence Clearwater Revival | 0.99 | 2 | 1.98 | 1 | +| Rock | Untitled | Pearl Jam | 0.99 | 2 | 1.98 | 1 | +| Rock | Down by the Sea | Men At Work | 0.99 | 2 | 1.98 | 1 | +| Rock | What A Day | Faith No More | 0.99 | 2 | 1.98 | 1 | +| Rock | Pretty Noose | 
Soundgarden | 0.99 | 2 | 1.98 | 1 | +| Rock | War Pigs | Ozzy Osbourne | 0.99 | 2 | 1.98 | 1 | +| Rock | Boris The Spider | The Who | 0.99 | 2 | 1.98 | 1 | +| Rock | Sweet Hitch-Hiker | Creedence Clearwater Revival | 0.99 | 2 | 1.98 | 1 | +| Rock | Menino De Rua | O Terço | 0.99 | 2 | 1.98 | 1 | +| Rock | Sure Know Something | Kiss | 0.99 | 2 | 1.98 | 1 | +| Rock | Commotion | Creedence Clearwater Revival | 0.99 | 2 | 1.98 | 1 | +| Rock | Que Luz É Essa | Raul Seixas | 0.99 | 2 | 1.98 | 1 | +| Rock | Judgement Of Heaven | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | Sun King | The Cult | 0.99 | 2 | 1.98 | 1 | +| Rock | Smoke On The Water | Deep Purple | 0.99 | 2 | 1.98 | 1 | +| Rock | Eruption | Van Halen | 0.99 | 2 | 1.98 | 1 | +| Rock | The Aftermath | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | Perfect Crime | Guns N' Roses | 0.99 | 2 | 1.98 | 1 | +| Rock | Dancing Days | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Binky The Doormat | R.E.M. | 0.99 | 2 | 1.98 | 1 | +| Rock | Walking Into Clarksdale | Page & Plant | 0.99 | 2 | 1.98 | 1 | +| Rock | Miami | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Shakes and Ladders | Men At Work | 0.99 | 2 | 1.98 | 1 | +| Rock | Speed Of Light | The Cult | 0.99 | 2 | 1.98 | 1 | +| Rock | Whatever It Is, I Just Can't Stop | Jamiroquai | 0.99 | 2 | 1.98 | 1 | +| Rock | Good Golly Miss Molly | Creedence Clearwater Revival | 0.99 | 2 | 1.98 | 1 | +| Rock | The Star Spangled Banner | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Don't Look Now | Creedence Clearwater Revival | 0.99 | 2 | 1.98 | 1 | +| Rock | When I Look At The World | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Sweetest Thing | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | The Number Of The Beast | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | Don't Look To The Eyes Of A Stranger | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | Hallelujah Here She Comes | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Hallowed Be Thy Name | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | I'm Going Slightly Mad | Queen | 
0.99 | 2 | 1.98 | 1 | +| Rock | You Better You Bet | The Who | 0.99 | 2 | 1.98 | 1 | +| Rock | You're My Best Friend | Queen | 0.99 | 2 | 1.98 | 1 | +| Rock | Susie Q | Creedence Clearwater Revival | 0.99 | 2 | 1.98 | 1 | +| Rock | We Are The Champions | Queen | 0.99 | 2 | 1.98 | 1 | +| Rock | I'm Gonna Crawl | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Eruption | Van Halen | 0.99 | 2 | 1.98 | 1 | +| Rock | The Garden | Guns N' Roses | 0.99 | 2 | 1.98 | 1 | +| Rock | Overdose | AC/DC | 0.99 | 2 | 1.98 | 1 | +| Rock | Calling Dr. Love | Kiss | 0.99 | 2 | 1.98 | 1 | +| Rock | Balls to the Wall | Accept | 0.99 | 2 | 1.98 | 1 | +| Rock | Walter's Walk | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Inject The Venom | AC/DC | 0.99 | 2 | 1.98 | 1 | +| Rock | The Last Song | Foo Fighters | 0.99 | 2 | 1.98 | 1 | +| Rock | No No No | Deep Purple | 0.99 | 2 | 1.98 | 1 | +| Rock | Piece Of Pie | Stone Temple Pilots | 0.99 | 2 | 1.98 | 1 | +| Rock | The Thin Line Between Love & Hate | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | Year to the Day | Van Halen | 0.99 | 2 | 1.98 | 1 | +| Rock | Ramble On | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Snowballed | AC/DC | 0.99 | 2 | 1.98 | 1 | +| Rock | Right On Time | Red Hot Chili Peppers | 0.99 | 2 | 1.98 | 1 | +| Rock | Sangue Latino | O Terço | 0.99 | 2 | 1.98 | 1 | +| Rock | Pride (In The Name Of Love) | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | In The Evening | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | For Your Life | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Undertow | R.E.M. 
| 0.99 | 2 | 1.98 | 1 | +| Rock | For the Greater Good of God | Iron Maiden | 0.99 | 2 | 1.98 | 1 | +| Rock | Folhas Secas | O Terço | 0.99 | 2 | 1.98 | 1 | +| Rock | Tear | Red Hot Chili Peppers | 0.99 | 2 | 1.98 | 1 | +| Rock | Plaster Caster | Kiss | 0.99 | 2 | 1.98 | 1 | +| Rock | Living Loving Maid (She's Just A Woman) | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Living On Love | David Coverdale | 0.99 | 2 | 1.98 | 1 | +| Rock | How Many More Times | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Dirty Day | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Rocket Queen | Guns N' Roses | 0.99 | 2 | 1.98 | 1 | +| Rock | Flying High Again | Ozzy Osbourne | 0.99 | 2 | 1.98 | 1 | +| Rock | Deuces Are Wild | Aerosmith | 0.99 | 2 | 1.98 | 1 | +| Rock | Dazed and Confused | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Sure Know Something | Kiss | 0.99 | 2 | 1.98 | 1 | +| Rock | Peace On Earth | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Maybe I'm A Leo | Deep Purple | 0.99 | 2 | 1.98 | 1 | +| Rock | Not The Doctor | Alanis Morissette | 0.99 | 2 | 1.98 | 1 | +| Rock | Mr. 
Cab Driver | Lenny Kravitz | 0.99 | 2 | 1.98 | 1 | +| Rock | Rock And Roll All Nite | Kiss | 0.99 | 2 | 1.98 | 1 | +| Rock | All Along The Watchtower | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock | Mama, I'm Coming Home | Ozzy Osbourne | 0.99 | 2 | 1.98 | 1 | +| Rock | Garden of Eden | Guns N' Roses | 0.99 | 2 | 1.98 | 1 | +| Rock | Somebody To Love | Queen | 0.99 | 2 | 1.98 | 1 | +| Rock | Bron-Y-Aur Stomp | Led Zeppelin | 0.99 | 2 | 1.98 | 1 | +| Rock | Sunday Bloody Sunday | U2 | 0.99 | 2 | 1.98 | 1 | +| Rock And Roll | Long Tall Sally | BackBeat | 0.99 | 1 | 0.99 | 1 | +| Rock And Roll | Money | BackBeat | 0.99 | 1 | 0.99 | 1 | +| Rock And Roll | Carol | BackBeat | 0.99 | 1 | 0.99 | 1 | +| Rock And Roll | Rock 'N' Roll Music | BackBeat | 0.99 | 1 | 0.99 | 1 | +| Rock And Roll | C'Mon Everybody | BackBeat | 0.99 | 1 | 0.99 | 1 | +| Rock And Roll | Good Golly Miss Molly | BackBeat | 0.99 | 1 | 0.99 | 1 | +| Sci Fi & Fantasy | The Hand of God | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Battlestar Galactica, Pt. 2 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | The Gun On Ice Planet Zero, Pt. 2 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Take the Celestra | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Battlestar Galactica, Pt. 1 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | The Lost Warrior | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | The Long Patrol | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Lost Planet of the Gods, Pt. 2 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Lost Planet of the Gods, Pt. 1 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | War of the Gods, Pt. 2 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | The Living Legend, Pt. 
1 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Baltar's Escape | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Greetings from Earth, Pt. 1 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | The Living Legend, Pt. 2 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | The Gun On Ice Planet Zero, Pt. 1 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Crossroads, Pt. 2 | Battlestar Galactica | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Experiment In Terra | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Greetings from Earth, Pt. 2 | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | The Magnificent Warriors | Battlestar Galactica (Classic) | 1.99 | 1 | 1.99 | 1 | +| Sci Fi & Fantasy | Crossroads, Pt. 1 | Battlestar Galactica | 1.99 | 1 | 1.99 | 1 | +| Science Fiction | The Woman King | Battlestar Galactica | 1.99 | 2 | 3.98 | 1 | +| Soundtrack | Plot 180 | Passengers | 0.99 | 2 | 1.98 | 1 | +| TV Shows | Walkabout | Lost | 1.99 | 2 | 3.98 | 1 | +| TV Shows | Pilot | Aquaman | 1.99 | 2 | 3.98 | 1 | +| TV Shows | Gay Witch Hunt | The Office | 1.99 | 2 | 3.98 | 1 | +| TV Shows | Hot Girl | The Office | 1.99 | 2 | 3.98 | 1 | +| World | No Futuro | João Suplicy | 0.99 | 1 | 0.99 | 1 | +| World | Demorou! 
| Mônica Marianno | 0.99 | 1 | 0.99 | 1 | +| World | Enquanto O Dia Não Vem | João Suplicy | 0.99 | 1 | 0.99 | 1 | +| World | A Moça e a Chuva | Mônica Marianno | 0.99 | 1 | 0.99 | 1 | +| World | Papelão | João Suplicy | 0.99 | 1 | 0.99 | 1 | +| World | É Fogo | Mônica Marianno | 0.99 | 1 | 0.99 | 1 | +| World | Din Din Wo (Little Child) | Habib Koité and Bamada | 0.99 | 1 | 0.99 | 1 | +| World | Voce Inteira | João Suplicy | 0.99 | 1 | 0.99 | 1 | +| World | O Que Vai Em Meu Coração | Mônica Marianno | 0.99 | 1 | 0.99 | 1 | +| World | Cafezinho | João Suplicy | 0.99 | 1 | 0.99 | 1 | +| World | Já Foi | Mônica Marianno | 0.99 | 1 | 0.99 | 1 | +| World | Equinocio | João Suplicy | 0.99 | 1 | 0.99 | 1 | +| World | Aos Leões | Mônica Marianno | 0.99 | 1 | 0.99 | 1 | diff --git a/tests/chinook/dataset/task_8/task.txt b/tests/chinook/dataset/task_8/task.txt new file mode 100644 index 00000000..35967740 --- /dev/null +++ b/tests/chinook/dataset/task_8/task.txt @@ -0,0 +1,3 @@ +Find me all tracks and their artist that are both the most expensive in their +genre and have the highest sales volume within that same genre. I need to +identify which premium-priced tracks are still dominating their market segments. 
diff --git a/tests/chinook/dataset/task_9/ground_truth.md b/tests/chinook/dataset/task_9/ground_truth.md new file mode 100644 index 00000000..547b4d07 --- /dev/null +++ b/tests/chinook/dataset/task_9/ground_truth.md @@ -0,0 +1,61 @@ +| customer_id | customer_name | orders_count | total_spent | total_spent_rounded | orders_per_day | days_active | days_since_last_purchase | first_order | last_order | clv_segment | churn_risk | +| ----------- | --------------------- | ------------ | ----------- | ------------------- | ---------------------- | ----------- | ------------------------ | ---------------------- | ---------------------- | ----------- | ----------------- | +| 6 | Helena Holý | 7 | 49.62 | 49.62 | 0.00441361916771752837 | 1586 | -63 | 2021-07-11 00:00:00+00 | 2025-11-13 00:00:00+00 | High | Low churn risk | +| 26 | Richard Cunningham | 7 | 47.62 | 47.62 | 0.00562248995983935743 | 1245 | 159 | 2021-11-07 00:00:00+00 | 2025-04-05 00:00:00+00 | High | High churn risk | +| 57 | Luis Rojas | 7 | 46.62 | 46.62 | 0.00543056633048875097 | 1289 | 332 | 2021-04-04 00:00:00+00 | 2024-10-14 00:00:00+00 | High | High churn risk | +| 45 | Ladislav Kovács | 7 | 45.62 | 45.62 | 0.00543056633048875097 | 1289 | 53 | 2022-01-08 00:00:00+00 | 2025-07-20 00:00:00+00 | High | Medium churn risk | +| 46 | Hugh O'Reilly | 7 | 45.62 | 45.62 | 0.00403458213256484150 | 1735 | -54 | 2021-02-03 00:00:00+00 | 2025-11-04 00:00:00+00 | High | Low churn risk | +| 28 | Julia Barnett | 7 | 43.62 | 43.62 | 0.00543056633048875097 | 1289 | 115 | 2021-11-07 00:00:00+00 | 2025-05-19 00:00:00+00 | High | High churn risk | +| 37 | Fynn Zimmermann | 7 | 43.62 | 43.62 | 0.00438596491228070175 | 1596 | 100 | 2021-01-19 00:00:00+00 | 2025-06-03 00:00:00+00 | High | High churn risk | +| 24 | Frank Ralston | 7 | 43.62 | 43.62 | 0.00543056633048875097 | 1289 | 22 | 2022-02-08 00:00:00+00 | 2025-08-20 00:00:00+00 | High | Low churn risk | +| 25 | Victor Stevens | 7 | 42.62 | 42.62 | 0.00403458213256484150 | 
1735 | -85 | 2021-03-06 00:00:00+00 | 2025-12-05 00:00:00+00 | High | Low churn risk | +| 7 | Astrid Gruber | 7 | 42.62 | 42.62 | 0.00543056633048875097 | 1289 | 84 | 2021-12-08 00:00:00+00 | 2025-06-19 00:00:00+00 | High | Medium churn risk | +| 44 | Terhi Hämäläinen | 7 | 41.62 | 41.62 | 0.00441361916771752837 | 1586 | -94 | 2021-08-11 00:00:00+00 | 2025-12-14 00:00:00+00 | High | Low churn risk | +| 48 | Johannes Van der Berg | 7 | 40.62 | 40.62 | 0.00441361916771752837 | 1586 | -1 | 2021-05-10 00:00:00+00 | 2025-09-12 00:00:00+00 | High | Low churn risk | +| 43 | Isabelle Mercier | 7 | 40.62 | 40.62 | 0.00562248995983935743 | 1245 | 97 | 2022-01-08 00:00:00+00 | 2025-06-06 00:00:00+00 | High | High churn risk | +| 5 | František Wichterlová | 7 | 40.62 | 40.62 | 0.00562248995983935743 | 1245 | 128 | 2021-12-08 00:00:00+00 | 2025-05-06 00:00:00+00 | High | High churn risk | +| 17 | Jack Smith | 7 | 39.62 | 39.62 | 0.00562248995983935743 | 1245 | 407 | 2021-03-04 00:00:00+00 | 2024-07-31 00:00:00+00 | High | High churn risk | +| 34 | João Fernandes | 7 | 39.62 | 39.62 | 0.00562248995983935743 | 1245 | 345 | 2021-05-05 00:00:00+00 | 2024-10-01 00:00:00+00 | High | High churn risk | +| 20 | Dan Miller | 7 | 39.62 | 39.62 | 0.00543056633048875097 | 1289 | -71 | 2022-05-12 00:00:00+00 | 2025-11-21 00:00:00+00 | High | Low churn risk | +| 22 | Heather Leacock | 7 | 39.62 | 39.62 | 0.00562248995983935743 | 1245 | 66 | 2022-02-08 00:00:00+00 | 2025-07-07 00:00:00+00 | High | Medium churn risk | +| 3 | François Tremblay | 7 | 39.62 | 39.62 | 0.00543056633048875097 | 1289 | -9 | 2022-03-11 00:00:00+00 | 2025-09-20 00:00:00+00 | High | Low churn risk | +| 42 | Wyatt Girard | 7 | 39.62 | 39.62 | 0.00403458213256484150 | 1735 | -53 | 2021-02-02 00:00:00+00 | 2025-11-03 00:00:00+00 | High | Low churn risk | +| 1 | Luís Gonçalves | 7 | 39.62 | 39.62 | 0.00562248995983935743 | 1245 | 35 | 2022-03-11 00:00:00+00 | 2025-08-07 00:00:00+00 | Medium | Medium churn risk | +| 4 | Bjørn 
Hansen | 7 | 39.62 | 39.62 | 0.00403458213256484150 | 1735 | -22 | 2021-01-02 00:00:00+00 | 2025-10-03 00:00:00+00 | Medium | Low churn risk | +| 58 | Manoj Pareek | 7 | 38.62 | 38.62 | 0.00543056633048875097 | 1289 | -102 | 2022-06-12 00:00:00+00 | 2025-12-22 00:00:00+00 | Medium | Low churn risk | +| 40 | Dominique Lefebvre | 7 | 38.62 | 38.62 | 0.00543056633048875097 | 1289 | 394 | 2021-02-01 00:00:00+00 | 2024-08-13 00:00:00+00 | Medium | High churn risk | +| 15 | Jennifer Peterson | 7 | 38.62 | 38.62 | 0.00543056633048875097 | 1289 | 270 | 2021-06-05 00:00:00+00 | 2024-12-15 00:00:00+00 | Medium | High churn risk | +| 39 | Camille Bernard | 7 | 38.62 | 38.62 | 0.00562248995983935743 | 1245 | 4 | 2022-04-11 00:00:00+00 | 2025-09-07 00:00:00+00 | Medium | Low churn risk | +| 51 | Joakim Johansson | 7 | 38.62 | 38.62 | 0.00562248995983935743 | 1245 | 283 | 2021-07-06 00:00:00+00 | 2024-12-02 00:00:00+00 | Medium | High churn risk | +| 19 | Tim Goyer | 7 | 38.62 | 38.62 | 0.00543056633048875097 | 1289 | 363 | 2021-03-04 00:00:00+00 | 2024-09-13 00:00:00+00 | Medium | High churn risk | +| 55 | Mark Taylor | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | 376 | 2021-04-04 00:00:00+00 | 2024-08-31 00:00:00+00 | Medium | High churn risk | +| 56 | Diego Gutiérrez | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | -58 | 2022-06-12 00:00:00+00 | 2025-11-08 00:00:00+00 | Medium | Low churn risk | +| 52 | Emma Jones | 7 | 37.62 | 37.62 | 0.00441361916771752837 | 1586 | 92 | 2021-02-06 00:00:00+00 | 2025-06-11 00:00:00+00 | Medium | High churn risk | +| 2 | Leonie Köhler | 7 | 37.62 | 37.62 | 0.00543056633048875097 | 1289 | 425 | 2021-01-01 00:00:00+00 | 2024-07-13 00:00:00+00 | Medium | High churn risk | +| 8 | Daan Peeters | 7 | 37.62 | 37.62 | 0.00403458213256484150 | 1735 | -23 | 2021-01-03 00:00:00+00 | 2025-10-04 00:00:00+00 | Medium | Low churn risk | +| 9 | Kara Nielsen | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | 221 | 2021-09-06 00:00:00+00 | 
2025-02-02 00:00:00+00 | Medium | High churn risk | +| 10 | Eduardo Martins | 7 | 37.62 | 37.62 | 0.00441361916771752837 | 1586 | 30 | 2021-04-09 00:00:00+00 | 2025-08-12 00:00:00+00 | Medium | Low churn risk | +| 11 | Alexandre Rocha | 7 | 37.62 | 37.62 | 0.00543056633048875097 | 1289 | 177 | 2021-09-06 00:00:00+00 | 2025-03-18 00:00:00+00 | Medium | High churn risk | +| 12 | Roberto Almeida | 7 | 37.62 | 37.62 | 0.00438596491228070175 | 1596 | -24 | 2021-05-23 00:00:00+00 | 2025-10-05 00:00:00+00 | Medium | Low churn risk | +| 13 | Fernanda Ramos | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | 314 | 2021-06-05 00:00:00+00 | 2024-11-01 00:00:00+00 | Medium | High churn risk | +| 14 | Mark Philips | 7 | 37.62 | 37.62 | 0.00441361916771752837 | 1586 | 123 | 2021-01-06 00:00:00+00 | 2025-05-11 00:00:00+00 | Medium | High churn risk | +| 16 | Frank Harris | 7 | 37.62 | 37.62 | 0.00438596491228070175 | 1596 | 69 | 2021-02-19 00:00:00+00 | 2025-07-04 00:00:00+00 | Medium | Medium churn risk | +| 18 | Michelle Brooks | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | -27 | 2022-05-12 00:00:00+00 | 2025-10-08 00:00:00+00 | Low | Low churn risk | +| 21 | Kathy Chase | 7 | 37.62 | 37.62 | 0.00403458213256484150 | 1735 | -84 | 2021-03-05 00:00:00+00 | 2025-12-04 00:00:00+00 | Low | Low churn risk | +| 23 | John Gordon | 7 | 37.62 | 37.62 | 0.00391498881431767338 | 1788 | -84 | 2021-01-11 00:00:00+00 | 2025-12-04 00:00:00+00 | Low | Low churn risk | +| 27 | Patrick Gray | 7 | 37.62 | 37.62 | 0.00441361916771752837 | 1586 | -32 | 2021-06-10 00:00:00+00 | 2025-10-13 00:00:00+00 | Low | Low churn risk | +| 29 | Robert Brown | 7 | 37.62 | 37.62 | 0.00438596491228070175 | 1596 | -86 | 2021-07-24 00:00:00+00 | 2025-12-06 00:00:00+00 | Low | Low churn risk | +| 30 | Edward Francis | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | 252 | 2021-08-06 00:00:00+00 | 2025-01-02 00:00:00+00 | Low | High churn risk | +| 31 | Martha Silk | 7 | 37.62 | 37.62 | 
0.00441361916771752837 | 1586 | 61 | 2021-03-09 00:00:00+00 | 2025-07-12 00:00:00+00 | Low | Medium churn risk | +| 32 | Aaron Mitchell | 7 | 37.62 | 37.62 | 0.00543056633048875097 | 1289 | 208 | 2021-08-06 00:00:00+00 | 2025-02-15 00:00:00+00 | Low | High churn risk | +| 33 | Ellie Sullivan | 7 | 37.62 | 37.62 | 0.00438596491228070175 | 1596 | 7 | 2021-04-22 00:00:00+00 | 2025-09-04 00:00:00+00 | Low | Low churn risk | +| 35 | Madalena Sampaio | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | -89 | 2022-07-13 00:00:00+00 | 2025-12-09 00:00:00+00 | Low | Low churn risk | +| 36 | Hannah Schneider | 7 | 37.62 | 37.62 | 0.00543056633048875097 | 1289 | 301 | 2021-05-05 00:00:00+00 | 2024-11-14 00:00:00+00 | Low | High churn risk | +| 38 | Niklas Schröder | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | 438 | 2021-02-01 00:00:00+00 | 2024-06-30 00:00:00+00 | Low | High churn risk | +| 41 | Marc Dubois | 7 | 37.62 | 37.62 | 0.00543056633048875097 | 1289 | -40 | 2022-04-11 00:00:00+00 | 2025-10-21 00:00:00+00 | Low | Low churn risk | +| 47 | Lucas Mancini | 7 | 37.62 | 37.62 | 0.00562248995983935743 | 1245 | 190 | 2021-10-07 00:00:00+00 | 2025-03-05 00:00:00+00 | Low | High churn risk | +| 49 | Stanisław Wójcik | 7 | 37.62 | 37.62 | 0.00543056633048875097 | 1289 | 146 | 2021-10-07 00:00:00+00 | 2025-04-18 00:00:00+00 | Low | High churn risk | +| 50 | Enrique Muñoz | 7 | 37.62 | 37.62 | 0.00438596491228070175 | 1596 | -55 | 2021-06-23 00:00:00+00 | 2025-11-05 00:00:00+00 | Low | Low churn risk | +| 53 | Phil Hughes | 7 | 37.62 | 37.62 | 0.00543056633048875097 | 1289 | 239 | 2021-07-06 00:00:00+00 | 2025-01-15 00:00:00+00 | Low | High churn risk | +| 54 | Steve Murray | 7 | 37.62 | 37.62 | 0.00438596491228070175 | 1596 | 38 | 2021-03-22 00:00:00+00 | 2025-08-04 00:00:00+00 | Low | Medium churn risk | +| 59 | Puja Srivastava | 6 | 36.64 | 36.64 | 0.00521285838401390096 | 1151 | 469 | 2021-04-05 00:00:00+00 | 2024-05-30 00:00:00+00 | Low | High churn risk | diff 
--git a/tests/chinook/dataset/task_9/task.txt b/tests/chinook/dataset/task_9/task.txt new file mode 100644 index 00000000..5fb7c070 --- /dev/null +++ b/tests/chinook/dataset/task_9/task.txt @@ -0,0 +1,3 @@ +Calculate customer lifetime value segments and predict churn risk. Group +customers into segments based on spending patterns and calculate days since last +purchase to identify churn risk. diff --git a/tests/chinook/dataset/track_10/ground_truth.md b/tests/chinook/dataset/track_10/ground_truth.md new file mode 100644 index 00000000..332e271d --- /dev/null +++ b/tests/chinook/dataset/track_10/ground_truth.md @@ -0,0 +1,349 @@ +| album_title | artist_name | total_tracks | track_count_variance | genre_count | most_common_genre | +| ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- | ------------ | ---------------------- | ----------- | ------------------ | +| Lost, Season 3 | Lost | 26 | 81.0000000000000000 | 2 | Drama | +| Heroes, Season 1 | Heroes | 23 | 72.2500000000000000 | 2 | Drama | +| Greatest Hits | Lenny Kravitz | 57 | 60.6666666666666667 | 3 | Rock | +| LOST, Season 4 | Lost | 17 | 30.2500000000000000 | 2 | Drama | +| The Office, Season 3 | The Office | 25 | 20.2500000000000000 | 2 | Comedy | +| Battlestar Galactica, Season 3 | Battlestar Galactica | 19 | 17.5555555555555556 | 3 | Science Fiction | +| Lost, Season 2 | Lost | 24 | 16.0000000000000000 | 2 | TV Shows | +| Rock In Rio [CD2] | Iron Maiden | 9 | 12.2500000000000000 | 2 | Rock | +| The Number of The Beast | Iron Maiden | 8 | 9.0000000000000000 | 2 | Metal | +| Live After Death | Iron Maiden | 18 | 4.0000000000000000 | 2 | Metal | +| Unplugged | Eric Clapton | 30 | 1.00000000000000000000 | 2 | Latin | +| Minha Historia | Chico Buarque | 34 | 0 | 1 | Latin | +| Lost, Season 1 | Lost | 25 | 0 | 1 | TV Shows | +| My Way: The Best Of Frank Sinatra [Disc 1] | Frank 
Sinatra | 24 | 0 | 1 | Easy Listening | +| Battlestar Galactica (Classic), Season 1 | Battlestar Galactica (Classic) | 24 | 0 | 1 | Sci Fi & Fantasy | +| Instant Karma: The Amnesty International Campaign to Save Darfur | U2 | 23 | 0 | 1 | Pop | +| Afrociberdelia | Chico Science & Nação Zumbi | 23 | 0 | 1 | Latin | +| Acústico | Titãs | 22 | 0 | 1 | Alternative & Punk | +| Up An' Atom | Gene Krupa | 22 | 0 | 1 | Jazz | +| The Office, Season 2 | The Office | 22 | 0 | 1 | TV Shows | +| International Superhits | Green Day | 21 | 0 | 1 | Alternative & Punk | +| Acústico MTV | Os Paralamas Do Sucesso | 21 | 0 | 1 | Latin | +| Greatest Kiss | Kiss | 20 | 0 | 1 | Rock | +| Chronicle, Vol. 2 | Creedence Clearwater Revival | 20 | 0 | 1 | Rock | +| Chronicle, Vol. 1 | Creedence Clearwater Revival | 20 | 0 | 1 | Rock | +| Sex Machine | James Brown | 20 | 0 | 1 | R&B/Soul | +| My Generation - The Very Best Of The Who | The Who | 20 | 0 | 1 | Rock | +| Ao Vivo [IMPORT] | Zeca Pagodinho | 19 | 0 | 1 | Latin | +| House of Pain | House Of Pain | 19 | 0 | 1 | Hip Hop/Rap | +| The Cream Of Clapton | Eric Clapton | 18 | 0 | 1 | Blues | +| Barulhinho Bom | Marisa Monte | 18 | 0 | 1 | Latin | +| Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK] | The Cult | 18 | 0 | 1 | Rock | +| Prenda Minha | Caetano Veloso | 18 | 0 | 1 | Latin | +| The Singles | The Clash | 18 | 0 | 1 | Alternative & Punk | +| Seek And Shall Find: More Of The Best (1963-1981) | Marvin Gaye | 18 | 0 | 1 | R&B/Soul | +| Rotten Apples: Greatest Hits | Smashing Pumpkins | 18 | 0 | 1 | Alternative & Punk | +| Blood Sugar Sex Magik | Red Hot Chili Peppers | 17 | 0 | 1 | Alternative & Punk | +| Greatest Hits I | Queen | 17 | 0 | 1 | Rock | +| A-Sides | Soundgarden | 17 | 0 | 1 | Rock | +| Chill: Brazil (Disc 1) | Marcos Valle | 17 | 0 | 1 | Latin | +| The Best Of Van Halen, Vol. 
I | Van Halen | 17 | 0 | 1 | Rock | +| Radio Brasil (O Som da Jovem Vanguarda) - Seleccao de Henrique Amaro | O Rappa | 17 | 0 | 1 | Electronica/Dance | +| From The Muddy Banks Of The Wishkah [Live] | Nirvana | 17 | 0 | 1 | Rock | +| Chill: Brazil (Disc 2) | Antônio Carlos Jobim | 17 | 0 | 1 | Latin | +| Acústico MTV [Live] | Cidade Negra | 17 | 0 | 1 | Reggae | +| Body Count | Body Count | 17 | 0 | 1 | Alternative & Punk | +| Greatest Hits II | Queen | 17 | 0 | 1 | Rock | +| Motley Crue Greatest Hits | Mötley Crüe | 17 | 0 | 1 | Metal | +| Rattle And Hum | U2 | 17 | 0 | 1 | Rock | +| Are You Experienced? | Jimi Hendrix | 17 | 0 | 1 | Rock | +| Volume Dois | Titãs | 16 | 0 | 1 | Alternative & Punk | +| Live On Two Legs [Live] | Pearl Jam | 16 | 0 | 1 | Rock | +| Living After Midnight | Judas Priest | 16 | 0 | 1 | Metal | +| Arquivo Os Paralamas Do Sucesso | Os Paralamas Do Sucesso | 16 | 0 | 1 | Latin | +| Os Cães Ladram Mas A Caravana Não Pára | Planet Hemp | 16 | 0 | 1 | Hip Hop/Rap | +| By The Way | Red Hot Chili Peppers | 16 | 0 | 1 | Rock | +| Use Your Illusion I | Guns N' Roses | 16 | 0 | 1 | Rock | +| Mais Do Mesmo | Legião Urbana | 16 | 0 | 1 | Latin | +| Judas 0: B-Sides and Rarities | Smashing Pumpkins | 16 | 0 | 1 | Alternative & Punk | +| Vault: Def Leppard's Greatest Hits | Def Leppard | 16 | 0 | 1 | Rock | +| Roda De Funk | Funk Como Le Gusta | 16 | 0 | 1 | Latin | +| The Best Of R.E.M.: The IRS Years | R.E.M. | 16 | 0 | 1 | Alternative & Punk | +| Garage Inc. 
(Disc 2) | Metallica | 16 | 0 | 1 | Metal | +| A TempestadeTempestade Ou O Livro Dos Dias | Legião Urbana | 15 | 0 | 1 | Latin | +| Voodoo Lounge | The Rolling Stones | 15 | 0 | 1 | Rock | +| Serie Sem Limite (Disc 1) | Tim Maia | 15 | 0 | 1 | Latin | +| Unplugged [Live] | Kiss | 15 | 0 | 1 | Rock | +| B-Sides 1980-1990 | U2 | 15 | 0 | 1 | Rock | +| Vinicius De Moraes | Vinícius De Moraes | 15 | 0 | 1 | Latin | +| King For A Day Fool For A Lifetime | Faith No More | 15 | 0 | 1 | Rock | +| Cássia Eller - Sem Limite [Disc 1] | Cássia Eller | 15 | 0 | 1 | Latin | +| Riot Act | Pearl Jam | 15 | 0 | 1 | Rock | +| Quanta Gente Veio Ver (Live) | Gilberto Gil | 15 | 0 | 1 | Latin | +| Serie Sem Limite (Disc 2) | Tim Maia | 15 | 0 | 1 | Latin | +| Big Ones | Aerosmith | 15 | 0 | 1 | Rock | +| Compositores | O Terço | 15 | 0 | 1 | Rock | +| Vinícius De Moraes - Sem Limite | Toquinho & Vinícius | 15 | 0 | 1 | Bossa Nova | +| Ace Of Spades | Motörhead | 15 | 0 | 1 | Metal | +| Tangents | The Tea Party | 15 | 0 | 1 | Alternative & Punk | +| Californication | Red Hot Chili Peppers | 15 | 0 | 1 | Rock | +| Cássia Eller - Coleção Sem Limite [Disc 2] | Cássia Eller | 15 | 0 | 1 | Latin | +| Cafezinho | João Suplicy | 14 | 0 | 1 | World | +| Elis Regina-Minha História | Elis Regina | 14 | 0 | 1 | Latin | +| Olodum | Olodum | 14 | 0 | 1 | Latin | +| Warner 25 Anos | Antônio Carlos Jobim | 14 | 0 | 1 | Jazz | +| Sambas De Enredo 2001 | Various Artists | 14 | 0 | 1 | Latin | +| BBC Sessions [Disc 1] [Live] | Led Zeppelin | 14 | 0 | 1 | Rock | +| Carnaval 2001 | Various Artists | 14 | 0 | 1 | Soundtrack | +| Deixa Entrar | Falamansa | 14 | 0 | 1 | Latin | +| Revelations | Audioslave | 14 | 0 | 1 | Alternative | +| Minha História | Os Mutantes | 14 | 0 | 1 | Alternative & Punk | +| Audioslave | Audioslave | 14 | 0 | 1 | Rock | +| Tribute | Ozzy Osbourne | 14 | 0 | 1 | Metal | +| Lulu Santos - RCA 100 Anos De Música - Álbum 02 | Lulu Santos | 14 | 0 | 1 | Latin | +| Retrospective I 
(1974-1980) | Rush | 14 | 0 | 1 | Rock | +| Vozes do MPB | Various Artists | 14 | 0 | 1 | Latin | +| Axé Bahia 2001 | Various Artists | 14 | 0 | 1 | Pop | +| Meus Momentos | Gonzaguinha | 14 | 0 | 1 | Latin | +| Cidade Negra - Hits | Cidade Negra | 14 | 0 | 1 | Reggae | +| Lulu Santos - RCA 100 Anos De Música - Álbum 01 | Lulu Santos | 14 | 0 | 1 | Latin | +| Use Your Illusion II | Guns N' Roses | 14 | 0 | 1 | Metal | +| The Best Of 1980-1990 | U2 | 14 | 0 | 1 | Rock | +| Raul Seixas | Raul Seixas | 14 | 0 | 1 | Rock | +| Angel Dust | Faith No More | 14 | 0 | 1 | Alternative & Punk | +| As Canções de Eu Tu Eles | Gilberto Gil | 14 | 0 | 1 | Soundtrack | +| Original Soundtracks 1 | Passengers | 14 | 0 | 1 | Soundtrack | +| The Best of Ed Motta | Ed Motta | 14 | 0 | 1 | Latin | +| New Adventures In Hi-Fi | R.E.M. | 14 | 0 | 1 | Rock | +| The Police Greatest Hits | The Police | 14 | 0 | 1 | Rock | +| Load | Metallica | 14 | 0 | 1 | Metal | +| UB40 The Best Of - Volume Two [UK] | UB40 | 14 | 0 | 1 | Reggae | +| Carry On | Chris Cornell | 14 | 0 | 1 | Alternative | +| Miles Ahead | Miles Davis | 14 | 0 | 1 | Jazz | +| Jorge Ben Jor 25 Anos | Jorge Ben | 14 | 0 | 1 | Latin | +| No Security | The Rolling Stones | 14 | 0 | 1 | Rock | +| Milton Nascimento Ao Vivo | Milton Nascimento | 13 | 0 | 1 | Latin | +| Supernatural | Santana | 13 | 0 | 1 | Rock | +| ReLoad | Metallica | 13 | 0 | 1 | Metal | +| The Essential Miles Davis [Disc 1] | Miles Davis | 13 | 0 | 1 | Jazz | +| Get Born | JET | 13 | 0 | 1 | Alternative & Punk | +| The Colour And The Shape | Foo Fighters | 13 | 0 | 1 | Rock | +| Da Lama Ao Caos | Chico Science & Nação Zumbi | 13 | 0 | 1 | Latin | +| Djavan Ao Vivo - Vol. 1 | Djavan | 13 | 0 | 1 | Latin | +| Jagged Little Pill | Alanis Morissette | 13 | 0 | 1 | Rock | +| Pearl Jam | Pearl Jam | 13 | 0 | 1 | Alternative & Punk | +| Alcohol Fueled Brewtality Live! 
[Disc 1] | Black Label Society | 13 | 0 | 1 | Metal | +| Minas | Milton Nascimento | 13 | 0 | 1 | Latin | +| Blue Moods | Incognito | 13 | 0 | 1 | Jazz | +| American Idiot | Green Day | 13 | 0 | 1 | Alternative & Punk | +| Djavan Ao Vivo - Vol. 02 | Djavan | 13 | 0 | 1 | Latin | +| Contraband | Velvet Revolver | 13 | 0 | 1 | Rock | +| Beyond Good And Evil | The Cult | 12 | 0 | 1 | Rock | +| Walking Into Clarksdale | Page & Plant | 12 | 0 | 1 | Rock | +| BackBeat Soundtrack | BackBeat | 12 | 0 | 1 | Rock And Roll | +| Into The Light | David Coverdale | 12 | 0 | 1 | Rock | +| Vs. | Pearl Jam | 12 | 0 | 1 | Rock | +| Purpendicular | Deep Purple | 12 | 0 | 1 | Rock | +| 20th Century Masters - The Millennium Collection: The Best of Scorpions | Scorpions | 12 | 0 | 1 | Rock | +| Appetite for Destruction | Guns N' Roses | 12 | 0 | 1 | Rock | +| Faceless | Godsmack | 12 | 0 | 1 | Metal | +| Demorou... | Mônica Marianno | 12 | 0 | 1 | World | +| Speak of the Devil | Ozzy Osbourne | 12 | 0 | 1 | Rock | +| Heart of the Night | Spyro Gyra | 12 | 0 | 1 | Jazz | +| Back to Black | Amy Winehouse | 12 | 0 | 1 | R&B/Soul | +| Album Of The Year | Faith No More | 12 | 0 | 1 | Alternative & Punk | +| Black Album | Metallica | 12 | 0 | 1 | Metal | +| Hot Rocks, 1964-1971 (Disc 1) | The Rolling Stones | 12 | 0 | 1 | Rock | +| Achtung Baby | U2 | 12 | 0 | 1 | Rock | +| Core | Stone Temple Pilots | 12 | 0 | 1 | Rock | +| Van Halen III | Van Halen | 12 | 0 | 1 | Rock | +| A Real Dead One | Iron Maiden | 12 | 0 | 1 | Metal | +| Pop | U2 | 12 | 0 | 1 | Rock | +| Nevermind | Nirvana | 12 | 0 | 1 | Rock | +| Arquivo II | Os Paralamas Do Sucesso | 12 | 0 | 1 | Latin | +| Facelift | Alice In Chains | 12 | 0 | 1 | Rock | +| Maquinarama | Skank | 12 | 0 | 1 | Rock | +| Diver Down | Van Halen | 12 | 0 | 1 | Rock | +| Fear Of The Dark | Iron Maiden | 12 | 0 | 1 | Rock | +| Out Of Exile | Audioslave | 12 | 0 | 1 | Alternative & Punk | +| Jota Quest-1995 | Jota Quest | 12 | 0 | 1 | Latin | +| O Samba 
Poconé | Skank | 11 | 0 | 1 | Rock | +| A Real Live One | Iron Maiden | 11 | 0 | 1 | Metal | +| Ten | Pearl Jam | 11 | 0 | 1 | Rock | +| Mezmerize | System Of A Down | 11 | 0 | 1 | Metal | +| The Return Of The Space Cowboy | Jamiroquai | 11 | 0 | 1 | Electronica/Dance | +| Green | R.E.M. | 11 | 0 | 1 | Alternative & Punk | +| The Best Of Buddy Guy - The Millenium Collection | Buddy Guy | 11 | 0 | 1 | Blues | +| Garage Inc. (Disc 1) | Metallica | 11 | 0 | 1 | Metal | +| The Doors | The Doors | 11 | 0 | 1 | Rock | +| One By One | Foo Fighters | 11 | 0 | 1 | Alternative & Punk | +| Transmission | The Tea Party | 11 | 0 | 1 | Alternative & Punk | +| Frank | Amy Winehouse | 11 | 0 | 1 | Pop | +| News Of The World | Queen | 11 | 0 | 1 | Rock | +| The Real Thing | Faith No More | 11 | 0 | 1 | Alternative & Punk | +| Van Halen | Van Halen | 11 | 0 | 1 | Rock | +| The X Factor | Iron Maiden | 11 | 0 | 1 | Rock | +| How To Dismantle An Atomic Bomb | U2 | 11 | 0 | 1 | Rock | +| Dance Of Death | Iron Maiden | 11 | 0 | 1 | Heavy Metal | +| Synkronized | Jamiroquai | 11 | 0 | 1 | R&B/Soul | +| Chemical Wedding | Bruce Dickinson | 11 | 0 | 1 | Metal | +| Knocking at Your Back Door: The Best Of Deep Purple in the 80's | Deep Purple | 11 | 0 | 1 | Rock | +| A Matter of Life and Death | Iron Maiden | 11 | 0 | 1 | Rock | +| Out Of Time | R.E.M. Feat. Kate Pearson | 11 | 0 | 1 | Alternative & Punk | +| All That You Can't Leave Behind | U2 | 11 | 0 | 1 | Rock | +| St. 
Anger | Metallica | 11 | 0 | 1 | Metal | +| BBC Sessions [Disc 2] [Live] | Led Zeppelin | 10 | 0 | 1 | Rock | +| War | U2 | 10 | 0 | 1 | Rock | +| The Battle Rages On | Deep Purple | 10 | 0 | 1 | Rock | +| Temple of the Dog | Temple of the Dog | 10 | 0 | 1 | Alternative | +| For Those About To Rock We Salute You | AC/DC | 10 | 0 | 1 | Rock | +| Live At Donington 1992 (Disc 1) | Iron Maiden | 10 | 0 | 1 | Rock | +| Led Zeppelin III | Led Zeppelin | 10 | 0 | 1 | Rock | +| Emergency On Planet Earth | Jamiroquai | 10 | 0 | 1 | Rock | +| The Essential Miles Davis [Disc 2] | Miles Davis | 10 | 0 | 1 | Jazz | +| In Step | Stevie Ray Vaughan & Double Trouble | 10 | 0 | 1 | Blues | +| Killers | Iron Maiden | 10 | 0 | 1 | Heavy Metal | +| The Best Of Men At Work | Men At Work | 10 | 0 | 1 | Rock | +| Zooropa | U2 | 10 | 0 | 1 | Rock | +| In Your Honor [Disc 1] | Foo Fighters | 10 | 0 | 1 | Rock | +| In Your Honor [Disc 2] | Foo Fighters | 10 | 0 | 1 | Rock | +| Rock In Rio [CD1] | Iron Maiden | 10 | 0 | 1 | Metal | +| Live [Disc 1] | The Black Crowes | 10 | 0 | 1 | Blues | +| Live At Donington 1992 (Disc 2) | Iron Maiden | 10 | 0 | 1 | Rock | +| No Prayer For The Dying | Iron Maiden | 10 | 0 | 1 | Metal | +| Black Sabbath Vol. 
4 (Remaster) | Black Sabbath | 10 | 0 | 1 | Metal | +| Surfing with the Alien (Remastered) | Joe Satriani | 10 | 0 | 1 | Rock | +| Brave New World | Iron Maiden | 10 | 0 | 1 | Rock | +| Na Pista | Cláudio Zoli | 10 | 0 | 1 | Latin | +| The Beast Live | Paul D'Ianno | 10 | 0 | 1 | Rock | +| Cesta Básica | Raimundos | 10 | 0 | 1 | Alternative & Punk | +| Misplaced Childhood | Marillion | 10 | 0 | 1 | Rock | +| Kill 'Em All | Metallica | 10 | 0 | 1 | Metal | +| Physical Graffiti [Disc 2] | Led Zeppelin | 9 | 0 | 1 | Rock | +| Morning Dance | Spyro Gyra | 9 | 0 | 1 | Jazz | +| Piece Of Mind | Iron Maiden | 9 | 0 | 1 | Metal | +| Dark Side Of The Moon | Pink Floyd | 9 | 0 | 1 | Rock | +| Outbreak | Dennis Chambers | 9 | 0 | 1 | Jazz | +| ...And Justice For All | Metallica | 9 | 0 | 1 | Metal | +| Led Zeppelin II | Led Zeppelin | 9 | 0 | 1 | Rock | +| Come Taste The Band | Deep Purple | 9 | 0 | 1 | Rock | +| Stormbringer | Deep Purple | 9 | 0 | 1 | Rock | +| Slaves And Masters | Deep Purple | 9 | 0 | 1 | Rock | +| Iron Maiden | Iron Maiden | 9 | 0 | 1 | Blues | +| Bongo Fury | Frank Zappa & Captain Beefheart | 9 | 0 | 1 | Rock | +| Live [Disc 2] | The Black Crowes | 9 | 0 | 1 | Blues | +| Led Zeppelin I | Led Zeppelin | 9 | 0 | 1 | Rock | +| Virtual XI | Iron Maiden | 8 | 0 | 1 | Rock | +| Plays Metallica By Four Cellos | Apocalyptica | 8 | 0 | 1 | Metal | +| Master Of Puppets | Metallica | 8 | 0 | 1 | Metal | +| IV | Led Zeppelin | 8 | 0 | 1 | Rock | +| Coda | Led Zeppelin | 8 | 0 | 1 | Rock | +| Santana - As Years Go By | Santana | 8 | 0 | 1 | Rock | +| Somewhere in Time | Iron Maiden | 8 | 0 | 1 | Metal | +| Seventh Son of a Seventh Son | Iron Maiden | 8 | 0 | 1 | Metal | +| Powerslave | Iron Maiden | 8 | 0 | 1 | Metal | +| Let There Be Rock | AC/DC | 8 | 0 | 1 | Rock | +| Houses Of The Holy | Led Zeppelin | 8 | 0 | 1 | Rock | +| The Best Of Billy Cobham | Billy Cobham | 8 | 0 | 1 | Jazz | +| Ride The Lightning | Metallica | 8 | 0 | 1 | Metal | +| Fireball | Deep 
Purple | 7 | 0 | 1 | Rock | +| Deep Purple In Rock | Deep Purple | 7 | 0 | 1 | Rock | +| In Through The Out Door | Led Zeppelin | 7 | 0 | 1 | Rock | +| MK III The Final Concerts [Disc 1] | Deep Purple | 7 | 0 | 1 | Rock | +| [1997] Black Light Syndrome | Terry Bozzio, Tony Levin & Steve Stevens | 7 | 0 | 1 | Rock | +| Black Sabbath | Black Sabbath | 7 | 0 | 1 | Metal | +| Machine Head | Deep Purple | 7 | 0 | 1 | Rock | +| Presence | Led Zeppelin | 7 | 0 | 1 | Rock | +| Physical Graffiti [Disc 1] | Led Zeppelin | 6 | 0 | 1 | Rock | +| Santana Live | Santana | 6 | 0 | 1 | Rock | +| The Office, Season 1 | The Office | 6 | 0 | 1 | TV Shows | +| Alcohol Fueled Brewtality Live! [Disc 2] | Black Label Society | 5 | 0 | 1 | Metal | +| The Song Remains The Same (Disc 1) | Led Zeppelin | 5 | 0 | 1 | Rock | +| The Final Concerts (Disc 2) | Deep Purple | 4 | 0 | 1 | Rock | +| The Song Remains The Same (Disc 2) | Led Zeppelin | 4 | 0 | 1 | Rock | +| Restless and Wild | Accept | 3 | 0 | 1 | Rock | +| Quanta Gente Veio ver--Bônus De Carnaval | Gilberto Gil | 3 | 0 | 1 | Jazz | +| Sozinho Remix Ao Vivo | Caetano Veloso | 3 | 0 | 1 | Latin | +| No More Tears (Remastered) | Ozzy Osbourne | 2 | 0 | 1 | Rock | +| English Renaissance | The King's Singers | 2 | 0 | 1 | Classical | +| Muso Ko | Habib Koité and Bamada | 2 | 0 | 1 | World | +| Every Kind of Light | The Posies | 2 | 0 | 1 | Rock | +| Quiet Songs | Aisha Duo | 2 | 0 | 1 | Jazz | +| The World of Classical Favourites | Academy of St. Martin in the Fields & Sir Neville Marriner | 2 | 0 | 1 | Classical | +| Blizzard of Ozz | Ozzy Osbourne | 2 | 0 | 1 | Rock | +| Realize | Karsh Kale | 2 | 0 | 1 | Electronica/Dance | +| Great Performances - Barber's Adagio and Other Romantic Favorites for Strings | Leonard Bernstein & New York Philharmonic | 1 | 0 | 1 | Classical | +| Locatelli: Concertos for Violin, Strings and Continuo, Vol. 
3 | Mela Tenenbaum, Pro Musica Prague & Richard Kapp | 1 | 0 | 1 | Classical | +| Beethoven Piano Sonatas: Moonlight & Pastorale | Maurizio Pollini | 1 | 0 | 1 | Classical | +| Mascagni: Cavalleria Rusticana | James Levine | 1 | 0 | 1 | Classical | +| Monteverdi: L'Orfeo | C. Monteverdi, Nigel Rogers - Chiaroscuro; London Baroque; London Cornett & Sackbu | 1 | 0 | 1 | Classical | +| Great Opera Choruses | Chicago Symphony Chorus, Chicago Symphony Orchestra & Sir Georg Solti | 1 | 0 | 1 | Classical | +| Liszt - 12 Études D'Execution Transcendante | Michele Campanella | 1 | 0 | 1 | Classical | +| Tchaikovsky: 1812 Festival Overture, Op.49, Capriccio Italien & Beethoven: Wellington's Victory | Antal Doráti & London Symphony Orchestra | 1 | 0 | 1 | Classical | +| Bach: Goldberg Variations | Wilhelm Kempff | 1 | 0 | 1 | Classical | +| Pachelbel: Canon & Gigue | English Concert & Trevor Pinnock | 1 | 0 | 1 | Classical | +| Diary of a Madman (Remastered) | Ozzy Osbourne | 1 | 0 | 1 | Rock | +| Handel: Music for the Royal Fireworks (Original Version 1749) | English Concert & Trevor Pinnock | 1 | 0 | 1 | Classical | +| Mozart Gala: Famous Arias | Sir Georg Solti, Sumi Jo & Wiener Philharmoniker | 1 | 0 | 1 | Opera | +| Battlestar Galactica: The Story So Far | Battlestar Galactica | 1 | 0 | 1 | Science Fiction | +| Mozart: Chamber Music | Nash Ensemble | 1 | 0 | 1 | Classical | +| A Copland Celebration, Vol. 
I | Aaron Copland & London Symphony Orchestra | 1 | 0 | 1 | Classical | +| Elgar: Cello Concerto & Vaughan Williams: Fantasias | Felix Schmidt, London Symphony Orchestra & Rafael Frühbeck de Burgos | 1 | 0 | 1 | Classical | +| Worlds | Aaron Goldberg | 1 | 0 | 1 | Jazz | +| Pavarotti's Opera Made Easy | Luciano Pavarotti | 1 | 0 | 1 | Classical | +| Allegri: Miserere | Richard Marlow & The Choir of Trinity College, Cambridge | 1 | 0 | 1 | Classical | +| Grieg: Peer Gynt Suites & Sibelius: Pelléas et Mélisande | Berliner Philharmoniker & Herbert Von Karajan | 1 | 0 | 1 | Classical | +| Palestrina: Missa Papae Marcelli & Allegri: Miserere | Choir Of Westminster Abbey & Simon Preston | 1 | 0 | 1 | Classical | +| Mendelssohn: A Midsummer Night's Dream | Philharmonia Orchestra & Sir Neville Marriner | 1 | 0 | 1 | Classical | +| Bach: Toccata & Fugue in D Minor | Ton Koopman | 1 | 0 | 1 | Classical | +| Scheherazade | Chicago Symphony Orchestra & Fritz Reiner | 1 | 0 | 1 | Classical | +| Chopin: Piano Concertos Nos. 
1 & 2 | Emanuel Ax, Eugene Ormandy & Philadelphia Orchestra | 1 | 0 | 1 | Classical | +| Puccini: Madama Butterfly - Highlights | Herbert Von Karajan, Mirella Freni & Wiener Philharmoniker | 1 | 0 | 1 | Classical | +| Carmina Burana | Boston Symphony Orchestra & Seiji Ozawa | 1 | 0 | 1 | Classical | +| Bizet: Carmen Highlights | Chor der Wiener Staatsoper, Herbert Von Karajan & Wiener Philharmoniker | 1 | 0 | 1 | Classical | +| Aquaman | Aquaman | 1 | 0 | 1 | TV Shows | +| Bach: Violin Concertos | Hilary Hahn, Jeffrey Kahane, Los Angeles Chamber Orchestra & Margaret Batjer | 1 | 0 | 1 | Classical | +| Cake: B-Sides and Rarities | Cake | 1 | 0 | 1 | Alternative | +| Sibelius: Finlandia | Berliner Philharmoniker & Hans Rosbaud | 1 | 0 | 1 | Classical | +| Armada: Music from the Courts of England and Spain | Fretwork | 1 | 0 | 1 | Classical | +| Balls to the Wall | Accept | 1 | 0 | 1 | Rock | +| Haydn: Symphonies 99 - 104 | Royal Philharmonic Orchestra & Sir Thomas Beecham | 1 | 0 | 1 | Classical | +| Respighi:Pines of Rome | Eugene Ormandy | 1 | 0 | 1 | Classical | +| Un-Led-Ed | Dread Zeppelin | 1 | 0 | 1 | Rock | +| Purcell: The Fairy Queen | Roger Norrington, London Classical Players | 1 | 0 | 1 | Classical | +| Koyaanisqatsi (Soundtrack from the Motion Picture) | Philip Glass Ensemble | 1 | 0 | 1 | Soundtrack | +| Szymanowski: Piano Works, Vol. 1 | Martin Roscoe | 1 | 0 | 1 | Classical | +| Carried to Dust (Bonus Track Version) | Calexico | 1 | 0 | 1 | Alternative | +| Mozart: Symphonies Nos. 40 & 41 | Berliner Philharmoniker & Herbert Von Karajan | 1 | 0 | 1 | Classical | +| Bartok: Violin & Viola Concertos | Yehudi Menuhin | 1 | 0 | 1 | Classical | +| Bach: The Cello Suites | Yo-Yo Ma | 1 | 0 | 1 | Classical | +| Duos II | Luciana Souza/Romero Lubambo | 1 | 0 | 1 | Latin | +| Fauré: Requiem, Ravel: Pavane & Others | Academy of St. 
Martin in the Fields, John Birch, Sir Neville Marriner & Sylvia McNair | 1 | 0 | 1 | Classical | +| Adorate Deum: Gregorian Chant from the Proper of the Mass | Alberto Turco & Nova Schola Gregoriana | 1 | 0 | 1 | Classical | +| Bark at the Moon (Remastered) | Ozzy Osbourne | 1 | 0 | 1 | Rock | +| Schubert: The Late String Quartets & String Quintet (3 CD's) | Emerson String Quartet | 1 | 0 | 1 | Classical | +| The Last Night of the Proms | Barry Wordsworth & BBC Concert Orchestra | 1 | 0 | 1 | Classical | +| The Best of Beethoven | Nicolaus Esterhazy Sinfonia | 1 | 0 | 1 | Classical | +| SCRIABIN: Vers la flamme | Christopher O'Riley | 1 | 0 | 1 | Classical | +| Tchaikovsky: The Nutcracker | London Symphony Orchestra & Sir Charles Mackerras | 1 | 0 | 1 | Classical | +| Beethoven: Symphony No. 6 'Pastoral' Etc. | Otto Klemperer & Philharmonia Orchestra | 1 | 0 | 1 | Classical | +| Prokofiev: Romeo & Juliet | Michael Tilson Thomas & San Francisco Symphony | 1 | 0 | 1 | Classical | +| Vivaldi: The Four Seasons | Anne-Sophie Mutter, Herbert Von Karajan & Wiener Philharmoniker | 1 | 0 | 1 | Classical | +| Prokofiev: Symphony No.1 | Sergei Prokofiev & Yuri Temirkanov | 1 | 0 | 1 | Classical | +| Handel: The Messiah (Highlights) | Scholars Baroque Ensemble | 1 | 0 | 1 | Classical | +| Beethoven: Symhonies Nos. 5 & 6 | Orchestre Révolutionnaire et Romantique & John Eliot Gardiner | 1 | 0 | 1 | Classical | +| Purcell: Music for the Queen Mary | Equale Brass Ensemble, John Eliot Gardiner & Munich Monteverdi Orchestra and Choir | 1 | 0 | 1 | Classical | +| Sir Neville Marriner: A Celebration | Academy of St. Martin in the Fields Chamber Ensemble & Sir Neville Marriner | 1 | 0 | 1 | Classical | +| Berlioz: Symphonie Fantastique | Michael Tilson Thomas & San Francisco Symphony | 1 | 0 | 1 | Classical | +| Bach: Orchestral Suites Nos. 1 - 4 | Academy of St. 
Martin in the Fields, Sir Neville Marriner & Thurston Dart | 1 | 0 | 1 | Classical | +| Charpentier: Divertissements, Airs & Concerts | Les Arts Florissants & William Christie | 1 | 0 | 1 | Classical | +| Górecki: Symphony No. 3 | Adrian Leaper & Doreen de Feis | 1 | 0 | 1 | Classical | +| Great Recordings of the Century - Shubert: Schwanengesang, 4 Lieder | Gerald Moore | 1 | 0 | 1 | Classical | +| J.S. Bach: Chaconne, Suite in E Minor, Partita in E Major & Prelude, Fugue and Allegro | Julian Bream | 1 | 0 | 1 | Classical | +| Bach: The Brandenburg Concertos | Orchestra of The Age of Enlightenment | 1 | 0 | 1 | Classical | +| Great Recordings of the Century: Paganini's 24 Caprices | Itzhak Perlman | 1 | 0 | 1 | Classical | +| Wagner: Favourite Overtures | Sir Georg Solti & Wiener Philharmoniker | 1 | 0 | 1 | Classical | +| South American Getaway | The 12 Cellists of The Berlin Philharmonic | 1 | 0 | 1 | Classical | +| A Soprano Inspired | Britten Sinfonia, Ivor Bolton & Lesley Garrett | 1 | 0 | 1 | Classical | +| Great Recordings of the Century - Mahler: Das Lied von der Erde | Gustav Mahler | 1 | 0 | 1 | Classical | +| Mozart: Wind Concertos | Berliner Philharmoniker, Claudio Abbado & Sabine Meyer | 1 | 0 | 1 | Classical | +| Nielsen: The Six Symphonies | Göteborgs Symfoniker & Neeme Järvi | 1 | 0 | 1 | Classical | +| Weill: The Seven Deadly Sins | Kent Nagano and Orchestre de l'Opéra de Lyon | 1 | 0 | 1 | Classical | +| Prokofiev: Symphony No.5 & Stravinksy: Le Sacre Du Printemps | Berliner Philharmoniker & Herbert Von Karajan | 1 | 0 | 1 | Classical | +| Holst: The Planets, Op. 
32 & Vaughan Williams: Fantasies | Eugene Ormandy | 1 | 0 | 1 | Classical | +| Strauss: Waltzes | Eugene Ormandy | 1 | 0 | 1 | Classical | +| The Ultimate Relexation Album | Charles Dutoit & L'Orchestre Symphonique de Montréal | 1 | 0 | 1 | Classical | +| Adams, John: The Chairman Dances | Edo de Waart & San Francisco Symphony | 1 | 0 | 1 | Classical | diff --git a/tests/chinook/dataset/track_10/task.txt b/tests/chinook/dataset/track_10/task.txt new file mode 100644 index 00000000..65721606 --- /dev/null +++ b/tests/chinook/dataset/track_10/task.txt @@ -0,0 +1,3 @@ +Find albums with the highest track count variance and their genre distribution. +Show album title, artist, total tracks, track count variance, genre count, and +most common genre. From 2724d8d8d26607da9ad4be4c5197c52b38b0a7e9 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 11 Sep 2025 13:26:09 -0700 Subject: [PATCH 04/23] generically come up with IDs --- eval_protocol/pytest/parameterize.py | 17 +++++---- tests/pytest/test_parameterized_ids.py | 53 +++++++++++++++++++------- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py index 030a80fb..cba8f65c 100644 --- a/eval_protocol/pytest/parameterize.py +++ b/eval_protocol/pytest/parameterize.py @@ -33,7 +33,7 @@ def generate_id(self, combo: CombinationTuple) -> str | None: class DefaultParameterIdGenerator: """Default ID generator that creates meaningful IDs from parameter combinations.""" - def __init__(self, max_length: int = 50): + def __init__(self, max_length: int = 200): """Initialize the ID generator with configuration options. 
Args: @@ -45,13 +45,16 @@ def generate_id(self, combo: CombinationTuple) -> str | None: """Generate an ID for a parameter combination.""" dataset, completion_params, messages, rows, evaluation_test_kwargs = combo - # Add model name if available if completion_params: - model = completion_params.get("model") - if model: - # Extract just the model name, not the full path - model_name = model.split("/")[-1] if "/" in model else model - id_str = f"model-{model_name}" + # Get all string, numeric, and boolean values from completion_params, sorted by key + str_values = [] + for key in sorted(completion_params.keys()): + value = completion_params[key] + if isinstance(value, (str, int, float, bool)): + str_values.append(str(value)) + + if str_values: + id_str = ":".join(str_values) # Truncate if too long if len(id_str) > self.max_length: diff --git a/tests/pytest/test_parameterized_ids.py b/tests/pytest/test_parameterized_ids.py index ff1726d7..b182bfe5 100644 --- a/tests/pytest/test_parameterized_ids.py +++ b/tests/pytest/test_parameterized_ids.py @@ -43,34 +43,59 @@ def test_parameterized_ids(row: EvaluationRow) -> EvaluationRow: # Extract the IDs from the parametrize mark ids = parametrize_mark.kwargs.get("ids") if ids is not None: - # Should have IDs for models but not for temperature-only params - expected_ids = ["model-gpt-oss-120b", "model-gpt-4"] + # Should have IDs for all parameters that have string/numeric values + expected_ids = ["fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "gpt-4", "0.5"] assert list(ids) == expected_ids, f"Expected {expected_ids}, got {list(ids)}" def test_default_id_generator(): - """Test the DefaultParameterIdGenerator with various model names.""" + """Test the DefaultParameterIdGenerator with various parameter combinations.""" generator = DefaultParameterIdGenerator() # Test with full model path combo1 = (None, {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}, None, None, None) id1 = generator.generate_id(combo1) - 
assert id1 == "model-gpt-oss-120b" + assert id1 == "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" # Test with simple model name combo2 = (None, {"model": "gpt-4"}, None, None, None) id2 = generator.generate_id(combo2) - assert id2 == "model-gpt-4" + assert id2 == "gpt-4" - # Test with no model - combo3 = (None, {"temperature": 0.5}, None, None, None) + # Test with multiple string parameters + combo3 = (None, {"model": "gpt-4", "stream": "true", "temperature": "0.7"}, None, None, None) id3 = generator.generate_id(combo3) - assert id3 is None + assert id3 == "gpt-4:true:0.7" - # Test with None completion_params - combo4 = (None, None, None, None, None) + # Test with mixed string and numeric parameters + combo4 = (None, {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100}, None, None, None) id4 = generator.generate_id(combo4) - assert id4 is None + assert id4 == "100:gpt-4:0.7" # Keys are sorted alphabetically: max_tokens, model, temperature + + # Test with only numeric values + combo5 = (None, {"temperature": 0.5, "max_tokens": 100}, None, None, None) + id5 = generator.generate_id(combo5) + assert id5 == "100:0.5" # Keys are sorted alphabetically: max_tokens, temperature + + # Test with boolean values + combo6 = (None, {"stream": True, "echo": False}, None, None, None) + id6 = generator.generate_id(combo6) + assert id6 == "False:True" # Keys are sorted alphabetically: echo, stream + + # Test with mixed string, numeric, and boolean values + combo7 = (None, {"model": "gpt-4", "temperature": 0.7, "stream": True}, None, None, None) + id7 = generator.generate_id(combo7) + assert id7 == "gpt-4:True:0.7" # Keys are sorted alphabetically: model, stream, temperature + + # Test with no supported values (only non-supported types like lists, dicts) + combo8 = (None, {"messages": [{"role": "user"}], "config": {"key": "value"}}, None, None, None) + id8 = generator.generate_id(combo8) + assert id8 is None + + # Test with None completion_params + combo9 = (None, None, 
None, None, None) + id9 = generator.generate_id(combo9) + assert id9 is None def test_pytest_parametrize_with_custom_id_generator(): @@ -80,7 +105,7 @@ def test_pytest_parametrize_with_custom_id_generator(): combinations = [ (None, {"model": "gpt-4"}, None, None, None), (None, {"model": "claude-3"}, None, None, None), - (None, {"temperature": 0.5}, None, None, None), # No model + (None, {"temperature": 0.5}, None, None, None), # Only numeric values ] # Test with default generator @@ -95,7 +120,7 @@ def test_pytest_parametrize_with_custom_id_generator(): assert result["argnames"] == ["completion_params"] assert len(list(result["argvalues"])) == 3 - assert result["ids"] == ["model-gpt-4", "model-claude-3"] # None for no model + assert result["ids"] == ["gpt-4", "claude-3", "0.5"] # All have string/numeric values def test_id_generator_max_length(): @@ -105,5 +130,5 @@ def test_id_generator_max_length(): # Test with long model name combo = (None, {"model": "very-long-model-name-that-exceeds-max-length"}, None, None, None) id_str = generator.generate_id(combo) - assert id_str == "model-v..." + assert id_str == "very-lo..." 
assert len(id_str) <= 10 From fa5a70d377b3fd52ccad625a22d36353d3fa8f66 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 11 Sep 2025 14:37:09 -0700 Subject: [PATCH 05/23] fix folder name --- tests/chinook/dataset/{track_10 => task_10}/ground_truth.md | 0 tests/chinook/dataset/{track_10 => task_10}/task.txt | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/chinook/dataset/{track_10 => task_10}/ground_truth.md (100%) rename tests/chinook/dataset/{track_10 => task_10}/task.txt (100%) diff --git a/tests/chinook/dataset/track_10/ground_truth.md b/tests/chinook/dataset/task_10/ground_truth.md similarity index 100% rename from tests/chinook/dataset/track_10/ground_truth.md rename to tests/chinook/dataset/task_10/ground_truth.md diff --git a/tests/chinook/dataset/track_10/task.txt b/tests/chinook/dataset/task_10/task.txt similarity index 100% rename from tests/chinook/dataset/track_10/task.txt rename to tests/chinook/dataset/task_10/task.txt From e7b09f93983e08514ee5c98966713bee55b2aec7 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 15 Sep 2025 14:02:26 -0700 Subject: [PATCH 06/23] save --- .../default_pydantic_ai_rollout_processor.py | 26 +++++++++++++++--- .../pydantic/test_pydantic_complex_queries.py | 27 ++++++++++++++----- ...test_pydantic_complex_queries_responses.py | 3 +-- 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index 9b9a4353..94ed388a 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -13,7 +13,7 @@ from openai.types.chat import ChatCompletion, ChatCompletionMessage, ChatCompletionMessageParam from openai.types.chat.chat_completion import Choice as ChatCompletionChoice from pydantic import TypeAdapter -from pydantic_ai import Agent +from pydantic_ai import Agent, ModelSettings from 
pydantic_ai._utils import generate_tool_call_id from pydantic_ai.messages import ModelMessage from pydantic_ai.messages import ( @@ -46,7 +46,6 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> """Create agent rollout tasks and return them for external handling.""" semaphore = config.semaphore - agent = self._setup_agent(config) async def process_row(row: EvaluationRow) -> EvaluationRow: @@ -70,7 +69,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: row.tools = tools model_messages = [self.convert_ep_message_to_pyd_message(m, row) for m in row.messages] - response = await agent.run(message_history=model_messages, usage_limits=config.kwargs.get("usage_limits")) + settings = self.construct_model_settings(agent, row) + response = await agent.run( + message_history=model_messages, usage_limits=config.kwargs.get("usage_limits"), model_settings=settings + ) row.messages = await self.convert_pyd_message_to_ep_message(response.all_messages()) # TODO: pydantic ai accumulates usage info across all models in multi-agent setup, so this simple tracking doesn't work for cost. to discuss with @dphuang2 when he's back. 
@@ -98,6 +100,24 @@ async def convert_pyd_message_to_ep_message(self, messages: list[ModelMessage]) oai_messages: list[ChatCompletionMessageParam] = await self._util._map_messages(messages) return [Message(**m) for m in oai_messages] # pyright: ignore[reportArgumentType] + def construct_model_settings(self, agent: Agent, row: EvaluationRow) -> ModelSettings: + model = agent.model + if model and not isinstance(model, str) and model.settings: + # We must copy model settings to avoid concurrency issues by modifying the same object in-place + settings = model.settings.copy() + if settings is None: + settings = ModelSettings() + settings["extra_body"] = settings.get("extra_body", {}) + extra_body = settings["extra_body"] + if isinstance(extra_body, dict): + extra_body["metadata"] = settings.get("metadata", {}) + extra_body["metadata"]["row_id"] = row.input_metadata.row_id + extra_body["metadata"]["invocation_id"] = row.execution_metadata.invocation_id + extra_body["metadata"]["rollout_id"] = row.execution_metadata.rollout_id + extra_body["metadata"]["run_id"] = row.execution_metadata.run_id + extra_body["metadata"]["experiment_id"] = row.execution_metadata.experiment_id + return settings + def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow) -> ModelMessage: if message.role == "assistant": type_adapter = TypeAdapter(ChatCompletionMessage) diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries.py b/tests/chinook/pydantic/test_pydantic_complex_queries.py index 439e4b18..6a29a1f6 100644 --- a/tests/chinook/pydantic/test_pydantic_complex_queries.py +++ b/tests/chinook/pydantic/test_pydantic_complex_queries.py @@ -1,7 +1,7 @@ import os from pydantic import BaseModel from pydantic_ai import Agent -from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.models.openai import OpenAIChatModel, OpenAIChatModelSettings import pytest from eval_protocol.models import EvaluateResult, EvaluationRow @@ -14,15 +14,18 @@ 
LLM_JUDGE_PROMPT = ( "Your job is to compare the response to the expected answer.\n" "The response will be a narrative report of the query results.\n" - "If the response contains the same or well summarized information as the expected answer, return 1.0.\n" - "If the response does not contain the same information or is missing information, return 0.0." + "Return a score between 0.0 and 1.0, where 1.0 means the response contains all or well summarized information as the expected answer, " + "0.0 means the response does not contain the same information or is missing all key information, " + "and values in between represent partial credit for responses that are partially correct or contain some but not all of the expected information." ) def agent_factory(config: RolloutProcessorConfig) -> Agent: model_name = config.completion_params["model"] - provider = config.completion_params["provider"] - model = OpenAIChatModel(model_name, provider=provider) + provider = config.completion_params.get("provider") + reasoning = config.completion_params.get("reasoning") + settings = OpenAIChatModelSettings(openai_reasoning_effort=reasoning) + model = OpenAIChatModel(model_name, provider=provider or "openai", settings=settings) return setup_agent(model) @@ -38,8 +41,19 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: "model": "accounts/fireworks/models/kimi-k2-instruct", "provider": "fireworks", }, + { + "model": "accounts/fireworks/models/deepseek-v3p1", + "provider": "fireworks", + }, + { + "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + "provider": "fireworks", + }, + {"model": "gpt-5"}, + {"model": "gpt-5", "reasoning": "high"}, ], rollout_processor=PydanticAgentRolloutProcessor(agent_factory), + num_runs=2, ) async def test_pydantic_complex_queries(row: EvaluationRow) -> EvaluationRow: """ @@ -58,8 +72,7 @@ async def test_pydantic_complex_queries(row: EvaluationRow) -> EvaluationRow: ) else: model = OpenAIChatModel( - 
"accounts/fireworks/models/kimi-k2-instruct", - provider="fireworks", + "gpt-5", ) class Response(BaseModel): diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py b/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py index 5a425647..dba824bb 100644 --- a/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py +++ b/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py @@ -19,7 +19,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: model_name = config.completion_params["model"] - reasoning = config.completion_params["reasoning"] + reasoning = config.completion_params.get("reasoning") settings = OpenAIResponsesModelSettings( openai_reasoning_effort=reasoning, ) @@ -37,7 +37,6 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: completion_params=[ { "model": "gpt-5", - "reasoning": "high", }, ], rollout_processor=PydanticAgentRolloutProcessor(agent_factory), From 9bb0fbb45ef25f844165d41ceaec0c25314787a4 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 11:33:21 -0700 Subject: [PATCH 07/23] fix metadata storing logic based on chatcompletions vs. 
responses for pydantic because of "metadata perameter is only allowed when 'store' is enabled" and pydantic has no option to set "store" = True --- .../default_pydantic_ai_rollout_processor.py | 8 +- .../test_pydantic_ai_metadata_handling.py | 110 ++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 tests/pytest/test_pydantic_ai_metadata_handling.py diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index 94ed388a..47b7b456 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -22,7 +22,7 @@ ToolReturnPart, UserPromptPart, ) -from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel from pydantic_ai.providers.openai import OpenAIProvider logger = logging.getLogger(__name__) @@ -102,6 +102,7 @@ async def convert_pyd_message_to_ep_message(self, messages: list[ModelMessage]) def construct_model_settings(self, agent: Agent, row: EvaluationRow) -> ModelSettings: model = agent.model + settings = None if model and not isinstance(model, str) and model.settings: # We must copy model settings to avoid concurrency issues by modifying the same object in-place settings = model.settings.copy() @@ -109,13 +110,16 @@ def construct_model_settings(self, agent: Agent, row: EvaluationRow) -> ModelSet settings = ModelSettings() settings["extra_body"] = settings.get("extra_body", {}) extra_body = settings["extra_body"] - if isinstance(extra_body, dict): + + # Only store metadata for ResponsesModel, not for ChatModel + if isinstance(extra_body, dict) and isinstance(model, OpenAIResponsesModel): extra_body["metadata"] = settings.get("metadata", {}) extra_body["metadata"]["row_id"] = row.input_metadata.row_id extra_body["metadata"]["invocation_id"] = row.execution_metadata.invocation_id 
extra_body["metadata"]["rollout_id"] = row.execution_metadata.rollout_id extra_body["metadata"]["run_id"] = row.execution_metadata.run_id extra_body["metadata"]["experiment_id"] = row.execution_metadata.experiment_id + return settings def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow) -> ModelMessage: diff --git a/tests/pytest/test_pydantic_ai_metadata_handling.py b/tests/pytest/test_pydantic_ai_metadata_handling.py new file mode 100644 index 00000000..9046d6c2 --- /dev/null +++ b/tests/pytest/test_pydantic_ai_metadata_handling.py @@ -0,0 +1,110 @@ +import pytest +from typing import Any, Dict +from unittest.mock import Mock +from pydantic_ai import Agent +from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel +from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata +from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig + + +def test_pydantic_ai_metadata_only_stored_for_responses_model(): + """Test that PydanticAI metadata is only stored in extra_body for ResponsesModel, not for ChatModel.""" + + # Create a test row with metadata + row = EvaluationRow( + input_metadata=InputMetadata(row_id="test-row-123"), + execution_metadata=ExecutionMetadata( + invocation_id="test-invocation-456", + rollout_id="test-rollout-789", + run_id="test-run-101", + experiment_id="test-experiment-202", + ), + messages=[], + ) + + # Test with OpenAIChatModel (should NOT store metadata) + chat_model = OpenAIChatModel("gpt-4") + chat_agent = Agent(model=chat_model) + processor = PydanticAgentRolloutProcessor(lambda config: chat_agent) + + settings = processor.construct_model_settings(chat_agent, row) + + # ChatModel should not have metadata in extra_body + extra_body = settings.get("extra_body", {}) + assert isinstance(extra_body, dict), "extra_body should be a dict" + assert "metadata" not in extra_body, 
"ChatModel should not store metadata in extra_body" + + # Test with OpenAIResponsesModel (should store metadata) + responses_model = OpenAIResponsesModel("gpt-5") + responses_agent = Agent(model=responses_model) + processor_responses = PydanticAgentRolloutProcessor(lambda config: responses_agent) + + settings_responses = processor_responses.construct_model_settings(responses_agent, row) + + # ResponsesModel should have metadata in extra_body + extra_body_responses = settings_responses.get("extra_body", {}) + assert isinstance(extra_body_responses, dict), "extra_body should be a dict" + assert "metadata" in extra_body_responses, "ResponsesModel should store metadata in extra_body" + + metadata = extra_body_responses["metadata"] + assert isinstance(metadata, dict), "metadata should be a dict" + assert metadata["row_id"] == "test-row-123" + assert metadata["invocation_id"] == "test-invocation-456" + assert metadata["rollout_id"] == "test-rollout-789" + assert metadata["run_id"] == "test-run-101" + assert metadata["experiment_id"] == "test-experiment-202" + + +def test_pydantic_ai_metadata_handling_with_string_model(): + """Test that PydanticAI string models don't cause issues with metadata handling.""" + + # Create a test row + row = EvaluationRow( + input_metadata=InputMetadata(row_id="test-row-123"), + execution_metadata=ExecutionMetadata( + invocation_id="test-invocation-456", + rollout_id="test-rollout-789", + run_id="test-run-101", + experiment_id="test-experiment-202", + ), + messages=[], + ) + + # Create agent with string model (should not store metadata) + agent = Agent(model="gpt-4") + processor = PydanticAgentRolloutProcessor(lambda config: agent) + + settings = processor.construct_model_settings(agent, row) + + # String model should not have metadata in extra_body + extra_body = settings.get("extra_body", {}) + assert isinstance(extra_body, dict), "extra_body should be a dict" + assert "metadata" not in extra_body, "String model should not store metadata in 
extra_body" + + +def test_pydantic_ai_metadata_handling_with_none_model(): + """Test that PydanticAI None model doesn't cause issues with metadata handling.""" + + # Create a test row + row = EvaluationRow( + input_metadata=InputMetadata(row_id="test-row-123"), + execution_metadata=ExecutionMetadata( + invocation_id="test-invocation-456", + rollout_id="test-rollout-789", + run_id="test-run-101", + experiment_id="test-experiment-202", + ), + messages=[], + ) + + # Create agent with None model (should not store metadata) + agent = Agent(model=None) + processor = PydanticAgentRolloutProcessor(lambda config: agent) + + settings = processor.construct_model_settings(agent, row) + + # None model should not have metadata in extra_body + extra_body = settings.get("extra_body", {}) + assert isinstance(extra_body, dict), "extra_body should be a dict" + assert "metadata" not in extra_body, "None model should not store metadata in extra_body" From 2d66b50aef8b6d0f9e3db0371c452acc6a61f593 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 11:33:29 -0700 Subject: [PATCH 08/23] Add method to retrieve the first user message in EvaluationRow class --- eval_protocol/models.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/eval_protocol/models.py b/eval_protocol/models.py index e9a6ca39..2804db59 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -658,6 +658,13 @@ def last_assistant_message(self) -> Optional[Message]: return None return assistant_messages[-1] + def get_first_user_message(self) -> Optional[Message]: + """Returns the first user message from the conversation. 
Returns None if none found.""" + user_messages = self.get_user_messages() + if not user_messages: + return None + return user_messages[0] + def get_user_messages(self) -> List[Message]: """Returns only the user messages from the conversation.""" return [msg for msg in self.messages if msg.role == "user"] From 6f0c59f91835c5854eb00fcc0054c198d6499d46 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 11:33:44 -0700 Subject: [PATCH 09/23] better test name --- tests/pytest/test_metadata_handling.py | 110 +++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 tests/pytest/test_metadata_handling.py diff --git a/tests/pytest/test_metadata_handling.py b/tests/pytest/test_metadata_handling.py new file mode 100644 index 00000000..9046d6c2 --- /dev/null +++ b/tests/pytest/test_metadata_handling.py @@ -0,0 +1,110 @@ +import pytest +from typing import Any, Dict +from unittest.mock import Mock +from pydantic_ai import Agent +from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel +from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata +from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig + + +def test_pydantic_ai_metadata_only_stored_for_responses_model(): + """Test that PydanticAI metadata is only stored in extra_body for ResponsesModel, not for ChatModel.""" + + # Create a test row with metadata + row = EvaluationRow( + input_metadata=InputMetadata(row_id="test-row-123"), + execution_metadata=ExecutionMetadata( + invocation_id="test-invocation-456", + rollout_id="test-rollout-789", + run_id="test-run-101", + experiment_id="test-experiment-202", + ), + messages=[], + ) + + # Test with OpenAIChatModel (should NOT store metadata) + chat_model = OpenAIChatModel("gpt-4") + chat_agent = Agent(model=chat_model) + processor = PydanticAgentRolloutProcessor(lambda config: chat_agent) + + 
settings = processor.construct_model_settings(chat_agent, row) + + # ChatModel should not have metadata in extra_body + extra_body = settings.get("extra_body", {}) + assert isinstance(extra_body, dict), "extra_body should be a dict" + assert "metadata" not in extra_body, "ChatModel should not store metadata in extra_body" + + # Test with OpenAIResponsesModel (should store metadata) + responses_model = OpenAIResponsesModel("gpt-5") + responses_agent = Agent(model=responses_model) + processor_responses = PydanticAgentRolloutProcessor(lambda config: responses_agent) + + settings_responses = processor_responses.construct_model_settings(responses_agent, row) + + # ResponsesModel should have metadata in extra_body + extra_body_responses = settings_responses.get("extra_body", {}) + assert isinstance(extra_body_responses, dict), "extra_body should be a dict" + assert "metadata" in extra_body_responses, "ResponsesModel should store metadata in extra_body" + + metadata = extra_body_responses["metadata"] + assert isinstance(metadata, dict), "metadata should be a dict" + assert metadata["row_id"] == "test-row-123" + assert metadata["invocation_id"] == "test-invocation-456" + assert metadata["rollout_id"] == "test-rollout-789" + assert metadata["run_id"] == "test-run-101" + assert metadata["experiment_id"] == "test-experiment-202" + + +def test_pydantic_ai_metadata_handling_with_string_model(): + """Test that PydanticAI string models don't cause issues with metadata handling.""" + + # Create a test row + row = EvaluationRow( + input_metadata=InputMetadata(row_id="test-row-123"), + execution_metadata=ExecutionMetadata( + invocation_id="test-invocation-456", + rollout_id="test-rollout-789", + run_id="test-run-101", + experiment_id="test-experiment-202", + ), + messages=[], + ) + + # Create agent with string model (should not store metadata) + agent = Agent(model="gpt-4") + processor = PydanticAgentRolloutProcessor(lambda config: agent) + + settings = 
processor.construct_model_settings(agent, row) + + # String model should not have metadata in extra_body + extra_body = settings.get("extra_body", {}) + assert isinstance(extra_body, dict), "extra_body should be a dict" + assert "metadata" not in extra_body, "String model should not store metadata in extra_body" + + +def test_pydantic_ai_metadata_handling_with_none_model(): + """Test that PydanticAI None model doesn't cause issues with metadata handling.""" + + # Create a test row + row = EvaluationRow( + input_metadata=InputMetadata(row_id="test-row-123"), + execution_metadata=ExecutionMetadata( + invocation_id="test-invocation-456", + rollout_id="test-rollout-789", + run_id="test-run-101", + experiment_id="test-experiment-202", + ), + messages=[], + ) + + # Create agent with None model (should not store metadata) + agent = Agent(model=None) + processor = PydanticAgentRolloutProcessor(lambda config: agent) + + settings = processor.construct_model_settings(agent, row) + + # None model should not have metadata in extra_body + extra_body = settings.get("extra_body", {}) + assert isinstance(extra_body, dict), "extra_body should be a dict" + assert "metadata" not in extra_body, "None model should not store metadata in extra_body" From ca5379a99c688d74847163c3f04d1ff779fd8796 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 12:01:41 -0700 Subject: [PATCH 10/23] better LLM judge --- .../pydantic/test_pydantic_complex_queries.py | 48 +++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries.py b/tests/chinook/pydantic/test_pydantic_complex_queries.py index 6a29a1f6..583c90df 100644 --- a/tests/chinook/pydantic/test_pydantic_complex_queries.py +++ b/tests/chinook/pydantic/test_pydantic_complex_queries.py @@ -12,11 +12,13 @@ from tests.pytest.test_pydantic_agent import PydanticAgentRolloutProcessor LLM_JUDGE_PROMPT = ( - "Your job is to compare the response to the expected 
answer.\n" - "The response will be a narrative report of the query results.\n" - "Return a score between 0.0 and 1.0, where 1.0 means the response contains all or well summarized information as the expected answer, " - "0.0 means the response does not contain the same information or is missing all key information, " - "and values in between represent partial credit for responses that are partially correct or contain some but not all of the expected information." + "You will be given the user task, the response of an AI assistant, and the expected answer.\n" + "Compare the response to the expected answer. The response is a narrative report of the query results.\n" + "Reward for core information being provided correctly and ignore formatting issues.\n" + "Return a score from 0.0 to 1.0:\n" + "- 1.0: response contains all or well summarized information as the expected answer\n" + "- 0.0: response does not contain the same information or is missing all key information\n" + "- Values in between: partial credit for responses that are partially correct or contain some but not all of the expected information." 
) @@ -37,20 +39,20 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: @evaluation_test( input_rows=[collect_dataset()], completion_params=[ - { - "model": "accounts/fireworks/models/kimi-k2-instruct", - "provider": "fireworks", - }, - { - "model": "accounts/fireworks/models/deepseek-v3p1", - "provider": "fireworks", - }, - { - "model": "accounts/fireworks/models/kimi-k2-instruct-0905", - "provider": "fireworks", - }, + # { + # "model": "accounts/fireworks/models/kimi-k2-instruct", + # "provider": "fireworks", + # }, + # { + # "model": "accounts/fireworks/models/deepseek-v3p1", + # "provider": "fireworks", + # }, + # { + # "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + # "provider": "fireworks", + # }, {"model": "gpt-5"}, - {"model": "gpt-5", "reasoning": "high"}, + # {"model": "gpt-5", "reasoning": "high"}, ], rollout_processor=PydanticAgentRolloutProcessor(agent_factory), num_runs=2, @@ -75,6 +77,14 @@ async def test_pydantic_complex_queries(row: EvaluationRow) -> EvaluationRow: "gpt-5", ) + first_user_message = row.get_first_user_message() + if first_user_message is None: + row.evaluation_result = EvaluateResult( + score=0.0, + reason="No user message found", + ) + return row + class Response(BaseModel): """ A score between 0.0 and 1.0 indicating whether the response is correct. 
@@ -94,7 +104,7 @@ class Response(BaseModel): output_retries=5, ) result = await comparison_agent.run( - f"Expected answer: {row.ground_truth}\nResponse: {last_assistant_message.content}" + f"User task: {first_user_message.content}\nResponse: {last_assistant_message.content}\nExpected answer: {row.ground_truth}" ) row.evaluation_result = EvaluateResult( score=result.output.score, From a4ecbe4eb31b0f16320557b5766e1ef6ea7a0097 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 13:25:43 -0700 Subject: [PATCH 11/23] fix llm_judge_langsmith.py --- .../quickstart/llm_judge_langsmith.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index f4efb7f5..16a287a9 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -22,6 +22,7 @@ import os from typing import Any, Dict, List, Optional +from openai import AsyncOpenAI import pytest from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult @@ -31,7 +32,7 @@ split_multi_turn_rows, JUDGE_CONFIGS, calculate_bootstrap_scores, - run_judgment, + run_judgment_async, ) from eval_protocol.adapters.langsmith import LangSmithAdapter @@ -91,10 +92,15 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation judgments: List[Dict[str, Any]] = [] - for row in rows: - result = run_judgment(row, model_name, judge_name) - if result and result["games"][0] and result["games"][1]: - judgments.append(result) + judge_config = JUDGE_CONFIGS[judge_name] + + async with AsyncOpenAI( + api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url") + ) as shared_client: + for row in rows: + result = await run_judgment_async(row, model_name, judge_name, shared_client) + if result and result["games"][0] and result["games"][1]: + judgments.append(result) if not judgments: print("❌ No 
valid judgments generated") @@ -102,7 +108,12 @@ async def test_llm_judge_langsmith(rows: List[EvaluationRow]) -> List[Evaluation print(f"✅ Generated {len(judgments)} valid judgments") - mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments) + result = calculate_bootstrap_scores(judgments) + if not result: + print("❌ No valid scores extracted") + return rows + + mean_score, lower_score, upper_score = result if mean_score == 0.0: print("❌ No valid scores extracted") return rows From a3bd5839158e42474573c64d01ec4e29c58ee03c Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 13:25:56 -0700 Subject: [PATCH 12/23] Update run_judgment_async to use AsyncOpenAI client for improved async handling --- eval_protocol/quickstart/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py index d862a472..ebe695ec 100644 --- a/eval_protocol/quickstart/utils.py +++ b/eval_protocol/quickstart/utils.py @@ -6,6 +6,7 @@ from datetime import datetime import re from typing import List, Dict, Any, Optional +from openai import AsyncOpenAI import pandas as pd from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult @@ -204,7 +205,7 @@ async def pairwise_judgment_async(question_text, answer_a, answer_b, tools, judg async def run_judgment_async( - row: EvaluationRow, model_name: str, judge_name: str, shared_client + row: EvaluationRow, model_name: str, judge_name: str, shared_client: AsyncOpenAI ) -> Optional[Dict[str, Any]]: """Async judgment using shared client to avoid cleanup issues.""" if not row.messages: From e97b76a1f2a27a407d7b207b0ebf1d6223107d70 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 14:45:08 -0700 Subject: [PATCH 13/23] snapshot test the simple example --- eval_protocol/adapters/openai_responses.py | 138 +++++++++++ pyproject.toml | 1 + .../test_openai_responses_adapter.ambr | 222 ++++++++++++++++++ 
.../adapters/test_openai_responses_adapter.py | 48 ++++ uv.lock | 16 +- 5 files changed, 424 insertions(+), 1 deletion(-) create mode 100644 eval_protocol/adapters/openai_responses.py create mode 100644 tests/adapters/__snapshots__/test_openai_responses_adapter.ambr create mode 100644 tests/adapters/test_openai_responses_adapter.py diff --git a/eval_protocol/adapters/openai_responses.py b/eval_protocol/adapters/openai_responses.py new file mode 100644 index 00000000..f9fc105c --- /dev/null +++ b/eval_protocol/adapters/openai_responses.py @@ -0,0 +1,138 @@ +"""Langfuse adapter for Eval Protocol. + +This adapter allows pulling data from Langfuse deployments and converting it +to EvaluationRow format for use in evaluation pipelines. +""" + +from collections.abc import Iterable +import logging +from typing import List + +from openai.pagination import SyncCursorPage +from openai.types.chat.chat_completion_message import FunctionCall +from openai.types.responses import Response +from openai.types.responses.response_item import ResponseItem +from openai.types.chat.chat_completion_message_function_tool_call import ( + ChatCompletionMessageFunctionToolCall, + Function, +) + +from eval_protocol.models import EvaluationRow, InputMetadata, Message + +logger = logging.getLogger(__name__) + + +from openai import OpenAI + + +class OpenAIResponsesAdapter: + """Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format. + + This adapter can pull both chat conversations and tool calling traces from + Langfuse deployments and convert them into the EvaluationRow format expected + by the evaluation protocol. + + Examples: + Basic usage: + >>> adapter = OpenAIResponsesAdapter( + ... api_key="your_api_key", + ... 
) + >>> rows = list(adapter.get_evaluation_rows(response_ids=["response_id_1", "response_id_2"])) + """ + + def __init__(self, api_key: str | None = None, base_url: str | None = None): + """Initialize the OpenAI Responses adapter.""" + self.openai = OpenAI(api_key=api_key, base_url=base_url) + + def get_evaluation_rows( + self, + response_ids: List[str], + ) -> List[EvaluationRow]: + """Pull responses from OpenAI Responses API and convert to EvaluationRow format. + + Args: + response_ids: List of response IDs to fetch + Returns: + List[EvaluationRow]: Converted evaluation rows + """ + eval_rows: list[EvaluationRow] = [] + + for response_id in response_ids: + input_items = self.openai.responses.input_items.list(response_id=response_id) + response = self.openai.responses.retrieve(response_id=response_id) + eval_rows.append(self._create_evaluation_row(input_items, response)) + + logger.info( + "Successfully processed %d selected traces into %d evaluation rows", len(response_ids), len(eval_rows) + ) + return eval_rows + + def _create_evaluation_row(self, input_items: SyncCursorPage[ResponseItem], response: Response) -> EvaluationRow: + """Convert a response to an evaluation row.""" + messages: list[Message] = [] + if response.instructions: + if isinstance(response.instructions, list): + raise NotImplementedError("List of instructions is not supported") + else: + messages.append(Message(role="system", content=response.instructions)) + messages.extend(self._create_messages(input_items)) + if response.output_text: + messages.append(Message(role="assistant", content=response.output_text)) + return EvaluationRow( + messages=messages, + tools=[tool.model_dump() for tool in response.tools], + input_metadata=InputMetadata( + completion_params={ + "model": response.model, + "temperature": response.temperature, + "max_output_tokens": response.max_output_tokens, + "max_tool_calls": response.max_tool_calls, + "parallel_tool_calls": response.parallel_tool_calls, + """ + We have to 
manually extract the reasoning effort and summary + from the response.reasoning object because the openai-python + causes an issue with model_dump() which is used for testing. + + https://github.com/openai/openai-python/issues/1306#issuecomment-2966267356 + """ + "reasoning": { + "effort": response.reasoning.effort, + "summary": response.reasoning.summary, + } + if response.reasoning + else None, + "top_logprobs": response.top_logprobs, + "truncation": response.truncation, + "top_p": response.top_p, + } + ), + ) + + def _create_messages(self, input_items: SyncCursorPage[ResponseItem]) -> Iterable[Message]: + """Create messages from input items. + + Converts OpenAI Responses API input items to chat completion message format. + Handles different types of response items including messages and tool calls. + """ + messages: list[Message] = [] + for item in input_items: + if item.type == "message": + # This is a message item (input or output) + content = item.content + for content_item in content: + if content_item.type == "input_text": + text_content = content_item.text + # Create new message + messages.append(Message(role=item.role, content=text_content)) + else: + raise NotImplementedError(f"Unsupported content type: {content_item.type}") + elif item.type == "function_call_output": + messages.append(Message(role="tool", content=item.output, tool_call_id=item.call_id)) + elif item.type == "function_call": + tool_call = ChatCompletionMessageFunctionToolCall( + id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments) + ) + messages.append(Message(role="assistant", tool_calls=[tool_call])) + else: + raise NotImplementedError(f"Unsupported item type: {item.type}") + return reversed(messages) diff --git a/pyproject.toml b/pyproject.toml index b55ace62..fc1dde92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ dev = [ "jupyter>=1.1.1", "pip>=25.1.1", "haikus==0.3.8", + "syrupy>=4.0.0", ] trl = [ "torch>=1.9", diff --git 
a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr new file mode 100644 index 00000000..c1b6c21b --- /dev/null +++ b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr @@ -0,0 +1,222 @@ +# serializer version: 1 +# name: test_openai_responses_adapter_with_real_response_simple + list([ + dict({ + 'eval_metadata': None, + 'evaluation_result': None, + 'ground_truth': None, + 'input_metadata': dict({ + 'completion_params': dict({ + ''' + + We have to manually extract the reasoning effort and summary + from the response.reasoning object because the openai-python + causes an issue with model_dump() which is used for testing. + + https://github.com/openai/openai-python/issues/1306#issuecomment-2966267356 + reasoning + ''': dict({ + 'effort': 'medium', + 'summary': None, + }), + 'max_output_tokens': None, + 'max_tool_calls': None, + 'model': 'gpt-5-2025-08-07', + 'parallel_tool_calls': True, + 'temperature': 1.0, + 'top_logprobs': 0, + 'top_p': 1.0, + 'truncation': 'disabled', + }), + 'dataset_info': None, + 'row_id': None, + 'session_data': None, + }), + 'messages': list([ + dict({ + 'content': ''' + You are a helpful assistant that has access to the + Chinook database stored in a Postgres database. You have access to a tool to + execute SQL queries that you should use to answer questions. Your job is to + answer questions about the database. If you run into an error, you should try to + fix the query and try again. 
Here is the schema of the database: + + Schema: + table_name,column_name,data_type,is_nullable + album,album_id,integer,NO + album,title,character varying,NO + album,artist_id,integer,NO + artist,artist_id,integer,NO + artist,name,character varying,YES + customer,customer_id,integer,NO + customer,first_name,character varying,NO + customer,last_name,character varying,NO + customer,company,character varying,YES + customer,address,character varying,YES + customer,city,character varying,YES + customer,state,character varying,YES + customer,country,character varying,YES + customer,postal_code,character varying,YES + customer,phone,character varying,YES + customer,fax,character varying,YES + customer,email,character varying,NO + customer,support_rep_id,integer,YES + employee,employee_id,integer,NO + employee,last_name,character varying,NO + employee,first_name,character varying,NO + employee,title,character varying,YES + employee,reports_to,integer,YES + employee,birth_date,timestamp without time zone,YES + employee,hire_date,timestamp without time zone,YES + employee,address,character varying,YES + employee,city,character varying,YES + employee,state,character varying,YES + employee,country,character varying,YES + employee,postal_code,character varying,YES + employee,phone,character varying,YES + employee,fax,character varying,YES + employee,email,character varying,YES + genre,genre_id,integer,NO + genre,name,character varying,YES + invoice,invoice_id,integer,NO + invoice,customer_id,integer,NO + invoice,invoice_date,timestamp without time zone,NO + invoice,billing_address,character varying,YES + invoice,billing_city,character varying,YES + invoice,billing_state,character varying,YES + invoice,billing_country,character varying,YES + invoice,billing_postal_code,character varying,YES + invoice,total,numeric,NO + invoice_line,invoice_line_id,integer,NO + invoice_line,invoice_id,integer,NO + invoice_line,track_id,integer,NO + invoice_line,unit_price,numeric,NO + 
invoice_line,quantity,integer,NO + media_type,media_type_id,integer,NO + media_type,name,character varying,YES + playlist,playlist_id,integer,NO + playlist,name,character varying,YES + playlist_track,playlist_id,integer,NO + playlist_track,track_id,integer,NO + track,track_id,integer,NO + track,name,character varying,NO + track,album_id,integer,YES + track,media_type_id,integer,NO + track,genre_id,integer,YES + track,composer,character varying,YES + track,milliseconds,integer,NO + track,bytes,integer,YES + track,unit_price,numeric,NO + ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'system', + 'tool_call_id': None, + 'tool_calls': None, + }), + dict({ + 'content': 'Find all employees and their reporting hierarchy levels using a recursive CTE. Show employee name, level, and the complete hierarchy path from top to bottom.', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'user', + 'tool_call_id': None, + 'tool_calls': None, + }), + dict({ + 'content': '', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'assistant', + 'tool_call_id': None, + 'tool_calls': list([ + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH RECURSIVE emp_tree AS (\\n SELECT \\n e.employee_id,\\n (e.first_name || \' \' || e.last_name) AS name,\\n e.reports_to,\\n 1 AS level,\\n (e.first_name || \' \' || e.last_name) AS path\\n FROM employee e\\n WHERE e.reports_to IS NULL\\n UNION ALL\\n SELECT \\n c.employee_id,\\n (c.first_name || \' \' || c.last_name) AS name,\\n c.reports_to,\\n et.level + 1 AS level,\\n (et.path || \' > \' || c.first_name || \' \' || c.last_name) AS path\\n FROM employee c\\n JOIN emp_tree et ON c.reports_to = et.employee_id\\n)\\nSELECT name, level, path\\nFROM emp_tree\\nORDER BY path;"}', + 'name': 'execute_sql', + }), + 'id': 'call_jWVUHFHBmrK1FpHIP7FqZmaO', + 'type': 
'function', + }), + ]), + }), + dict({ + 'content': ''' + | name | level | path | + | --- | --- | --- | + | Andrew Adams | 1 | Andrew Adams | + | Michael Mitchell | 2 | Andrew Adams > Michael Mitchell | + | Laura Callahan | 3 | Andrew Adams > Michael Mitchell > Laura Callahan | + | Robert King | 3 | Andrew Adams > Michael Mitchell > Robert King | + | Nancy Edwards | 2 | Andrew Adams > Nancy Edwards | + | Jane Peacock | 3 | Andrew Adams > Nancy Edwards > Jane Peacock | + | Margaret Park | 3 | Andrew Adams > Nancy Edwards > Margaret Park | + | Steve Johnson | 3 | Andrew Adams > Nancy Edwards > Steve Johnson | + ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_jWVUHFHBmrK1FpHIP7FqZmaO', + 'tool_calls': None, + }), + dict({ + 'content': ''' + Here are all employees with their hierarchy level and full path from the top: + + - Andrew Adams — Level 1 — Path: Andrew Adams + - Michael Mitchell — Level 2 — Path: Andrew Adams > Michael Mitchell + - Laura Callahan — Level 3 — Path: Andrew Adams > Michael Mitchell > Laura Callahan + - Robert King — Level 3 — Path: Andrew Adams > Michael Mitchell > Robert King + - Nancy Edwards — Level 2 — Path: Andrew Adams > Nancy Edwards + - Jane Peacock — Level 3 — Path: Andrew Adams > Nancy Edwards > Jane Peacock + - Margaret Park — Level 3 — Path: Andrew Adams > Nancy Edwards > Margaret Park + - Steve Johnson — Level 3 — Path: Andrew Adams > Nancy Edwards > Steve Johnson + + If you want this ordered differently (e.g., by level, name, or reporting manager), let me know and I can adjust the query. 
+ ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'assistant', + 'tool_call_id': None, + 'tool_calls': None, + }), + ]), + 'pid': None, + 'rollout_status': dict({ + 'code': , + 'details': list([ + ]), + 'message': 'Rollout is running', + }), + 'tools': list([ + dict({ + 'description': None, + 'name': 'execute_sql', + 'parameters': dict({ + 'additionalProperties': False, + 'properties': dict({ + 'query': dict({ + 'type': 'string', + }), + }), + 'required': list([ + 'query', + ]), + 'type': 'object', + }), + 'strict': True, + 'type': 'function', + }), + ]), + }), + ]) +# --- diff --git a/tests/adapters/test_openai_responses_adapter.py b/tests/adapters/test_openai_responses_adapter.py new file mode 100644 index 00000000..0d146229 --- /dev/null +++ b/tests/adapters/test_openai_responses_adapter.py @@ -0,0 +1,48 @@ +"""Tests for OpenAIResponsesAdapter.""" + +from syrupy.assertion import SnapshotAssertion + +from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter + + +def test_openai_responses_adapter_with_real_response_simple(snapshot: SnapshotAssertion): + """Test OpenAIResponsesAdapter with a real response ID that is a simple 4 + message conversation with: system, user, tool, tool response, assistant. 
+ + https://platform.openai.com/logs/resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c + """ + adapter = OpenAIResponsesAdapter() + + response_id = "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c" + + eval_rows = adapter.get_evaluation_rows(response_ids=[response_id]) + + # Basic assertions about the returned data structure + assert isinstance(eval_rows, list) + assert len(eval_rows) == 1 + + # Convert to dict for snapshot testing + eval_rows_dict = [row.model_dump(exclude={"created_at", "execution_metadata"}) for row in eval_rows] + + # Assert against snapshot + assert eval_rows_dict == snapshot + + +def test_openai_responses_adapter_with_real_response_parallel_tool_calls(snapshot: SnapshotAssertion): + """ + https://platform.openai.com/logs/resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f + """ + adapter = OpenAIResponsesAdapter() + response_id = "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f" + + eval_rows = adapter.get_evaluation_rows(response_ids=[response_id]) + + # Basic assertions about the returned data structure + assert isinstance(eval_rows, list) + assert len(eval_rows) == 1 + + # Convert to dict for snapshot testing + eval_rows_dict = [row.model_dump() for row in eval_rows] + + # Assert against snapshot + assert eval_rows_dict == snapshot diff --git a/uv.lock b/uv.lock index 6d333a22..1ac05adc 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -1244,6 +1244,7 @@ dev = [ { name = "pytest-httpserver" }, { name = "pytest-xdist" }, { name = "ruff" }, + { name = "syrupy" }, { name = "transformers" }, { name = "twine" }, { name = "types-docker" }, @@ -1377,6 +1378,7 @@ requires-dist = [ { name = "selenium", marker = "extra == 'svgbench'", specifier = ">=4.0.0" }, { name = "supabase", marker = "extra == 'supabase'", specifier = ">=2.18.1" }, { name = "swig", marker = "extra == 'box2d'" }, + { name = 
"syrupy", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "toml", specifier = ">=0.10.0" }, { name = "torch", marker = "extra == 'trl'", specifier = ">=1.9" }, { name = "transformers", marker = "extra == 'adapters'", specifier = ">=4.0.0" }, @@ -6380,6 +6382,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "syrupy" +version = "4.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8c/f8/022d8704a3314f3e96dbd6bbd16ebe119ce30e35f41aabfa92345652fceb/syrupy-4.9.1.tar.gz", hash = "sha256:b7d0fcadad80a7d2f6c4c71917918e8ebe2483e8c703dfc8d49cdbb01081f9a4", size = 52492, upload-time = "2025-03-24T01:36:37.225Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/9d/aef9ec5fd5a4ee2f6a96032c4eda5888c5c7cec65cef6b28c4fc37671d88/syrupy-4.9.1-py3-none-any.whl", hash = "sha256:b94cc12ed0e5e75b448255430af642516842a2374a46936dd2650cfb6dd20eda", size = 52214, upload-time = "2025-03-24T01:36:35.278Z" }, +] + [[package]] name = "temporalio" version = "1.17.0" From 7024ac1d2a4a510ebd3084eac04086c0884b45e0 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 14:46:42 -0700 Subject: [PATCH 14/23] update snapshot --- .pre-commit-config.yaml | 4 ++-- .../__snapshots__/test_openai_responses_adapter.ambr | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2dce407d..3302cfb9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,9 +7,9 @@ repos: rev: v6.0.0 hooks: - id: trailing-whitespace - exclude: "(^vite-app/|\\.snap$)" + exclude: "(^vite-app/|\\.snap$|\\.ambr$)" - id: 
end-of-file-fixer - exclude: "(^vite-app/|\\.snap$)" + exclude: "(^vite-app/|\\.snap$|\\.ambr$)" - id: check-yaml - id: check-added-large-files - id: check-merge-conflict diff --git a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr index c1b6c21b..03cfc92d 100644 --- a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr +++ b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr @@ -8,11 +8,11 @@ 'input_metadata': dict({ 'completion_params': dict({ ''' - + We have to manually extract the reasoning effort and summary from the response.reasoning object because the openai-python causes an issue with model_dump() which is used for testing. - + https://github.com/openai/openai-python/issues/1306#issuecomment-2966267356 reasoning ''': dict({ @@ -40,7 +40,7 @@ execute SQL queries that you should use to answer questions. Your job is to answer questions about the database. If you run into an error, you should try to fix the query and try again. Here is the schema of the database: - + Schema: table_name,column_name,data_type,is_nullable album,album_id,integer,NO @@ -169,7 +169,7 @@ dict({ 'content': ''' Here are all employees with their hierarchy level and full path from the top: - + - Andrew Adams — Level 1 — Path: Andrew Adams - Michael Mitchell — Level 2 — Path: Andrew Adams > Michael Mitchell - Laura Callahan — Level 3 — Path: Andrew Adams > Michael Mitchell > Laura Callahan @@ -178,7 +178,7 @@ - Jane Peacock — Level 3 — Path: Andrew Adams > Nancy Edwards > Jane Peacock - Margaret Park — Level 3 — Path: Andrew Adams > Nancy Edwards > Margaret Park - Steve Johnson — Level 3 — Path: Andrew Adams > Nancy Edwards > Steve Johnson - + If you want this ordered differently (e.g., by level, name, or reporting manager), let me know and I can adjust the query. 
''', 'control_plane_step': None, From e61223861f2b87269a596d18d9974c73057e910e Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 14:56:46 -0700 Subject: [PATCH 15/23] fix handling of parallel tool calls --- eval_protocol/adapters/openai_responses.py | 28 +- .../test_openai_responses_adapter.ambr | 444 ++++++++++++++++++ .../adapters/test_openai_responses_adapter.py | 2 +- 3 files changed, 471 insertions(+), 3 deletions(-) diff --git a/eval_protocol/adapters/openai_responses.py b/eval_protocol/adapters/openai_responses.py index f9fc105c..d71be690 100644 --- a/eval_protocol/adapters/openai_responses.py +++ b/eval_protocol/adapters/openai_responses.py @@ -113,10 +113,25 @@ def _create_messages(self, input_items: SyncCursorPage[ResponseItem]) -> Iterabl Converts OpenAI Responses API input items to chat completion message format. Handles different types of response items including messages and tool calls. + Groups parallel tool calls under a single assistant message. + Since we iterate backwards and reverse at the end, tool call outputs should + be added before the assistant message with tool calls. 
""" messages: list[Message] = [] + current_tool_calls: list[ChatCompletionMessageFunctionToolCall] = [] + tool_call_outputs: list[Message] = [] + for item in input_items: if item.type == "message": + # If we have accumulated tool calls, create an assistant message with them + if current_tool_calls: + # Add tool call outputs first (since we reverse at the end) + messages.extend(tool_call_outputs) + tool_call_outputs = [] + # Then add the assistant message with tool calls + messages.append(Message(role="assistant", tool_calls=current_tool_calls)) + current_tool_calls = [] + # This is a message item (input or output) content = item.content for content_item in content: @@ -127,12 +142,21 @@ def _create_messages(self, input_items: SyncCursorPage[ResponseItem]) -> Iterabl else: raise NotImplementedError(f"Unsupported content type: {content_item.type}") elif item.type == "function_call_output": - messages.append(Message(role="tool", content=item.output, tool_call_id=item.call_id)) + # Collect tool call outputs to add before assistant message + tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id)) elif item.type == "function_call": tool_call = ChatCompletionMessageFunctionToolCall( id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments) ) - messages.append(Message(role="assistant", tool_calls=[tool_call])) + current_tool_calls.append(tool_call) else: raise NotImplementedError(f"Unsupported item type: {item.type}") + + # If we have remaining tool calls, create an assistant message with them + if current_tool_calls: + # Add tool call outputs first (since we reverse at the end) + messages.extend(tool_call_outputs) + # Then add the assistant message with tool calls + messages.append(Message(role="assistant", tool_calls=current_tool_calls)) + return reversed(messages) diff --git a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr 
b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr index 03cfc92d..b2bc145b 100644 --- a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr +++ b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr @@ -1,4 +1,448 @@ # serializer version: 1 +# name: test_openai_responses_adapter_with_real_response_parallel_tool_calls + list([ + dict({ + 'eval_metadata': None, + 'evaluation_result': None, + 'ground_truth': None, + 'input_metadata': dict({ + 'completion_params': dict({ + ''' + + We have to manually extract the reasoning effort and summary + from the response.reasoning object because the openai-python + causes an issue with model_dump() which is used for testing. + + https://github.com/openai/openai-python/issues/1306#issuecomment-2966267356 + reasoning + ''': dict({ + 'effort': 'medium', + 'summary': None, + }), + 'max_output_tokens': None, + 'max_tool_calls': None, + 'model': 'gpt-5-2025-08-07', + 'parallel_tool_calls': True, + 'temperature': 1.0, + 'top_logprobs': 0, + 'top_p': 1.0, + 'truncation': 'disabled', + }), + 'dataset_info': None, + 'row_id': None, + 'session_data': None, + }), + 'messages': list([ + dict({ + 'content': ''' + You are a helpful assistant that has access to the + Chinook database stored in a Postgres database. You have access to a tool to + execute SQL queries that you should use to answer questions. Your job is to + answer questions about the database. If you run into an error, you should try to + fix the query and try again. 
Here is the schema of the database: + + Schema: + table_name,column_name,data_type,is_nullable + album,album_id,integer,NO + album,title,character varying,NO + album,artist_id,integer,NO + artist,artist_id,integer,NO + artist,name,character varying,YES + customer,customer_id,integer,NO + customer,first_name,character varying,NO + customer,last_name,character varying,NO + customer,company,character varying,YES + customer,address,character varying,YES + customer,city,character varying,YES + customer,state,character varying,YES + customer,country,character varying,YES + customer,postal_code,character varying,YES + customer,phone,character varying,YES + customer,fax,character varying,YES + customer,email,character varying,NO + customer,support_rep_id,integer,YES + employee,employee_id,integer,NO + employee,last_name,character varying,NO + employee,first_name,character varying,NO + employee,title,character varying,YES + employee,reports_to,integer,YES + employee,birth_date,timestamp without time zone,YES + employee,hire_date,timestamp without time zone,YES + employee,address,character varying,YES + employee,city,character varying,YES + employee,state,character varying,YES + employee,country,character varying,YES + employee,postal_code,character varying,YES + employee,phone,character varying,YES + employee,fax,character varying,YES + employee,email,character varying,YES + genre,genre_id,integer,NO + genre,name,character varying,YES + invoice,invoice_id,integer,NO + invoice,customer_id,integer,NO + invoice,invoice_date,timestamp without time zone,NO + invoice,billing_address,character varying,YES + invoice,billing_city,character varying,YES + invoice,billing_state,character varying,YES + invoice,billing_country,character varying,YES + invoice,billing_postal_code,character varying,YES + invoice,total,numeric,NO + invoice_line,invoice_line_id,integer,NO + invoice_line,invoice_id,integer,NO + invoice_line,track_id,integer,NO + invoice_line,unit_price,numeric,NO + 
invoice_line,quantity,integer,NO + media_type,media_type_id,integer,NO + media_type,name,character varying,YES + playlist,playlist_id,integer,NO + playlist,name,character varying,YES + playlist_track,playlist_id,integer,NO + playlist_track,track_id,integer,NO + track,track_id,integer,NO + track,name,character varying,NO + track,album_id,integer,YES + track,media_type_id,integer,NO + track,genre_id,integer,YES + track,composer,character varying,YES + track,milliseconds,integer,NO + track,bytes,integer,YES + track,unit_price,numeric,NO + ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'system', + 'tool_call_id': None, + 'tool_calls': None, + }), + dict({ + 'content': ''' + Calculate customer lifetime value segments and predict churn risk. Group + customers into segments based on spending patterns and calculate days since last + purchase to identify churn risk. + ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'user', + 'tool_call_id': None, + 'tool_calls': None, + }), + dict({ + 'content': '', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'assistant', + 'tool_call_id': None, + 'tool_calls': list([ + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase_raw,\\n PERCENT_RANK() 
OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n CASE \\n WHEN days_since_last_purchase_raw IS NULL THEN NULL\\n ELSE GREATEST(0, days_since_last_purchase_raw)\\n END AS days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No purchases\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) > 365 THEN \'High Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n days_since_last_purchase,\\n lv_segment,\\n churn_risk\\nFROM segmented\\nWHERE churn_risk = \'High Risk\'\\nORDER BY days_since_last_purchase DESC, lifetime_value DESC\\nLIMIT 10;"}', + 'name': 'execute_sql', + }), + 'id': 'call_QklYCtJeAzMn8iMshl0PaYXg', + 'type': 'function', + }), + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = 
c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase_raw,\\n PERCENT_RANK() OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n CASE \\n WHEN days_since_last_purchase_raw IS NULL THEN NULL\\n ELSE GREATEST(0, days_since_last_purchase_raw)\\n END AS days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No purchases\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) > 365 THEN \'High Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT \\n lv_segment,\\n COUNT(*) AS customers,\\n ROUND(AVG(lifetime_value)::numeric, 2) AS avg_ltv,\\n ROUND(AVG(COALESCE(days_since_last_purchase, 0))::numeric, 1) AS avg_days_since_last_purchase,\\n SUM(CASE WHEN churn_risk = \'High Risk\' THEN 1 ELSE 0 END) AS high_risk_customers\\nFROM segmented\\nGROUP BY lv_segment\\nORDER BY \\n CASE lv_segment WHEN \'High\' THEN 1 WHEN \'Medium\' THEN 2 WHEN \'Low\' THEN 3 WHEN \'No Spend\' THEN 4 ELSE 5 END;"}', + 'name': 'execute_sql', + }), + 'id': 'call_8yfYxjCJFppcVcjnmKbTBxoj', + 'type': 'function', + }), + dict({ + 
'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase_raw,\\n PERCENT_RANK() OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n CASE \\n WHEN days_since_last_purchase_raw IS NULL THEN NULL\\n ELSE GREATEST(0, days_since_last_purchase_raw)\\n END AS days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No purchases\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) > 365 THEN \'High Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT *\\nFROM segmented\\nORDER BY lifetime_value DESC, customer_id;"}', + 'name': 'execute_sql', + }), + 'id': 'call_pyQOTPrrGRT0IuljYLkbeb0a', + 'type': 
'function', + }), + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase_raw,\\n PERCENT_RANK() OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n CASE \\n WHEN days_since_last_purchase_raw IS NULL THEN NULL\\n ELSE GREATEST(0, days_since_last_purchase_raw)\\n END AS days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No purchases\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) > 365 THEN \'High Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT \\n churn_risk,\\n COUNT(*) AS customers,\\n ROUND(AVG(lifetime_value)::numeric, 2) AS avg_ltv,\\n 
ROUND(AVG(COALESCE(days_since_last_purchase, 0))::numeric, 1) AS avg_days_since_last_purchase\\nFROM segmented\\nGROUP BY 1\\nORDER BY \\n CASE churn_risk WHEN \'High Risk\' THEN 1 WHEN \'Medium Risk\' THEN 2 WHEN \'Low Risk\' THEN 3 WHEN \'Active\' THEN 4 WHEN \'Churn Risk: No purchases\' THEN 5 ELSE 6 END;"}', + 'name': 'execute_sql', + }), + 'id': 'call_ufaloEQ87VReYLpLxeaf2EmV', + 'type': 'function', + }), + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase_raw,\\n PERCENT_RANK() OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n CASE \\n WHEN days_since_last_purchase_raw IS NULL THEN NULL\\n ELSE GREATEST(0, days_since_last_purchase_raw)\\n END AS days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No purchases\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) > 365 THEN \'High Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) 
END) BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n days_since_last_purchase,\\n lv_segment,\\n churn_risk\\nFROM segmented\\nWHERE churn_risk = \'High Risk\'\\nORDER BY lifetime_value DESC, days_since_last_purchase DESC\\nLIMIT 10;"}', + 'name': 'execute_sql', + }), + 'id': 'call_C3MJguOJ62YJtQ4wN6duDsEV', + 'type': 'function', + }), + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase_raw,\\n PERCENT_RANK() OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n last_purchase_date,\\n CASE \\n WHEN days_since_last_purchase_raw IS NULL THEN NULL\\n ELSE GREATEST(0, days_since_last_purchase_raw)\\n END AS days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No 
purchases\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) > 365 THEN \'High Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT \\n lv_segment,\\n COUNT(*) AS customers,\\n ROUND(AVG(lifetime_value)::numeric, 2) AS avg_ltv,\\n ROUND(AVG(COALESCE(days_since_last_purchase, 0))::numeric, 1) AS avg_days_since_last_purchase,\\n SUM(CASE WHEN churn_risk = \'High Risk\' THEN 1 ELSE 0 END) AS high_risk_customers\\nFROM segmented\\nGROUP BY lv_segment\\nORDER BY \\n CASE lv_segment WHEN \'High\' THEN 1 WHEN \'Medium\' THEN 2 WHEN \'Low\' THEN 3 WHEN \'No Spend\' THEN 4 ELSE 5 END;"}', + 'name': 'execute_sql', + }), + 'id': 'call_feR1zKfz6o7B50j6iCcRyLZI', + 'type': 'function', + }), + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase_raw,\\n PERCENT_RANK() OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n avg_order_value,\\n 
last_purchase_date,\\n CASE \\n WHEN days_since_last_purchase_raw IS NULL THEN NULL\\n ELSE GREATEST(0, days_since_last_purchase_raw)\\n END AS days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No purchases\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) > 365 THEN \'High Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN (CASE WHEN days_since_last_purchase_raw IS NULL THEN NULL ELSE GREATEST(0, days_since_last_purchase_raw) END) BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT *\\nFROM segmented\\nORDER BY lifetime_value DESC, customer_id;"}', + 'name': 'execute_sql', + }), + 'id': 'call_PxEO2fausJ3iH9oOiqPKpaku', + 'type': 'function', + }), + dict({ + 'function': dict({ + 'arguments': '{"query":"WITH customer_metrics AS (\\n SELECT \\n c.customer_id,\\n (c.first_name || \' \' || c.last_name) AS customer_name,\\n c.country,\\n COALESCE(SUM(i.total), 0) AS lifetime_value,\\n COUNT(i.invoice_id) AS orders,\\n COALESCE(AVG(i.total), 0) AS avg_order_value,\\n MAX(i.invoice_date) AS last_purchase_date\\n FROM customer c\\n LEFT JOIN invoice i ON i.customer_id = c.customer_id\\n GROUP BY c.customer_id, customer_name, c.country\\n),\\nscored AS (\\n SELECT \\n *,\\n CASE \\n WHEN last_purchase_date IS NULL THEN NULL\\n ELSE (current_date - last_purchase_date::date)\\n END AS days_since_last_purchase,\\n PERCENT_RANK() OVER (ORDER BY lifetime_value) AS lv_percent_rank\\n FROM customer_metrics\\n),\\nsegmented AS (\\n SELECT \\n customer_id,\\n customer_name,\\n country,\\n lifetime_value,\\n orders,\\n 
avg_order_value,\\n last_purchase_date,\\n days_since_last_purchase,\\n CASE \\n WHEN lifetime_value = 0 THEN \'No Spend\'\\n WHEN lv_percent_rank >= 0.8 THEN \'High\'\\n WHEN lv_percent_rank >= 0.5 THEN \'Medium\'\\n ELSE \'Low\'\\n END AS lv_segment,\\n CASE \\n WHEN last_purchase_date IS NULL THEN \'Churn Risk: No purchases\'\\n WHEN days_since_last_purchase > 365 THEN \'High Risk\'\\n WHEN days_since_last_purchase BETWEEN 181 AND 365 THEN \'Medium Risk\'\\n WHEN days_since_last_purchase BETWEEN 91 AND 180 THEN \'Low Risk\'\\n ELSE \'Active\'\\n END AS churn_risk\\n FROM scored\\n)\\nSELECT *\\nFROM segmented\\nORDER BY lifetime_value DESC, customer_id;"}', + 'name': 'execute_sql', + }), + 'id': 'call_XMf9acssGFrA4PQy1QsOyGJS', + 'type': 'function', + }), + ]), + }), + dict({ + 'content': ''' + | customer_id | customer_name | country | lifetime_value | orders | avg_order_value | last_purchase_date | days_since_last_purchase | lv_segment | churn_risk | + | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | + | 6 | Helena Holý | Czech Republic | 49.62 | 7 | 7.0885714285714286 | 2025-11-13 00:00:00 | -58 | High | Active | + | 26 | Richard Cunningham | USA | 47.62 | 7 | 6.8028571428571429 | 2025-04-05 00:00:00 | 164 | High | Low Risk | + | 57 | Luis Rojas | Chile | 46.62 | 7 | 6.6600000000000000 | 2024-10-14 00:00:00 | 337 | High | Medium Risk | + | 45 | Ladislav Kovács | Hungary | 45.62 | 7 | 6.5171428571428571 | 2025-07-20 00:00:00 | 58 | High | Active | + | 46 | Hugh O'Reilly | Ireland | 45.62 | 7 | 6.5171428571428571 | 2025-11-04 00:00:00 | -49 | High | Active | + | 24 | Frank Ralston | USA | 43.62 | 7 | 6.2314285714285714 | 2025-08-20 00:00:00 | 27 | High | Active | + | 28 | Julia Barnett | USA | 43.62 | 7 | 6.2314285714285714 | 2025-05-19 00:00:00 | 120 | High | Low Risk | + | 37 | Fynn Zimmermann | Germany | 43.62 | 7 | 6.2314285714285714 | 2025-06-03 00:00:00 | 105 | High | Low Risk | + | 7 | Astrid Gruber | Austria | 42.62 | 7 | 6.0885714285714286 
| 2025-06-19 00:00:00 | 89 | High | Active | + | 25 | Victor Stevens | USA | 42.62 | 7 | 6.0885714285714286 | 2025-12-05 00:00:00 | -80 | High | Active | + | 44 | Terhi Hämäläinen | Finland | 41.62 | 7 | 5.9457142857142857 | 2025-12-14 00:00:00 | -89 | High | Active | + | 5 | František Wichterlová | Czech Republic | 40.62 | 7 | 5.8028571428571429 | 2025-05-06 00:00:00 | 133 | Medium | Low Risk | + | 43 | Isabelle Mercier | France | 40.62 | 7 | 5.8028571428571429 | 2025-06-06 00:00:00 | 102 | Medium | Low Risk | + | 48 | Johannes Van der Berg | Netherlands | 40.62 | 7 | 5.8028571428571429 | 2025-09-12 00:00:00 | 4 | Medium | Active | + | 1 | Luís Gonçalves | Brazil | 39.62 | 7 | 5.6600000000000000 | 2025-08-07 00:00:00 | 40 | Medium | Active | + | 3 | François Tremblay | Canada | 39.62 | 7 | 5.6600000000000000 | 2025-09-20 00:00:00 | -4 | Medium | Active | + | 4 | Bjørn Hansen | Norway | 39.62 | 7 | 5.6600000000000000 | 2025-10-03 00:00:00 | -17 | Medium | Active | + | 17 | Jack Smith | USA | 39.62 | 7 | 5.6600000000000000 | 2024-07-31 00:00:00 | 412 | Medium | High Risk | + | 20 | Dan Miller | USA | 39.62 | 7 | 5.6600000000000000 | 2025-11-21 00:00:00 | -66 | Medium | Active | + | 22 | Heather Leacock | USA | 39.62 | 7 | 5.6600000000000000 | 2025-07-07 00:00:00 | 71 | Medium | Active | + | 34 | João Fernandes | Portugal | 39.62 | 7 | 5.6600000000000000 | 2024-10-01 00:00:00 | 350 | Medium | Medium Risk | + | 42 | Wyatt Girard | France | 39.62 | 7 | 5.6600000000000000 | 2025-11-03 00:00:00 | -48 | Medium | Active | + | 15 | Jennifer Peterson | Canada | 38.62 | 7 | 5.5171428571428571 | 2024-12-15 00:00:00 | 275 | Medium | Medium Risk | + | 19 | Tim Goyer | USA | 38.62 | 7 | 5.5171428571428571 | 2024-09-13 00:00:00 | 368 | Medium | High Risk | + | 39 | Camille Bernard | France | 38.62 | 7 | 5.5171428571428571 | 2025-09-07 00:00:00 | 9 | Medium | Active | + | 40 | Dominique Lefebvre | France | 38.62 | 7 | 5.5171428571428571 | 2024-08-13 00:00:00 | 399 | Medium | High 
Risk | + | 51 | Joakim Johansson | Sweden | 38.62 | 7 | 5.5171428571428571 | 2024-12-02 00:00:00 | 288 | Medium | Medium Risk | + | 58 | Manoj Pareek | India | 38.62 | 7 | 5.5171428571428571 | 2025-12-22 00:00:00 | -97 | Medium | Active | + | 2 | Leonie Köhler | Germany | 37.62 | 7 | 5.3742857142857143 | 2024-07-13 00:00:00 | 430 | Low | High Risk | + | 8 | Daan Peeters | Belgium | 37.62 | 7 | 5.3742857142857143 | 2025-10-04 00:00:00 | -18 | Low | Active | + | 9 | Kara Nielsen | Denmark | 37.62 | 7 | 5.3742857142857143 | 2025-02-02 00:00:00 | 226 | Low | Medium Risk | + | 10 | Eduardo Martins | Brazil | 37.62 | 7 | 5.3742857142857143 | 2025-08-12 00:00:00 | 35 | Low | Active | + | 11 | Alexandre Rocha | Brazil | 37.62 | 7 | 5.3742857142857143 | 2025-03-18 00:00:00 | 182 | Low | Medium Risk | + | 12 | Roberto Almeida | Brazil | 37.62 | 7 | 5.3742857142857143 | 2025-10-05 00:00:00 | -19 | Low | Active | + | 13 | Fernanda Ramos | Brazil | 37.62 | 7 | 5.3742857142857143 | 2024-11-01 00:00:00 | 319 | Low | Medium Risk | + | 14 | Mark Philips | Canada | 37.62 | 7 | 5.3742857142857143 | 2025-05-11 00:00:00 | 128 | Low | Low Risk | + | 16 | Frank Harris | USA | 37.62 | 7 | 5.3742857142857143 | 2025-07-04 00:00:00 | 74 | Low | Active | + | 18 | Michelle Brooks | USA | 37.62 | 7 | 5.3742857142857143 | 2025-10-08 00:00:00 | -22 | Low | Active | + | 21 | Kathy Chase | USA | 37.62 | 7 | 5.3742857142857143 | 2025-12-04 00:00:00 | -79 | Low | Active | + | 23 | John Gordon | USA | 37.62 | 7 | 5.3742857142857143 | 2025-12-04 00:00:00 | -79 | Low | Active | + | 27 | Patrick Gray | USA | 37.62 | 7 | 5.3742857142857143 | 2025-10-13 00:00:00 | -27 | Low | Active | + | 29 | Robert Brown | Canada | 37.62 | 7 | 5.3742857142857143 | 2025-12-06 00:00:00 | -81 | Low | Active | + | 30 | Edward Francis | Canada | 37.62 | 7 | 5.3742857142857143 | 2025-01-02 00:00:00 | 257 | Low | Medium Risk | + | 31 | Martha Silk | Canada | 37.62 | 7 | 5.3742857142857143 | 2025-07-12 00:00:00 | 66 | Low | 
Active | + | 32 | Aaron Mitchell | Canada | 37.62 | 7 | 5.3742857142857143 | 2025-02-15 00:00:00 | 213 | Low | Medium Risk | + | 33 | Ellie Sullivan | Canada | 37.62 | 7 | 5.3742857142857143 | 2025-09-04 00:00:00 | 12 | Low | Active | + | 35 | Madalena Sampaio | Portugal | 37.62 | 7 | 5.3742857142857143 | 2025-12-09 00:00:00 | -84 | Low | Active | + | 36 | Hannah Schneider | Germany | 37.62 | 7 | 5.3742857142857143 | 2024-11-14 00:00:00 | 306 | Low | Medium Risk | + | 38 | Niklas Schröder | Germany | 37.62 | 7 | 5.3742857142857143 | 2024-06-30 00:00:00 | 443 | Low | High Risk | + | 41 | Marc Dubois | France | 37.62 | 7 | 5.3742857142857143 | 2025-10-21 00:00:00 | -35 | Low | Active | + | 47 | Lucas Mancini | Italy | 37.62 | 7 | 5.3742857142857143 | 2025-03-05 00:00:00 | 195 | Low | Medium Risk | + | 49 | Stanisław Wójcik | Poland | 37.62 | 7 | 5.3742857142857143 | 2025-04-18 00:00:00 | 151 | Low | Low Risk | + | 50 | Enrique Muñoz | Spain | 37.62 | 7 | 5.3742857142857143 | 2025-11-05 00:00:00 | -50 | Low | Active | + | 52 | Emma Jones | United Kingdom | 37.62 | 7 | 5.3742857142857143 | 2025-06-11 00:00:00 | 97 | Low | Low Risk | + | 53 | Phil Hughes | United Kingdom | 37.62 | 7 | 5.3742857142857143 | 2025-01-15 00:00:00 | 244 | Low | Medium Risk | + | 54 | Steve Murray | United Kingdom | 37.62 | 7 | 5.3742857142857143 | 2025-08-04 00:00:00 | 43 | Low | Active | + | 55 | Mark Taylor | Australia | 37.62 | 7 | 5.3742857142857143 | 2024-08-31 00:00:00 | 381 | Low | High Risk | + | 56 | Diego Gutiérrez | Argentina | 37.62 | 7 | 5.3742857142857143 | 2025-11-08 00:00:00 | -53 | Low | Active | + | 59 | Puja Srivastava | India | 36.64 | 6 | 6.1066666666666667 | 2024-05-30 00:00:00 | 474 | Low | High Risk | + ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_XMf9acssGFrA4PQy1QsOyGJS', + 'tool_calls': None, + }), + dict({ + 'content': ''' + | churn_risk | customers | avg_ltv | 
avg_days_since_last_purchase | + | --- | --- | --- | --- | + | High Risk | 7 | 38.05 | 415.3 | + | Medium Risk | 12 | 38.70 | 266.0 | + | Low Risk | 8 | 41.12 | 125.0 | + | Active | 32 | 39.65 | 16.5 | + ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_PxEO2fausJ3iH9oOiqPKpaku', + 'tool_calls': None, + }), + dict({ + 'content': 'No results found.', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_feR1zKfz6o7B50j6iCcRyLZI', + 'tool_calls': None, + }), + dict({ + 'content': 'No results found.', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_C3MJguOJ62YJtQ4wN6duDsEV', + 'tool_calls': None, + }), + dict({ + 'content': 'No results found.', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_ufaloEQ87VReYLpLxeaf2EmV', + 'tool_calls': None, + }), + dict({ + 'content': ''' + | customer_id | customer_name | country | lifetime_value | orders | avg_order_value | last_purchase_date | days_since_last_purchase | lv_segment | churn_risk | + | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | + | 59 | Puja Srivastava | India | 36.64 | 6 | 6.1066666666666667 | 2024-05-30 00:00:00 | 474 | Low | High Risk | + | 38 | Niklas Schröder | Germany | 37.62 | 7 | 5.3742857142857143 | 2024-06-30 00:00:00 | 443 | Low | High Risk | + | 2 | Leonie Köhler | Germany | 37.62 | 7 | 5.3742857142857143 | 2024-07-13 00:00:00 | 430 | Low | High Risk | + | 17 | Jack Smith | USA | 39.62 | 7 | 5.6600000000000000 | 2024-07-31 00:00:00 | 412 | Medium | High Risk | + | 40 | Dominique Lefebvre | France | 38.62 | 7 | 5.5171428571428571 | 2024-08-13 00:00:00 | 399 | Medium | High Risk | + | 55 | Mark Taylor | Australia | 37.62 | 7 | 
5.3742857142857143 | 2024-08-31 00:00:00 | 381 | Low | High Risk | + | 19 | Tim Goyer | USA | 38.62 | 7 | 5.5171428571428571 | 2024-09-13 00:00:00 | 368 | Medium | High Risk | + ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_pyQOTPrrGRT0IuljYLkbeb0a', + 'tool_calls': None, + }), + dict({ + 'content': 'No results found.', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_8yfYxjCJFppcVcjnmKbTBxoj', + 'tool_calls': None, + }), + dict({ + 'content': 'No results found.', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'tool', + 'tool_call_id': 'call_QklYCtJeAzMn8iMshl0PaYXg', + 'tool_calls': None, + }), + dict({ + 'content': ''' + Here’s a CLV segmentation and churn-risk view built from invoices in the Chinook database. + + How it’s calculated + - Lifetime value (LTV): sum of all invoice totals per customer. + - Orders: count of invoices per customer; AOV: average invoice total. + - Days since last purchase: max(0, today − last invoice date). Future-dated invoices are treated as 0 days. 
+ - LTV segments (percentile-based): + - High: top ~20% by LTV + - Medium: next ~30% + - Low: bottom ~50% + - No Spend: LTV = 0 + - Churn risk: + - Active: 0–90 days + - Low Risk: 91–180 days + - Medium Risk: 181–365 days + - High Risk: >365 days + - Churn Risk: No purchases (never bought) + + Churn-risk summary (all 59 customers) + - High Risk: 7 customers | Avg LTV: 38.05 | Avg days since last purchase: 415.3 + - Medium Risk: 12 customers | Avg LTV: 38.70 | Avg days since last purchase: 266.0 + - Low Risk: 8 customers | Avg LTV: 41.12 | Avg days since last purchase: 125.0 + - Active: 32 customers | Avg LTV: 39.65 | Avg days since last purchase: 16.5 + + Top high-risk customers to prioritize (longest time since last purchase) + - Puja Srivastava (India) — LTV 36.64 — 474 days — LV segment: Low — Churn risk: High + - Niklas Schröder (Germany) — LTV 37.62 — 443 days — LV segment: Low — Churn risk: High + - Leonie Köhler (Germany) — LTV 37.62 — 430 days — LV segment: Low — Churn risk: High + - Jack Smith (USA) — LTV 39.62 — 412 days — LV segment: Medium — Churn risk: High + - Dominique Lefebvre (France) — LTV 38.62 — 399 days — LV segment: Medium — Churn risk: High + - Mark Taylor (Australia) — LTV 37.62 — 381 days — LV segment: Low — Churn risk: High + - Tim Goyer (USA) — LTV 38.62 — 368 days — LV segment: Medium — Churn risk: High + + Notes and next steps + - The percent-rank approach keeps LTV segments relative to your customer base. As spending patterns shift, segment thresholds adjust automatically. + - If you want fixed-dollar LTV segments (e.g., Low < $30, Medium $30–$45, High > $45) or different churn thresholds, say the word and I’ll rerun the analysis. + - I can export the full customer-level table (customer, country, LTV, orders, AOV, last purchase date, days since last purchase, LTV segment, churn risk) as CSV if needed. 
+ ''', + 'control_plane_step': None, + 'function_call': None, + 'name': None, + 'reasoning_content': None, + 'role': 'assistant', + 'tool_call_id': None, + 'tool_calls': None, + }), + ]), + 'pid': None, + 'rollout_status': dict({ + 'code': , + 'details': list([ + ]), + 'message': 'Rollout is running', + }), + 'tools': list([ + dict({ + 'description': None, + 'name': 'execute_sql', + 'parameters': dict({ + 'additionalProperties': False, + 'properties': dict({ + 'query': dict({ + 'type': 'string', + }), + }), + 'required': list([ + 'query', + ]), + 'type': 'object', + }), + 'strict': True, + 'type': 'function', + }), + ]), + }), + ]) +# --- # name: test_openai_responses_adapter_with_real_response_simple list([ dict({ diff --git a/tests/adapters/test_openai_responses_adapter.py b/tests/adapters/test_openai_responses_adapter.py index 0d146229..bf9edd98 100644 --- a/tests/adapters/test_openai_responses_adapter.py +++ b/tests/adapters/test_openai_responses_adapter.py @@ -42,7 +42,7 @@ def test_openai_responses_adapter_with_real_response_parallel_tool_calls(snapsho assert len(eval_rows) == 1 # Convert to dict for snapshot testing - eval_rows_dict = [row.model_dump() for row in eval_rows] + eval_rows_dict = [row.model_dump(exclude={"created_at", "execution_metadata"}) for row in eval_rows] # Assert against snapshot assert eval_rows_dict == snapshot From 276fd100666a2fdddf0bbc62ca8717b5cee26478 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 16:42:35 -0700 Subject: [PATCH 16/23] update snapshot --- .../test_openai_responses_adapter.ambr | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr index b2bc145b..ffe05a94 100644 --- a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr +++ b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr @@ -422,21 +422,23 @@ }), 'tools': 
list([ dict({ - 'description': None, - 'name': 'execute_sql', - 'parameters': dict({ - 'additionalProperties': False, - 'properties': dict({ - 'query': dict({ - 'type': 'string', + 'function': dict({ + 'description': None, + 'name': 'execute_sql', + 'parameters': dict({ + 'additionalProperties': False, + 'properties': dict({ + 'query': dict({ + 'type': 'string', + }), }), + 'required': list([ + 'query', + ]), + 'type': 'object', }), - 'required': list([ - 'query', - ]), - 'type': 'object', + 'strict': True, }), - 'strict': True, 'type': 'function', }), ]), @@ -643,21 +645,23 @@ }), 'tools': list([ dict({ - 'description': None, - 'name': 'execute_sql', - 'parameters': dict({ - 'additionalProperties': False, - 'properties': dict({ - 'query': dict({ - 'type': 'string', + 'function': dict({ + 'description': None, + 'name': 'execute_sql', + 'parameters': dict({ + 'additionalProperties': False, + 'properties': dict({ + 'query': dict({ + 'type': 'string', + }), }), + 'required': list([ + 'query', + ]), + 'type': 'object', }), - 'required': list([ - 'query', - ]), - 'type': 'object', + 'strict': True, }), - 'strict': True, 'type': 'function', }), ]), From c1118a98a523f6165494646b8851c2aee73ecf3b Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 16:42:42 -0700 Subject: [PATCH 17/23] add pyright ignore statements --- .../pydantic/test_pydantic_complex_queries_responses.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py b/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py index dba824bb..5ed1e377 100644 --- a/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py +++ b/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py @@ -27,11 +27,11 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: return setup_agent(model) -@pytest.mark.skipif( +@pytest.mark.skipif( # pyright: ignore[reportAttributeAccessIssue] os.environ.get("CI") 
== "true", reason="This was only run locally to generate traces in Responses API", ) -@pytest.mark.asyncio +@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] @evaluation_test( input_rows=[collect_dataset()], completion_params=[ From 1d138c66be3e894037d5abd80e7f19921dc91a4e Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 16:42:56 -0700 Subject: [PATCH 18/23] add __init__ and export judge as aha_judge --- eval_protocol/quickstart/__init__.py | 4 ++++ eval_protocol/quickstart/llm_judge.py | 11 ++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) create mode 100644 eval_protocol/quickstart/__init__.py diff --git a/eval_protocol/quickstart/__init__.py b/eval_protocol/quickstart/__init__.py new file mode 100644 index 00000000..24592748 --- /dev/null +++ b/eval_protocol/quickstart/__init__.py @@ -0,0 +1,4 @@ +from .llm_judge import aha_judge +from .utils import split_multi_turn_rows + +__all__ = ["aha_judge"] diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py index 7c1be954..50ecc1d0 100644 --- a/eval_protocol/quickstart/llm_judge.py +++ b/eval_protocol/quickstart/llm_judge.py @@ -2,9 +2,11 @@ Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto. """ +from collections.abc import Awaitable, Callable import os from datetime import datetime from typing import List, Dict, Any, Optional +from typing_extensions import cast from tqdm import tqdm import pytest @@ -56,6 +58,10 @@ mode="all", ) async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]: + return await aha_judge(rows) + + +async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]: """ LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons. 
@@ -73,8 +79,6 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]: Same rows with updated evaluation_result containing scores and judgments """ - judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py. - if not rows: print("❌ No evaluation rows provided") return rows @@ -130,7 +134,4 @@ async def run_judgment(row): if row.evaluation_result: row.evaluation_result.score = mean_score - # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace. - push_scores_to_langfuse(rows, model_name, mean_score) - return rows From 4f0c834f44b869fb2c9c9a023169cad5dc0f0668 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 16:44:55 -0700 Subject: [PATCH 19/23] properly convert responses tools to chat completion tools --- eval_protocol/adapters/openai_responses.py | 31 ++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/eval_protocol/adapters/openai_responses.py b/eval_protocol/adapters/openai_responses.py index d71be690..8380ce06 100644 --- a/eval_protocol/adapters/openai_responses.py +++ b/eval_protocol/adapters/openai_responses.py @@ -4,11 +4,13 @@ to EvaluationRow format for use in evaluation pipelines. 
""" -from collections.abc import Iterable +from collections.abc import Iterable, Sequence import logging from typing import List +from typing_extensions import Any from openai.pagination import SyncCursorPage +from openai.types.chat.chat_completion_function_tool_param import ChatCompletionFunctionToolParam from openai.types.chat.chat_completion_message import FunctionCall from openai.types.responses import Response from openai.types.responses.response_item import ResponseItem @@ -16,6 +18,7 @@ ChatCompletionMessageFunctionToolCall, Function, ) +from openai.types.responses.tool import Tool from eval_protocol.models import EvaluationRow, InputMetadata, Message @@ -78,9 +81,11 @@ def _create_evaluation_row(self, input_items: SyncCursorPage[ResponseItem], resp messages.extend(self._create_messages(input_items)) if response.output_text: messages.append(Message(role="assistant", content=response.output_text)) + tools = self._responses_tools_to_chat_completion_tools(response.tools) + tool_dicts = [dict(tool) for tool in tools] return EvaluationRow( messages=messages, - tools=[tool.model_dump() for tool in response.tools], + tools=tool_dicts, input_metadata=InputMetadata( completion_params={ "model": response.model, @@ -108,6 +113,28 @@ def _create_evaluation_row(self, input_items: SyncCursorPage[ResponseItem], resp ), ) + def _responses_tools_to_chat_completion_tools( + self, tools: List[Tool] + ) -> Sequence[ChatCompletionFunctionToolParam]: + """Convert OpenAI Responses API tools to chat completion message function tool calls.""" + chat_completion_tools: List[ChatCompletionFunctionToolParam] = [] + for tool in tools: + if tool.type == "function": + chat_completion_tools.append( + { + "type": "function", + "function": { + "name": tool.name, + "parameters": tool.parameters or {}, + "strict": tool.strict, + "description": tool.description or "", + }, + } + ) + else: + raise NotImplementedError("Only function tools are supported") + return chat_completion_tools + def 
_create_messages(self, input_items: SyncCursorPage[ResponseItem]) -> Iterable[Message]: """Create messages from input items. From 544a340040b1925de38aeacb63f4846674d98fb6 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 22:06:21 -0700 Subject: [PATCH 20/23] update snapshots --- .../adapters/__snapshots__/test_openai_responses_adapter.ambr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr index ffe05a94..ad45893b 100644 --- a/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr +++ b/tests/adapters/__snapshots__/test_openai_responses_adapter.ambr @@ -423,7 +423,7 @@ 'tools': list([ dict({ 'function': dict({ - 'description': None, + 'description': '', 'name': 'execute_sql', 'parameters': dict({ 'additionalProperties': False, @@ -646,7 +646,7 @@ 'tools': list([ dict({ 'function': dict({ - 'description': None, + 'description': '', 'name': 'execute_sql', 'parameters': dict({ 'additionalProperties': False, From 7175428b23a0c01dc762b2461d8eb25421ba19f5 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 22:07:17 -0700 Subject: [PATCH 21/23] remove duplicate test --- tests/pytest/test_metadata_handling.py | 110 ------------------------- 1 file changed, 110 deletions(-) delete mode 100644 tests/pytest/test_metadata_handling.py diff --git a/tests/pytest/test_metadata_handling.py b/tests/pytest/test_metadata_handling.py deleted file mode 100644 index 9046d6c2..00000000 --- a/tests/pytest/test_metadata_handling.py +++ /dev/null @@ -1,110 +0,0 @@ -import pytest -from typing import Any, Dict -from unittest.mock import Mock -from pydantic_ai import Agent -from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel -from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata -from eval_protocol.pytest.default_pydantic_ai_rollout_processor import 
PydanticAgentRolloutProcessor -from eval_protocol.pytest.types import RolloutProcessorConfig - - -def test_pydantic_ai_metadata_only_stored_for_responses_model(): - """Test that PydanticAI metadata is only stored in extra_body for ResponsesModel, not for ChatModel.""" - - # Create a test row with metadata - row = EvaluationRow( - input_metadata=InputMetadata(row_id="test-row-123"), - execution_metadata=ExecutionMetadata( - invocation_id="test-invocation-456", - rollout_id="test-rollout-789", - run_id="test-run-101", - experiment_id="test-experiment-202", - ), - messages=[], - ) - - # Test with OpenAIChatModel (should NOT store metadata) - chat_model = OpenAIChatModel("gpt-4") - chat_agent = Agent(model=chat_model) - processor = PydanticAgentRolloutProcessor(lambda config: chat_agent) - - settings = processor.construct_model_settings(chat_agent, row) - - # ChatModel should not have metadata in extra_body - extra_body = settings.get("extra_body", {}) - assert isinstance(extra_body, dict), "extra_body should be a dict" - assert "metadata" not in extra_body, "ChatModel should not store metadata in extra_body" - - # Test with OpenAIResponsesModel (should store metadata) - responses_model = OpenAIResponsesModel("gpt-5") - responses_agent = Agent(model=responses_model) - processor_responses = PydanticAgentRolloutProcessor(lambda config: responses_agent) - - settings_responses = processor_responses.construct_model_settings(responses_agent, row) - - # ResponsesModel should have metadata in extra_body - extra_body_responses = settings_responses.get("extra_body", {}) - assert isinstance(extra_body_responses, dict), "extra_body should be a dict" - assert "metadata" in extra_body_responses, "ResponsesModel should store metadata in extra_body" - - metadata = extra_body_responses["metadata"] - assert isinstance(metadata, dict), "metadata should be a dict" - assert metadata["row_id"] == "test-row-123" - assert metadata["invocation_id"] == "test-invocation-456" - assert 
metadata["rollout_id"] == "test-rollout-789" - assert metadata["run_id"] == "test-run-101" - assert metadata["experiment_id"] == "test-experiment-202" - - -def test_pydantic_ai_metadata_handling_with_string_model(): - """Test that PydanticAI string models don't cause issues with metadata handling.""" - - # Create a test row - row = EvaluationRow( - input_metadata=InputMetadata(row_id="test-row-123"), - execution_metadata=ExecutionMetadata( - invocation_id="test-invocation-456", - rollout_id="test-rollout-789", - run_id="test-run-101", - experiment_id="test-experiment-202", - ), - messages=[], - ) - - # Create agent with string model (should not store metadata) - agent = Agent(model="gpt-4") - processor = PydanticAgentRolloutProcessor(lambda config: agent) - - settings = processor.construct_model_settings(agent, row) - - # String model should not have metadata in extra_body - extra_body = settings.get("extra_body", {}) - assert isinstance(extra_body, dict), "extra_body should be a dict" - assert "metadata" not in extra_body, "String model should not store metadata in extra_body" - - -def test_pydantic_ai_metadata_handling_with_none_model(): - """Test that PydanticAI None model doesn't cause issues with metadata handling.""" - - # Create a test row - row = EvaluationRow( - input_metadata=InputMetadata(row_id="test-row-123"), - execution_metadata=ExecutionMetadata( - invocation_id="test-invocation-456", - rollout_id="test-rollout-789", - run_id="test-run-101", - experiment_id="test-experiment-202", - ), - messages=[], - ) - - # Create agent with None model (should not store metadata) - agent = Agent(model=None) - processor = PydanticAgentRolloutProcessor(lambda config: agent) - - settings = processor.construct_model_settings(agent, row) - - # None model should not have metadata in extra_body - extra_body = settings.get("extra_body", {}) - assert isinstance(extra_body, dict), "extra_body should be a dict" - assert "metadata" not in extra_body, "None model should not 
store metadata in extra_body" From f585ce01307cde8169bca2f8ca666af9ab203271 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 22:07:24 -0700 Subject: [PATCH 22/23] add responses example --- .../quickstart/llm_judge_openai_responses.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 eval_protocol/quickstart/llm_judge_openai_responses.py diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py new file mode 100644 index 00000000..056abfc3 --- /dev/null +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -0,0 +1,55 @@ +""" +LLM Judge quickstart that PULLS DATA FROM OpenAI Responses API and persists results locally via Eval Protocol. + +This mirrors `eval_protocol/quickstart/llm_judge.py` (Langfuse source), but uses +OpenAI Responses API as the source of evaluation rows. + +Env vars: + export OPENAI_API_KEY=... # required to fetch examples + +Judge model keys: + - Default judge is "gemini-2.5-pro" from utils; requires GEMINI_API_KEY + - Or set judge in the code to "gpt-4.1" and export OPENAI_API_KEY + +Run: + pytest python-sdk/eval_protocol/quickstart/llm_judge_openai_responses.py -q -s +""" + +import os +from typing import List + +import pytest + +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor +from eval_protocol.quickstart import aha_judge, split_multi_turn_rows +from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter + +adapter = OpenAIResponsesAdapter() +input_rows = adapter.get_evaluation_rows( + response_ids=[ + "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f", + "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c", + ] +) + + +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] 
+@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] +@evaluation_test( + input_rows=[input_rows], + completion_params=[ + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + }, + { + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + }, + ], + rollout_processor=SingleTurnRolloutProcessor(), + preprocess_fn=split_multi_turn_rows, + mode="all", +) +async def test_llm_judge_openai_responses(rows: List[EvaluationRow]) -> List[EvaluationRow]: + return await aha_judge(rows) From 08841e2dbd47ea297855bd6adf27ff124279f8eb Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Tue, 16 Sep 2025 22:18:32 -0700 Subject: [PATCH 23/23] add OPENAI_API_KEY to testing workflow --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d291b6c0..03e836f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -88,6 +88,7 @@ jobs: - name: Run Core Tests with pytest-xdist env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} E2B_API_KEY: ${{ secrets.E2B_API_KEY }} FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}