Skip to content

Commit 58d8409

Browse files
authored
livesvgbench + metadata fix (#83)
* livesvgbench + metadata fix
* bugs in retry processor
1 parent 030e886 commit 58d8409

File tree

9 files changed

+720
-133
lines changed

9 files changed

+720
-133
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ jobs:
9595
--ignore=tests/pytest/test_apps_coding.py \
9696
--ignore=tests/test_tau_bench_airline_smoke.py \
9797
--ignore=tests/pytest/test_svgbench.py \
98+
--ignore=tests/pytest/test_livesvgbench.py \
9899
--cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
99100
100101
- name: Store coverage file

eval_protocol/pytest/evaluation_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,7 @@ async def _execute_with_semaphore(row):
475475
for result in all_results:
476476
for r in result:
477477
if r.eval_metadata is not None:
478+
r.eval_metadata.status = "finished"
478479
r.eval_metadata.passed = passed
479480
active_logger.log(r)
480481

eval_protocol/pytest/plugin.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,13 @@ def pytest_addoption(parser) -> None:
6363
"--ep-max-retry",
6464
action="store",
6565
type=int,
66-
default=None,
66+
default=0,
6767
help=("Failed rollouts (with rollout_status.status == 'error') will be retried up to this many times."),
6868
)
6969
group.addoption(
70-
"--ep-fail-on-permanent-failure",
70+
"--ep-fail-on-max-retry",
7171
action="store",
72-
default=None,
72+
default="true",
7373
choices=["true", "false"],
7474
help=(
7575
"Whether to fail the entire rollout when permanent failures occur after max retries. "
@@ -118,12 +118,10 @@ def pytest_configure(config) -> None:
118118
os.environ["EP_SUMMARY_JSON"] = summary_json_path
119119

120120
max_retry = config.getoption("--ep-max-retry")
121-
if max_retry is not None:
122-
os.environ["EP_MAX_RETRY"] = str(max_retry)
121+
os.environ["EP_MAX_RETRY"] = str(max_retry)
123122

124-
fail_on_permanent_failure = config.getoption("--ep-fail-on-permanent-failure")
125-
if fail_on_permanent_failure is not None:
126-
os.environ["EP_FAIL_ON_PERMANENT_FAILURE"] = fail_on_permanent_failure
123+
fail_on_max_retry = config.getoption("--ep-fail-on-max-retry")
124+
os.environ["EP_FAIL_ON_MAX_RETRY"] = fail_on_max_retry
127125

128126
# Allow ad-hoc overrides of input params via CLI flags
129127
try:

eval_protocol/pytest/utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,13 @@ async def retry_handler(failed_row: EvaluationRow):
280280

281281
async def initial_processor():
282282
"""Process initial batch and spawn retries for failures"""
283-
base_tasks = rollout_processor(fresh_dataset, config)
283+
# catch any task-creation errors and re-raise them immediately, e.g. port already in use
284+
try:
285+
base_tasks = rollout_processor(fresh_dataset, config)
286+
except Exception as e:
287+
print(f"❌ Rollout processor failed to initialize: {e}")
288+
raise e
289+
284290
pending = set(base_tasks)
285291

286292
while pending:
@@ -310,7 +316,7 @@ async def initial_processor():
310316

311317
# only permanent failure rows are put on the queue, so we can check for them here
312318
if finished_row.rollout_status and finished_row.rollout_status.status == "error":
313-
if os.getenv("EP_FAIL_ON_PERMANENT_FAILURE", "true") != "false":
319+
if max_retry > 0 and os.getenv("EP_FAIL_ON_MAX_RETRY", "true") != "false":
314320
raise RuntimeError(
315321
f"Rollout {finished_row.execution_metadata.rollout_id} failed after {max_retry} retries. Errors: {finished_row.rollout_status.termination_reason}"
316322
)

tests/pytest/data/svgbench_dataset.jsonl

Lines changed: 105 additions & 105 deletions
Large diffs are not rendered by default.
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
{"requirements": ["Cow must be clearly recognizable with distinctive bovine features", "Include cow body, head, four legs, tail, and udder", "Add cow ears, eyes, and snout for facial recognition", "Cow should be positioned in a realistic plowing stance (pulling forward)", "Use appropriate cow coloring (black/white patches, brown, or solid color)", "Include a traditional plow with visible blade/share", "Show plow handles extending upward", "Depict connection mechanism between cow and plow (yoke, harness, or chains)", "Plow should appear to be cutting into the soil", "Show ground/soil with visible furrows behind the plow", "Include plowed and unplowed sections of field", "Add simple background elements (horizon line, sky)", "Include basic vegetation or crops"], "prompt": "Write `svg` code to draw an image of a cow plowing a field.", "id": "cow_plowing"}
2-
{"requirements": ["The overall background of the SVG must be white", "All primary elements must be horizontally centered on the canvas", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red)", "Place a prominent search bar directly below the Google logo", "The search bar must be a rounded rectangle with a light gray border", "The search bar must contain a gray magnifying glass icon on the left side", "The search bar must contain a gray microphone icon on the right side", "Place two distinct buttons below the search bar", "The left button must be labeled 'Google Search'", "The right button must be labeled 'I'm Feeling Lucky'", "Buttons should have a light gray background, a thin border, and dark gray text", "Create a header section at the top right of the canvas", "The header must include text links for 'Gmail' and 'Images'", "The header must include a 3x3 grid icon (Google Apps launcher)", "The header must include a prominent 'Sign in' button, typically with a blue background and white text"], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com).", "id": "google_homepage"}
3-
{"requirements": ["Create a primary circular or elliptical shape for the top surface of a round dinner table", "The table should have a distinct color or a simple texture like wood grain", "Include exactly 4 sets of cutlery arranged around the table", "Each cutlery set must consist of a recognizable fork, knife, and spoon", "Position the 4 cutlery sets at distinct place settings (e.g., at 12, 3, 6, and 9 o'clock positions)", "Optionally, include a round dinner plate at each of the 4 place settings", "Place exactly 3 main food dishes on the surface of the table", "First dish: A recognizable roasted turkey, golden-brown in color, showing drumsticks and a plump body", "The turkey should be presented on its own platter or serving dish", "Second dish: A round pizza, cut into slices, with visible crust and toppings", "Third dish: A serving of tacos (at least two), with visible folded shells and fillings (e.g., lettuce, meat, cheese)", "The tacos should be on a plate or in a holder", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap", "The overall perspective should be top-down or slightly isometric"], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos.", "id": "dinner_table"}
1+
{"requirements": ["Cow must be clearly recognizable with distinctive bovine features", "Include cow body, head, four legs, tail, and udder", "Add cow ears, eyes, and snout for facial recognition", "Cow should be positioned in a realistic plowing stance (pulling forward)", "Use appropriate cow coloring (black/white patches, brown, or solid color)", "Include a traditional plow with visible blade/share", "Show plow handles extending upward", "Depict connection mechanism between cow and plow (yoke, harness, or chains)", "Plow should appear to be cutting into the soil", "Show ground/soil with visible furrows behind the plow", "Include plowed and unplowed sections of field", "Add simple background elements (horizon line, sky)", "Include basic vegetation or crops"], "prompt": "Write `svg` code to draw an image of a cow plowing a field."}
2+
{"requirements": ["The overall background of the SVG must be white", "All primary elements must be horizontally centered on the canvas", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red)", "Place a prominent search bar directly below the Google logo", "The search bar must be a rounded rectangle with a light gray border", "The search bar must contain a gray magnifying glass icon on the left side", "The search bar must contain a gray microphone icon on the right side", "Place two distinct buttons below the search bar", "The left button must be labeled 'Google Search'", "The right button must be labeled 'I'm Feeling Lucky'", "Buttons should have a light gray background, a thin border, and dark gray text", "Create a header section at the top right of the canvas", "The header must include text links for 'Gmail' and 'Images'", "The header must include a 3x3 grid icon (Google Apps launcher)", "The header must include a prominent 'Sign in' button, typically with a blue background and white text"], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com)."}
3+
{"requirements": ["Create a primary circular or elliptical shape for the top surface of a round dinner table", "The table should have a distinct color or a simple texture like wood grain", "Include exactly 4 sets of cutlery arranged around the table", "Each cutlery set must consist of a recognizable fork, knife, and spoon", "Position the 4 cutlery sets at distinct place settings (e.g., at 12, 3, 6, and 9 o'clock positions)", "Optionally, include a round dinner plate at each of the 4 place settings", "Place exactly 3 main food dishes on the surface of the table", "First dish: A recognizable roasted turkey, golden-brown in color, showing drumsticks and a plump body", "The turkey should be presented on its own platter or serving dish", "Second dish: A round pizza, cut into slices, with visible crust and toppings", "Third dish: A serving of tacos (at least two), with visible folded shells and fillings (e.g., lettuce, meat, cheese)", "The tacos should be on a plate or in a holder", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap", "The overall perspective should be top-down or slightly isometric"], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos."}

0 commit comments

Comments
 (0)