From 42f7dc6a2bfe03970489dc5421fac4de68837716 Mon Sep 17 00:00:00 2001 From: Kilian Lieret Date: Thu, 18 Jun 2026 02:28:30 +0000 Subject: [PATCH] Data: sync test-ignore updates from internal reference Port the June 2026 test-ignore updates from the internal reference harness into ProgramBench across 41 instances (+270 ignored tests total), keeping all metadata verbatim, and update CLAUDE.md accordingly. Branch-hash keys and per-branch test name-sets are unchanged, so no test blob re-upload is needed; only ignore metadata differs. The testorg__calculator fixture is untouched. Internal-reference-commit: 1ca7719a46b9fc521af2353e287b4f07f4c071ea Internal-reference-commit: a8f153ad485daf95e2041b5e872b5b715b7f5410 Internal-reference-commit: b6466b9e33c2abd88ce8ca06037c876b1edcc864 Internal-reference-commit: d7f13854873fd23b60606729d649f31a4dc7d7ac Internal-reference-commit: 7b3a00e057cc625c86bcb9988a439ae50adb0828 Internal-reference-commit: 83488589c5237b4b9af904e1809997df735106f0 Internal-reference-commit: 3bbf4632e7d5e978b817bc168fa0897d0bbc2161 Internal-reference-commit: 691dcaa75a89d8d6f22fc7253c641375df9c3bdf --- CLAUDE.md | 20 + .../tasks/antonmedv__walk.bf802ef/tests.json | 754 ++++++++++++++++++ .../ast-grep__ast-grep.dde0fe0/tests.json | 87 ++ .../bensadeh__tailspin.6278437/tests.json | 11 + .../burntsushi__ripgrep.3b7fd44/tests.json | 35 + .../tasks/canop__broot.d6c798e/tests.json | 11 + .../data/tasks/chirlu__sox.42b3557/tests.json | 66 ++ .../dandavison__delta.acd758f/tests.json | 11 + .../tasks/duckdb__duckdb.bdb65ec/tests.json | 17 + .../tasks/ekzhang__bore.8e059cd/tests.json | 11 + .../tasks/elkowar__pipr.fae0b17/tests.json | 11 + .../tasks/ffmpeg__ffmpeg.360a402/tests.json | 105 +++ .../tasks/gromacs__gromacs.665ea4c/tests.json | 121 +++ .../data/tasks/hatoo__oha.8dc6349/tests.json | 91 +++ .../tasks/htop-dev__htop.523600b/tests.json | 11 + .../tasks/isona__dirble.e2dea9f/tests.json | 52 ++ .../data/tasks/jgm__pandoc.5caad90/tests.json | 154 ++++ 
.../tasks/jrnxf__thokr.09375ef/tests.json | 80 ++ .../tasks/junegunn__fzf.b56d614/tests.json | 22 + .../kisielk__errcheck.dacab89/tests.json | 11 + .../tasks/kyoheiu__felix.95df390/tests.json | 11 + .../tasks/mkj__dropbear.75f699b/tests.json | 192 +++++ .../nikolassv__bartib.6b9b5ce/tests.json | 11 + .../data/tasks/noborus__ov.b96c2ba/tests.json | 572 +++++++++++++ .../data/tasks/orf__gping.26eb5b9/tests.json | 17 + .../data/tasks/peco__peco.4e58dad/tests.json | 144 +++- .../data/tasks/pls-rs__pls.4e1ae50/tests.json | 11 + .../tasks/raviqqe__muffet.a882908/tests.json | 17 + .../rcoh__angle-grinder.9c2fc88/tests.json | 36 + .../rhysd__kiro-editor.4157485/tests.json | 94 +++ .../data/tasks/rs__curlie.5dfcbb1/tests.json | 129 +++ .../tests.json | 25 +- .../tasks/sheepla__pingu.926d475/tests.json | 55 ++ .../tasks/sqlite__sqlite.839433d/tests.json | 62 ++ .../tests.json | 11 + .../svenstaro__genact.16f96e3/tests.json | 17 + .../tasks/tstack__lnav.ee34494/tests.json | 11 + .../unhappychoice__gittype.34b72d0/tests.json | 235 ++++++ .../tasks/y2z__monolith.8702e66/tests.json | 11 + .../yassinebridi__serpl.c48a9d7/tests.json | 345 ++++++++ .../tasks/ys-l__flamelens.0b4dc33/tests.json | 18 + .../data/tasks/zk-org__zk.10d93d5/tests.json | 11 + 42 files changed, 3714 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5aaee0d..0dc1596 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,6 +4,26 @@ ProgramBench evaluates whether LM-based SWE-agents can reverse-engineer black-box software systems. The workflow: take an open-source CLI tool (mostly Rust/Go), compile it into a Docker image with source removed, then have an LM agent re-implement it from scratch by interacting only with the binary. Behavioral tests (also LM-generated) score the re-implementation. +## Test ignore reasons + +Some behavioral tests are unreliable and are excluded from scoring. 
Each excluded +test is recorded under `branches.<branch-hash>.ignored_tests[]` in a task's `tests.json`, +with one or more `reasons[].id` explaining why. All ignored tests are excluded from +scoring regardless of reason; the id is informational. + +- `gold_fail` — test fails **deterministically** on the reference (gold) solution, so it + is defective rather than discriminating. Also covers golden-output drift (the gold + binary is correct but the captured golden is stale/non-reproducible relative to the + build toolchain, an embedded build-stamp, or an external resource). +- `gold_flaky` — test is **non-deterministic** on the gold solution: it passes in some + runs and fails in others. These are timing/race/network/TUI-snapshot flakes, not real + defects (distinct from the deterministic `gold_fail`). +- `dummy_pass` — test passes even on a trivial/dummy executable, so it fails to + distinguish a real implementation from a stub. +- `outcome_dependent_presence` — test appears in some eval runs but not others. +- `slow_or_hang` — test hangs mid-call or exceeds a duration threshold. +- `ignored_manual` — manually excluded. + ## Quick reference ```bash diff --git a/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json b/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json index 9c188e1..4a614d8 100644 --- a/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json +++ b/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json @@ -1164,6 +1164,42 @@ "tests.test_tui_search.test_typing_in_search_mode" ], "ignored_tests": [ + { + "name": "tests.test_cli_config.test_dir_only_flag_shows_only_directories", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_config.test_dir_only_with_icons_combines_both_features", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_cli_config.test_empty_directory_shows_no_files_message", "reasons": [ @@ -1184,6 +1220,96 @@ } ] }, + { + "name": "tests.test_cli_config.test_icons_flag_enables_icons", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_config.test_multiple_flags_together", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_config.test_path_argument_starts_in_specified_directory", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_config.test_path_argument_with_flags", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_config.test_preview_flag_enables_preview_mode", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_cli_config.test_preview_mode_empty_directory", "reasons": [ @@ -1192,6 +1318,42 @@ } ] }, + { + "name": "tests.test_cli_config.test_walk_main_color_env_changes_cursor_color", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_config.test_walk_no_highlight_env_disables_syntax_highlighting", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_cli_config.test_with_border_flag_adds_border_in_preview", "reasons": [ @@ -1212,6 +1374,60 @@ } ] }, + { + "name": "tests.test_edge_cases.test_empty_file_preview", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_edge_cases.test_hidden_files_shown_by_default", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_edge_cases.test_large_directory_500_files", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_edge_cases.test_owner_function_basic", "reasons": [ @@ -1376,6 +1592,96 @@ } ] }, + { + "name": "tests.test_edge_cases.test_special_characters_in_filenames", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_edge_cases.test_symlinks_display_in_listing", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_edge_cases.test_tab_character_in_filename", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_edge_cases.test_unicode_filenames_display_correctly", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_edge_cases.test_very_long_filenames_display", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_gap_main.test_dir_only_flag", "reasons": [ @@ -1600,6 +1906,330 @@ } ] }, + { + "name": "tests.test_icons.test_archive_extensions_show_zip_icon", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_files_without_extension_show_executable_or_generic_icon", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_hidden_files_still_show_extension_icons", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_icons_disabled_shows_no_icons", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_media_files_show_appropriate_icons", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_programming_language_extensions_show_correct_icons", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_specific_filenames_get_special_icons", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_unknown_extensions_show_generic_file_icon", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_uppercase_extensions_normalized_to_lowercase", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_icons.test_wildcard_filename_patterns_match_correctly", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_animated_gif_renders_first_frame", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_corrupted_jpeg_shows_error", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_corrupted_png_shows_error", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_file_with_no_extension_not_treated_as_image", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_gif_with_mixed_case_extension", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_jpeg_with_uppercase_extension", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_png_image_preview", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_image_preview.test_subdirectory_navigation_image_preview", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_image_preview.test_symlink_to_image_renders", "reasons": [ @@ -1608,6 +2238,96 @@ } ] }, + { + "name": "tests.test_image_preview.test_text_file_not_treated_as_image", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_navigation.test_backspace_exits_directory", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_navigation.test_initial_directory_listing", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_navigation.test_multicolumn_layout", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_navigation.test_toggle_hidden_files", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI/listing output: an extra size line (e.g. 
'20B') appears nondeterministically", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_operations.test_delete_empty_directory", "reasons": [ @@ -1624,6 +2344,40 @@ } ] }, + { + "name": "tests.test_tui_operations.test_multiple_delete_undo_cycles", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "TUI delete/undo temp-file existence race (filesystem state non-deterministic under tui2cli capture)", + "pass": 19, + "pattern": "ppppppppppfppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_operations.test_multiple_pending_deletions_stacked", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "TUI delete/undo temp-file existence race (filesystem state non-deterministic under tui2cli capture)", + "pass": 19, + "pattern": "ppppppppppfppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_operations.test_undo_most_recent_deletion_only", "reasons": [ diff --git a/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json b/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json index c7c6d3c..a24bd2d 100644 --- a/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json +++ b/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json @@ -981,6 +981,40 @@ } ] }, + { + "name": "eval.tests.test_language_features.test_mixed_js_and_css_injections_in_html", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "transient nondeterminism in HTML multi-language injection matching (passes 19/20)", + "pass": 19, + "pattern": "pfpppppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": 
"eval.tests.test_language_features.test_mixed_typescript_and_javascript_in_html", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "transient nondeterminism in HTML multi-language injection matching (passes 19/20)", + "pass": 19, + "pattern": "ppppppppppppppfppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_language_features.test_multiple_js_script_tags_in_html", "reasons": [ @@ -991,6 +1025,41 @@ } ] }, + { + "name": "eval.tests.test_language_features.test_plain_js_script_not_matched_as_typescript", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "transient nondeterminism in HTML multi-language injection matching (passes 19/20)", + "pass": 19, + "pattern": "ppppppppppppppfppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "eval.tests.test_language_features.test_typescript_type_alias_in_html", + "reasons": [ + { + "extra": { + "cause": "Nondeterministic analysis/diagnostic ordering (parallel rule evaluation)", + "statuses": [ + "failure", + "passed", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_multiple_fixers.test_three_fixers_json_consistent_replacement", "reasons": [ @@ -1011,6 +1080,24 @@ } ] }, + { + "name": "eval.tests.test_parameterized_advanced.test_error_cyclic_dependency", + "reasons": [ + { + "extra": { + "cause": "Nondeterministic analysis/diagnostic ordering (parallel rule evaluation)", + "statuses": [ + "failure", + "passed", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_parameterized_advanced.test_parameterized_utility_with_stopBy", "reasons": [ diff --git a/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json 
b/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json index d09be48..6ce26be 100644 --- a/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json +++ b/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json @@ -369,6 +369,17 @@ "id": "dummy_pass" } ] + }, + { + "name": "eval.tests.test_follow_mode.test_follow_flag_short", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] } ] }, diff --git a/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json b/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json index b542fc4..b0dd8ee 100644 --- a/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json +++ b/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json @@ -5315,6 +5315,23 @@ "tests.test_walk_errors.test_whitespace_only_gitignore" ], "ignored_tests": [ + { + "name": "tests.test_cli_utils.test_max_filesize_bytes_format", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "directory-listing order nondeterminism (large_file/small_file output order varies)", + "pass": 19, + "pattern": "pppfpppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_encoding.test_short_encoding_flag_syntax", "reasons": [ @@ -6127,6 +6144,24 @@ } ] }, + { + "name": "tests.test_vimgrep.test_output_mode_no_heading_multifile", + "reasons": [ + { + "extra": { + "cause": "Nondeterministic multifile output ordering (parallel directory walk)", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": 
"tests.test_walk_errors.test_files_with_no_read_permission_as_non_root", "reasons": [ diff --git a/src/programbench/data/tasks/canop__broot.d6c798e/tests.json b/src/programbench/data/tasks/canop__broot.d6c798e/tests.json index b219add..1328eeb 100644 --- a/src/programbench/data/tasks/canop__broot.d6c798e/tests.json +++ b/src/programbench/data/tasks/canop__broot.d6c798e/tests.json @@ -1352,6 +1352,17 @@ } ] }, + { + "name": "tests.test_panel_state.test_toggle_sizes_displays_file_sizes", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_panel_state.test_trash_panel_state", "reasons": [ diff --git a/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json b/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json index 81319b2..cd524cc 100644 --- a/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json +++ b/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json @@ -1334,6 +1334,72 @@ } ] }, + { + "name": "tests.test_cli_options.test_buffer_option_missing_argument", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_options.test_help_effect_nonexistent", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. 
ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_options.test_help_format_nonexistent", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_options.test_help_output_complete", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_options.test_invalid_option", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_cli_options.test_no_input_files", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. 
ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_cli_options.test_reproducible_random", "reasons": [ diff --git a/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json b/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json index 8faf9a3..ab0c878 100644 --- a/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json +++ b/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json @@ -1269,6 +1269,17 @@ } ] }, + { + "name": "tests.test_grep_gaps.test_git_grep_before_context_only", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_grep_gaps.test_git_grep_classic_output_type_override", "reasons": [ diff --git a/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json b/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json index 384f1d4..cd2019d 100644 --- a/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json +++ b/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json @@ -35227,6 +35227,23 @@ } ] }, + { + "name": "tests.test_harvest_sql_2.test_flatten", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "SQL result row-order nondeterminism (no ORDER BY): flatten rows reorder", + "pass": 19, + "pattern": "ppppppppppppppppppfp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_harvest_sql_2.test_format_bytes", "reasons": [ diff --git a/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json 
b/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json index e04ff38..14ad167 100644 --- a/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json +++ b/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json @@ -529,6 +529,17 @@ "id": "gold_fail" } ] + }, + { + "name": "tests.test_harvest.test_basic_proxy[None]@server_tests", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] } ] }, diff --git a/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json b/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json index 12b2c35..06ef978 100644 --- a/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json +++ b/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json @@ -2546,6 +2546,17 @@ } ] }, + { + "name": "tests.test_command_list_window.test_history_enter_selection", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_command_list_window.test_history_escape_closes_window", "reasons": [ diff --git a/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json b/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json index 55ec360..752e269 100644 --- a/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json +++ b/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json @@ -3890,6 +3890,17 @@ } ] }, + { + "name": "tests.test_cmdutils_deep.test_list_encoders_complete", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the 
reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_cmdutils_gaps.test_codecs_list_complete", "reasons": [ @@ -12453,6 +12464,50 @@ } ] }, + { + "name": "tests.test_help_info.test_codecs", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_help_info.test_decoders", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_help_info.test_encoders", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_help_info.test_help_full", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. 
ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_logging.test_default_help_shows_banner", "reasons": [ @@ -12565,6 +12620,39 @@ } ] }, + { + "name": "tests.test_opt_common_final.test_show_codecs_list", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_opt_common_final.test_show_decoders_list", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_opt_common_final.test_show_encoders_list", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. 
ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_option_parsing.test_noautorotate_boolean_negation", "reasons": [ @@ -12584,6 +12672,23 @@ "user": "kilian" } ] + }, + { + "name": "tests.test_textformat_gaps.test_default_format_noprint_wrappers_option", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "test-resource setup race: input.mp4 not present yet when test runs under pytest-xdist", + "pass": 19, + "pattern": "ppppfppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] } ] }, diff --git a/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json b/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json index 92cf950..45060b0 100644 --- a/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json +++ b/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json @@ -2314,6 +2314,127 @@ } ] }, + { + "name": "tests.test_structure.test_gyrate_protein_basic", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_gyrate_weighting_modes", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. 
embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rms_backbone", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rms_calpha_basic", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rms_fit_translation", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rms_mirror_image", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. 
embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rms_nofit", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rms_time_range", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rms_what_rho", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rmsf_calpha_basic", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. 
embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_structure.test_rmsf_residue_averaging", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_tcaf_trjorder.test_tcaf_cubic_averaging_optional", "reasons": [ diff --git a/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json b/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json index ffeaf24..c100ea8 100644 --- a/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json +++ b/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json @@ -1815,6 +1815,28 @@ } ] }, + { + "name": "tests.test_output.test_json_output_details_connection_time_relationships", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_output.test_json_output_details_connection_times", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_output.test_json_output_error_distribution_empty_on_success", "reasons": [ @@ -1857,6 +1879,17 @@ } ] }, + { + "name": 
"tests.test_output.test_json_output_status_code_distribution", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_output.test_json_output_with_higher_request_count", "reasons": [ @@ -1873,6 +1906,17 @@ } ] }, + { + "name": "tests.test_output.test_json_output_with_single_connection", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_output.test_json_schema_validation", "reasons": [ @@ -2105,6 +2149,17 @@ } ] }, + { + "name": "tests.test_timescale.test_time_unit_precision_formatting", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Summary emits fewer 4-decimal time values than the golden expects (oha output-format/version drift). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_timescale.test_time_unit_seconds_text_output", "reasons": [ @@ -2243,6 +2298,24 @@ } ] }, + { + "name": "tests.test_tui.test_tui_multiple_status_codes", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (live progress / HTTP-load output)", + "statuses": [ + "failure", + "passed", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui.test_tui_with_high_concurrency", "reasons": [ @@ -2252,6 +2325,24 @@ "user": "kilian" } ] + }, + { + "name": "tests.test_tui.test_tui_with_post_method", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (live progress / HTTP-load output)", + "statuses": [ + "failure", + "passed", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] } ] }, diff --git a/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json b/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json index fb04388..fbd6acf 100644 --- a/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json +++ b/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json @@ -5184,6 +5184,17 @@ } ] }, + { + "name": "eval.tests.test_ultra_intensive.test_ultra_tree_all_configs", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_update_control.test_delay_flag", "reasons": [ diff --git a/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json b/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json index fca7da9..a2ea0c9 100644 --- 
a/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json +++ b/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json @@ -2460,6 +2460,24 @@ "tests.test_validator.test_validator_with_length_based_detection" ], "ignored_tests": [ + { + "name": "tests.test_edge_cases.test_uri_file_with_mixed_valid_invalid_urls", + "reasons": [ + { + "extra": { + "cause": "Network-dependent (external HTTP/proxy, e.g. httpbin / ipv6 localhost)", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_edge_cases.test_url_with_encoded_spaces", "reasons": [ @@ -3320,6 +3338,23 @@ } ] }, + { + "name": "tests.test_output_formats.test_default_text_output_format", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "local HTTP test-server connection race (curl: Could not connect to 127.0.0.1:8765)", + "pass": 19, + "pattern": "ppppppppppppppppfppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_output_formats.test_hide_lengths_affects_all_output_formats", "reasons": [ @@ -3400,6 +3435,23 @@ } ] }, + { + "name": "tests.test_output_formats.test_show_htaccess_flag", + "reasons": [ + { + "extra": { + "fail": 2, + "note": "local HTTP test-server connection race (curl: Could not connect to 127.0.0.1:8765)", + "pass": 18, + "pattern": "pppppppppppppfppfppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_output_formats.test_silent_mode_no_live_output", "reasons": [ diff --git a/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json b/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json index f8ec440..c72016d 100644 --- a/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json +++ b/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json @@ -5529,6 
+5529,39 @@ "tests.test_zip_output.test_directory_extraction_preserves_all_files" ], "ignored_tests": [ + { + "name": "tests.test_asciidoc_rst_typst.test_asciidoc_attributes_and_metadata", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_asciidoc_rst_typst.test_rst_to_json_ast_structure", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_asciidoc_rst_typst.test_typst_metadata_extraction", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_citations.test_bibliography_entry_ids", "reasons": [ @@ -5689,6 +5722,17 @@ } ] }, + { + "name": "tests.test_encoding_i18n.test_unicode_preservation_in_json_format", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_error_handling.test_medium_large_file_processing", "reasons": [ @@ -5699,6 +5743,17 @@ } ] }, + { + "name": "tests.test_error_paths.test_json_incompatible_api_version", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_filesystem_ops.test_binary_output_format_pdf_requires_output_file", "reasons": [ @@ -5719,6 +5774,17 @@ } ] }, + { + "name": "tests.test_filesystem_ops.test_output_json_format_produces_parseable_json", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_filesystem_ops.test_output_same_as_input_file_handles_atomicity", "reasons": [ @@ -5729,6 +5795,39 @@ } ] }, + { + "name": "tests.test_filters_ast_advanced.test_filter_complex_metadata_structure", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_filters_ast_advanced.test_json_ast_filter_transformation", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_format_matrix.test_markdown_to_json_ast_lossless_machine_readable", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_harvest_cmd_batch1.test_cmd_10094", "reasons": [ @@ -7404,6 +7503,28 @@ } ] }, + { + "name": "tests.test_man_format.test_man_only_title_header", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_man_format.test_man_to_json_ast", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_markdown_headings_lists.test_markdown_headings_roundtrip_atx", "reasons": [ @@ -7424,6 +7545,17 @@ } ] }, + { + "name": "tests.test_org_opml_pod.test_pod_to_json_ast", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_pdf.test_pdf_large_document", "reasons": [ @@ -7478,6 +7610,17 @@ } ] }, + { + "name": "tests.test_rtf_advanced.test_rtf_to_json_ast_structure", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_signal_handling.test_binary_stdin_rejection", "reasons": [ @@ -7672,6 +7815,17 @@ } ] }, + { + "name": "tests.test_tsv_tables.test_tsv_to_json_ast", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_variable_json.test_variable_json_boolean_false", "reasons": [ diff --git a/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json b/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json index 42ca5f7..afa87fc 100644 --- a/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json +++ b/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json @@ -556,6 +556,24 @@ } ] }, + { + "name": "tests.test_logging.test_timed_test_num_secs_has_value", + "reasons": [ + { + "extra": { + "cause": "Timing-dependent (elapsed-time assertion)", + "statuses": [ + "passed", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_basic.test_ctrl_c_exits", "reasons": [ @@ -566,6 +584,51 @@ } ] }, + { + "name": "tests.test_tui_timed.test_timer_display_format", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_timed.test_timer_not_shown_without_s_flag", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "TUI timer rendering timing race (running/paused timer state non-deterministic in captured frame)", + "pass": 19, + "pattern": "pfpppppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_timed.test_timer_pauses_on_results_screen", + "reasons": [ + { + "extra": { + "fail": 13, + "note": "TUI timer rendering timing race (running/paused timer state non-deterministic in captured frame)", + "pass": 7, + "pattern": "ppffffffppfppffffffp", + 
"source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_timed.test_timer_with_3_seconds", "reasons": [ @@ -576,6 +639,23 @@ } ] }, + { + "name": "tests.test_tui_timed.test_timer_with_5_seconds", + "reasons": [ + { + "extra": { + "fail": 5, + "note": "TUI timer rendering timing race (running/paused timer state non-deterministic in captured frame)", + "pass": 15, + "pattern": "fffpppppppfppppppfpp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_timed.test_timer_with_very_long_duration", "reasons": [ diff --git a/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json b/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json index 66245c6..8c36b70 100644 --- a/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json +++ b/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json @@ -4644,6 +4644,17 @@ "tests.test_fzf.TestSchemeAndScoring.test_path_scheme" ], "ignored_tests": [ + { + "name": "tests.test_fzf.TestBasicFunctionality.test_version_flag", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. 
embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_fzf.TestCommandLineParsing.test_short_and_long_flags", "reasons": [ @@ -4703,6 +4714,17 @@ "eval.tests.test_fzf_interactive_tmux.test_interactive_expect_ctrl_j_prints_key_then_selection" ], "ignored_tests": [ + { + "name": "eval.tests.test_fzf_cli.test_version_exact", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_fzf_interactive_tmux.test_interactive_expect_ctrl_j_prints_key_then_selection", "reasons": [ diff --git a/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json b/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json index e64cb0e..50e3563 100644 --- a/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json +++ b/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json @@ -1373,6 +1373,17 @@ } ] }, + { + "name": "tests.test_output.test_exit_code_two_on_fatal_errors", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Fails on v6 gold: output reflects the pinned toolchain / dependency versions (e.g. 
Go compiler diagnostic wording, stdlib net behavior, or generated-code content); the golden captured the previous build environment's output.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_smoke.test_no_args_runs", "reasons": [ diff --git a/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json b/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json index 6bd466f..80903c5 100644 --- a/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json +++ b/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json @@ -2837,6 +2837,17 @@ } ] }, + { + "name": "tests.test_layout_calculations.test_narrow_terminal_below_proper_width", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_magic_packed_extended.test_unpack_plain_text_non_archive_error", "reasons": [ diff --git a/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json b/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json index decbde8..dd533ed 100644 --- a/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json +++ b/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json @@ -1651,6 +1651,23 @@ } ] }, + { + "name": "tests.test_authkey_options_gap.test_malformed_permitopen_non_numeric_port", + "reasons": [ + { + "extra": { + "fail": 12, + "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)", + "pass": 8, + "pattern": "pffppffpppffffpffpff", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_authkey_options_gap.test_multiple_command_options_rejected", "reasons": [ @@ -1661,6 +1678,17 @@ } ] 
}, + { + "name": "tests.test_authkey_options_gap.test_option_case_insensitive", + "reasons": [ + { + "id": "gold_flaky", + "note": "ssh dbclient connection/host-key-mismatch race (same family as other dropbear flakes). caught in postfix gold rerun (21st run); passed 20/20 in 20x sweep", + "timestamp": 1781500166, + "user": "kilian" + } + ] + }, { "name": "tests.test_authkey_options_gap.test_unknown_option_causes_rejection", "reasons": [ @@ -2021,6 +2049,41 @@ } ] }, + { + "name": "tests.test_forwarding_x11_agent.test_agent_socket_actually_exists", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)", + "pass": 19, + "pattern": "pppppppppppppfpppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_forwarding_x11_agent.test_agent_socket_cleanup_on_disconnect", + "reasons": [ + { + "extra": { + "cause": "SSH socket/port/host-key race (random ports, key mismatch)", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_forwarding_x11_agent.test_agent_socket_directory_permissions", "reasons": [ @@ -2031,6 +2094,24 @@ } ] }, + { + "name": "tests.test_forwarding_x11_agent.test_agent_socket_fd_number_in_filename", + "reasons": [ + { + "extra": { + "cause": "SSH socket/port/host-key race (random ports, key mismatch)", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_forwarding_x11_agent.test_agent_socket_is_unix_domain_socket", "reasons": [ @@ -2041,6 +2122,17 @@ } ] }, + { + "name": "tests.test_forwarding_x11_agent.test_agent_socket_path_format", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some 
gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_forwarding_x11_agent.test_agent_socket_permissions", "reasons": [ @@ -2051,6 +2143,23 @@ } ] }, + { + "name": "tests.test_forwarding_x11_agent.test_agent_socket_random_components", + "reasons": [ + { + "extra": { + "fail": 3, + "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)", + "pass": 17, + "pattern": "fpppfpppppppfppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_forwarding_x11_agent.test_no_agent_forwarding_option_code_exists", "reasons": [ @@ -3141,6 +3250,24 @@ } ] }, + { + "name": "tests.test_netio_gap.test_client_connection_to_ipv4_only_server", + "reasons": [ + { + "extra": { + "cause": "SSH socket/port/host-key race (random ports, key mismatch)", + "statuses": [ + "passed", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_netio_gap.test_client_ipv4_mapped_connection", "reasons": [ @@ -3371,6 +3498,23 @@ } ] }, + { + "name": "tests.test_pty_terminal.test_pty_newline_handling", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)", + "pass": 19, + "pattern": "ppppppfppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_pty_terminal.test_pty_not_allocated_without_t_flag", "reasons": [ @@ -3391,6 +3535,23 @@ } ] }, + { + "name": "tests.test_pty_terminal.test_pty_read_large_output", + "reasons": [ + { + "extra": { + 
"fail": 1, + "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)", + "pass": 19, + "pattern": "fppppppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_pty_terminal.test_pty_session_without_shell", "reasons": [ @@ -3511,6 +3672,24 @@ } ] }, + { + "name": "tests.test_pubkey_auth.test_crlf_line_endings", + "reasons": [ + { + "extra": { + "cause": "SSH socket/port/host-key race (random ports, key mismatch)", + "statuses": [ + "failure", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_pubkey_auth.test_different_key_types_ecdsa", "reasons": [ @@ -4643,6 +4822,19 @@ }, { "id": "dummy_pass" + }, + { + "extra": { + "cause": "SSH socket/port/host-key race (random ports, key mismatch)", + "statuses": [ + "failure", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" } ] }, diff --git a/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json b/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json index 9833522..77e4bfd 100644 --- a/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json +++ b/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json @@ -1135,6 +1135,17 @@ } ] }, + { + "name": "tests.test_current_status.test_current_special_characters_in_names", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Table header trailing-whitespace differs by one space from golden (formatter version drift). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_current_status.test_status_aggregation_multiple_same_project", "reasons": [ diff --git a/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json b/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json index 8083c2a..dbc7c34 100644 --- a/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json +++ b/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json @@ -5451,6 +5451,17 @@ } ] }, + { + "name": "tests.test_edit.test_editor_line_number_placeholder", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_edit.test_temp_file_cleanup_after_edit", "reasons": [ @@ -5461,6 +5472,17 @@ } ] }, + { + "name": "tests.test_exec_mode.test_exec_mode_command_error_captured_in_stderr", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_filter_search_advanced.test_backward_search_from_end", "reasons": [ @@ -5481,6 +5503,17 @@ } ] }, + { + "name": "tests.test_filter_search_advanced.test_invalid_regex_fallback_to_literal", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": 
"tests.test_filter_search_advanced.test_regex_search_toggle_during_input", "reasons": [ @@ -5509,6 +5542,28 @@ } ] }, + { + "name": "tests.test_input_header_modes.test_header_column_accepts_numeric_value", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_input_header_modes.test_header_input_empty_string_shows_error", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_input_header_modes.test_header_input_invalid_characters", "reasons": [ @@ -5517,6 +5572,28 @@ } ] }, + { + "name": "tests.test_input_header_modes.test_header_input_mode_prompt_appears", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_input_header_modes.test_header_input_multiple_digits", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_input_header_modes.test_jump_target_empty_string_no_change", "reasons": [ @@ -5527,6 +5604,61 @@ } ] }, + { + 
"name": "tests.test_input_header_modes.test_jump_target_input_mode_prompt_appears", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_input_header_modes.test_jump_target_numeric_value", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_input_modes_advanced.test_filter_mode_up_populates_from_history", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_input_modes_advanced.test_multicolor_confirm_sets_multicolor", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_input_modes_advanced.test_multicolor_up_down_navigation", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + 
"user": "kilian" + } + ] + }, { "name": "tests.test_mouse_marks_sidebar.test_mark_multiple_lines_and_navigate", "reasons": [ @@ -5537,6 +5669,17 @@ } ] }, + { + "name": "tests.test_mouse_marks_sidebar.test_mark_single_line", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_mouse_marks_sidebar.test_remove_all_marks", "reasons": [ @@ -5555,6 +5698,28 @@ } ] }, + { + "name": "tests.test_mouse_marks_sidebar.test_sidebar_document_list", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_mouse_marks_sidebar.test_sidebar_help_display", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_mouse_marks_sidebar.test_sidebar_mark_list_display", "reasons": [ @@ -5563,6 +5728,50 @@ } ] }, + { + "name": "tests.test_move_leftright_advanced.test_column_left_no_cycle_at_start", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": 
"tests.test_movement_functions.test_section_navigation_at_boundaries", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_movement_functions.test_section_navigation_no_delimiter", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_movement_functions.test_section_navigation_previous", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_output_modes.test_exit_write_multiple_flags_order_independence", "reasons": [ @@ -5603,6 +5812,28 @@ } ] }, + { + "name": "tests.test_save_and_converters.test_converter_es_processes_escape_sequences", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_save_and_converters.test_converter_raw_displays_escape_sequences_as_caret_notation", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs 
(timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_save_and_converters.test_raw_flag_equivalent_to_converter_raw", "reasons": [ @@ -5662,6 +5893,347 @@ "id": "gold_fail" } ] + }, + { + "name": "tests.test_search_edge_cases.test_empty_search_pattern_forward", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_search_edge_cases.test_incremental_search_updates", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_search_edge_cases.test_invalid_regex_fallback_to_literal", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_search_edge_cases.test_multicolor_with_quoted_strings", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": 
"tests.test_search_edge_cases.test_next_search_continuation", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_search_edge_cases.test_search_not_found_message", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_search_edge_cases.test_search_numeric_only_pattern", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_search_edge_cases.test_search_symbol_only_pattern", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_search_edge_cases.test_search_wrapping_forward", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": 
"tests.test_search_edge_cases.test_smart_case_with_uppercase_in_pattern", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_advanced.test_alternate_rows", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_advanced.test_caption_display", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_advanced.test_mark_set_and_message", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_advanced.test_ruler_absolute", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_advanced.test_ruler_disabled", + "reasons": 
[ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_advanced.test_status_line_content", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_advanced.test_status_line_disabled", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_control_flow.test_bottom_navigation_triggers_request_bottom", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_control_flow.test_exec_mode_reload_control_reader", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_control_flow.test_follow_mode_request_follow", + "reasons": [ + { + "id": 
"gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_control_flow.test_reload_file_with_f5", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_control_flow.test_search_request_search", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_control_flow.test_watch_mode_periodic_reload", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_input_modes.test_header_column_mode", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_input_modes.test_header_mode_basic", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: 
passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_input_modes.test_jump_target_mode", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_input_modes.test_mark_goto_mode", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_input_modes.test_save_buffer_requires_non_seekable", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_input_modes.test_section_num_mode", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_input_modes.test_skip_lines_mode", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others 
across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_navigation.test_basic_file_display", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] } ] }, diff --git a/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json b/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json index 50cd7ef..96c4131 100644 --- a/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json +++ b/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json @@ -2781,6 +2781,23 @@ } ] }, + { + "name": "tests.test_shortcuts.test_aws_region_shortcut_expansion", + "reasons": [ + { + "extra": { + "fail": 5, + "note": "TUI ping-stats rendering timing race (stats not yet shown in captured frame)", + "pass": 15, + "pattern": "ffppppfpppppppppfppf", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_shortcuts.test_hostname_resolves_to_ipv6_with_flag", "reasons": [ diff --git a/src/programbench/data/tasks/peco__peco.4e58dad/tests.json b/src/programbench/data/tasks/peco__peco.4e58dad/tests.json index fdd4b54..f6d88e2 100644 --- a/src/programbench/data/tasks/peco__peco.4e58dad/tests.json +++ b/src/programbench/data/tasks/peco__peco.4e58dad/tests.json @@ -454,6 +454,23 @@ } ] }, + { + "name": "eval.tests.test_basic.test_dash_in_regular_query", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent", + "pass": 19, + "pattern": "pppfpppppppppppppppp", + 
"source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_basic.test_empty_input_with_exit_0_flag", "reasons": [ @@ -652,6 +669,23 @@ } ] }, + { + "name": "eval.tests.test_basic.test_print_query_with_different_selection", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent", + "pass": 19, + "pattern": "pppppppppppppfpppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_basic.test_query_case_insensitive_default", "reasons": [ @@ -846,6 +880,23 @@ } ] }, + { + "name": "eval.tests.test_coverage.test_config_with_initial_filter", + "reasons": [ + { + "extra": { + "fail": 2, + "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent", + "pass": 18, + "pattern": "pppppppppppppppppfpf", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_coverage.test_config_with_invalid_height_format", "reasons": [ @@ -1150,6 +1201,23 @@ } ] }, + { + "name": "eval.tests.test_coverage.test_print_query_with_no_match", + "reasons": [ + { + "extra": { + "fail": 2, + "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent", + "pass": 18, + "pattern": "pppppfppfppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_coverage.test_prompt_with_select_1", "reasons": [ @@ -1186,6 +1254,23 @@ } ] }, + { + "name": "eval.tests.test_coverage.test_query_with_emoji", + "reasons": [ + { + "extra": { + "fail": 1, + 
"note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent", + "pass": 19, + "pattern": "pppfpppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_coverage.test_query_with_newline", "reasons": [ @@ -4871,6 +4956,40 @@ } ] }, + { + "name": "tests.test_layout_gap.test_layout_bottom_up_page_up", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent", + "pass": 19, + "pattern": "ppppfppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_layout_gap.test_layout_cursor_middle_of_query", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent", + "pass": 19, + "pattern": "ppppfppppppppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_layout_gap.test_layout_few_lines_wrapping", "reasons": [ @@ -5696,6 +5815,17 @@ } ] }, + { + "name": "tests.test_basic.TestSpecialCharacters.test_whitespace_only_lines", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_interactive.TestInteractiveBasic.test_basic_display", "reasons": [ @@ -5902,7 +6032,19 @@ "eval.tests.test_peco_behavior.test_unknown_flag_errors", 
"eval.tests.test_peco_behavior.test_version_format" ], - "ignored_tests": [] + "ignored_tests": [ + { + "name": "eval.tests.test_peco_behavior.test_prompt_option_changes_prompt", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + } + ] }, "6c5b8939620c": { "ignored": false, diff --git a/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json b/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json index 8918848..db1da1b 100644 --- a/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json +++ b/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json @@ -394,6 +394,17 @@ } ] }, + { + "name": "tests.test_filtering.test_importance_filter_cutoff_0", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_filtering.test_importance_filter_cutoff_1", "reasons": [ diff --git a/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json b/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json index 07b313d..cc350fb 100644 --- a/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json +++ b/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json @@ -675,6 +675,23 @@ } ] }, + { + "name": "tests.test_output_formats.test_json_empty_links_array_structure", + "reasons": [ + { + "extra": { + "fail": 8, + "note": "local HTTP test-server dial race (error when dialing 127.0.0.1:8765)", + "pass": 12, + "pattern": "ppfffppfffppppppffpp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", 
+ "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_output_formats.test_json_error_is_string", "reasons": [ diff --git a/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json b/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json index db339ad..d66605a 100644 --- a/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json +++ b/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json @@ -1267,6 +1267,42 @@ } ] }, + { + "name": "tests.test_tty_rendering.test_tty_aggregate_shows_ansi_escape_codes", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (progressive / ANSI tty output)", + "statuses": [ + "passed", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tty_rendering.test_tty_avg_aggregate_shows_progressive_updates", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (progressive / ANSI tty output)", + "statuses": [ + "passed", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_tty_rendering.test_tty_count_aggregate_intermediate_updates", "reasons": [ diff --git a/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json b/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json index eaf4a49..8414a3c 100644 --- a/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json +++ b/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json @@ -1487,6 +1487,17 @@ } ] }, + { + "name": "tests.test_edge_cases.test_horizontal_scroll_on_long_line", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, 
{ "name": "tests.test_edge_cases.test_large_file_navigation_to_end", "reasons": [ @@ -1511,6 +1522,24 @@ } ] }, + { + "name": "tests.test_edge_cases.test_utf8_multibyte_characters", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (editor screen state, utf8 input)", + "statuses": [ + "failure", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_editing.test_backspace_at_line_start", "reasons": [ @@ -1733,6 +1762,24 @@ } ] }, + { + "name": "tests.test_editor_gaps.test_backspace_key_deletes_char", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (editor screen state, utf8 input)", + "statuses": [ + "passed", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_editor_gaps.test_ctrl_bracket_page_down", "reasons": [ @@ -1741,6 +1788,24 @@ } ] }, + { + "name": "tests.test_editor_gaps.test_utf8_character_input", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (editor screen state, utf8 input)", + "statuses": [ + "failure", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_error_coverage.test_control_char_tab", "reasons": [ @@ -1899,6 +1964,35 @@ } ] }, + { + "name": "tests.test_tui_core.test_backspace_deletion", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (editor screen state, utf8 input)", + "statuses": [ + "passed", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_core.test_delete_to_end_of_line", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for 
grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_core.test_page_down_scrolling", "reasons": [ diff --git a/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json b/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json index ec18d8b..366b3f8 100644 --- a/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json +++ b/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json @@ -688,6 +688,42 @@ } ] }, + { + "name": "tests.test_formatting_json.test_deep_nested_json", + "reasons": [ + { + "extra": { + "cause": "Network-dependent (curl connection failures, rc=7)", + "statuses": [ + "failure", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_formatting_json.test_empty_json_array", + "reasons": [ + { + "extra": { + "cause": "Network-dependent (curl connection failures, rc=7)", + "statuses": [ + "failure", + "failure", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_formatting_json.test_empty_json_object", "reasons": [ @@ -706,6 +742,17 @@ } ] }, + { + "name": "tests.test_formatting_json.test_escape_sequences_in_json_strings", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_formatting_json.test_html_passthrough", "reasons": [ @@ -724,6 +771,24 @@ } ] }, + { + "name": "tests.test_formatting_json.test_literals_null_true_false_colorization", + "reasons": [ + { + "extra": { + "cause": "Network-dependent (curl connection failures, rc=7)", + "statuses": [ + "failure", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + 
] + }, { "name": "tests.test_formatting_json.test_malformed_json_extra_closing_brace_negative_level_guard", "reasons": [ @@ -732,6 +797,24 @@ } ] }, + { + "name": "tests.test_formatting_json.test_no_pretty_flag_no_formatting", + "reasons": [ + { + "extra": { + "cause": "Network-dependent (curl connection failures, rc=7)", + "statuses": [ + "failure", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_formatting_json.test_numbers_formatting", "reasons": [ @@ -742,6 +825,52 @@ } ] }, + { + "name": "tests.test_formatting_json.test_plain_text_passthrough", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_formatting_json.test_unicode_characters", + "reasons": [ + { + "extra": { + "fail": 5, + "note": "curl connection race (exit 7) against local test server", + "pass": 15, + "pattern": "pffpppppppfppppfppfp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_formatting_json.test_whitespace_tabs_and_carriage_returns", + "reasons": [ + { + "extra": { + "cause": "Network-dependent (curl connection failures, rc=7)", + "statuses": [ + "passed", + "failure", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_smoke.test_binary_exists", "reasons": [ diff --git a/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json b/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json index c8460e0..e9ff58a 100644 --- a/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json +++ 
b/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json @@ -1583,7 +1583,30 @@ "eval.tests.test_logging.test_log_info_has_two_lines", "eval.tests.test_logging.test_log_off_produces_no_stderr" ], - "ignored_tests": [] + "ignored_tests": [ + { + "name": "eval.tests.test_cli_help_version.test_version_exact", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "eval.tests.test_generation_outputs.test_generate_default_files_and_content_hashes", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Fails on v6 gold: output reflects the pinned toolchain / dependency versions (e.g. Go compiler diagnostic wording, stdlib net behavior, or generated-code content); the golden captured the previous build environment's output.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + } + ] }, "0e234d2b8eef": { "ignored": false, diff --git a/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json b/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json index 1d9ddff..144f0c5 100644 --- a/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json +++ b/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json @@ -372,6 +372,17 @@ } ] }, + { + "name": "tests.test_edge_cases.test_ipv4_all_zeros_resolves_to_localhost", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] Fails on v6 gold: output reflects the pinned toolchain / dependency versions (e.g. 
Go compiler diagnostic wording, stdlib net behavior, or generated-code content); the golden captured the previous build environment's output.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_edge_cases.test_ipv6_localhost", "reasons": [ @@ -424,6 +435,50 @@ } ] }, + { + "name": "tests.test_flags.test_version_format", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_flags.test_version_ignores_other_flags", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_flags.test_version_long_flag", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_flags.test_version_short_flag", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. 
embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_ping.test_ping_count_50_localhost", "reasons": [ diff --git a/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json b/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json index ae59415..25c3644 100644 --- a/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json +++ b/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json @@ -22592,6 +22592,17 @@ } ] }, + { + "name": "tests.test_harvest_e_createtable.test_e_createtable_t4_15_0@db_e_createtable", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_harvest_e_createtable.test_e_createtable_t4_18_3@db_e_createtable", "reasons": [ @@ -33763,6 +33774,23 @@ } ] }, + { + "name": "tests.test_harvest_pragma4.test_pragma4_t4_1_1@db_pragma4", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "test-isolation race: pre-existing table/view from dirty shared DB state (table/view already exists)", + "pass": 19, + "pattern": "pppppppppfpppppppppp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_harvest_pragma4.test_pragma4_t4_1_2@db_pragma4", "reasons": [ @@ -34896,6 +34924,23 @@ } ] }, + { + "name": "tests.test_harvest_rowvalue7.test_rowvalue7_t1_1", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "test-isolation race: pre-existing table/view from dirty shared DB state (table/view already exists)", + "pass": 19, + "pattern": "pppppppppppppppppfpp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": 
"gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_harvest_rowvalue7.test_rowvalue7_t1_2@db_rowvalue7", "reasons": [ @@ -41767,6 +41812,23 @@ } ] }, + { + "name": "tests.test_harvest_window1.test_window1_t73_0", + "reasons": [ + { + "extra": { + "fail": 2, + "note": "test-isolation race: pre-existing table/view from dirty shared DB state (table/view already exists)", + "pass": 18, + "pattern": "pppppppppppfpppppppf", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_harvest_window1.test_window1_t73_1", "reasons": [ diff --git a/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json b/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json index 0cd75ab..b22a124 100644 --- a/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json +++ b/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json @@ -1645,6 +1645,17 @@ "id": "gold_fail" } ] + }, + { + "name": "tests.test_retriever.test_insecure_flag_bypasses_cert_validation", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] HTTP-client error wording differs from golden ('error decoding response body'); reqwest version drift (test also depends on external https://self-signed.badssl.com). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] } ] }, diff --git a/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json b/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json index 75914cd..5f06b78 100644 --- a/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json +++ b/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json @@ -243,6 +243,23 @@ "tests.test_tui_extended.test_weblog_module_nginx_access_log" ], "ignored_tests": [ + { + "name": "tests.test_modules_misc.test_rkhunter_rootkit_scanning", + "reasons": [ + { + "extra": { + "fail": 1, + "note": "randomized activity-simulator output: expected task line absent in a random run", + "pass": 19, + "pattern": "pppppppppppppppppppf", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_modules_security.test_docker_image_rm_shows_digest_after_tag", "reasons": [ diff --git a/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json b/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json index f128c4b..d862b4a 100644 --- a/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json +++ b/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json @@ -1740,6 +1740,17 @@ } ] }, + { + "name": "tests.test_help_system.test_help_flag_basic_usage", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_build_stamp] Usage string embeds the build git-hash (4.0-8ad2d43) differing from golden (4.0-07efb63). 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_hotkeys.test_hotkey_C_clear_bookmarks", "reasons": [ diff --git a/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json b/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json index 291b5ba..5bb3dff 100644 --- a/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json +++ b/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json @@ -1348,6 +1348,17 @@ } ] }, + { + "name": "tests.test_global_args.test_double_dash_separator", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_global_args.test_duplicate_languages_accepted", "reasons": [ @@ -1404,6 +1415,17 @@ } ] }, + { + "name": "tests.test_global_args.test_unknown_subcommand", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_global_args.test_version_flags_produce_identical_output", "reasons": [ @@ -1712,6 +1734,83 @@ } ] }, + { + "name": "tests.test_repo_paths.test_langs_all_supported_languages", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": 
"tests.test_repo_paths.test_langs_case_insensitive", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_repo_paths.test_langs_duplicate_entries", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_repo_paths.test_langs_empty_value_tries_to_start", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_repo_paths.test_langs_with_spaces_in_names", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_repo_paths.test_multiple_langs_flags_are_combined", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": 
"tests.test_repo_paths.test_no_args_requires_tty", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_scoring_ranking.test_consistency_streaks_stored_as_json", "reasons": [ @@ -2044,6 +2143,23 @@ } ] }, + { + "name": "tests.test_tui_loading.test_current_directory_implicit_path", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_loading.test_difficulty_wrapping_from_hard_to_easiest", "reasons": [ @@ -2054,6 +2170,74 @@ } ] }, + { + "name": "tests.test_tui_loading.test_loading_completes_and_reaches_title_screen", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_loading.test_multiple_repos_same_session_prevents_conflicts", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": 
"tests.test_tui_loading.test_title_screen_difficulty_change_right_arrow", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_loading.test_version_check_exit_with_escape", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_loading.test_version_check_screen_appears_on_launch", "reasons": [ @@ -2062,6 +2246,23 @@ } ] }, + { + "name": "tests.test_tui_loading.test_version_check_skip_with_space", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_records.test_records_all_filter_states", "reasons": [ @@ -2080,6 +2281,40 @@ } ] }, + { + "name": "tests.test_tui_records.test_records_escape_returns_to_title", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, + { + "name": 
"tests.test_tui_records.test_records_screen_access_from_title", + "reasons": [ + { + "extra": { + "fail": 11, + "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)", + "pass": 9, + "pattern": "ppppppffffffffpfpffp", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_records.test_records_sort_cycling", "reasons": [ diff --git a/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json b/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json index 43a6329..0f95d60 100644 --- a/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json +++ b/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json @@ -776,6 +776,17 @@ } ] }, + { + "name": "eval.tests.test_cli_operations.test_ignore_errors_flag", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_external_resource] Inlined external resource (base64 data-URI) differs from golden; monolith fetches and embeds web content, so the result depends on a non-reproducible external page. 
20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "eval.tests.test_cookies.test_case_insensitive_domain_matching", "reasons": [ diff --git a/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json b/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json index 3671bb4..b3dc858 100644 --- a/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json +++ b/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json @@ -923,6 +923,182 @@ } ] }, + { + "name": "tests.test_config_color_parsing.test_parse_color_all_named_colors", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_bold_red", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_bright_format", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_gray_max_value", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs 
(timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_gray_min_value", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_gray_scale", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_index_format", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_index_max_value", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_invalid_returns_none", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 
independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_named_black", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_rgb_all_max", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_rgb_all_zeros", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_rgb_format", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_config_color_parsing.test_parse_color_whitespace_trimming", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 
independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_replace_errors.test_binary_file_skipped", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_replace_errors.test_case_sensitive_edge_cases", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_replace_errors.test_empty_file_handling", "reasons": [ @@ -941,6 +1117,50 @@ } ] }, + { + "name": "tests.test_replace_errors.test_files_with_special_characters_in_names", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_replace_errors.test_newline_only_file_handling", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_replace_errors.test_readonly_file_handling", + "reasons": [ + { + "id": "gold_flaky", + "note": 
"Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_replace_errors.test_symlink_handling", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_replace_errors.test_very_large_file", "reasons": [ @@ -961,6 +1181,17 @@ } ] }, + { + "name": "tests.test_search_advanced.test_search_simple_mode", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_search_advanced.test_search_with_special_regex_chars", "reasons": [ @@ -969,6 +1200,23 @@ } ] }, + { + "name": "tests.test_tui_basic.test_tab_navigation_between_panes", + "reasons": [ + { + "extra": { + "fail": 19, + "note": "TUI Search/Preview pane snapshot capture timing race", + "pass": 1, + "pattern": "pfffffffffffffffffff", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_dialogs.test_notification_disappears_after_duration", "reasons": [ @@ -985,6 +1233,17 @@ } ] }, + { + "name": "tests.test_tui_preview.test_preview_pane_displays_sample1_with_multiple_matches", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs 
(timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_preview.test_preview_pane_displays_unicode_content", "reasons": [ @@ -1009,6 +1268,24 @@ } ] }, + { + "name": "tests.test_tui_preview.test_preview_pane_goto_top_with_g", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (preview-pane navigation)", + "statuses": [ + "passed", + "passed", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_preview.test_preview_pane_navigate_backward_with_k", "reasons": [ @@ -1027,6 +1304,24 @@ } ] }, + { + "name": "tests.test_tui_preview.test_preview_pane_navigate_to_first_match_with_j", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (preview-pane navigation)", + "statuses": [ + "failure", + "passed", + "passed" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_replace.test_cancel_empty_replacement", "reasons": [ @@ -1047,6 +1342,28 @@ } ] }, + { + "name": "tests.test_tui_results.test_empty_results_list", + "reasons": [ + { + "id": "gold_flaky", + "note": "TUI Search/Preview snapshot capture timing race. 
caught in postfix gold rerun (21st run); passed 20/20 in 20x sweep", + "timestamp": 1781500166, + "user": "kilian" + } + ] + }, + { + "name": "tests.test_tui_results.test_file_deletion_with_d_key", + "reasons": [ + { + "id": "gold_flaky", + "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.", + "timestamp": 1749700000, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_results.test_multiple_deletions_in_sequence", "reasons": [ @@ -1107,6 +1424,23 @@ } ] }, + { + "name": "tests.test_tui_search.test_search_tab_navigation_to_results", + "reasons": [ + { + "extra": { + "fail": 19, + "note": "TUI Search/Preview pane snapshot capture timing race", + "pass": 1, + "pattern": "pfffffffffffffffffff", + "source": "20x gold cloud eval 2026-06-13" + }, + "id": "gold_flaky", + "timestamp": 1781326815, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_suspend_resume.TestTuiSuspendResume.test_resume_reinitializes_terminal", "reasons": [ @@ -1123,6 +1457,17 @@ } ] }, + { + "name": "tests.test_tui_workflows.test_empty_search_results_handling", + "reasons": [ + { + "id": "gold_flaky", + "note": "TUI snapshot capture timing race (empty captured frame). 
caught in postfix gold rerun (21st run); passed 20/20 in 20x sweep", + "timestamp": 1781500166, + "user": "kilian" + } + ] + }, { "name": "tests.test_tui_workflows.test_large_file_many_matches_navigation", "reasons": [ diff --git a/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json b/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json index 56dde8a..5b9cd8b 100644 --- a/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json +++ b/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json @@ -764,6 +764,24 @@ } ] }, + { + "name": "tests.test_parsing.test_readable_data_comment_visualization", + "reasons": [ + { + "extra": { + "cause": "Flaky TUI render timing (flamegraph parse/render)", + "statuses": [ + "passed", + "failure", + "failure" + ] + }, + "id": "gold_flaky", + "timestamp": 1781197573, + "user": "kilian" + } + ] + }, { "name": "tests.test_parsing.test_recursive_stacks", "reasons": [ diff --git a/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json b/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json index c173b96..171168b 100644 --- a/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json +++ b/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json @@ -3335,6 +3335,17 @@ } ] }, + { + "name": "tests.test_list_format.test_format_oneline", + "reasons": [ + { + "id": "gold_fail", + "note": "[gold_fail_v6_toolchain] list --format=oneline output deterministically differs from captured golden (identical across all 20 rounds; gold output drift). 20x gold cloud eval 2026-06-13, failed 20/20 rounds", + "timestamp": 1781487387, + "user": "kilian" + } + ] + }, { "name": "tests.test_list_format.test_template_format_date_helper", "reasons": [