From 42f7dc6a2bfe03970489dc5421fac4de68837716 Mon Sep 17 00:00:00 2001
From: Kilian Lieret <klieret@meta.com>
Date: Thu, 18 Jun 2026 02:28:30 +0000
Subject: [PATCH] Data: sync test-ignore updates from internal reference

Port the June 2026 test-ignore updates from the internal reference harness
into ProgramBench across 41 instances (+270 ignored tests total), keeping all
metadata verbatim, and update CLAUDE.md accordingly.

Branch-hash keys and per-branch test name-sets are unchanged, so no test
blob re-upload is needed; only ignore metadata differs. The
testorg__calculator fixture is untouched.

Internal-reference-commit: 1ca7719a46b9fc521af2353e287b4f07f4c071ea
Internal-reference-commit: a8f153ad485daf95e2041b5e872b5b715b7f5410
Internal-reference-commit: b6466b9e33c2abd88ce8ca06037c876b1edcc864
Internal-reference-commit: d7f13854873fd23b60606729d649f31a4dc7d7ac
Internal-reference-commit: 7b3a00e057cc625c86bcb9988a439ae50adb0828
Internal-reference-commit: 83488589c5237b4b9af904e1809997df735106f0
Internal-reference-commit: 3bbf4632e7d5e978b817bc168fa0897d0bbc2161
Internal-reference-commit: 691dcaa75a89d8d6f22fc7253c641375df9c3bdf
---
 CLAUDE.md                                     |  20 +
 .../tasks/antonmedv__walk.bf802ef/tests.json  | 754 ++++++++++++++++++
 .../ast-grep__ast-grep.dde0fe0/tests.json     |  87 ++
 .../bensadeh__tailspin.6278437/tests.json     |  11 +
 .../burntsushi__ripgrep.3b7fd44/tests.json    |  35 +
 .../tasks/canop__broot.d6c798e/tests.json     |  11 +
 .../data/tasks/chirlu__sox.42b3557/tests.json |  66 ++
 .../dandavison__delta.acd758f/tests.json      |  11 +
 .../tasks/duckdb__duckdb.bdb65ec/tests.json   |  17 +
 .../tasks/ekzhang__bore.8e059cd/tests.json    |  11 +
 .../tasks/elkowar__pipr.fae0b17/tests.json    |  11 +
 .../tasks/ffmpeg__ffmpeg.360a402/tests.json   | 105 +++
 .../tasks/gromacs__gromacs.665ea4c/tests.json | 121 +++
 .../data/tasks/hatoo__oha.8dc6349/tests.json  |  91 +++
 .../tasks/htop-dev__htop.523600b/tests.json   |  11 +
 .../tasks/isona__dirble.e2dea9f/tests.json    |  52 ++
 .../data/tasks/jgm__pandoc.5caad90/tests.json | 154 ++++
 .../tasks/jrnxf__thokr.09375ef/tests.json     |  80 ++
 .../tasks/junegunn__fzf.b56d614/tests.json    |  22 +
 .../kisielk__errcheck.dacab89/tests.json      |  11 +
 .../tasks/kyoheiu__felix.95df390/tests.json   |  11 +
 .../tasks/mkj__dropbear.75f699b/tests.json    | 192 +++++
 .../nikolassv__bartib.6b9b5ce/tests.json      |  11 +
 .../data/tasks/noborus__ov.b96c2ba/tests.json | 572 +++++++++++++
 .../data/tasks/orf__gping.26eb5b9/tests.json  |  17 +
 .../data/tasks/peco__peco.4e58dad/tests.json  | 144 +++-
 .../data/tasks/pls-rs__pls.4e1ae50/tests.json |  11 +
 .../tasks/raviqqe__muffet.a882908/tests.json  |  17 +
 .../rcoh__angle-grinder.9c2fc88/tests.json    |  36 +
 .../rhysd__kiro-editor.4157485/tests.json     |  94 +++
 .../data/tasks/rs__curlie.5dfcbb1/tests.json  | 129 +++
 .../tests.json                                |  25 +-
 .../tasks/sheepla__pingu.926d475/tests.json   |  55 ++
 .../tasks/sqlite__sqlite.839433d/tests.json   |  62 ++
 .../tests.json                                |  11 +
 .../svenstaro__genact.16f96e3/tests.json      |  17 +
 .../tasks/tstack__lnav.ee34494/tests.json     |  11 +
 .../unhappychoice__gittype.34b72d0/tests.json | 235 ++++++
 .../tasks/y2z__monolith.8702e66/tests.json    |  11 +
 .../yassinebridi__serpl.c48a9d7/tests.json    | 345 ++++++++
 .../tasks/ys-l__flamelens.0b4dc33/tests.json  |  18 +
 .../data/tasks/zk-org__zk.10d93d5/tests.json  |  11 +
 42 files changed, 3714 insertions(+), 2 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 5aaee0d..0dc1596 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,6 +4,26 @@
 
 ProgramBench evaluates whether LM-based SWE-agents can reverse-engineer black-box software systems. The workflow: take an open-source CLI tool (mostly Rust/Go), compile it into a Docker image with source removed, then have an LM agent re-implement it from scratch by interacting only with the binary. Behavioral tests (also LM-generated) score the re-implementation.
 
+## Test ignore reasons
+
+Some behavioral tests are unreliable and are excluded from scoring. Each excluded
+test is recorded under `branches.<hash>.ignored_tests[]` in a task's `tests.json`,
+with one or more `reasons[]` entries whose `id` explains why. All ignored tests are
+excluded from scoring regardless of reason; the `id` is informational.
+
+- `gold_fail` — test fails **deterministically** on the reference (gold) solution, so it
+  is defective rather than discriminating. Also covers golden-output drift (the gold
+  binary is correct but the captured golden is stale/non-reproducible relative to the
+  build toolchain, an embedded build-stamp, or an external resource).
+- `gold_flaky` — test is **non-deterministic** on the gold solution: it passes in some
+  runs and fails in others. These are timing/race/network/TUI-snapshot flakes, not real
+  defects (distinct from the deterministic `gold_fail`).
+- `dummy_pass` — test passes even on a trivial/dummy executable, so it fails to
+  distinguish a real implementation from a stub.
+- `outcome_dependent_presence` — test appears in some eval runs but not others.
+- `slow_or_hang` — test hangs mid-call or exceeds a duration threshold.
+- `ignored_manual` — manually excluded.
+
 ## Quick reference
 
 ```bash
diff --git a/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json b/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json
index 9c188e1..4a614d8 100644
--- a/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json
+++ b/src/programbench/data/tasks/antonmedv__walk.bf802ef/tests.json
@@ -1164,6 +1164,42 @@
         "tests.test_tui_search.test_typing_in_search_mode"
       ],
       "ignored_tests": [
+        {
+          "name": "tests.test_cli_config.test_dir_only_flag_shows_only_directories",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_config.test_dir_only_with_icons_combines_both_features",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_cli_config.test_empty_directory_shows_no_files_message",
           "reasons": [
@@ -1184,6 +1220,96 @@
             }
           ]
         },
+        {
+          "name": "tests.test_cli_config.test_icons_flag_enables_icons",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_config.test_multiple_flags_together",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_config.test_path_argument_starts_in_specified_directory",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_config.test_path_argument_with_flags",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_config.test_preview_flag_enables_preview_mode",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_cli_config.test_preview_mode_empty_directory",
           "reasons": [
@@ -1192,6 +1318,42 @@
             }
           ]
         },
+        {
+          "name": "tests.test_cli_config.test_walk_main_color_env_changes_cursor_color",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_config.test_walk_no_highlight_env_disables_syntax_highlighting",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_cli_config.test_with_border_flag_adds_border_in_preview",
           "reasons": [
@@ -1212,6 +1374,60 @@
             }
           ]
         },
+        {
+          "name": "tests.test_edge_cases.test_empty_file_preview",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_edge_cases.test_hidden_files_shown_by_default",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_edge_cases.test_large_directory_500_files",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_edge_cases.test_owner_function_basic",
           "reasons": [
@@ -1376,6 +1592,96 @@
             }
           ]
         },
+        {
+          "name": "tests.test_edge_cases.test_special_characters_in_filenames",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_edge_cases.test_symlinks_display_in_listing",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_edge_cases.test_tab_character_in_filename",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_edge_cases.test_unicode_filenames_display_correctly",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_edge_cases.test_very_long_filenames_display",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_gap_main.test_dir_only_flag",
           "reasons": [
@@ -1600,6 +1906,330 @@
             }
           ]
         },
+        {
+          "name": "tests.test_icons.test_archive_extensions_show_zip_icon",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_files_without_extension_show_executable_or_generic_icon",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_hidden_files_still_show_extension_icons",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_icons_disabled_shows_no_icons",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_media_files_show_appropriate_icons",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_programming_language_extensions_show_correct_icons",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_specific_filenames_get_special_icons",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_unknown_extensions_show_generic_file_icon",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_uppercase_extensions_normalized_to_lowercase",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_icons.test_wildcard_filename_patterns_match_correctly",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_animated_gif_renders_first_frame",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_corrupted_jpeg_shows_error",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_corrupted_png_shows_error",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_file_with_no_extension_not_treated_as_image",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_gif_with_mixed_case_extension",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_jpeg_with_uppercase_extension",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_png_image_preview",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_image_preview.test_subdirectory_navigation_image_preview",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_image_preview.test_symlink_to_image_renders",
           "reasons": [
@@ -1608,6 +2238,96 @@
             }
           ]
         },
+        {
+          "name": "tests.test_image_preview.test_text_file_not_treated_as_image",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_navigation.test_backspace_exits_directory",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_navigation.test_initial_directory_listing",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_navigation.test_multicolumn_layout",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_navigation.test_toggle_hidden_files",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI/listing output: an extra size line (e.g. '20B') appears nondeterministically",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_operations.test_delete_empty_directory",
           "reasons": [
@@ -1624,6 +2344,40 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_operations.test_multiple_delete_undo_cycles",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "TUI delete/undo temp-file existence race (filesystem state non-deterministic under tui2cli capture)",
+                "pass": 19,
+                "pattern": "ppppppppppfppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_operations.test_multiple_pending_deletions_stacked",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "TUI delete/undo temp-file existence race (filesystem state non-deterministic under tui2cli capture)",
+                "pass": 19,
+                "pattern": "ppppppppppfppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_operations.test_undo_most_recent_deletion_only",
           "reasons": [
diff --git a/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json b/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json
index c7c6d3c..a24bd2d 100644
--- a/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json
+++ b/src/programbench/data/tasks/ast-grep__ast-grep.dde0fe0/tests.json
@@ -981,6 +981,40 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_language_features.test_mixed_js_and_css_injections_in_html",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "transient nondeterminism in HTML multi-language injection matching (passes 19/20)",
+                "pass": 19,
+                "pattern": "pfpppppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "eval.tests.test_language_features.test_mixed_typescript_and_javascript_in_html",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "transient nondeterminism in HTML multi-language injection matching (passes 19/20)",
+                "pass": 19,
+                "pattern": "ppppppppppppppfppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_language_features.test_multiple_js_script_tags_in_html",
           "reasons": [
@@ -991,6 +1025,41 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_language_features.test_plain_js_script_not_matched_as_typescript",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "transient nondeterminism in HTML multi-language injection matching (passes 19/20)",
+                "pass": 19,
+                "pattern": "ppppppppppppppfppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "eval.tests.test_language_features.test_typescript_type_alias_in_html",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Nondeterministic analysis/diagnostic ordering (parallel rule evaluation)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_multiple_fixers.test_three_fixers_json_consistent_replacement",
           "reasons": [
@@ -1011,6 +1080,24 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_parameterized_advanced.test_error_cyclic_dependency",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Nondeterministic analysis/diagnostic ordering (parallel rule evaluation)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_parameterized_advanced.test_parameterized_utility_with_stopBy",
           "reasons": [
diff --git a/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json b/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json
index d09be48..6ce26be 100644
--- a/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json
+++ b/src/programbench/data/tasks/bensadeh__tailspin.6278437/tests.json
@@ -369,6 +369,17 @@
               "id": "dummy_pass"
             }
           ]
+        },
+        {
+          "name": "eval.tests.test_follow_mode.test_follow_flag_short",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
         }
       ]
     },
diff --git a/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json b/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json
index b542fc4..b0dd8ee 100644
--- a/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json
+++ b/src/programbench/data/tasks/burntsushi__ripgrep.3b7fd44/tests.json
@@ -5315,6 +5315,23 @@
         "tests.test_walk_errors.test_whitespace_only_gitignore"
       ],
       "ignored_tests": [
+        {
+          "name": "tests.test_cli_utils.test_max_filesize_bytes_format",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "directory-listing order nondeterminism (large_file/small_file output order varies)",
+                "pass": 19,
+                "pattern": "pppfpppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_encoding.test_short_encoding_flag_syntax",
           "reasons": [
@@ -6127,6 +6144,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_vimgrep.test_output_mode_no_heading_multifile",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Nondeterministic multifile output ordering (parallel directory walk)",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_walk_errors.test_files_with_no_read_permission_as_non_root",
           "reasons": [
diff --git a/src/programbench/data/tasks/canop__broot.d6c798e/tests.json b/src/programbench/data/tasks/canop__broot.d6c798e/tests.json
index b219add..1328eeb 100644
--- a/src/programbench/data/tasks/canop__broot.d6c798e/tests.json
+++ b/src/programbench/data/tasks/canop__broot.d6c798e/tests.json
@@ -1352,6 +1352,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_panel_state.test_toggle_sizes_displays_file_sizes",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_panel_state.test_trash_panel_state",
           "reasons": [
diff --git a/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json b/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json
index 81319b2..cd524cc 100644
--- a/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json
+++ b/src/programbench/data/tasks/chirlu__sox.42b3557/tests.json
@@ -1334,6 +1334,72 @@
             }
           ]
         },
+        {
+          "name": "tests.test_cli_options.test_buffer_option_missing_argument",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_options.test_help_effect_nonexistent",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_options.test_help_format_nonexistent",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_options.test_help_output_complete",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_options.test_invalid_option",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_cli_options.test_no_input_files",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_cli_options.test_reproducible_random",
           "reasons": [
diff --git a/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json b/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json
index 8faf9a3..ab0c878 100644
--- a/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json
+++ b/src/programbench/data/tasks/dandavison__delta.acd758f/tests.json
@@ -1269,6 +1269,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_grep_gaps.test_git_grep_before_context_only",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_grep_gaps.test_git_grep_classic_output_type_override",
           "reasons": [
diff --git a/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json b/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json
index 384f1d4..cd2019d 100644
--- a/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json
+++ b/src/programbench/data/tasks/duckdb__duckdb.bdb65ec/tests.json
@@ -35227,6 +35227,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_harvest_sql_2.test_flatten",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "SQL result row-order nondeterminism (no ORDER BY): flatten rows reorder",
+                "pass": 19,
+                "pattern": "ppppppppppppppppppfp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_harvest_sql_2.test_format_bytes",
           "reasons": [
diff --git a/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json b/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json
index e04ff38..14ad167 100644
--- a/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json
+++ b/src/programbench/data/tasks/ekzhang__bore.8e059cd/tests.json
@@ -529,6 +529,17 @@
               "id": "gold_fail"
             }
           ]
+        },
+        {
+          "name": "tests.test_harvest.test_basic_proxy[None]@server_tests",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
         }
       ]
     },
diff --git a/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json b/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json
index 12b2c35..06ef978 100644
--- a/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json
+++ b/src/programbench/data/tasks/elkowar__pipr.fae0b17/tests.json
@@ -2546,6 +2546,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_command_list_window.test_history_enter_selection",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_command_list_window.test_history_escape_closes_window",
           "reasons": [
diff --git a/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json b/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json
index 55ec360..752e269 100644
--- a/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json
+++ b/src/programbench/data/tasks/ffmpeg__ffmpeg.360a402/tests.json
@@ -3890,6 +3890,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_cmdutils_deep.test_list_encoders_complete",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_cmdutils_gaps.test_codecs_list_complete",
           "reasons": [
@@ -12453,6 +12464,50 @@
             }
           ]
         },
+        {
+          "name": "tests.test_help_info.test_codecs",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_help_info.test_decoders",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_help_info.test_encoders",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_help_info.test_help_full",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_logging.test_default_help_shows_banner",
           "reasons": [
@@ -12565,6 +12620,39 @@
             }
           ]
         },
+        {
+          "name": "tests.test_opt_common_final.test_show_codecs_list",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_opt_common_final.test_show_decoders_list",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_opt_common_final.test_show_encoders_list",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_featureset] Fails on v6 gold: the reproducible build (e.g. ffmpeg --disable-autodetect) produces a deterministic but different optional-feature/codec/help-list set than the autodetect build the golden was captured against.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_option_parsing.test_noautorotate_boolean_negation",
           "reasons": [
@@ -12584,6 +12672,23 @@
               "user": "kilian"
             }
           ]
+        },
+        {
+          "name": "tests.test_textformat_gaps.test_default_format_noprint_wrappers_option",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "test-resource setup race: input.mp4 not present yet when test runs under pytest-xdist",
+                "pass": 19,
+                "pattern": "ppppfppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
         }
       ]
     },
diff --git a/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json b/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json
index 92cf950..45060b0 100644
--- a/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json
+++ b/src/programbench/data/tasks/gromacs__gromacs.665ea4c/tests.json
@@ -2314,6 +2314,127 @@
             }
           ]
         },
+        {
+          "name": "tests.test_structure.test_gyrate_protein_basic",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_gyrate_weighting_modes",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rms_backbone",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rms_calpha_basic",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rms_fit_translation",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rms_mirror_image",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rms_nofit",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rms_time_range",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rms_what_rho",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rmsf_calpha_basic",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_structure.test_rmsf_residue_averaging",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tcaf_trjorder.test_tcaf_cubic_averaging_optional",
           "reasons": [
diff --git a/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json b/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json
index ffeaf24..c100ea8 100644
--- a/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json
+++ b/src/programbench/data/tasks/hatoo__oha.8dc6349/tests.json
@@ -1815,6 +1815,28 @@
             }
           ]
         },
+        {
+          "name": "tests.test_output.test_json_output_details_connection_time_relationships",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_output.test_json_output_details_connection_times",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_output.test_json_output_error_distribution_empty_on_success",
           "reasons": [
@@ -1857,6 +1879,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_output.test_json_output_status_code_distribution",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_output.test_json_output_with_higher_request_count",
           "reasons": [
@@ -1873,6 +1906,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_output.test_json_output_with_single_connection",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_output.test_json_schema_validation",
           "reasons": [
@@ -2105,6 +2149,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_timescale.test_time_unit_precision_formatting",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Summary emits fewer 4-decimal time values than the golden expects (oha output-format/version drift). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_timescale.test_time_unit_seconds_text_output",
           "reasons": [
@@ -2243,6 +2298,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui.test_tui_multiple_status_codes",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (live progress / HTTP-load output)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui.test_tui_with_high_concurrency",
           "reasons": [
@@ -2252,6 +2325,24 @@
               "user": "kilian"
             }
           ]
+        },
+        {
+          "name": "tests.test_tui.test_tui_with_post_method",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (live progress / HTTP-load output)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
         }
       ]
     },
diff --git a/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json b/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json
index fb04388..fbd6acf 100644
--- a/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json
+++ b/src/programbench/data/tasks/htop-dev__htop.523600b/tests.json
@@ -5184,6 +5184,17 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_ultra_intensive.test_ultra_tree_all_configs",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_update_control.test_delay_flag",
           "reasons": [
diff --git a/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json b/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json
index fca7da9..a2ea0c9 100644
--- a/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json
+++ b/src/programbench/data/tasks/isona__dirble.e2dea9f/tests.json
@@ -2460,6 +2460,24 @@
         "tests.test_validator.test_validator_with_length_based_detection"
       ],
       "ignored_tests": [
+        {
+          "name": "tests.test_edge_cases.test_uri_file_with_mixed_valid_invalid_urls",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Network-dependent (external HTTP/proxy, e.g. httpbin / ipv6 localhost)",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_edge_cases.test_url_with_encoded_spaces",
           "reasons": [
@@ -3320,6 +3338,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_output_formats.test_default_text_output_format",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "local HTTP test-server connection race (curl: Could not connect to 127.0.0.1:8765)",
+                "pass": 19,
+                "pattern": "ppppppppppppppppfppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_output_formats.test_hide_lengths_affects_all_output_formats",
           "reasons": [
@@ -3400,6 +3435,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_output_formats.test_show_htaccess_flag",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 2,
+                "note": "local HTTP test-server connection race (curl: Could not connect to 127.0.0.1:8765)",
+                "pass": 18,
+                "pattern": "pppppppppppppfppfppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_output_formats.test_silent_mode_no_live_output",
           "reasons": [
diff --git a/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json b/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json
index f8ec440..c72016d 100644
--- a/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json
+++ b/src/programbench/data/tasks/jgm__pandoc.5caad90/tests.json
@@ -5529,6 +5529,39 @@
         "tests.test_zip_output.test_directory_extraction_preserves_all_files"
       ],
       "ignored_tests": [
+        {
+          "name": "tests.test_asciidoc_rst_typst.test_asciidoc_attributes_and_metadata",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_asciidoc_rst_typst.test_rst_to_json_ast_structure",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_asciidoc_rst_typst.test_typst_metadata_extraction",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_citations.test_bibliography_entry_ids",
           "reasons": [
@@ -5689,6 +5722,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_encoding_i18n.test_unicode_preservation_in_json_format",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_error_handling.test_medium_large_file_processing",
           "reasons": [
@@ -5699,6 +5743,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_error_paths.test_json_incompatible_api_version",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_filesystem_ops.test_binary_output_format_pdf_requires_output_file",
           "reasons": [
@@ -5719,6 +5774,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_filesystem_ops.test_output_json_format_produces_parseable_json",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_filesystem_ops.test_output_same_as_input_file_handles_atomicity",
           "reasons": [
@@ -5729,6 +5795,39 @@
             }
           ]
         },
+        {
+          "name": "tests.test_filters_ast_advanced.test_filter_complex_metadata_structure",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_filters_ast_advanced.test_json_ast_filter_transformation",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_format_matrix.test_markdown_to_json_ast_lossless_machine_readable",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_harvest_cmd_batch1.test_cmd_10094",
           "reasons": [
@@ -7404,6 +7503,28 @@
             }
           ]
         },
+        {
+          "name": "tests.test_man_format.test_man_only_title_header",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_man_format.test_man_to_json_ast",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_markdown_headings_lists.test_markdown_headings_roundtrip_atx",
           "reasons": [
@@ -7424,6 +7545,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_org_opml_pod.test_pod_to_json_ast",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_pdf.test_pdf_large_document",
           "reasons": [
@@ -7478,6 +7610,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_rtf_advanced.test_rtf_to_json_ast_structure",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_signal_handling.test_binary_stdin_rejection",
           "reasons": [
@@ -7672,6 +7815,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tsv_tables.test_tsv_to_json_ast",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Gold emits pandoc-api-version [1,23,1,2]; goldens captured against [1,23,1,1] (pandoc-types library version drift in the reproducible build). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_variable_json.test_variable_json_boolean_false",
           "reasons": [
diff --git a/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json b/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json
index 42ca5f7..afa87fc 100644
--- a/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json
+++ b/src/programbench/data/tasks/jrnxf__thokr.09375ef/tests.json
@@ -556,6 +556,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_logging.test_timed_test_num_secs_has_value",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Timing-dependent (elapsed-time assertion)",
+                "statuses": [
+                  "passed",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_basic.test_ctrl_c_exits",
           "reasons": [
@@ -566,6 +584,51 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_timed.test_timer_display_format",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_timed.test_timer_not_shown_without_s_flag",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "TUI timer rendering timing race (running/paused timer state non-deterministic in captured frame)",
+                "pass": 19,
+                "pattern": "pfpppppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_timed.test_timer_pauses_on_results_screen",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 13,
+                "note": "TUI timer rendering timing race (running/paused timer state non-deterministic in captured frame)",
+                "pass": 7,
+                "pattern": "ppffffffppfppffffffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_timed.test_timer_with_3_seconds",
           "reasons": [
@@ -576,6 +639,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_timed.test_timer_with_5_seconds",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 5,
+                "note": "TUI timer rendering timing race (running/paused timer state non-deterministic in captured frame)",
+                "pass": 15,
+                "pattern": "fffpppppppfppppppfpp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_timed.test_timer_with_very_long_duration",
           "reasons": [
diff --git a/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json b/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json
index 66245c6..8c36b70 100644
--- a/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json
+++ b/src/programbench/data/tasks/junegunn__fzf.b56d614/tests.json
@@ -4644,6 +4644,17 @@
         "tests.test_fzf.TestSchemeAndScoring.test_path_scheme"
       ],
       "ignored_tests": [
+        {
+          "name": "tests.test_fzf.TestBasicFunctionality.test_version_flag",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_fzf.TestCommandLineParsing.test_short_and_long_flags",
           "reasons": [
@@ -4703,6 +4714,17 @@
         "eval.tests.test_fzf_interactive_tmux.test_interactive_expect_ctrl_j_prints_key_then_selection"
       ],
       "ignored_tests": [
+        {
+          "name": "eval.tests.test_fzf_cli.test_version_exact",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_fzf_interactive_tmux.test_interactive_expect_ctrl_j_prints_key_then_selection",
           "reasons": [
diff --git a/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json b/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json
index e64cb0e..50e3563 100644
--- a/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json
+++ b/src/programbench/data/tasks/kisielk__errcheck.dacab89/tests.json
@@ -1373,6 +1373,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_output.test_exit_code_two_on_fatal_errors",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Fails on v6 gold: output reflects the pinned toolchain / dependency versions (e.g. Go compiler diagnostic wording, stdlib net behavior, or generated-code content); the golden captured the previous build environment's output.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_smoke.test_no_args_runs",
           "reasons": [
diff --git a/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json b/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json
index 6bd466f..80903c5 100644
--- a/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json
+++ b/src/programbench/data/tasks/kyoheiu__felix.95df390/tests.json
@@ -2837,6 +2837,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_layout_calculations.test_narrow_terminal_below_proper_width",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_magic_packed_extended.test_unpack_plain_text_non_archive_error",
           "reasons": [
diff --git a/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json b/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json
index decbde8..dd533ed 100644
--- a/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json
+++ b/src/programbench/data/tasks/mkj__dropbear.75f699b/tests.json
@@ -1651,6 +1651,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_authkey_options_gap.test_malformed_permitopen_non_numeric_port",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 12,
+                "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)",
+                "pass": 8,
+                "pattern": "pffppffpppffffpffpff",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_authkey_options_gap.test_multiple_command_options_rejected",
           "reasons": [
@@ -1661,6 +1678,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_authkey_options_gap.test_option_case_insensitive",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "ssh dbclient connection/host-key-mismatch race (same family as other dropbear flakes). caught in postfix gold rerun (21st run); passed 20/20 in 20x sweep",
+              "timestamp": 1781500166,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_authkey_options_gap.test_unknown_option_causes_rejection",
           "reasons": [
@@ -2021,6 +2049,41 @@
             }
           ]
         },
+        {
+          "name": "tests.test_forwarding_x11_agent.test_agent_socket_actually_exists",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)",
+                "pass": 19,
+                "pattern": "pppppppppppppfpppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_forwarding_x11_agent.test_agent_socket_cleanup_on_disconnect",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "SSH socket/port/host-key race (random ports, key mismatch)",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_forwarding_x11_agent.test_agent_socket_directory_permissions",
           "reasons": [
@@ -2031,6 +2094,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_forwarding_x11_agent.test_agent_socket_fd_number_in_filename",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "SSH socket/port/host-key race (random ports, key mismatch)",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_forwarding_x11_agent.test_agent_socket_is_unix_domain_socket",
           "reasons": [
@@ -2041,6 +2122,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_forwarding_x11_agent.test_agent_socket_path_format",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_forwarding_x11_agent.test_agent_socket_permissions",
           "reasons": [
@@ -2051,6 +2143,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_forwarding_x11_agent.test_agent_socket_random_components",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 3,
+                "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)",
+                "pass": 17,
+                "pattern": "fpppfpppppppfppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_forwarding_x11_agent.test_no_agent_forwarding_option_code_exists",
           "reasons": [
@@ -3141,6 +3250,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_netio_gap.test_client_connection_to_ipv4_only_server",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "SSH socket/port/host-key race (random ports, key mismatch)",
+                "statuses": [
+                  "passed",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_netio_gap.test_client_ipv4_mapped_connection",
           "reasons": [
@@ -3371,6 +3498,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_pty_terminal.test_pty_newline_handling",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)",
+                "pass": 19,
+                "pattern": "ppppppfppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_pty_terminal.test_pty_not_allocated_without_t_flag",
           "reasons": [
@@ -3391,6 +3535,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_pty_terminal.test_pty_read_large_output",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "ssh dbclient connection/host-key-mismatch/socket-bind race (regenerated host keys + port reuse under xdist)",
+                "pass": 19,
+                "pattern": "fppppppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_pty_terminal.test_pty_session_without_shell",
           "reasons": [
@@ -3511,6 +3672,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_pubkey_auth.test_crlf_line_endings",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "SSH socket/port/host-key race (random ports, key mismatch)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_pubkey_auth.test_different_key_types_ecdsa",
           "reasons": [
@@ -4643,6 +4822,19 @@
             },
             {
               "id": "dummy_pass"
+            },
+            {
+              "extra": {
+                "cause": "SSH socket/port/host-key race (random ports, key mismatch)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
             }
           ]
         },
diff --git a/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json b/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json
index 9833522..77e4bfd 100644
--- a/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json
+++ b/src/programbench/data/tasks/nikolassv__bartib.6b9b5ce/tests.json
@@ -1135,6 +1135,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_current_status.test_current_special_characters_in_names",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Table header trailing-whitespace differs by one space from golden (formatter version drift). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_current_status.test_status_aggregation_multiple_same_project",
           "reasons": [
diff --git a/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json b/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json
index 8083c2a..dbc7c34 100644
--- a/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json
+++ b/src/programbench/data/tasks/noborus__ov.b96c2ba/tests.json
@@ -5451,6 +5451,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_edit.test_editor_line_number_placeholder",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_edit.test_temp_file_cleanup_after_edit",
           "reasons": [
@@ -5461,6 +5472,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_exec_mode.test_exec_mode_command_error_captured_in_stderr",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_filter_search_advanced.test_backward_search_from_end",
           "reasons": [
@@ -5481,6 +5503,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_filter_search_advanced.test_invalid_regex_fallback_to_literal",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_filter_search_advanced.test_regex_search_toggle_during_input",
           "reasons": [
@@ -5509,6 +5542,28 @@
             }
           ]
         },
+        {
+          "name": "tests.test_input_header_modes.test_header_column_accepts_numeric_value",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_input_header_modes.test_header_input_empty_string_shows_error",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_input_header_modes.test_header_input_invalid_characters",
           "reasons": [
@@ -5517,6 +5572,28 @@
             }
           ]
         },
+        {
+          "name": "tests.test_input_header_modes.test_header_input_mode_prompt_appears",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_input_header_modes.test_header_input_multiple_digits",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_input_header_modes.test_jump_target_empty_string_no_change",
           "reasons": [
@@ -5527,6 +5604,61 @@
             }
           ]
         },
+        {
+          "name": "tests.test_input_header_modes.test_jump_target_input_mode_prompt_appears",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_input_header_modes.test_jump_target_numeric_value",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_input_modes_advanced.test_filter_mode_up_populates_from_history",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_input_modes_advanced.test_multicolor_confirm_sets_multicolor",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_input_modes_advanced.test_multicolor_up_down_navigation",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_mouse_marks_sidebar.test_mark_multiple_lines_and_navigate",
           "reasons": [
@@ -5537,6 +5669,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_mouse_marks_sidebar.test_mark_single_line",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_mouse_marks_sidebar.test_remove_all_marks",
           "reasons": [
@@ -5555,6 +5698,28 @@
             }
           ]
         },
+        {
+          "name": "tests.test_mouse_marks_sidebar.test_sidebar_document_list",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_mouse_marks_sidebar.test_sidebar_help_display",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_mouse_marks_sidebar.test_sidebar_mark_list_display",
           "reasons": [
@@ -5563,6 +5728,50 @@
             }
           ]
         },
+        {
+          "name": "tests.test_move_leftright_advanced.test_column_left_no_cycle_at_start",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_movement_functions.test_section_navigation_at_boundaries",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_movement_functions.test_section_navigation_no_delimiter",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_movement_functions.test_section_navigation_previous",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_output_modes.test_exit_write_multiple_flags_order_independence",
           "reasons": [
@@ -5603,6 +5812,28 @@
             }
           ]
         },
+        {
+          "name": "tests.test_save_and_converters.test_converter_es_processes_escape_sequences",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_save_and_converters.test_converter_raw_displays_escape_sequences_as_caret_notation",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_save_and_converters.test_raw_flag_equivalent_to_converter_raw",
           "reasons": [
@@ -5662,6 +5893,347 @@
               "id": "gold_fail"
             }
           ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_empty_search_pattern_forward",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_incremental_search_updates",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_invalid_regex_fallback_to_literal",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_multicolor_with_quoted_strings",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_next_search_continuation",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_search_not_found_message",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_search_numeric_only_pattern",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_search_symbol_only_pattern",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_search_wrapping_forward",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_search_edge_cases.test_smart_case_with_uppercase_in_pattern",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_advanced.test_alternate_rows",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_advanced.test_caption_display",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_advanced.test_mark_set_and_message",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_advanced.test_ruler_absolute",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_advanced.test_ruler_disabled",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_advanced.test_status_line_content",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_advanced.test_status_line_disabled",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_control_flow.test_bottom_navigation_triggers_request_bottom",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_control_flow.test_exec_mode_reload_control_reader",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_control_flow.test_follow_mode_request_follow",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_control_flow.test_reload_file_with_f5",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_control_flow.test_search_request_search",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_control_flow.test_watch_mode_periodic_reload",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_input_modes.test_header_column_mode",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_input_modes.test_header_mode_basic",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_input_modes.test_jump_target_mode",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_input_modes.test_mark_goto_mode",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_input_modes.test_save_buffer_requires_non_seekable",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_input_modes.test_section_num_mode",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_input_modes.test_skip_lines_mode",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_navigation.test_basic_file_display",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
         }
       ]
     },
diff --git a/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json b/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json
index 50cd7ef..96c4131 100644
--- a/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json
+++ b/src/programbench/data/tasks/orf__gping.26eb5b9/tests.json
@@ -2781,6 +2781,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_shortcuts.test_aws_region_shortcut_expansion",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 5,
+                "note": "TUI ping-stats rendering timing race (stats not yet shown in captured frame)",
+                "pass": 15,
+                "pattern": "ffppppfpppppppppfppf",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_shortcuts.test_hostname_resolves_to_ipv6_with_flag",
           "reasons": [
diff --git a/src/programbench/data/tasks/peco__peco.4e58dad/tests.json b/src/programbench/data/tasks/peco__peco.4e58dad/tests.json
index fdd4b54..f6d88e2 100644
--- a/src/programbench/data/tasks/peco__peco.4e58dad/tests.json
+++ b/src/programbench/data/tasks/peco__peco.4e58dad/tests.json
@@ -454,6 +454,23 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_basic.test_dash_in_regular_query",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent",
+                "pass": 19,
+                "pattern": "pppfpppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_basic.test_empty_input_with_exit_0_flag",
           "reasons": [
@@ -652,6 +669,23 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_basic.test_print_query_with_different_selection",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent",
+                "pass": 19,
+                "pattern": "pppppppppppppfpppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_basic.test_query_case_insensitive_default",
           "reasons": [
@@ -846,6 +880,23 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_coverage.test_config_with_initial_filter",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 2,
+                "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent",
+                "pass": 18,
+                "pattern": "pppppppppppppppppfpf",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_coverage.test_config_with_invalid_height_format",
           "reasons": [
@@ -1150,6 +1201,23 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_coverage.test_print_query_with_no_match",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 2,
+                "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent",
+                "pass": 18,
+                "pattern": "pppppfppfppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_coverage.test_prompt_with_select_1",
           "reasons": [
@@ -1186,6 +1254,23 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_coverage.test_query_with_emoji",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent",
+                "pass": 19,
+                "pattern": "pppfpppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_coverage.test_query_with_newline",
           "reasons": [
@@ -4871,6 +4956,40 @@
             }
           ]
         },
+        {
+          "name": "tests.test_layout_gap.test_layout_bottom_up_page_up",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent",
+                "pass": 19,
+                "pattern": "ppppfppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_layout_gap.test_layout_cursor_middle_of_query",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "tcell screen-init race (open /dev/tty: no such device) + TUI layout snapshot timing; --select-1 exit code tty-dependent",
+                "pass": 19,
+                "pattern": "ppppfppppppppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_layout_gap.test_layout_few_lines_wrapping",
           "reasons": [
@@ -5696,6 +5815,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_basic.TestSpecialCharacters.test_whitespace_only_lines",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_interactive.TestInteractiveBasic.test_basic_display",
           "reasons": [
@@ -5902,7 +6032,19 @@
         "eval.tests.test_peco_behavior.test_unknown_flag_errors",
         "eval.tests.test_peco_behavior.test_version_format"
       ],
-      "ignored_tests": []
+      "ignored_tests": [
+        {
+          "name": "eval.tests.test_peco_behavior.test_prompt_option_changes_prompt",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        }
+      ]
     },
     "6c5b8939620c": {
       "ignored": false,
diff --git a/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json b/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json
index 8918848..db1da1b 100644
--- a/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json
+++ b/src/programbench/data/tasks/pls-rs__pls.4e1ae50/tests.json
@@ -394,6 +394,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_filtering.test_importance_filter_cutoff_0",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_filtering.test_importance_filter_cutoff_1",
           "reasons": [
diff --git a/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json b/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json
index 07b313d..cc350fb 100644
--- a/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json
+++ b/src/programbench/data/tasks/raviqqe__muffet.a882908/tests.json
@@ -675,6 +675,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_output_formats.test_json_empty_links_array_structure",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 8,
+                "note": "local HTTP test-server dial race (error when dialing 127.0.0.1:8765)",
+                "pass": 12,
+                "pattern": "ppfffppfffppppppffpp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_output_formats.test_json_error_is_string",
           "reasons": [
diff --git a/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json b/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json
index db339ad..d66605a 100644
--- a/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json
+++ b/src/programbench/data/tasks/rcoh__angle-grinder.9c2fc88/tests.json
@@ -1267,6 +1267,42 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tty_rendering.test_tty_aggregate_shows_ansi_escape_codes",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (progressive / ANSI tty output)",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tty_rendering.test_tty_avg_aggregate_shows_progressive_updates",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (progressive / ANSI tty output)",
+                "statuses": [
+                  "passed",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tty_rendering.test_tty_count_aggregate_intermediate_updates",
           "reasons": [
diff --git a/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json b/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json
index eaf4a49..8414a3c 100644
--- a/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json
+++ b/src/programbench/data/tasks/rhysd__kiro-editor.4157485/tests.json
@@ -1487,6 +1487,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_edge_cases.test_horizontal_scroll_on_long_line",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_edge_cases.test_large_file_navigation_to_end",
           "reasons": [
@@ -1511,6 +1522,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_edge_cases.test_utf8_multibyte_characters",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (editor screen state, utf8 input)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_editing.test_backspace_at_line_start",
           "reasons": [
@@ -1733,6 +1762,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_editor_gaps.test_backspace_key_deletes_char",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (editor screen state, utf8 input)",
+                "statuses": [
+                  "passed",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_editor_gaps.test_ctrl_bracket_page_down",
           "reasons": [
@@ -1741,6 +1788,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_editor_gaps.test_utf8_character_input",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (editor screen state, utf8 input)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_error_coverage.test_control_char_tab",
           "reasons": [
@@ -1899,6 +1964,35 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_core.test_backspace_deletion",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (editor screen state, utf8 input)",
+                "statuses": [
+                  "passed",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_core.test_delete_to_end_of_line",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_core.test_page_down_scrolling",
           "reasons": [
diff --git a/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json b/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json
index ec18d8b..366b3f8 100644
--- a/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json
+++ b/src/programbench/data/tasks/rs__curlie.5dfcbb1/tests.json
@@ -688,6 +688,42 @@
             }
           ]
         },
+        {
+          "name": "tests.test_formatting_json.test_deep_nested_json",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Network-dependent (curl connection failures, rc=7)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_formatting_json.test_empty_json_array",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Network-dependent (curl connection failures, rc=7)",
+                "statuses": [
+                  "failure",
+                  "failure",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_formatting_json.test_empty_json_object",
           "reasons": [
@@ -706,6 +742,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_formatting_json.test_escape_sequences_in_json_strings",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_formatting_json.test_html_passthrough",
           "reasons": [
@@ -724,6 +771,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_formatting_json.test_literals_null_true_false_colorization",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Network-dependent (curl connection failures, rc=7)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_formatting_json.test_malformed_json_extra_closing_brace_negative_level_guard",
           "reasons": [
@@ -732,6 +797,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_formatting_json.test_no_pretty_flag_no_formatting",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Network-dependent (curl connection failures, rc=7)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_formatting_json.test_numbers_formatting",
           "reasons": [
@@ -742,6 +825,52 @@
             }
           ]
         },
+        {
+          "name": "tests.test_formatting_json.test_plain_text_passthrough",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_formatting_json.test_unicode_characters",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 5,
+                "note": "curl connection race (exit 7) against local test server",
+                "pass": 15,
+                "pattern": "pffpppppppfppppfppfp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_formatting_json.test_whitespace_tabs_and_carriage_returns",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Network-dependent (curl connection failures, rc=7)",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_smoke.test_binary_exists",
           "reasons": [
diff --git a/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json b/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json
index c8460e0..e9ff58a 100644
--- a/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json
+++ b/src/programbench/data/tasks/rust-embedded__svd2rust.1760b5e/tests.json
@@ -1583,7 +1583,30 @@
         "eval.tests.test_logging.test_log_info_has_two_lines",
         "eval.tests.test_logging.test_log_off_produces_no_stderr"
       ],
-      "ignored_tests": []
+      "ignored_tests": [
+        {
+          "name": "eval.tests.test_cli_help_version.test_version_exact",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "eval.tests.test_generation_outputs.test_generate_default_files_and_content_hashes",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Fails on v6 gold: output reflects the pinned toolchain / dependency versions (e.g. Go compiler diagnostic wording, stdlib net behavior, or generated-code content); the golden captured the previous build environment's output.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        }
+      ]
     },
     "0e234d2b8eef": {
       "ignored": false,
diff --git a/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json b/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json
index 1d9ddff..144f0c5 100644
--- a/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json
+++ b/src/programbench/data/tasks/sheepla__pingu.926d475/tests.json
@@ -372,6 +372,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_edge_cases.test_ipv4_all_zeros_resolves_to_localhost",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] Fails on v6 gold: output reflects the pinned toolchain / dependency versions (e.g. Go compiler diagnostic wording, stdlib net behavior, or generated-code content); the golden captured the previous build environment's output.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_edge_cases.test_ipv6_localhost",
           "reasons": [
@@ -424,6 +435,50 @@
             }
           ]
         },
+        {
+          "name": "tests.test_flags.test_version_format",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_flags.test_version_ignores_other_flags",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_flags.test_version_long_flag",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_flags.test_version_short_flag",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Fails on v6 gold: test asserts a build-stamped version/commit/date string (non-reproducible build metadata, e.g. embedded git rev or build date) rather than functionality; the golden captured the previous build's stamp.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_ping.test_ping_count_50_localhost",
           "reasons": [
diff --git a/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json b/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json
index ae59415..25c3644 100644
--- a/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json
+++ b/src/programbench/data/tasks/sqlite__sqlite.839433d/tests.json
@@ -22592,6 +22592,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_harvest_e_createtable.test_e_createtable_t4_15_0@db_e_createtable",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_harvest_e_createtable.test_e_createtable_t4_18_3@db_e_createtable",
           "reasons": [
@@ -33763,6 +33774,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_harvest_pragma4.test_pragma4_t4_1_1@db_pragma4",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "test-isolation race: pre-existing table/view from dirty shared DB state (table/view already exists)",
+                "pass": 19,
+                "pattern": "pppppppppfpppppppppp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_harvest_pragma4.test_pragma4_t4_1_2@db_pragma4",
           "reasons": [
@@ -34896,6 +34924,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_harvest_rowvalue7.test_rowvalue7_t1_1",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "test-isolation race: pre-existing table/view from dirty shared DB state (table/view already exists)",
+                "pass": 19,
+                "pattern": "pppppppppppppppppfpp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_harvest_rowvalue7.test_rowvalue7_t1_2@db_rowvalue7",
           "reasons": [
@@ -41767,6 +41812,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_harvest_window1.test_window1_t73_0",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 2,
+                "note": "test-isolation race: pre-existing table/view from dirty shared DB state (table/view already exists)",
+                "pass": 18,
+                "pattern": "pppppppppppfpppppppf",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_harvest_window1.test_window1_t73_1",
           "reasons": [
diff --git a/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json b/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json
index 0cd75ab..b22a124 100644
--- a/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json
+++ b/src/programbench/data/tasks/stranger6667__jsonschema.d52e881/tests.json
@@ -1645,6 +1645,17 @@
               "id": "gold_fail"
             }
           ]
+        },
+        {
+          "name": "tests.test_retriever.test_insecure_flag_bypasses_cert_validation",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] HTTP-client error wording differs from golden ('error decoding response body'); reqwest version drift (test also depends on external https://self-signed.badssl.com). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
         }
       ]
     },
diff --git a/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json b/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json
index 75914cd..5f06b78 100644
--- a/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json
+++ b/src/programbench/data/tasks/svenstaro__genact.16f96e3/tests.json
@@ -243,6 +243,23 @@
         "tests.test_tui_extended.test_weblog_module_nginx_access_log"
       ],
       "ignored_tests": [
+        {
+          "name": "tests.test_modules_misc.test_rkhunter_rootkit_scanning",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 1,
+                "note": "randomized activity-simulator output: expected task line absent in a random run",
+                "pass": 19,
+                "pattern": "pppppppppppppppppppf",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_modules_security.test_docker_image_rm_shows_digest_after_tag",
           "reasons": [
diff --git a/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json b/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json
index f128c4b..d862b4a 100644
--- a/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json
+++ b/src/programbench/data/tasks/tstack__lnav.ee34494/tests.json
@@ -1740,6 +1740,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_help_system.test_help_flag_basic_usage",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_build_stamp] Usage string embeds the build git-hash (4.0-8ad2d43) differing from golden (4.0-07efb63). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_hotkeys.test_hotkey_C_clear_bookmarks",
           "reasons": [
diff --git a/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json b/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json
index 291b5ba..5bb3dff 100644
--- a/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json
+++ b/src/programbench/data/tasks/unhappychoice__gittype.34b72d0/tests.json
@@ -1348,6 +1348,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_global_args.test_double_dash_separator",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_global_args.test_duplicate_languages_accepted",
           "reasons": [
@@ -1404,6 +1415,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_global_args.test_unknown_subcommand",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_global_args.test_version_flags_produce_identical_output",
           "reasons": [
@@ -1712,6 +1734,83 @@
             }
           ]
         },
+        {
+          "name": "tests.test_repo_paths.test_langs_all_supported_languages",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_repo_paths.test_langs_case_insensitive",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_repo_paths.test_langs_duplicate_entries",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_repo_paths.test_langs_empty_value_tries_to_start",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_repo_paths.test_langs_with_spaces_in_names",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_repo_paths.test_multiple_langs_flags_are_combined",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_repo_paths.test_no_args_requires_tty",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_scoring_ranking.test_consistency_streaks_stored_as_json",
           "reasons": [
@@ -2044,6 +2143,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_loading.test_current_directory_implicit_path",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_loading.test_difficulty_wrapping_from_hard_to_easiest",
           "reasons": [
@@ -2054,6 +2170,74 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_loading.test_loading_completes_and_reaches_title_screen",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_loading.test_multiple_repos_same_session_prevents_conflicts",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_loading.test_title_screen_difficulty_change_right_arrow",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_loading.test_version_check_exit_with_escape",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_loading.test_version_check_screen_appears_on_launch",
           "reasons": [
@@ -2062,6 +2246,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_loading.test_version_check_skip_with_space",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_records.test_records_all_filter_states",
           "reasons": [
@@ -2080,6 +2281,40 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_records.test_records_escape_returns_to_title",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_records.test_records_screen_access_from_title",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 11,
+                "note": "TUI loading/title-screen render timing race under tui2cli capture (title/version screen not reached in captured frame)",
+                "pass": 9,
+                "pattern": "ppppppffffffffpfpffp",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_records.test_records_sort_cycling",
           "reasons": [
diff --git a/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json b/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json
index 43a6329..0f95d60 100644
--- a/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json
+++ b/src/programbench/data/tasks/y2z__monolith.8702e66/tests.json
@@ -776,6 +776,17 @@
             }
           ]
         },
+        {
+          "name": "eval.tests.test_cli_operations.test_ignore_errors_flag",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_external_resource] Inlined external resource (base64 data-URI) differs from golden; monolith fetches and embeds web content, so the result depends on a non-reproducible external page. 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "eval.tests.test_cookies.test_case_insensitive_domain_matching",
           "reasons": [
diff --git a/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json b/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json
index 3671bb4..b3dc858 100644
--- a/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json
+++ b/src/programbench/data/tasks/yassinebridi__serpl.c48a9d7/tests.json
@@ -923,6 +923,182 @@
             }
           ]
         },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_all_named_colors",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_bold_red",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_bright_format",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_gray_max_value",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_gray_min_value",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_gray_scale",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_index_format",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_index_max_value",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_invalid_returns_none",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_named_black",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_rgb_all_max",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_rgb_all_zeros",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_rgb_format",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_config_color_parsing.test_parse_color_whitespace_trimming",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_replace_errors.test_binary_file_skipped",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_replace_errors.test_case_sensitive_edge_cases",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_replace_errors.test_empty_file_handling",
           "reasons": [
@@ -941,6 +1117,50 @@
             }
           ]
         },
+        {
+          "name": "tests.test_replace_errors.test_files_with_special_characters_in_names",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_replace_errors.test_newline_only_file_handling",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_replace_errors.test_readonly_file_handling",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_replace_errors.test_symlink_handling",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_replace_errors.test_very_large_file",
           "reasons": [
@@ -961,6 +1181,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_search_advanced.test_search_simple_mode",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_search_advanced.test_search_with_special_regex_chars",
           "reasons": [
@@ -969,6 +1200,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_basic.test_tab_navigation_between_panes",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 19,
+                "note": "TUI Search/Preview pane snapshot capture timing race",
+                "pass": 1,
+                "pattern": "pfffffffffffffffffff",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_dialogs.test_notification_disappears_after_duration",
           "reasons": [
@@ -985,6 +1233,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_preview.test_preview_pane_displays_sample1_with_multiple_matches",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_preview.test_preview_pane_displays_unicode_content",
           "reasons": [
@@ -1009,6 +1268,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_preview.test_preview_pane_goto_top_with_g",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (preview-pane navigation)",
+                "statuses": [
+                  "passed",
+                  "passed",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_preview.test_preview_pane_navigate_backward_with_k",
           "reasons": [
@@ -1027,6 +1304,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_preview.test_preview_pane_navigate_to_first_match_with_j",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (preview-pane navigation)",
+                "statuses": [
+                  "failure",
+                  "passed",
+                  "passed"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_replace.test_cancel_empty_replacement",
           "reasons": [
@@ -1047,6 +1342,28 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_results.test_empty_results_list",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "TUI Search/Preview snapshot capture timing race. caught in postfix gold rerun (21st run); passed 20/20 in 20x sweep",
+              "timestamp": 1781500166,
+              "user": "kilian"
+            }
+          ]
+        },
+        {
+          "name": "tests.test_tui_results.test_file_deletion_with_d_key",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "Flaky on v6 gold: passed in some gold-eval rounds and failed in others across 3 independent runs (timing-sensitive TUI/tui2cli snapshot, tmux, or db-lock/test-isolation concurrency); unreliable for grading.",
+              "timestamp": 1749700000,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_results.test_multiple_deletions_in_sequence",
           "reasons": [
@@ -1107,6 +1424,23 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_search.test_search_tab_navigation_to_results",
+          "reasons": [
+            {
+              "extra": {
+                "fail": 19,
+                "note": "TUI Search/Preview pane snapshot capture timing race",
+                "pass": 1,
+                "pattern": "pfffffffffffffffffff",
+                "source": "20x gold cloud eval 2026-06-13"
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781326815,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_suspend_resume.TestTuiSuspendResume.test_resume_reinitializes_terminal",
           "reasons": [
@@ -1123,6 +1457,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_tui_workflows.test_empty_search_results_handling",
+          "reasons": [
+            {
+              "id": "gold_flaky",
+              "note": "TUI snapshot capture timing race (empty captured frame). caught in postfix gold rerun (21st run); passed 20/20 in 20x sweep",
+              "timestamp": 1781500166,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_tui_workflows.test_large_file_many_matches_navigation",
           "reasons": [
diff --git a/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json b/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json
index 56dde8a..5b9cd8b 100644
--- a/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json
+++ b/src/programbench/data/tasks/ys-l__flamelens.0b4dc33/tests.json
@@ -764,6 +764,24 @@
             }
           ]
         },
+        {
+          "name": "tests.test_parsing.test_readable_data_comment_visualization",
+          "reasons": [
+            {
+              "extra": {
+                "cause": "Flaky TUI render timing (flamegraph parse/render)",
+                "statuses": [
+                  "passed",
+                  "failure",
+                  "failure"
+                ]
+              },
+              "id": "gold_flaky",
+              "timestamp": 1781197573,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_parsing.test_recursive_stacks",
           "reasons": [
diff --git a/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json b/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json
index c173b96..171168b 100644
--- a/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json
+++ b/src/programbench/data/tasks/zk-org__zk.10d93d5/tests.json
@@ -3335,6 +3335,17 @@
             }
           ]
         },
+        {
+          "name": "tests.test_list_format.test_format_oneline",
+          "reasons": [
+            {
+              "id": "gold_fail",
+              "note": "[gold_fail_v6_toolchain] list --format=oneline output deterministically differs from captured golden (identical across all 20 rounds; gold output drift). 20x gold cloud eval 2026-06-13, failed 20/20 rounds",
+              "timestamp": 1781487387,
+              "user": "kilian"
+            }
+          ]
+        },
         {
           "name": "tests.test_list_format.test_template_format_date_helper",
           "reasons": [