Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/auto-update-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Auto-Update Dev Branches from Master
on:
push:
branches:
- master # Trigger workflow on commits to 'master' branch.
- master # Trigger workflow on commits to 'master' branch
workflow_dispatch: {}

jobs:
Expand Down
10 changes: 10 additions & 0 deletions text_to_video/wan-2.2-t2v-a14b/data/samples.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
130, A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art
106, A panda drinking coffee in a cafe in Paris, watercolor painting
84, The bund Shanghai, black and white
59, an elephant spraying itself with water using its trunk to cool down
12, a car turning a corner
31, a truck anchored in a tranquil bay
86, The bund Shanghai, in cyberpunk style
122, Gwen Stacy reading a book, in cyberpunk style
233, skyscraper
96, a shark is swimming in the ocean, animated style
10 changes: 10 additions & 0 deletions text_to_video/wan-2.2-t2v-a14b/data/samples_filename_ids.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art-0.mp4, 130.mp4
A panda drinking coffee in a cafe in Paris, watercolor painting-0.mp4, 106.mp4
The bund Shanghai, black and white-0.mp4, 84.mp4
an elephant spraying itself with water using its trunk to cool down-0.mp4, 59.mp4
a car turning a corner-0.mp4, 12.mp4
a truck anchored in a tranquil bay-0.mp4, 31.mp4
The bund Shanghai, in cyberpunk style-0.mp4, 86.mp4
Gwen Stacy reading a book, in cyberpunk style-0.mp4, 122.mp4
skyscraper-0.mp4, 233.mp4
a shark is swimming in the ocean, animated style-0.mp4, 96.mp4
6 changes: 3 additions & 3 deletions tools/submission/generate_final_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ def main():
"singlestream": "SingleStream",
"multistream": "MultiStream",
"server": "Server",
"interactive":"Interactive",
"interactive": "Interactive",
"offline": "Offline",
}
df["Scenario"] = df["Scenario"].apply(lambda x: scenario_map.get(str(x).lower(), x))

df["Scenario"] = df["Scenario"].apply(
lambda x: scenario_map.get(str(x).lower(), x))

output = args.input[:-4]
writer = pd.ExcelWriter(output + ".xlsx", engine="xlsxwriter")
Expand Down
36 changes: 36 additions & 0 deletions tools/submission/submission_checker/checks/accuracy_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ..constants import *
from ..loader import SubmissionLogs
from ..configuration.configuration import Config
from ..utils import check_extra_files
import re
import os

Expand All @@ -25,6 +26,9 @@ class AccuracyCheck(BaseCheck):
- `loadgen_errors_check`: Fails if Loadgen reported non-ignored errors.
- `dataset_check`: Verifies the reported sample count matches the
configured dataset size unless the check is skipped.
- `extra_files_check`: For benchmarks in REQUIRED_ACC_BENCHMARK (e.g.
stable-diffusion-xl, wan-2.2-t2v-a14b), verifies required extra
artifacts (e.g. images/, videos/) exist in the accuracy directory.

Attributes:
submission_logs (SubmissionLogs): Holder for submission log paths
Expand Down Expand Up @@ -78,6 +82,7 @@ def setup_checks(self):
self.checks.append(self.accuracy_json_check)
self.checks.append(self.loadgen_errors_check)
self.checks.append(self.dataset_check)
self.checks.append(self.extra_files_check)

def accuracy_result_check(self):
"""Validate reported accuracy metrics in `accuracy.txt`.
Expand Down Expand Up @@ -234,3 +239,34 @@ def dataset_check(self):
)
return False
return True

def extra_files_check(self):
    """Verify required extra accuracy files for certain benchmarks.

    For models in REQUIRED_ACC_BENCHMARK (e.g. stable-diffusion-xl
    images, wan-2.2-t2v-a14b videos), ensures the accuracy directory
    contains the required subdirs and files. Skipped if
    skip_extra_accuracy_files_check is set.

    Returns:
        bool: True if the check is skipped, the model has no extra
        requirements, or all required files exist; False otherwise.
    """
    # Explicit opt-out takes precedence over everything else.
    if self.config.skip_extra_accuracy_files_check:
        return True
    # Only a subset of (model, version) pairs carry extra requirements.
    per_version = REQUIRED_ACC_BENCHMARK.get(self.model)
    if per_version is None:
        return True
    target_files = per_version.get(self.config.version)
    if target_files is None:
        return True
    acc_dir = os.path.dirname(self.path)
    files_ok, missing_files = check_extra_files(acc_dir, target_files)
    if files_ok:
        return True
    self.log.error(
        "%s expected to have the following extra files (%s)",
        acc_dir,
        missing_files,
    )
    return False
21 changes: 12 additions & 9 deletions tools/submission/submission_checker/checks/performance_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,15 @@ def missing_check(self):
self.log.error("Performance log missing at %s", self.path)
return False
return True

def scenarios_check(self):
if self.submission_logs.loader_data.get("check_scenarios", False):
return True
else:
missing_scenarios = self.submission_logs.loader_data.get("missing_scenarios", [])
unknown_scenarios = self.submission_logs.loader_data.get("unknown_scenarios", [])
missing_scenarios = self.submission_logs.loader_data.get(
"missing_scenarios", [])
unknown_scenarios = self.submission_logs.loader_data.get(
"unknown_scenarios", [])
if len(missing_scenarios) > 0:
self.log.error(
"%s does not have all required scenarios, missing %s",
Expand All @@ -116,7 +118,8 @@ def loadgen_errors_check(self):
bool: True if no blocking Loadgen errors are present,
False otherwise.
"""
compliance_skip = self.submission_logs.loader_data.get("compliance_skip", False)
compliance_skip = self.submission_logs.loader_data.get(
"compliance_skip", False)
if self.mlperf_log.has_error():
has_critical_errors = False
if self.config.ignore_uncommited:
Expand All @@ -127,7 +130,7 @@ def loadgen_errors_check(self):
):
has_critical_errors = True
if (
not compliance_skip
not compliance_skip
and "Multiple conf files are used" in error["value"]
):
has_critical_errors = True
Expand Down Expand Up @@ -454,7 +457,7 @@ def inferred_check(self):
("singlestream", "offline")
]
if (self.scenario.lower(), self.scenario_fixed.lower()
) not in list_inferred:
) not in list_inferred:
self.log.error(
"Result for scenario %s can not be inferred from %s for: %s",
self.scenario_fixed,
Expand Down Expand Up @@ -543,12 +546,12 @@ def get_inferred_result(self, res):
res = qps_wo_loadgen_overhead

if (self.scenario_fixed in ["Offline"]
) and self.scenario in ["MultiStream"]:
) and self.scenario in ["MultiStream"]:
inferred = True
res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS)

if (self.scenario_fixed in ["MultiStream"]
) and self.scenario in ["SingleStream"]:
) and self.scenario in ["SingleStream"]:
inferred = True
# samples_per_query does not match with the one reported in the logs
# when inferring MultiStream from SingleStream
Expand All @@ -565,6 +568,6 @@ def get_inferred_result(self, res):
else:
res = (latency_99_percentile * samples_per_query) / MS_TO_NS
if (self.scenario_fixed in ["Interactive"]
) and self.scenario not in ["Server"]:
) and self.scenario not in ["Server"]:
is_valid = False
return res, is_valid
17 changes: 17 additions & 0 deletions tools/submission/submission_checker/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,6 +1123,22 @@
"2289",
]
},
},
"wan-2.2-t2v-a14b": {
"v6.0": {
"videos": [
"130",
"106",
"84",
"59",
"12",
"31",
"86",
"122",
"233",
"96",
]
},
}
}
REQUIRED_MEASURE_FILES = ["user.conf", "README.md"]
Expand Down Expand Up @@ -1695,6 +1711,7 @@
"v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST08/verify_accuracy.txt",
"default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST08/verify_accuracy.txt",
}

TEST07_ACC_PATH = {
"v6.0": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST07/verify_accuracy.txt",
"default": "{division}/{submitter}/results/{system}/{benchmark}/{scenario}/TEST07/verify_accuracy.txt",
Expand Down
17 changes: 12 additions & 5 deletions tools/submission/submission_checker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,13 @@ def check_extra_files(path, target_files):
for target_file in target_files[dir]:
if target_file not in files:
check_pass = False
missing_files.append(
f"{os.path.join(path, dir, target_file)}.png")
if "images" in dir:
missing_files.append(
f"{os.path.join(path, dir, target_file)}.png")
if "videos" in dir:
missing_files.append(
f"{os.path.join(path, dir, target_file)}.mp4")

if "captions" not in files:
missing_files.append(
f"{os.path.join(path, dir, 'captions.txt')}")
Expand Down Expand Up @@ -107,17 +112,19 @@ def is_number(s):
return True
except ValueError:
return False



def lower_list(l):
    """Return a new list with each element coerced to its lowercase string form."""
    return list(map(lambda element: str(element).lower(), l))


def contains_list(l1, l2):
    """Check whether l1 contains every element of l2.

    Returns:
        tuple: (missing, all_present) — the elements of l2 absent from
        l1 (in l2 order), and True only when that list is empty.
    """
    missing = [element for element in l2 if element not in l1]
    return missing, not missing


def get_performance_metric(
Expand Down Expand Up @@ -317,7 +324,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res):
samples_per_query = 8

if (scenario_fixed in ["MultiStream"]
) and scenario in ["SingleStream"]:
) and scenario in ["SingleStream"]:
power_metric = (
avg_power * power_duration * samples_per_query * 1000 / num_queries
)
Expand Down