Fix the JAX implementation of schedule-free AdamW (sfadamw)

wyfEmma · wyfEmma · commit 07503bfe8faa · 2025-12-15T19:25:57.000Z
diff --git a/algoperf/workloads/fastmri/input_pipeline.py b/algoperf/workloads/fastmri/input_pipeline.py
@@ -11,8 +11,8 @@
 
 from algoperf import data_utils
 
-_TRAIN_DIR = 'fastmri/knee_singlecoil_train'
-_VAL_DIR = 'fastmri/knee_singlecoil_val'
+_TRAIN_DIR = 'knee_singlecoil_train'
+_VAL_DIR = 'knee_singlecoil_val'
 _EVAL_SEED = 0
 
 
diff --git a/algoperf/workloads/imagenet_resnet/workload.py b/algoperf/workloads/imagenet_resnet/workload.py
@@ -145,4 +145,4 @@ def _build_input_queue(
   @property
   def step_hint(self) -> int:
     """Approx. steps the baseline can do in the allowed runtime budget."""
-    return 195_999
+    return 186_666
diff --git a/algoperf/workloads/imagenet_vit/workload.py b/algoperf/workloads/imagenet_vit/workload.py
@@ -121,4 +121,4 @@ def _build_dataset(
   @property
   def step_hint(self) -> int:
     """Approx. steps the baseline can do in the allowed runtime budget."""
-    return 167_999
+    return 186_666
diff --git a/algoperf/workloads/librispeech_conformer/workload.py b/algoperf/workloads/librispeech_conformer/workload.py
@@ -89,4 +89,4 @@ def eval_period_time_sec(self) -> int:
   @property
   def step_hint(self) -> int:
     """Approx. steps the baseline can do in the allowed runtime budget."""
-    return 76_000
+    return 80_000
diff --git a/algoperf/workloads/librispeech_deepspeech/librispeech_jax/workload.py b/algoperf/workloads/librispeech_deepspeech/librispeech_jax/workload.py
@@ -96,7 +96,7 @@ def test_target_value(self) -> float:
   @property
   def step_hint(self) -> int:
     """Approx. steps the baseline can do in the allowed runtime budget."""
-    return 38_400
+    return 48_000
 
   @property
   def max_allowed_runtime_sec(self) -> int:
diff --git a/algoperf/workloads/librispeech_deepspeech/librispeech_pytorch/workload.py b/algoperf/workloads/librispeech_deepspeech/librispeech_pytorch/workload.py
@@ -92,7 +92,7 @@ def test_target_value(self) -> float:
   @property
   def step_hint(self) -> int:
     """Approx. steps the baseline can do in the allowed runtime budget."""
-    return 38_400
+    return 48_000
 
   @property
   def max_allowed_runtime_sec(self) -> int:
diff --git a/algoperf/workloads/ogbg/workload.py b/algoperf/workloads/ogbg/workload.py
@@ -144,7 +144,7 @@ def loss_fn(
   @property
   def step_hint(self) -> int:
     """Approx. steps the baseline can do in the allowed runtime budget."""
-    return 52_000
+    return 80_000
 
   @abc.abstractmethod
   def _normalize_eval_metrics(
diff --git a/algoperf/workloads/wmt/workload.py b/algoperf/workloads/wmt/workload.py
@@ -98,7 +98,7 @@ def eval_period_time_sec(self) -> int:
   @property
   def step_hint(self) -> int:
     """Approx. steps the baseline can do in the allowed runtime budget."""
-    return 120_000
+    return 133_333
 
   @property
   def pre_ln(self) -> bool:
diff --git a/docker/scripts/plot.py b/docker/scripts/plot.py
@@ -0,0 +1,128 @@
+# plot.py
+
+import pandas as pd
+import matplotlib.pyplot as plt
+from pathlib import Path
+import collections
+
+# --- Configuration ---
+# The base directory to start searching from.
+# It is anchored to the user's home directory (via expanduser), so it does not depend on the current working directory.
+SEARCH_DIR = Path("~/experiment_runs/tests/regression_tests/adamw").expanduser()
+
+# The directory where the output plots will be saved (relative to the current working directory).
+OUTPUT_DIR = Path("ssim_plots")
+
+# The columns to use for the x and y axes.
+X_AXIS_COL = "global_step"
+Y_AXIS_CANDIDATES = ["validation/loss", "validation/ctc_loss"]
+# ---------------------
+
+def generate_plots():
+    """
+    Finds all 'measurements.csv' files, groups them by workflow,
+    and generates a JAX vs. PyTorch plot for each.
+    """
+    # Create the output directory if it doesn't already exist
+    OUTPUT_DIR.mkdir(exist_ok=True)
+    print(f"📊 Plots will be saved to the '{OUTPUT_DIR}' directory.")
+
+    # Use a dictionary to group file paths by their workflow name
+    # e.g., {'fastmri': [...], 'wmt': [...]}
+    workflow_files = collections.defaultdict(list)
+
+    # Recursively find all 'measurements.csv' files in the search directory
+    for csv_path in SEARCH_DIR.rglob("measurements.csv"):
+        try:
+            # The directory name looks like 'fastmri_jax' or 'wmt_pytorch'.
+            # We get this from the parent of the parent of the csv file.
+            # e.g., .../fastmri_jax/trial_1/measurements.csv
+            workflow_framework_name = csv_path.parent.parent.name
+            
+            # Split the name to get the framework (last part) and workflow (everything else)
+            parts = workflow_framework_name.split('_')
+            framework = parts[-1]
+            workflow = '_'.join(parts[:-1])
+            
+            # Store the path and framework for this workflow
+            if framework in ['jax', 'pytorch']:
+                workflow_files[workflow].append({'path': csv_path, 'framework': framework})
+
+        except IndexError:
+            # Defensive guard only: str.split() never returns an empty list, so unexpected names are filtered out by the framework check above instead.
+            print(f"⚠️ Could not parse workflow/framework from path: {csv_path}")
+            continue
+
+    if not workflow_files:
+        print("❌ No 'measurements.csv' files found. Check the SEARCH_DIR variable and your folder structure.")
+        return
+
+    print(f"\nFound {len(workflow_files)} workflows. Generating plots...")
+
+    # Iterate through each workflow and its associated files to create a plot
+    for workflow, files in workflow_files.items():
+        plt.style.use('seaborn-v0_8-whitegrid')
+        fig, ax = plt.subplots(figsize=(12, 7))
+
+        print(f"  -> Processing workflow: '{workflow}'")
+
+        y_axis_col_used = None # To store the name of the y-axis column for the plot labels
+        
+        # Plot data for each framework (JAX and PyTorch) on the same figure
+        for item in files:
+            try:
+                df = pd.read_csv(item['path'])
+                
+                y_axis_col = None
+                for candidate in Y_AXIS_CANDIDATES:
+                    if candidate in df.columns:
+                        y_axis_col = candidate
+                        if not y_axis_col_used:
+                            y_axis_col_used = y_axis_col # Set the label from the first file
+                        break # Found a valid column, no need to check further
+                
+                # if item['framework'] == 'jax':
+                #     y_axis_col = None
+
+                # Check if the required columns exist in the CSV
+                if X_AXIS_COL in df.columns and y_axis_col:
+                                        
+                    # 1. Forward-fill 'global_step' to propagate the last valid step downwards.
+                    df[X_AXIS_COL] = df[X_AXIS_COL].ffill()
+                    
+                    # 2. Drop any rows where the selected y-axis column is empty (NaN).
+                    df_cleaned = df.dropna(subset=[y_axis_col])
+                    
+                    # Plot the cleaned data
+                    ax.plot(
+                        df_cleaned[X_AXIS_COL],
+                        df_cleaned[y_axis_col],
+                        label=item['framework'].capitalize(), # e.g., 'Jax'
+                        marker='.',
+                        linestyle='-',
+                        alpha=0.8
+                    )
+                else:
+                    print(f"     - Skipping {item['path']} (missing required columns).")
+
+            except Exception as e:
+                print(f"     - ❗️ Error reading {item['path']}: {e}")
+
+        # Customize and save the plot
+        ax.set_title(f'Validation loss vs. Global Step for {workflow.replace("_", " ").title()}', fontsize=16)
+        ax.set_xlabel("Global Step", fontsize=12)
+        ax.set_ylabel("Validation loss", fontsize=12)
+        ax.legend(title="Framework", fontsize=10)
+        plt.tight_layout()
+        plt.yscale('log')
+
+        # Define the output filename and save the figure
+        output_filename = OUTPUT_DIR / f"{workflow}_comparison.png"
+        plt.savefig(output_filename, dpi=150)
+        plt.close(fig) # Close the figure to free up memory
+
+    print("\n✅ All plots generated successfully!")
+
+
+if __name__ == "__main__":
+    generate_plots()
diff --git a/submission_runner.py b/submission_runner.py
@@ -19,10 +19,10 @@
 import importlib
 import itertools
 import json
-import jax
 import os
 import struct
 import time
+import optax
 from inspect import signature
 from types import MappingProxyType
 from typing import Any, Dict, Optional, Tuple
@@ -861,9 +861,11 @@ def main(_):
 
 
 if __name__ == '__main__':
+  print(optax.__version__)
+  print("!!!!")
   flags.mark_flag_as_required('workload')
   flags.mark_flag_as_required('framework')
   flags.mark_flag_as_required('submission_path')
   flags.mark_flag_as_required('experiment_dir')
   flags.mark_flag_as_required('experiment_name')
-  app.run(main)
+  app.run(main)
diff --git a/tests/test_algorithms/schedule_free_adamw/jax/submission.py b/tests/test_algorithms/schedule_free_adamw/jax/submission.py
diff --git a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py
diff --git a/tests/test_traindiffs.py b/tests/test_traindiffs.py