Skip to content

Commit c057f10

Browse files
committed
Comparison of weights initialized from a PyTorch saved state dict
1 parent b6768b2 commit c057f10

6 files changed

Lines changed: 49 additions & 13 deletions

File tree

algoperf/workloads/criteo1tb/criteo1tb_jax/workload.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import jax
88
import jax.numpy as jnp
99
import numpy as np
10-
10+
import logging
1111
from algoperf import param_utils
1212
from algoperf import spec
1313
from algoperf.workloads.criteo1tb.criteo1tb_jax import models
@@ -104,7 +104,8 @@ def init_model_fn(
104104
{'params': params_rng, 'dropout': dropout_rng},
105105
jnp.ones(input_shape, jnp.float32))
106106
initial_params = initial_variables['params']
107-
initial_params = use_pytorch_weights(initial_params)
107+
logging.info('\n\nInitializing with Pytorch weights\n\n')
108+
initial_params = use_pytorch_weights(initial_params, file_name="/results/pytorch_base_model_criteo1tb_8_june.pth")
108109
self._param_shapes = param_utils.jax_param_shapes(initial_params)
109110
self._param_types = param_utils.jax_param_types(self._param_shapes)
110111
return jax_utils.replicate(initial_params), None

algoperf/workloads/criteo1tb/criteo1tb_pytorch/workload.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def init_model_fn(
8888
dropout_rate=dropout_rate,
8989
use_layer_norm=self.use_layer_norm,
9090
embedding_init_multiplier=self.embedding_init_multiplier)
91-
torch.save(model.state_dict(), "/results/pytorch_base_model_criteo1tb_22_may.pth")
91+
torch.save(model.state_dict(), "/results/pytorch_base_model_criteo1tb_8_june.pth")
9292
self._param_shapes = param_utils.pytorch_param_shapes(model)
9393
self._param_types = param_utils.pytorch_param_types(self._param_shapes)
9494
model.to(DEVICE)

custom_pytorch_jax_converter.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,7 @@
11
import torch
22
import numpy as np
3-
from flax.core import freeze, unfreeze
4-
5-
# Load PyTorch state_dict
6-
state_dict = torch.load("/results/pytorch_base_model_criteo1tb_22_may.pth")
7-
8-
# Convert PyTorch tensors to NumPy arrays
9-
numpy_weights = {k: v.numpy() for k, v in state_dict.items()}
10-
11-
3+
import jax
4+
import jax.numpy as jnp
125
"""
136
Jax default parameter structure:
147
dict_keys(['Dense_0', 'Dense_1', 'Dense_2', 'Dense_3', 'Dense_4', 'Dense_5', 'Dense_6', 'Dense_7', 'embedding_table'])
@@ -23,7 +16,13 @@
2316
The function assumes that the Jax model parameters are already initialized
2417
and that the PyTorch weights are in the correct format.
2518
"""
26-
def use_pytorch_weights(jax_params):
19+
def use_pytorch_weights(jax_params, file_name=None):
20+
# Load PyTorch state_dict
21+
state_dict = torch.load(file_name)
22+
print(state_dict.keys())
23+
# Convert PyTorch tensors to NumPy arrays
24+
numpy_weights = {k: v.cpu().numpy() for k, v in state_dict.items()}
25+
2726
# --- Embedding Table ---
2827
embedding_table = np.concatenate([
2928
numpy_weights[f'embedding_chunk_{i}'] for i in range(4)
@@ -42,3 +41,28 @@ def use_pytorch_weights(jax_params):
4241
jax_params[f'Dense_{j}']['bias'] = numpy_weights[f'top_mlp.{i}.bias']
4342

4443
return jax_params
44+
45+
46+
def are_weights_equal(params1, params2, atol=1e-6, rtol=1e-6):
47+
"""Compares two JAX PyTrees of weights and prints where they differ."""
48+
all_equal = True
49+
50+
def compare_fn(p1, p2):
51+
nonlocal all_equal
52+
#if not jnp.allclose(p1, p2):
53+
if not jnp.allclose(p1, p2, atol=atol, rtol=rtol):
54+
print("❌ Mismatch found:")
55+
print(f"Shape 1: {p1.shape}, Shape 2: {p2.shape}")
56+
print(f"Max diff: {jnp.max(jnp.abs(p1 - p2))}")
57+
all_equal = False
58+
return jnp.allclose(p1, p2, atol=atol, rtol=rtol)
59+
60+
try:
61+
_ = jax.tree_util.tree_map(compare_fn, params1, params2)
62+
except Exception as e:
63+
print("❌ Structure mismatch or error during comparison:", e)
64+
return False
65+
66+
if all_equal:
67+
print("✅ All weights are equal (within tolerance)")
68+
return all_equal

reference_algorithms/schedule_free/jax/submission.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
import jax
99
from jax import lax
1010
import jax.numpy as jnp
11+
import logging
1112
from optax.contrib import schedule_free_adamw
1213
from algoperf import spec
14+
from custom_pytorch_jax_converter import use_pytorch_weights, are_weights_equal
1315

1416
_GRAD_CLIP_EPS = 1e-6
1517

@@ -168,6 +170,9 @@ def update_params(workload: spec.Workload,
168170
'loss': loss[0],
169171
'grad_norm': grad_norm[0],
170172
}, global_step)
173+
logging.info('\n\nUsing the PyTorch weights of first update\n\n')
174+
params = use_pytorch_weights(new_params, file_name="/results/pytorch_base_model_criteo1tb_8_june_after_first_update.pth")
175+
are_weights_equal(new_params, params)
171176
return (new_optimizer_state, opt_update_fn), new_params, new_model_state
172177

173178

reference_algorithms/schedule_free/pytorch/submission.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,8 @@ def closure():
280280
global_step,
281281
loss.item())
282282

283+
torch.save(current_param_container.module.state_dict(), "/results/pytorch_base_model_criteo1tb_8_june_after_first_update.pth")
284+
283285
return (optimizer_state, current_param_container, new_model_state)
284286

285287

submission_runner.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,8 +376,12 @@ def train_once(
376376
train_state['training_complete'] = True
377377
global_step += 1
378378
logging.info(f'Global step: {global_step}, batch size: {len(batch)}')
379+
379380
if (max_global_steps is not None) and (global_step == max_global_steps):
380381
train_state['training_complete'] = True
382+
if dist.is_available() and dist.is_initialized():
383+
dist.destroy_process_group()
384+
exit(0)
381385

382386
train_step_end_time = get_time()
383387

0 commit comments

Comments (0)