@@ -35,6 +35,7 @@ tensorflow-datasets
 tensorflow
 tokamax
 tokenizers
+torchax>=0.0.11
 transformers<5.0.0

 # pinning torch and torchvision to specific versions to avoid
@@ -179,6 +179,7 @@ toml>=0.10.2
 tomlkit>=0.14.0
 toolz>=1.1.0
 torch @ https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl
+torchax>=0.0.11
 torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl
 tqdm>=4.67.3
 transformers>=4.57.6
@@ -0,0 +1,75 @@
"""
Copyright 2026 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import Tuple

import torch
import jax
from torchax import interop, default_env

# --- Monkeypatch transformers masking_utils to avoid torchax integer tracing bug ---
import transformers.masking_utils

_orig_sliding_window_overlay = transformers.masking_utils.sliding_window_overlay


def _patched_sliding_window_overlay(sliding_window: int):
    # pylint: disable=unused-argument

    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
        # Since sequence length < sliding window (e.g. 256 < 4096), this mask is always True.
        # We return a standard boolean tensor using new_ones to guarantee Torchax compatibility
        # and prevent any implicit tracing crashes.
        return q_idx.new_ones((), dtype=torch.bool)

    return inner_mask


transformers.masking_utils.sliding_window_overlay = _patched_sliding_window_overlay
# -----------------------------------------------------------------------------------
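# Illustration (hypothetical, not executed here): with the patch in place, the overlay
# returns a boolean tensor rather than a Python bool, so torchax tracing never touches
# raw integer index arithmetic:
#
#   mask_fn = transformers.masking_utils.sliding_window_overlay(4096)
#   mask_fn(0, 0, torch.tensor(5), torch.tensor(3))  # -> tensor(True)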


class TorchaxGemma3TextEncoder(interop.JittableModule):
    """
    A jittable Torchax module for wrapping the HuggingFace PyTorch
    Gemma3ForConditionalGeneration text encoder.
    """

    def __init__(self, text_encoder):
        super().__init__(text_encoder, extra_jit_args={"static_argnames": ["output_hidden_states"]})

    def __call__(
        self, input_ids: jax.Array, attention_mask: jax.Array, output_hidden_states: bool = True
    ) -> Tuple[jax.Array, ...]:
        with default_env():
            input_ids = interop.torch_view(input_ids)
            attention_mask = interop.torch_view(attention_mask)

            output = self.functional_call(
                self._forward_inner,
                params=self.params,
                buffers=self.buffers,
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=output_hidden_states,
            )
        return interop.jax_view(output)

    @staticmethod
    def _forward_inner(model, input_ids, attention_mask, output_hidden_states=True):
        # We only return the hidden states as a tuple of tensors. That allows
        # interop.jax_view to convert them into a tuple of JAX arrays.
        return model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states,
        ).hidden_states
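
A minimal usage sketch of the wrapper (hypothetical: how the PyTorch text encoder is loaded, plus the batch and sequence shapes, are assumptions, not part of this change):

    import jax.numpy as jnp

    # Assumed: `text_encoder` is the HuggingFace PyTorch Gemma3 text encoder,
    # e.g. obtained from Gemma3ForConditionalGeneration.from_pretrained(...).
    encoder = TorchaxGemma3TextEncoder(text_encoder)

    input_ids = jnp.ones((1, 256), dtype=jnp.int32)
    attention_mask = jnp.ones((1, 256), dtype=jnp.int32)
    hidden_states = encoder(input_ids, attention_mask)  # tuple of jax.Array, one per layer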