mesozoic-egg · mesozoic-egg · Jul 15, 2025 · Jul 16, 2025 · Jul 16, 2025 · Jul 16, 2025
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -14,7 +14,7 @@
 if CI:
   warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
 
-FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
+FORWARD_ONLY = getenv("FORWARD_ONLY", int(Device.DEFAULT == "ASM"))  # NOTE(review): default is on only for ASM, not every backend - confirm
 PRINT_TENSORS = getenv("PRINT_TENSORS", 0)
 
 def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, grad_atol=1e-4, grad_rtol=1e-3,
@@ -1284,7 +1284,8 @@ def test_small_gemm_range(self):
   def test_small_gemm_eye(self):
     helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
   @unittest.skipIf(CI and Device.DEFAULT in ["NV", "LLVM", "GPU", "CUDA"] or IMAGE
-  or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
+  or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows")
+  or (Device.DEFAULT == "ASM"), "not supported on these in CI/IMAGE")  # the ASM clause is not and-ed with CI: ASM skips this test everywhere
   def test_gemm_fp16(self):
     helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3)
   def test_gemm(self):

diff --git a/test/test_ops_2.py b/test/test_ops_2.py
diff --git a/tinygrad/device.py b/tinygrad/device.py
@@ -283,7 +283,6 @@ def _offset(self, buf, size:int, offset:int): return from_mv(self._as_buffer(buf
 
 # CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
 class CPUProgram:
-  rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
 
   def __init__(self, name:str, lib:bytes):
     if sys.platform == "win32":
@@ -303,6 +302,8 @@ def __init__(self, name:str, lib:bytes):
       # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
       self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
 
+      if not hasattr(CPUProgram, "rt_lib"):  # load once per process, on every platform, so the __clear_cache call below is not skipped
+        CPUProgram.rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
       if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
       self.mem.write(lib)
       if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)
@@ -311,19 +312,33 @@ def __init__(self, name:str, lib:bytes):
       # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
       # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
       # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
-      CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
+      if hasattr(CPUProgram, "rt_lib"):  # the __clear_cache call is skipped when rt_lib was never loaded
+        CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
 
       self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
 
   def __call__(self, *bufs, vals=(), wait=False):
     args = list(bufs) + list(vals)
+    # debug aid: with SAVE_BYTES set, hex-dump every buffer except the first before the program runs
+    if os.environ.get("SAVE_BYTES"):
+      for i, b in enumerate(bufs[1:], start=1):
+        print(f"Data {i}:")
+        print(", ".join(f"0x{_b:02x}" for _b in bytes(b)))
+        print()
     # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
     # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
     # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
     # This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
     # The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
     if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
-    return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
+    ret = cpu_time_execution(lambda: self.fxn(*args), enable=wait)
+    # with SAVE_BYTES set, hex-dump the first buffer after the program has run
+    if os.environ.get("SAVE_BYTES"):
+      for i, b in enumerate(bufs[:1]):
+        print(f"Data {i}:")
+        print(", ".join(f"0x{_b:02x}" for _b in bytes(b)))
+        print()
+    return ret  # propagate cpu_time_execution's result, as the line this hunk replaces did
 
   def __del__(self):
     if sys.platform == 'win32': ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE

diff --git a/tinygrad/opt/heuristic.py b/tinygrad/opt/heuristic.py
@@ -81,7 +81,7 @@ def has_expanded_axis(shape, strides): return any(resolve(s > 1) and not resolve
   # if last reduce dim is small(ish), loop unroll the reduce
   upcast_size = prod(k.full_shape[a] for a in k.axes_of(AxisType.UPCAST, AxisType.UNROLL))
   if k.unrollable_dims and (upcast_size <= 4 or not k.axes_of(AxisType.UNROLL)) and (upcast_size < 64):
-    if (s:=k.full_shape[k.unrollable_dims[-1]]) <= 32:
+    if (s:=k.full_shape[k.unrollable_dims[-1]]) <= 8:  # NOTE(review): unroll threshold lowered from 32 to 8 - confirm this is intended for all backends
       k.apply_opt(Opt(OptOps.UNROLL, k.unrollable_dims[-1]-k.first_reduce, 0))
       # if it's small, upcast a second reduce dimension too
       if k.unrollable_dims and s <= 3 and k.full_shape[k.unrollable_dims[-1]] <= 3: