801 changes: 801 additions & 0 deletions test/test_ops_asm.py

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions test/test_ops_sass.py
@@ -0,0 +1,100 @@
import time, math, unittest, functools, os, torch
import numpy as np
from typing import List, Callable
import warnings
from tinygrad.helpers import DISABLE_COMPILER_CACHE, getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, DEVECTORIZE, OSX, AMD_LLVM
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _to_np_dtype
from tinygrad.device import is_dtype_supported

def skipU(flag: str):
  if os.environ.get(flag):
    return lambda func: func
  return unittest.skip("")
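# Illustrative usage of skipU (the flag name below is hypothetical, not from this PR):
#   @skipU("RUN_SASS_TESTS")
#   def test_something(self): ...
# runs the test only when RUN_SASS_TESTS is set in the environment; otherwise
# unittest.skip("") marks it as skipped.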

if getenv("TINY_BACKEND"):
import tinygrad.frontend.torch # noqa: F401 # pylint: disable=unused-import
torch.set_default_device("tiny")

FORWARD_ONLY = getenv("FORWARD_ONLY", 1)
PRINT_TENSORS = getenv("PRINT_TENSORS", 0)
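# Environment knobs read at import time (descriptive note, added for clarity): FORWARD_ONLY
# skips the backward-pass comparison (it defaults to 1 here), PRINT_TENSORS makes compare()
# dump both outputs, and INPUT_BYTES (checked in prepare_test_op) prints the first input's
# raw bytes as hex.
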
def helper_test_op(shps, torch_fxn, tinygrad_fxn=None, atol=1e-6, rtol=1e-3, grad_atol=1e-4, grad_rtol=1e-3,
                   forward_only=False, vals=None, low=-2, high=2):
  if tinygrad_fxn is None: tinygrad_fxn = torch_fxn
  ts, tst = prepare_test_op(low, high, shps, vals, forward_only)

  st = time.monotonic()
  out = torch_fxn(*ts)
  torch_fp = time.monotonic() - st

  # move inputs to a different device, then check that the devices of intermediate tensors are correct
  #if mt:=getenv("MOVE_TENSOR", ""): for t in tst: t.to_(mt)

  st = time.monotonic()
  ret = tinygrad_fxn(*tst).realize()
  tinygrad_fp = time.monotonic() - st

  def compare(s, tinygrad_output, torch_output, atol, rtol):
    if PRINT_TENSORS: print(s, tinygrad_output, torch_output)
    try:
      assert tinygrad_output.shape == torch_output.shape, f"shape mismatch: tinygrad={tinygrad_output.shape} | torch={torch_output.shape}"
      assert tinygrad_output.dtype == torch_output.dtype, f"dtype mismatch: tinygrad={tinygrad_output.dtype} | torch={torch_output.dtype}"
      if np.issubdtype(tinygrad_output.dtype, np.floating):
        np.testing.assert_allclose(tinygrad_output, torch_output, atol=atol, rtol=rtol)
      else:
        np.testing.assert_equal(tinygrad_output, torch_output)
    except Exception as e:
      raise Exception(f"{s} failed shape {tinygrad_output.shape}: {e}")

  if DEBUG >= 6:
    np.set_printoptions(linewidth=200, suppress=True)
    print(ret.numpy())
    print(out.detach().cpu().numpy())
  compare("forward pass", ret.numpy(), out.detach().cpu().numpy(), atol=atol, rtol=rtol)

  torch_fbp, tinygrad_fbp = np.nan, np.nan
  if not forward_only and not FORWARD_ONLY and ts and tst:
    st = time.monotonic()
    torch_grads = torch.autograd.grad(torch_fxn(*ts).sum(), ts)
    torch_fbp = time.monotonic() - st

    st = time.monotonic()
    # NOTE: we now have to recompute the forward pass since we realized it
    tiny_grads = tinygrad_fxn(*tst).sum().gradient(*tst)
    Tensor.realize(*tiny_grads)
    tinygrad_fbp = time.monotonic() - st

    for i, (t, torch_grad) in enumerate(zip(tiny_grads, torch_grads)):
      compare(f"backward pass tensor {i}", t.numpy(), torch_grad.detach().cpu().numpy(), atol=grad_atol, rtol=grad_rtol)

  if not CI:
    print("\ntesting %40r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms " % \
      (shps, torch_fp*1000, tinygrad_fp*1000, torch_fbp*1000, tinygrad_fbp*1000), end="")

def prepare_test_op(low, high, shps, vals, forward_only=False):
  if shps is None:
    ts = [torch.tensor(x, requires_grad=(not forward_only)) for x in vals]
  else:
    np.random.seed(0)
    np_data = [np.random.uniform(low=low, high=high, size=size).astype(_to_np_dtype(dtypes.default_float)) for size in shps]
    if os.environ.get("INPUT_BYTES"):
      print(f"{np_data=}")
      b = np_data[0].tobytes()
      for _b in b: print(f"{_b:#x}", end=", ")
      print()
    ts = [torch.tensor(data, requires_grad=(not forward_only)) for data in np_data]
  for i in range(len(ts)):
    # NOTE: torch defaults to int64 for python int inputs
    if ts[i].dtype == torch.int64: ts[i] = ts[i].type(torch.int32)
  tst = [Tensor(x.detach().cpu().numpy(), requires_grad=(not forward_only and not FORWARD_ONLY)) for x in ts]
  return ts, tst

class TestOps(unittest.TestCase):
  def test_mul(self):
    with Context(DEBUG=0):
      _a = np.array([1.2, 1.3, 1.4]).astype(np.float32)
      _b = np.array([1.2, 1.3, 1.4]).astype(np.float32)
      a = Tensor(_a).realize()
      b = Tensor(_b).realize()
      np.testing.assert_equal((a*b).numpy(), _a*_b)
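
  # A minimal usage sketch of helper_test_op above (added for illustration, not part of the
  # original PR); the shapes and the elementwise op are arbitrary choices.
  def test_add_helper_example(self):
    helper_test_op([(45,65), (45,65)], lambda x,y: x+y)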
2 changes: 1 addition & 1 deletion tinygrad/helpers.py
@@ -135,7 +135,7 @@ def __lt__(self, x): return self.value < x
SPLIT_REDUCEOP, NO_MEMORY_PLANNER, RING = ContextVar("SPLIT_REDUCEOP", 1), ContextVar("NO_MEMORY_PLANNER", 0), ContextVar("RING", 1)
PICKLE_BUFFERS, PROFILE, LRU = ContextVar("PICKLE_BUFFERS", 1), ContextVar("PROFILE", getenv("VIZ")), ContextVar("LRU", 1)
CACHELEVEL, IGNORE_BEAM_CACHE, DEVECTORIZE = ContextVar("CACHELEVEL", 2), ContextVar("IGNORE_BEAM_CACHE", 0), ContextVar("DEVECTORIZE", 1)
-DISABLE_COMPILER_CACHE = ContextVar("DISABLE_COMPILER_CACHE", 0)
+DISABLE_COMPILER_CACHE = ContextVar("DISABLE_COMPILER_CACHE", 1)
DONT_REALIZE_EXPAND, DONT_GROUP_REDUCES = ContextVar("DONT_REALIZE_EXPAND", 0), ContextVar("DONT_GROUP_REDUCES", 0)
QUANTIZE, VALIDATE_WITH_CPU = ContextVar("QUANTIZE", 0), ContextVar("VALIDATE_WITH_CPU", 0)
CORRECT_DIVMOD_FOLDING, FUSE_OPTIM = ContextVar("CORRECT_DIVMOD_FOLDING", 0), ContextVar("FUSE_OPTIM", 0)
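
A quick sketch of what the flipped default means in practice (added for illustration; it assumes tinygrad's usual ContextVar/Context semantics and is not part of the diff):

from tinygrad.helpers import DISABLE_COMPILER_CACHE, Context

print(DISABLE_COMPILER_CACHE.value)       # now 1 by default, unless DISABLE_COMPILER_CACHE=0 is set in the environment
with Context(DISABLE_COMPILER_CACHE=0):   # re-enable the compiler cache for a scoped block
  print(DISABLE_COMPILER_CACHE.value)     # 0 inside the block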