2 changes: 2 additions & 0 deletions .gitignore
@@ -1 +1,3 @@
.ipynb_checkpoints/
__pycache__/
test/*.json
109 changes: 91 additions & 18 deletions micrograd/engine.py
@@ -1,94 +1,167 @@

class Value:
""" stores a single scalar value and its gradient """
"""stores a single scalar value and its gradient"""

def __init__(self, data, _children=(), _op=''):
def __init__(self, data, _children=(), _op=""):
self.data = data
self.grad = 0
# internal variables used for autograd graph construction
self._backward = lambda: None
self._prev = set(_children)
self._op = _op # the op that produced this node, for graphviz / debugging / etc
self._op = _op # the op that produced this node, for graphviz / debugging / etc

def __add__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data + other.data, (self, other), '+')
out = Value(self.data + other.data, (self, other), "+")

def _backward():
self.grad += out.grad
other.grad += out.grad

out._backward = _backward

return out

def __mul__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data * other.data, (self, other), '*')
out = Value(self.data * other.data, (self, other), "*")

def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad

out._backward = _backward

return out

def __pow__(self, other):
assert isinstance(other, (int, float)), "only supporting int/float powers for now"
out = Value(self.data**other, (self,), f'**{other}')
assert isinstance(other, (int, float)), (
"only supporting int/float powers for now"
)
# keep the exponent in _op so Value.data_(data) can rebuild the pow backward pass
out = Value(self.data**other, (self,), f"**{other}")

def _backward():
self.grad += (other * self.data**(other-1)) * out.grad
self.grad += (other * self.data ** (other - 1)) * out.grad

out._backward = _backward

return out

def relu(self):
out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')
out = Value(0 if self.data < 0 else self.data, (self,), "ReLU")

def _backward():
self.grad += (out.data > 0) * out.grad

out._backward = _backward

return out

def backward(self):

# topological order all of the children in the graph
topo = []
visited = set()

def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)

build_topo(self)

# go one variable at a time and apply the chain rule to get its gradient
self.grad = 1
for v in reversed(topo):
v._backward()

def __neg__(self): # -self
def __neg__(self): # -self
return self * -1

def __radd__(self, other): # other + self
def __radd__(self, other): # other + self
return self + other

def __sub__(self, other): # self - other
def __sub__(self, other): # self - other
return self + (-other)

def __rsub__(self, other): # other - self
def __rsub__(self, other): # other - self
return other + (-self)

def __rmul__(self, other): # other * self
def __rmul__(self, other): # other * self
return self * other

def __truediv__(self, other): # self / other
def __truediv__(self, other): # self / other
return self * other**-1

def __rtruediv__(self, other): # other / self
def __rtruediv__(self, other): # other / self
return other * self**-1

def __repr__(self):
return f"Value(data={self.data}, grad={self.grad})"

# SAVE: base object structure as json
def _data(self):
"""
Return a dict representing this Value and its computation graph.
"""
return {
"d": self.data,
"o": self._op,
"c": [child._data() for child in self._prev],
"g": self.grad,
}

@staticmethod
def data_(data):
"""
Create a Value from a saved dictionary (as from _data).
"""
inst = Value(data["d"], [], data["o"])
inst.grad = data["g"]
if not data["c"]:
return inst
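# use a list (not a set) so the indexed closures below see a stable child order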
inst._prev = []
for cd in data["c"]:
inst._prev.append(Value.data_(cd))

# recreate the _backward closure from the operator and the restored children
backward = lambda: None
match data["o"]:
case "*":

def back():
inst._prev[0].grad += inst._prev[1].data * inst.grad
inst._prev[1].grad += inst._prev[0].data * inst.grad

backward = back
case "+":

def back():
inst._prev[0].grad += 1.0 * inst.grad
inst._prev[1].grad += 1.0 * inst.grad

backward = back
case "**":

def back():
inst._prev[0].grad += (
inst.data * inst._prev[0].data ** (inst.data - 1)
) * inst.grad

backward = back
case "tanh":

def back():
inst._prev[0].grad += (1 - (inst.data) ** 2) * inst.grad

backward = back
case "ReLU":

def back():
inst._prev[0].grad += (inst.data > 0) * inst.grad

backward = back

inst._backward = backward
return inst
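To make the round trip concrete, here is a minimal usage sketch of the new _data/data_ pair (the variable names are illustrative, and it assumes the exponent is kept in _op as above):

from micrograd.engine import Value

a = Value(2.0)
b = Value(3.0)
c = a * b + a**2  # c.data == 10.0
c.backward()  # dc/da = b + 2a = 7, dc/db = a = 2

saved = c._data()  # a plain dict: {"d": 10.0, "o": "+", "c": [...], "g": 1}
restored = Value.data_(saved)

assert restored.data == c.data
assert restored.grad == c.grad

Note that _data walks the graph as a tree, so a node reachable along two paths (like a here) is serialized twice; the restored copies are independent Value objects rather than one shared node.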
95 changes: 88 additions & 7 deletions micrograd/nn.py
@@ -1,34 +1,54 @@
import json
import os
import random

from micrograd.engine import Value

class Module:

class Module:
def zero_grad(self):
for p in self.parameters():
p.grad = 0

def parameters(self):
return []

class Neuron(Module):

class Neuron(Module):
def __init__(self, nin, nonlin=True):
self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
self.b = Value(0)
self.nonlin = nonlin

def __call__(self, x):
act = sum((wi*xi for wi,xi in zip(self.w, x)), self.b)
act = sum((wi * xi for wi, xi in zip(self.w, x)), self.b)
return act.relu() if self.nonlin else act

def parameters(self):
return self.w + [self.b]

# SAVE neuron data
def _data(self):
return {
"b": self.b._data(),
"il": self.nonlin,
"w": [cw._data() for cw in self.w],
}

# LOAD neuron from saved data
@staticmethod
def data_(data):
inst = Neuron(0, nonlin=data["il"])
inst.w = [Value.data_(rw) for rw in data["w"]]
inst.b = Value.data_(data["b"])

return inst

def __repr__(self):
return f"{'ReLU' if self.nonlin else 'Linear'}Neuron({len(self.w)})"

class Layer(Module):

class Layer(Module):
def __init__(self, nin, nout, **kwargs):
self.neurons = [Neuron(nin, **kwargs) for _ in range(nout)]

@@ -39,14 +59,29 @@ def __call__(self, x):
def parameters(self):
return [p for n in self.neurons for p in n.parameters()]

# SAVE layer
def _data(self):
return {"ns": [n._data() for n in self.neurons]}

# LOAD layer
@staticmethod
def data_(data):
inst = Layer(0, 0)
inst.neurons = [Neuron.data_(nr) for nr in data["ns"]]

return inst

def __repr__(self):
return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"

class MLP(Module):

class MLP(Module):
def __init__(self, nin, nouts):
sz = [nin] + nouts
self.layers = [Layer(sz[i], sz[i+1], nonlin=i!=len(nouts)-1) for i in range(len(nouts))]
self.layers = [
Layer(sz[i], sz[i + 1], nonlin=i != len(nouts) - 1)
for i in range(len(nouts))
]

def __call__(self, x):
for layer in self.layers:
@@ -58,3 +93,49 @@ def parameters(self):

def __repr__(self):
return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"

# optional helper that automates a simple training loop
def fit(self, X, y, epochs=1000, lr=0.001):
for epoch in range(epochs):
# Forward pass
out = [self(x) for x in X] # each self(x) returns a single Value

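# mean squared error over the dataset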
loss = sum((ya - yp) ** 2 for yp, ya in zip(out, y)) / len(y)

loss.backward() # loss is a Value, so this backpropagates to every parameter

for param in self.parameters():
param.data -= lr * param.grad # Gradient descent update
param.grad = 0 # Reset gradients for next iteration

if epoch % 100 == 0:
print(f"Epoch: {epoch}, Loss: {loss.data}")

# SAVE mlp
def _data(self):
return {"ls": [ly._data() for ly in self.layers]}

# LOAD mlp
@staticmethod
def data_(data):
inst = MLP(0, [])
inst.layers = [Layer.data_(ly) for ly in data["ls"]]
return inst

# Interface point for saving model
def save(self, fp: str):
if fp == "" or fp is None:
raise ValueError("File path is not specified")

with open(fp, "w", encoding="utf-8") as f:
json.dump(self._data(), f, indent=2)
print("SUCCESS! Model saved!")

# Interface point for loading the model
@staticmethod
def load(fp: str):
if not os.path.exists(fp):
raise FileNotFoundError(fp)
with open(fp, "r", encoding="utf-8") as f:
contents = json.load(f)
return MLP.data_(contents)
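Here is an end-to-end sketch of the new training/saving/loading surface (the toy dataset and the file name model.json are made up for illustration):

import random

from micrograd.nn import MLP

random.seed(42)

# tiny toy dataset: y = 2*x0 - x1
X = [[0.5, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
y = [0.0, 2.0, -1.0, 1.0]

model = MLP(2, [4, 1])
model.fit(X, y, epochs=500, lr=0.05)

model.save("model.json")  # serializes every layer, neuron, and weight as JSON
clone = MLP.load("model.json")  # rebuilds the same structure and weights

for x in X:
    assert abs(model(x).data - clone(x).data) < 1e-9

One caveat: save also stores each parameter's grad, so it is worth calling zero_grad() on a loaded model before resuming training, since fit accumulates into whatever grads are already present.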