diff --git a/20241129_viz.md b/20241129_viz.md index a559b47..60c24e8 100644 --- a/20241129_viz.md +++ b/20241129_viz.md @@ -20,13 +20,14 @@ will allow us to see the generated code, and NOOPT tells tinygrad not to enable see the generated metal/cuda code: ```c++ -kernel void r_4(device float* data0, device float* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) { - float acc0 = 0.0f; - for (int ridx0 = 0; ridx0 < 4; ridx0++) { - float val0 = *(data1+ridx0); - acc0 = (acc0+val0); +kernel void r_4(device float* data0_1, device float* data1_4, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) { + float acc0[1]; + *(acc0+0) = 0.0f; + for (int ridx1000 = 0; ridx1000 < 4; ridx1000++) { + float val0 = (*(data1_4+ridx1000)); + *(acc0+0) = ((*(acc0+0))+val0); } - *(data0+0) = acc0; + *(data0_1+0) = (*(acc0+0)); } ``` @@ -35,9 +36,9 @@ of vectorized data type, and remove the loop. In fact, that's what the optimizat on this time `DEBUG=5 python script.py`: ```c++ -kernel void r_4(device float* data0, device float* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) { - float4 val0 = *((device float4*)((data1+0))); - *(data0+0) = (val0.w+val0.z+val0.x+val0.y); +kernel void r_4(device float* data0_1, device float* data1_4, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]) { + float4 val0 = (*((device float4*)((data1_4+0)))); + *(data0_1+0) = (val0.x+val0.y+val0.z+val0.w); } ``` diff --git a/20241217_st.md b/20241217_st.md index dc18567..344a282 100644 --- a/20241217_st.md +++ b/20241217_st.md @@ -1,5 +1,10 @@ # Shapetracker +> [!NOTE] +> Shapetracker has since been removed from Tinygrad via commit 1d1e1d9d88b93db07be2244cc087f6786a5705fc. This document will stay in place for archival purposes. A replacement has been introduced - RANGEIFY. 
+> +> Reasoning for the removal: https://x.com/__tinygrad__/status/1964037572503752910 + > examples were tested on commit a2a4ff30dcfafc8e7763303e9d8f0955900e8617 (in case things go out of date) Suppose you have a 2 by 2 matrix containing values 1, 2, 3, 4. These four numbers are not stored in memory as a matrix,