-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtensor.py
More file actions
112 lines (83 loc) · 2.96 KB
/
tensor.py
File metadata and controls
112 lines (83 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from typing import Tuple
import dataclasses as dc
import numpy as np
import compute_graph
import tile
import vector
@dc.dataclass
class Tensor(compute_graph.Data):
    """A memory buffer stored in DRAM.

    Attributes:
        shape: vector op whose computed value is this tensor's dimensions.
        dtype: element type of the buffer.
    """

    shape: vector.VectorOp
    dtype: type

    def set_name(self, num_allocations: int) -> None:
        """Assign a unique name derived from the allocation counter."""
        self.name = f"t{num_allocations}"

    def referenced_ops(self) -> Tuple[compute_graph.Operator]:
        """Return the graph ops this tensor depends on: just its shape op."""
        return (self.shape,)
@dc.dataclass
class TensorOp(compute_graph.Operator):
    # Marker base class for graph operators whose output is a Tensor.
    pass
class Allocate(TensorOp):
    """Operator that allocates a new, unnamed DRAM tensor."""

    def __init__(self, shape: vector.VectorOp, dtype: np.dtype):
        # __init__ must not return a value; the original
        # `return super().__init__(...)` only worked because __init__
        # returns None.
        super().__init__(
            out=Tensor("<unnamed>", shape, dtype),
            input_ops=(shape,),
        )

    def extra_args(self) -> str:
        """Render the dtype for graph printing, e.g. "dtype=float32"."""
        # Single f-string instead of the original no-placeholder
        # f-string + concatenation.
        return f"dtype={np.dtype(self.out.dtype).name}"
class Return(TensorOp):
    """Marks the output tensor of `t` as a result of the graph."""

    def __init__(self, t: TensorOp):
        # No `return` here: __init__ must return None (the original's
        # `return super().__init__(...)` was a latent anti-pattern).
        super().__init__(out=t.out, input_ops=(t,))
class Add(TensorOp):
    """Elementwise addition of two tensors into a freshly allocated output."""

    def __init__(self, in_a: TensorOp, in_b: TensorOp):
        # Output takes shape/dtype from the first operand; assumes both
        # inputs share a shape -- TODO confirm upstream validation.
        new_t = Allocate(in_a.out.shape, in_a.out.dtype)
        super().__init__(out=new_t.out, input_ops=(in_a, in_b, new_t))

    def generate(self) -> tile.TileOp:
        """Lower to a tile-level scan applying tile.Add over the operands."""
        # Scan is a dataclass whose required `fixed_tiles` field the
        # original positional call omitted (it would raise TypeError).
        # Add needs no accumulator, so pass an empty tuple.
        return Scan(
            tile.Add,
            tensors=(self.input_ops[0].out, self.input_ops[1].out, self.out),
            fixed_tiles=(),
        )
class SoftMax(TensorOp):
    # Softmax of `t` into a newly allocated tensor of the same shape/dtype.
    def __init__(self, t: TensorOp):
        new_t = Allocate(t.out.shape, t.out.dtype)
        super().__init__(out=new_t.out, input_ops=(t, new_t))
    def generate(self) -> tile.TileOp:
        # Accumulator spans all but the last axis, i.e. one running value
        # per reduced row.
        # NOTE(review): dtype=np.int32 looks wrong for an exp-sum
        # accumulator (softmax needs floating point) -- confirm intent.
        accumulator_tile = tile.Tile(self.out.shape[:-1], dtype=np.int32)
        # NOTE(review): `self.ins` is not defined anywhere in this file;
        # the other generate() methods read self.input_ops -- verify.
        (in_a,) = self.ins
        return tile.WithTemporaryTile(
            accumulator_tile,
            # Pass 1: write exp(x) into the output while accumulating the
            # per-row sum into the temporary tile.
            Scan(
                tile.ExpSum, tensors=(in_a.out, self.out), fixed_tiles=accumulator_tile
            ),
            # Pass 2: divide each output element by its row's sum, in place.
            Scan(tile.DivideInPlace, tensors=(self.out,), fixed_tiles=accumulator_tile),
        )
class MatMul(TensorOp):
    """Matrix multiplication of two tensors into a freshly allocated output."""

    def __init__(self, in_a: TensorOp, in_b: TensorOp):
        # Create a new shape vector whose value is
        #   out_shape = in_a.shape[:-1] + (in_b.shape[-1],)
        # `rank` replaces the original local named `len`, which shadowed
        # the builtin.
        rank = in_a.out.shape.out.length
        out_shape = vector.Copy(
            # Inner copy: leading rank-1 dims come from in_a's shape.
            dst=vector.Copy(
                dst=vector.Allocate(rank, in_a.out.shape.out.dtype),
                src=in_a.out.shape,
                size=rank - 1,
            ),
            # Outer copy: the final dim comes from in_b's last dim.
            dst_offset=rank - 1,
            src=in_b.out.shape,
            src_offset=rank - 1,
            size=1,
        )
        new_t = Allocate(out_shape, in_a.out.dtype)
        super().__init__(out=new_t.out, input_ops=(in_a, in_b, new_t))

    def generate(self) -> tile.TileOp:
        """Lower to a tile-level matmul scan."""
        # ScanForMatMul is a dataclass with fields
        # (op, tile_in_a, tile_in_b, tile_out); the original packed the
        # three tensors into a single tuple, binding it to tile_in_a and
        # raising TypeError for the missing fields. Pass them unpacked.
        return ScanForMatMul(
            tile.MatMul, self.input_ops[0].out, self.input_ops[1].out, self.out
        )
@dc.dataclass
class Scan(tile.TileOp):
    # Tile-level loop applying `op` across corresponding tiles of `tensors`,
    # threading `fixed_tiles` (e.g. an accumulator) through every step.
    op: tile.TileOp
    # Variadic in practice: call sites in this file pass 1-3 tensors, so the
    # annotation is widened from the original 1-tuple Tuple[Tensor].
    tensors: Tuple[Tensor, ...]
    # NOTE(review): annotated as a tuple, but SoftMax.generate passes a bare
    # tile.Tile here -- confirm the intended type.
    fixed_tiles: Tuple[tile.Tile]
    # Defaults to None, so effectively Optional[Tuple[int, ...]]; presumably
    # filled in later during lowering -- verify where it is set.
    _tile_shape: Tuple[int] = None
@dc.dataclass
class ScanForMatMul(tile.TileOp):
    # Tile-level loop specialized for matrix multiplication: holds the tile
    # op plus the two input tensors and the output tensor it scans over.
    op: tile.TileOp
    tile_in_a: Tensor
    tile_in_b: Tensor
    tile_out: Tensor