Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions include/ml_lib/core/transformer-block.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once
#include "../math/matrix.h"
#include "attention-layer.h"
#include "layer-norm.h"
#include "neural-network-layer.h"
#include "optimizer.h"

/// A single transformer block: one attention sub-layer and a two-layer
/// feed-forward sub-layer, each paired with its own LayerNorm.
class TransformerBlock {
public:
    /// @param embed_dim  width passed to the attention layer, both norms and
    ///                   the outer side of the feed-forward pair
    /// @param num_heads  head count passed to the attention layer
    /// @param ff_dim     width shared between the two feed-forward layers
    TransformerBlock(int embed_dim, int num_heads, int ff_dim);

    /// Runs the block on `input` and returns the block's output.
    Matrix forward(const Matrix& input);

    /// Takes the gradient with respect to the block's output and returns the
    /// gradient with respect to the block's input.
    Matrix backward(const Matrix& grad_output);

    /// Forwards `opt` to the update() of every sub-layer.
    void update(Optimizer* opt);

private:
    // Sub-layers. Data members are constructed in declaration order, so keep
    // this order in sync with the constructor's initialiser list.
    AttentionLayer attention;
    LayerNorm norm1;
    LayerNorm norm2;
    NeuralNetworkLayer ff1;
    NeuralNetworkLayer ff2;

    // Copies of intermediate activations stored by forward().
    Matrix attention_input_cache;
    Matrix ff_input_cache;
};
36 changes: 36 additions & 0 deletions include/ml_lib/models/transformer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#pragma once
#include "../math/matrix.h"
#include "../core/embedding-layer.h"
#include "../core/sin-pos-encode.h"
#include "../core/transformer-block.h"
#include "../core/neural-network-layer.h"
#include "gradient-model.h"
#include <vector>
#include <memory>

/// Token-level transformer model built on GradientModel: an embedding layer,
/// a sinusoidal positional encoding, a stack of TransformerBlock objects and
/// a final projection layer.
class Transformer : public GradientModel {
public:
    /// @param vocab_size   passed to the embedding layer and used as the
    ///                     output width of the final projection
    /// @param embed_dim    width shared by embedding, encoding and blocks
    /// @param num_heads    head count for every block
    /// @param num_layers   number of TransformerBlock objects to create
    /// @param ff_dim       feed-forward width for every block
    /// @param max_seq_len  passed to the positional encoding
    /// @param loss, opt, reg  ownership is handed to the GradientModel base
    Transformer(int vocab_size, int embed_dim, int num_heads,
                int num_layers, int ff_dim, int max_seq_len,
                std::unique_ptr<LossFunction> loss,
                std::unique_ptr<Optimizer> opt,
                std::unique_ptr<Regularizer> reg);

    /// Matrix entry point required by GradientModel.
    Matrix forward(const Matrix& X) override;

    /// Token-id entry point.
    Matrix forward(const std::vector<int>& tokens);

    /// Back-propagates from the targets `y_true`.
    void backward(const Matrix& y_true) override;

    /// Applies the model's optimizer to every trainable component.
    void update() override;

private:
    // Hyper-parameters copied from the constructor arguments.
    int vocab_size;
    int embed_dim;
    int max_seq_len;

    // Layers. Data members are constructed in declaration order, so keep
    // this order in sync with the constructor's initialiser list.
    EmbeddingLayer embedding;
    SinPositionalEncoding pos_encoding;
    std::vector<std::shared_ptr<TransformerBlock>> blocks;
    NeuralNetworkLayer output_projection;

    // State remembered from the most recent forward() call.
    std::vector<int> last_token_input;
    Matrix last_logits;
};
59 changes: 59 additions & 0 deletions source/core/transformer-block.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include "ml_lib/core/transformer-block.h"

// Constructs every sub-layer of the block. The feed-forward pair goes
// embed_dim -> ff_dim with a ReLU activation and ff_dim -> embed_dim with a
// linear activation. The initialisers are listed in member declaration order.
TransformerBlock::TransformerBlock(int embed_dim, int num_heads, int ff_dim)
    : attention(embed_dim, num_heads)
    , norm1(embed_dim)
    , norm2(embed_dim)
    , ff1(embed_dim, ff_dim, ACTIVATION_FUNC::RELU)
    , ff2(ff_dim, embed_dim, ACTIVATION_FUNC::LINEAR)
{}

// Forward pass of the block:
//   h = norm1(input + attention(input))
//   y = norm2(h + ff2(ff1(h)))
// The block input and h are also copied into the member caches.
Matrix TransformerBlock::forward(const Matrix& input)
{
    attention_input_cache = input;

    // Attention sub-layer, then residual add and first normalisation.
    Matrix attended = attention.forward(input);
    Matrix attn_sum = input + attended;
    Matrix hidden = norm1.forward(attn_sum);

    ff_input_cache = hidden;

    // Feed-forward sub-layer, then residual add and second normalisation.
    Matrix ff_hidden = ff1.forward(hidden);
    Matrix ff_result = ff2.forward(ff_hidden);
    Matrix ff_sum = hidden + ff_result;

    return norm2.forward(ff_sum);
}

// Backward pass of the block; mirrors forward() in reverse order.
// Each residual connection contributes two gradient paths (the skip path and
// the sub-layer path), which are summed before continuing.
Matrix TransformerBlock::backward(const Matrix& grad_output)
{
    // Gradient at the second residual sum (the input of norm2).
    Matrix d_ff_sum = norm2.backward(grad_output);

    // Path through the feed-forward pair, last layer first.
    Matrix d_ff_hidden = ff2.backward(d_ff_sum);
    Matrix d_ff_path = ff1.backward(d_ff_hidden);

    // Gradient at the output of norm1: skip path + feed-forward path.
    Matrix d_hidden = d_ff_sum + d_ff_path;

    // Gradient at the first residual sum (the input of norm1).
    Matrix d_attn_sum = norm1.backward(d_hidden);

    // Path through the attention sub-layer.
    Matrix d_attn_path = attention.backward(d_attn_sum);

    // Gradient at the block input: skip path + attention path.
    return d_attn_sum + d_attn_path;
}

// Passes `opt` to the update() of each sub-layer, in the order
// attention, norm1, ff1, ff2, norm2. The pointer is forwarded as given and
// is not checked here.
void TransformerBlock::update(Optimizer* opt)
{
    // Attention sub-layer and its normalisation.
    attention.update(opt);
    norm1.update(opt);

    // Feed-forward pair and its normalisation.
    ff1.update(opt);
    ff2.update(opt);
    norm2.update(opt);
}
67 changes: 67 additions & 0 deletions source/models/transformer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include "ml_lib/models/transformer.h"

// Builds the model: hands loss/optimizer/regularizer to the GradientModel
// base, constructs the embedding, positional encoding and softmax output
// projection, then appends `num_layers` identically configured blocks.
Transformer::Transformer(int vocab_size, int embed_dim, int num_heads,
                         int num_layers, int ff_dim, int max_seq_len,
                         std::unique_ptr<LossFunction> loss,
                         std::unique_ptr<Optimizer> opt,
                         std::unique_ptr<Regularizer> reg)
    : GradientModel(std::move(loss), std::move(opt), std::move(reg))
    , vocab_size(vocab_size)
    , embed_dim(embed_dim)
    , max_seq_len(max_seq_len)
    , embedding(vocab_size, embed_dim)
    , pos_encoding(embed_dim, max_seq_len)
    , output_projection(embed_dim, vocab_size, ACTIVATION_FUNC::SOFTMAX)
{
    // A non-positive num_layers leaves the stack empty.
    for (int layer = 0; layer < num_layers; ++layer) {
        auto block = std::make_shared<TransformerBlock>(embed_dim, num_heads, ff_dim);
        blocks.emplace_back(std::move(block));
    }
}

// Forward pass from token ids: embedding -> positional encoding -> every
// block in order -> output projection. The tokens and the projection result
// are remembered (last_token_input, last_logits, last_output).
Matrix Transformer::forward(const std::vector<int>& tokens)
{
    last_token_input = tokens;

    Matrix token_vectors = embedding.forward(tokens);
    Matrix hidden = pos_encoding.forward(token_vectors);

    for (const auto& layer : blocks) {
        hidden = layer->forward(hidden);
    }

    // Store the projection result in both members before returning it.
    last_logits = output_projection.forward(hidden);
    last_output = last_logits;
    return last_logits;
}

// Matrix overload of forward(): reads one token id per row of X and
// delegates to the token-vector overload.
//
// Only column 0 of X is read. Each value is converted with static_cast<int>,
// so any fractional part is truncated toward zero.
Matrix Transformer::forward(const Matrix& X)
{
    std::vector<int> tokens;

    // The final size is known up front; reserve once instead of letting
    // push_back grow the buffer repeatedly. Guarded so that a non-positive
    // row count never turns into a huge unsigned request.
    const auto row_count = X.rows();
    if (row_count > 0) {
        tokens.reserve(static_cast<std::vector<int>::size_type>(row_count));
    }

    for (int i = 0; i < X.rows(); i++) {
        tokens.push_back(static_cast<int>(X(i, 0)));
    }
    return forward(tokens);
}

void Transformer::backward(const Matrix& y_true)
{
Matrix grad = loss_func->gradient(last_logits, y_true);

grad = output_projection.backward(grad);

for (int i = blocks.size() - 1; i >= 0; i--) {
grad = blocks[i]->backward(grad);
}

embedding.backward(grad);
}

void Transformer::update()
{
embedding.update(optimizer.get());
for (auto& block : blocks) {
block->update(optimizer.get());
}
output_projection.update(optimizer.get());
}
Loading