From daf4dfcdf952a5ead733a7ef3786172b47bdc254 Mon Sep 17 00:00:00 2001
From: ProdigiousPersonn
Date: Thu, 5 Feb 2026 19:27:28 -0800
Subject: [PATCH 1/2] Transformer Block

---
Review notes: line structure restored. The two Matrix caches that forward()
filled but nothing in this series read have been dropped. The blob ids on the
index lines are the ones from the original submission.

 include/ml_lib/core/transformer-block.h | 28 ++++++++++++
 source/core/transformer-block.cpp       | 55 +++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 include/ml_lib/core/transformer-block.h
 create mode 100644 source/core/transformer-block.cpp

diff --git a/include/ml_lib/core/transformer-block.h b/include/ml_lib/core/transformer-block.h
new file mode 100644
index 0000000..41f3f9d
--- /dev/null
+++ b/include/ml_lib/core/transformer-block.h
@@ -0,0 +1,28 @@
+#pragma once
+#include "../math/matrix.h"
+#include "attention-layer.h"
+#include "layer-norm.h"
+#include "neural-network-layer.h"
+#include "optimizer.h"
+
+/// One transformer layer: a self-attention sub-layer and a two-layer
+/// feed-forward sub-layer, each followed by a residual add and a LayerNorm.
+class TransformerBlock {
+private:
+    AttentionLayer attention;
+    LayerNorm norm1;         // applied to input + attention output
+    LayerNorm norm2;         // applied to normed1 + feed-forward output
+    NeuralNetworkLayer ff1;  // embed_dim -> ff_dim, RELU
+    NeuralNetworkLayer ff2;  // ff_dim -> embed_dim, LINEAR
+
+public:
+    TransformerBlock(int embed_dim, int num_heads, int ff_dim);
+
+    /// Runs the block on input and returns the output of the second LayerNorm.
+    Matrix forward(const Matrix& input);
+    /// Back-propagates grad_output through the block; returns the gradient
+    /// with respect to the input of the matching forward() call.
+    Matrix backward(const Matrix& grad_output);
+    /// Passes opt to the update() of every sub-layer.
+    void update(Optimizer* opt);
+};
diff --git a/source/core/transformer-block.cpp b/source/core/transformer-block.cpp
new file mode 100644
index 0000000..6c7b8f2
--- /dev/null
+++ b/source/core/transformer-block.cpp
@@ -0,0 +1,55 @@
+#include "ml_lib/core/transformer-block.h"
+
+// ff1 expands embed_dim -> ff_dim with RELU; ff2 projects ff_dim back to
+// embed_dim with a LINEAR activation.
+TransformerBlock::TransformerBlock(int embed_dim, int num_heads, int ff_dim)
+    : attention(embed_dim, num_heads),
+      norm1(embed_dim),
+      norm2(embed_dim),
+      ff1(embed_dim, ff_dim, ACTIVATION_FUNC::RELU),
+      ff2(ff_dim, embed_dim, ACTIVATION_FUNC::LINEAR)
+{
+}
+
+// Computes normed1 = norm1(input + attention(input)) and returns
+// norm2(normed1 + ff2(ff1(normed1))).
+Matrix TransformerBlock::forward(const Matrix& input)
+{
+    // Self-attention, then Add & Norm.
+    Matrix attn_out = attention.forward(input);
+    Matrix residual1 = input + attn_out;
+    Matrix normed1 = norm1.forward(residual1);
+
+    // Feed-forward, then Add & Norm.
+    Matrix ff_out = ff2.forward(ff1.forward(normed1));
+    Matrix residual2 = normed1 + ff_out;
+    return norm2.forward(residual2);
+}
+
+// Walks forward() in reverse. At each residual connection the gradient of
+// the skip path is added to the gradient coming back through the sub-layer.
+Matrix TransformerBlock::backward(const Matrix& grad_output)
+{
+    Matrix grad_norm2 = norm2.backward(grad_output);
+
+    // Feed-forward path (ff2, then ff1) plus its skip connection.
+    Matrix grad_ff2 = ff2.backward(grad_norm2);
+    Matrix grad_ff1 = ff1.backward(grad_ff2);
+    Matrix grad_residual1 = grad_norm2 + grad_ff1;
+
+    Matrix grad_norm1 = norm1.backward(grad_residual1);
+
+    // Attention path plus its skip connection.
+    Matrix grad_attn = attention.backward(grad_norm1);
+    return grad_norm1 + grad_attn;
+}
+
+// Hands the optimizer to every sub-layer's update().
+void TransformerBlock::update(Optimizer* opt)
+{
+    attention.update(opt);
+    norm1.update(opt);
+    ff1.update(opt);
+    ff2.update(opt);
+    norm2.update(opt);
+}
From 2ea1c9b2b01842d6a77a64182528b9d4c236bf24 Mon Sep 17 00:00:00 2001
From: ProdigiousPersonn
Date: Thu, 5 Feb 2026 19:27:45 -0800
Subject: [PATCH 2/2] Transformer

---
Review notes: line structure restored. Template arguments and the two
standard includes are reconstructed (see the NOTE(review) in transformer.h).
backward() now walks the blocks with reverse iterators, and the block and
token vectors reserve their capacity up front. The blob ids on the index
lines are the ones from the original submission.

 include/ml_lib/models/transformer.h | 48 ++++++++++++++
 source/models/transformer.cpp       | 84 ++++++++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 include/ml_lib/models/transformer.h
 create mode 100644 source/models/transformer.cpp

diff --git a/include/ml_lib/models/transformer.h b/include/ml_lib/models/transformer.h
new file mode 100644
index 0000000..da1dcc8
--- /dev/null
+++ b/include/ml_lib/models/transformer.h
@@ -0,0 +1,48 @@
+#pragma once
+#include "../math/matrix.h"
+#include "../core/embedding-layer.h"
+#include "../core/sin-pos-encode.h"
+#include "../core/transformer-block.h"
+#include "../core/neural-network-layer.h"
+#include "gradient-model.h"
+#include <memory>
+#include <vector>
+
+// NOTE(review): the template arguments below are reconstructed.
+// shared_ptr<TransformerBlock> follows from the make_shared call in
+// transformer.cpp and Optimizer from the update(Optimizer*) calls; int
+// tokens, LossFunction and Regularizer are assumptions - confirm against
+// embedding-layer.h and gradient-model.h.
+
+/// Token-level model: an EmbeddingLayer, a SinPositionalEncoding, a stack of
+/// TransformerBlocks and a SOFTMAX projection onto the vocabulary.
+class Transformer : public GradientModel {
+private:
+    int vocab_size;
+    int embed_dim;
+    int max_seq_len;
+
+    EmbeddingLayer embedding;
+    SinPositionalEncoding pos_encoding;
+    std::vector<std::shared_ptr<TransformerBlock>> blocks;
+    NeuralNetworkLayer output_projection;
+
+    std::vector<int> last_token_input;  // tokens of the latest forward()
+    Matrix last_logits;  // projection output of the latest forward()
+
+public:
+    Transformer(int vocab_size, int embed_dim, int num_heads,
+                int num_layers, int ff_dim, int max_seq_len,
+                std::unique_ptr<LossFunction> loss,
+                std::unique_ptr<Optimizer> opt,
+                std::unique_ptr<Regularizer> reg);
+
+    /// Casts column 0 of each row of X to a token id and runs the token overload.
+    Matrix forward(const Matrix& X) override;
+    /// Runs the full model on tokens and returns the output projection's result.
+    Matrix forward(const std::vector<int>& tokens);
+    /// Back-propagates the loss gradient against y_true from the latest forward().
+    void backward(const Matrix& y_true) override;
+    /// Applies the optimizer to the embedding, every block and the projection.
+    void update() override;
+};
diff --git a/source/models/transformer.cpp b/source/models/transformer.cpp
new file mode 100644
index 0000000..01657d4
--- /dev/null
+++ b/source/models/transformer.cpp
@@ -0,0 +1,84 @@
+#include "ml_lib/models/transformer.h"
+
+// Builds num_layers TransformerBlocks with identical dimensions; the output
+// projection maps embed_dim -> vocab_size with a SOFTMAX activation.
+Transformer::Transformer(int vocab_size, int embed_dim, int num_heads,
+                         int num_layers, int ff_dim, int max_seq_len,
+                         std::unique_ptr<LossFunction> loss,
+                         std::unique_ptr<Optimizer> opt,
+                         std::unique_ptr<Regularizer> reg)
+    : GradientModel(std::move(loss), std::move(opt), std::move(reg)),
+      vocab_size(vocab_size),
+      embed_dim(embed_dim),
+      max_seq_len(max_seq_len),
+      embedding(vocab_size, embed_dim),
+      pos_encoding(embed_dim, max_seq_len),
+      output_projection(embed_dim, vocab_size, ACTIVATION_FUNC::SOFTMAX)
+{
+    // reserve() only for a positive count; the loop adds nothing otherwise.
+    if (num_layers > 0) {
+        blocks.reserve(num_layers);
+    }
+    for (int i = 0; i < num_layers; i++) {
+        blocks.push_back(std::make_shared<TransformerBlock>(embed_dim, num_heads, ff_dim));
+    }
+}
+
+// Embeds the tokens, applies the positional encoding, runs every block in
+// order and projects the result; the projection output is stored in
+// last_logits and last_output.
+Matrix Transformer::forward(const std::vector<int>& tokens)
+{
+    last_token_input = tokens;
+
+    Matrix embedded = embedding.forward(tokens);
+    Matrix x = pos_encoding.forward(embedded);
+
+    for (auto& block : blocks) {
+        x = block->forward(x);
+    }
+
+    last_logits = output_projection.forward(x);
+    last_output = last_logits;
+
+    return last_logits;
+}
+
+// Treats X as one token per row, stored in column 0.
+Matrix Transformer::forward(const Matrix& X)
+{
+    std::vector<int> tokens;
+    tokens.reserve(X.rows());
+    for (int i = 0; i < X.rows(); i++) {
+        tokens.push_back(static_cast<int>(X(i, 0)));
+    }
+    return forward(tokens);
+}
+
+// Starts from the loss gradient at last_logits and walks the layers in the
+// reverse of forward() order. pos_encoding has no backward call here.
+// NOTE(review): that is only right if the encoding is a parameter-free
+// additive term - confirm against sin-pos-encode.h.
+void Transformer::backward(const Matrix& y_true)
+{
+    Matrix grad = loss_func->gradient(last_logits, y_true);
+
+    grad = output_projection.backward(grad);
+
+    // Reverse iterators avoid narrowing blocks.size() into a signed index.
+    for (auto it = blocks.rbegin(); it != blocks.rend(); ++it) {
+        grad = (*it)->backward(grad);
+    }
+
+    embedding.backward(grad);
+}
+
+// Steps the embedding, then each block, then the output projection.
+void Transformer::update()
+{
+    embedding.update(optimizer.get());
+    for (auto& block : blocks) {
+        block->update(optimizer.get());
+    }
+    output_projection.update(optimizer.get());
+}