diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..601adcf
Binary files /dev/null and b/.DS_Store differ
diff --git a/AIUsage.md b/AIUsage.md
index 918539f..4932e78 100644
--- a/AIUsage.md
+++ b/AIUsage.md
@@ -3,6 +3,7 @@
 ## Source Code
 
  - Notebook LM was used to understand technical articles relevant to the creation of code
+ - CodeRabbit AI was used for PR review
 
 ### Tokenizer
 
diff --git a/include/block.hpp b/include/block.hpp
index 9bace6b..5c437fc 100644
--- a/include/block.hpp
+++ b/include/block.hpp
@@ -3,6 +3,8 @@
 
 #include "neural_network.hpp"
 #include "self_attention.hpp"
+#include "utility.hpp"
+#include <vector>
 
 namespace openchat {
     class block {
@@ -41,14 +43,24 @@ namespace openchat {
 
             utility::matrix feedForward(utility::matrix x) {
                 x = attention.attention(x);
-                for (int i = 0; i < x.rows; i++) {
-                    std::vector<float> output = network.feedForward(std::vector<float>(x[i], x[i] + this->n_embd));
-                    std::copy(output.begin(), output.end(), x[i]);
-                }
+                x = network.feedForward(x); // the network now takes the whole matrix, replacing the removed per-row loop
 
                 return x;
             }
 
+            std::pair<utility::matrix, std::pair<std::vector<std::pair<utility::matrix, utility::matrix>>, std::vector<utility::matrix>>> backward(utility::matrix dZ) {
+                // Reverse of feedForward(): the feed-forward network is undone first, then self-attention.
+                auto ffn = network.backward(dZ);
+                auto attn = attention.backward(ffn.first);
+                // Parameter gradients travel together: .first = the network's per-layer pairs, .second = attention's list.
+                std::pair<std::vector<std::pair<utility::matrix, utility::matrix>>, std::vector<utility::matrix>> grads;
+                grads.first = ffn.second;
+                grads.second = attn.second;
+                // Result: {gradient w.r.t. the block's input, bundled parameter gradients}.
+                return {attn.first, grads};
+            }
+            
+
             void changeOne(char mat, size_t row, size_t col, float d) {
                 this->attention.changeOne(mat, row, col, d);
             }
diff --git a/include/embedder.hpp b/include/embedder.hpp
index cd46ff7..60dc182 100644
--- a/include/embedder.hpp
+++ b/include/embedder.hpp
@@ -3,6 +3,7 @@
 
 #include <cstddef>
 #include <filesystem>
+#include <forward_list>
 #include <random>
 #include <fstream>
 #include "utility.hpp"
@@ -14,6 +15,8 @@ namespace openchat {
             size_t n_tok;
             size_t n_embd;
 
+            std::forward_list<int> toks;
+
             std::default_random_engine generator;
             std::normal_distribution<float> initDist;
 
@@ -88,10 +91,34 @@ namespace openchat {
                 return vec;
             }
 
+            utility::matrix embed(std::forward_list<int> toks) {
+                // Remember the sequence: backward() walks it to decide which table rows receive an update.
+                this->toks = toks;
+                utility::matrix out(std::distance(toks.begin(), toks.end()), this->n_embd);
+                int row = 0;
+                for (auto it = toks.begin(); it != toks.end(); ++it, ++row) {
+                    // Look up the single-token embedding and copy it into this token's row of the result.
+                    std::vector<float> emb = this->embed(*it);
+                    std::copy(emb.begin(), emb.end(), out[row]);
+                }
+                return out;
+            }
+
             utility::matrix * getTable() {
                 return &this->table;
             }
 
+            void backward(utility::matrix dZ, float lr) {
+                // SGD step: row i of dZ, scaled by lr, is subtracted from the table row of the i-th token cached by embed().
+                auto tok_it = this->toks.begin();
+                for (size_t i = 0; i < dZ.rows && tok_it != this->toks.end(); ++i, ++tok_it) { // never step past the cached tokens
+                    const int tok = *tok_it;
+                    for (size_t j = 0; j < dZ.cols; j++) {
+                        this->table[tok][j] -= dZ[i][j] * lr;
+                    }
+                }
+            }
+
             embedder() {}
     };
 }
diff --git a/include/layer.hpp b/include/layer.hpp
index 68e610f..d9704dd 100644
--- a/include/layer.hpp
+++ b/include/layer.hpp
@@ -11,6 +11,10 @@ namespace openchat {
         private:
             utility::matrix weights;
             utility::matrix biases;
+
+            utility::matrix X;
+            utility::matrix Z;
+
             std::default_random_engine generator;
             std::normal_distribution<float> initDist;
 
@@ -71,14 +75,36 @@ namespace openchat {
                 this->readFromFile(input);
             }
 
-            std::vector<float> feedForward(std::vector<float> input) {
-                utility::matrix i (1, input.size());
-                i.data = input;
-                std::vector<float> output = utility::add(utility::dot(i, this->weights), this->biases).data;
-                for (size_t i = 0; i < output.size(); i++) {
-                    output[i] = utility::relu(output[i]);
+            utility::matrix feedForward(utility::matrix x) { // forward pass: relu(x * weights + biases) for every row of x
+                this->X = x; // kept for backward(), which multiplies its transpose with the incoming gradient
+                utility::matrix z = utility::dot(x, this->weights);
+                this->Z = z; // copied here, i.e. before the bias is added and before relu runs (assuming matrix copies by value)
+                for (size_t i = 0; i < z.rows; i++) {
+                    for (size_t j = 0; j < z.cols; j++) {
+                        z[i][j] += this->biases[0][j]; // bias row 0 is shared by all rows of z
+                    }
+                }
+                for (size_t i = 0; i < z.rows; i++) for (size_t j = 0; j < z.cols; j++) z[i][j] = utility::relu(z[i][j]); // element-wise activation
+                return z;
+            }
+
+            std::pair<utility::matrix, std::pair<utility::matrix, utility::matrix>> backward(utility::matrix dZ) { // dZ: gradient w.r.t. this layer's output
+                int M = dZ.rows;
+                int N = dZ.cols;
+                // feedForward() ends in relu, so gradient only flows where the pre-activation (cached Z + bias) was positive; assumes relu(x) = max(0, x).
+                const bool cached = this->Z.rows == dZ.rows && this->Z.cols == dZ.cols; // skip the mask if no matching forward pass was cached
+                for (int i = 0; cached && i < M; ++i) for (int j = 0; j < N; ++j) if (this->Z[i][j] + this->biases[0][j] <= 0.0f) dZ[i][j] = 0.0f;
+                utility::matrix dW = utility::dot(utility::transpose(this->X), dZ); // weight gradient: X^T * dZ
+                utility::matrix dX = utility::dot(dZ, utility::transpose(this->weights)); // gradient handed to whatever produced X
+                utility::matrix db(1, N); // bias gradient: column sums of dZ, accumulated below
+                std::fill(db.data.begin(), db.data.end(), 0.0f);
+                for (int i = 0; i < M; ++i) {
+                    for (int j = 0; j < N; ++j) {
+                        db.data[j] += dZ.data[i * N + j];
+                    }
                 }
-                return output;
+                // Returns {dX, {dW, db}}: the input gradient first, then this layer's weight and bias gradients.
+                return {dX, {dW, db}};
             }
 
             void changeOne(float d, size_t n_in, size_t n_out) {
diff --git a/include/model.hpp b/include/model.hpp
index 7a52eac..062e518 100644
--- a/include/model.hpp
+++ b/include/model.hpp
@@ -1,7 +1,9 @@
 #ifndef MODEL_HPP
 #define MODEL_HPP
 
+#include <cstddef>
 #include <forward_list>
+#include <stdexcept>
 #include <vector>
 #include <filesystem>
 #include <utility>
@@ -17,6 +19,7 @@ namespace openchat {
             tokenizer tokenizer;
             embedder embedder;
             std::vector<block> blocks;
+            float learning_rate = 0.01;
             
         public:
             void init() {
@@ -33,25 +36,24 @@ namespace openchat {
                 }
             }
 
-            model(class tokenizer &tokenizer, class embedder &embedder, std::vector<class block> &blocks) {
+            model(class tokenizer &tokenizer, class embedder &embedder, std::vector<class block> &blocks, float learning_rate = 0.01) {
                 this->tokenizer = tokenizer;
                 this->embedder = embedder;
                 this->blocks = blocks;
 
+                this->learning_rate = learning_rate; // step size that train() multiplies into every gradient
+
                 this->init();
             }
 
-            std::string forwardPass(std::string input) {
-                std::forward_list<int> tokens = tokenizer.encode(input);
+            void changeLearningRate(float n) { // replaces the step size used by later train() calls
+                this->learning_rate = n;
+            }
+
+            utility::matrix forwardPass(std::forward_list<int> tokens) {
                 utility::matrix unembed = utility::transpose(*embedder.getTable());
 
-                utility::matrix x = utility::matrix(std::distance(tokens.begin(), tokens.end()), embedder.getNEmbd());
-                int i = 0;
-                for (int token : tokens) {
-                  std::vector<float> emb = embedder.embed(token);
-                  std::copy(emb.begin(), emb.end(), x[i]);
-                  i++;
-                }
+                utility::matrix x = embedder.embed(tokens);
                 
                 x = positionalEncoding(x).apply();
 
@@ -64,17 +66,70 @@ namespace openchat {
 
                 dist = utility::softmax(utility::dot(dist, unembed));
 
-                float max = dist[0][0];
-                int token = 0;
-                for (size_t j = 1; j < dist.cols; j++) {
-                    if (dist[0][j] > max) {
-                        max = dist[0][j];
-                        token = static_cast<int>(j);
+                return dist;
+            }
+
+            void train(std::string input, size_t epochs = 100) {
+                std::forward_list<int> corpus = tokenizer.encode(input);
+                auto length = std::distance(corpus.begin(), corpus.end());
+                // Every step needs a non-empty prefix plus a label token, so the corpus must be strictly longer than epochs.
+                if (epochs > 0 && length <= static_cast<long long>(epochs)) throw std::invalid_argument("Give a longer input!");
+                size_t start = length - epochs;
+                // Step i feeds tokens [0, start + i) and treats the token at index start + i as the label.
+                for (size_t i = 0; i < epochs; i++) {
+                    std::forward_list<int> tokens(corpus.begin(), std::next(corpus.begin(), i + start));
+                    int next = *std::next(corpus.begin(), i + start);
+                    // Forward pass, then a one-hot target placed on row 0 of a matrix shaped like the output.
+                    utility::matrix dist = forwardPass(tokens);
+                    utility::matrix oneHot = utility::matrix(dist.rows, dist.cols);
+                    oneHot[0][next] = 1;
+                    float loss = -1 * std::log(dist[0][next]); // cross-entropy of the label; computed but not used further
+                    // (dist - oneHot) is the softmax + cross-entropy gradient at the logits; forwardPass forms the logits as h * table^T, so multiplying by the table returns it to embedding width.
+                    utility::matrix dZ = utility::dot(utility::subtract(dist, oneHot), *embedder.getTable());
+                    // NOTE(review): only row 0 of oneHot is set; if forwardPass returns more than one row the remaining targets are all zero - confirm the intended shape.
+                    std::vector<std::pair<std::vector<std::pair<utility::matrix, utility::matrix>>, std::vector<utility::matrix>>> bdW;
+                    utility::matrix edW; // unused
+                    // Backward through the blocks, last block first; bdW[j] belongs to the j-th block counted from the end.
+                    for (auto it = blocks.rbegin(); it != blocks.rend(); ++it) {
+                        std::pair<utility::matrix, std::pair<std::vector<std::pair<utility::matrix, utility::matrix>>, std::vector<utility::matrix>>> p = it->backward(dZ);
+                        dZ = p.first;
+                        bdW.push_back(p.second);
                     }
-                }
+                    // What is left in dZ is the gradient at the block stack's input; the embedder applies its own update.
+                    embedder.backward(dZ, this->learning_rate);
+                    // Apply the stored block gradients, visiting the blocks in the same reverse order used above.
+                    int j = 0;
+                    for (auto it = this->blocks.rbegin(); it != this->blocks.rend(); ++it) {
+                        block &b = *it;
+                        std::vector<std::pair<utility::matrix, utility::matrix>> ndW = bdW[j].first;
+                        std::vector<utility::matrix> adW = bdW[j].second;
+                        // Each ndW entry is {dW, db} (layer::backward), stored last-layer-first (neuralNetwork::backward).
+                        for (size_t g = 0; g < ndW.size(); g++) {
+                            const size_t layer = ndW.size() - 1 - g; // forward index of the layer this gradient belongs to
+                            for (int k = 0; k < ndW[g].second.cols; k++) // db has a single row: bias update
+                                b.changeOne(layer, ndW[g].second[0][k] * this->learning_rate, k);
+                            for (int k = 0; k < ndW[g].first.rows; k++) // dW = X^T * dZ: weight update
+                                for (int l = 0; l < ndW[g].first.cols; l++)
+                                    b.changeOne(layer, ndW[g].first[k][l] * this->learning_rate, k, l);
+                        }
+                        // adW is attention's gradient list in the order {dWq, dWk, dWv}.
+                        for (int k = 0; k < adW[0].rows; k++)
+                            for (int l = 0; l < adW[0].cols; l++)
+                                b.changeOne('q', k, l, adW[0][k][l] * this->learning_rate);
 
-                return tokenizer.decode({token});
+                        for (int k = 0; k < adW[1].rows; k++)
+                            for (int l = 0; l < adW[1].cols; l++)
+                                b.changeOne('k', k, l, adW[1][k][l] * this->learning_rate);
+
+                        for (int k = 0; k < adW[2].rows; k++)
+                            for (int l = 0; l < adW[2].cols; l++)
+                                b.changeOne('v', k, l, adW[2][k][l] * this->learning_rate);
+
+                        j++;
+                    }
+                }
             }
+            
     };
 }
 
diff --git a/include/neural_network.hpp b/include/neural_network.hpp
index b677c21..75a222b 100644
--- a/include/neural_network.hpp
+++ b/include/neural_network.hpp
@@ -6,6 +6,7 @@
 #include <limits>
 #include <vector>
 #include "layer.hpp"
+#include "utility.hpp"
 
 namespace openchat {
     class neuralNetwork {
@@ -13,13 +14,8 @@ namespace openchat {
             std::vector<layer> network;
             std::vector<size_t> dimensions;
 
-            std::vector<float> gain;
-            std::vector<float> bias;
-
         public:
             void init() {
-              this->gain = std::vector<float>(this->dimensions[(this->dimensions.size() - 1)], 1.0f);
-              this->bias = std::vector<float>(this->dimensions[(this->dimensions.size() - 1)], 0.0f);
               for (size_t i = 0; i < dimensions.size() - 1; i++) {
                 network.push_back(layer(dimensions[i], dimensions[i + 1]));
               }
@@ -30,8 +26,6 @@ namespace openchat {
 
                 if (inFile.is_open()) {
                     inFile.read(reinterpret_cast<char *>(this->dimensions.data()), sizeof(size_t) * this->dimensions.size());
-                    inFile.read(reinterpret_cast<char *>(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
-                    inFile.read(reinterpret_cast<char *>(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
                 }
 
                 for (size_t i = 0; i < input.second.size(); i++) {
@@ -44,8 +38,6 @@ namespace openchat {
 
                 if (outFile.is_open()) {
                     outFile.write(reinterpret_cast<char *>(this->dimensions.data()), sizeof(size_t) * this->dimensions.size());
-                    outFile.write(reinterpret_cast<char *>(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
-                    outFile.write(reinterpret_cast<char *>(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
                 }
 
                 for (size_t i = 0; i < output.second.size(); i++) {
@@ -63,7 +55,9 @@ namespace openchat {
                 this->readFromFile(input);
             }
 
-            void layerNorm(std::vector<float> &input) {
+            void layerNorm(std::vector<float> &x, size_t start, size_t end) {
+                std::vector<float> input(x.begin() + start, x.begin() + end);
+
                 if (input.empty()) return;
 
                 float sum = 0;
@@ -75,17 +69,34 @@ namespace openchat {
                 float variance = sum / input.size();
 
                 for (int i = 0; i < input.size(); i++) {
-                    input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits<float>::epsilon())) * this->gain[i] + this->bias[i];
+                    input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits<float>::epsilon()));
+                }
+
+                for (int i = 0; i < input.size(); i++) {
+                    x[start+i] = input[i];
                 }
             }
 
-            std::vector<float> feedForward(std::vector<float> input) {
+            utility::matrix feedForward(utility::matrix x) { // runs x through every layer in order, then normalises each row
                 for (layer &l : network) {
-                    input = l.feedForward(input);
+                    x = l.feedForward(x);
                 }
 
-                this->layerNorm(input);
-                return input;
+                for (int i = 0; i < x.rows; i++) { // one layerNorm call per row, addressed as a slice of the flat data vector
+                    this->layerNorm(x.data, i*x.cols, i*x.cols+x.cols); // NOTE(review): backward() in this class only revisits the layers; this normalisation has no counterpart there
+                }
+                return x;
+            }
+
+            std::pair<utility::matrix, std::vector<std::pair<utility::matrix, utility::matrix>>> backward(utility::matrix dZ) {
+                // Walk the layers last-to-first, so the collected gradients end up in reverse layer order.
+                std::vector<std::pair<utility::matrix, utility::matrix>> grads;
+                for (size_t idx = this->network.size(); idx-- > 0; ) {
+                    auto step = this->network[idx].backward(dZ);
+                    dZ = step.first;
+                    grads.push_back(step.second);
+                }
+                return {dZ, grads};
             }
 
             void changeOne(size_t layer, float d, size_t n_in, size_t n_out) {
@@ -95,11 +106,6 @@ namespace openchat {
             void changeOne(size_t layer, float d, size_t n_in) {
                 this->network[layer].changeOne(d, n_in);
             }
-            
-            void changeOne(int x, float d, int pos) {
-                if (x == 0) gain[pos] -= d;
-                if (x == 1) bias[pos] -= d;
-            }
 
             neuralNetwork() {}
     };
diff --git a/include/self_attention.hpp b/include/self_attention.hpp
index 9a8f0e8..1eb8db1 100644
--- a/include/self_attention.hpp
+++ b/include/self_attention.hpp
@@ -5,6 +5,9 @@
 #include <filesystem>
 #include <fstream>
 #include <random>
+#include <cmath>
+#include <vector>
+#include <utility>
 
 namespace openchat {
     class selfAttention {
@@ -16,6 +19,8 @@ namespace openchat {
             utility::matrix q;
             utility::matrix k;
             utility::matrix v;
+            utility::matrix p;
+            utility::matrix x;
 
             size_t n_embd;
 
@@ -24,7 +29,8 @@ namespace openchat {
 
           public:
             void init() {
-                initDist = std::normal_distribution<float>(0, this->n_embd);
+                float stddev = 1.0f / std::sqrt(static_cast<float>(this->n_embd));
+                initDist = std::normal_distribution<float>(0.0f, stddev);
 
                 for (size_t i = 0; i < this->n_embd; i++) {
                     for (size_t j = 0; j < this->n_embd; j++) {
@@ -78,23 +84,68 @@ namespace openchat {
             }
 
             utility::matrix attention(utility::matrix x) {
+                this->x = x; // input kept for backward(), which needs it for the weight gradients
                 this->q = utility::dot(x, this->wq);
                 this->k = utility::dot(x, this->wk);
                 this->v = utility::dot(x, this->wv);
-                
-                return utility::dot(
-                    utility::softmax(
-                        utility::scalar_div(
-                            utility::dot(this->q, 
-                            utility::transpose(this->k)), 
-                        std::sqrt(n_embd))), 
-                    this->v); 
+                // p = softmax(q * k^T / sqrt(n_embd)); stored as a member so backward() can reuse it.
+                this->p = utility::softmax(utility::scalar_div(
+                    utility::dot(this->q, utility::transpose(this->k)),
+                    std::sqrt(static_cast<float>(n_embd))));
+                // The output is the stored weights applied to the value projection.
+                return utility::dot(this->p, this->v);
             }
 
             size_t getNEmbed() {
                 return this->n_embd;
             }
 
+            std::pair<utility::matrix, std::vector<utility::matrix>> backward(utility::matrix dZ) { // dZ: gradient w.r.t. the output of attention()
+                utility::matrix dV = utility::dot(utility::transpose(this->p), dZ); // output = p * v, so dV = p^T * dZ
+                utility::matrix dP = utility::dot(dZ, utility::transpose(this->v)); // ... and dP = dZ * v^T
+                // dS will hold the gradient w.r.t. the scores that were fed to softmax; it is indexed as an M x M matrix.
+                int M = dP.rows; 
+                utility::matrix dS(M, M);
+                // Row by row: dS = p * (dP - sum_k dP*p). NOTE(review): this matches a softmax taken independently per row - confirm utility::softmax is row-wise.
+                for (int i = 0; i < M; ++i) {
+                    float sum_dP_P = 0.0f;
+                    // Dot product of row i of dP with row i of p (both read from the flat data arrays).
+                    for (int k = 0; k < M; ++k) {
+                        sum_dP_P += dP.data[i * M + k] * this->p.data[i * M + k];
+                    }
+                    // Every entry of the row shares that sum.
+                    for (int j = 0; j < M; ++j) {
+                        int idx = i * M + j;
+                        dS.data[idx] = this->p.data[idx] * (dP.data[idx] - sum_dP_P);
+                    }
+                }
+                // attention() divides the scores by sqrt(n_embd); here the same kind of factor is taken from q's column count.
+                int K = this->q.cols; 
+                float scale = 1.0f / std::sqrt(static_cast<float>(K));
+                for (int i = 0; i < M * M; ++i) {
+                    dS.data[i] *= scale;
+                }
+                // scores = q * k^T, so dQ = dS * k and dK = dS^T * q.
+                utility::matrix dQ = utility::dot(dS, this->k);
+                utility::matrix dK = utility::dot(utility::transpose(dS), this->q);
+                // q, k and v are each x times a weight matrix: weight gradient = x^T * (gradient of that product).
+                utility::matrix dWq = utility::dot(utility::transpose(this->x), dQ);
+                utility::matrix dWk = utility::dot(utility::transpose(this->x), dK);
+                utility::matrix dWv = utility::dot(utility::transpose(this->x), dV);
+                // x feeds all three projections, so its gradient is the sum of the three paths.
+                utility::matrix dX_q = utility::dot(dQ, utility::transpose(this->wq));
+                utility::matrix dX_k = utility::dot(dK, utility::transpose(this->wk));
+                utility::matrix dX_v = utility::dot(dV, utility::transpose(this->wv));
+                // Element-wise sum over the flat data arrays.
+                utility::matrix dX(dX_q.rows, dX_q.cols);
+                for (int i = 0; i < dX.rows * dX.cols; ++i) {
+                    dX.data[i] = dX_q.data[i] + dX_k.data[i] + dX_v.data[i];
+                }
+                // Weight gradients are returned in the order {dWq, dWk, dWv}, after the input gradient.
+                std::vector<utility::matrix> weight_gradients = {dWq, dWk, dWv};
+                return {dX, weight_gradients};
+            }
+
             void changeOne(char mat, size_t row, size_t col, float d) {
                 if (mat == 'q') wq[row][col] -= d;
                 else if (mat == 'k') wk[row][col] -= d;
@@ -105,4 +156,4 @@ namespace openchat {
     };
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/utility.hpp b/include/utility.hpp
index 686f2aa..d829931 100644
--- a/include/utility.hpp
+++ b/include/utility.hpp
@@ -49,6 +49,22 @@ namespace openchat {
 
             return c;
         }
+
+        inline matrix subtract(const matrix& a, const matrix& b) {
+            // Element-wise a - b; throws std::invalid_argument unless both operands have the same shape.
+            if (a.cols != b.cols) {
+                throw std::invalid_argument("Columns don't match!");
+            }
+            if (a.rows != b.rows) {
+                throw std::invalid_argument("Rows don't match!");
+            }
+            matrix diff(a.rows, b.cols);
+            for (size_t r = 0; r < a.rows; ++r)
+                for (size_t col = 0; col < a.cols; ++col)
+                    diff[r][col] = a[r][col] - b[r][col];
+            // diff now holds a[r][col] - b[r][col] at every position.
+            return diff;
+        }
         
         inline matrix dot(const matrix& a, const matrix& b) {
             if (a.cols != b.rows) throw std::invalid_argument("Inner dimensions don't match!");