From 5e81b426d3a380b4ceb6d26171cac24ce564661b Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sat, 20 Jun 2026 19:09:47 -0500 Subject: [PATCH 1/8] added gradient finding, need to add the weight changing next. --- include/block.hpp | 20 ++++++++--- include/embedder.hpp | 24 +++++++++++++ include/layer.hpp | 23 +++++++----- include/model.hpp | 61 ++++++++++++++++++++++---------- include/neural_network.hpp | 46 +++++++++++++----------- include/self_attention.hpp | 71 ++++++++++++++++++++++++++++++++------ include/utility.hpp | 16 +++++++++ 7 files changed, 201 insertions(+), 60 deletions(-) diff --git a/include/block.hpp b/include/block.hpp index 9bace6b..7c32beb 100644 --- a/include/block.hpp +++ b/include/block.hpp @@ -3,6 +3,8 @@ #include "neural_network.hpp" #include "self_attention.hpp" +#include "utility.hpp" +#include namespace openchat { class block { @@ -41,14 +43,24 @@ namespace openchat { utility::matrix feedForward(utility::matrix x) { x = attention.attention(x); - for (int i = 0; i < x.rows; i++) { - std::vector output = network.feedForward(std::vector(x[i], x[i] + this->n_embd)); - std::copy(output.begin(), output.end(), x[i]); - } + x = network.feedForward(x); return x; } + std::pair, std::vector>> backward(utility::matrix dZ) { + std::pair> net_pass = network.backward(dZ); + + std::pair> attn_pass = attention.backward(net_pass.first); + + std::pair, std::vector> dW; + dW.first = net_pass.second; + dW.second = attn_pass.second; + + return {attn_pass.first, dW}; + } + + void changeOne(char mat, size_t row, size_t col, float d) { this->attention.changeOne(mat, row, col, d); } diff --git a/include/embedder.hpp b/include/embedder.hpp index cd46ff7..84617b1 100644 --- a/include/embedder.hpp +++ b/include/embedder.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include "utility.hpp" @@ -14,6 +15,8 @@ namespace openchat { size_t n_tok; size_t n_embd; + std::forward_list toks; + std::default_random_engine generator; std::normal_distribution initDist; @@ -88,10 +91,31 @@ namespace openchat { return vec; } + utility::matrix embed(std::forward_list toks) { + this->toks = toks; + utility::matrix vec(std::distance(toks.begin(), toks.end()), this->n_embd); + int i = 0; + for (int tok : toks) { + std::vector emb = this->embed(tok); + std::copy(emb.begin(), emb.end(), vec[i]); + i++; + } + + return vec; + } + utility::matrix * getTable() { return &this->table; } + void backward(utility::matrix dZ, float lr) { + for (int i = 0; i < dZ.rows; i++) { + for (int j = 0; j < dZ.cols; j++) { + this->table[*std::next(this->toks.begin(), i)][j] += dZ[i][j] * lr; + } + } + } + embedder() {} }; } diff --git a/include/layer.hpp b/include/layer.hpp index 68e610f..85beb4d 100644 --- a/include/layer.hpp +++ b/include/layer.hpp @@ -11,6 +11,9 @@ namespace openchat { private: utility::matrix weights; utility::matrix biases; + + utility::matrix X; + std::default_random_engine generator; std::normal_distribution initDist; @@ -71,14 +74,18 @@ namespace openchat { this->readFromFile(input); } - std::vector feedForward(std::vector input) { - utility::matrix i (1, input.size()); - i.data = input; - std::vector output = utility::add(utility::dot(i, this->weights), this->biases).data; - for (size_t i = 0; i < output.size(); i++) { - output[i] = utility::relu(output[i]); - } - return output; + utility::matrix feedForward(utility::matrix x) { + this->X = x; + utility::matrix z = utility::add(utility::dot(x, this->weights), this->biases); + for (size_t i = 0; i < z.rows; i++) for (size_t j = 0; j < z.cols; j++) z[i][j] = utility::relu(z[i][j]); + return z; + } + + std::pair backward(utility::matrix dZ) { + utility::matrix dW = utility::dot(utility::transpose(X), dZ); + utility::matrix dX = utility::dot(dZ, utility::transpose(this->weights)); + + return {dX, dW}; } void changeOne(float d, size_t n_in, size_t n_out) { diff --git a/include/model.hpp b/include/model.hpp index 7a52eac..af1fecf 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -1,7 +1,9 @@ #ifndef MODEL_HPP #define MODEL_HPP +#include #include +#include #include #include #include @@ -17,6 +19,7 @@ namespace openchat { tokenizer tokenizer; embedder embedder; std::vector blocks; + float learning_rate = 0.01; public: void init() { @@ -33,25 +36,24 @@ namespace openchat { } } - model(class tokenizer &tokenizer, class embedder &embedder, std::vector &blocks) { + model(class tokenizer &tokenizer, class embedder &embedder, std::vector &blocks, float learning_rate = 0.01) { this->tokenizer = tokenizer; this->embedder = embedder; this->blocks = blocks; + this->learning_rate = learning_rate; + this->init(); } - std::string forwardPass(std::string input) { - std::forward_list tokens = tokenizer.encode(input); + void changeLearningRate(float n) { + this->learning_rate = n; + } + + utility::matrix forwardPass(std::forward_list tokens) { utility::matrix unembed = utility::transpose(*embedder.getTable()); - utility::matrix x = utility::matrix(std::distance(tokens.begin(), tokens.end()), embedder.getNEmbd()); - int i = 0; - for (int token : tokens) { - std::vector emb = embedder.embed(token); - std::copy(emb.begin(), emb.end(), x[i]); - i++; - } + utility::matrix x = embedder.embed(tokens); x = positionalEncoding(x).apply(); @@ -64,17 +66,40 @@ namespace openchat { dist = utility::softmax(utility::dot(dist, unembed)); - float max = dist[0][0]; - int token = 0; - for (size_t j = 1; j < dist.cols; j++) { - if (dist[0][j] > max) { - max = dist[0][j]; - token = static_cast(j); + return dist; + } + + void backward(std::string input, size_t epochs = 100) { + std::forward_list corpus = tokenizer.encode(input); + auto length = std::distance(corpus.begin(), corpus.end()); + + if (length < static_cast(epochs)) throw std::invalid_argument("Give a longer input!"); + size_t start = length - epochs; + + for (size_t i = 0; i < epochs; i++) { + std::forward_list tokens(corpus.begin(), std::next(corpus.begin(), i + start)); + int next = *std::next(corpus.begin(), i + start); + + utility::matrix dist = forwardPass(tokens); + utility::matrix oneHot = utility::matrix(dist.rows, 1); + oneHot[next][0] = 1; + float loss = -1 * std::log(dist[next][0]); + + utility::matrix dZ = utility::subtract(dist, oneHot); + + std::vector, std::vector>> bdW; + utility::matrix edW; + + for (auto it = blocks.rbegin(); it != blocks.rend(); ++it) { + std::pair, std::vector>> p = it->backward(dZ); + dZ = p.first; + bdW.push_back(p.second); } + + embedder.backward(dZ, this->learning_rate); } - - return tokenizer.decode({token}); } + }; } diff --git a/include/neural_network.hpp b/include/neural_network.hpp index b677c21..d223f8d 100644 --- a/include/neural_network.hpp +++ b/include/neural_network.hpp @@ -6,6 +6,7 @@ #include #include #include "layer.hpp" +#include "utility.hpp" namespace openchat { class neuralNetwork { @@ -13,13 +14,8 @@ namespace openchat { std::vector network; std::vector dimensions; - std::vector gain; - std::vector bias; - public: void init() { - this->gain = std::vector(this->dimensions[(this->dimensions.size() - 1)], 1.0f); - this->bias = std::vector(this->dimensions[(this->dimensions.size() - 1)], 0.0f); for (size_t i = 0; i < dimensions.size() - 1; i++) { network.push_back(layer(dimensions[i], dimensions[i + 1])); } @@ -30,8 +26,6 @@ namespace openchat { if (inFile.is_open()) { inFile.read(reinterpret_cast(this->dimensions.data()), sizeof(size_t) * this->dimensions.size()); - inFile.read(reinterpret_cast(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); - inFile.read(reinterpret_cast(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); } for (size_t i = 0; i < input.second.size(); i++) { @@ -44,8 +38,6 @@ namespace openchat { if (outFile.is_open()) { outFile.write(reinterpret_cast(this->dimensions.data()), sizeof(size_t) * this->dimensions.size()); - outFile.write(reinterpret_cast(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); - outFile.write(reinterpret_cast(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); } for (size_t i = 0; i < output.second.size(); i++) { @@ -63,7 +55,9 @@ namespace openchat { this->readFromFile(input); } - void layerNorm(std::vector &input) { + void layerNorm(std::vector &x, size_t start, size_t end) { + std::vector input(x[start], x[end]); + if (input.empty()) return; float sum = 0; @@ -75,17 +69,34 @@ namespace openchat { float variance = sum / input.size(); for (int i = 0; i < input.size(); i++) { - input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits::epsilon())) * this->gain[i] + this->bias[i]; + input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits::epsilon())); + } + + for (int i = 0; i < input.size(); i++) { + x[start+i] = input[i]; } } - std::vector feedForward(std::vector input) { + utility::matrix feedForward(utility::matrix x) { for (layer &l : network) { - input = l.feedForward(input); + x = l.feedForward(x); } - this->layerNorm(input); - return input; + for (int i = 0; i < x.rows; i++) { + this->layerNorm(x.data, i*x.cols, i*x.cols+x.cols); + } + return x; + } + + std::pair> backward(utility::matrix dZ) { + std::pair> ndW; + for (auto it = this->network.rbegin(); it != this->network.rend(); ++it) { + std::pair p = it->backward(dZ); + dZ = p.first; + ndW.second.push_back(p.second); + } + ndW.first = dZ; + return ndW; } void changeOne(size_t layer, float d, size_t n_in, size_t n_out) { @@ -95,11 +106,6 @@ namespace openchat { void changeOne(size_t layer, float d, size_t n_in) { this->network[layer].changeOne(d, n_in); } - - void changeOne(int x, float d, int pos) { - if (x == 0) gain[pos] -= d; - if (x == 1) bias[pos] -= d; - } neuralNetwork() {} }; diff --git a/include/self_attention.hpp b/include/self_attention.hpp index 9a8f0e8..1eb8db1 100644 --- a/include/self_attention.hpp +++ b/include/self_attention.hpp @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include namespace openchat { class selfAttention { @@ -16,6 +19,8 @@ namespace openchat { utility::matrix q; utility::matrix k; utility::matrix v; + utility::matrix p; + utility::matrix x; size_t n_embd; @@ -24,7 +29,8 @@ namespace openchat { public: void init() { - initDist = std::normal_distribution(0, this->n_embd); + float stddev = 1.0f / std::sqrt(static_cast(this->n_embd)); + initDist = std::normal_distribution(0.0f, stddev); for (size_t i = 0; i < this->n_embd; i++) { for (size_t j = 0; j < this->n_embd; j++) { @@ -78,23 +84,68 @@ namespace openchat { } utility::matrix attention(utility::matrix x) { + this->x = x; this->q = utility::dot(x, this->wq); this->k = utility::dot(x, this->wk); this->v = utility::dot(x, this->wv); - - return utility::dot( - utility::softmax( - utility::scalar_div( - utility::dot(this->q, - utility::transpose(this->k)), - std::sqrt(n_embd))), - this->v); + + this->p = utility::softmax(utility::scalar_div( + utility::dot(this->q, utility::transpose(this->k)), + std::sqrt(static_cast(n_embd)))); + + return utility::dot(this->p, this->v); } size_t getNEmbed() { return this->n_embd; } + std::pair> backward(utility::matrix dZ) { + utility::matrix dV = utility::dot(utility::transpose(this->p), dZ); + utility::matrix dP = utility::dot(dZ, utility::transpose(this->v)); + + int M = dP.rows; + utility::matrix dS(M, M); + + for (int i = 0; i < M; ++i) { + float sum_dP_P = 0.0f; + + for (int k = 0; k < M; ++k) { + sum_dP_P += dP.data[i * M + k] * this->p.data[i * M + k]; + } + + for (int j = 0; j < M; ++j) { + int idx = i * M + j; + dS.data[idx] = this->p.data[idx] * (dP.data[idx] - sum_dP_P); + } + } + + int K = this->q.cols; + float scale = 1.0f / std::sqrt(static_cast(K)); + for (int i = 0; i < M * M; ++i) { + dS.data[i] *= scale; + } + + utility::matrix dQ = utility::dot(dS, this->k); + utility::matrix dK = utility::dot(utility::transpose(dS), this->q); + + utility::matrix dWq = utility::dot(utility::transpose(this->x), dQ); + utility::matrix dWk = utility::dot(utility::transpose(this->x), dK); + utility::matrix dWv = utility::dot(utility::transpose(this->x), dV); + + utility::matrix dX_q = utility::dot(dQ, utility::transpose(this->wq)); + utility::matrix dX_k = utility::dot(dK, utility::transpose(this->wk)); + utility::matrix dX_v = utility::dot(dV, utility::transpose(this->wv)); + + utility::matrix dX(dX_q.rows, dX_q.cols); + for (int i = 0; i < dX.rows * dX.cols; ++i) { + dX.data[i] = dX_q.data[i] + dX_k.data[i] + dX_v.data[i]; + } + + std::vector weight_gradients = {dWq, dWk, dWv}; + return {dX, weight_gradients}; + } + void changeOne(char mat, size_t row, size_t col, float d) { if (mat == 'q') wq[row][col] -= d; else if (mat == 'k') wk[row][col] -= d; @@ -105,4 +156,4 @@ namespace openchat { }; } -#endif \ No newline at end of file +#endif diff --git a/include/utility.hpp b/include/utility.hpp index 686f2aa..d829931 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -49,6 +49,22 @@ namespace openchat { return c; } + + inline matrix subtract(const matrix& a, const matrix& b) { + if (a.cols != b.cols) + throw std::invalid_argument("Columns don't match!"); + if (a.rows != b.rows) + throw std::invalid_argument("Rows don't match!"); + + matrix c = matrix(a.rows, b.cols); + for (size_t i = 0; i < a.rows; i++ ) { + for (size_t j = 0; j < a.cols; j++) { + c[i][j] = a[i][j] - b[i][j]; + } + } + + return c; + } inline matrix dot(const matrix& a, const matrix& b) { if (a.cols != b.rows) throw std::invalid_argument("Inner dimensions don't match!"); From dcc57bc7e841eb0f411cef585bb234c3b33d3d6f Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sat, 20 Jun 2026 19:51:21 -0500 Subject: [PATCH 2/8] finished backprop --- include/block.hpp | 6 +++--- include/layer.hpp | 20 ++++++++++++++++---- include/model.hpp | 35 ++++++++++++++++++++++++++++++++--- include/neural_network.hpp | 6 +++--- 4 files changed, 54 insertions(+), 13 deletions(-) diff --git a/include/block.hpp b/include/block.hpp index 7c32beb..5c437fc 100644 --- a/include/block.hpp +++ b/include/block.hpp @@ -48,12 +48,12 @@ namespace openchat { return x; } - std::pair, std::vector>> backward(utility::matrix dZ) { - std::pair> net_pass = network.backward(dZ); + std::pair>, std::vector>> backward(utility::matrix dZ) { + std::pair>> net_pass = network.backward(dZ); std::pair> attn_pass = attention.backward(net_pass.first); - std::pair, std::vector> dW; + std::pair>, std::vector> dW; dW.first = net_pass.second; dW.second = attn_pass.second; diff --git a/include/layer.hpp b/include/layer.hpp index 85beb4d..044e179 100644 --- a/include/layer.hpp +++ b/include/layer.hpp @@ -81,11 +81,23 @@ namespace openchat { return z; } - std::pair backward(utility::matrix dZ) { - utility::matrix dW = utility::dot(utility::transpose(X), dZ); - utility::matrix dX = utility::dot(dZ, utility::transpose(this->weights)); + std::pair> backward(utility::matrix dZ) { + int M = dZ.rows; + int N = dZ.cols; - return {dX, dW}; + utility::matrix dW = utility::dot(utility::transpose(this->X), dZ); + utility::matrix dX = utility::dot(dZ, utility::transpose(this->weights)); + + utility::matrix db(1, N); + std::fill(db.data.begin(), db.data.end(), 0.0f); + + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + db.data[j] += dZ.data[i * N + j]; + } + } + + return {dX, {dW, db}}; } void changeOne(float d, size_t n_in, size_t n_out) { diff --git a/include/model.hpp b/include/model.hpp index af1fecf..281ca31 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -69,7 +69,7 @@ namespace openchat { return dist; } - void backward(std::string input, size_t epochs = 100) { + void train(std::string input, size_t epochs = 100) { std::forward_list corpus = tokenizer.encode(input); auto length = std::distance(corpus.begin(), corpus.end()); @@ -87,16 +87,45 @@ namespace openchat { utility::matrix dZ = utility::subtract(dist, oneHot); - std::vector, std::vector>> bdW; + std::vector>, std::vector>> bdW; utility::matrix edW; for (auto it = blocks.rbegin(); it != blocks.rend(); ++it) { - std::pair, std::vector>> p = it->backward(dZ); + std::pair>, std::vector>> p = it->backward(dZ); dZ = p.first; bdW.push_back(p.second); } embedder.backward(dZ, this->learning_rate); + + int j = 0; + for (block& b : this->blocks) { + std::vector> ndW = bdW[j].first; + std::vector adW = bdW[j].second; + + for (size_t layer = ndW.size() - 1; layer >= 0; layer--) { + for (int k = 0; k < ndW[layer].first.cols; k++) + b.changeOne(layer, ndW[layer].first[0][k] * this->learning_rate, k); + + for (int k = 0; k < ndW[layer].second.rows; k ++) + for (int l = 0; l < ndW[layer].second.rows; l++) + b.changeOne(layer, ndW[layer].second[k][l] * this->learning_rate, k, l); + } + + for (int k = 0; k < adW[0].rows; k++) + for (int l = 0; l < adW[0].rows; l++) + b.changeOne('q', k, l, adW[0][k][l] * this->learning_rate); + + for (int k = 0; k < adW[1].rows; k++) + for (int l = 0; l < adW[1].rows; l++) + b.changeOne('k', k, l, adW[1][k][l] * this->learning_rate); + + for (int k = 0; k < adW[2].rows; k++) + for (int l = 0; l < adW[2].rows; l++) + b.changeOne('v', k, l, adW[2][k][l] * this->learning_rate); + + j++; + } } } diff --git a/include/neural_network.hpp b/include/neural_network.hpp index d223f8d..71234ed 100644 --- a/include/neural_network.hpp +++ b/include/neural_network.hpp @@ -88,10 +88,10 @@ namespace openchat { return x; } - std::pair> backward(utility::matrix dZ) { - std::pair> ndW; + std::pair>> backward(utility::matrix dZ) { + std::pair>> ndW; for (auto it = this->network.rbegin(); it != this->network.rend(); ++it) { - std::pair p = it->backward(dZ); + std::pair> p = it->backward(dZ); dZ = p.first; ndW.second.push_back(p.second); } From d7c055983890596245ee70224ce31ac4cabbb175 Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sat, 20 Jun 2026 20:03:36 -0500 Subject: [PATCH 3/8] Update include/embedder.hpp Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- include/embedder.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/embedder.hpp b/include/embedder.hpp index 84617b1..1bf839f 100644 --- a/include/embedder.hpp +++ b/include/embedder.hpp @@ -109,10 +109,13 @@ namespace openchat { } void backward(utility::matrix dZ, float lr) { + auto tok_it = this->toks.begin(); for (int i = 0; i < dZ.rows; i++) { + int tok = *tok_it; for (int j = 0; j < dZ.cols; j++) { - this->table[*std::next(this->toks.begin(), i)][j] += dZ[i][j] * lr; + this->table[tok][j] -= dZ[i][j] * lr; } + +tok_it; } } From d7efe3d14c9b7350dc94782235e122da3388a70a Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sat, 20 Jun 2026 20:04:23 -0500 Subject: [PATCH 4/8] Update include/layer.hpp Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- include/layer.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/layer.hpp b/include/layer.hpp index 044e179..d0f15e9 100644 --- a/include/layer.hpp +++ b/include/layer.hpp @@ -76,7 +76,12 @@ namespace openchat { utility::matrix feedForward(utility::matrix x) { this->X = x; - utility::matrix z = utility::add(utility::dot(x, this->weights), this->biases); + utility::matrix z = utility::dot(x, this->weights); + for (size_t i = 0; i < z.rows; i++) { + for (size_t j = 0; j < z.cols; j++) { + z[i][j] += this->biases[0][j]; + } + } for (size_t i = 0; i < z.rows; i++) for (size_t j = 0; j < z.cols; j++) z[i][j] = utility::relu(z[i][j]); return z; } From e0de46863de68d50cfc7a334d24d7dcb5bd6c934 Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:57:09 -0500 Subject: [PATCH 5/8] Update include/neural_network.hpp Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- include/neural_network.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/neural_network.hpp b/include/neural_network.hpp index 71234ed..75a222b 100644 --- a/include/neural_network.hpp +++ b/include/neural_network.hpp @@ -56,7 +56,7 @@ namespace openchat { } void layerNorm(std::vector &x, size_t start, size_t end) { - std::vector input(x[start], x[end]); + std::vector input(x.begin() + start, x.begin() + end); if (input.empty()) return; From f770a6d4bc08c9690c4aa29b4c26c679b9eb6c07 Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sun, 21 Jun 2026 10:33:14 -0500 Subject: [PATCH 6/8] bug fixes --- AIUsage.md | 1 + include/embedder.hpp | 6 +++--- include/layer.hpp | 2 ++ include/model.hpp | 25 +++++++++++++------------ 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/AIUsage.md b/AIUsage.md index 918539f..4932e78 100644 --- a/AIUsage.md +++ b/AIUsage.md @@ -3,6 +3,7 @@ ## Source Code - Notebook LM was used to understand technical articles relevant to the creation of code + - CodeRabbit AI was used for PR review ### Tokenizer diff --git a/include/embedder.hpp b/include/embedder.hpp index 84617b1..2901bcf 100644 --- a/include/embedder.hpp +++ b/include/embedder.hpp @@ -96,9 +96,9 @@ namespace openchat { utility::matrix vec(std::distance(toks.begin(), toks.end()), this->n_embd); int i = 0; for (int tok : toks) { - std::vector emb = this->embed(tok); - std::copy(emb.begin(), emb.end(), vec[i]); - i++; + std::vector emb = this->embed(tok); + std::copy(emb.begin(), emb.end(), vec[i]); + i++; } return vec; diff --git a/include/layer.hpp b/include/layer.hpp index 044e179..d80bf9d 100644 --- a/include/layer.hpp +++ b/include/layer.hpp @@ -13,6 +13,7 @@ namespace openchat { utility::matrix biases; utility::matrix X; + utility::matrix Z; std::default_random_engine generator; std::normal_distribution initDist; @@ -77,6 +78,7 @@ namespace openchat { utility::matrix feedForward(utility::matrix x) { this->X = x; utility::matrix z = utility::add(utility::dot(x, this->weights), this->biases); + this->Z = z; for (size_t i = 0; i < z.rows; i++) for (size_t j = 0; j < z.cols; j++) z[i][j] = utility::relu(z[i][j]); return z; } diff --git a/include/model.hpp b/include/model.hpp index 281ca31..062e518 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -81,9 +81,9 @@ namespace openchat { int next = *std::next(corpus.begin(), i + start); utility::matrix dist = forwardPass(tokens); - utility::matrix oneHot = utility::matrix(dist.rows, 1); - oneHot[next][0] = 1; - float loss = -1 * std::log(dist[next][0]); + utility::matrix oneHot = utility::matrix(dist.rows, dist.cols); + oneHot[0][next] = 1; + float loss = -1 * std::log(dist[0][next]); utility::matrix dZ = utility::subtract(dist, oneHot); @@ -99,29 +99,30 @@ namespace openchat { embedder.backward(dZ, this->learning_rate); int j = 0; - for (block& b : this->blocks) { + for (auto it = this->blocks.rbegin(); it != this->blocks.rend(); ++it) { + block &b = *it; std::vector> ndW = bdW[j].first; std::vector adW = bdW[j].second; - for (size_t layer = ndW.size() - 1; layer >= 0; layer--) { - for (int k = 0; k < ndW[layer].first.cols; k++) + for (size_t layer = 0; layer < ndW.size(); layer++) { + for (int k = 0; k < ndW[layer].first.cols; k++) b.changeOne(layer, ndW[layer].first[0][k] * this->learning_rate, k); - for (int k = 0; k < ndW[layer].second.rows; k ++) - for (int l = 0; l < ndW[layer].second.rows; l++) - b.changeOne(layer, ndW[layer].second[k][l] * this->learning_rate, k, l); + for (int k = 0; k < ndW[layer].second.rows; k++) + for (int l = 0; l < ndW[layer].second.cols; l++) + b.changeOne(layer, ndW[layer].second[k][l] * this->learning_rate, k, l); } for (int k = 0; k < adW[0].rows; k++) - for (int l = 0; l < adW[0].rows; l++) + for (int l = 0; l < adW[0].cols; l++) b.changeOne('q', k, l, adW[0][k][l] * this->learning_rate); for (int k = 0; k < adW[1].rows; k++) - for (int l = 0; l < adW[1].rows; l++) + for (int l = 0; l < adW[1].cols; l++) b.changeOne('k', k, l, adW[1][k][l] * this->learning_rate); for (int k = 0; k < adW[2].rows; k++) - for (int l = 0; l < adW[2].rows; l++) + for (int l = 0; l < adW[2].cols; l++) b.changeOne('v', k, l, adW[2][k][l] * this->learning_rate); j++; From 5fd98cfa09b2caf523a16d3f10f460ded8334c03 Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sun, 21 Jun 2026 10:34:48 -0500 Subject: [PATCH 7/8] bug fixes --- .DS_Store | Bin 0 -> 10244 bytes .github/workflows/documentation.yml | 34 ++++++++++++++++++++++++++++ .github/workflows/macos.yml | 27 ++++++++++++++++++++++ .github/workflows/pre-commit.yml | 15 ++++++++++++ .github/workflows/ubuntu.yml | 31 +++++++++++++++++++++++++ .github/workflows/windows.yml | 28 +++++++++++++++++++++++ include/embedder.hpp | 2 +- 7 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 .DS_Store create mode 100644 .github/workflows/documentation.yml create mode 100644 .github/workflows/macos.yml create mode 100644 .github/workflows/pre-commit.yml create mode 100644 .github/workflows/ubuntu.yml create mode 100644 .github/workflows/windows.yml diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..601adcf7f660eeac9f96882c51c4b5b8d297e08b GIT binary patch literal 10244 zcmeHMYiu3G6+UO};N2`CYh#lTa&s@fKuqie<2>4sl3c$;Ozb$xwc|$~m%Vo<_A2XL zv%B{i6Qwf0cm$B>k5;8X|5b$_yxJhz{y{1Tkboa3M1X__kOBaeBB#dXlw2b4dLEYrCE^%Fk@Nvfm;wgd zU(@aDcBo$i+ zCE^M|w4T@^P@Z20xK1jfnTRe(N;MQvsuGo|7+owCAgNRbj4mG-^^DOC z1x4@FeoBY~W+aW%*aEQymRo?sWi@oe0E|LVd4C^pyh7gb@>I`ddMIxDuLzU_SOs|~ zf(;H>rL6Bdo;yWc4$gYI`+HD;pr(cAsjjkx1{kDXd!Zi=LKga~rLna}T7x4b*$sU# zL{z*m)M)&wMq@36;M0t3a7CPg9F58&-Ykv0c65AxPgKlr1?~J?FdJFt#r&*e2VrOD zMO5nQS6$Z7pf%_XC$iJtTsAbrV$fsy$9QSP^PGG+Z|0BMY%XUs_j+z<+Ai~jf^9Jp zIX-5)mN(bq6;GFX{wep)}J}kwR>krYWwb8XFF17c6W7lq;_oIb@r^Lwxn;` zb0l}#nQ^@bXhsw;+Av{_EH9?#D7V5at)SrKY1F)l6-4bqDKZGYSvJUrh7p-W)mH15 zueoBa(b#llV{>CmOUqU3)?dBhnhiPQ%Dh=PJ>~PM_Ii%z_c%p%Y|6K-?5tU^-Kp`A z-4$l-yIIa?su4ZHtdmQuk@;DYX$%d3K;%nyhz(;1&IecyDaSkL?k$ILTl*zNJc(3=^z z3!W=SzE(6##X~;hO{VvKA=e?DIZ)2D$&3MMnW2Mub2%4LOutlkN#sXd&t+OVb4b=` z^!02fbPT~!I1P8hBk(vp34epv-~;#^{)G)_U@Nxc4!jlj;eH&zNi5(r+UUy$l(Ls3 zWVxJTK)Dr`-U&**5({Fon4w)_eP+l4ZcT3S zCDV)*Jf}G0QcYvQbP7chX5Eg%?I@(H*Q>%iYL;?*GVuJ6e1ICOScyxlsHQ!A#k%Bm zZCks3_KQsaz|i3f+)s7A<*Kog9oTt?O_)J}xfXfj=$vhZ)02coH?&XL%-6I=>=<#) z8FrzGd-TRcLdq=Zlv^1u=IuL+b~taORaN$pXiaOXasIePlN|Ca-l8RH5{cH$s(Os` zdwoW!Wb29c4pJ~j0y0N5%+8xN%?L(L)0QDs^_{y^&9Fv=Ge)di_7Lm9SV{VD#BAB8 zYU2fOW`-scNM541PgRZAPV$Zi%^)0}W$xmTdiSg9WH4>|Y(ZJ!XUf|N#>glAh zihv}GH|T9i<1zw~EVd`NCJlwKD~p}UF5RdjWNJcpXL74#PZ*WOz4|8I z08JKpl6^^oBToIHeE)TL8{Q?~|BQV9KUjw= zi$KB~_zu30pVavFP}!>`#cx*j?V7{ox=S4DEIQlf!G2+parnLFWbi-z{-yRwS6hXNKw3lA`e_{K~kYY2^Zmf u@f=SD{v0n}G>8}7*+hLNl3S2esFR*hC;C778Bq126W{;i`~Uyn{r@|(kixtG literal 0 HcmV?d00001 diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000..d741a19 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,34 @@ +name: Documentation + +on: + push: + tags: + - "*" + branches: [ main, master ] + +jobs: + build: + name: Build and publish documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - name: Install Docs + run: | + sudo apt-get install doxygen + pip install jinja2 Pygments + - name: prepare + run: | + make prepare + - name: configure + run: | + cmake -H. -Bbuild -G "Unix Makefiles" -DCMAKE_BUILD_TYPE="Debug" + - name: building + run: | + cmake --build build --config Debug --target docs -j4 + - name: Deploy to GitHub Pages + uses: Cecilapp/GitHub-Pages-deploy@v3 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + build_dir: ./docs/html diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 0000000..40ce4ca --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,27 @@ +name: MacOS CI Test + +on: + push: + branches: [ main, master, dev ] + pull_request: + branches: [ main, master, dev ] + +jobs: + build: + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + - name: prepare + run: | + make prepare + - name: configure + run: | + cmake -H. -Bbuild -G "Unix Makefiles" -DCMAKE_BUILD_TYPE="Debug" + - name: building + run: | + cmake --build build --config Debug --target unit_tests -j4 + - name: testing + run: | + cd build + cd tests + ./unit_tests diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..5b607b2 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,15 @@ +name: pre-commit + +on: + push: + branches: [ main, master, dev ] + pull_request: + branches: [ main, master, dev ] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: pre-commit/action@v2.0.0 diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml new file mode 100644 index 0000000..e9ca5ac --- /dev/null +++ b/.github/workflows/ubuntu.yml @@ -0,0 +1,31 @@ +name: Ubuntu CI Test + +on: + push: + branches: [ main, master, dev ] + pull_request: + branches: [ main, master, dev ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: install + run: | + sudo apt-get install gcovr lcov + - name: prepare + run: | + make prepare + - name: configure + run: | + cmake -H. -Bbuild -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Debug -DENABLE_COVERAGE=On + - name: building + run: | + cmake --build build --config Debug --target coverage -j4 + - name: testing + run: | + cd build + cd tests + ./unit_tests + bash <(curl -s https://codecov.io/bash) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 0000000..7be18e6 --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,28 @@ +name: Windows CI Test + +on: + push: + branches: [ main, master, dev ] + pull_request: + branches: [ main, master, dev ] + +jobs: + build: + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + - name: prepare + run: | + make prepare + - name: configure + run: | + cmake -H"." -Bbuild -T host=x86 -A x64 -DCMAKE_BUILD_TYPE="Debug" + - name: building + run: | + cmake --build build --config Debug --target unit_tests -j4 + - name: testing + run: | + cd build + cd tests + cd Debug + .\unit_tests.exe diff --git a/include/embedder.hpp b/include/embedder.hpp index d4d6c8c..60dc182 100644 --- a/include/embedder.hpp +++ b/include/embedder.hpp @@ -115,7 +115,7 @@ namespace openchat { for (int j = 0; j < dZ.cols; j++) { this->table[tok][j] -= dZ[i][j] * lr; } - +tok_it; + ++tok_it; } } From 5654136a2bbab7b4cf235238fc2220d5743e6379 Mon Sep 17 00:00:00 2001 From: Ben Kiev <146984941+Bean91@users.noreply.github.com> Date: Sun, 21 Jun 2026 10:38:08 -0500 Subject: [PATCH 8/8] removed workflows - uneeded --- .github/workflows/documentation.yml | 34 ----------------------------- .github/workflows/macos.yml | 27 ----------------------- .github/workflows/pre-commit.yml | 15 ------------- .github/workflows/ubuntu.yml | 31 -------------------------- .github/workflows/windows.yml | 28 ------------------------ 5 files changed, 135 deletions(-) delete mode 100644 .github/workflows/documentation.yml delete mode 100644 .github/workflows/macos.yml delete mode 100644 .github/workflows/pre-commit.yml delete mode 100644 .github/workflows/ubuntu.yml delete mode 100644 .github/workflows/windows.yml diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml deleted file mode 100644 index d741a19..0000000 --- a/.github/workflows/documentation.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Documentation - -on: - push: - tags: - - "*" - branches: [ main, master ] - -jobs: - build: - name: Build and publish documentation - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - name: Install Docs - run: | - sudo apt-get install doxygen - pip install jinja2 Pygments - - name: prepare - run: | - make prepare - - name: configure - run: | - cmake -H. -Bbuild -G "Unix Makefiles" -DCMAKE_BUILD_TYPE="Debug" - - name: building - run: | - cmake --build build --config Debug --target docs -j4 - - name: Deploy to GitHub Pages - uses: Cecilapp/GitHub-Pages-deploy@v3 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - build_dir: ./docs/html diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml deleted file mode 100644 index 40ce4ca..0000000 --- a/.github/workflows/macos.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: MacOS CI Test - -on: - push: - branches: [ main, master, dev ] - pull_request: - branches: [ main, master, dev ] - -jobs: - build: - runs-on: macos-latest - steps: - - uses: actions/checkout@v2 - - name: prepare - run: | - make prepare - - name: configure - run: | - cmake -H. -Bbuild -G "Unix Makefiles" -DCMAKE_BUILD_TYPE="Debug" - - name: building - run: | - cmake --build build --config Debug --target unit_tests -j4 - - name: testing - run: | - cd build - cd tests - ./unit_tests diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml deleted file mode 100644 index 5b607b2..0000000 --- a/.github/workflows/pre-commit.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: pre-commit - -on: - push: - branches: [ main, master, dev ] - pull_request: - branches: [ main, master, dev ] - -jobs: - pre-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: pre-commit/action@v2.0.0 diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml deleted file mode 100644 index e9ca5ac..0000000 --- a/.github/workflows/ubuntu.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Ubuntu CI Test - -on: - push: - branches: [ main, master, dev ] - pull_request: - branches: [ main, master, dev ] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: install - run: | - sudo apt-get install gcovr lcov - - name: prepare - run: | - make prepare - - name: configure - run: | - cmake -H. -Bbuild -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Debug -DENABLE_COVERAGE=On - - name: building - run: | - cmake --build build --config Debug --target coverage -j4 - - name: testing - run: | - cd build - cd tests - ./unit_tests - bash <(curl -s https://codecov.io/bash) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml deleted file mode 100644 index 7be18e6..0000000 --- a/.github/workflows/windows.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Windows CI Test - -on: - push: - branches: [ main, master, dev ] - pull_request: - branches: [ main, master, dev ] - -jobs: - build: - runs-on: windows-latest - steps: - - uses: actions/checkout@v2 - - name: prepare - run: | - make prepare - - name: configure - run: | - cmake -H"." -Bbuild -T host=x86 -A x64 -DCMAKE_BUILD_TYPE="Debug" - - name: building - run: | - cmake --build build --config Debug --target unit_tests -j4 - - name: testing - run: | - cd build - cd tests - cd Debug - .\unit_tests.exe