diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..601adcf Binary files /dev/null and b/.DS_Store differ diff --git a/AIUsage.md b/AIUsage.md index 918539f..4932e78 100644 --- a/AIUsage.md +++ b/AIUsage.md @@ -3,6 +3,7 @@ ## Source Code - Notebook LM was used to understand technical articles relevant to the creation of code + - CodeRabbit AI was used for PR review ### Tokenizer diff --git a/include/block.hpp b/include/block.hpp index 9bace6b..5c437fc 100644 --- a/include/block.hpp +++ b/include/block.hpp @@ -3,6 +3,8 @@ #include "neural_network.hpp" #include "self_attention.hpp" +#include "utility.hpp" +#include namespace openchat { class block { @@ -41,14 +43,24 @@ namespace openchat { utility::matrix feedForward(utility::matrix x) { x = attention.attention(x); - for (int i = 0; i < x.rows; i++) { - std::vector output = network.feedForward(std::vector(x[i], x[i] + this->n_embd)); - std::copy(output.begin(), output.end(), x[i]); - } + x = network.feedForward(x); return x; } + std::pair>, std::vector>> backward(utility::matrix dZ) { + std::pair>> net_pass = network.backward(dZ); + + std::pair> attn_pass = attention.backward(net_pass.first); + + std::pair>, std::vector> dW; + dW.first = net_pass.second; + dW.second = attn_pass.second; + + return {attn_pass.first, dW}; + } + + void changeOne(char mat, size_t row, size_t col, float d) { this->attention.changeOne(mat, row, col, d); } diff --git a/include/embedder.hpp b/include/embedder.hpp index cd46ff7..60dc182 100644 --- a/include/embedder.hpp +++ b/include/embedder.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include "utility.hpp" @@ -14,6 +15,8 @@ namespace openchat { size_t n_tok; size_t n_embd; + std::forward_list toks; + std::default_random_engine generator; std::normal_distribution initDist; @@ -88,10 +91,34 @@ namespace openchat { return vec; } + utility::matrix embed(std::forward_list toks) { + this->toks = toks; + utility::matrix vec(std::distance(toks.begin(), toks.end()), this->n_embd); + int i = 0; + for (int tok : toks) { + std::vector emb = this->embed(tok); + std::copy(emb.begin(), emb.end(), vec[i]); + i++; + } + + return vec; + } + utility::matrix * getTable() { return &this->table; } + void backward(utility::matrix dZ, float lr) { + auto tok_it = this->toks.begin(); + for (int i = 0; i < dZ.rows; i++) { + int tok = *tok_it; + for (int j = 0; j < dZ.cols; j++) { + this->table[tok][j] -= dZ[i][j] * lr; + } + ++tok_it; + } + } + embedder() {} }; } diff --git a/include/layer.hpp b/include/layer.hpp index 68e610f..d9704dd 100644 --- a/include/layer.hpp +++ b/include/layer.hpp @@ -11,6 +11,10 @@ namespace openchat { private: utility::matrix weights; utility::matrix biases; + + utility::matrix X; + utility::matrix Z; + std::default_random_engine generator; std::normal_distribution initDist; @@ -71,14 +75,36 @@ namespace openchat { this->readFromFile(input); } - std::vector feedForward(std::vector input) { - utility::matrix i (1, input.size()); - i.data = input; - std::vector output = utility::add(utility::dot(i, this->weights), this->biases).data; - for (size_t i = 0; i < output.size(); i++) { - output[i] = utility::relu(output[i]); + utility::matrix feedForward(utility::matrix x) { + this->X = x; + utility::matrix z = utility::dot(x, this->weights); + this->Z = z; + for (size_t i = 0; i < z.rows; i++) { + for (size_t j = 0; j < z.cols; j++) { + z[i][j] += this->biases[0][j]; + } + } + for (size_t i = 0; i < z.rows; i++) for (size_t j = 0; j < z.cols; j++) z[i][j] = utility::relu(z[i][j]); + return z; + } + + std::pair> backward(utility::matrix dZ) { + int M = dZ.rows; + int N = dZ.cols; + + utility::matrix dW = utility::dot(utility::transpose(this->X), dZ); + utility::matrix dX = utility::dot(dZ, utility::transpose(this->weights)); + + utility::matrix db(1, N); + std::fill(db.data.begin(), db.data.end(), 0.0f); + + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + db.data[j] += dZ.data[i * N + j]; + } } - return output; + + return {dX, {dW, db}}; } void changeOne(float d, size_t n_in, size_t n_out) { diff --git a/include/model.hpp b/include/model.hpp index 7a52eac..062e518 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -1,7 +1,9 @@ #ifndef MODEL_HPP #define MODEL_HPP +#include #include +#include #include #include #include @@ -17,6 +19,7 @@ namespace openchat { tokenizer tokenizer; embedder embedder; std::vector blocks; + float learning_rate = 0.01; public: void init() { @@ -33,25 +36,24 @@ namespace openchat { } } - model(class tokenizer &tokenizer, class embedder &embedder, std::vector &blocks) { + model(class tokenizer &tokenizer, class embedder &embedder, std::vector &blocks, float learning_rate = 0.01) { this->tokenizer = tokenizer; this->embedder = embedder; this->blocks = blocks; + this->learning_rate = learning_rate; + this->init(); } - std::string forwardPass(std::string input) { - std::forward_list tokens = tokenizer.encode(input); + void changeLearningRate(float n) { + this->learning_rate = n; + } + + utility::matrix forwardPass(std::forward_list tokens) { utility::matrix unembed = utility::transpose(*embedder.getTable()); - utility::matrix x = utility::matrix(std::distance(tokens.begin(), tokens.end()), embedder.getNEmbd()); - int i = 0; - for (int token : tokens) { - std::vector emb = embedder.embed(token); - std::copy(emb.begin(), emb.end(), x[i]); - i++; - } + utility::matrix x = embedder.embed(tokens); x = positionalEncoding(x).apply(); @@ -64,17 +66,70 @@ namespace openchat { dist = utility::softmax(utility::dot(dist, unembed)); - float max = dist[0][0]; - int token = 0; - for (size_t j = 1; j < dist.cols; j++) { - if (dist[0][j] > max) { - max = dist[0][j]; - token = static_cast(j); + return dist; + } + + void train(std::string input, size_t epochs = 100) { + std::forward_list corpus = tokenizer.encode(input); + auto length = std::distance(corpus.begin(), corpus.end()); + + if (length < static_cast(epochs)) throw std::invalid_argument("Give a longer input!"); + size_t start = length - epochs; + + for (size_t i = 0; i < epochs; i++) { + std::forward_list tokens(corpus.begin(), std::next(corpus.begin(), i + start)); + int next = *std::next(corpus.begin(), i + start); + + utility::matrix dist = forwardPass(tokens); + utility::matrix oneHot = utility::matrix(dist.rows, dist.cols); + oneHot[0][next] = 1; + float loss = -1 * std::log(dist[0][next]); + + utility::matrix dZ = utility::subtract(dist, oneHot); + + std::vector>, std::vector>> bdW; + utility::matrix edW; + + for (auto it = blocks.rbegin(); it != blocks.rend(); ++it) { + std::pair>, std::vector>> p = it->backward(dZ); + dZ = p.first; + bdW.push_back(p.second); } - } + + embedder.backward(dZ, this->learning_rate); + + int j = 0; + for (auto it = this->blocks.rbegin(); it != this->blocks.rend(); ++it) { + block &b = *it; + std::vector> ndW = bdW[j].first; + std::vector adW = bdW[j].second; + + for (size_t layer = 0; layer < ndW.size(); layer++) { + for (int k = 0; k < ndW[layer].first.cols; k++) + b.changeOne(layer, ndW[layer].first[0][k] * this->learning_rate, k); + + for (int k = 0; k < ndW[layer].second.rows; k++) + for (int l = 0; l < ndW[layer].second.cols; l++) + b.changeOne(layer, ndW[layer].second[k][l] * this->learning_rate, k, l); + } + + for (int k = 0; k < adW[0].rows; k++) + for (int l = 0; l < adW[0].cols; l++) + b.changeOne('q', k, l, adW[0][k][l] * this->learning_rate); - return tokenizer.decode({token}); + for (int k = 0; k < adW[1].rows; k++) + for (int l = 0; l < adW[1].cols; l++) + b.changeOne('k', k, l, adW[1][k][l] * this->learning_rate); + + for (int k = 0; k < adW[2].rows; k++) + for (int l = 0; l < adW[2].cols; l++) + b.changeOne('v', k, l, adW[2][k][l] * this->learning_rate); + + j++; + } + } } + }; } diff --git a/include/neural_network.hpp b/include/neural_network.hpp index b677c21..75a222b 100644 --- a/include/neural_network.hpp +++ b/include/neural_network.hpp @@ -6,6 +6,7 @@ #include #include #include "layer.hpp" +#include "utility.hpp" namespace openchat { class neuralNetwork { @@ -13,13 +14,8 @@ namespace openchat { std::vector network; std::vector dimensions; - std::vector gain; - std::vector bias; - public: void init() { - this->gain = std::vector(this->dimensions[(this->dimensions.size() - 1)], 1.0f); - this->bias = std::vector(this->dimensions[(this->dimensions.size() - 1)], 0.0f); for (size_t i = 0; i < dimensions.size() - 1; i++) { network.push_back(layer(dimensions[i], dimensions[i + 1])); } @@ -30,8 +26,6 @@ namespace openchat { if (inFile.is_open()) { inFile.read(reinterpret_cast(this->dimensions.data()), sizeof(size_t) * this->dimensions.size()); - inFile.read(reinterpret_cast(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); - inFile.read(reinterpret_cast(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); } for (size_t i = 0; i < input.second.size(); i++) { @@ -44,8 +38,6 @@ namespace openchat { if (outFile.is_open()) { outFile.write(reinterpret_cast(this->dimensions.data()), sizeof(size_t) * this->dimensions.size()); - outFile.write(reinterpret_cast(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); - outFile.write(reinterpret_cast(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]); } for (size_t i = 0; i < output.second.size(); i++) { @@ -63,7 +55,9 @@ namespace openchat { this->readFromFile(input); } - void layerNorm(std::vector &input) { + void layerNorm(std::vector &x, size_t start, size_t end) { + std::vector input(x.begin() + start, x.begin() + end); + if (input.empty()) return; float sum = 0; @@ -75,17 +69,34 @@ namespace openchat { float variance = sum / input.size(); for (int i = 0; i < input.size(); i++) { - input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits::epsilon())) * this->gain[i] + this->bias[i]; + input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits::epsilon())); + } + + for (int i = 0; i < input.size(); i++) { + x[start+i] = input[i]; } } - std::vector feedForward(std::vector input) { + utility::matrix feedForward(utility::matrix x) { for (layer &l : network) { - input = l.feedForward(input); + x = l.feedForward(x); } - this->layerNorm(input); - return input; + for (int i = 0; i < x.rows; i++) { + this->layerNorm(x.data, i*x.cols, i*x.cols+x.cols); + } + return x; + } + + std::pair>> backward(utility::matrix dZ) { + std::pair>> ndW; + for (auto it = this->network.rbegin(); it != this->network.rend(); ++it) { + std::pair> p = it->backward(dZ); + dZ = p.first; + ndW.second.push_back(p.second); + } + ndW.first = dZ; + return ndW; } void changeOne(size_t layer, float d, size_t n_in, size_t n_out) { @@ -95,11 +106,6 @@ namespace openchat { void changeOne(size_t layer, float d, size_t n_in) { this->network[layer].changeOne(d, n_in); } - - void changeOne(int x, float d, int pos) { - if (x == 0) gain[pos] -= d; - if (x == 1) bias[pos] -= d; - } neuralNetwork() {} }; diff --git a/include/self_attention.hpp b/include/self_attention.hpp index 9a8f0e8..1eb8db1 100644 --- a/include/self_attention.hpp +++ b/include/self_attention.hpp @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include namespace openchat { class selfAttention { @@ -16,6 +19,8 @@ namespace openchat { utility::matrix q; utility::matrix k; utility::matrix v; + utility::matrix p; + utility::matrix x; size_t n_embd; @@ -24,7 +29,8 @@ namespace openchat { public: void init() { - initDist = std::normal_distribution(0, this->n_embd); + float stddev = 1.0f / std::sqrt(static_cast(this->n_embd)); + initDist = std::normal_distribution(0.0f, stddev); for (size_t i = 0; i < this->n_embd; i++) { for (size_t j = 0; j < this->n_embd; j++) { @@ -78,23 +84,68 @@ namespace openchat { } utility::matrix attention(utility::matrix x) { + this->x = x; this->q = utility::dot(x, this->wq); this->k = utility::dot(x, this->wk); this->v = utility::dot(x, this->wv); - - return utility::dot( - utility::softmax( - utility::scalar_div( - utility::dot(this->q, - utility::transpose(this->k)), - std::sqrt(n_embd))), - this->v); + + this->p = utility::softmax(utility::scalar_div( + utility::dot(this->q, utility::transpose(this->k)), + std::sqrt(static_cast(n_embd)))); + + return utility::dot(this->p, this->v); } size_t getNEmbed() { return this->n_embd; } + std::pair> backward(utility::matrix dZ) { + utility::matrix dV = utility::dot(utility::transpose(this->p), dZ); + utility::matrix dP = utility::dot(dZ, utility::transpose(this->v)); + + int M = dP.rows; + utility::matrix dS(M, M); + + for (int i = 0; i < M; ++i) { + float sum_dP_P = 0.0f; + + for (int k = 0; k < M; ++k) { + sum_dP_P += dP.data[i * M + k] * this->p.data[i * M + k]; + } + + for (int j = 0; j < M; ++j) { + int idx = i * M + j; + dS.data[idx] = this->p.data[idx] * (dP.data[idx] - sum_dP_P); + } + } + + int K = this->q.cols; + float scale = 1.0f / std::sqrt(static_cast(K)); + for (int i = 0; i < M * M; ++i) { + dS.data[i] *= scale; + } + + utility::matrix dQ = utility::dot(dS, this->k); + utility::matrix dK = utility::dot(utility::transpose(dS), this->q); + + utility::matrix dWq = utility::dot(utility::transpose(this->x), dQ); + utility::matrix dWk = utility::dot(utility::transpose(this->x), dK); + utility::matrix dWv = utility::dot(utility::transpose(this->x), dV); + + utility::matrix dX_q = utility::dot(dQ, utility::transpose(this->wq)); + utility::matrix dX_k = utility::dot(dK, utility::transpose(this->wk)); + utility::matrix dX_v = utility::dot(dV, utility::transpose(this->wv)); + + utility::matrix dX(dX_q.rows, dX_q.cols); + for (int i = 0; i < dX.rows * dX.cols; ++i) { + dX.data[i] = dX_q.data[i] + dX_k.data[i] + dX_v.data[i]; + } + + std::vector weight_gradients = {dWq, dWk, dWv}; + return {dX, weight_gradients}; + } + void changeOne(char mat, size_t row, size_t col, float d) { if (mat == 'q') wq[row][col] -= d; else if (mat == 'k') wk[row][col] -= d; @@ -105,4 +156,4 @@ namespace openchat { }; } -#endif \ No newline at end of file +#endif diff --git a/include/utility.hpp b/include/utility.hpp index 686f2aa..d829931 100644 --- a/include/utility.hpp +++ b/include/utility.hpp @@ -49,6 +49,22 @@ namespace openchat { return c; } + + inline matrix subtract(const matrix& a, const matrix& b) { + if (a.cols != b.cols) + throw std::invalid_argument("Columns don't match!"); + if (a.rows != b.rows) + throw std::invalid_argument("Rows don't match!"); + + matrix c = matrix(a.rows, b.cols); + for (size_t i = 0; i < a.rows; i++ ) { + for (size_t j = 0; j < a.cols; j++) { + c[i][j] = a[i][j] - b[i][j]; + } + } + + return c; + } inline matrix dot(const matrix& a, const matrix& b) { if (a.cols != b.rows) throw std::invalid_argument("Inner dimensions don't match!");