Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions AIUsage.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Source Code

- NotebookLM was used to understand technical articles relevant to writing the code
- CodeRabbit AI was used for PR review

### Tokenizer

Expand Down
20 changes: 16 additions & 4 deletions include/block.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "neural_network.hpp"
#include "self_attention.hpp"
#include "utility.hpp"
#include <vector>

namespace openchat {
class block {
Expand Down Expand Up @@ -41,14 +43,24 @@ namespace openchat {

/// Forward pass of one block: self-attention followed by the feed-forward network.
///
/// @param x Input matrix, taken by value and reused as the working buffer.
/// @return The output of network.feedForward applied to attention.attention(x).
utility::matrix feedForward(utility::matrix x) {
    x = attention.attention(x);
    // network.feedForward takes the whole matrix, so the rows are not fed
    // through one by one (doing both would apply the network twice).
    x = network.feedForward(x);

    return x;
}

/// Backward pass of one block, in the reverse of the forward order:
/// first through the feed-forward network, then through attention.
///
/// @param dZ Gradient arriving at the block's output.
/// @return {gradient leaving attention.backward,
///          {per-layer gradient pairs from network.backward,
///           matrices returned by attention.backward}}.
std::pair<utility::matrix, std::pair<std::vector<std::pair<utility::matrix, utility::matrix>>, std::vector<utility::matrix>>> backward(utility::matrix dZ) {
    using network_grads = std::vector<std::pair<utility::matrix, utility::matrix>>;
    using attention_grads = std::vector<utility::matrix>;

    std::pair<utility::matrix, network_grads> from_network = network.backward(dZ);
    std::pair<utility::matrix, attention_grads> from_attention = attention.backward(from_network.first);

    // Bundle both parameter-gradient sets so the caller can apply them later.
    std::pair<network_grads, attention_grads> grads;
    grads.second = from_attention.second;
    grads.first = from_network.second;

    return {from_attention.first, grads};
}


void changeOne(char mat, size_t row, size_t col, float d) {
this->attention.changeOne(mat, row, col, d);
}
Expand Down
27 changes: 27 additions & 0 deletions include/embedder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <cstddef>
#include <filesystem>
#include <forward_list>
#include <random>
#include <fstream>
#include "utility.hpp"
Expand All @@ -14,6 +15,8 @@ namespace openchat {
size_t n_tok;
size_t n_embd;

std::forward_list<int> toks;

std::default_random_engine generator;
std::normal_distribution<float> initDist;

Expand Down Expand Up @@ -88,10 +91,34 @@ namespace openchat {
return vec;
}

/// Embeds a token sequence into a matrix with one row per token.
///
/// The sequence is also stored in this->toks, which backward() later reads
/// to find the table row that belongs to each gradient row.
///
/// @param toks Token ids, in order.
/// @return Matrix with (number of tokens) rows and n_embd columns.
utility::matrix embed(std::forward_list<int> toks) {
    this->toks = toks;

    utility::matrix vec(std::distance(toks.begin(), toks.end()), this->n_embd);

    int row = 0;
    for (auto it = toks.begin(); it != toks.end(); ++it, ++row) {
        // Look up the embedding for this token and copy it into its row.
        std::vector<float> emb = this->embed(*it);
        std::copy(emb.begin(), emb.end(), vec[row]);
    }

    return vec;
}

/// @return Pointer to the embedder's own table (not a copy).
utility::matrix * getTable() {
    return &table;
}

/// Applies a gradient step to the embedding table.
///
/// Row i of dZ is matched with the i-th token stored by the last
/// embed(std::forward_list<int>) call; that token's table row is decreased
/// by dZ[i] * lr.
///
/// @param dZ Gradient, one row per embedded token.
/// @param lr Step size multiplied into every gradient entry.
void backward(utility::matrix dZ, float lr) {
    auto tok_it = this->toks.begin();
    const auto tok_end = this->toks.end();

    // Stop at whichever runs out first: advancing a forward_list iterator past
    // end() is undefined behavior, so dZ rows without a cached token are ignored.
    for (size_t i = 0; i < static_cast<size_t>(dZ.rows) && tok_it != tok_end; ++i, ++tok_it) {
        const int tok = *tok_it;
        for (size_t j = 0; j < static_cast<size_t>(dZ.cols); ++j) {
            this->table[tok][j] -= dZ[i][j] * lr;
        }
    }
}
Comment thread
Bean91 marked this conversation as resolved.

// Default constructor with an empty body.
embedder() {}
};
}
Expand Down
40 changes: 33 additions & 7 deletions include/layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ namespace openchat {
private:
utility::matrix weights;
utility::matrix biases;

utility::matrix X;
utility::matrix Z;

std::default_random_engine generator;
std::normal_distribution<float> initDist;

Expand Down Expand Up @@ -71,14 +75,36 @@ namespace openchat {
this->readFromFile(input);
}

std::vector<float> feedForward(std::vector<float> input) {
utility::matrix i (1, input.size());
i.data = input;
std::vector<float> output = utility::add(utility::dot(i, this->weights), this->biases).data;
for (size_t i = 0; i < output.size(); i++) {
output[i] = utility::relu(output[i]);
utility::matrix feedForward(utility::matrix x) {
this->X = x;
utility::matrix z = utility::dot(x, this->weights);
this->Z = z;
for (size_t i = 0; i < z.rows; i++) {
for (size_t j = 0; j < z.cols; j++) {
z[i][j] += this->biases[0][j];
}
}
for (size_t i = 0; i < z.rows; i++) for (size_t j = 0; j < z.cols; j++) z[i][j] = utility::relu(z[i][j]);
return z;
}
Comment thread
Bean91 marked this conversation as resolved.

/// Backward pass of one layer, using the input stored by feedForward.
///
///   dW = transpose(X) * dZ
///   dX = dZ * transpose(weights)
///   db = column-wise sum of dZ (a 1 x N matrix)
///
/// NOTE(review): dZ is used as-is; no activation derivative is applied here
/// even though feedForward applies utility::relu — confirm the caller handles it.
///
/// @param dZ Gradient arriving at the layer's output.
/// @return {dX, {dW, db}}.
std::pair<utility::matrix, std::pair<utility::matrix, utility::matrix>> backward(utility::matrix dZ) {
    const int M = dZ.rows;
    const int N = dZ.cols;

    utility::matrix dW = utility::dot(utility::transpose(this->X), dZ);
    utility::matrix dX = utility::dot(dZ, utility::transpose(this->weights));

    // Sum dZ over its rows; the flat index i * N + j walks dZ.data row by row.
    utility::matrix db(1, N);
    std::fill(db.data.begin(), db.data.end(), 0.0f);
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            db.data[j] += dZ.data[i * N + j];
        }
    }

    return {dX, {dW, db}};
}
Comment thread
Bean91 marked this conversation as resolved.

void changeOne(float d, size_t n_in, size_t n_out) {
Expand Down
91 changes: 73 additions & 18 deletions include/model.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#ifndef MODEL_HPP
#define MODEL_HPP

#include <cstddef>
#include <forward_list>
#include <stdexcept>
#include <vector>
#include <filesystem>
#include <utility>
Expand All @@ -17,6 +19,7 @@ namespace openchat {
tokenizer tokenizer;
embedder embedder;
std::vector<block> blocks;
float learning_rate = 0.01;

public:
void init() {
Expand All @@ -33,25 +36,24 @@ namespace openchat {
}
}

model(class tokenizer &tokenizer, class embedder &embedder, std::vector<class block> &blocks) {
model(class tokenizer &tokenizer, class embedder &embedder, std::vector<class block> &blocks, float learning_rate = 0.01) {
this->tokenizer = tokenizer;
this->embedder = embedder;
this->blocks = blocks;

this->learning_rate = learning_rate;

this->init();
}

std::string forwardPass(std::string input) {
std::forward_list<int> tokens = tokenizer.encode(input);
void changeLearningRate(float n) {
this->learning_rate = n;
}

utility::matrix forwardPass(std::forward_list<int> tokens) {
utility::matrix unembed = utility::transpose(*embedder.getTable());

utility::matrix x = utility::matrix(std::distance(tokens.begin(), tokens.end()), embedder.getNEmbd());
int i = 0;
for (int token : tokens) {
std::vector<float> emb = embedder.embed(token);
std::copy(emb.begin(), emb.end(), x[i]);
i++;
}
utility::matrix x = embedder.embed(tokens);

x = positionalEncoding(x).apply();

Expand All @@ -64,17 +66,70 @@ namespace openchat {

dist = utility::softmax(utility::dot(dist, unembed));

float max = dist[0][0];
int token = 0;
for (size_t j = 1; j < dist.cols; j++) {
if (dist[0][j] > max) {
max = dist[0][j];
token = static_cast<int>(j);
return dist;
}

void train(std::string input, size_t epochs = 100) {
std::forward_list<int> corpus = tokenizer.encode(input);
auto length = std::distance(corpus.begin(), corpus.end());

if (length < static_cast<long long>(epochs)) throw std::invalid_argument("Give a longer input!");
size_t start = length - epochs;

for (size_t i = 0; i < epochs; i++) {
std::forward_list<int> tokens(corpus.begin(), std::next(corpus.begin(), i + start));
int next = *std::next(corpus.begin(), i + start);

utility::matrix dist = forwardPass(tokens);
utility::matrix oneHot = utility::matrix(dist.rows, dist.cols);
oneHot[0][next] = 1;
float loss = -1 * std::log(dist[0][next]);

utility::matrix dZ = utility::subtract(dist, oneHot);

std::vector<std::pair<std::vector<std::pair<utility::matrix, utility::matrix>>, std::vector<utility::matrix>>> bdW;
utility::matrix edW;

for (auto it = blocks.rbegin(); it != blocks.rend(); ++it) {
std::pair<utility::matrix, std::pair<std::vector<std::pair<utility::matrix, utility::matrix>>, std::vector<utility::matrix>>> p = it->backward(dZ);
dZ = p.first;
bdW.push_back(p.second);
}
}

embedder.backward(dZ, this->learning_rate);

int j = 0;
for (auto it = this->blocks.rbegin(); it != this->blocks.rend(); ++it) {
block &b = *it;
std::vector<std::pair<utility::matrix, utility::matrix>> ndW = bdW[j].first;
std::vector<utility::matrix> adW = bdW[j].second;

for (size_t layer = 0; layer < ndW.size(); layer++) {
for (int k = 0; k < ndW[layer].first.cols; k++)
b.changeOne(layer, ndW[layer].first[0][k] * this->learning_rate, k);

for (int k = 0; k < ndW[layer].second.rows; k++)
for (int l = 0; l < ndW[layer].second.cols; l++)
b.changeOne(layer, ndW[layer].second[k][l] * this->learning_rate, k, l);
}

for (int k = 0; k < adW[0].rows; k++)
for (int l = 0; l < adW[0].cols; l++)
b.changeOne('q', k, l, adW[0][k][l] * this->learning_rate);

return tokenizer.decode({token});
for (int k = 0; k < adW[1].rows; k++)
for (int l = 0; l < adW[1].cols; l++)
b.changeOne('k', k, l, adW[1][k][l] * this->learning_rate);

for (int k = 0; k < adW[2].rows; k++)
for (int l = 0; l < adW[2].cols; l++)
b.changeOne('v', k, l, adW[2][k][l] * this->learning_rate);
Comment thread
Bean91 marked this conversation as resolved.

j++;
Comment thread
Bean91 marked this conversation as resolved.
}
}
}

};
}

Expand Down
46 changes: 26 additions & 20 deletions include/neural_network.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,16 @@
#include <limits>
#include <vector>
#include "layer.hpp"
#include "utility.hpp"

namespace openchat {
class neuralNetwork {
private:
std::vector<layer> network;
std::vector<size_t> dimensions;

std::vector<float> gain;
std::vector<float> bias;

public:
void init() {
this->gain = std::vector<float>(this->dimensions[(this->dimensions.size() - 1)], 1.0f);
this->bias = std::vector<float>(this->dimensions[(this->dimensions.size() - 1)], 0.0f);
for (size_t i = 0; i < dimensions.size() - 1; i++) {
network.push_back(layer(dimensions[i], dimensions[i + 1]));
}
Expand All @@ -30,8 +26,6 @@ namespace openchat {

if (inFile.is_open()) {
inFile.read(reinterpret_cast<char *>(this->dimensions.data()), sizeof(size_t) * this->dimensions.size());
inFile.read(reinterpret_cast<char *>(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
inFile.read(reinterpret_cast<char *>(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
}

for (size_t i = 0; i < input.second.size(); i++) {
Expand All @@ -44,8 +38,6 @@ namespace openchat {

if (outFile.is_open()) {
outFile.write(reinterpret_cast<char *>(this->dimensions.data()), sizeof(size_t) * this->dimensions.size());
outFile.write(reinterpret_cast<char *>(this->gain.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
outFile.write(reinterpret_cast<char *>(this->bias.data()), sizeof(float) * this->dimensions[(this->dimensions.size() - 1)]);
}

for (size_t i = 0; i < output.second.size(); i++) {
Expand All @@ -63,7 +55,9 @@ namespace openchat {
this->readFromFile(input);
}

void layerNorm(std::vector<float> &input) {
void layerNorm(std::vector<float> &x, size_t start, size_t end) {
std::vector<float> input(x.begin() + start, x.begin() + end);

if (input.empty()) return;

float sum = 0;
Expand All @@ -75,17 +69,34 @@ namespace openchat {
float variance = sum / input.size();

for (int i = 0; i < input.size(); i++) {
input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits<float>::epsilon())) * this->gain[i] + this->bias[i];
input[i] = ((input[i] - mean)/std::sqrt(variance + std::numeric_limits<float>::epsilon()));
}

for (int i = 0; i < input.size(); i++) {
x[start+i] = input[i];
}
}

std::vector<float> feedForward(std::vector<float> input) {
utility::matrix feedForward(utility::matrix x) {
for (layer &l : network) {
input = l.feedForward(input);
x = l.feedForward(x);
}

this->layerNorm(input);
return input;
for (int i = 0; i < x.rows; i++) {
this->layerNorm(x.data, i*x.cols, i*x.cols+x.cols);
}
return x;
}

/// Sends a gradient back through the layers, last layer first.
///
/// Each layer's backward() receives the gradient produced by the layer after
/// it. The parameter-gradient pairs are collected in the order visited, so
/// entry 0 belongs to the last layer of the network.
///
/// @param dZ Gradient arriving at the network's output.
/// @return {gradient leaving the first layer, per-layer gradient pairs}.
std::pair<utility::matrix, std::vector<std::pair<utility::matrix, utility::matrix>>> backward(utility::matrix dZ) {
    std::vector<std::pair<utility::matrix, utility::matrix>> collected;

    for (size_t idx = this->network.size(); idx-- > 0;) {
        std::pair<utility::matrix, std::pair<utility::matrix, utility::matrix>> step = this->network[idx].backward(dZ);
        dZ = step.first;
        collected.push_back(step.second);
    }

    std::pair<utility::matrix, std::vector<std::pair<utility::matrix, utility::matrix>>> result;
    result.second = collected;
    result.first = dZ;
    return result;
}

void changeOne(size_t layer, float d, size_t n_in, size_t n_out) {
Expand All @@ -95,11 +106,6 @@ namespace openchat {
/// Forwards a single-index adjustment to one layer of the network.
///
/// @param layer Index into the network's layer list (not bounds-checked).
/// @param d     Amount passed through to the layer's changeOne.
/// @param n_in  Index passed through to the layer's changeOne.
void changeOne(size_t layer, float d, size_t n_in) {
    auto &target = this->network[layer];
    target.changeOne(d, n_in);
}

/// Subtracts d from one entry of the gain or bias vector.
///
/// @param x   0 selects gain, 1 selects bias; any other value does nothing.
/// @param d   Amount to subtract.
/// @param pos Index of the entry (not bounds-checked).
void changeOne(int x, float d, int pos) {
    switch (x) {
        case 0:
            gain[pos] -= d;
            break;
        case 1:
            bias[pos] -= d;
            break;
        default:
            break;
    }
}

// Default constructor with an empty body.
neuralNetwork() {}
};
Expand Down
Loading