Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions demos/tinystories/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Demo executables for the TinyStories example.
# chai_add_executable is a project helper macro (defined elsewhere in the
# build); each call appears to take: target name, Chapel source file, and
# the ChAI library directory — confirm against the macro's definition.

chai_add_executable(TinyStoriesTest
    ${CMAKE_CURRENT_SOURCE_DIR}/testArgmaxDecode.chpl
    ${PROJECT_ROOT_DIR}/lib
)

chai_add_executable(TinyStoriesTestConv2d
    ${CMAKE_CURRENT_SOURCE_DIR}/testConv2d.chpl
    ${PROJECT_ROOT_DIR}/lib
)

chai_add_executable(OliverHowTo
    ${CMAKE_CURRENT_SOURCE_DIR}/oh2.chpl
    ${PROJECT_ROOT_DIR}/lib
)
5 changes: 5 additions & 0 deletions demos/tinystories/oh2.chpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Minimal smoke test for the ChAI Tensor module: builds an ndarray with
// arange and prints it.
use Tensor;

proc main() {
    // NOTE(review): arange is called with (1,2,3) — presumably start/stop/step
    // or a shape spec; confirm against ndarray.arange's signature in lib.
    writeln(ndarray.arange(1,2,3));
}
12 changes: 12 additions & 0 deletions demos/tinystories/research/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Standalone libtorch reference build (research scratch area; not wired into
# the main ChAI build). Configure with
#   cmake -DCMAKE_PREFIX_PATH=<torch.utils.cmake_prefix_path> ..
# (see checkpath.py for how to obtain that path).
# NOTE(review): VERSION 4.0 is a very recent CMake — confirm this floor is
# intended; libtorch itself typically only needs 3.18+.
cmake_minimum_required(VERSION 4.0)
project(libtorch_example)

find_package(Torch REQUIRED)

add_executable(libtorch_example generate.cpp)

# Include path for nlohmann/json (expects the single-header copy vendored
# under external/json).
target_include_directories(libtorch_example PRIVATE ${PROJECT_SOURCE_DIR}/external/json)

target_link_libraries(libtorch_example "${TORCH_LIBRARIES}")
set_property(TARGET libtorch_example PROPERTY CXX_STANDARD 17)
5 changes: 5 additions & 0 deletions demos/tinystories/research/checkpath.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Print where the installed torch package lives and torch's CMake prefix
# path — the latter is what find_package(Torch) needs (pass it via
# -DCMAKE_PREFIX_PATH when configuring research/CMakeLists.txt).
import torch
print(torch.__path__)
print(torch.utils.cmake_prefix_path)


92 changes: 92 additions & 0 deletions demos/tinystories/research/data/TinyStories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import torch
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import spacy
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import random
random.seed(0)

MAX_LEN = 300
MAX_STORIES = 250000

class Vocabulary:
    """Bidirectional token <-> integer-id mapping built from a text corpus.

    Ids 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and
    <UNK>; corpus tokens get ids starting at 4, in first-seen order.
    """

    # Reserved tokens; their index in this tuple is their id (0-3).
    SPECIALS = ('<PAD>', '<SOS>', '<EOS>', '<UNK>')

    def __init__(self, corpus, tokenizer, min_count=30):
        """Build the vocabulary.

        corpus    : iterable of raw text strings.
        tokenizer : callable mapping a string to an iterable of tokens
                    (e.g. a spacy tokenizer).
        min_count : tokens seen fewer than this many times are dropped and
                    will encode to <UNK>. Default 30 matches prior behavior.
        """
        self.tokenizer = tokenizer
        self.word2idx, self.idx2word = self.build_vocab(corpus, min_count)

    def __len__(self):
        return len(self.word2idx)

    def _normalize(self, text):
        """Tokenize `text` and return stripped, lowercased string tokens."""
        return [str(t).strip().lower() for t in self.tokenizer(text)]

    def text2idx(self, text):
        """Encode `text` to a list of ids; out-of-vocab tokens become <UNK>."""
        unk = self.word2idx['<UNK>']
        # dict.get avoids the double lookup of `t in d.keys()` then `d[t]`.
        return [self.word2idx.get(t, unk) for t in self._normalize(text)]

    def idx2text(self, idxs):
        """Decode a list of ids back to tokens; unknown ids become '<UNK>'."""
        return [self.idx2word.get(i, '<UNK>') for i in idxs]

    def build_vocab(self, corpus, min_count=30):
        """Count tokens over `corpus` and assign ids.

        Returns (word2idx, idx2word). Tokens with frequency >= min_count get
        ids 4..; ids 0-3 are the reserved special tokens.
        """
        cntr = Counter()
        for datapoint in tqdm(corpus):
            cntr.update(self._normalize(datapoint))

        # Drop rare tokens; they will map to <UNK> at encode time.
        tokens = [t for t, c in cntr.items() if c >= min_count]
        word2idx = {t: i + 4 for i, t in enumerate(tokens)}
        idx2word = {i + 4: t for i, t in enumerate(tokens)}

        # Install the reserved special tokens at ids 0-3.
        for idx, tok in enumerate(self.SPECIALS):
            word2idx[tok] = idx
            idx2word[idx] = tok

        return word2idx, idx2word

class TinyStories(Dataset):
    """PyTorch dataset over the roneneldan/TinyStories corpus.

    Each item is a 1-D integer tensor: <SOS> + up to MAX_LEN token ids + <EOS>.
    Padding for batching is handled by the `pad_collate` static method.
    """

    def __init__(self, split="train", vocab=None):
        """Download `split` and sample MAX_STORIES stories.

        vocab : reuse an existing Vocabulary (e.g. for a validation split);
                when None, a fresh one is built with a spacy tokenizer.
        """
        print("Loading data...")
        dataset = load_dataset("roneneldan/TinyStories", split=split)
        self.data = [x["text"] for x in random.sample(list(dataset), MAX_STORIES)]

        # Fix: identity comparison with None (`== None` relies on __eq__).
        if vocab is None:
            print("Building vocab...")
            self.vocab = Vocabulary(self.data, spacy.load('en_core_web_sm').tokenizer)
        else:
            self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return story `idx` numeralized as <SOS> + ids[:MAX_LEN] + <EOS>."""
        x = self.vocab.text2idx(self.data[idx])
        # Slicing already clamps to len(x); the former min(MAX_LEN, len(x))
        # temporary was redundant.
        numeralized = [self.vocab.word2idx['<SOS>']] + x[:MAX_LEN] + [self.vocab.word2idx['<EOS>']]
        return torch.tensor(numeralized)

    @staticmethod
    def pad_collate(batch):
        """Right-pad a list of 1-D tensors into one (batch, max_len) tensor,
        using 0 (the <PAD> id) as the fill value."""
        return pad_sequence(batch, batch_first=True, padding_value=0)

def getTinyStoriesDataloadersAndVocab(batch_size=128):
    """Build the TinyStories training DataLoader and return it with the
    vocabulary that was constructed while loading the split."""
    dataset = TinyStories(split="train")
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=8,
        shuffle=True,
        collate_fn=TinyStories.pad_collate,
        drop_last=True,
    )
    return loader, dataset.vocab


24 changes: 24 additions & 0 deletions demos/tinystories/research/export_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Export a trained TransformerLM checkpoint to a TorchScript archive
# (model.pt) so it can be loaded from C++ via torch::jit::load.
import torch
from models.TransformerLM import *
from data.TinyStories import *
from spacy.tokenizer import Tokenizer

# safely unpickle Vocabulary object
# (torch's weights_only loading requires allow-listing the custom classes
# stored in the checkpoint: our Vocabulary and spacy's Tokenizer)
torch.serialization.add_safe_globals([Vocabulary, Tokenizer])

# load checkpoint on CPU — export does not need a GPU
CHKPT_PATH = "./chkpts/2ZJPbu_TinyStories"
chkpt = torch.load(CHKPT_PATH, map_location=torch.device('cpu'))

# rebuild model from the hyperparameters stored alongside the weights
config = chkpt["config"]
vocab = chkpt["vocab"]
vocab_size = len(vocab)

# NOTE(review): assumes TransformerLM's constructor signature is
# (vocab_size, d_model, n_heads, n_layers) — confirm against models/TransformerLM.
model = TransformerLM(vocab_size, config["d_model"], config["n_heads"], config["n_layers"])
model.load_state_dict(chkpt["model_state_dict"])
# eval() disables dropout etc. so the scripted model is deterministic
model.eval()

# script and save model
scripted_model = torch.jit.script(model)
scripted_model.save("model.pt")
Loading
Loading