4 changes: 3 additions & 1 deletion .gitignore
@@ -1,2 +1,4 @@
.env
.vscode/
.vscode/
data/python.txt
__pycache__/
3 changes: 2 additions & 1 deletion Pipfile
@@ -12,6 +12,7 @@ pygithub = "*"
autopep8 = "*"
textgenrnn = "*"
tensorflow = "*"
keras = "*"

[requires]
python_version = "3.7"
python_version = "3.6"
53 changes: 21 additions & 32 deletions Pipfile.lock

Some generated files are not rendered by default.

12 changes: 11 additions & 1 deletion README.md
@@ -1,2 +1,12 @@
# PyOctoscraper
A Python based scraper to download Python source code from Github
A Python-based scraper that downloads Python source code from GitHub and trains an RNN to generate source code. I have no hope that the generated code will be useful, or even valid, but it's a fun experiment nonetheless.

## Scraper

I could not find any dataset of source code, so I scraped it myself. The [scraper.py](scraper.py) does the magic. To keep things sane, we're only interested in `keras` code written in `python`, taken from repositories with more than 500 stars. The rationale is that popular repositories are more likely to contain well-written code (not exactly proof, but a close enough approximation).
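
Under the hood this is just a PyGithub repository search plus a walk over each repository's files. A minimal sketch of the idea (the token, the 500-star threshold, and the file walk below are illustrative, not the exact code in this repo):

```python
from github import Github

# Hypothetical sketch: search for popular Python repositories mentioning keras,
# then list the .py files at the root of each result.
g = Github("YOUR_GITHUB_TOKEN")  # a token is assumed; anonymous search is heavily rate-limited
repos = g.search_repositories(
    query='keras stars:>=500 fork:true language:python').get_page(0)

for repo in repos:
    for item in repo.get_contents(""):
        if item.type == "file" and item.path.endswith(".py"):
            print(repo.full_name, item.path)
```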

## Generation

A super big shoutout to [Max Woolf](http://minimaxir.com/) for creating the [textgenrnn](https://github.com/minimaxir/textgenrnn) package. It made my life considerably easier and I highly recommend it for quick and dirty projects.
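
For reference, training and sampling with textgenrnn is roughly this short. A minimal sketch, assuming the scraped corpus lives at `data/python.txt` (the path ignored in `.gitignore`) and with an arbitrary epoch count:

```python
from textgenrnn import textgenrnn

textgen = textgenrnn()
# Train on the scraped corpus; the path and epoch count are placeholders.
textgen.train_from_file('data/python.txt', num_epochs=10)
# Sample a few lines of "code" from the trained model.
textgen.generate(5)
```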

## Samples
2 changes: 1 addition & 1 deletion Scraper/octoscrape.py
@@ -15,7 +15,7 @@ def __init__(self, page=0):

def search_repos(self):
return self.g.search_repositories(
query='keras stars:>=500 fork:true language:python').get_page(self.page)
query='keras stars:>=1000 fork:true language:python').get_page(self.page)

def get_contents(self, repo, file_extension):
try:
Empty file added Trainer/__init__.py
80 changes: 80 additions & 0 deletions Trainer/trainer.py
@@ -0,0 +1,80 @@
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dropout, LSTM, Dense, Bidirectional, Activation, Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping

import numpy as np


class Trainer:
def __init__(self, textfile, seq_length=50, batch_size=32):
self.textfile = textfile
self.seq_length = seq_length
self.BATCH_SIZE = batch_size

self._process_content()
self._build_model()
self._setup_checkpoints()

def _setup_checkpoints(self):
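        # Keep the best weights (by training accuracy) in model.hdf5 and stop early if accuracy stalls for 20 epochs.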
model_checkpoint = ModelCheckpoint(
'model.hdf5', monitor='acc', save_best_only=True)
earlystopping_checkpoint = EarlyStopping(
monitor='acc', patience=20)
self._checkpoints = [model_checkpoint, earlystopping_checkpoint]

def _build_model(self):
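        # Word-level language model: learned embeddings -> bidirectional LSTM -> softmax over the vocabulary.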
model = Sequential()
model.add(Embedding(input_dim=len(self._words), output_dim=1024))

model.add(Bidirectional(
LSTM(128), input_shape=(self.seq_length, len(self._words))))

model.add(Dropout(0.5))
model.add(Dense(len(self._words)))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer="adam", metrics=['accuracy'])

self._model = model

def _process_content(self):
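        # Split the corpus into whitespace-separated word tokens, keeping newlines as tokens of their own.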
        with open(self.textfile, 'r') as f:
            filecontents = f.read()
filecontents = filecontents.replace('\n', ' \n ')

text_in_words = [w for w in filecontents.split(
' ') if w.strip() != '' or w == '\n']
self._words = set(text_in_words)

self._word_indices = dict((c, i) for i, c in enumerate(self._words))
self._indices_word = dict((i, c) for i, c in enumerate(self._words))

STEP = 1
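        # Slide a window of seq_length words over the corpus; the word right after each window is the target.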
self._codelines = []
self._next_words = []
for i in range(0, len(text_in_words) - self.seq_length, STEP):
# print(text_in_words[i: i + self.seq_length])
self._codelines.append(text_in_words[i: i + self.seq_length])
self._next_words.append(text_in_words[i + self.seq_length])

def _generator(self, sentence_list, next_word_list, batch_size):
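        # Yield endless batches: x holds the word indices of each input sequence, y the index of the word that follows it.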
index = 0
while True:
x = np.zeros((batch_size, self.seq_length), dtype=np.int32)
y = np.zeros((batch_size), dtype=np.int32)
for i in range(batch_size):
for t, w in enumerate(sentence_list[index % len(sentence_list)]):
x[i, t] = self._word_indices[w]
y[i] = self._word_indices[next_word_list[index %
len(sentence_list)]]
index = index + 1
yield x, y

def train(self):
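        # Fit from the generator; steps_per_epoch is sized so each epoch covers the corpus roughly once.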
self._model.fit_generator(self._generator(self._codelines, self._next_words, self.BATCH_SIZE),
steps_per_epoch=int(
len(self._codelines)/self.BATCH_SIZE) + 1,
epochs=100,
callbacks=self._checkpoints)
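
Assuming the scraper has already written the corpus to `data/python.txt`, a minimal driver for this class might look like the following sketch (paths and hyperparameters are illustrative):

```python
from Trainer.trainer import Trainer

# Hypothetical usage; not part of this diff.
trainer = Trainer('data/python.txt', seq_length=50, batch_size=32)
trainer.train()  # checkpoints the best weights to model.hdf5 as accuracy improves
```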