Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def run1(args, src_name, num_runs):
runs = [
[['java', '-jar', './bin/freq01scala.jar'], 'freq01.scala', 3],
[['python', './src/freq01.py'], 'freq01.py', 3],
[['python', './src/freq02.py'], 'freq02.py', 3],
[['./bin/freq03cpp' + EXE], 'freq03.cpp'],
[['./bin/freq02cpp' + EXE], 'freq02.cpp'],
[['./bin/freq01cpp' + EXE], 'freq01.cpp'],
Expand Down
69 changes: 69 additions & 0 deletions src/freq02.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import string
import sys
from collections import Counter

from itertools import chain


# Number of bytes requested per read of the input file (20 MiB).
CHUNK_SIZE = 20 * 1024 * 1024


# All 256 byte values with the ASCII letters removed, in ascending order.
non_alpha = bytes(range(256)).translate(None, string.ascii_letters.encode())

# 256-entry byte translation table: ASCII uppercase letters map to lowercase,
# every non-letter byte maps to a space, lowercase letters map to themselves.
translation_table = bytes.maketrans(
    non_alpha + string.ascii_uppercase.encode(),
    b' ' * len(non_alpha) + string.ascii_lowercase.encode(),
)

# One-element list used as a mutable cell: it holds the trailing (possibly
# incomplete) word of the previously processed chunk of data.
tail = [b'']


def get_words_with_empty_strings(line):
    """Split one chunk of bytes into lowercase words (empty strings included).

    The chunk is passed through ``translation_table`` and then split on single
    spaces, so runs of adjacent separators produce empty byte strings in the
    result.

    A word may be cut in half by a chunk boundary, so:

    * the first piece is prefixed with ``tail[0]``, the piece left over from
      the previous call;
    * the last piece is stored in ``tail[0]`` for the next call, and an empty
      byte string is returned in its place.
    """
    pieces = line.translate(translation_table).split(b' ')
    # Must happen before the pop: with a single piece, first and last coincide.
    pieces[0] = tail[0] + pieces[0]
    tail[0] = pieces.pop()
    pieces.append(b'')
    return pieces


def sort_key(x):
    """Return the sort key for a ``(word, count)`` pair.

    The count is negated so that larger counts sort first; the word itself
    breaks ties in ascending order.
    """
    token, occurrences = x
    return (-occurrences, token)


def main(in_file_name, out_file_name):
    """Count word frequencies in ``in_file_name`` and write them to ``out_file_name``.

    The input is read in binary mode, ``CHUNK_SIZE`` bytes at a time. Each chunk
    is split by ``get_words_with_empty_strings``, which parks the last piece of
    a chunk in the module-level ``tail`` so that a word cut by a chunk boundary
    is re-joined with the start of the next chunk.

    The output contains one ``<count> <word>`` row per distinct word, ordered by
    ``sort_key``; rows are joined with CRLF and a final CRLF is appended.

    Args:
        in_file_name: Path of the file to read.
        out_file_name: Path of the file to write (created or truncated).
    """
    # Start from a clean carry-over so that a previous call cannot leak its
    # last word into the first word of this file.
    tail[0] = b''

    with open(in_file_name, 'rb') as in_file:
        def read_chunk():
            return in_file.read(CHUNK_SIZE)

        words_and_spaces = chain.from_iterable(
            get_words_with_empty_strings(chunk) for chunk in iter(read_chunk, b'')
        )

        # Ignore empty strings. The chunks are read lazily, so the counter has
        # to be built while the file is still open.
        word_counts = Counter(filter(None, words_and_spaces))

    # When the file does not end with a separator, its final word is still
    # parked in ``tail``. It is only known once every chunk has been consumed,
    # so it must be read here rather than when the word iterator is created.
    last_word = tail[0]
    if last_word:
        word_counts[last_word] += 1
    tail[0] = b''

    counts = sorted(word_counts.items(), key=sort_key)

    rows = (
        b' '.join((str(count).encode(), word))
        for word, count in counts
    )

    result = b'\r\n'.join(rows) + b'\r\n'

    with open(out_file_name, 'wb') as out_file:
        out_file.write(result)


# Script entry point: expects exactly two arguments, the input file path and
# the output file path.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('No args')
        # ``sys.exit`` instead of the ``site``-provided ``exit`` helper, which
        # is meant for interactive use and is absent under ``python -S``.
        sys.exit(1)

    main(sys.argv[1], sys.argv[2])