-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathMyTokenizer.py
More file actions
27 lines (20 loc) · 805 Bytes
/
MyTokenizer.py
File metadata and controls
27 lines (20 loc) · 805 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re


def count_paragraphs(text):
    """Count paragraphs, approximated as the number of blank-line
    ("\\n\\n") separators in *text*."""
    return text.count('\n\n')


def count_words(text):
    """Count whitespace-separated words in *text*, excluding tokens that
    are purely a single punctuation mark (e.g. a free-standing '!')."""
    words = text.split()
    # Tokens that are exactly one of these are punctuation, not words.
    punctuation = {'!', '"', ')', '(', ',', '.', '-', '?', ':', ';'}
    extra = sum(1 for w in words if w in punctuation)
    return len(words) - extra


def count_sentences(text):
    """Heuristically count sentences in *text*.

    A sentence end is a word followed by '.', '!' or '?' and a closing
    quote/paren/whitespace; matches that look like two-letter
    abbreviations ("Dr. ") or a period followed by a lowercase letter
    are subtracted, since those are unlikely to be real sentence ends.

    NOTE: the original character classes contained '|' separators, which
    inside [...] are literal pipe characters; they are removed here.
    """
    ends = re.findall(r'[a-zA-Z0-9)]+[.!?]["\u201d\s)][A-Z]*', text)
    abbreviations = re.findall(r'[A-Z][a-z]\.\s', text)
    midword = re.findall(r'[A-Za-z0-9]\.\s+[a-z]', text)
    return len(ends) - len(abbreviations) - len(midword)


def main():
    """Prompt for a file name and print its paragraph/sentence/word counts."""
    name = input("File name ?:")
    # Context manager guarantees the file is closed even on error.
    with open(name, "r", encoding="utf-8") as handle:
        data = handle.read()
    print('Number of Paragraphs = %d' % count_paragraphs(data))  # ASSUMPTION
    print('Number of Sentences = %d' % count_sentences(data))
    print('Number of Words = %d' % count_words(data))


if __name__ == "__main__":
    main()