# pickle: load the trained model and vectoriser from disk
import pickle
# re: regular expressions for URL and number cleaning
import re
# string: punctuation characters to strip
from string import punctuation
# nltk: POS tagging, stop words, lemmatisation, and tweet tokenisation
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
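
# NOTE (assumption): the NLTK data packages used below must already be
# installed locally; if they are missing, a one-time download along these
# lines is needed before running this script:
#   import nltk
#   nltk.download('stopwords')                    # for stopwords.words
#   nltk.download('averaged_perceptron_tagger')   # for pos_tag
#   nltk.download('wordnet')                      # for WordNetLemmatizer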
# Load the trained logistic-regression model and the fitted vectoriser.
with open('./ML model building/LRmodel.pickle', 'rb') as file:
    info = pickle.load(file)

with open('ML model building/vectoriser.pickle', 'rb') as file2:
    info2 = pickle.load(file2)
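# NOTE: pickle.load executes whatever the pickle file tells it to, so these
# model files should only come from a trusted source.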
# Example input to classify.
data = 'i hate driving with no shoes on'


#####
def get_text(data):
    # 5.1 LOWERCASE
    data = data.lower()

    # 5.2 REMOVE URLS
    def cleaning_URLs(data):
        return re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', '', data)
    data = cleaning_URLs(data)

    # 5.3 REMOVE STOP WORDS
    stopwords1 = stopwords.words('english')

    def cleaning_stopwords(data):
        return " ".join([word for word in str(data).split() if word not in stopwords1])
    data = cleaning_stopwords(data)

    # 5.4 CLEAN AND REMOVE PUNCTUATION
    punctuations_list = punctuation

    def cleaning_punctuations(data):
        translator = str.maketrans('', '', punctuations_list)
        return data.translate(translator)
    data = cleaning_punctuations(data)

    # 5.5 CLEANING NUMBERS
    def cleaning_numbers(data):
        return re.sub(r'[0-9]+', '', data)
    data = cleaning_numbers(data)

    # 5.6 TOKENIZATION OF TWEET TEXT
    tweet_tokenizer = TweetTokenizer(
        preserve_case=True,
        reduce_len=False,
        strip_handles=False)
    tokens = tweet_tokenizer.tokenize(data)

    # 5.7 POS-TAG AND LEMMATIZE
    tags = pos_tag(tokens)

    def _tag2type(tag):
        # Map Penn Treebank tags onto the WordNet POS codes the lemmatizer expects.
        if tag.startswith('NN'):
            return 'n'
        elif tag.startswith('VB'):
            return 'v'
        else:
            return 'a'

    lemmatizer = WordNetLemmatizer()
    data = [lemmatizer.lemmatize(t[0], _tag2type(t[1])) for t in tags]
    return ' '.join(data)


# Clean the example text, vectorise it, and print the predicted label.
data = get_text(data)
data = info2.transform([data])
print(info.predict(data))
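
# Hypothetical usage sketch (the example tweets below are illustrative and
# not part of the original script): the same pipeline can score a batch of
# texts by cleaning each one and vectorising them together.
#   tweets = ['great day at the beach', 'worst customer service ever']
#   cleaned = [get_text(t) for t in tweets]
#   print(info.predict(info2.transform(cleaned)))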