-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexercise6.py
More file actions
135 lines (103 loc) · 4.55 KB
/
exercise6.py
File metadata and controls
135 lines (103 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def gaussianKernel(x1, x2, sigma):
    """
    Compute the Gaussian (RBF) kernel between two vectors.

    The similarity is ``exp(-||x1 - x2||^2 / (2 * sigma^2))``.

    Parameters
    ----------
    x1 : array_like
        First vector. Lists, 1-D arrays, row vectors and column vectors are
        all accepted; the input is flattened before use.
    x2 : array_like
        Second vector, with the same number of elements as `x1`.
    sigma : float
        Bandwidth of the kernel.

    Returns
    -------
    sim : float
        The kernel value. It equals 1 when `x1` and `x2` are identical and
        decays towards 0 as they move apart.
    """
    # Flatten both inputs so that e.g. an (n, 1) column and an (n,) vector are
    # compared element by element instead of being broadcast to an (n, n)
    # matrix, which would silently inflate the squared distance.
    x1 = np.asarray(x1, dtype=float).ravel()
    x2 = np.asarray(x2, dtype=float).ravel()

    squared_distance = np.sum(np.square(x1 - x2))
    return np.exp(-squared_distance / (2 * (sigma ** 2)))
def dataset3Params(X, y, Xval, yval):
    """
    Pick the (C, sigma) pair with the lowest validation error by grid search.

    Every combination of the candidate values is used to train a model with
    ``utils.svmTrain`` (using ``gaussianKernel`` with the candidate sigma) on
    ``(X, y)``. Each model is scored by the fraction of its
    ``utils.svmPredict`` predictions on `Xval` that differ from `yval`.

    Parameters
    ----------
    X, y :
        Training inputs and labels, passed through to ``utils.svmTrain``.
    Xval, yval :
        Validation inputs and labels used to score each trained model.

    Returns
    -------
    C, sigma :
        The candidate values (NumPy scalars) whose model had the smallest
        validation error. On ties the first pair in row-major order
        (C outer, sigma inner) is returned.
    """
    # The same eight values are tried for both hyper-parameters.
    C_values = np.array([0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30])
    sigma_values = np.array([0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30])

    # val_error[row, col] is the misclassification rate for
    # (C_values[row], sigma_values[col]).
    val_error = np.zeros((C_values.size, sigma_values.size))

    for row, C_try in enumerate(C_values):
        for col, sigma_try in enumerate(sigma_values):
            model = utils.svmTrain(X, y, C_try, gaussianKernel, args=(sigma_try,))
            predicted = utils.svmPredict(model, Xval)
            val_error[row, col] = np.mean(predicted != yval)

    # argmin works on the flattened grid; convert back to (row, col).
    best_row, best_col = np.unravel_index(np.argmin(val_error), val_error.shape)
    return C_values[best_row], sigma_values[best_col]
def processEmail(email_contents, verbose=True):
    """
    Preprocess the body of an email and return the vocabulary indices of its words.

    The text is lower-cased, HTML tags are removed, and numbers, URLs, email
    addresses and dollar signs are replaced by the placeholder words
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split on whitespace and punctuation, each token is stripped of any
    remaining non-alphanumeric characters and stemmed with
    ``utils.PorterStemmer``, and every stemmed word that appears in the
    vocabulary returned by ``utils.getVocabList()`` contributes its index.

    Parameters
    ----------
    email_contents : str
        A string containing one email.
    verbose : bool
        If True, print the resulting email after processing.

    Returns
    -------
    word_indices : list
        A list of integers containing the index of each word in the
        email which is also present in the vocabulary, in order of
        appearance. For example, if ``vocabList[18] == 'action'`` and the
        email contains a word that stems to ``'action'``, then 18 is
        appended.

    Notes
    -----
    - A word that occurs several times in the email is reported once per
      occurrence.
    - If a word were listed more than once in the vocabulary, the index of
      its first occurrence is used (the same result ``vocabList.index(word)``
      would give).
    - If you are working with raw emails that include the full headers, drop
      everything before the first blank line (``'\\n\\n'``) prior to calling
      this function.
    """
    # Load Vocabulary
    vocabList = utils.getVocabList()

    # Build a word -> index table once so each lookup is O(1) instead of two
    # linear scans (`in` followed by `.index`) per word. setdefault keeps the
    # first index of a repeated entry, matching list.index().
    vocab_index = {}
    for idx, vocab_word in enumerate(vocabList):
        vocab_index.setdefault(vocab_word, idx)

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)

    # Handle URLs: strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Split on whitespace and punctuation. The hyphen is escaped so it is a
    # literal delimiter rather than forming the range '.'-':' and the
    # apostrophe is written explicitly inside a triple-quoted raw string so it
    # is really part of the character class.
    tokens = re.split(r'''[ @$/#.\-:&*+=\[\]?!(){},'">_<;%\n\r]''', email_contents)

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    processed_email = []
    for token in tokens:
        # Remove any remaining non alphanumeric characters in the token
        word = non_alphanumeric.sub('', token)
        if word:
            word = stemmer.stem(word)
        if not word:
            # Nothing left of this token; skip it so the printed email does
            # not contain runs of blank words.
            continue

        processed_email.append(word)

        idx = vocab_index.get(word)
        if idx is not None:
            word_indices.append(idx)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return word_indices
def emailFeatures(word_indices, n=1899):
    """
    Convert a list of vocabulary indices into a binary feature vector.

    Parameters
    ----------
    word_indices : array_like of int
        Indices of the vocabulary words present in an email. Repeated
        indices are allowed and have no additional effect.
    n : int, optional
        Length of the feature vector, i.e. the number of words in the
        vocabulary. Defaults to 1899.

    Returns
    -------
    x : numpy.ndarray
        A float vector of shape ``(n,)`` where ``x[i]`` is 1 if ``i`` is in
        `word_indices` and 0 otherwise.

    Raises
    ------
    IndexError
        If any index falls outside the vector.
    """
    x = np.zeros(n)
    # Coerce to an integer index array so that an empty list/tuple is a
    # well-defined "select nothing" index.
    present = np.asarray(word_indices, dtype=np.intp)
    x[present] = 1
    return x