unsupervised-feature-selection.py
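# Unsupervised feature selection for word clustering: word vectors are built either
# from a PCA-reduced co-occurrence matrix ("pca") or from Word2Vec embeddings
# ("embeddings"), and the resulting vectors are grouped into CLUSTERS_NUMBER
# clusters with k-means.
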
import os
import glob
import sys
import errno
import codecs
import nltk
from nltk.corpus import stopwords
from collections import Counter
from scipy.sparse import coo_matrix
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import numpy
CORPUS_FILE = 'resources/LaVanguardia.txt' # Text to be processed
CLUSTERS_NUMBER = 40 # Number of clusters of words
MIN_FREQUENCY = 10 # Min word frequency to be considered
WINDOWS_SIZE = 2 # Window size used to determine the contexts
MAX_SENTENCES = 30000 # Maximum number of sentences used for the PCA method

def readFile():
    # Read the corpus file CORPUS_FILE and return its content as a string.
    print("Loading file", CORPUS_FILE)
    f = codecs.open(CORPUS_FILE, 'r', 'latin1')
    content = f.read()
    f.close()
    return content

def tokenize(text):
    # Tokenize and normalize the given text.
    sents = nltk.sent_tokenize(text)
    tokenized_sents = [nltk.word_tokenize(sent) for sent in sents]
    tokenized_sents = [process_tokens(sent) for sent in tokenized_sents]
    return tokenized_sents

def process_tokens(tokens):
    # Process the given list of tokens.
    tokens = [token.lower() for token in tokens]  # All tokens to lowercase
    words = [token for token in tokens if token.isalpha()]  # Keep only alphabetic tokens
    words = [token for token in words if token not in stopwords.words('spanish')]  # Remove stopwords
    wnl = nltk.WordNetLemmatizer()
    lemmatized = [wnl.lemmatize(t) for t in words]  # Lemmatization
    return lemmatized

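# Note: the WordNet lemmatizer used in process_tokens is built on the English
# WordNet, so Spanish tokens it does not recognise are generally returned
# unchanged; stopword removal, however, does use NLTK's Spanish stopword list.
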
def gen_vectors(normalized_text):
    # Generate word vectors using neural word embeddings.
    print("\nGenerating word vectors")
    model = Word2Vec(normalized_text, size=100, window=5, min_count=5)
    vects = []
    for word in model.wv.vocab:
        vects.append(model.wv[word])
    matrix = numpy.array(vects)
    print("Matrix shape:", matrix.shape)
    print("Vectors generated")
    return model.wv.vocab, matrix

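# Note: gen_vectors assumes the gensim 3.x API; in gensim 4.x the `size`
# parameter is renamed to `vector_size` and `model.wv.vocab` is replaced by
# `model.wv.key_to_index`.
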
def frequent_words(text):
    # Return the words that appear at least MIN_FREQUENCY times.
    print("\nGetting most frequent words")
    words = nltk.word_tokenize(text)
    words = [token.lower() for token in words]
    wnl = nltk.WordNetLemmatizer()
    words = [wnl.lemmatize(t) for t in words]  # Lemmatization
    most_frequents = []
    counter = Counter(words)
    for w in counter:
        if counter[w] >= MIN_FREQUENCY:
            most_frequents.append(w)
    print("Most frequent words calculated. Total:", str(len(most_frequents)))
    return most_frequents

def create_cooccurrence_matrix(sentences, frequent_words):
    # Create the co-occurrence matrix. Only create columns for the words in frequent_words.
    print("\nCreating co-occurrence matrix")
    set_all_words = {}
    set_freq_words = {}
    data = []
    row = []
    col = []
    for sentence in sentences:
        tokens = sentence
        for pos, token in enumerate(tokens):
            i = set_all_words.setdefault(token, len(set_all_words))
            start = max(0, pos - WINDOWS_SIZE)
            end = min(len(tokens), pos + WINDOWS_SIZE + 1)
            for pos2 in range(start, end):
                if pos2 == pos or tokens[pos2] not in frequent_words:
                    continue
                j = set_freq_words.setdefault(tokens[pos2], len(set_freq_words))
                data.append(1.)
                row.append(i)
                col.append(j)
    cooccurrence_matrix = coo_matrix((data, (row, col)))
    print("Vocabulary size:", len(set_all_words))
    print("Matrix shape:", cooccurrence_matrix.shape)
    print("Co-occurrence matrix finished")
    return set_all_words, set_freq_words, cooccurrence_matrix

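# Rows of the co-occurrence matrix are indexed by every word seen in the corpus
# (set_all_words) and columns by the frequent context words (set_freq_words);
# entry (i, j) accumulates how often frequent word j appears within WINDOWS_SIZE
# tokens of word i. For example (illustrative), with WINDOWS_SIZE = 2 the sentence
# ["perro", "ladra", "fuerte"] adds 1 to ("perro", "ladra") if "ladra" is frequent.
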
def unsupervised_fs_pca(vectors):
    # Reduce the dimensionality of the co-occurrence vectors with PCA.
    print("PCA reduction. Original shape:", vectors.shape)
    pca = PCA(n_components=1000)
    vectors = preprocessing.normalize(vectors)
    pca.fit(vectors.todense())
    new_vectors = pca.transform(vectors.todense())
    print("Finished feature selection. Shape:", new_vectors.shape)
    return new_vectors

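# Note: scikit-learn requires n_components <= min(n_samples, n_features), so
# PCA(n_components=1000) only succeeds when the co-occurrence matrix has at least
# 1000 rows and columns; lower the value for smaller corpora.
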
def gen_clusters(vectors):
    # Generate word clusters using the k-means algorithm.
    print("\nClustering started")
    vectors = preprocessing.normalize(vectors)
    km_model = KMeans(n_clusters=CLUSTERS_NUMBER)
    km_model.fit(vectors)
    print("Clustering finished")
    return km_model

def show_results(vocabulary, model):
    # Show the size of every cluster and the words it contains.
    c = Counter(sorted(model.labels_))
    print("\nTotal clusters:", len(c))
    for cluster in c:
        print("Cluster#", cluster, " - Total words:", c[cluster])
    keysVocab = list(vocabulary.keys())
    for n in range(len(c)):
        print("Cluster %d" % n)
        print("Words:", end='')
        word_indices = [i for i, x in enumerate(list(model.labels_)) if x == n]
        for i in word_indices:
            print(' %s' % keysVocab[i], end=',')
        print()
        print()
    print()

if __name__ == "__main__":
    file_content = readFile()  # Read the CORPUS_FILE
    normalized = tokenize(file_content)
    vocabulary = {}
    vectors = []
    unsup_method = sys.argv[1]  # Either "pca" or "embeddings"
    if unsup_method == "pca":
        normalized = normalized[:MAX_SENTENCES]
        freq_words = frequent_words(file_content)  # Get the most frequent words
        vocabulary, features, vectors = create_cooccurrence_matrix(normalized, freq_words)
        vectors = unsupervised_fs_pca(vectors)
    if unsup_method == "embeddings":
        vocabulary, vectors = gen_vectors(normalized)
    km_model = gen_clusters(vectors)  # Generate clusters
    show_results(vocabulary, km_model)
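
# Usage (a sketch, assuming CORPUS_FILE exists and the NLTK punkt, stopwords and
# wordnet data have been downloaded):
#   python unsupervised-feature-selection.py pca          # co-occurrence matrix + PCA
#   python unsupervised-feature-selection.py embeddings   # Word2Vec embeddings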