-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocess.py
More file actions
101 lines (80 loc) · 3.23 KB
/
preprocess.py
File metadata and controls
101 lines (80 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
from nltk.corpus import stopwords
from nltk import download
download('stopwords')
import re
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import numpy as np
class Preprocessor:
    """Clean raw sentences and embed them with a sentence-transformer model.

    Cleaning always lower-cases the text. Depending on the flags given to
    the constructor it then strips digits, strips punctuation, removes
    English stopwords and rejects sentences that are left (almost) empty.
    Finally, runs of whitespace are collapsed to a single space.
    """

    # Lazily built cache of the English stopword set, shared by every
    # instance. It is filled on first use so that merely defining the class
    # does not read the NLTK corpus.
    _stopword_set = None

    def __init__(self, numbers=True, punct=True, stopwords=True, empty_sentence=True, model='bert-base-nli-mean-tokens'):
        """Store the cleaning flags and load the embedding model.

        Args:
            numbers: If true, ``apply`` removes the digits 0-9.
            punct: If true, ``apply`` removes punctuation.
            stopwords: If true, ``apply`` removes English stopwords.
            empty_sentence: If true, ``apply`` returns ``None`` for
                sentences that are blank or at most two characters long
                after the other cleaning steps.
            model: Name passed to ``SentenceTransformer`` to load the model
                used by ``get_embedding``.
        """
        self.numbers = numbers
        self.punct = punct
        self.stopwords = stopwords
        self.empty_sentence = empty_sentence
        self.model = SentenceTransformer(model)

    def apply(self, sentence):
        """Return the cleaned form of ``sentence``.

        Args:
            sentence: The text to clean.

        Returns:
            The cleaned text, or ``None`` when the empty-sentence filter is
            enabled and the cleaned text is blank or too short.
        """
        sentence = sentence.lower()
        if self.numbers:
            sentence = Preprocessor.remove_numbers(sentence)
        if self.punct:
            sentence = Preprocessor.remove_punct(sentence)
        if self.stopwords:
            sentence = Preprocessor.remove_stopwords(sentence)
        if self.empty_sentence:
            # The boolean flag stored on the instance hides the static
            # method of the same name, so the method has to be reached
            # through the class.
            sentence = Preprocessor.empty_sentence(sentence)
        if sentence is not None:
            sentence = re.sub(r'\s+', ' ', sentence)
        return sentence

    @staticmethod
    def remove_punct(sentence):
        """Return ``sentence`` without characters that are neither word
        characters nor whitespace."""
        return re.sub(r'[^\w\s]', '', sentence)

    @staticmethod
    def remove_numbers(sentence):
        """Return ``sentence`` with every digit 0-9 removed."""
        return re.sub(r'[0-9]', '', sentence)

    @staticmethod
    def remove_stopwords(sentence):
        """Return ``sentence`` with English stopwords removed.

        The sentence is split on whitespace, words whose lower-cased form is
        an NLTK English stopword are dropped, and the remaining words are
        re-joined with single spaces.
        """
        # Build the lookup set once instead of once per word.
        if Preprocessor._stopword_set is None:
            Preprocessor._stopword_set = frozenset(stopwords.words('english'))
        blocked = Preprocessor._stopword_set
        return ' '.join(
            word for word in sentence.split() if word.lower() not in blocked
        )

    @staticmethod
    def empty_sentence(sentence):
        """Return ``sentence`` if it has usable content, otherwise ``None``.

        A sentence is kept when it contains at least one non-whitespace
        token and is longer than two characters.
        """
        # Test the whitespace-split tokens rather than the raw characters,
        # so that strings consisting only of whitespace are rejected no
        # matter how long they are.
        words = sentence.split()
        if words and len(sentence) > 2:
            return sentence
        return None

    def get_embedding(self, sentence):
        """Return the loaded model's encoding of ``sentence``."""
        return self.model.encode(sentence)
# Columns compared jointly when removing labelled rows from the unlabelled set.
KEY_COLUMNS = ['country', 'year', 'doc_id']
# Upper bound on the number of unlabelled rows that are cleaned and embedded.
UNLABELLED_SAMPLE_SIZE = 2000


def _clean_and_embed(frame, preprocessor):
    """Return a cleaned copy of ``frame`` with text and embedding columns added.

    Adds ``text_clean`` (the result of ``preprocessor.apply`` on ``text``),
    drops rows containing missing values, resets the index and adds
    ``embeddings`` (the result of ``preprocessor.get_embedding`` on
    ``text_clean``).
    """
    cleaned = frame.copy()
    cleaned['text_clean'] = cleaned['text'].apply(preprocessor.apply)
    # NOTE(review): this drops rows with a missing value in ANY column, not
    # only rows whose cleaned text came back as None - confirm that is wanted.
    cleaned = cleaned.dropna()
    cleaned = cleaned.reset_index(drop=True)
    cleaned['embeddings'] = cleaned['text_clean'].apply(preprocessor.get_embedding)
    return cleaned


processor = Preprocessor()

# Labelled data
labelled = pd.read_csv('data/labelled_data.csv')
labelled = labelled.drop(columns='Unnamed: 0')

# Unlabelled data
unlabelled = pd.read_csv('data/unlabelled_data.csv')

# Removing labelled rows from the unlabelled dataset.
# The key columns are matched together as one tuple per row. Testing each
# column separately would also discard rows whose country, year and doc_id
# each occur somewhere in the labelled data but never in that combination.
labelled_keys = pd.MultiIndex.from_frame(labelled[KEY_COLUMNS].drop_duplicates())
mask = pd.MultiIndex.from_frame(unlabelled[KEY_COLUMNS]).isin(labelled_keys)
# Filter and drop in one expression so no in-place edit is made on a slice.
unlabelled = unlabelled[~mask].drop(columns='Unnamed: 0')

# Labelled Data
labelled_cleaned = _clean_and_embed(labelled, processor)

# Unlabelled Data
# Clamp the sample size so the script still runs when fewer rows remain.
unlabelled_sample = unlabelled.sample(n=min(UNLABELLED_SAMPLE_SIZE, len(unlabelled)))
unlabelled_cleaned = _clean_and_embed(unlabelled_sample, processor)

# Saving cleaned data and embeddings
labelled_cleaned.to_csv('data/labelled_cleaned.csv')
np.save('data/labelled_embeddings.npy', np.stack(labelled_cleaned['embeddings'].tolist()))
unlabelled_cleaned.to_csv('data/unlabelled_cleaned.csv')
np.save('data/unlabelled_embeddings_2k.npy', np.stack(unlabelled_cleaned['embeddings'].tolist()))