-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextCleaning.py
More file actions
53 lines (37 loc) · 1.34 KB
/
textCleaning.py
File metadata and controls
53 lines (37 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import re
import contractions
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import unittest
# Shared lemmatizer, built once at import time and reused by every call.
lemm = WordNetLemmatizer()

# Patterns are compiled once at import time instead of on every call.
# An HTML/XML tag: "<" or "</", a letter, then anything up to the next ">".
# Requiring a leading letter keeps comparisons such as "a < b" intact.
_HTML_TAG_RE = re.compile(r"</?[a-z][^<>]*>")
_DIGITS_RE = re.compile(r"\d+")
# The comma is listed on purpose: the previous class "[@,$,&,/]" removed
# commas as well, and that behaviour is preserved.
_SYMBOLS_RE = re.compile(r"[@,$&/]")
# A character class (NOT a tag pattern): drops any stray "<", ".", "*",
# "?" or ">" characters, exactly as before.
_MARKUP_CHARS_RE = re.compile(r"[<.*?>]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Normalise a piece of raw text into lower-case, lemmatized tokens.

    The steps, in order:

    1. NFKD-normalise and drop combining marks (strips accents).
    2. Lower-case.
    3. Remove HTML-style tags such as ``<br />`` or ``</p>``.
    4. Expand contractions with ``contractions.fix``.
    5. Replace digits, the symbols ``@ , $ & /`` and the characters
       ``< . * ? >`` with spaces.
    6. Collapse runs of whitespace.
    7. Tokenize with ``word_tokenize`` and lemmatize each token.

    Args:
        text: The raw input string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Normalise first so that characters produced by compatibility
    # decomposition (e.g. superscript digits, full-width symbols) are seen
    # by the digit/symbol filters below, and so the lemmatizer receives
    # unaccented words.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(c for c in text if not unicodedata.combining(c))

    text = text.lower()

    # Strip whole tags before "/" and the bracket characters are removed
    # individually; otherwise the tag name (e.g. "br") leaks into the output.
    text = _HTML_TAG_RE.sub(" ", text)

    text = contractions.fix(text)
    text = _DIGITS_RE.sub(" ", text)
    text = _SYMBOLS_RE.sub(" ", text)
    text = _MARKUP_CHARS_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text)

    tokens = word_tokenize(text)
    return " ".join(lemm.lemmatize(token) for token in tokens)
class TestCleanText(unittest.TestCase):
    """Unit tests for clean_text()."""

    def test_lowercase(self):
        """Upper-case input is lower-cased."""
        self.assertEqual(clean_text("HELLO"), "hello")

    def test_contraction(self):
        """A contraction is expanded into its component words."""
        result = clean_text("don't")
        self.assertIn("do", result)
        self.assertIn("not", result)

    def test_numbers_removed(self):
        """Digits are removed from the text."""
        self.assertEqual(clean_text("movie123"), "movie")

    def test_symbols_removed(self):
        """'@' and '&' are replaced by word separators."""
        self.assertEqual(clean_text("hi@you&me"), "hi you me")

    def test_slashes_removed(self):
        """'/' is replaced by a word separator."""
        self.assertEqual(clean_text("good/bad"), "good bad")

    def test_spacing(self):
        """Runs of whitespace collapse to one space and edges are trimmed."""
        # A single-space input would already equal the expected output and
        # pass without any whitespace handling, so use leading, trailing and
        # mixed runs of whitespace to actually exercise the behaviour.
        self.assertEqual(clean_text("  hello \t  world\n"), "hello world")
# Run the unit tests when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()