-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathraw_tweets.py
More file actions
112 lines (98 loc) · 4.77 KB
/
raw_tweets.py
File metadata and controls
112 lines (98 loc) · 4.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pymssql
import sys
import os
from itertools import groupby
from operator import itemgetter
import sqlQueries
import textCleanUp
import fileFunctions
import config
import inputManagment
import spacy
from spacy import en
# Load the spaCy English model once at import time.
# NOTE(review): `nlp` is only referenced by the commented-out
# searchForKeywordCombos call in textGen below — presumably kept for
# re-enabling that filter; confirm before removing.
nlp = spacy.load("en")
# This function works in a similar manner to test.py; however, instead of
# processing the tweet texts, it generates files with the raw tweet bodies.
def connect():
    """Search the database for each filter keyword and dump the matching
    raw tweet bodies to text files via textGen().

    Keywords are read from the keyword file; a keyword containing '+' is
    split into a word group that is searched together.  Only the Scotland
    dataset is processed — the England branch was disabled (previously
    present as commented-out code mirroring each Scotland call).
    """
    cursor = sqlQueries.connectionToDatabase()
    filterKeywords = fileFunctions.readKeywordFile()
    searchQuery = config.searchStringForSqlQuery()
    locationSc = "Scotland"

    print("Generation of text files of raw tweets for summarisation")
    print ("--- --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- ------ --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- ---")
    print (" ")

    for word in filterKeywords:
        # A '+' marks a multi-word group: all parts are searched together.
        if "+" in word:
            wordList = word.split("+")
            print (wordList)
            resultSc = inputManagment.searchForGroup(cursor, wordList, searchQuery, locationSc)
        else:
            resultSc = inputManagment.searchForKeyword(cursor, word, searchQuery, locationSc)

        if len(resultSc) > 0:
            # Drop duplicates/retweets before writing the raw bodies out.
            rowSc = textCleanUp.removeDupsAndRetweets(resultSc, locationSc)
            print(word)
            textGen(rowSc, locationSc, word)
        print("+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-")
def textGen(sortedRowS, location, word):
    """Write the raw tweet texts in `sortedRowS` to numbered chunk files.

    Each tweet body (assumed to be at index 2 of its row — TODO confirm
    against the DB schema) is lower-cased, has newlines replaced with
    spaces, is echoed to stdout, and is buffered.  Once roughly 500 words
    have accumulated (or the input is exhausted) the buffer is flushed to
    <rawTweetsPath>/<word>/<i>_texts_<location>.txt, prefixed with a
    "<d_i> lineCount" header line.

    Args:
        sortedRowS: sequence of DB rows; row[2] holds the tweet text.
        location:   location label embedded in each output filename.
        word:       keyword used as the output subdirectory name.
    """
    directory = config.rawTweetsPath() + word
    # Create the per-keyword output directory if it does not exist yet.
    if not os.path.exists(directory):
        os.makedirs(directory)

    total = len(sortedRowS)
    wordCount = 0
    fileIndex = 1
    processed = 0
    numberOfLines = 0
    tweetText = []

    for tweet in sortedRowS:
        text = tweet[2].lower()
        # Hook for keyword-combo filtering, currently disabled (always passes):
        # check = textCleanUp.searchForKeywordCombos(filterKeywords, text, filterWords, nlp)
        processed += 1
        numberOfLines += 1
        stripped = text.replace("\n", " ")
        wordCount += len(text.split(" "))
        print(stripped)
        tweetText.append(stripped)

        # Flush a chunk once ~500 words are buffered, or on the last tweet.
        if wordCount >= 500 or processed == total:
            wordCount = 0
            # Context manager guarantees the file is closed even on error
            # (the original opened/closed manually, leaking on exceptions).
            with open("%s/%s_texts_%s.txt" % (directory, fileIndex, location),
                      "w", encoding="utf-8") as out:
                out.write("<d_%s> %s\n" % (fileIndex, numberOfLines))
                for line in tweetText:
                    out.write("%s\n" % line)
            tweetText.clear()
            fileIndex += 1
            numberOfLines = 0
# Run the pipeline only when executed as a script; importing this module
# (e.g. to reuse textGen) no longer triggers the full database run.
if __name__ == "__main__":
    connect()