-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathraw_tweets.py
More file actions
112 lines (98 loc) · 4.77 KB
/
raw_tweets.py
File metadata and controls
112 lines (98 loc) · 4.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pymssql
import sys
import os
from itertools import groupby
from operator import itemgetter
import sqlQueries
import textCleanUp
import fileFunctions
import config
import inputManagment
import spacy
from spacy import en
# Load the spaCy English model once at import time.
# NOTE(review): `nlp` is only referenced by the commented-out
# searchForKeywordCombos call in textGen below — presumably kept for
# re-enabling that filter; confirm before removing.
nlp = spacy.load("en")
# This function works in a similar manner to test.py; however, instead of
# processing the tweet texts, it generates files with the raw tweet bodies.
def connect():
    """Search the database for each filter keyword and dump the matching
    raw tweet bodies to text files via textGen().

    Keywords are read from the keyword file; a keyword containing '+' is
    split into a word group that is searched together.  Only the Scotland
    dataset is processed — the England branch was disabled (previously
    present as commented-out code mirroring each Scotland call).
    """
    cursor = sqlQueries.connectionToDatabase()
    filterKeywords = fileFunctions.readKeywordFile()
    searchQuery = config.searchStringForSqlQuery()
    locationSc = "Scotland"

    print("Generation of text files of raw tweets for summarisation")
    print ("--- --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- ------ --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- --- --- --- --- ------ --- --- --- --- --- ---")
    print (" ")

    for word in filterKeywords:
        # A '+' marks a multi-word group: all parts are searched together.
        if "+" in word:
            wordList = word.split("+")
            print (wordList)
            resultSc = inputManagment.searchForGroup(cursor, wordList, searchQuery, locationSc)
        else:
            resultSc = inputManagment.searchForKeyword(cursor, word, searchQuery, locationSc)

        if len(resultSc) > 0:
            # Drop duplicates/retweets before writing the raw bodies out.
            rowSc = textCleanUp.removeDupsAndRetweets(resultSc, locationSc)
            print(word)
            textGen(rowSc, locationSc, word)
        print("+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-")
def textGen(sortedRowS, location, word):
    """Write the raw tweet texts in `sortedRowS` to numbered chunk files.

    Each tweet body (assumed to be at index 2 of its row — TODO confirm
    against the DB schema) is lower-cased, has newlines replaced with
    spaces, is echoed to stdout, and is buffered.  Once roughly 500 words
    have accumulated (or the input is exhausted) the buffer is flushed to
    <rawTweetsPath>/<word>/<i>_texts_<location>.txt, prefixed with a
    "<d_i> lineCount" header line.

    Args:
        sortedRowS: sequence of DB rows; row[2] holds the tweet text.
        location:   location label embedded in each output filename.
        word:       keyword used as the output subdirectory name.
    """
    directory = config.rawTweetsPath() + word
    # Create the per-keyword output directory if it does not exist yet.
    if not os.path.exists(directory):
        os.makedirs(directory)

    total = len(sortedRowS)
    wordCount = 0
    fileIndex = 1
    processed = 0
    numberOfLines = 0
    tweetText = []

    for tweet in sortedRowS:
        text = tweet[2].lower()
        # Hook for keyword-combo filtering, currently disabled (always passes):
        # check = textCleanUp.searchForKeywordCombos(filterKeywords, text, filterWords, nlp)
        processed += 1
        numberOfLines += 1
        stripped = text.replace("\n", " ")
        wordCount += len(text.split(" "))
        print(stripped)
        tweetText.append(stripped)

        # Flush a chunk once ~500 words are buffered, or on the last tweet.
        if wordCount >= 500 or processed == total:
            wordCount = 0
            # Context manager guarantees the file is closed even on error
            # (the original opened/closed manually, leaking on exceptions).
            with open("%s/%s_texts_%s.txt" % (directory, fileIndex, location),
                      "w", encoding="utf-8") as out:
                out.write("<d_%s> %s\n" % (fileIndex, numberOfLines))
                for line in tweetText:
                    out.write("%s\n" % line)
            tweetText.clear()
            fileIndex += 1
            numberOfLines = 0
# Run the pipeline only when executed as a script; importing this module
# (e.g. to reuse textGen) no longer triggers the full database run.
if __name__ == "__main__":
    connect()