-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathScrape past results.py
More file actions
414 lines (375 loc) · 23 KB
/
Scrape past results.py
File metadata and controls
414 lines (375 loc) · 23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
import re
import sqlite3
from datetime import date, datetime
from time import sleep as sleep
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup as BS
# Open (or create) the results database; every helper below shares this one
# connection and cursor at module level.
mydb = sqlite3.connect("CSGO-Results.db")
cursor = mydb.cursor()
#order of data entry should be
#Team, Player, Event, Game, GameMap, PlayerMap, TeamMap
# Base URL used to turn relative hrefs scraped from HLTV pages into absolute links.
URL = "https://www.hltv.org"
# NOTE(review): this module-level request is never used -- every function
# builds its own Request.  Presumably leftover; confirm before removing.
req = Request("https://www.hltv.org/results", headers={'User-Agent':'Mozilla/6.0'})
def lookAtResult(url, date):
    """Scrape one HLTV match-result page.

    Args:
        url:  absolute URL of the match page.
        date: date string ('YYYY-MM-DD') stored with the Game row.

    Returns:
        "False"          -- page invalid (forfeit marker) or fetch failed.
        "MissingMap(s)"  -- not all stats tables are published yet.
        (statsUrl, numberOfMaps, gameID, roundsWon) on success, where
        roundsWon maps map name -> {team name: rounds won}.

    Side effects: inserts Event and Game rows via the database helpers.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/6.0'})
    try:
        webpage = urlopen(req)
    except HTTPError as e:  # server-side error
        print(e)  # FIX: was print("e"), which printed the literal letter e
        return("False")  # FIX: previously fell through returning None
    except URLError as e:  # URL could not be resolved
        print("Server could not be found")
        return("False")  # FIX: previously fell through returning None
    else:
        html = BS(webpage.read(), "html.parser")
        # A "1" under the winning team's logo marks the match page invalid
        # (e.g. a forfeit), so it must not be scraped.
        classWon = "0"  # FIX: default so a missing "won" div cannot leave it unbound below
        try:
            classWon = html.find("div", {"class": "won"}).getText()
            valid = classWon != "1"
        except AttributeError:  # rare tie: the "won" div may be absent entirely
            valid = True
        if valid == False:
            print("Result isnt valid... ignoring...")
            return("False")
        # If the scraper hits the page before HLTV has published every map's
        # stats table, some tables are missing -- detect that and bail out.
        try:
            maplist = html.find("div", {"class": "box-headline flexbox nowrap header"})
            maplist = maplist.find("div", {"class": "flexbox nowrap"})
            maplist = maplist.findAll("div", {"class": "small-padding"})
        except AttributeError:
            print("Oh, its a best of 1 but without any tables of data...")
            return("MissingMap(s)")
        numberOfMaps = (len(maplist) - 1)
        # classWon + classLost = maps played (for a series) or rounds (best of 1).
        classLost = html.find("div", {"class": "lost"}).getText()
        mapsCheck = int(classLost) + int(classWon)
        print("there are {0} tables of data for {1} map(s)".format(numberOfMaps, mapsCheck))
        if mapsCheck > 5:
            # Best of 1: the headline score is rounds, not maps, so the
            # maps-vs-tables comparison below does not apply.
            print("dont worry about maps check, its only a best of 1")
            mapsCheck = 1
        elif mapsCheck == numberOfMaps:
            print("All stats are there")
        else:
            print("RESULT MISSING DATA!")
            return("MissingMap(s)")
        # Collect per-map round scores: {map name: {team name: rounds won}}.
        mapInformation = html.findAll("div", {"class": "mapholder"})
        roundsWon = {}
        for i in range(mapsCheck):
            teamNames = mapInformation[i].findAll("div", {"class": "results-teamname text-ellipsis"})
            teamScores = mapInformation[i].findAll("div", {"class": "results-team-score"})
            roundsWon.update({
                mapInformation[i].find("div", {"class": "mapname"}).getText(): {
                    str(teamNames[0].getText()): int(teamScores[0].getText()),
                    str(teamNames[1].getText()): int(teamScores[1].getText()),
                }
            })
        print(roundsWon)
        text = html.find("div", {"class": "padding preformatted-text"}).getText()
        print(text)  # e.g. "Best of 3 (LAN)"
        text = html.find("div", {"class": "event text-ellipsis"}).getText()
        print(text)  # event name
        eventID = eventDatabaseEntry(text)
        # Maps won by the winner implies the series format.
        if classWon == "2":
            gameFormat = "Best of 3"
        elif classWon == "3":
            gameFormat = "Best of 5"
        else:
            gameFormat = "Best of 1"
        gameID = gameDatabaseEntry(gameFormat, numberOfMaps, date, eventID)
        # Link to the detailed-stats page for this match.
        latestResult = html.find("div", {"class": "small-padding stats-detailed-stats"}).find("a", href=True)
        url = "".join([URL, latestResult["href"]])
        print(url)
        return(url, numberOfMaps, gameID, roundsWon)
def getTable(url, numberOfMaps):
    """Fetch a match's detailed-stats page.

    Args:
        url: absolute URL of the detailed-stats page.
        numberOfMaps: number of stats tables expected (1 means best of 1).

    Returns:
        (True, tables)  -- best of 1: the stats tables are on this page.
        (False, hrefs)  -- series: relative URLs of each map's stats page.
        None            -- the page could not be fetched (callers must cope).
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/6.0'})
    try:
        webpage = urlopen(req)
    except HTTPError as e:  # server-side error
        print(e)  # FIX: was print("e"), which printed the literal letter e
    except URLError as e:  # URL could not be resolved
        print("Server could not be found")
    else:
        html = BS(webpage.read(), "html.parser")
        if numberOfMaps == 1:  # best of 1: tables are already on this page
            mapTableList = html.findAll("table", {"class": "stats-table"})
            return(True, mapTableList)
        else:  # series: collect the link to each map's own stats page
            mapListHTML = html.findAll("a", {"class": "col stats-match-map standard-box a-reset inactive"})
            mapListURL = [tag.get("href") for tag in mapListHTML]
            return(False, mapListURL)
def eventDatabaseEntry(eventName):
    """Return the EventID for *eventName*, inserting a new Event row if needed.

    Uses the module-level connection/cursor; commits any insert immediately.
    """
    cursor.execute("SELECT EventID FROM Event WHERE EventName = (?)", (eventName,))
    temp = cursor.fetchall()
    if not temp:
        cursor.execute("INSERT INTO Event (EventName) VALUES (?)", (eventName,))
        # FIX: commit through the connection instead of executing a raw
        # "COMMIT" statement on the cursor (fragile w.r.t. sqlite3's
        # implicit transaction handling).
        mydb.commit()
        print("Event stored in database, getting ID")
        # FIX: lastrowid is the EventID of the row just inserted -- the old
        # code re-queried the table to find it.
        return(cursor.lastrowid)
    else:
        print("Event is already in database, getting ID")
        return(temp[0][0])  # EventID of the first (and only expected) match
def gameDatabaseEntry(gameFormat, numberOfMaps, date, event):
    # Insert one Game row (format, map count, date, owning event) and return
    # its auto-generated GameID.
    cursor.execute("INSERT INTO Game (Format,NumberOfMaps,Date,EventID) VALUES (?,?,?,?)",(gameFormat, numberOfMaps, date, event,))
    cursor.execute("COMMIT")
    return(cursor.lastrowid)
def gameMapDatabaseEntry(map, rounds, gameID):
    # Insert one gameMap row (map name, rounds played, owning game) and return
    # its auto-generated GameMapID.
    # NOTE(review): parameter `map` shadows the builtin -- consider renaming.
    cursor.execute("INSERT INTO gameMap (RoundsPlayed,MapName,GameID) VALUES (?,?,?)",(rounds,map,gameID,))
    cursor.execute("COMMIT")
    return(cursor.lastrowid)
def multipleMapsHandler(mapListURL, gameID, roundsWon):
    """Fetch and parse the stats page of every map in a multi-map series.

    Only called for matches that are not a best of 1.

    Args:
        mapListURL: relative hrefs of the per-map stats pages, in map order.
        gameID:     Game row the maps belong to.
        roundsWon:  {map name: {team: rounds}}, in the same order as mapListURL.
    """
    # FIX: dicts preserve insertion order on Python 3.7+, so the i-th map name
    # can be taken directly -- the old code walked the dict with a counter
    # (its comment wrongly claimed dicts are unordered).
    mapNames = list(roundsWon)
    for i in range(len(mapListURL)):
        mapRoundsWon = {mapNames[i]: roundsWon[mapNames[i]]}
        url = "".join([URL, mapListURL[i]])
        print(url)
        print("waiting 5 seconds")
        sleep(5)  # throttle requests so HLTV is not hammered
        req = Request(url, headers={'User-Agent': 'Mozilla/6.0'})
        try:
            webpage = urlopen(req)
        except HTTPError as e:  # server-side error
            print(e)  # FIX: was print("e"), which printed the literal letter e
        except URLError as e:  # URL could not be resolved
            print("Server could not be found")
        else:
            html = BS(webpage.read(), "html.parser")
            mapsTables = html.findAll("table", {"class": "stats-table"})
            lookAtTable(mapsTables, gameID, mapRoundsWon)
def checkIfTeamExists(teamName):
    """Look a team up by name.

    Returns [exists, teamID]: exists is True/False; teamID is the database id
    when found, otherwise 0.
    """
    teamID = 0
    cursor.execute("SELECT * FROM Team WHERE TeamName = (?)", (teamName,))
    temp = cursor.fetchall()
    if not temp:
        return [False, teamID]  # team does not exist yet
    for i in temp:
        if len(i) == 2:  # expect exactly the (TeamID, TeamName) pair
            teamID = i[0]
        else:
            # FIX: repaired the garbled message ("Using returning 0as teamID")
            print("ERROR checking a team's TeamID... Was there more than one team named " + teamName + "? Returning 0 as teamID")
    return [True, teamID]  # team does exist
def playerDatabaseManager(playerName, nationality, teamID):
    """Ensure a Player row exists and return its PlayerID.

    New players are inserted linked to *teamID*.  For existing players the
    stored TeamID is compared against *teamID* and a notice is printed when
    previous results place them on a different team.
    """
    # A player is identified by the (nickname, nationality) pair.
    cursor.execute("SELECT NickName,Nationality FROM Player WHERE NickName = (?) AND Nationality = (?)", (playerName, nationality,))
    if cursor.fetchall() == []:
        # Player does not exist yet -- insert and return the fresh rowid.
        cursor.execute("INSERT INTO Player (NickName,Nationality,TeamID) VALUES(?,?,?)", (playerName, nationality, teamID))
        cursor.execute("COMMIT")
        return(cursor.lastrowid)
    else:
        # Second check -- only relevant to players already in the database.
        playerID = None  # FIX: was unbound (NameError) if the loop below never set it
        cursor.execute("SELECT TeamID,PlayerID FROM Player WHERE NickName = (?) AND Nationality = (?)", (playerName, nationality))
        for i in cursor.fetchall():
            if len(i) == 2:  # expect exactly the (TeamID, PlayerID) pair
                if i[0] == teamID:
                    print("The current teamID for this player is correct!")
                else:
                    print("Previous results show this player is in a different team")
                playerID = i[1]
            else:
                print("ERROR checking a player's TeamID... Was there more than one player named " + playerName + "?")
        return(playerID)
def teamMapDatabaseEntry(mapName, roundsWon, roundCount, teamID, gameMapID):
    """Store one team's result on one map (win/tie/loss flag plus rounds)."""
    roundsLost = roundCount - roundsWon
    # Outcome flag derived from the round difference.
    outcome = "w" if roundsWon > roundsLost else "t" if roundsWon == roundsLost else "l"
    cursor.execute(
        "INSERT INTO TeamMap (GameMapID,TeamID,Won,RoundsWon,RoundsLost) VALUES (?,?,?,?,?)",
        (gameMapID, teamID, outcome, roundsWon, roundsLost,),
    )
    cursor.execute("COMMIT")
def playerMapDatabaseEntry(mapID, playerID, kills, deaths, adr, kast, rating):
    """Store a single player's statline for one map."""
    row = (mapID, playerID, kills, deaths, adr, kast, rating)
    cursor.execute(
        "INSERT INTO PlayerMap (GameMapID,PlayerID,Kills,Deaths,adr,kast,Rating) VALUES (?,?,?,?,?,?,?)",
        row,
    )
    cursor.execute("COMMIT")
#def lookAtTable(tableData, teamData, gameMapID):#Data imported is the tables, not the whole html page
def lookAtTable(tableData,gameID,teamData):
    # Parse every stats table of one map and store the team and player rows.
    # tableData: list of <table class="stats-table"> tags (two per map, one per team).
    # gameID:    Game row this map belongs to.
    # teamData:  {map name: {team name: rounds won}} -- expected to hold exactly
    #            one map (see the author's note below).
    for each in teamData:
        #Theres only ever going to be one item in this... maybe it was better off afterall to make it a list
        #instead of a dictionary.
        roundCount = 0
        print(teamData)
        # Total rounds played on this map = sum of both teams' round counts.
        for k in teamData[each]:
            roundCount += teamData[each][k]
        gameMapID = gameMapDatabaseEntry(each, roundCount, gameID)
        for tableLoopIndex in range(len(tableData)):
            # Two tables per map (one per team), hence the division by 2.
            print("""
Map Number {0}
""".format(tableLoopIndex/2))
            currentTable = tableData[tableLoopIndex]
            teamName = currentTable.find("th", {"class":"st-teamname text-ellipsis"}).getText()
            print(teamName)
            team = checkIfTeamExists(teamName)
            teamID = team[1]
            if team[0] == True:
                print("This team is already in the database.")
            else:
                print("This team is not in the database... adding...")
                cursor.execute("INSERT INTO Team (TeamName) VALUES (?)",(teamName,))
                cursor.execute("COMMIT")
                # Re-query to learn the TeamID that was just auto-assigned.
                cursor.execute("SELECT TeamID FROM Team WHERE TeamName = (?)",(teamName,))
                for i in cursor.fetchall():#This is messy but it brings out the tuple from the list
                    if len(i)==1:#Makes sure there is only one item in the tuple, theres no reason as to why it shouldnt, but this is keeping sure.
                        teamID = i[0]
                    else:
                        print("ERROR getting teamID... Was there more than one team named " + teamName+"?")
                print("Successfully added to database with team ID: {0}".format(teamID))
            teamMapDatabaseEntry(each,teamData[each][teamName],roundCount,teamID,gameMapID)
            name = currentTable.findAll("td", {"class":"st-player"})#Finds the HTML tag containing each players name
            nationality = []
            # Player nationality comes from the flag image's alt text,
            # e.g. <img alt="Thailand" ...>.
            for i in name:
                nationality.append((i.find("img", alt = True))['alt'])
            kills = currentTable.findAll("td",{"class":"st-kills"} )
            deaths = currentTable.findAll("td", {"class": "st-deaths"})
            kast = currentTable.findAll("td", {"class":"st-kdratio"})
            adr=currentTable.findAll("td", {"class":"st-adr"})
            rating=currentTable.findAll("td", {"class":"st-rating"})
            for i in range (len(name)):#Goes through every person in game
                # len(name) rather than a fixed 5: a 6th stand-in player can
                # appear after a technical issue.
                print(name[i].getText())#Prints only the text of the player name
                print(nationality[i])
                playerID = playerDatabaseManager(name[i].getText(),nationality[i],teamID)
                print(kills[i].getText())
                print(deaths[i].getText())
                print(kast[i].getText())
                print(adr[i].getText())
                print(rating[i].getText())
                # kills text is "K (hs)" -> take the first token; kast ends in
                # '%' -> strip it before converting to float.
                playerMapDatabaseEntry(gameMapID,playerID,int(kills[i].getText().split()[0]),int(deaths[i].getText()),float(adr[i].getText()),float(kast[i].getText().split('%')[0]),float(rating[i].getText()))
                print("NEXT")
                print("")
            # NOTE(review): the placement of this label print is a best guess --
            # the original file's indentation was lost in extraction.
            print("PLAYER:")
def checkingForMap(URL, date=None):
    """Retry a match page whose stats tables were incomplete.

    Tries up to 3 times, waiting 30 seconds between attempts, and gives up
    after the third failure.  *date* (new, optional, backward-compatible) is
    forwarded to lookAtResult, which requires it -- the original call passed
    only the URL and would have raised TypeError.
    """
    print("Only goes here if there is a stats table missing")
    print("Ill give it 3 tries. if it fails on all, it will give up and move on")
    count = 0
    while count < 3:
        print("Waiting 30 seconds")
        sleep(30)
        data = lookAtResult(URL, date)  # FIX: date argument was missing
        if data == "MissingMap(s)":  # FIX: old code compared data[0] (just the first character)
            count = count + 1
        else:
            print("All stats are now there")
            if data != "False":
                # Mirror main(): unpack the result, fetch the stats tables,
                # then parse them (the old code fed lookAtResult's tuple
                # straight into lookAtTable with mismatched arguments).
                url, numberOfMaps, gameID, roundsWon = data
                tables = getTable(url, numberOfMaps)
                if tables is not None:
                    if tables[0] == True:  # best of 1
                        lookAtTable(tables[1], gameID, roundsWon)
                    else:  # series
                        multipleMapsHandler(tables[1], gameID, roundsWon)
            return  # FIX: old code never left the loop after a success
def getAllResultsOnPage(resultsPage):
req = Request(resultsPage, headers={'User-Agent':'Mozilla/6.0'})
try:
webpage = urlopen(req)#Open hltv results page
except HTTPError as e:#If there is a server error
print("e")#show the error
except URLError as e:#If URL does not exist
print("Server could not be found")
else:#If there are no errors
#Scrapes
html = BS(webpage.read(), "html.parser")#the html is stored
temp = html.find('div', {"class": "big-results"})#if this doesnt exists(is there featured results)
#messy.........
if temp == None:
print("There is no featured results")
else:#if there is no errors, meaning there is featured results
print("There is a featured results, removing!")
html.find('div', {"class":"big-results"}).decompose()#no errors means there is a featured results, so this line removes the features results.
#still crap but ammended it so it works...
#<div class="results-sublist"><span class="standard-headline">Results for November 16th 2019</span>
returnedList = []
latestResult = html.findAll("div", {"class":"results-sublist"})
for i in range(len(latestResult)):
dateString = latestResult[i].find("span", {"class":"standard-headline"}).getText()
string2 = (dateString.split('for ')[1])
string3 = string2.split(' ')
string3[1] = string3[1].split('th')[0]
string3[2] = string3[2].split('0')[1]
if string3[0] == "March":
string3[0] = "03"
elif string3[0] == "April":
string3[0] = "04"
elif string3[0] == "December":
string3[0] = "12"
elif string3[0] == "January":
string3[0] = "01"
elif string3[0] == "February":
string3[0] = "02"
if string3[2] == "2":
string3[2] = "2020"
list1 = [string3[2],string3[0],string3[1]]
print("-".join(list1))
returnedList.append("-".join(list1))
results = latestResult[i].findAll("div", {"class":"result-con"})
for j in range(len(results)):
results[j] = results[j].find("a", href=True)
results[j] = "".join([URL, results[j]["href"]])
returnedList.append(results)
latestResult = html.findAll("div", {"class":"result-con"})#Finds the latest result
return returnedList
def main():
    """Walk HLTV's results pages (newest first) and scrape every match.

    Pages are addressed via an offset query parameter; each pass processes
    one page, then the offset is decreased by 100 until it goes negative.
    """
    resultsPage = ["https://www.hltv.org/results?offset=", "0"]
    while int(resultsPage[1]) >= 0:
        resultURL = getAllResultsOnPage("".join(resultsPage))
        print(resultURL)
        # FIX: the original overwrote resultURL here with a huge hard-coded
        # list of April 2020 matches (leftover debugging), so the freshly
        # scraped page was ignored.  The override has been removed.  An
        # unreachable dangling "old url" else-branch (the lastResult check it
        # belonged to was lost) has been dropped as well.
        if resultURL is None:
            # Page fetch failed; skip this offset rather than crashing.
            resultURL = []
        date = None
        for i in range(len(resultURL)):
            if (i % 2) == 0:  # even entries are date strings ('YYYY-MM-DD')
                print(resultURL[i])
                date = resultURL[i]
            else:  # odd entries are that date's list of match URLs
                for j in range(len(resultURL[i])):
                    print("waiting 10 seconds")
                    sleep(10)  # throttle requests between match pages
                    data = lookAtResult(resultURL[i][j], date)
                    if data == "MissingMap(s)":
                        print("stats tables incomplete -- skipping this result")
                    elif data == "False":
                        print("ignored!")
                    elif data is not None:  # guard against a None fetch failure
                        print(data)
                        url = data[0]
                        numberOfMaps = data[1]
                        gameID = data[2]
                        roundsWon = data[3]
                        print("waiting 5 seconds")
                        sleep(5)
                        data = getTable(url, numberOfMaps)
                        if data is None:
                            print("could not fetch stats page -- skipping")
                        elif data[0] == True:  # best of 1: tables already in hand
                            lookAtTable(data[1], gameID, roundsWon)
                        else:  # series: fetch each map's stats page
                            multipleMapsHandler(data[1], gameID, roundsWon)
        print("waiting 120 seconds...")
        sleep(120)
        resultsPage[1] = str(int(resultsPage[1]) - 100)

if __name__ == "__main__":  # guard so importing the module does not start scraping
    main()