-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCodechefScraper.py
More file actions
96 lines (83 loc) · 3.13 KB
/
CodechefScraper.py
File metadata and controls
96 lines (83 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import re
import time
import urllib
# The downloaded submission text is HTML-encoded, so it is decoded with this
# table before being written to the .txt file. Each pair is
# (literal character, HTML entity); the entity strings must stay as literal
# entity text, otherwise every replacement becomes a no-op.
# '&amp;' is deliberately last: decoding it earlier would turn a
# double-escaped sequence such as '&amp;lt;' into '<' instead of '&lt;'.
htmlCodes = (
    ("'", '&#39;'),
    ('"', '&quot;'),
    ('>', '&gt;'),
    ('<', '&lt;'),
    ('&', '&amp;'),
)
# NOTE: this script is written for Python 2 (raw_input, urllib.urlopen).
username = raw_input("Enter Username: ")
url = "https://www.codechef.com/users/" + username

# Request the profile page until a usable response arrives. The server can
# answer with an "Internal Server Error" page, which is treated like a failed
# request and retried.
while True:
    try:
        html = urllib.urlopen(url).read()
    except Exception:
        # Deliberately broad: any failed request is simply retried. Unlike a
        # bare "except:", this still lets KeyboardInterrupt/SystemExit
        # through, so the loop can be aborted with Ctrl-C.
        html = None
    if html is not None and "Internal Server Error" not in html:
        break
    # Pause briefly so a failing server or network is not hammered in a
    # tight loop.
    time.sleep(1)
# Collect every link on the profile page. The capture starts right after the
# opening quote of the href and runs up to the closing '>' of the tag, so it
# may still contain the closing quote and further attributes.
pattern = '<a href="(.+?)>'
links = re.findall(pattern, html)

# questions is a list of string lists: element 0 is the part of the link
# before "/status/" (the contest, per the original note) and element 1 is the
# part after it, cut at the closing quote of the href (the question).
questions = []
seen = set()  # element-1 values already recorded, to avoid duplicate entries
for link in links:
    # Require the full "/status/" separator: a link that merely contains the
    # word "status" would not split into two parts.
    if "/status/" not in link:
        continue
    parts = link.split("/status/")
    parts[1] = parts[1].split('"')[0]
    if parts[1] in seen:
        continue
    seen.add(parts[1])
    questions.append(parts)
totalquestions = len(questions)
# Create the output directory named after the user. os.makedirs also fails
# when the directory already exists, so a second run for the same user stops
# here as well.
try:
    os.makedirs(username)
except OSError:
    print("Could not make directory")
    # Exit with a non-zero status so callers can tell the run failed.
    raise SystemExit(1)
print("Created directory for user : " + username)
def _fetch_page(page_url):
    """Return the body of page_url, retrying until a usable page arrives.

    A failed request, or a response containing "Internal Server Error", is
    retried after a short pause. The function only returns once a page
    without that marker has been read.
    """
    while True:
        try:
            body = urllib.urlopen(page_url).read()
        except Exception:
            # Deliberately broad: any failed request is retried. Unlike a
            # bare "except:", Ctrl-C still interrupts the loop.
            body = None
        if body is not None and "Internal Server Error" not in body:
            return body
        time.sleep(1)  # do not hammer the server in a tight loop


# Extracts the cell texts of one submission row ("get only necessary data").
_CELL_RE = re.compile('[0-9]">(.+?)</td>')

for number, question in enumerate(questions, 1):
    contest = question[0]
    problem = question[1]
    question_name = problem.split("," + username)[0]
    print("Getting question:" + question_name + " " + str(number) + "/" + str(totalquestions))

    url = "https://www.codechef.com/" + contest + "/status/" + problem
    question_dir = username + "/" + contest + "_" + question_name
    os.makedirs(question_dir)

    # The status page lists all submissions as table rows; element 0 of the
    # split is the markup before the first row and is skipped.
    # NOTE(review): the separator contains literal backslashes, presumably
    # because the rows arrive inside an escaped string - confirm against a
    # live response.
    rows = _fetch_page(url).split('<tr class=\\"kol\\"')
    for row in rows[1:]:
        # Only accepted solutions carry the tick icon.
        if "<img src='/misc/tick-icon.gif'>" not in row:
            continue
        cells = _CELL_RE.findall(row)
        if len(cells) < 7:
            # Row does not have the expected cells; skip it instead of
            # failing with an IndexError.
            continue
        sub_id = cells[0]
        sub_time = cells[4]
        sub_mem = cells[5]
        sub_lang = cells[6]

        subhtml = _fetch_page("https://www.codechef.com/viewplaintext/" + sub_id)
        # Keep only the text between the first '>' and the following '<',
        # i.e. the content of the first element of the page. partition()
        # yields an empty string instead of raising when no '>' is present.
        subhtml = subhtml.partition(">")[2].partition("<")[0]
        for code in htmlCodes:
            subhtml = subhtml.replace(code[1], code[0])

        # NOTE: submissions sharing language, time and memory map to the same
        # file name, so a later one overwrites an earlier one.
        file_path = question_dir + "/" + sub_lang + "_" + sub_time + "_" + sub_mem + ".txt"
        with open(file_path, 'w') as out_file:
            out_file.write(subhtml)
print("All Done!")