forked from ShoufaChen/clone-anonymous4open
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclone.py
More file actions
128 lines (104 loc) · 4.18 KB
/
clone.py
File metadata and controls
128 lines (104 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import urllib.request as urllib2
import re
from bs4 import BeautifulSoup
import os
import argparse
def parse_args():
    """Parse command-line options for the anonymous.4open.science cloner.

    Returns:
        argparse.Namespace with:
            clone_dir (str): local directory to clone into (default 'master').
            target (str): the anonymous.4open.science URL to clone, or None.
    """
    parser = argparse.ArgumentParser(
        description='Clone from the https://anonymous.4open.science')
    # help text typo fixed: 'loacation' -> 'location'
    parser.add_argument('--clone-dir', type=str, default='master',
                        help='master location')
    parser.add_argument('--target', type=str,
                        help='anonymous link you want to clone')
    return parser.parse_args()
def create_dir(name):
    """Create directory *name* if it does not already exist.

    Uses os.makedirs with exist_ok=True: unlike the original os.mkdir it
    also creates missing parent directories and is free of the
    check-then-create race.
    """
    os.makedirs(name, exist_ok=True)
def check_html(url):
    """Return True if *url* cannot be fetched, False if it loads fine.

    Used by clone_file to skip hrefs that do not resolve to a real page.
    The failed URL and the error are printed for visibility.
    """
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The body is not needed for a reachability probe; the original
        # stored .read() in an unused local and downloaded the whole page.
        urllib2.urlopen(req)
    except urllib2.URLError as e:
        print(e)
        print(url)
        return True
    return False
def pull_html(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup document.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.  The original
        code printed the error and fell through, then crashed with a
        confusing NameError on the unbound ``response``; re-raising makes
        the real failure explicit to the caller.
    """
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        print(e)
        print(url)
        raise
    content = response.decode('utf-8')
    return BeautifulSoup(content, "lxml")
def pull_trees(url):
    """Return every <div class="tree"> element (subfolder entry) at *url*."""
    page = pull_html(url)
    return page.find_all('div', attrs={'class': 'tree'})
def pull_blobs(url):
    """Return every <div class="blob"> element (file entry) at *url*."""
    page = pull_html(url)
    return page.find_all('div', attrs={'class': 'blob'})
def clone_file(url, download, root_url='https://anonymous.4open.science'):
    """Download every blob (file) listed on the directory page root_url+url.

    Args:
        url: site-relative path of the directory page to scan.
        download: local directory files are written under.
        root_url: site root, kept as a defaulted parameter for callers.

    Images are decoded from their base64 data URIs, markdown pages are
    converted with html2text (skipped gracefully if it is not installed),
    and everything else is saved as the page's <code> text.
    """
    import base64  # hoisted: the original re-imported inside the image loop

    blobs = pull_blobs(root_url + url)
    for blob in blobs:
        href = blob.a.get('href')
        split_href = href.split('/')
        file_name = '/'.join([download] + split_href[3:])
        print('Clone... ', file_name)
        # Skip hrefs that do not render to a page, and LICENSE files.
        if check_html(root_url + href) or split_href[-1] == 'LICENSE':
            continue
        blob_soup = pull_html(root_url + href)
        # Extension after the LAST dot; the original indexed .split('.')[1],
        # which raises IndexError on extensionless names (e.g. 'Makefile')
        # and picks the wrong segment for multi-dot names.
        leaf = split_href[-1]
        ext = leaf.rsplit('.', 1)[-1] if '.' in leaf else ''
        if ext in ('png', 'jpeg', 'gif'):
            for img in blob_soup.select("img[src]"):
                # src is a data URI: "data:image/<type>;base64,<payload>"
                head, data = img["src"].split(',', 1)
                with open(file_name, 'wb') as f:
                    f.write(base64.b64decode(data))
        elif ext == 'md':
            # BUG FIX: html2text was referenced but never imported, so every
            # .md file crashed with NameError.  Import lazily (third-party,
            # optional) and skip gracefully when it is missing.
            try:
                import html2text
            except ImportError:
                print('Skip (html2text not installed):', file_name)
                continue
            md_divs = blob_soup.find_all("div", {"class": "markdown-body"})
            with open(file_name, 'w') as f:
                f.write(html2text.html2text(str(md_divs)))
        else:
            source_code = blob_soup.find('code')
            with open(file_name, 'w') as f:
                f.write(source_code.get_text())
def clone_dirs(url, folders_url_list, download, root_url='https://anonymous.4open.science'):
    """Create local dirs for each subfolder of *url* and update the worklist.

    Args:
        url: site-relative path of the directory page just processed.
        folders_url_list: BFS worklist of directory hrefs still to visit;
            *url* is removed and its subfolders are appended.
        download: local root directory the tree is mirrored under.
        root_url: site root, kept as a defaulted parameter for callers.

    Returns:
        The updated worklist (the same list object, mutated in place).

    BUG FIX: the parameter was misspelled ``folders_url_lis`` while the body
    used ``folders_url_list``, so the function silently mutated the *global*
    list and only worked by accident in this script.  All callers pass the
    argument positionally, so renaming it is safe.
    """
    trees = pull_trees(root_url + url)
    for tree in trees:
        href = tree.a.get('href')
        split_href = href.split('/')
        # path under *download*: drop the site prefix (first 3 segments)
        # and the trailing empty segment of the folder href.
        folder_name = '/'.join([download] + split_href[3:-1])
        print('Clone... ', folder_name)
        create_dir(folder_name)
        folders_url_list.append(href)
    folders_url_list.remove(url)
    return folders_url_list
if __name__ == '__main__':
    args = parse_args()
    # Explicit validation instead of `assert` (asserts are stripped under
    # `python -O`); message typos fixed ("Plese specifipy").
    if not args.target:
        raise SystemExit(
            '\nPlease specify your target URL, \n e.g: '
            'python clone.py --target '
            'https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/')
    root_url = 'https://anonymous.4open.science'
    # Strip the site root once; the original re-stripped an already-relative
    # path on every call, which was a harmless no-op.
    target_url = args.target.replace(root_url, '')
    create_dir(args.clone_dir)
    # Breadth-first walk: clone the files of the front directory, then
    # clone_dirs removes it from the worklist and appends its subfolders.
    # The first iteration performs exactly the original's pre-loop work.
    folders_url_list = [target_url]
    while folders_url_list:
        url = folders_url_list[0]
        clone_file(url, args.clone_dir)
        folders_url_list = clone_dirs(url, folders_url_list, args.clone_dir)
    print('=' * 60)
    print('Successfully Clone to: {}'.format(args.clone_dir))
    print('=' * 60)