forked from ShoufaChen/clone-anonymous4open
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclone.py
More file actions
128 lines (104 loc) · 4.18 KB
/
clone.py
File metadata and controls
128 lines (104 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import urllib.request as urllib2
import re
from bs4 import BeautifulSoup
import os
import argparse
def parse_args():
    """Parse command-line options for the anonymous.4open.science cloner.

    Returns:
        argparse.Namespace with:
            clone_dir (str): local directory to clone into (default 'master').
            target (str): the anonymous.4open.science URL to clone, or None.
    """
    parser = argparse.ArgumentParser(
        description='Clone from the https://anonymous.4open.science')
    # help text typo fixed: 'loacation' -> 'location'
    parser.add_argument('--clone-dir', type=str, default='master',
                        help='master location')
    parser.add_argument('--target', type=str,
                        help='anonymous link you want to clone')
    return parser.parse_args()
def create_dir(name):
    """Create directory *name* if it does not already exist.

    Uses os.makedirs with exist_ok=True: unlike the original os.mkdir it
    also creates missing parent directories and is free of the
    check-then-create race.
    """
    os.makedirs(name, exist_ok=True)
def check_html(url):
    """Return True if *url* cannot be fetched, False if it loads fine.

    Used by clone_file to skip hrefs that do not resolve to a real page.
    The failed URL and the error are printed for visibility.
    """
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The body is not needed for a reachability probe; the original
        # stored .read() in an unused local and downloaded the whole page.
        urllib2.urlopen(req)
    except urllib2.URLError as e:
        print(e)
        print(url)
        return True
    return False
def pull_html(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup document.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.  The original
        code printed the error and fell through, then crashed with a
        confusing NameError on the unbound ``response``; re-raising makes
        the real failure explicit to the caller.
    """
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        print(e)
        print(url)
        raise
    content = response.decode('utf-8')
    return BeautifulSoup(content, "lxml")
def pull_trees(url):
    """Return every <div class="tree"> element (subfolder entry) at *url*."""
    page = pull_html(url)
    return page.find_all('div', attrs={'class': 'tree'})
def pull_blobs(url):
    """Return every <div class="blob"> element (file entry) at *url*."""
    page = pull_html(url)
    return page.find_all('div', attrs={'class': 'blob'})
def clone_file(url, download, root_url='https://anonymous.4open.science'):
    """Download every blob (file) listed on the directory page root_url+url.

    Args:
        url: site-relative path of the directory page to scan.
        download: local directory files are written under.
        root_url: site root, kept as a defaulted parameter for callers.

    Images are decoded from their base64 data URIs, markdown pages are
    converted with html2text (skipped gracefully if it is not installed),
    and everything else is saved as the page's <code> text.
    """
    import base64  # hoisted: the original re-imported inside the image loop

    blobs = pull_blobs(root_url + url)
    for blob in blobs:
        href = blob.a.get('href')
        split_href = href.split('/')
        file_name = '/'.join([download] + split_href[3:])
        print('Clone... ', file_name)
        # Skip hrefs that do not render to a page, and LICENSE files.
        if check_html(root_url + href) or split_href[-1] == 'LICENSE':
            continue
        blob_soup = pull_html(root_url + href)
        # Extension after the LAST dot; the original indexed .split('.')[1],
        # which raises IndexError on extensionless names (e.g. 'Makefile')
        # and picks the wrong segment for multi-dot names.
        leaf = split_href[-1]
        ext = leaf.rsplit('.', 1)[-1] if '.' in leaf else ''
        if ext in ('png', 'jpeg', 'gif'):
            for img in blob_soup.select("img[src]"):
                # src is a data URI: "data:image/<type>;base64,<payload>"
                head, data = img["src"].split(',', 1)
                with open(file_name, 'wb') as f:
                    f.write(base64.b64decode(data))
        elif ext == 'md':
            # BUG FIX: html2text was referenced but never imported, so every
            # .md file crashed with NameError.  Import lazily (third-party,
            # optional) and skip gracefully when it is missing.
            try:
                import html2text
            except ImportError:
                print('Skip (html2text not installed):', file_name)
                continue
            md_divs = blob_soup.find_all("div", {"class": "markdown-body"})
            with open(file_name, 'w') as f:
                f.write(html2text.html2text(str(md_divs)))
        else:
            source_code = blob_soup.find('code')
            with open(file_name, 'w') as f:
                f.write(source_code.get_text())
def clone_dirs(url, folders_url_list, download, root_url='https://anonymous.4open.science'):
    """Create local dirs for each subfolder of *url* and update the worklist.

    Args:
        url: site-relative path of the directory page just processed.
        folders_url_list: BFS worklist of directory hrefs still to visit;
            *url* is removed and its subfolders are appended.
        download: local root directory the tree is mirrored under.
        root_url: site root, kept as a defaulted parameter for callers.

    Returns:
        The updated worklist (the same list object, mutated in place).

    BUG FIX: the parameter was misspelled ``folders_url_lis`` while the body
    used ``folders_url_list``, so the function silently mutated the *global*
    list and only worked by accident in this script.  All callers pass the
    argument positionally, so renaming it is safe.
    """
    trees = pull_trees(root_url + url)
    for tree in trees:
        href = tree.a.get('href')
        split_href = href.split('/')
        # path under *download*: drop the site prefix (first 3 segments)
        # and the trailing empty segment of the folder href.
        folder_name = '/'.join([download] + split_href[3:-1])
        print('Clone... ', folder_name)
        create_dir(folder_name)
        folders_url_list.append(href)
    folders_url_list.remove(url)
    return folders_url_list
if __name__ == '__main__':
    args = parse_args()
    # Explicit validation instead of `assert` (asserts are stripped under
    # `python -O`); message typos fixed ("Plese specifipy").
    if not args.target:
        raise SystemExit(
            '\nPlease specify your target URL, \n e.g: '
            'python clone.py --target '
            'https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/')
    root_url = 'https://anonymous.4open.science'
    # Strip the site root once; the original re-stripped an already-relative
    # path on every call, which was a harmless no-op.
    target_url = args.target.replace(root_url, '')
    create_dir(args.clone_dir)
    # Breadth-first walk: clone the files of the front directory, then
    # clone_dirs removes it from the worklist and appends its subfolders.
    # The first iteration performs exactly the original's pre-loop work.
    folders_url_list = [target_url]
    while folders_url_list:
        url = folders_url_list[0]
        clone_file(url, args.clone_dir)
        folders_url_list = clone_dirs(url, folders_url_list, args.clone_dir)
    print('=' * 60)
    print('Successfully Clone to: {}'.format(args.clone_dir))
    print('=' * 60)