"""Crawl *.github.io sites found via the GitHub search API, inline each
site's CSS, and save the rendered HTML alongside a screenshot."""

import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from github import Github, GithubException
from playwright.sync_api import sync_playwright
from tqdm import tqdm

from crawl_w_css import fetch_and_embed_css
from screenshot import take_screenshot

GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
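# A personal access token is expected in the GITHUB_TOKEN environment variable;
# unauthenticated requests get a much lower search rate limit (10 requests per
# minute versus 30 when authenticated, at the time of writing).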
MAX_RETRIES = 5
BACKOFF_FACTOR = 2  # Exponential backoff multiplier
MAX_WORKERS = 32

gh = Github(GITHUB_TOKEN)

def api_request_with_retry(url, headers):
    """GET `url`, retrying on rate limits (403) and server errors (5xx)."""
    retries = 0
    while retries < MAX_RETRIES:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response
        elif response.status_code == 403:
            # Rate limited: wait until the reported reset time (at least 60
            # seconds), scaled by the exponential backoff factor.
            reset_time = int(response.headers.get('X-RateLimit-Reset', time.time() + 60))
            sleep_duration = max(reset_time - time.time(), 60) * BACKOFF_FACTOR ** retries
            print(f"Rate limit reached. Sleeping for {sleep_duration} seconds.")
            time.sleep(sleep_duration)
            retries += 1  # Count rate-limit waits as retries so the loop terminates
        elif response.status_code >= 500:
            # Retry on server errors
            retries += 1
            sleep_duration = BACKOFF_FACTOR ** retries
            print(f"Server error {response.status_code}. Retrying in {sleep_duration} seconds...")
            time.sleep(sleep_duration)
        else:
            print(f"Error fetching {url}: {response.status_code} - {response.text}")
            return None
    return None

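# NOTE: GitHub's secondary rate limits also answer with 403 (or 429) but send a
# `Retry-After` header instead of `X-RateLimit-Reset`; the helper above does not
# consult it, so those responses just fall back to the 60-second minimum wait.
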

def get_github_io_sites():
    """Search GitHub for repositories whose names end in .github.io."""
    headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3+json'
    }
    github_io_sites = []

    # Adjust the date ranges based on how far back you want to search.
    date_ranges = [
        "2024-01-01..2024-12-31",
        "2023-01-01..2023-12-31",
        "2022-01-01..2022-12-31",
        "2021-01-01..2021-12-31",
        "2020-01-01..2020-12-31",
        "2019-01-01..2019-12-31",
        "2018-01-01..2018-12-31",
        "2010-01-01..2017-12-31"
    ]

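    # The search API returns at most 1,000 results per query (10 pages of 100),
    # which is why the search is partitioned by creation date above.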
    for date_range in date_ranges:
        for page in range(1, 11):  # 10 pages of 100 results each
            url = (f"https://api.github.com/search/repositories"
                   f"?q=github.io+in:name+created:{date_range}&page={page}&per_page=100")
            print(f"Fetching url: {url}")
            response = api_request_with_retry(url, headers)
            if response and response.status_code == 200:
                data = response.json()
                repositories = data.get('items', [])
                if not repositories:
                    break  # No more results for this date range
                for repo in repositories:
                    if repo['name'].endswith('.github.io'):
                        github_io_sites.append(repo)
            elif response is None:
                print("Failed to fetch repositories after retries. Exiting.")
                break
    return github_io_sites

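# The `url` field of a search result is the repository's API URL; appending
# `/license` calls GET /repos/{owner}/{repo}/license, whose `license.spdx_id`
# identifies the detected license.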
def check_license(repo):
    """Return True if the repository's detected license is on the allowlist."""
    headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3+json'
    }
    license_url = repo['url'] + '/license'
    response = api_request_with_retry(license_url, headers)
    if response and response.status_code == 200:
        license_info = response.json().get('license')
        if license_info and license_info.get('spdx_id') in ['MIT', 'Apache-2.0', 'GPL-3.0', 'BSD-3-Clause']:
            return True
    return False

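# A user/organization Pages repository is named `<owner>.github.io` and is
# served at `https://<owner>.github.io`, so the repository name below doubles
# as the site's hostname.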
def process_site(repo):
    """Fetch a site, inline its CSS, save the HTML, and render a screenshot."""
    base_dir = '/juice2/scr2/nlp/pix2code/zyanzhe/Github_Pages'

    site_url = f"https://{repo['name']}"
    if check_license(repo):
        html_content = fetch_and_embed_css(site_url)
        repo_name = repo['name'].replace(".github.io", "")
        if html_content:
            with open(f'{base_dir}/{repo_name}.html', 'w') as f:
                f.write(html_content)
            take_screenshot(f'{base_dir}/{repo_name}.html', f'{base_dir}/{repo_name}.png', do_it_again=True)
        else:
            print(f"Failed to process {site_url}")
    else:
        print(f"License check failed for {site_url}, skipping.")

def main():
    sites = get_github_io_sites()
    print(f"There are a total of {len(sites)} sites")
    # sites = sites[:16]  # Uncomment to smoke-test on a small sample

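    # The workload is I/O-bound (HTTP fetches and disk writes), so a thread
    # pool is a reasonable fit despite the GIL.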
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(process_site, repo): repo for repo in sites}
        with tqdm(total=len(futures), desc="Processing sites") as pbar:
            for future in as_completed(futures):
                try:
                    future.result()  # Retrieve the result to catch exceptions
                except Exception as e:
                    print(f"An error occurred during processing: {e}")
                finally:
                    pbar.update(1)  # Update the progress bar

if __name__ == '__main__':
    main()