Skip to content

Commit 7a575e4

Browse files
committed
data crawling and evaluation scripts for Design2Code-hard
1 parent 338a1b0 commit 7a575e4

5 files changed

Lines changed: 888 additions & 4 deletions

File tree

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import os
2+
import requests
3+
import time
4+
from playwright.sync_api import sync_playwright
5+
from concurrent.futures import ThreadPoolExecutor, as_completed
6+
from github import Github, GithubException
7+
from tqdm import tqdm
8+
9+
from crawl_w_css import fetch_and_embed_css
10+
from screenshot import take_screenshot
11+
12+
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
13+
MAX_RETRIES = 5
14+
BACKOFF_FACTOR = 2 # Exponential backoff multiplier
15+
16+
MAX_WORKERS = 32
17+
18+
19+
gh = Github(GITHUB_TOKEN)
20+
21+
def api_request_with_retry(url, headers):
    """GET *url* with retry and backoff for GitHub API failures.

    Retries up to ``MAX_RETRIES`` times on rate limiting (HTTP 403) and
    server errors (HTTP 5xx), sleeping with exponential backoff between
    attempts. Other status codes are treated as non-retryable.

    Args:
        url: Fully-qualified GitHub API URL to fetch.
        headers: dict of HTTP headers (carries the auth token).

    Returns:
        The successful ``requests.Response`` (status 200), or ``None``
        after exhausting retries or on a non-retryable status code.
    """
    retries = 0
    while retries < MAX_RETRIES:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response
        elif response.status_code == 403:
            # Handle rate limiting: wait until the advertised reset time
            # (at least 60 seconds), scaled by exponential backoff.
            reset_time = int(response.headers.get('X-RateLimit-Reset', time.time() + 60))
            sleep_duration = max(reset_time - time.time(), 60) * BACKOFF_FACTOR ** retries  # Wait at least 60 seconds
            print(f"Rate limit reached. Sleeping for {sleep_duration} seconds.")
            time.sleep(sleep_duration)
            # Bug fix: 403 previously never incremented `retries`, so a
            # persistent 403 (GitHub also uses it for "forbidden", e.g. a
            # bad token) looped forever. Count it against MAX_RETRIES.
            retries += 1
        elif response.status_code >= 500:
            # Retry on transient server errors with exponential backoff.
            retries += 1
            sleep_duration = BACKOFF_FACTOR ** retries
            print(f"Server error {response.status_code}. Retrying in {sleep_duration} seconds...")
            time.sleep(sleep_duration)
        else:
            # Client errors (404, 422, ...) won't improve on retry.
            print(f"Error fetching {url}: {response.status_code} - {response.text}")
            return None
    return None
43+
44+
def get_github_io_sites():
    """Collect repositories named ``*.github.io`` via the GitHub search API.

    Walks several creation-date windows, pulling up to 10 pages of 100
    results each, and keeps only repos whose name ends with
    ``.github.io`` (user/organization pages sites).

    Returns:
        A list of repository JSON objects as returned by the search API.
    """
    auth_headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3+json'
    }
    collected = []

    # Adjust the date ranges based on how far back you want to search
    date_windows = (
        "2024-01-01..2024-12-31",
        "2023-01-01..2023-12-31",
        "2022-01-01..2022-12-31",
        "2021-01-01..2021-12-31",
        "2020-01-01..2020-12-31",
        "2019-01-01..2019-12-31",
        "2018-01-01..2018-12-31",
        "2010-01-01..2017-12-31",
    )

    for window in date_windows:
        for page in range(1, 11):  # 10 pages of 100 results each
            url = (
                "https://api.github.com/search/repositories"
                f"?q=github.io+in:name+created:{window}&page={page}&per_page=100"
            )
            print(f"Fetching url: {url}")
            resp = api_request_with_retry(url, auth_headers)
            if resp is None:
                # Helper exhausted its retries; give up on this window.
                print("Failed to fetch repositories after retries. Exiting.")
                break
            if resp.status_code != 200:
                continue
            items = resp.json().get('items', [])
            if not items:
                break  # no more results for this window
            collected.extend(r for r in items if r['name'].endswith('.github.io'))
    return collected
80+
81+
82+
def check_license(repo):
    """Return True iff *repo* carries an accepted open-source license.

    Queries the repository's ``/license`` endpoint and accepts only the
    SPDX ids MIT, Apache-2.0, GPL-3.0, or BSD-3-Clause.

    Args:
        repo: Repository JSON object from the GitHub search API.
    """
    permitted = ('MIT', 'Apache-2.0', 'GPL-3.0', 'BSD-3-Clause')
    resp = api_request_with_retry(
        repo['url'] + '/license',
        {
            'Authorization': f'token {GITHUB_TOKEN}',
            'Accept': 'application/vnd.github.v3+json'
        },
    )
    if not (resp and resp.status_code == 200):
        return False
    info = resp.json().get('license')
    if info and info.get('spdx_id') in permitted:
        return True
    return False
94+
95+
def process_site(repo):
    """Crawl one github.io site: license check, fetch HTML, screenshot.

    For repos that pass the license check and whose page can be fetched,
    writes ``<name>.html`` and ``<name>.png`` under *base_dir*.

    Args:
        repo: Repository JSON object from the GitHub search API.
    """
    base_dir = '/juice2/scr2/nlp/pix2code/zyanzhe/Github_Pages'

    site_url = f"https://{repo['name']}"
    if not check_license(repo):
        print(f"License check failed for {site_url}, skipping.")
        return

    html_content = fetch_and_embed_css(site_url)
    if not html_content:
        print(f"Failed to process {site_url}")
        return

    repo_name = repo['name'].replace(".github.io", "")
    html_path = f'{base_dir}/{repo_name}.html'
    with open(html_path, 'w') as f:
        f.write(html_content)
    take_screenshot(html_path, f'{base_dir}/{repo_name}.png', do_it_again=True)
112+
113+
114+
def main():
    """Fetch the site list, then crawl every site concurrently.

    Fans out ``process_site`` over a thread pool (crawling is I/O-bound)
    and tracks completion with a progress bar; per-site exceptions are
    reported but never abort the run.
    """
    sites = get_github_io_sites()
    print(f"There are a total of {len(sites)} sites")
    # sites = sites[:16]  # uncomment to smoke-test on a small sample

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(process_site, repo): repo for repo in sites}
        with tqdm(total=len(futures), desc="Processing sites") as pbar:
            for future in as_completed(futures):
                try:
                    future.result()  # Retrieve the result to catch exceptions
                except Exception as e:
                    # Keep going: one bad site must not kill the crawl.
                    print(f"An error occurred during processing: {e}")
                finally:
                    pbar.update(1)  # Update the progress bar

    # Removed: the old sequential crawl loop that was kept here as
    # commented-out code — it duplicated process_site() and had drifted
    # (missing do_it_again=True, undefined base_dir at this scope).


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)