-
Notifications
You must be signed in to change notification settings - Fork 181
Expand file tree
/
Copy path: fix_mirror.py
More file actions
114 lines (85 loc) · 3.48 KB
/
fix_mirror.py
File metadata and controls
114 lines (85 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
import re
import sys
from pathlib import Path
from urllib.parse import urlparse
# Root of the mirrored site tree: first CLI argument, else the current
# directory.  NOTE: evaluated at import time, so sys.argv is consulted
# even if this module is imported rather than run as a script.
ROOT_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
# Only files with these suffixes are scanned and rewritten.
TARGET_EXTENSIONS = {'.html', '.css', '.js'}
# Broken image-URL prefix found in mirrored pages, and the working
# remote prefix it corresponds to.
UPLOAD_BROKEN = "https://upload.cppreference.com/mwiki/images/"
UPLOAD_CORRECT = "https://upload.cppreference.com/images/"
# Directory, relative to the mirror root, where images live locally.
UPLOAD_LOCAL_DIR = "upload.cppreference.com/images/"
# Suffixes treated as image files when normalising URLs.
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico")
def local_prefix(root, file_path):
    """Return the relative prefix ("../" chain, or "./") that climbs from
    *file_path*'s directory back up to *root*."""
    levels_up = len(file_path.relative_to(root).parts) - 1
    if levels_up <= 0:
        return "./"
    return "../" * levels_up
def clean_url(url):
    """Normalise a matched URL.

    Trailing punctuation is dropped; for image URLs the cache-busting
    suffix and fragment are removed as well.  Non-image URLs are
    returned with only the punctuation stripped.
    """
    stripped = url.rstrip('"\'),;')
    pieces = urlparse(stripped)
    bare_path = strip_image_cache_suffix(pieces.path)
    if not bare_path.lower().endswith(IMAGE_EXTENSIONS):
        return stripped
    return pieces._replace(path=bare_path, fragment='').geturl()
def strip_image_cache_suffix(path):
    """Drop a cache-busting suffix ("@..." or "%40...") that follows an
    image extension, e.g. "logo.png@2x" -> "logo.png"."""
    cache_suffix = re.compile(
        r'(\.(?:png|jpe?g|gif|svg|webp|ico))(?:@|%40)[^/\\?#]*$',
        re.IGNORECASE,
    )
    return cache_suffix.sub(r'\1', path)
def local_path_for_url(url):
    """Map an absolute URL to its on-disk mirror path: host followed by
    the path, with any image cache suffix removed."""
    pieces = urlparse(url)
    return pieces.netloc + strip_image_cache_suffix(pieces.path)
def rewrite_upload_urls(content, root, file_path, urls_to_download):
    """Rewrite upload.cppreference.com image URLs in *content* to local
    relative paths, recording the corrected remote URLs for download.

    *urls_to_download* is mutated in place; the rewritten text is
    returned.
    """
    local_images = local_prefix(root, file_path) + UPLOAD_LOCAL_DIR
    # Normalise the broken prefix to the working one first, so a single
    # pattern finds every remote image URL that must be fetched.
    normalised = content.replace(UPLOAD_BROKEN, UPLOAD_CORRECT)
    for found in re.findall(
            r'https://upload\.cppreference\.com/images/[^"\'<>\s)]+',
            normalised):
        urls_to_download.add(clean_url(found))
    # Point both prefix variants at the local mirror directory.
    rewritten = content.replace(UPLOAD_BROKEN, local_images)
    return rewritten.replace(UPLOAD_CORRECT, local_images)
def rewrite_vendor_urls(content, root, file_path, urls_to_download):
    """Rewrite cdn.jsdelivr.net URLs in *content* to local relative
    paths, recording each remote URL in *urls_to_download*.

    URLs containing static.cloudflareinsights.com are left untouched
    (kept remote).  Returns the rewritten text.
    """
    prefix = local_prefix(root, file_path)

    def replace(match):
        raw = match.group(0)
        # BUG FIX: the match pattern permits trailing ',' or ';', which
        # clean_url strips.  re.sub replaces the WHOLE match with this
        # callback's return value, so the original code silently deleted
        # those surrounding-markup characters from the file.  Preserve
        # them by re-appending whatever rstrip would remove.
        kept = raw.rstrip('"\'),;')
        trailing = raw[len(kept):]
        url = clean_url(raw)
        if "static.cloudflareinsights.com" in url:
            # BUG FIX: return the raw match, not the cleaned URL, so the
            # "leave it alone" branch truly leaves the content unchanged.
            return raw
        urls_to_download.add(url)
        return prefix + local_path_for_url(url) + trailing

    content = re.sub(
        r'https://cdn\.jsdelivr\.net/[^"\'<>\s)]+',
        replace, content)
    return content
def main():
    """Scan the mirror tree, rewrite remote asset URLs to local relative
    paths, and write out the list of remote assets still to download."""
    root = ROOT_DIR.resolve()
    targets = [
        path for path in root.rglob('*')
        if path.is_file() and path.suffix.lower() in TARGET_EXTENSIONS
    ]
    pending_urls = set()
    changed = 0
    print(f"Scanning {len(targets)} files...")
    for path in targets:
        try:
            # errors='ignore' tolerates stray non-UTF-8 bytes in scraped
            # pages rather than aborting the whole run.
            original = path.read_text(encoding='utf-8', errors='ignore')
        except OSError as exc:
            print(f"Skipping {path}: {exc}")
            continue
        updated = rewrite_upload_urls(original, root, path, pending_urls)
        updated = rewrite_vendor_urls(updated, root, path, pending_urls)
        if updated != original:
            path.write_text(updated, encoding='utf-8')
            changed += 1
    print(f"Modified {changed} files with local relative paths.")
    if not pending_urls:
        print("No missing asset URLs found.")
        return
    url_file = root / "urls_to_download.txt"
    url_file.write_text(
        "\n".join(sorted(pending_urls)) + "\n", encoding='utf-8')
    print(f"\nWrote {len(pending_urls)} asset URLs to {url_file}")
    print("\nRun this command to download missing assets:")
    print(f" wget --force-directories --trust-server-names -i {url_file}")


if __name__ == "__main__":
    main()