-
Notifications
You must be signed in to change notification settings - Fork 181
Expand file tree
/
Copy path: fix_mirror.py
More file actions
114 lines (85 loc) · 3.48 KB
/
fix_mirror.py
File metadata and controls
114 lines (85 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
import re
import sys
from pathlib import Path
from urllib.parse import urlparse
# Root of the mirrored site tree: first CLI argument, else the current
# directory.  NOTE: evaluated at import time, so sys.argv is consulted
# even if this module is imported rather than run as a script.
ROOT_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
# Only files with these suffixes are scanned and rewritten.
TARGET_EXTENSIONS = {'.html', '.css', '.js'}
# Broken image-URL prefix found in mirrored pages, and the working
# remote prefix it corresponds to.
UPLOAD_BROKEN = "https://upload.cppreference.com/mwiki/images/"
UPLOAD_CORRECT = "https://upload.cppreference.com/images/"
# Directory, relative to the mirror root, where images live locally.
UPLOAD_LOCAL_DIR = "upload.cppreference.com/images/"
# Suffixes treated as image files when normalising URLs.
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico")
def local_prefix(root, file_path):
    """Return the relative prefix ("../" chain, or "./") that climbs from
    *file_path*'s directory back up to *root*."""
    levels_up = len(file_path.relative_to(root).parts) - 1
    if levels_up <= 0:
        return "./"
    return "../" * levels_up
def clean_url(url):
    """Normalise a matched URL.

    Trailing punctuation is dropped; for image URLs the cache-busting
    suffix and fragment are removed as well.  Non-image URLs are
    returned with only the punctuation stripped.
    """
    stripped = url.rstrip('"\'),;')
    pieces = urlparse(stripped)
    bare_path = strip_image_cache_suffix(pieces.path)
    if not bare_path.lower().endswith(IMAGE_EXTENSIONS):
        return stripped
    return pieces._replace(path=bare_path, fragment='').geturl()
def strip_image_cache_suffix(path):
    """Drop a cache-busting suffix ("@..." or "%40...") that follows an
    image extension, e.g. "logo.png@2x" -> "logo.png"."""
    cache_suffix = re.compile(
        r'(\.(?:png|jpe?g|gif|svg|webp|ico))(?:@|%40)[^/\\?#]*$',
        re.IGNORECASE,
    )
    return cache_suffix.sub(r'\1', path)
def local_path_for_url(url):
    """Map an absolute URL to its on-disk mirror path: host followed by
    the path, with any image cache suffix removed."""
    pieces = urlparse(url)
    return pieces.netloc + strip_image_cache_suffix(pieces.path)
def rewrite_upload_urls(content, root, file_path, urls_to_download):
    """Rewrite upload.cppreference.com image URLs in *content* to local
    relative paths, recording the corrected remote URLs for download.

    *urls_to_download* is mutated in place; the rewritten text is
    returned.
    """
    local_images = local_prefix(root, file_path) + UPLOAD_LOCAL_DIR
    # Normalise the broken prefix to the working one first, so a single
    # pattern finds every remote image URL that must be fetched.
    normalised = content.replace(UPLOAD_BROKEN, UPLOAD_CORRECT)
    for found in re.findall(
            r'https://upload\.cppreference\.com/images/[^"\'<>\s)]+',
            normalised):
        urls_to_download.add(clean_url(found))
    # Point both prefix variants at the local mirror directory.
    rewritten = content.replace(UPLOAD_BROKEN, local_images)
    return rewritten.replace(UPLOAD_CORRECT, local_images)
def rewrite_vendor_urls(content, root, file_path, urls_to_download):
    """Rewrite cdn.jsdelivr.net URLs in *content* to local relative
    paths, recording each remote URL in *urls_to_download*.

    URLs containing static.cloudflareinsights.com are left untouched
    (kept remote).  Returns the rewritten text.
    """
    prefix = local_prefix(root, file_path)

    def replace(match):
        raw = match.group(0)
        # BUG FIX: the match pattern permits trailing ',' or ';', which
        # clean_url strips.  re.sub replaces the WHOLE match with this
        # callback's return value, so the original code silently deleted
        # those surrounding-markup characters from the file.  Preserve
        # them by re-appending whatever rstrip would remove.
        kept = raw.rstrip('"\'),;')
        trailing = raw[len(kept):]
        url = clean_url(raw)
        if "static.cloudflareinsights.com" in url:
            # BUG FIX: return the raw match, not the cleaned URL, so the
            # "leave it alone" branch truly leaves the content unchanged.
            return raw
        urls_to_download.add(url)
        return prefix + local_path_for_url(url) + trailing

    content = re.sub(
        r'https://cdn\.jsdelivr\.net/[^"\'<>\s)]+',
        replace, content)
    return content
def main():
    """Scan the mirror tree, rewrite remote asset URLs to local relative
    paths, and write out the list of remote assets still to download."""
    root = ROOT_DIR.resolve()
    targets = [
        path for path in root.rglob('*')
        if path.is_file() and path.suffix.lower() in TARGET_EXTENSIONS
    ]
    pending_urls = set()
    changed = 0
    print(f"Scanning {len(targets)} files...")
    for path in targets:
        try:
            # errors='ignore' tolerates stray non-UTF-8 bytes in scraped
            # pages rather than aborting the whole run.
            original = path.read_text(encoding='utf-8', errors='ignore')
        except OSError as exc:
            print(f"Skipping {path}: {exc}")
            continue
        updated = rewrite_upload_urls(original, root, path, pending_urls)
        updated = rewrite_vendor_urls(updated, root, path, pending_urls)
        if updated != original:
            path.write_text(updated, encoding='utf-8')
            changed += 1
    print(f"Modified {changed} files with local relative paths.")
    if not pending_urls:
        print("No missing asset URLs found.")
        return
    url_file = root / "urls_to_download.txt"
    url_file.write_text(
        "\n".join(sorted(pending_urls)) + "\n", encoding='utf-8')
    print(f"\nWrote {len(pending_urls)} asset URLs to {url_file}")
    print("\nRun this command to download missing assets:")
    print(f" wget --force-directories --trust-server-names -i {url_file}")


if __name__ == "__main__":
    main()