-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathunicode_sortkey_adder.py
More file actions
154 lines (121 loc) · 4.8 KB
/
unicode_sortkey_adder.py
File metadata and controls
154 lines (121 loc) · 4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
=========================================================
Unicode Category Sortkey Adder for Wikimedia Commons
=========================================================
[ SETUP & USAGE ]
1. Install reqs: pip install pywikibot
2. Login config: Ensure 'user-config.py' is set up in your working directory.
3. Purpose : Automatically extracts the Unicode hex value from each filename
(e.g., U+1F600.svg), zero-pads it to 6 uppercase hex digits
(e.g., 01F600), and adds it as the sortkey of every category
link in the file's wikitext that does not already have one.
4. Run : Set TARGET_CATEGORY below, then run the script.
=========================================================
"""
import pywikibot
import re
import os
from pywikibot import pagegenerators
# ================= CONFIGURATION =================
# 1. Target Category
#    Full title (including the "Category:" prefix) of the category whose
#    members are processed. Replace the placeholder before running.
TARGET_CATEGORY = "Category:YOUR_FONT_OR_GLYPH_CATEGORY_HERE"
# 2. Log File (To remember processed files and prevent duplicate work)
#    Plain text, one page title per line; a relative path is resolved against
#    the current working directory.
LOG_FILE = "sortkey_processed_log.txt"
# 3. Filename Pattern (Matches standard Unicode files: U+XXXX.svg, U+XXXXX.png, etc.)
#    Group 1 captures the 4-6 hex digits (either case) between "U+" and the
#    file extension. The pattern is anchored only at the end of the title, so
#    when it is used with .search() any text may precede "U+".
FILENAME_PATTERN = re.compile(r"U\+([0-9A-Fa-f]{4,6})\.[a-zA-Z0-9]+$")
# 4. Edit Summary
#    Base text of the edit summary used when a page is saved.
EDIT_SUMMARY = "Adding 6-digit Unicode sortkey to categories for proper block order"
# 5. Advanced Controls
# Set DRY_RUN = True to test regex and logic without saving to the Wiki
DRY_RUN = False
# =================================================
# Initialize Site
# NOTE: this runs at import time, so importing the module already requires a
# working pywikibot configuration. Both arguments are 'commons'
# (presumably site code and family -- confirm against pywikibot.Site).
site = pywikibot.Site('commons', 'commons')
def load_history(log_file=None):
    """Load the titles recorded by previous runs.

    Args:
        log_file: Path of the log to read. When omitted, the module-level
            ``LOG_FILE`` is used.

    Returns:
        A set with one entry per non-blank line of the log, surrounding
        whitespace stripped. The set is empty when the log does not exist.
    """
    path = LOG_FILE if log_file is None else log_file
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Drop blank lines so an empty string never ends up in the set.
            return {entry for entry in (line.strip() for line in f) if entry}
    except FileNotFoundError:
        # First run: nothing has been processed yet.
        return set()
def append_history(title):
    """Append *title* as a new line at the end of the log file (LOG_FILE)."""
    with open(LOG_FILE, mode="a", encoding="utf-8") as log:
        # One title per line, matching what load_history reads back.
        log.write("{}\n".format(title))
def compute_sortkey(hexstr):
    """Return the hex string as an uppercase sortkey zero-padded to 6 digits (e.g., A9 -> 0000A9)."""
    codepoint = int(hexstr, base=16)
    # "06X": uppercase hexadecimal, left-padded with zeros to width 6.
    return format(codepoint, "06X")
# Matches a category link without a pipe, e.g. [[Category:Name]], and captures
# everything between the brackets. Links that already carry a pipe
# ([[Category:Name|Sortkey]]) and colon-prefixed links ([[:Category:Name]])
# do not match. The namespace prefix is matched case-insensitively and may be
# surrounded by whitespace, as MediaWiki accepts both.
_UNPIPED_CATEGORY_RE = re.compile(
    r"\[\[(\s*category\s*:[^\]|]+)\]\]",
    re.IGNORECASE,
)


def add_sortkey_to_text(text, sortkey):
    """Append *sortkey* to every category link in *text* that has none.

    Args:
        text: Wikitext to transform.
        sortkey: Sortkey inserted after a pipe in each matching link.

    Returns:
        The wikitext with each ``[[Category:Name]]`` rewritten as
        ``[[Category:Name|sortkey]]``. Links that already contain a pipe are
        left untouched, so the result equals *text* when nothing matched.
    """

    def repl(match):
        # A replacement function (rather than a template string) keeps any
        # backslashes in the sortkey from being read as regex escapes.
        return f"[[{match.group(1)}|{sortkey}]]"

    return _UNPIPED_CATEGORY_RE.sub(repl, text)
def process_page(page, processed_set):
    """Add the Unicode sortkey to the category links of a single page.

    The sortkey is derived from the page title via FILENAME_PATTERN and
    compute_sortkey. Pages whose title does not match are skipped without
    being logged. Pages that need no change, and pages saved successfully,
    are appended to the log file and added to *processed_set*.

    Args:
        page: Page object providing ``title()``, ``get()`` and ``put()``.
        processed_set: Set of already-handled full titles; updated in place.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    title = page.title(with_ns=False)

    # 1. Validate filename pattern
    m = FILENAME_PATTERN.search(title)
    if not m:
        print(f"[SKIP] Invalid Name Pattern (Not U+XXXX): {title}")
        return

    # 2. Generate Sortkey
    sortkey = compute_sortkey(m.group(1))
    print(f"\nProcessing: {title} (Target Sortkey: {sortkey})")

    # The exception classes are taken from pywikibot.exceptions; the older
    # top-level names (pywikibot.NoPage, pywikibot.LockedPage) are not
    # available in current Pywikibot releases, and referencing them inside an
    # ``except`` clause would itself raise AttributeError.
    try:
        text = page.get()
    except pywikibot.exceptions.NoPageError:
        print(f"[ERROR] Page does not exist: {title}")
        return
    except pywikibot.exceptions.IsRedirectPageError:
        # A redirect has no category wikitext of its own to edit.
        print(f"[SKIP] Page is a redirect: {title}")
        return

    # 3. Modify text
    new_text = add_sortkey_to_text(text, sortkey)

    # The log and the in-memory set are keyed by the full title (with
    # namespace), which is what main() compares against.
    full_title = page.title()

    # 4. Check if changes were actually made
    if new_text == text:
        print("[SKIP] No changes needed (Sortkeys already exist or no categories found).")
        append_history(full_title)
        processed_set.add(full_title)
        return

    # 5. Save the page
    if DRY_RUN:
        # Not logged on purpose: a later real run must still edit this page.
        print(f"[-] DRY RUN: Would have saved '{title}' with summary: {EDIT_SUMMARY}")
        return

    try:
        page.put(new_text, summary=f"{EDIT_SUMMARY} ({sortkey})")
    except pywikibot.exceptions.LockedPageError:
        print(f"[SKIP] Page is Locked: {title}")
    except Exception as e:
        # Per-page boundary: report the failure and let the run continue.
        print(f"[ERROR] Could not save {title}: {e}")
    else:
        # Kept outside the try body so a failure to write the log is not
        # misreported as a failed save.
        print(f"[SUCCESS] Saved: {title}")
        append_history(full_title)
        processed_set.add(full_title)
def main():
    """Log in and process every not-yet-logged file of TARGET_CATEGORY.

    Returns early, after printing the error, when the login fails. Pages
    whose full title is already in the history log are skipped; every other
    page is handed to process_page.
    """
    try:
        site.login()
        print(f"Logged in as: {site.user()}")
    except Exception as e:
        # Top-level boundary: without a login nothing else can work.
        print(f"Login Failed: {e}")
        return

    # Load history
    processed_set = load_history()
    print(f"Loaded {len(processed_set)} previously processed files from log.")

    # Setup generator for the target category
    print(f"Fetching files from: {TARGET_CATEGORY}...")
    cat = pywikibot.Category(site, TARGET_CATEGORY)
    gen = pagegenerators.CategorizedPageGenerator(cat, namespaces=[6])  # Namespace 6 = File

    for page in gen:
        # Skip if already in log
        if page.title() in processed_set:
            continue
        process_page(page, processed_set)

    if DRY_RUN:
        print("\n✅ DRY RUN complete. No actual edits were made.")
    else:
        print("\n✅ Done! All available files in the category have been processed.")
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()