|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# Copyright (c) 2026 SandAI. All Rights Reserved. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +""" |
| 18 | +Check for Chinese characters in source code. |
| 19 | +
|
| 20 | +Two modes: |
| 21 | + - CI mode : set env vars BASE_REF and HEAD_REF to only check the PR diff. |
| 22 | + - Local mode: run without those env vars to scan every tracked file in the repo. |
| 23 | +
|
| 24 | +Usage: |
| 25 | + python3 .github/workflows/check_chinese_chars.py # scan entire repo |
| 26 | + BASE_REF=main HEAD_REF=HEAD python3 ... # scan diff only |
| 27 | +""" |
| 28 | + |
| 29 | +import os |
| 30 | +import re |
| 31 | +import subprocess |
| 32 | +import sys |
| 33 | +from typing import List, Tuple |
| 34 | + |
# Matches any single character from the Unicode blocks treated as "Chinese"
# by this check. The ranges are concatenated into one regex character class.
CHINESE_CHAR_PATTERN = re.compile(
    "[{}]".format(
        "".join(
            (
                "\u4e00-\u9fff",  # CJK Unified Ideographs
                "\u3400-\u4dbf",  # CJK Unified Ideographs Extension A
                "\uf900-\ufaff",  # CJK Compatibility Ideographs
                "\u3000-\u303f",  # CJK Symbols and Punctuation
                "\uff01-\uff5e",  # Fullwidth ASCII variants
            )
        )
    )
)
| 44 | + |
# Lower-case file extensions (including the leading dot) whose files are
# skipped by the scanner instead of being searched for Chinese characters.
BINARY_EXTENSIONS = frozenset(
    (
        # Images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg", ".webp",
        # Audio / video
        ".mp3", ".mp4", ".wav", ".avi", ".mov", ".mkv",
        # Archives
        ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z",
        # Executables and shared libraries
        ".bin", ".exe", ".dll", ".so", ".dylib",
        # Model weights and serialized objects
        ".pt", ".pth", ".onnx", ".safetensors", ".pickle", ".pkl",
        # Documents
        ".pdf",
        # Fonts
        ".woff", ".woff2", ".ttf", ".otf", ".eot",
        # Build artifacts
        ".pyc", ".o", ".a",
        # Profiler reports and NumPy arrays
        ".nsys-rep", ".npz", ".npy",
    )
)
| 92 | + |
| 93 | + |
def _is_binary(path: str) -> bool:
    """Return True if the extension of *path* is listed in BINARY_EXTENSIONS.

    The path is lower-cased before the extension is split off, so the
    comparison is case-insensitive.
    """
    suffix = os.path.splitext(path.lower())[1]
    return suffix in BINARY_EXTENSIONS
| 97 | + |
| 98 | + |
| 99 | +# --------------------------------------------------------------------------- |
| 100 | +# CI mode: only check newly added / modified lines in the PR diff |
| 101 | +# --------------------------------------------------------------------------- |
| 102 | + |
| 103 | + |
def _path_from_new_file_header(header: str) -> str:
    """Extract the file path from a ``+++ b/<path>`` diff header line."""
    path = header[4:]
    # git appends a TAB after the name when the path contains a space.
    path = path.rstrip("\t")
    # Paths with special characters are still wrapped in double quotes even
    # with core.quotepath=off; drop the quotes (escape sequences inside are
    # left as git printed them).
    if len(path) >= 2 and path[0] == '"' and path[-1] == '"':
        path = path[1:-1]
    return path[2:] if path.startswith("b/") else path


def _check_diff(base_sha: str, head_sha: str) -> List[Tuple[str, int, str]]:
    """Find added lines between two revisions that contain Chinese characters.

    Both arguments are resolved with ``git rev-parse`` and then compared with
    ``git diff -U0 --diff-filter=ACM``, so only added, copied and modified
    files are inspected and only their added lines are searched.

    Args:
        base_sha: Revision to diff from (anything ``git rev-parse`` accepts).
        head_sha: Revision to diff to (anything ``git rev-parse`` accepts).

    Returns:
        A list of ``(path, line_number, content)`` tuples, one per offending
        added line. ``line_number`` is the 1-based line number on the new
        side of the diff and ``content`` is the line without its ``+`` marker.

    Raises:
        subprocess.CalledProcessError: If any git command exits non-zero.
    """
    base_sha = subprocess.check_output(["git", "rev-parse", base_sha], text=True).strip()
    head_sha = subprocess.check_output(["git", "rev-parse", head_sha], text=True).strip()

    print(f"[CI mode] Checking diff between {base_sha[:8]} and {head_sha[:8]} ...")

    result = subprocess.run(
        # core.quotepath=off keeps non-ASCII file names readable instead of
        # octal-escaped and quoted, so such files are no longer skipped.
        ["git", "-c", "core.quotepath=off", "diff", "-U0", "--diff-filter=ACM", base_sha, head_sha],
        capture_output=True,
        # Decode explicitly as UTF-8 and replace undecodable bytes so that a
        # non-UTF-8 file in the diff cannot crash the whole check.
        encoding="utf-8",
        errors="replace",
        check=True,
    )

    findings: List[Tuple[str, int, str]] = []
    current_file = None
    line_num = 0
    # True once the first "@@" of the current file has been seen. Header
    # lines ("+++", "---", "index", ...) only occur before that point, so a
    # "+++"-prefixed line inside a hunk is real added content.
    in_hunk = False

    for line in result.stdout.split("\n"):
        if line.startswith("diff --git"):
            current_file = None
            in_hunk = False
            continue
        if line.startswith("@@"):
            match = re.search(r"\+(\d+)", line)
            if match:
                # The first added line of the hunk bumps this to the start line.
                line_num = int(match.group(1)) - 1
            in_hunk = True
            continue
        if not in_hunk:
            if line.startswith("+++ "):
                current_file = _path_from_new_file_header(line)
            continue
        if line.startswith("+"):
            line_num += 1
            content = line[1:]
            if current_file and not _is_binary(current_file) and CHINESE_CHAR_PATTERN.search(content):
                findings.append((current_file, line_num, content))
        elif line.startswith(" "):
            # Context line (none are expected with -U0, handled for safety).
            line_num += 1
        # Removed lines ("-") and "\ No newline at end of file" markers do
        # not exist on the new side and therefore do not advance line_num.

    return findings
| 139 | + |
| 140 | + |
| 141 | +# --------------------------------------------------------------------------- |
| 142 | +# Local mode: scan every tracked file in the repo |
| 143 | +# --------------------------------------------------------------------------- |
| 144 | + |
| 145 | + |
def _check_all_files() -> List[Tuple[str, int, str]]:
    """Scan every file listed by ``git ls-files`` for Chinese characters.

    Paths whose extension is in BINARY_EXTENSIONS, and paths that are not
    regular files on disk, are skipped. Files are read as UTF-8 with
    undecodable bytes ignored; files that cannot be opened or read are
    skipped silently.

    Returns:
        A list of ``(path, line_number, content)`` tuples, one per offending
        line. ``line_number`` is 1-based and ``content`` has its trailing
        newline removed.

    Raises:
        subprocess.CalledProcessError: If ``git ls-files`` exits non-zero.
    """
    print("[Local mode] Scanning all tracked files for Chinese characters ...")

    # -z emits NUL-terminated, unquoted paths. Without it git C-quotes
    # non-ASCII file names, the quoted string is not a real path, and such
    # files would be silently skipped by the isfile() check below.
    listing = subprocess.check_output(["git", "ls-files", "-z"], text=True)

    findings: List[Tuple[str, int, str]] = []
    for filepath in listing.split("\0"):
        if not filepath or _is_binary(filepath) or not os.path.isfile(filepath):
            continue
        try:
            with open(filepath, encoding="utf-8", errors="ignore") as fh:
                for line_num, line in enumerate(fh, start=1):
                    if CHINESE_CHAR_PATTERN.search(line):
                        findings.append((filepath, line_num, line.rstrip("\n")))
        except OSError:
            # Best effort: unreadable files are skipped. Decoding cannot
            # fail here because errors="ignore" drops invalid bytes.
            continue

    return findings
| 164 | + |
| 165 | + |
| 166 | +# --------------------------------------------------------------------------- |
| 167 | +# Entry point |
| 168 | +# --------------------------------------------------------------------------- |
| 169 | + |
| 170 | + |
| 171 | +def _report(findings: List[Tuple[str, int, str]], is_ci: bool) -> None: |
| 172 | + if not findings: |
| 173 | + print("\nNo Chinese characters found.") |
| 174 | + return |
| 175 | + |
| 176 | + print(f"\nFound {len(findings)} line(s) containing Chinese characters:\n") |
| 177 | + for filepath, line_no, content in findings: |
| 178 | + stripped = content.strip() |
| 179 | + print(f" {filepath}:{line_no}: {stripped}") |
| 180 | + if is_ci: |
| 181 | + print(f"::error file={filepath},line={line_no}::Chinese character detected: {stripped}") |
| 182 | + |
| 183 | + print(f"\n{len(findings)} occurrence(s) total. Please remove Chinese characters from your code.") |
| 184 | + |
| 185 | + |
def main():
    """Run the check in CI or local mode and exit with status 1 on findings.

    CI mode is selected when both the BASE_REF and HEAD_REF environment
    variables are set to non-empty values; otherwise every tracked file is
    scanned.
    """
    base = os.environ.get("BASE_REF")
    head = os.environ.get("HEAD_REF")
    ci_mode = bool(base and head)

    results = _check_diff(base, head) if ci_mode else _check_all_files()
    _report(results, ci_mode)

    if not results:
        return
    sys.exit(1)
| 200 | + |
| 201 | + |
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments