-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
141 lines (108 loc) · 4.61 KB
/
main.py
File metadata and controls
141 lines (108 loc) · 4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import sys
from typing import Dict, Optional, Tuple
class Converter:
    """Dictionary-driven converter between Taigi (台文) and Chinese (華文) text.

    The dictionary file is a JSON object. Its optional ``dict_version`` key
    defaults to 0 and its optional ``entries`` key defaults to an empty list.
    Every entry must provide the keys ``台文`` and ``華文`` and may provide
    ``權重`` (weight, defaulting to 0.0).
    """

    # Score contributed by each character of a matched key; it makes longer
    # matches win when their base weights are comparable.
    LENGTH_WEIGHT_PER_CHAR = 0.1

    def __init__(self, dict_path: str):
        """Load the dictionary at *dict_path* and build both lookup tables.

        Args:
            dict_path: Path of the UTF-8 encoded JSON dictionary file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If an entry lacks the ``台文`` or ``華文`` key.
        """
        with open(dict_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.dict_version = data.get('dict_version', 0)
        self.entries = data.get('entries', [])
        self.tw_to_zh = {}  # Taigi -> [(Chinese, weight), ...]
        self.zh_to_tw = {}  # Chinese -> [(Taigi, weight), ...]

        for entry in self.entries:
            tw = entry['台文']
            cn = entry['華文']
            weight = entry.get('權重', 0.0)
            # Candidates keep file order; on equal scores the earliest wins.
            self.tw_to_zh.setdefault(tw, []).append((cn, weight))
            self.zh_to_tw.setdefault(cn, []).append((tw, weight))

    def calculate_weight(self, text: str, base_weight: float) -> float:
        """Return the score of a candidate match.

        Args:
            text: The matched source substring.
            base_weight: The weight stored with the dictionary entry.

        Returns:
            ``len(text) * LENGTH_WEIGHT_PER_CHAR + base_weight``.
        """
        length_weight = len(text) * self.LENGTH_WEIGHT_PER_CHAR
        return length_weight + base_weight

    def find_best_match(
        self,
        text: str,
        pos: int,
        dictionary: Dict,
        max_length: Optional[int] = None,
    ) -> Tuple[Optional[str], Optional[str], int]:
        """Find the highest-scoring dictionary key that starts at *pos*.

        Args:
            text: The text being converted.
            pos: Index in *text* at which candidate substrings start.
            dictionary: Mapping of source string to a list of
                ``(replacement, weight)`` pairs.
            max_length: Optional upper bound on the candidate length. Passing
                the length of the longest key avoids probing substrings that
                cannot be in the dictionary. ``None`` scans to the end of
                *text*.

        Returns:
            ``(match, replacement, length)`` for the best candidate, or
            ``(None, None, 0)`` when no key matches at *pos*. Ties keep the
            first candidate found (shortest substring first, then list order).
        """
        remaining = len(text) - pos
        limit = remaining if max_length is None else min(remaining, max_length)

        best_match = None
        best_replacement = None
        best_weight = float('-inf')
        best_length = 0

        for length in range(1, limit + 1):
            substring = text[pos:pos + length]
            for replacement, base_weight in dictionary.get(substring, ()):
                total_weight = self.calculate_weight(substring, base_weight)
                # Strict comparison: an equal score never displaces an
                # earlier candidate.
                if total_weight > best_weight:
                    best_weight = total_weight
                    best_match = substring
                    best_replacement = replacement
                    best_length = length

        return best_match, best_replacement, best_length

    def convert(self, text: str, mode: str) -> str:
        """Convert *text* by greedy left-to-right dictionary replacement.

        Args:
            text: The text to convert.
            mode: ``'t2c'`` (Taigi to Chinese) or ``'c2t'`` (Chinese to Taigi).

        Returns:
            The converted text. Characters with no dictionary match at their
            position are copied through unchanged.

        Raises:
            ValueError: If *mode* is neither ``'t2c'`` nor ``'c2t'``.
        """
        if mode == 't2c':
            dictionary = self.tw_to_zh
        elif mode == 'c2t':
            dictionary = self.zh_to_tw
        else:
            raise ValueError(f"你欲做啥?不支援的模式: {mode}")

        # No substring longer than the longest key can match, so cap the scan
        # once here instead of probing to the end of the text at every
        # position. Non-string keys can never equal a substring; skip them.
        max_key_length = max(
            (len(key) for key in dictionary if isinstance(key, str)),
            default=0,
        )

        result = []
        pos = 0
        text_length = len(text)
        while pos < text_length:
            match, replacement, length = self.find_best_match(
                text, pos, dictionary, max_key_length
            )
            if match is not None:
                result.append(replacement)
                pos += length
            else:
                result.append(text[pos])
                pos += 1
        return ''.join(result)
def main():
    """Command-line entry point: parse arguments, convert, and emit the result.

    Input comes from exactly one of ``--text`` or ``--file``. The converted
    text is written to ``--output`` when given, otherwise printed to stdout.
    Any failure is reported on stderr and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='TaigiCC - Taigi-Chinese Converter',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
範例:
%(prog)s --mode t2c --text "台文內容" --dict dict.json
%(prog)s --mode c2t --file input.txt --dict dict.json --output output.txt
"""
    )
    parser.add_argument('--mode', '-m', required=True, choices=['t2c', 'c2t'],
                        help='轉換模式: t2c (台文到華文) 或 c2t (華文到台文)')
    parser.add_argument('--dict', '-d', required=True,
                        help='字典檔案路徑 (JSON 格式)')

    # Exactly one input source must be supplied.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('--text', '-t',
                             help='直接輸入要轉換的文字')
    input_group.add_argument('--file', '-f',
                             help='輸入檔案路徑')

    parser.add_argument('--output', '-o',
                        help='輸出檔案路徑 (不指定則輸出到 stdout)')
    args = parser.parse_args()

    try:
        converter = Converter(args.dict)

        # Compare against None rather than truthiness: `--text ""` is a valid
        # (empty) input and must not fall through to opening args.file, which
        # is None in that case.
        if args.text is not None:
            input_text = args.text
        else:
            with open(args.file, 'r', encoding='utf-8') as f:
                input_text = f.read()

        output_text = converter.convert(input_text, args.mode)

        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(output_text)
            # Status goes to stderr so stdout stays clean for piping.
            print(f"轉換完成,已輸出到: {args.output}", file=sys.stderr)
        else:
            print(output_text)
    except FileNotFoundError as e:
        print(f"錯誤: 檔案不存在 - {e}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"錯誤: JSON 格式錯誤 - {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything else and exit non-zero.
        print(f"錯誤: {e}", file=sys.stderr)
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()