-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathChineseCharacterParser.py
More file actions
65 lines (55 loc) · 2.52 KB
/
ChineseCharacterParser.py
File metadata and controls
65 lines (55 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import csv
# Import the hanzipy decomposer and build one module-level instance;
# main() calls decomposer.decompose() on every input character.
from hanzipy.decomposer import HanziDecomposer
decomposer = HanziDecomposer()
# Import the hanzipy dictionary and build one module-level instance.
# NOTE(review): `dictionary` is not referenced by the code visible in this file.
from hanzipy.dictionary import HanziDictionary
dictionary = HanziDictionary()
def load_unihan_data(filename):
    """Parse the JSON file at ``filename`` and return the decoded object.

    The file is opened as UTF-8 text and handed to ``json.load``; the
    result is returned unchanged.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def find_primary_radical(data, kRSUnicode):
    """Look up the entry in ``data`` that stands for a character's radical.

    The part of ``kRSUnicode`` before the first ``.`` is combined with
    ``.0`` to form a radical code, and ``data`` is scanned for the first entry
    whose own ``'kRSUnicode'`` value equals that code.

    Args:
        data: Mapping of character -> details dict. A details dict may or
            may not contain a ``'kRSUnicode'`` key.
        kRSUnicode: The radical-stroke string of the character being
            looked up (e.g. ``'9.5'``). May be empty when the character
            has no such value.

    Returns:
        A ``(char, details)`` tuple for the first matching entry in
        iteration order, or ``(None, None)`` when ``kRSUnicode`` is empty
        or nothing matches.
    """
    # Nothing to look up; avoids scanning the whole table for the bogus code '.0'.
    if not kRSUnicode:
        return None, None

    radical_code = kRSUnicode.split('.')[0] + '.0'
    for char, details in data.items():
        # .get(): an entry without the field simply cannot match; it must
        # not abort the whole lookup with a KeyError.
        if details.get('kRSUnicode') == radical_code:
            return char, details
    return None, None
def main(unihan_path='UnihanLite.json', input_path='input.txt', output_path='output.csv'):
    """Build a CSV describing each input character and its radicals.

    For every non-blank line of the input file (one character per line)
    this writes one base row with the character's Unihan fields, its
    primary radical and the hanzipy decomposition, followed by one row per
    radical component numbered ``<n>a``, ``<n>b``, ...

    Args:
        unihan_path: JSON file mapping characters to their Unihan fields.
        input_path: UTF-8 text file with one character per line.
        output_path: CSV file to create (overwritten if it exists).
    """
    unihan_data = load_unihan_data(unihan_path)

    # Load characters from the input file. Blank lines (e.g. a trailing
    # newline at the end of the file) are skipped so they are never passed
    # to the decomposer or written out as empty rows.
    with open(input_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        input_characters = [entry for entry in stripped_lines if entry]

    # Prepare to write to the CSV
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['number', 'character', 'Mandarin', 'Definition', 'primaryRadical',
                      'radicalMandarin', 'hanzipyStrokes', 'hanzipyRadicals']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for idx, char in enumerate(input_characters, 1):
            char_data = unihan_data.get(char, {})
            # NOTE(review): assumes decompose() returns a mapping with
            # 'graphical' and 'radical' lists of strings - confirm against hanzipy.
            decomposition = decomposer.decompose(char)

            # Find primary radical
            primary_radical_char, primary_radical_data = find_primary_radical(
                unihan_data, char_data.get('kRSUnicode', ''))

            # Write base information
            writer.writerow({
                'number': idx,
                'character': char,
                'Mandarin': char_data.get('kMandarin', ''),
                'Definition': char_data.get('kDefinition', ''),
                'primaryRadical': primary_radical_char,
                'radicalMandarin': primary_radical_data.get('kMandarin', '') if primary_radical_data else '',
                'hanzipyStrokes': ', '.join(decomposition['graphical']),
                # This column is declared in the header; fill it instead of
                # leaving it permanently empty.
                'hanzipyRadicals': ', '.join(decomposition['radical']),
            })

            # Write radicals; columns not listed here are left empty by DictWriter.
            for radical_idx, radical in enumerate(decomposition['radical'], 1):
                radical_data = unihan_data.get(radical, {})
                writer.writerow({
                    # Generates '1a', '1b', etc. (suffixes past 'z' are not letters).
                    'number': f'{idx}{chr(96 + radical_idx)}',
                    'character': radical,
                    'Mandarin': radical_data.get('kMandarin', ''),
                    'Definition': radical_data.get('kDefinition', ''),
                })
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()