diff --git a/AUTHORS.rst b/AUTHORS.rst index 2b233b5..b2a3d7a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,4 +10,12 @@ Author and Maintainer Contributors ------------ -None yet. Why not be the first? +* Tait Hoyem — HTML Formatting + +Why not be the second? :-) + +Attribution +------------ + +* Sun Jianai — FZKai-Extended font [used in pictures] +* Google — [Source Sans Pro, Normal 400](https://www.google.com/fonts#QuickUsePlace:quickUse/Family:Source+Sans+Pro) [used for Pinyin font] diff --git a/CHANGES.rst b/CHANGES.rst index 87033d4..28902de 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,6 +3,11 @@ Change Log ---------- +0.3.0 (2016-05-27) +++++++++++++++++++ + +* Added HTML Formatting. + 0.2.6 (2016-05-23) ++++++++++++++++++ diff --git a/README.rst b/README.rst index 2dfb2dd..59c34dc 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ Dragon Mapper .. image:: https://badge.fury.io/py/dragonmapper.png :target: http://badge.fury.io/py/dragonmapper - + .. image:: https://travis-ci.org/tsroten/dragonmapper.png?branch=develop :target: https://travis-ci.org/tsroten/dragonmapper @@ -22,27 +22,48 @@ Features Phonetic Alphabet. * Identify a string as Traditional or Simplified Chinese, Pinyin, Zhuyin, or the International Phonetic Alphabet. +* Output HTML of characters with Pinyin attached to them. .. code:: python + >>> from dragonmapper import hanzi >>> s = '我是一个美国人。' - >>> dragonmapper.hanzi.is_simplified(s) + >>> hanzi.is_simplified(s) True - >>> dragonmapper.hanzi.to_pinyin(s) + >>> hanzi.to_pinyin(s) 'wǒshìyīgèměiguórén。' - >>> dragonmapper.hanzi.to_pinyin(s, all_readings=True) + >>> hanzi.to_pinyin(s, all_readings=True) '[wǒ][shì/shi/tí][yī][gè/ge/gě/gàn][měi][guó][rén/ren]。' .. code:: python + >>> from dragonmapper import transcriptions as trans >>> s = 'Wǒ shì yīgè měiguórén.' 
- >>> dragonmapper.transcriptions.is_pinyin(s) + >>> trans.is_pinyin(s) True - >>> dragonmapper.transcriptions.pinyin_to_zhuyin(s) + >>> trans.pinyin_to_zhuyin(s) 'ㄨㄛˇ ㄕˋ ㄧ ㄍㄜˋ ㄇㄟˇ ㄍㄨㄛˊ ㄖㄣˊ.' - >>> dragonmapper.transcriptions.pinyin_to_ipa(s) + >>> trans.pinyin_to_ipa(s) 'wɔ˧˩˧ ʂɨ˥˩ i˥ kɤ˥˩ meɪ˧˩˧ kwɔ˧˥ ʐən˧˥.' +.. code:: python + + >>> from dragonmapper import transcriptions as trans + >>> from dragonmapper import hanzi + >>> from dragonmapper import html + >>> s = "我是加拿大人" + >>> zh = hanzi.to_zhuyin(s) + >>> pi = trans.zhuyin_to_pinyin(zh).split(' ') + >>> pi + ['wǒ', 'shì', 'jiā', 'ná', 'dà', 'rén'] + >>> h = html.to_html(s, top=pi) + >>> print(h) + +* The intermediate switch to Zhuyin is because of spacing. You can space out the characters instead. +* Note: only top is available right now, as browsers do not currently support having it elsewhere. +.. image:: https://s25.postimg.org/4s44wylcv/Screenshot_from_2016_08_03_15_59_03.png + :target: https://postimg.org/image/o9yscwiaj/ + Getting Started --------------- * `Install Dragon Mapper `_ diff --git a/docs/api.rst b/docs/api.rst index 2522e95..baaf9b6 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -234,3 +234,9 @@ lines of code. .. autofunction:: to_zhuyin .. autofunction:: to_ipa + +HTML Conversion: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Creates some HTML from the characters, and transcription systems you have. + +#.. autofunction:: to_html diff --git a/docs/index.rst b/docs/index.rst index 558b17c..a70297d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,22 +15,24 @@ functions for Chinese text processing: .. code:: python + >>> from dragonmapper import hanzi >>> s = '我是一个美国人。' - >>> dragonmapper.hanzi.is_simplified(s) + >>> hanzi.is_simplified(s) True - >>> dragonmapper.hanzi.to_pinyin(s) + >>> hanzi.to_pinyin(s) 'wǒshìyīgèměiguórén。' - >>> dragonmapper.hanzi.to_pinyin(s, all_readings=True) + >>> hanzi.to_pinyin(s, all_readings=True) '[wǒ][shì/shi/tí][yī][gè/ge/gě/gàn][měi][guó][rén/ren]。' .. 
code:: python + >>> from dragonmapper import transcriptions as trans >>> s = 'Wǒ shì yīgè měiguórén.' - >>> dragonmapper.transcriptions.is_pinyin(s) + >>> trans.is_pinyin(s) True - >>> dragonmapper.transcriptions.pinyin_to_zhuyin(s) + >>> trans.pinyin_to_zhuyin(s) 'ㄨㄛˇ ㄕˋ ㄧ ㄍㄜˋ ㄇㄟˇ ㄍㄨㄛˊ ㄖㄣˊ.' - >>> dragonmapper.transcriptions.pinyin_to_ipa(s) + >>> trans.pinyin_to_ipa(s) 'wɔ˧˩˧ ʂɨ˥˩ i˥ kɤ˥˩ meɪ˧˩˧ kwɔ˧˥ ʐən˧˥.' If this is your first time using Dragon Mapper, check out the :doc:`installation`. diff --git a/dragonmapper/data/default-style.css b/dragonmapper/data/default-style.css new file mode 100644 index 0000000..c1fdf1b --- /dev/null +++ b/dragonmapper/data/default-style.css @@ -0,0 +1,40 @@ +/** +Default CSS style for dragonmapper +**/ + +@import url(https://fonts.googleapis.com/css?family=Source+Sans+Pro); + +.hanzi{ + font-size: 2em; + line-height: 1em; + text-align: center; + vertical-align: middle; +} +.punct{ + font-size: 1.5em; + line-height: 1em; + text-align: center; + vertical-align: middle; +} +.zhuyin{ + font-size: 0.6em; + line-height: 1em; + text-align: center; + vertical-align: middle; +} +.pinyin{ + font-size: 1em; + line-height: 1em; + /** Some fonts have excess space on accented pinyin characters, + setting the font fixes this problem. 
**/ + font-family: 'Source Sans Pro', sans-serif; + text-align: center; + vertical-align: center; +} +.tone-mark{ + font-size: 1em; + text-align: center; +} +.unknown{ + visibility: collapse; +} diff --git a/dragonmapper/data/hanzi_pinyin_characters.tsv b/dragonmapper/data/hanzi_pinyin_characters.tsv index f5db59c..0873de3 100644 --- a/dragonmapper/data/hanzi_pinyin_characters.tsv +++ b/dragonmapper/data/hanzi_pinyin_characters.tsv @@ -24969,8 +24969,8 @@ 䳠 shuì/zhù 脽 shuí 𧀣 shuí -谁 shuí -誰 shuí/shéi +谁 shéi +誰 shéi/shuí 鎙 shuò 硕 shuò 𠲿 shuò diff --git a/dragonmapper/data/transcriptions.csv b/dragonmapper/data/transcriptions.csv index 6447017..9f3377c 100644 --- a/dragonmapper/data/transcriptions.csv +++ b/dragonmapper/data/transcriptions.csv @@ -65,6 +65,7 @@ dei,ㄉㄟ,teɪ den,ㄉㄣ,tən deng,ㄉㄥ,tɤŋ di,ㄉㄧ,ti +dia,ㄉㄧㄚ,tjɑ dian,ㄉㄧㄢ,tjɛn diang,ㄉㄧㄤ,tjɑŋ diao,ㄉㄧㄠ,tjɑʊ @@ -234,6 +235,7 @@ nun,ㄋㄨㄣ,nwən nuo,ㄋㄨㄛ,nwɔ nü,ㄋㄩ,ny nüe,ㄋㄩㄝ,nɥœ +o,ㄛ,wɔ ou,ㄡ,oʊ pa,ㄆㄚ,pʰa pai,ㄆㄞ,pʰaɪ @@ -368,6 +370,7 @@ ye,ㄧㄝ,jɛ yi,ㄧ,i yin,ㄧㄣ,in ying,ㄧㄥ,iŋ +yo,ㄧㄛ,jʊ yong,ㄩㄥ,yʊŋ you,ㄧㄡ,yoʊ yu,ㄩ,y diff --git a/dragonmapper/hanzi.py b/dragonmapper/hanzi.py index bc67245..be585ce 100644 --- a/dragonmapper/hanzi.py +++ b/dragonmapper/hanzi.py @@ -34,7 +34,8 @@ def _load_data(): - """Load the word and character mapping data into a dictionary. + r""" + Load the word and character mapping data into a dictionary. In the data files, each line is formatted like this: HANZI PINYIN_READING/PINYIN_READING @@ -58,8 +59,9 @@ def _load_data(): _WORDS = _HANZI_PINYIN_MAP['words'] -def _hanzi_to_pinyin(hanzi): - """Return the Pinyin reading for a Chinese word. +def _hanzi_to_pinyin(hanzi, DICT=None): + """ + Return the Pinyin reading for a Chinese word. If the given string *hanzi* matches a CC-CEDICT word, the return value is formatted like this: [WORD_READING1, WORD_READING2, ...] @@ -71,10 +73,18 @@ def _hanzi_to_pinyin(hanzi): original character is returned, e.g. [[CHAR_READING1, ...], CHAR, ...] 
""" + if DICT is None: + DICT = _HANZI_PINYIN_MAP + try: - return _HANZI_PINYIN_MAP['words'][hanzi] + return DICT['words'][hanzi] except KeyError: - return [_CHARACTERS.get(character, character) for character in hanzi] + return [ + DICT['characters'].get( + character, + character) + for character in hanzi + ] def _enclose_readings(container, readings): @@ -88,7 +98,8 @@ def _enclose_readings(container, readings): def to_pinyin(s, delimiter=' ', all_readings=False, container='[]', accented=True): - """Convert a string's Chinese characters to Pinyin readings. + """ + Convert a string's Chinese characters to Pinyin readings. *s* is a string containing Chinese characters. *accented* is a boolean value indicating whether to return accented or numbered Pinyin @@ -169,7 +180,8 @@ def to_pinyin(s, delimiter=' ', all_readings=False, container='[]', def to_zhuyin(s, delimiter=' ', all_readings=False, container='[]'): - """Convert a string's Chinese characters to Zhuyin readings. + """ + Convert a string's Chinese characters to Zhuyin readings. *s* is a string containing Chinese characters. @@ -192,7 +204,8 @@ def to_zhuyin(s, delimiter=' ', all_readings=False, container='[]'): def to_ipa(s, delimiter=' ', all_readings=False, container='[]'): - """Convert a string's Chinese characters to IPA. + """ + Convert a string's Chinese characters to IPA. *s* is a string containing Chinese characters. @@ -212,3 +225,25 @@ def to_ipa(s, delimiter=' ', all_readings=False, container='[]'): numbered_pinyin = to_pinyin(s, delimiter, all_readings, container, False) ipa = pinyin_to_ipa(numbered_pinyin) return ipa + + +def to_jyutping(s, delimiter=' ', all_readings=False, container='[]'): + """ + Convert a string's Chinese characters to Jyutping. + + *s* is a string containing Chinese characters. + + *delimiter* is the character used to indicate word boundaries in *s*. + This is used to differentiate between words and characters so that a more + accurate reading can be returned. 
+ + *all_readings* is a boolean value indicating whether or not to return all + possible readings in the case of words/characters that have multiple + readings. *container* is a two character string that is used to + enclose words/characters if *all_readings* is ``True``. The default + ``'[]'`` is used like this: ``'[READING1/READING2]'``. + + Characters not recognized as Chinese are left untouched. + + """ + pass diff --git a/dragonmapper/html.py b/dragonmapper/html.py new file mode 100644 index 0000000..316db74 --- /dev/null +++ b/dragonmapper/html.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +"""Formatting Chinese into HTML with dragonmapper's functions""" + + +from __future__ import unicode_literals +import zhon +from dragonmapper import hanzi +from dragonmapper import transcriptions as trans + +CHINESE_TYPE_UNKNOWN = 0 +CHINESE_TYPE_SIMPLIFIED = 1 +CHINESE_TYPE_TRADITIONAL = 2 +CHINESE_TYPE_SAME = 3 + +TYPE_TO_CSS_CLASS = {0: 'unknown', + 1: 'simplified', + 2: 'traditional', + 3: 'traditional-simplified-same'} + +_indentation = 0 +_line_html = '' +punctuation = tuple(zhon.hanzi.punctuation + zhon.pinyin.punctuation) +_tones_marks = ['¯', 'ˊ', 'ˇ', 'ˋ', '˙', '1', '2', '3', '4', '5'] + + +def _identify(s): + """ Returns string of text type for HTML/CSS. + + *s* is the string to identify. + """ + + if hanzi.has_chinese(s): + return "hanzi" + elif s == "": + return "unknown" + elif s in punctuation: + return "punct" + elif s in _tones_marks: + return "tone-mark" + else: + c = trans.identify(s) + if c == trans.ZHUYIN: + return "zhuyin" + elif c == trans.PINYIN: + return "pinyin" + elif c == trans.IPA: + return "ipa" + elif c == trans.UNKNOWN: + return "unknown" + + +def _html_add(s, tabs=0): + """ + Wrapper for _line_html+="..." + + *s* is what to add to the html string. + *tabs* specifies the indentation intensity (in tabs). 
+ """ + + global _line_html + _line_html += (("\n")+("\t"*(tabs+_indentation)))+s + + +def is_what_type_of_chinese(s): + """ + Returns values for diffent kinds of Chinese, see CHINESE_TYPE_... + + *s* character string + """ + + if hanzi.is_traditional(s) and hanzi.is_simplified(s): + return CHINESE_TYPE_SAME + elif hanzi.is_traditional(s): + return CHINESE_TYPE_TRADITIONAL + elif hanzi.is_simplified(s): + return CHINESE_TYPE_SIMPLIFIED + return CHINESE_TYPE_UNKNOWN + + +def to_html(characters, + top=None, + minified=False, + indentation=0): + """ + Returns (probably) valid HTML(5) for the Chinese characters, + and phonetic notations provided. + + *characters* is an string of the Chinese characters. + *top* an array that will be displayed on top of the respective characters. + TODO: Add support for more sides... Waiting on browser support. + *indentation* specifies how many extra tabs there should be. + """ + + global _indentation + global _line_html + _indentation = indentation + _line_html = "" + + phonetic_script_type = _identify("".join(top)) + + char_type = TYPE_TO_CSS_CLASS[is_what_type_of_chinese(characters)] + + _html_add( + "".format( + "".join(characters), char_type) + ) + for i in range(len(characters)): + _html_add("{1}\ +{3}".format( + characters, + characters[i], + phonetic_script_type, + top[i]), 1) + _html_add("") + + if minified: + return _line_html.replace('\t', '').replace('\n', '') + return _line_html diff --git a/dragonmapper/tests/test-html.py b/dragonmapper/tests/test-html.py new file mode 100644 index 0000000..e339541 --- /dev/null +++ b/dragonmapper/tests/test-html.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +"""Unit tests for dragonmapper.html.""" +from __future__ import unicode_literals +import unittest +from dragonmapper import html + + +class TestHtmlFuctions(unittest.TestCase): + + maxDiff = None + + zi = '你好' + pinyin = ['ni3', 'hao3'] + ruby = '\ +\ +ni3\ +\ +hao3' + + def test_to_html(self): + self.assertEqual( + html.to_html( + 
self.zi, + top=self.pinyin, + minified=True), + self.ruby) diff --git a/dragonmapper/tests/test-transcriptions.py b/dragonmapper/tests/test-transcriptions.py index a4d5e14..ad3821f 100644 --- a/dragonmapper/tests/test-transcriptions.py +++ b/dragonmapper/tests/test-transcriptions.py @@ -173,3 +173,15 @@ def test_issue_8(self): numbered = 'Ao4di4li4' self.assertEqual(numbered, trans.accented_to_numbered(accented)) + + def test_short_syllables(self): + pinyin = 'yo1' + zhuyin = 'ㄧㄛ' + + self.assertEqual(zhuyin, trans.pinyin_to_zhuyin(pinyin)) + + def test_dia_conversion(self): + pinyin = 'dia3' + zhuyin = 'ㄉㄧㄚˇ' + + self.assertEqual(zhuyin, trans.pinyin_to_zhuyin(pinyin)) diff --git a/requirements.txt b/requirements.txt index 8020a81..51eed4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,4 @@ py==1.4.19 pyflakes==0.8.1 tox==1.6.1 virtualenv==1.11.2 -zhon==1.1.3 +zhon==1.1.5 diff --git a/setup.py b/setup.py index 16360b2..2427e8a 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,8 @@ def open_file(filename): 'Topic :: Text Processing :: Linguistic', ], keywords=['chinese', 'mandarin', 'transcription', 'pinyin', 'zhuyin', - 'ipa', 'convert', 'bopomofo', 'hanzi', 'characters', 'readings'], + 'ipa', 'convert', 'bopomofo', 'hanzi', 'characters', 'readings', + 'html'], packages=['dragonmapper', 'dragonmapper.data'], package_data={'dragonmapper': ['data/*.tsv', 'data/*.csv']}, test_suite='dragonmapper.tests',