From 8c565c6e4371dac3cd7f9302e565e8673671c9e4 Mon Sep 17 00:00:00 2001 From: Graeme West Date: Mon, 8 Jul 2019 10:17:41 +0100 Subject: [PATCH] Adding multi-locale support, clarifying documentation, and expanding tests. --- README.md | 2 +- tests/{example.tmx => example0.tmx} | 0 tests/example2_multi_locales.tmx | 37 ++++++++++++++++++++++ tests/tmx2dataframe_test.py | 34 +++++++++++--------- tmx2dataframe/tmx2dataframe.py | 48 +++++++++++++++++++++-------- 5 files changed, 94 insertions(+), 27 deletions(-) rename tests/{example.tmx => example0.tmx} (100%) create mode 100644 tests/example2_multi_locales.tmx diff --git a/README.md b/README.md index ecde29a..2c41ec0 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install tmx2dataframe ## Usage ``` -import tmx2dataframe +from tmx2dataframe import tmx2dataframe metadata, df = tmx2dataframe.read("/path/to/tmxfile.tmx") ``` diff --git a/tests/example.tmx b/tests/example0.tmx similarity index 100% rename from tests/example.tmx rename to tests/example0.tmx diff --git a/tests/example2_multi_locales.tmx b/tests/example2_multi_locales.tmx new file mode 100644 index 0000000..3b3ccb8 --- /dev/null +++ b/tests/example2_multi_locales.tmx @@ -0,0 +1,37 @@ + + + +
+
+ + + + inkululeko yokwakha izinto ngokusebenzisa ubuciko; + + + freedom of artistic creativity; + + + కళాత్మక సృజనాత్మకత స్వేచ్ఛ; + + + + + Samukela uDkt. Imtiaz Sooliman kulendlu namuhla, wenhlangano i-Gift of the Givers. + + + We welcome Dr Imtiaz Sooliman of the Gift of the Givers in this house today. + + + ఈ రోజు ఈ బహుమతుల బహుమతి యొక్క డాక్టర్ ఇంతియాజ్ సూలిమాన్‌ను మేము స్వాగతిస్తున్నాము. + + + +
\ No newline at end of file diff --git a/tests/tmx2dataframe_test.py b/tests/tmx2dataframe_test.py index 32c75d7..30a2e96 100644 --- a/tests/tmx2dataframe_test.py +++ b/tests/tmx2dataframe_test.py @@ -1,29 +1,35 @@ from tmx2dataframe import tmx2dataframe def test_tmx2dataframe_metadata(): - metadata, df = tmx2dataframe.read('tests/example.tmx') + metadata, df = tmx2dataframe.read('tests/example0.tmx') assert metadata['srclang'] == 'zul-ZA' def test_tmx2dataframe_dataframe(): - metadata, df = tmx2dataframe.read('tests/example.tmx') + metadata, df = tmx2dataframe.read('tests/example0.tmx') - assert 'source_language' in df.columns - assert 'source_sentence' in df.columns - assert 'target_language' in df.columns - assert 'target_sentence' in df.columns + assert 'zul-ZA' in df.columns + assert 'eng-GB' in df.columns assert len(df) == 2 - assert df.iloc[0]['target_language'] == "eng-GB" - assert df.iloc[0]['target_sentence'] == "freedom of artistic creativity;" + assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;" def test_tmx2dataframe_differentattributename(): metadata, df = tmx2dataframe.read('tests/example.1.tmx') - assert 'source_language' in df.columns - assert 'source_sentence' in df.columns - assert 'target_language' in df.columns - assert 'target_sentence' in df.columns + assert 'zul-ZA' in df.columns + assert 'eng-GB' in df.columns assert len(df) == 2 - assert df.iloc[0]['target_language'] == "eng-GB" - assert df.iloc[0]['target_sentence'] == "freedom of artistic creativity;" \ No newline at end of file + assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;" + +def test_multi_locales(): + metadata, df = tmx2dataframe.read('tests/example2_multi_locales.tmx') + + assert 'zul-ZA' in df.columns + assert 'eng-GB' in df.columns + assert 'tel-IN' in df.columns + + assert len(df) == 2 + assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;" + assert df.iloc[0]['zul-ZA'] == "inkululeko yokwakha izinto ngokusebenzisa ubuciko;" + assert df.iloc[0]['tel-IN'] == "కళాత్మక సృజనాత్మకత స్వేచ్ఛ;" diff --git a/tmx2dataframe/tmx2dataframe.py b/tmx2dataframe/tmx2dataframe.py index 144da19..2407f05 100644 --- a/tmx2dataframe/tmx2dataframe.py +++ b/tmx2dataframe/tmx2dataframe.py @@ -1,14 +1,41 @@ import pandas as pd from xml.dom import minidom +def getText(nodelist): + """Helper function to return the text content from an XML node, joined as a single string. + """ + rc = [] + for node in nodelist: + if node.nodeType == minidom.Node.TEXT_NODE: + rc.append(node.data) + return ''.join(rc) def process_tuv(tuv): + """Function to process a single TMX 'TUV' unit - a unit of text in a particular language. + + Args: + tuv (Node): The node to process. + + Returns: + lang (String): The locale/language code of the element. + txt (String): The text contained in the element. + """ if 'lang' in tuv.attributes: lang = tuv.attributes['lang'].value else: lang = tuv.attributes['xml:lang'].value seg = tuv.getElementsByTagName('seg')[0] - txt = seg.childNodes[0].data + + # If the node has direct text content data, process it as a string + if hasattr(seg.childNodes[0], 'data'): + txt = seg.childNodes[0].data + + # If it doesn't have a 'data' attribute, it most likely contains child tags such as placeholders (). Therefore, include these as XML strings. + else: + if len(seg.childNodes) > 0 : + txt = getText(seg.childNodes) + else: + print("no child nodes") return lang, txt def read(path): @@ -20,7 +47,7 @@ def read(path): Returns: dict: The header of the TMX file, which contains metadata - DataFrame: A Pandas Dataframe. Each line item consists of source_language, source_sentence, target_language, target_sentence + DataFrame: A Pandas Dataframe. The column names will be the locale/language codes, and the row content will be the translations for each locale. """ # parse an xml file by name @@ -39,17 +66,14 @@ def read(path): translation_units = body.getElementsByTagName('tu') items = [] for tu in translation_units: - srclang, srcsentence = process_tuv(tu.getElementsByTagName('tuv')[0]) - targetlang, targetsentence = process_tuv(tu.getElementsByTagName('tuv')[1]) - item = { - 'source_language': srclang, - 'source_sentence': srcsentence, - 'target_language': targetlang, - 'target_sentence': targetsentence - } - items.append(item) + tuvs = tu.getElementsByTagName('tuv') + tudata = {} + for tuv in tuvs: + lang, sentence = process_tuv(tuv) + tudata[lang] = sentence + items.append(tudata) df = pd.DataFrame(items) return metadata, df - \ No newline at end of file +