diff --git a/README.md b/README.md
index ecde29a..2c41ec0 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ pip install tmx2dataframe
## Usage
```
-import tmx2dataframe
+from tmx2dataframe import tmx2dataframe
metadata, df = tmx2dataframe.read("/path/to/tmxfile.tmx")
```
diff --git a/tests/example.tmx b/tests/example0.tmx
similarity index 100%
rename from tests/example.tmx
rename to tests/example0.tmx
diff --git a/tests/example2_multi_locales.tmx b/tests/example2_multi_locales.tmx
new file mode 100644
index 0000000..3b3ccb8
--- /dev/null
+++ b/tests/example2_multi_locales.tmx
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+ inkululeko yokwakha izinto ngokusebenzisa ubuciko;
+
+
+ freedom of artistic creativity;
+
+
+ కళాత్మక సృజనాత్మకత స్వేచ్ఛ;
+
+
+
+
+ Samukela uDkt. Imtiaz Sooliman kulendlu namuhla, wenhlangano i-Gift of the Givers.
+
+
+ We welcome Dr Imtiaz Sooliman of the Gift of the Givers in this house today.
+
+
+ ఈ రోజు ఈ బహుమతుల బహుమతి యొక్క డాక్టర్ ఇంతియాజ్ సూలిమాన్ను మేము స్వాగతిస్తున్నాము.
+
+
+
+
\ No newline at end of file
diff --git a/tests/tmx2dataframe_test.py b/tests/tmx2dataframe_test.py
index 32c75d7..30a2e96 100644
--- a/tests/tmx2dataframe_test.py
+++ b/tests/tmx2dataframe_test.py
@@ -1,29 +1,35 @@
from tmx2dataframe import tmx2dataframe
def test_tmx2dataframe_metadata():
- metadata, df = tmx2dataframe.read('tests/example.tmx')
+ metadata, df = tmx2dataframe.read('tests/example0.tmx')
assert metadata['srclang'] == 'zul-ZA'
def test_tmx2dataframe_dataframe():
- metadata, df = tmx2dataframe.read('tests/example.tmx')
+ metadata, df = tmx2dataframe.read('tests/example0.tmx')
- assert 'source_language' in df.columns
- assert 'source_sentence' in df.columns
- assert 'target_language' in df.columns
- assert 'target_sentence' in df.columns
+ assert 'zul-ZA' in df.columns
+ assert 'eng-GB' in df.columns
assert len(df) == 2
- assert df.iloc[0]['target_language'] == "eng-GB"
- assert df.iloc[0]['target_sentence'] == "freedom of artistic creativity;"
+ assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;"
def test_tmx2dataframe_differentattributename():
metadata, df = tmx2dataframe.read('tests/example.1.tmx')
- assert 'source_language' in df.columns
- assert 'source_sentence' in df.columns
- assert 'target_language' in df.columns
- assert 'target_sentence' in df.columns
+ assert 'zul-ZA' in df.columns
+ assert 'eng-GB' in df.columns
assert len(df) == 2
- assert df.iloc[0]['target_language'] == "eng-GB"
- assert df.iloc[0]['target_sentence'] == "freedom of artistic creativity;"
\ No newline at end of file
+ assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;"
+
+def test_multi_locales():
+ metadata, df = tmx2dataframe.read('tests/example2_multi_locales.tmx')
+
+ assert 'zul-ZA' in df.columns
+ assert 'eng-GB' in df.columns
+ assert 'tel-IN' in df.columns
+
+ assert len(df) == 2
+ assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;"
+ assert df.iloc[0]['zul-ZA'] == "inkululeko yokwakha izinto ngokusebenzisa ubuciko;"
+ assert df.iloc[0]['tel-IN'] == "కళాత్మక సృజనాత్మకత స్వేచ్ఛ;"
diff --git a/tmx2dataframe/tmx2dataframe.py b/tmx2dataframe/tmx2dataframe.py
index 144da19..2407f05 100644
--- a/tmx2dataframe/tmx2dataframe.py
+++ b/tmx2dataframe/tmx2dataframe.py
@@ -1,14 +1,41 @@
import pandas as pd
from xml.dom import minidom
+def getText(nodelist):
+ """Helper function to return the text content from an XML node, joined as a single string.
+ """
+ rc = []
+ for node in nodelist:
+ if node.nodeType == minidom.Node.TEXT_NODE:
+ rc.append(node.data)
+ return ''.join(rc)
def process_tuv(tuv):
+ """Function to process a single TMX 'TUV' unit - a unit of text in a particular language.
+
+ Args:
+ tuv (Node): The node to process.
+
+ Returns:
+ lang (String): The locale/language code of the element.
+ txt (String): The text contained in the element.
+ """
if 'lang' in tuv.attributes:
lang = tuv.attributes['lang'].value
else:
lang = tuv.attributes['xml:lang'].value
seg = tuv.getElementsByTagName('seg')[0]
- txt = seg.childNodes[0].data
+
+ # If the node has direct text content data, process it as a string
+ if hasattr(seg.childNodes[0], 'data'):
+ txt = seg.childNodes[0].data
+
+ # If the first child has no 'data' attribute, it is most likely an inline child tag such as a placeholder (e.g. <ph>). In that case, join the text of the segment's direct text nodes; the child tags themselves are not included.
+ else:
+ if len(seg.childNodes) > 0 :
+ txt = getText(seg.childNodes)
+ else:
+ print("no child nodes")
return lang, txt
def read(path):
@@ -20,7 +47,7 @@ def read(path):
Returns:
dict: The header of the TMX file, which contains metadata
- DataFrame: A Pandas Dataframe. Each line item consists of source_language, source_sentence, target_language, target_sentence
+ DataFrame: A pandas DataFrame with one row per translation unit. The column names are the locale/language codes, and each cell holds the translation for that locale.
"""
# parse an xml file by name
@@ -39,17 +66,14 @@ def read(path):
translation_units = body.getElementsByTagName('tu')
items = []
for tu in translation_units:
- srclang, srcsentence = process_tuv(tu.getElementsByTagName('tuv')[0])
- targetlang, targetsentence = process_tuv(tu.getElementsByTagName('tuv')[1])
- item = {
- 'source_language': srclang,
- 'source_sentence': srcsentence,
- 'target_language': targetlang,
- 'target_sentence': targetsentence
- }
- items.append(item)
+ tuvs = tu.getElementsByTagName('tuv')
+ tudata = {}
+ for tuv in tuvs:
+ lang, sentence = process_tuv(tuv)
+ tudata[lang] = sentence
+ items.append(tudata)
df = pd.DataFrame(items)
return metadata, df
-
\ No newline at end of file
+