Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pip install tmx2dataframe
## Usage

```
import tmx2dataframe
from tmx2dataframe import tmx2dataframe

metadata, df = tmx2dataframe.read("/path/to/tmxfile.tmx")
```
Expand Down
File renamed without changes.
37 changes: 37 additions & 0 deletions tests/example2_multi_locales.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx SYSTEM "tmx11.dtd">
<tmx version="1.1">
<header
creationtool = "CTexT Alignment Interface Pro 1.0"
segtype = "sentence"
o-tmf = "CTexT Alignment Interface Pro 1.0 Output"
adminlang="EN-US"
srclang="zul-ZA"
datatype="plaintext"
>
</header>
<body>
<tu>
<tuv xml:lang="zul-ZA">
<seg>inkululeko yokwakha izinto ngokusebenzisa ubuciko;</seg>
</tuv>
<tuv xml:lang="eng-GB">
<seg>freedom of artistic creativity;</seg>
</tuv>
<tuv xml:lang="tel-IN">
<seg>కళాత్మక సృజనాత్మకత స్వేచ్ఛ;</seg>
</tuv>
</tu>
<tu>
<tuv xml:lang="zul-ZA">
<seg>Samukela uDkt. Imtiaz Sooliman kulendlu namuhla, wenhlangano i-Gift of the Givers. </seg>
</tuv>
<tuv xml:lang="eng-GB">
<seg>We welcome Dr Imtiaz Sooliman of the Gift of the Givers in this house today. </seg>
</tuv>
<tuv xml:lang="tel-IN">
<seg>ఈ రోజు ఈ బహుమతుల బహుమతి యొక్క డాక్టర్ ఇంతియాజ్ సూలిమాన్‌ను మేము స్వాగతిస్తున్నాము.</seg>
</tuv>
</tu>
</body>
</tmx>
34 changes: 20 additions & 14 deletions tests/tmx2dataframe_test.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,35 @@
from tmx2dataframe import tmx2dataframe

def test_tmx2dataframe_metadata():
    """The TMX header attributes are exposed through the returned metadata dict."""
    # Only the renamed fixture is read; the pre-rename path no longer exists.
    metadata, _ = tmx2dataframe.read('tests/example0.tmx')
    assert metadata['srclang'] == 'zul-ZA'

def test_tmx2dataframe_dataframe():
    """A two-locale TMX file yields one column per locale and one row per <tu>."""
    _, df = tmx2dataframe.read('tests/example0.tmx')

    # Columns are keyed by locale code rather than by source/target roles.
    assert 'zul-ZA' in df.columns
    assert 'eng-GB' in df.columns

    assert len(df) == 2
    assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;"

def test_tmx2dataframe_differentattributename():
    """The 'example.1.tmx' fixture yields the same per-locale columns and rows."""
    _, df = tmx2dataframe.read('tests/example.1.tmx')

    # Columns are keyed by locale code rather than by source/target roles.
    assert 'zul-ZA' in df.columns
    assert 'eng-GB' in df.columns

    assert len(df) == 2
    assert df.iloc[0]['eng-GB'] == "freedom of artistic creativity;"

def test_multi_locales():
    """A TMX file with three locales per <tu> produces three locale columns."""
    _, frame = tmx2dataframe.read('tests/example2_multi_locales.tmx')

    for locale in ('zul-ZA', 'eng-GB', 'tel-IN'):
        assert locale in frame.columns

    assert len(frame) == 2

    first_row = frame.iloc[0]
    assert first_row['eng-GB'] == "freedom of artistic creativity;"
    assert first_row['zul-ZA'] == "inkululeko yokwakha izinto ngokusebenzisa ubuciko;"
    assert first_row['tel-IN'] == "కళాత్మక సృజనాత్మకత స్వేచ్ఛ;"
48 changes: 36 additions & 12 deletions tmx2dataframe/tmx2dataframe.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,41 @@
import pandas as pd
from xml.dom import minidom

def getText(nodelist):
    """Concatenate the data of every text node found directly in ``nodelist``.

    Nodes of any other type (for example inline elements such as ``<ph>``)
    are skipped, and their descendants are not visited.

    Args:
        nodelist (iterable of Node): The nodes to scan, e.g. ``element.childNodes``.

    Returns:
        String: The joined text, or an empty string when no text node is present.
    """
    return ''.join(
        child.data
        for child in nodelist
        if child.nodeType == minidom.Node.TEXT_NODE
    )

def process_tuv(tuv):
    """Function to process a single TMX 'TUV' unit - a unit of text in a particular language.

    The sentence is built from every text (and CDATA) node that is a direct
    child of the first <seg> element. Inline child tags such as placeholders
    (<ph>) and comments are skipped, so text on both sides of an inline tag
    is kept. An empty <seg> yields an empty string.

    Args:
        tuv (Node): The <tuv> node to process.

    Returns:
        lang (String): The locale/language code of the <tuv> element.
        txt (String): The text contained in the <tuv> element.

    Raises:
        KeyError: If the <tuv> has neither a 'lang' nor an 'xml:lang' attribute.
        IndexError: If the <tuv> contains no <seg> element.
    """
    attributes = tuv.attributes
    # Some TMX files name the attribute 'lang', others 'xml:lang'; accept either.
    lang_key = 'lang' if 'lang' in attributes else 'xml:lang'
    lang = attributes[lang_key].value

    seg = tuv.getElementsByTagName('seg')[0]

    # Join all direct character data instead of reading childNodes[0] only:
    # indexing crashed on an empty <seg> and dropped text after an inline tag.
    text_types = (minidom.Node.TEXT_NODE, minidom.Node.CDATA_SECTION_NODE)
    txt = ''.join(
        node.data for node in seg.childNodes if node.nodeType in text_types
    )
    return lang, txt

def read(path):
Expand All @@ -20,7 +47,7 @@ def read(path):

Returns:
dict: The header of the TMX file, which contains metadata
DataFrame: A Pandas Dataframe. Each line item consists of source_language, source_sentence, target_language, target_sentence
DataFrame: A Pandas Dataframe. The column names will be the locale/language codes, and the row content will be the translations for each locale.

"""
# parse an xml file by name
Expand All @@ -39,17 +66,14 @@ def read(path):
translation_units = body.getElementsByTagName('tu')
items = []
for tu in translation_units:
srclang, srcsentence = process_tuv(tu.getElementsByTagName('tuv')[0])
targetlang, targetsentence = process_tuv(tu.getElementsByTagName('tuv')[1])
item = {
'source_language': srclang,
'source_sentence': srcsentence,
'target_language': targetlang,
'target_sentence': targetsentence
}
items.append(item)
tuvs = tu.getElementsByTagName('tuv')
tudata = {}
for tuv in tuvs:
lang, sentence = process_tuv(tuv)
tudata[lang] = sentence
items.append(tudata)

df = pd.DataFrame(items)
return metadata, df