def extract_note(tu):
    """Return the text of the first ``<note>`` element found under *tu*.

    Args:
        tu: A DOM element (e.g. from ``xml.dom.minidom``) representing one
            translation unit.

    Returns:
        The note text as a string, or ``None`` when *tu* contains no
        ``<note>`` element or the first one holds no text.

    If there is more than one note, only the first one is extracted. If all
    notes were to be extracted, they could be concatenated into the same
    value.

    NOTE: ``getElementsByTagName`` searches all descendants of *tu*, so a
    note nested deeper in the unit is picked up when it comes first in
    document order.
    """
    notes = tu.getElementsByTagName('note')
    if not notes:
        return None

    first_note = notes[0]
    # Join every text/CDATA child instead of reading childNodes[0] blindly:
    # an empty <note/> has no children (IndexError), and a note mixing text
    # with CDATA sections or comments is split over several child nodes.
    text = ''.join(
        child.data
        for child in first_note.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )
    # An empty note is reported the same way as a missing one.
    return text or None