xmlanntools/xml2standoff at main · czcorpus/xmlanntools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (c) 2024-2025 Pavel Vondřička <pavel.vondricka@ff.cuni.cz>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

import html
import re
import json
from io import StringIO
from pathlib import Path


def scan_to(text, ptr, mark):
    """
    Scan text up to the given mark and return preceding contents and the mark

    Parameters:
    -----------

    text : str
        Text string
    ptr : integer
        Pointer to the current character position in the text (used as scan start position);
        expected to be non-negative
    mark: str
        (Sub)string to seek until

    Returns:
    --------

    segment : str
        Text segment (str) from start (original ptr position) to the start of the searched *mark*
    mark : str
        The found mark (str) or empty string if not found until EOF
    ptr : integer
        Pointer to the last position (the end of the found *mark*, or the end of the text
        if the mark has not been found)
    """
    # let the built-in substring search do the scanning instead of a per-character loop
    found = text.find(mark, ptr)
    if found < 0:
        # mark not found: everything up to the end of the text is the segment
        # (a start position already beyond the end of the text is left where it is)
        end = max(ptr, len(text))
        return text[ptr:end], '', end
    return text[ptr:found], mark, found + len(mark)


def text_continues(text, ptr, str):
    """
    Check whether the text at position `ptr` goes on with the given string

    Parameters:
    -----------

    text : str
        Text string to look into
    ptr : integer
        Character position in the text where the comparison starts
    str : str
        String expected to follow at that position

    Returns:
    --------

    result : boolean
        True if the text at `ptr` starts with the given string, False otherwise
    """
    window_end = ptr + len(str)
    candidate = text[ptr:window_end]
    return candidate == str

def re_partition(pattern, string):
    """
    Regular expression counterpart of `str.partition`

    Splits the string at the first match of the pattern and returns the tuple
    (text before the match, the matched text, text after the match).
    If the pattern does not match, returns (string, '', '').
    """
    found = re.search(pattern, string)
    if found is None:
        return string, '', ''
    match_start, match_end = found.span()
    before = string[:match_start]
    after = string[match_end:]
    return before, found.group(0), after

def parse_constraints(description):
    """
    Parse additional constraints on text elements (or excluded elements)

    Parameters:
    -----------
    description: str
        string representation of the constraints

    Returns:
    --------
    constraints: dict
        constraints on the name, attributes and/or parents (recursive) of the element:
        - name (str) : requested element name
        - attrs (list of dict) : attribute constraints, each with the keys `name` and `value`
        - parent (dict or None) : constraints on the parent element (same structure, with
          the additional boolean key `direct` telling whether it must be the direct parent)

    Raises:
    -------
    ValueError
        if an attribute constraint lacks its closing bracket or the `=` sign

    Supported format example:
    - anyparent//direct_parent/element_name[@attr1="value1"][@attr2="value2"]
    (all attribute constraints must be satisfied (AND logic))

    Limitation: the description is split on `/` and `[` before the attribute values are
    parsed, so these two characters cannot be used inside attribute values.
    """
    parents, element = description.rsplit('/', 1) if '/' in description else ('', description)

    # a slash left at the end of the parent path means the separator was `//` (any ancestor)
    direct = not parents.endswith('/')
    if not direct:
        parents = parents[:-1]

    name, *attr_constraints = element.split('[')

    attrs = []
    for constr in attr_constraints:
        # explicit validation (not `assert`, which is stripped when running with -O)
        if not constr.endswith(']'):
            raise ValueError("Missing end bracket in constraint: [" + constr)
        # split on the first `=` only, so that the value itself may contain `=`
        aname, separator, avalue = constr[:-1].partition('=')
        if not separator:
            raise ValueError("Missing '=' in attribute constraint: [" + constr)
        attrs.append({
            'name': aname.lstrip('@ ').rstrip(),
            'value': avalue.strip("\" ")
        })

    parent = parse_constraints(parents) if parents else None
    if parent:
        parent['direct'] = direct

    return {
        'name': name,
        'parent': parent,
        'attrs': attrs
    }

def matches_constraint(constraint, heap):
    """
    Test whether the current element (last on the heap) matches the given constraint
    (see `parse_constraints` above for details)

    Parameters:
    -----------
    constraint : dict
        constraint with the keys `name`, `attrs` and `parent` (parent constraints
        additionally carry the key `direct`)
    heap : list of dict
        open elements from the outermost to the current one; each element description
        provides at least `name` and `contents` (the raw attribute part of the tag)

    Returns:
    --------
    result : boolean
        True if the name, all attribute constraints and the parent constraint (if any) match
    """
    if not heap:
        return False
    current = heap[-1]
    # compare current element name with name requested (`*` matches any name;
    # the name of an empty element may carry a trailing slash)
    wanted = constraint['name']
    if wanted != '*' and current['name'] != wanted and current['name'] != wanted + '/':
        return False
    # test for attribute constraints: name and value are matched literally, therefore
    # both must be escaped before they are inserted into the regular expression
    for attr_constr in constraint['attrs']:
        expr = re.compile(
            r"(?<!\S)" + re.escape(attr_constr['name'])
            + r"\s*=\s*\"" + re.escape(html.escape(attr_constr['value'], quote=True)) + "\""
        )
        if not expr.search(current['contents']):
            return False
    # test for parent constraints
    parent_constr = constraint['parent']
    if parent_constr:
        # start with the direct parent and (unless a direct parent is required)
        # walk up through all the ancestors
        ancestors = heap[:-1]
        while True:
            if matches_constraint(parent_constr, ancestors):
                return True
            ancestors = ancestors[:-1]
            if parent_constr['direct'] or not ancestors:
                return False
    return True

def matches_any_constraints(constraints, heap):
    """
    Tell whether at least one constraint from the list is satisfied by the current
    element (the last one on the heap); evaluation stops at the first match
    """
    return any(matches_constraint(candidate, heap) for candidate in constraints)

def split_xml(text, txtelements=None, excludes=None, keep_linebreaks=False):
    """
    Split XML into plaintext and stand-off XML markup description

    Parameters:
    -----------

    text : str
        XML input as text string
    txtelements : list of element constraints
        list of constraints on elements containing text to be extracted
    excludes : list of element constraints
        list of constraints on elements to be skipped
    keep_linebreaks : boolean
        keep linebreaks within text elements or remove them?

    Returns:
    --------

    plaintext : str
        Extracted plaintext contents (str) stripped of all XML mark-up
    markup : list of dict
        XML markup description in the form of list of XML element descriptions:
        - name (str) : name of the XML element
        - contents (str) : other contents of the XML element tag (attributes with values, XML comment contents etc.)
        - start (integer) : character position in the plain text where the XML element's span starts
        - end (integer) : character position in the plain text where the XML element's span ends
        - level (integer) : level of nesting (depth) in the XML tree
        - order (integer) : order of element in the XML tree
        - text (str): (optional) text contents at the beginning of the element (if not a text element, i.e. it has not been extracted into plain text file)
        - tail (str): (optional) text contents at the end of the element (have not been extracted into plain text file)
        - skip (int): (optional) skip following plain text contents at the start of the element (provisional line break)
        - cut (int): (optional) cut plain text contents from the end of the element (provisional line break)
        - csep (string): content separator (whitespace) between element name and other contents (attributes) if more than just the common single space character

    Raises:
    -------
    Exception
        if an end tag does not match the most recently opened element, or if there
        is no open element left for it at all

    NOTE(review): elements still open when the end of the input is reached stay on
    the heap and are never added to the returned mark-up description.
    """

    # buffer for the extracted plaintext
    plaintext = StringIO()
    # character counter for the extracted plain text contents
    charcnt = 0
    # heap of open element spans
    heap = []
    # buffer for the XML mark-up description
    markup = []
    # XML element level and order counters
    slevel = 0
    order = 0
    # pointer to the current character position in the source XML string
    ptr = 0
    # content aware extraction:
    # depth within text elements (may be nested)
    txtel_depth = 0
    # depth within excluded elements (may be nested)
    exclude_depth = 0
    # tuple with a current mark-up element description and its attribute to keep the text which shall not be extracted
    text_store = None
    # shall we remove line breaks within text elements?
    normalize_linebreaks = txtelements is not None and not keep_linebreaks
    # count lines (used for error messages only)
    line_count = 1

    lbre = re.compile(r"[\n\r]+")

    while True:
        # scan to the beginning of next XML mark-up and add the contents to the plain text buffer
        (contents, mark, ptr) = scan_to(text, ptr, "<")
        contents = html.unescape(contents)
        line_count += contents.count("\n")

        if text_store is None:
            # default: extract text contents into the output plain text file
            if normalize_linebreaks:
                contents = lbre.sub(" ", contents)
            charcnt += len(contents)
            plaintext.write(contents)
        else:
            # mark-up element description to store/keep contents within
            text_store[0][text_store[1]] = contents
            text_store = None

        if len(mark):

            # treat XML elements, comments and processing instructions and their tag contents
            if text_continues(text, ptr, '?'):
                # processing instruction
                (element, mark, ptr) = scan_to(text, ptr, "?>")
                line_count += element.count("\n")
                (ename, csep, econtents) = re_partition(r"(\s+)", element)
                econtents += '?'
            elif text_continues(text, ptr, '!--'):
                # XML comment
                (element, mark, ptr) = scan_to(text, ptr, "-->")
                line_count += element.count("\n")
                ename = '!--'
                # a comment has no name/contents separator: reset it explicitly, so that it is
                # neither undefined (comment as the very first mark-up) nor inherited from
                # the previously processed tag
                csep = ''
                econtents = element[3:] + '--'
            else:
                # XML element
                (element, mark, ptr) = scan_to(text, ptr, ">")
                line_count += element.count("\n")
                (ename, csep, econtents) = re_partition(r"(\s+)", element)

            # store the mark-up data into the mark-up buffer
            if ename.startswith("/"):
                # end tag: add ending position and append the element span description into the mark-up buffer
                ename = ename.lstrip("/")
                if not heap:
                    # stray end tag: report it as a parsing error instead of failing on the empty heap
                    raise Exception("XML parsing error at line {0}: End tag of element '{1}' has no matching start tag."
                                    .format(line_count, ename))
                last = heap.pop()
                slevel -= 1
                if ename != last['name']:
                    raise Exception("XML parsing error at line {0}: Element '{1}' not closed before end of element '{2}'."
                                    .format(line_count, last['name'], ename))
                last['end'] = charcnt
                markup.append(last)
                # is the ending element a text element or an excluded element?
                if txtelements is not None:
                    if matches_any_constraints(txtelements, heap + [last]):
                        txtel_depth -= 1
                        if exclude_depth == 0:
                            # insert provisional line break into the plain text that will be cut in the reverse conversion (unless within excluded scope)
                            charcnt += 1
                            plaintext.write("\n")
                            last['cut'] = 1
                            last['end'] = charcnt
                    # outside the scope of text elements: do not extract following text anymore, but keep it within the mark-up description
                    if txtel_depth == 0:
                        text_store = (last, 'tail')
                if excludes is not None:
                    if matches_any_constraints(excludes, heap + [last]):
                        exclude_depth -= 1
                    # still within the scope of excluded elements: no text extraction
                    if exclude_depth > 0:
                        text_store = (last, 'tail')

            else:
                # start tag, empty element, comment or processing instruction
                order += 1
                if ename.startswith("?") or ename.startswith("!") or ename.endswith("/") or econtents.endswith("/"):
                    # zero-span: empty tags, comments and proc. instructions have no end tag and can be inserted into the buffer immediately
                    eldesc = {"name": ename, "contents": econtents,
                              "start": charcnt, "end": charcnt, "level": slevel, 'order': order}
                    if csep and csep != ' ':
                        eldesc['csep'] = csep
                    # should the empty element add a provisional line break?
                    if txtelements is not None and exclude_depth == 0 and ename.endswith("/") and matches_any_constraints(txtelements, heap + [eldesc]):
                        charcnt += 1
                        plaintext.write("\n")
                        eldesc['skip'] = 1
                    markup.append(eldesc)
                    if (txtelements is not None and txtel_depth == 0) or (excludes is not None and exclude_depth > 0):
                        text_store = (eldesc, 'tail')
                else:
                    # pair start tags: add description to the heap to be inserted into the buffer when a matching end tag is found
                    eldesc = {"name": ename, "contents": econtents,
                              "start": charcnt, "level": slevel, 'order': order}
                    if csep and csep != ' ':
                        eldesc['csep'] = csep
                    # if included elements are defined, check whether we shall extract the text contents or store/keep them within the mark-up description
                    if txtelements is not None:
                        if matches_any_constraints(txtelements, heap + [eldesc]):
                            txtel_depth += 1
                            text_store = None
                            if exclude_depth == 0:
                                # insert provisional line break into the plain text that will be skipped in the reverse conversion
                                charcnt += 1
                                plaintext.write("\n")
                                eldesc['skip'] = 1
                        elif txtel_depth == 0:
                            text_store = (eldesc, 'text')
                    # if excluded elements are defined, keep text contents within their description and do not extract them
                    if excludes is not None:
                        if matches_any_constraints(excludes, heap + [eldesc]):
                            exclude_depth += 1
                        if exclude_depth > 0:
                            text_store = (eldesc, 'text')
                    heap.append(eldesc)
                    slevel += 1
        if not len(mark):
            # end of XML contents (or an unterminated tag at the end of the input): exit
            break
    return plaintext.getvalue(), markup

def process(config, xmlin, txtout=None, jsonout=None):
    """
    Convert one XML file into a plain text file and a JSON stand-off mark-up description

    Parameters:
    -----------

    config : configuration section (mapping providing `get` and `getboolean`)
        options used: `text_elements`, `exclude_elements` (lists of constraints separated
        by commas and/or whitespace) and `keep_linebreaks` (boolean)
    xmlin : pathlib.Path or str
        input XML file (read as UTF-8)
    txtout : pathlib.Path or str
        (optional) plain text output file; by default the input file name with the suffix `.txt`
    jsonout : pathlib.Path or str
        (optional) JSON output file; by default the input file name with the suffix `.json`
    """
    # accept plain strings as file names, too
    if isinstance(xmlin, str):
        xmlin = Path(xmlin)
    if isinstance(txtout, str):
        txtout = Path(txtout)
    if isinstance(jsonout, str):
        jsonout = Path(jsonout)
    txtout = txtout or xmlin.with_suffix('.txt')
    jsonout = jsonout or xmlin.with_suffix('.json')

    def constraint_list(option):
        # parse the list of constraints from the given option (None if not set or empty)
        value = config.get(option)
        if not value:
            return None
        return [parse_constraints(item) for item in value.replace(',', ' ').split()]

    includes = constraint_list('text_elements')
    excludes = constraint_list('exclude_elements')

    # read the whole input file
    with xmlin.open(encoding='utf-8') as infile:
        text = infile.read()

    # generate separated plain text contents and mark-up description
    plaintext, markup = split_xml(text, includes, excludes, config.getboolean('keep_linebreaks'))

    # write out plain text contents (the files are closed by the `with` statements)
    with txtout.open('w', encoding='utf-8') as txtfile:
        txtfile.write(plaintext)

    # write out XML mark-up description as JSON
    with jsonout.open('w', encoding='utf-8') as jsonfile:
        json.dump(markup, jsonfile, indent=0)


if __name__ == '__main__':
    """
    Split XML into plain text and stand-off XML mark-up description
    ===============================================================

    Input: input XML file name (UTF-8 encoding)
        (Only XML elements (with attributes), comments and processing instructions are treated. Anything else is ignored.)

    Output: creates two new files with the same base name as the input file, but with the extension `.txt` and `.json`

    Options: `-c <filename>` Read additional configuration. By default, the file 'xmlanntools.ini' is
             searched and loaded both from the directory with the scripts and the current working dir.
             Later configuration files override previous settings. Command-line arguments override all.

             `-p <profile>` Use 'profile' section from the configuration. If not provided, the
             section 'DEFAULT' will be used as fallback or the 'profile' specified in this section.

             `-t <text_elements>` List of comma separated constraints (see documentation for details) on basic text elements
             (e.g. paragraph level elements). Turns on content aware extraction where only text contents from the specified elements
             is extracted into the plain text output. Provisional line breaks are added to the plain text output between the text
             fragments, which will be removed later by `standoff2xml`. The text elements may also be nested within each other.

             `-kl` Keep line breaks within the specified text elements. By default, line breaks within the text elements are
             (irreversibly) converted to spaces. (This option has thus no effect if `-t` is not used!)

             `-e <element_names>` List of comma separated constraints on elements to ignore/skip when extracting plain text contents.
             Anything within their scope is ignored, even nested elements of the type specified by `-t`.
    """

    import sys
    import os
    import argparse
    import configparser

    parser = argparse.ArgumentParser(description="Split XML into plain text and stand-off XML mark-up")
    parser.add_argument("infile", help="input XML file name (UTF-8)")
    parser.add_argument("-c", "--config", help="additional config file", type=str)
    parser.add_argument("-p", "--profile", help="config profile to use", type=str, default='DEFAULT')
    parser.add_argument("-t", "--text-elements", help="text elements to extract contents from", type=str)
    parser.add_argument("-e", "--exclude-elements", help="elements to ignore", type=str)
    parser.add_argument("-kl", "--keep-linebreaks", help="keep line breaks within text elements", action="store_true")
    args = parser.parse_args()

    # read configuration from files: first from the directory of the script, then from
    # the current working directory (files that do not exist are silently skipped)
    scriptpath = os.path.dirname(os.path.realpath(__file__))
    curpath = os.getcwd()
    profiles = configparser.ConfigParser()
    profiles.read([os.path.join(scriptpath, 'xmlanntools.ini'), os.path.join(curpath, 'xmlanntools.ini')])
    if args.config:
        # an explicitly requested configuration file must be readable
        read = profiles.read(args.config)
        if read != [args.config]:
            raise Exception("Failed reading configuration file: '{0}'".format(args.config))

    # if no profile was specified, check whether the DEFAULT profile specifies a default profile
    cur_profile = args.profile
    if cur_profile == 'DEFAULT' and profiles[cur_profile].get('profile'):
        cur_profile = profiles[cur_profile].get('profile')

    # evt. update/override currently selected profile configuration with values from command-line arguments
    # (arguments that were not given, i.e. None or False, must not override the configuration)
    overrides = {key: value for key, value in vars(args).items() if value is not None and value is not False}
    profiles.read_dict({cur_profile: overrides})
    config = profiles[cur_profile]

    if config.getboolean('keep_linebreaks') and config.get('text_elements') is None:
        # NOTE(review): although the message is worded as a warning, the script stops here with exit status 1
        sys.stderr.write("WARNING: Option '-kl/--keep-linebreaks' has no effect unless text elements are specified.\n")
        sys.exit(1)

    process(config, Path(args.infile))