-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathxml2standoff
More file actions
executable file
·441 lines (379 loc) · 18.8 KB
/
xml2standoff
File metadata and controls
executable file
·441 lines (379 loc) · 18.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2024-2025 Pavel Vondřička <pavel.vondricka@ff.cuni.cz>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import html
import re
import json
from io import StringIO
from pathlib import Path
def scan_to(text, ptr, mark):
    """
    Scan text up to the given mark and return preceding contents and the mark

    Parameters:
    -----------
    text : str
        Text string
    ptr : integer
        Pointer to the current character position in the text (used as scan start position)
    mark: str
        (Sub)string to seek until

    Returns:
    --------
    segment : str
        Text segment (str) from start (original ptr position) to the start of the searched *mark*
        (or to the end of the text if the mark has not been found)
    mark : str
        The found mark (str) or empty string if not found until EOF
    ptr : integer
        Pointer to the last position (the end of the found *mark*, or the end of the text
        if the mark has not been found)
    """
    start = ptr
    # let the built-in substring search do the scanning instead of slicing
    # the text once per character position
    found = text.find(mark, start)
    if found < 0:
        # mark not found: everything up to EOF is the segment; a start pointer
        # already beyond EOF is returned unchanged
        end = max(start, len(text))
        return text[start:end], '', end
    return text[start:found], mark, found + len(mark)
def text_continues(text, ptr, str):
    """
    Check whether the text at position `ptr` starts with the given string

    Parameters:
    -----------
    text : str
        Text to look into
    ptr : integer
        Character position in the text where the comparison starts
    str : str
        String expected at that position

    Returns:
    --------
    result : boolean
        True if the text at `ptr` continues with `str`, False otherwise
    """
    # cut out a window of the same length as the tested string and compare it
    window_end = ptr + len(str)
    return str == text[ptr:window_end]
def re_partition(pattern, string):
    """
    Regular expression counterpart of `str.partition`

    Parameters:
    -----------
    pattern : str
        Regular expression to search for
    string : str
        String to be split

    Returns:
    --------
    Tuple of three strings: the part before the first match, the matched text
    and the rest after the match. Without any match the whole string is
    returned as the first item, followed by two empty strings.
    """
    found = re.search(pattern, string)
    if found is None:
        return string, '', ''
    match_start, match_end = found.span()
    return string[:match_start], found.group(0), string[match_end:]
def parse_constraints(description):
    """
    Parse additional constraints on text elements (or excluded elements)

    Parameters:
    -----------
    description: str
        string representation of the constraints

    Returns:
    --------
    constraints: dict
        constraints on the name, attributes and/or parents (recursive) of the element:
        - name (str) : requested element name
        - attrs (list of dict) : attribute constraints, each with the keys `name` and `value`
        - parent (dict or None) : constraints on the parent element (same structure),
          with the additional key `direct` (boolean) telling whether it must be
          the direct parent (`/`) or may be any ancestor (`//`)

    Raises:
    -------
    AssertionError
        if an attribute constraint is not terminated by a closing bracket

    Supported format example:
    - anyparent//direct_parent/element_name[@attr1="value1"][@attr2="value2"]
    (all attribute constraints must be satisfied (AND logic))
    """
    # split off the last path step: the element itself versus the path of its parents
    if '/' in description:
        parents, element = description.rsplit('/', 1)
    else:
        parents, element = '', description
    # a slash left at the end of the parent path means the step was written as '//'
    direct = not parents.endswith('/')
    if not direct:
        parents = parents[:-1]
    name, *attr_constraints = element.split('[')
    attrs = []
    for constr in attr_constraints:
        # explicit raise (instead of an `assert` statement) so that the check is
        # not stripped under `python -O`; the exception type stays the same
        if not constr.endswith(']'):
            raise AssertionError("Missing end bracket in constraint: [" + constr)
        # split on the first '=' only: the attribute value itself may contain '='
        aname, avalue = constr[:-1].split('=', 1)
        attrs.append({
            'name': aname.lstrip('@ ').rstrip(),
            'value': avalue.strip("\" ")
        })
    parent = parse_constraints(parents) if parents else None
    if parent:
        parent['direct'] = direct
    return {
        'name': name,
        'parent': parent,
        'attrs': attrs
    }
def matches_constraint(constraint, heap):
    """
    Test whether the current element (last on the heap) matches the given constraint
    (a dict with the keys `name`, `attrs` and `parent`, as built by `parse_constraints`)

    Parameters:
    -----------
    constraint : dict
        Constraint on the element name (`*` matches any name), its attributes
        and (recursively) its parents
    heap : list of dict
        Open elements from the outermost one to the current one; each item
        provides the keys `name` and `contents` (raw attribute string of the tag)

    Returns:
    --------
    result : boolean
        True if the last element on the heap satisfies the constraint;
        always False for an empty heap
    """
    if not heap:
        return False
    current = heap[-1]
    # compare current element name with name requested
    # (the name of an empty element may carry the trailing slash, e.g. `br/`)
    wanted = constraint['name']
    if wanted != '*' and current['name'] not in (wanted, wanted + '/'):
        return False
    # test for attribute constraints
    for attr_constr in constraint['attrs']:
        # Name and value are literal text, not regular expressions: escape
        # them so that characters like `.`, `(` or `+` neither over-match
        # nor break the compilation of the pattern.
        # NOTE(review): only double-quoted attribute values are recognised, and
        # the value is compared in its `html.escape(..., quote=True)` form -
        # confirm this matches how the source XML escapes quotes/apostrophes.
        pattern = (r"(?<!\S)" + re.escape(attr_constr['name']) + r"\s*=\s*\""
                   + re.escape(html.escape(attr_constr['value'], quote=True)) + "\"")
        if not re.search(pattern, current['contents']):
            return False
    # test for parent constraints
    parent_constr = constraint['parent']
    if not parent_constr:
        return True
    # try the direct parent first, then (unless a direct parent is required)
    # walk further up through the remaining ancestors
    ptr = -1
    while True:
        if matches_constraint(parent_constr, heap[:ptr]):
            return True
        ptr -= 1
        if parent_constr['direct'] or abs(ptr) >= len(heap):
            return False
def matches_any_constraints(constraints, heap):
    """
    Tell whether the current element (the last one on the heap) satisfies
    at least one constraint from the given list

    Evaluation stops at the first matching constraint; an empty list of
    constraints never matches.
    """
    return any(matches_constraint(candidate, heap) for candidate in constraints)
def split_xml(text, txtelements=None, excludes=None, keep_linebreaks=False):
    """
    Split XML into plaintext and stand-off XML markup description

    Parameters:
    -----------
    text : str
        XML input as text string
    txtelements : list of element constraints
        list of constraints on elements containing text to be extracted
        (turns on content aware extraction if not None)
    excludes : list of element constraints
        list of constraints on elements to be skipped
    keep_linebreaks : boolean
        keep linebreaks within text elements or remove them?
        (has no effect unless `txtelements` is given)

    Returns:
    --------
    plaintext : str
        Extracted plaintext contents (str) stripped of all XML mark-up
    markup : list of dict
        XML markup description in the form of list of XML element descriptions:
        - name (str) : name of the XML element
        - contents (str) : other contents of the XML element tag (attributes with values, XML comment contents etc.)
        - start (integer) : character position in the plain text where the XML element's span starts
        - end (integer) : character position in the plain text where the XML element's span ends
        - level (integer) : level of nesting (depth) in the XML tree
        - order (integer) : order of element in the XML tree
        - text (str): (optional) text contents at the beginning of the element (if not a text element, i.e. it has not been extracted into plain text file)
        - tail (str): (optional) text contents at the end of the element (have not been extracted into plain text file)
        - skip (int): (optional) skip following plain text contents at the start of the element (provisional line break)
        - cut (int): (optional) cut plain text contents from the end of the element (provisional line break)
        - csep (string): content separator (whitespace) between element name and other contents (attributes) if more than just the common single space character

    Raises:
    -------
    Exception
        on an end tag which does not match the last open element, or which
        has no open element to match at all
    """
    # buffer for the extracted plaintext
    plaintext = StringIO()
    # character counter for the extracted plain text contents
    charcnt = 0
    # heap of open element spans
    heap = []
    # buffer for the XML mark-up description
    markup = []
    # XML element level and order counters
    slevel = 0
    order = 0
    # pointer to the current character position in the source XML string
    ptr = 0
    # content aware extraction:
    # depth within text elements (may be nested)
    txtel_depth = 0
    # depth within excluded elements (may be nested)
    exclude_depth = 0
    # tuple with a current mark-up element description and its attribute to keep the text which shall not be extracted
    text_store = None
    # shall we remove line breaks within text elements?
    normalize_linebreaks = txtelements is not None and not keep_linebreaks
    # count lines
    line_count = 1
    lbre = re.compile(r"[\n\r]+")
    while True:
        # scan to the beginning of next XML mark-up and add the contents to the plain text buffer
        (contents, mark, ptr) = scan_to(text, ptr, "<")
        contents = html.unescape(contents)
        line_count += contents.count("\n")
        if text_store is None:
            # default: extract text contents into the output plain text file
            if normalize_linebreaks:
                contents = lbre.sub(" ", contents)
            charcnt += len(contents)
            plaintext.write(contents)
        else:
            # mark-up element description to store/keep contents within
            text_store[0][text_store[1]] = contents
            text_store = None
        if not mark:
            # end of XML contents: exit
            break
        # treat XML elements, comments and processing instructions and their tag contents
        if text_continues(text, ptr, '?'):
            # processing instruction
            (element, mark, ptr) = scan_to(text, ptr, "?>")
            line_count += element.count("\n")
            (ename, csep, econtents) = re_partition(r"(\s+)", element)
            econtents += '?'
        elif text_continues(text, ptr, '!--'):
            # XML comment
            (element, mark, ptr) = scan_to(text, ptr, "-->")
            line_count += element.count("\n")
            ename = '!--'
            # comments have no name/contents separator: reset it explicitly, otherwise
            # the separator of the previous tag would leak into the comment description
            # (or the name would be undefined if the comment is the very first mark-up)
            csep = ''
            econtents = element[3:] + '--'
        else:
            # XML element
            (element, mark, ptr) = scan_to(text, ptr, ">")
            line_count += element.count("\n")
            (ename, csep, econtents) = re_partition(r"(\s+)", element)
        # store the mark-up data into the mark-up buffer
        if ename.startswith("/"):
            # end tag: add ending position and append the element span description into the mark-up buffer
            ename = ename.lstrip("/")
            if not heap:
                # report a stray end tag as a parsing error instead of a bare IndexError
                raise Exception("XML parsing error at line {0}: Unexpected end tag '{1}' without a matching start tag."
                                .format(line_count, ename))
            last = heap.pop()
            slevel -= 1
            if ename != last['name']:
                raise Exception("XML parsing error at line {0}: Element '{1}' not closed before end of element '{2}'."
                                .format(line_count, last['name'], ename))
            last['end'] = charcnt
            markup.append(last)
            # is the ending element a text element or an excluded element?
            if txtelements is not None:
                if matches_any_constraints(txtelements, heap + [last]):
                    txtel_depth -= 1
                    if exclude_depth == 0:
                        # insert provisional line break into the plain text that will be cut in the reverse conversion (unless within excluded scope)
                        charcnt += 1
                        plaintext.write("\n")
                        last['cut'] = 1
                        last['end'] = charcnt
                # outside the scope of text elements: do not extract following text anymore, but keep it within the mark-up description
                if txtel_depth == 0:
                    text_store = (last, 'tail')
            if excludes is not None:
                if matches_any_constraints(excludes, heap + [last]):
                    exclude_depth -= 1
                # still within the scope of excluded elements: no text extraction
                if exclude_depth > 0:
                    text_store = (last, 'tail')
        else:
            # start tag, empty element, comment or processing instruction
            order += 1
            if ename.startswith("?") or ename.startswith("!") or ename.endswith("/") or econtents.endswith("/"):
                # zero-span: empty tags, comments and proc. instructions have no end tag and can be inserted into the buffer immediately
                eldesc = {"name": ename, "contents": econtents,
                          "start": charcnt, "end": charcnt, "level": slevel, 'order': order}
                if csep and csep != ' ':
                    eldesc['csep'] = csep
                # should the empty element add a provisional line break?
                if txtelements is not None and exclude_depth == 0 and ename.endswith("/") and matches_any_constraints(txtelements, heap + [eldesc]):
                    charcnt += 1
                    plaintext.write("\n")
                    eldesc['skip'] = 1
                markup.append(eldesc)
                if (txtelements is not None and txtel_depth == 0) or (excludes is not None and exclude_depth > 0):
                    text_store = (eldesc, 'tail')
            else:
                # pair start tags: add description to the heap to be inserted into the buffer when a matching end tag is found
                eldesc = {"name": ename, "contents": econtents,
                          "start": charcnt, "level": slevel, 'order': order}
                if csep and csep != ' ':
                    eldesc['csep'] = csep
                # if included elements are defined, check whether we shall extract the text contents or store/keep them within the mark-up description
                if txtelements is not None:
                    if matches_any_constraints(txtelements, heap + [eldesc]):
                        txtel_depth += 1
                        text_store = None
                        if exclude_depth == 0:
                            # insert provisional line break into the plain text that will be skipped in the reverse conversion
                            charcnt += 1
                            plaintext.write("\n")
                            eldesc['skip'] = 1
                    elif txtel_depth == 0:
                        text_store = (eldesc, 'text')
                # if excluded elements are defined, keep text contents within their description and do not extract them
                if excludes is not None:
                    if matches_any_constraints(excludes, heap + [eldesc]):
                        exclude_depth += 1
                    if exclude_depth > 0:
                        text_store = (eldesc, 'text')
                heap.append(eldesc)
                slevel += 1
    return plaintext.getvalue(), markup
def process(config, xmlin, txtout=None, jsonout=None):
    """
    Convert one XML file into a plain text file and a JSON stand-off mark-up description

    Parameters:
    -----------
    config : mapping with `get` and `getboolean` methods (e.g. a `configparser` section)
        Configuration; the options `text_elements`, `exclude_elements`
        (comma and/or whitespace separated lists of element constraints)
        and `keep_linebreaks` (boolean) are read
    xmlin : Path or str
        Input XML file (UTF-8)
    txtout : Path or str
        (optional) output plain text file; defaults to the input file name with the suffix `.txt`
    jsonout : Path or str
        (optional) output JSON file; defaults to the input file name with the suffix `.json`
    """
    # accept plain strings as well as Path objects
    xmlin = Path(xmlin)
    txtout = Path(txtout) if txtout else xmlin.with_suffix('.txt')
    jsonout = Path(jsonout) if jsonout else xmlin.with_suffix('.json')
    # lists of constraints may be separated by commas and/or whitespace
    text_elements = config.get('text_elements')
    exclude_elements = config.get('exclude_elements')
    includes = [parse_constraints(i) for i in text_elements.replace(',', ' ').split()] if text_elements else None
    excludes = [parse_constraints(i) for i in exclude_elements.replace(',', ' ').split()] if exclude_elements else None
    # read the whole input file
    with xmlin.open(encoding='utf-8') as infile:
        text = infile.read()
    # generate separated plain text contents and mark-up description
    plaintext, markup = split_xml(text, includes, excludes, config.getboolean('keep_linebreaks'))
    # write out plain text contents (the `with` block closes the file itself)
    with txtout.open('w', encoding='utf-8') as txtfile:
        txtfile.write(plaintext)
    # write out XML mark-up description as JSON
    with jsonout.open('w', encoding='utf-8') as jsonfile:
        json.dump(markup, jsonfile, indent=0)
if __name__ == '__main__':
    """
    Split XML into plain text and stand-off XML mark-up description
    ===============================================================
    Input: input XML file name (UTF-8 encoding)
    (Only XML elements (with attributes), comments and processing instructions are treated. Anything else is ignored.)
    Output: creates two new files with the same base name as the input file, but with the extension `.txt` and `.json`
    Options: `-c <filename>` Read additional configuration. By default, the file 'xmlanntools.ini' is
    searched and loaded both from the directory with the scripts and the current working dir.
    Later configuration files override previous settings. Command-line arguments override all.
    `-p <profile>` Use 'profile' section from the configuration. If not provided, the
    section 'DEFAULT' will be used as fallback or the 'profile' specified in this section.
    `-t <text_elements>` List of comma separated constraints (see documentation for details) on basic text elements
    (e.g. paragraph level elements). Turns on content aware extraction where only text contents from the specified elements
    is extracted into the plain text output. Provisional line breaks are added to the plain text output between the text
    fragments, which will be removed later by `standoff2xml`. The text elements may also be nested within each other.
    `-kl` Keep line breaks within the specified text elements. By default, line breaks within the text elements are
    (irreversibly) converted to spaces. (This option has thus no effect if `-t` is not used!)
    `-e <element_names>` List of comma separated constraints on elements to ignore/skip when extracting plain text contents.
    Anything within their scope is ignored, even nested elements of the type specified by `-t`.
    """
    import sys
    import os
    import argparse
    import configparser
    parser = argparse.ArgumentParser(description="Split XML into plain text and stand-off XML mark-up")
    parser.add_argument("infile", help="input XML file name (UTF-8)")
    parser.add_argument("-c", "--config", help="additional config file", type=str)
    parser.add_argument("-p", "--profile", help="config profile to use", type=str, default='DEFAULT')
    parser.add_argument("-t", "--text-elements", help="text elements to extract contents from", type=str)
    parser.add_argument("-e", "--exclude-elements", help="elements to ignore", type=str)
    parser.add_argument("-kl", "--keep-linebreaks", help="keep line breaks within text elements", action="store_true")
    args = parser.parse_args()
    # read configuration from files: first next to the script, then in the current
    # working directory (later files override earlier ones, missing files are skipped)
    scriptpath = os.path.dirname(os.path.realpath(__file__))
    curpath = os.getcwd()
    profiles = configparser.ConfigParser()
    profiles.read([os.path.join(scriptpath, 'xmlanntools.ini'), os.path.join(curpath, 'xmlanntools.ini')])
    if args.config:
        # an explicitly requested configuration file must be readable
        read = profiles.read(args.config)
        if read != [args.config]:
            raise Exception("Failed reading configuration file: '{0}'".format(args.config))
    # if no profile was specified, check whether the DEFAULT profile specifies a default profile
    cur_profile = args.profile
    if cur_profile == 'DEFAULT' and profiles[cur_profile].get('profile'):
        cur_profile = profiles[cur_profile].get('profile')
    # evt. update/override currently selected profile configuration with values from command-line arguments
    # (options not given on the command line are None or False and must not override the configuration)
    profiles.read_dict({cur_profile: {k: v for k, v in vars(args).items() if v is not None and v is not False}})
    config = profiles[cur_profile]
    if config.getboolean('keep_linebreaks') and config.get('text_elements') is None:
        sys.stderr.write("WARNING: Option '-kl/--keep-linebreaks' has no effect unless text elements are specified.\n")
        # `sys.exit` instead of the `exit` helper, which is only injected by the `site` module
        sys.exit(1)
    process(config, Path(args.infile))