11import xml .etree .ElementTree as ET
22from io import BytesIO
33
4- # Prototype code, change #
5-
64def parser (xml_bytes , mapping ):
75 rows = []
86
@@ -21,16 +19,13 @@ def parser(xml_bytes, mapping):
2119 _stack .pop ()
2220
2321 for table_name , table_mapping in mapping .items ():
24- # Filter mapping to only paths present in this file
2522 table_mapping = {k : v for k , v in table_mapping .items () if k in real_paths }
2623 if not table_mapping :
2724 continue
2825
29- # Split mapping into text paths and attribute paths
3026 attr_mapping = {k : v for k , v in table_mapping .items () if "/@" in k }
3127 text_mapping = {k : v for k , v in table_mapping .items () if "/@" not in k }
3228
33- # Segment-wise common prefix (not character-wise)
3429 base_paths = [k .rsplit ("/@" , 1 )[0 ] if "/@" in k else k for k in table_mapping .keys ()]
3530 split_paths = [p .strip ("/" ).split ("/" ) for p in base_paths ]
3631 prefix_segments = []
@@ -46,6 +41,14 @@ def parser(xml_bytes, mapping):
4641 empty_row = {col : None for col in table_mapping .values ()}
4742 current_path = []
4843 current_row = empty_row .copy ()
44+ # Track accumulated values for repeating fields
45+ accumulator = {col : [] for col in table_mapping .values ()}
46+
47+ def flush_row ():
48+ # Merge accumulator into current_row as pipe-delimited strings
49+ for col , values in accumulator .items ():
50+ if values :
51+ current_row [col ] = "|" .join (values )
4952
5053 for event , elem in ET .iterparse (BytesIO (xml_bytes ), events = ("start" , "end" )):
5154 tag = elem .tag .split ("}" )[- 1 ] if "}" in elem .tag else elem .tag .split (":" )[- 1 ]
@@ -57,24 +60,31 @@ def parser(xml_bytes, mapping):
5760
5861 # Handle text content
5962 if path in text_mapping :
60- current_row [text_mapping [path ]] = elem .text
63+ col = text_mapping [path ]
64+ if elem .text and elem .text .strip ():
65+ accumulator [col ].append (elem .text .strip ())
6166
6267 # Handle attributes
6368 for attr_name , attr_value in elem .attrib .items ():
6469 attr_path = f"{ path } /@{ attr_name } "
6570 if attr_path in attr_mapping :
66- current_row [attr_mapping [attr_path ]] = attr_value
71+ col = attr_mapping [attr_path ]
72+ accumulator [col ].append (attr_value )
6773
6874 if path == row_boundary :
75+ flush_row ()
6976 current_row ["_table" ] = table_name
70- rows .append (current_row )
77+ if any (v is not None and str (v ).strip () for k , v in current_row .items () if k != "_table" ):
78+ rows .append (current_row )
7179 current_row = empty_row .copy ()
80+ accumulator = {col : [] for col in table_mapping .values ()}
7281
7382 current_path .pop ()
7483
7584 # Flush if boundary was the root (single-row tables like doc header)
76- if any (v is not None for v in current_row .values ()):
85+ flush_row ()
86+ if any (v is not None and str (v ).strip () for v in current_row .values ()):
7787 current_row ["_table" ] = table_name
7888 rows .append (current_row )
7989
80- return rows
90+ return rows
0 commit comments