-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathxlsx_2_xml.py
More file actions
141 lines (107 loc) · 7.57 KB
/
xlsx_2_xml.py
File metadata and controls
141 lines (107 loc) · 7.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pandas as pd
import glob
import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString
def _cell_text(value):
    """Return a worksheet cell as stripped text, or None when it is blank.

    Non-string cells (numbers, dates) are converted with ``str`` first, so a
    numeric value in a text column does not raise ``AttributeError``.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    return text or None


def xlsx_2_xml(excel_file_path, output_dir='/workspaces/def/docs/codes/DIGGS/0.1'):
    """Convert one code-list workbook into a GML dictionary XML file.

    The workbook is read from three sheets:

    * ``DictionaryName`` - columns ``DictionaryFile``, ``Description``,
      ``DictionaryName`` and ``Dictionary ID`` describe the dictionary.
    * ``Definitions`` - one ``diggs:Definition`` is emitted per row (columns
      ``ID``, ``Description``, ``Name``, ``DataType``, ``QuantityClass``,
      ``Authority``, ``Reference``); blank cells produce no element.
    * ``AssociatedElements`` - each row adds a ``diggs:Occurrence`` to the
      definition whose ``ID`` matches (columns ``ID``, ``SourceElement``,
      ``ConditionalElement``); rows whose ID matches no definition are
      ignored.

    Args:
        excel_file_path: Path of the ``.xlsx`` workbook to convert.
        output_dir: Directory that receives ``<DictionaryFile>.xml``.

    Returns:
        A status message containing the path of the file that was written.

    Raises:
        KeyError: If an expected column is missing from a sheet.
        IndexError: If a ``DictionaryName`` column has too few non-blank
            values.
    """
    gml_ns = "http://www.opengis.net/gml/3.2"
    xsi_ns = "http://www.w3.org/2001/XMLSchema-instance"
    diggs_ns = "http://diggsml.org/schemas/2.6"
    code_space = "https://diggsml.org/def/authorities.xml#DIGGS"

    def gml(local_name):
        return f"{{{gml_ns}}}{local_name}"

    def diggs(local_name):
        return f"{{{diggs_ns}}}{local_name}"

    # --- Dictionary-level metadata -------------------------------------
    header_df = pd.read_excel(excel_file_path, sheet_name='DictionaryName')

    def header_value(column, position=0):
        return str(header_df[column].dropna().iloc[position]).strip()

    dictionary_file = header_value('DictionaryFile')
    # NOTE(review): the description is taken from the SECOND non-blank cell,
    # unlike the other header fields, which use the first - confirm against
    # the workbook template that this is intentional.
    description_text = header_value('Description', 1)
    dictionary_name = header_value('DictionaryName')
    dictionary_id = header_value('Dictionary ID')

    xml_file_path = f'{output_dir}/{dictionary_file}.xml'
    dictionary_url = "https://diggsml.org/def/codes/DIGGS/0.1/" + dictionary_file + ".xml"

    definitions_df = pd.read_excel(excel_file_path, sheet_name='Definitions')
    associated_elements_df = pd.read_excel(excel_file_path, sheet_name='AssociatedElements')

    # The stylesheet depends on whether any conditional element is present.
    if associated_elements_df['ConditionalElement'].isna().all():
        stylesheet = 'codelists.xsl'
    else:
        stylesheet = 'propertylists.xsl'
    processing_instruction = (
        '<?xml-stylesheet type="text/xsl" '
        f'href="https://diggsml.org/def/stylesheets/{stylesheet}"?>\n'
    )

    # Registering prefixes makes the serializer emit gml:/xsi:/diggs: rather
    # than generated ns0/ns1 prefixes. This updates ElementTree's global map.
    for prefix, uri in (("gml", gml_ns), ("xsi", xsi_ns), ("diggs", diggs_ns)):
        ET.register_namespace(prefix, uri)

    # --- Root element and dictionary header ----------------------------
    root = ET.Element(gml('Dictionary'), attrib={
        f"{{{xsi_ns}}}schemaLocation": "http://diggsml.org/schemas/2.6 https://diggsml.org/schemas/2.6/Diggs.xsd",
        gml('id'): dictionary_id,
    })
    ET.SubElement(root, gml('description')).text = description_text
    # codeSpace is deliberately an unqualified attribute.
    root_identifier = ET.SubElement(root, gml('identifier'), attrib={'codeSpace': code_space})
    root_identifier.text = dictionary_url
    ET.SubElement(root, gml('name')).text = dictionary_name

    # --- One dictionaryEntry/Definition per Definitions row ------------
    optional_fields = (
        ('DataType', 'dataType'),
        ('QuantityClass', 'quantityClass'),
        ('Authority', 'authority'),
        ('Reference', 'reference'),
    )
    definitions_by_id = {}
    for _, row in definitions_df.iterrows():
        entry_id = str(row['ID']).strip()
        entry = ET.SubElement(root, gml('dictionaryEntry'))
        definition = ET.SubElement(entry, diggs('Definition'), attrib={gml('id'): entry_id})
        # If an ID is duplicated, occurrences attach to the first definition.
        definitions_by_id.setdefault(entry_id, definition)

        description = _cell_text(row['Description'])
        if description:
            ET.SubElement(definition, gml('description')).text = description

        name = _cell_text(row['Name'])
        if name:
            # The identifier is only written together with a name.
            identifier = ET.SubElement(definition, gml('identifier'), attrib={'codeSpace': code_space})
            identifier.text = dictionary_url + "#" + entry_id
            ET.SubElement(definition, gml('name')).text = name

        for column, tag in optional_fields:
            text = _cell_text(row[column])
            if text:
                ET.SubElement(definition, diggs(tag)).text = text

    # --- Attach occurrences to their definitions -----------------------
    occurrences_by_id = {}
    for _, row in associated_elements_df.iterrows():
        definition_id = str(row['ID']).strip()
        definition = definitions_by_id.get(definition_id)
        if definition is None:
            continue

        occurrences = occurrences_by_id.get(definition_id)
        if occurrences is None:
            occurrences = ET.SubElement(definition, diggs('occurrences'))
            occurrences_by_id[definition_id] = occurrences

        occurrence = ET.SubElement(occurrences, diggs('Occurrence'))
        source_element = _cell_text(row['SourceElement'])
        if source_element:
            ET.SubElement(occurrence, diggs('sourceElementXpath')).text = source_element
        conditional_element = _cell_text(row['ConditionalElement'])
        if conditional_element:
            ET.SubElement(occurrence, diggs('conditionalElementXpath')).text = conditional_element

    # --- Serialize ------------------------------------------------------
    pretty_xml = parseString(ET.tostring(root, 'utf-8')).toprettyxml(indent=" ")
    # minidom emits its own declaration; drop it so the declaration written
    # below (with an explicit encoding) comes first, followed by the
    # stylesheet processing instruction.
    pretty_xml = pretty_xml.replace('<?xml version="1.0" ?>', '', 1)
    final_xml_str = '<?xml version="1.0" encoding="UTF-8"?>\n' + processing_instruction + pretty_xml

    with open(xml_file_path, 'w', encoding='utf-8') as xml_file:
        xml_file.write(final_xml_str)

    return f"XML file with correct order of declarations created at: {xml_file_path}"
# Convert every .xlsx workbook in the templates directory and report each
# generated XML file on stdout. Edit the pattern below to point elsewhere.
workbook_pattern = '/workspaces/def/Codelist Excel Files and Conversion Templates to XML/*.xlsx'
for workbook_path in glob.glob(workbook_pattern):
    status_message = xlsx_2_xml(workbook_path)
    print(status_message)