Skip to content

Commit 5a74c9f

Browse files
authored
Merge pull request #328 from NASA-IMPACT/umm-c-schema-update-la
Schema update for umm-c (1.18.2 to 1.18.4)
2 parents 9287b7a + 2f64e98 commit 5a74c9f

3 files changed

Lines changed: 114 additions & 23 deletions

File tree

pyQuARC/code/checker.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
from .string_validator import StringValidator
1515
from .url_validator import UrlValidator
1616

17+
from .schema_validator import SchemaValidator
18+
from .constants import UMM_C # or however you define metadata format
19+
1720
from .constants import ECHO10_C, SCHEMA_PATHS
1821

1922

pyQuARC/code/schema_validator.py

Lines changed: 104 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,23 @@
66
from jsonschema import Draft7Validator, RefResolver
77
from lxml import etree
88
from urllib.request import pathname2url
9+
from .utils import read_json_schema_from_url
10+
from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C, UMM_G
911

10-
from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C
1112

13+
# Root of the CDN from which versioned UMM JSON schemas are downloaded.
SCHEMA_CDN_BASE = "https://cdn.earthdata.nasa.gov/umm"

# UMM-C schema versions accepted by SchemaValidator (listed newest first)
# and the version used when the caller does not pick one.
SUPPORTED_UMM_C_VERSIONS = ["v1.18.4", "v1.18.3", "v1.18.2"]
DEFAULT_UMM_C_VERSION = "v1.18.4"

# UMM-G counterparts of the two constants above.
SUPPORTED_UMM_G_VERSIONS = ["v1.6.6"]
DEFAULT_UMM_G_VERSION = "v1.6.6"

# Repository "/browse/" page URLs of the ECHO 10 XSD files, keyed by schema name.
REMOTE_XML_SCHEMAS = {
    "echo10_collection": (
        "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas"
        "/browse/schemas/10.0/Collection.xsd"
    ),
    "echo10_granule": (
        "https://git.earthdata.nasa.gov/projects/EMFD/repos/echo-schemas"
        "/browse/schemas/10.0/Granule.xsd"
    ),
}
1226

1327
class SchemaValidator:
1428
"""
@@ -21,6 +35,10 @@ def __init__(
2135
self,
2236
check_messages,
2337
metadata_format=ECHO10_C,
38+
# Add a new parameter for UMM-C version
39+
umm_c_version=DEFAULT_UMM_C_VERSION,
40+
# Add a new parameter for UMM-G version (if you want to make it flexible too)
41+
umm_g_version=DEFAULT_UMM_G_VERSION
2442
):
2543
"""
2644
Args:
@@ -29,41 +47,95 @@ def __init__(
2947
validation_paths (list of str): The path of the fields in the
3048
metadata that need to be validated. In the form
3149
['Collection/StartDate', ...].
50+
umm_c_version (str): The specific UMM-C version to use for validation (e.g., "v1.18.4").
51+
umm_g_version (str): The specific UMM-G version to use for validation (e.g., "v1.6.6").
52+
check_messages (dict): A dictionary of check messages for errors.
3253
"""
3354
self.metadata_format = metadata_format
55+
# Validate and store the UMM-C version
56+
if umm_c_version not in SUPPORTED_UMM_C_VERSIONS:
57+
raise ValueError(
58+
f"Unsupported UMM-C version: {umm_c_version}. "
59+
f"Supported versions are: {', '.join(SUPPORTED_UMM_C_VERSIONS)}"
60+
)
61+
self.umm_c_version = umm_c_version
62+
63+
# Validate and store the UMM-G version
64+
if umm_g_version not in SUPPORTED_UMM_G_VERSIONS:
65+
raise ValueError(
66+
f"Unsupported UMM-G version: {umm_g_version}. "
67+
f"Supported versions are: {', '.join(SUPPORTED_UMM_G_VERSIONS)}"
68+
)
69+
self.umm_g_version = umm_g_version
70+
3471
if metadata_format.startswith("umm-"):
3572
self.validator_func = self.run_json_validator
3673
else:
3774
self.validator_func = self.run_xml_validator
3875
self.check_messages = check_messages
3976

77+
78+
4079
def read_xml_schema(self):
    """
    Reads the XML schema for the current metadata format.

    If `REMOTE_XML_SCHEMAS` has an entry for `self.metadata_format`, the
    schema is downloaded from there first. When there is no such entry, or
    the download/parsing of the remote copy fails, the local file listed in
    `SCHEMA_PATHS` under "<metadata_format>_schema" is used instead.

    Returns:
        (lxml.etree.XMLSchema) The compiled schema

    Raises:
        Any error raised while opening or parsing the local schema file.
        Failures of the remote attempt are printed and are not raised.
    """
    from urllib.request import urlopen

    # Seconds to wait for the remote server before giving up and using
    # the local copy; without a timeout a stalled connection blocks forever.
    remote_timeout = 30

    # The XML schema file imports another schema file (MetadataCommon.xsd).
    # The import cannot be resolved when the files live somewhere other than
    # the calling script, so the XML catalog location is passed through an
    # environment variable. The catalog path must be a URL. A value that is
    # already set in the environment is left untouched.
    catalog_path = f"file:{pathname2url(str(SCHEMA_PATHS['catalog']))}"
    os.environ.setdefault("XML_CATALOG_FILES", catalog_path)

    def get_raw_schema_url(browse_url: str) -> str:
        """Convert /browse/ URL into /raw/ for direct XML download."""
        if "/browse/" in browse_url:
            return browse_url.replace("/browse/", "/raw/") + "?at=refs%2Fheads%2Fmaster"
        return browse_url

    def compile_schema(raw_bytes):
        """Parse raw XSD bytes and compile them into an XMLSchema."""
        return etree.XMLSchema(etree.parse(BytesIO(raw_bytes)))

    def read_local_schema():
        """Compile the schema file bundled with the package."""
        # Read bytes directly so the XML declaration, not the locale,
        # decides how the document is decoded.
        with open(SCHEMA_PATHS[f"{self.metadata_format}_schema"], "rb") as schema_file:
            return compile_schema(schema_file.read())

    # Select remote schema if metadata_format matches
    # NOTE(review): the keys of REMOTE_XML_SCHEMAS must equal the values used
    # for metadata_format for this branch to ever run - confirm against constants.
    schema_url = REMOTE_XML_SCHEMAS.get(self.metadata_format)
    if schema_url:
        raw_url = get_raw_schema_url(schema_url)
        print(f"Fetching schema remotely from: {raw_url}")
        try:
            # urlopen verifies the server certificate by default; verification
            # must stay on because the downloaded document is trusted as the
            # validation schema.
            with urlopen(raw_url, timeout=remote_timeout) as response:
                return compile_schema(response.read())
        except Exception as e:
            # Deliberately broad: the remote copy is best-effort, and any
            # failure (network, TLS, HTTP, malformed XSD) means "use local".
            print(f"⚠️ Remote fetch failed or unavailable for {self.metadata_format}: {e}")
            print("Falling back to local schema file...")

    return read_local_schema()
124+
60125
def read_json_schema(self):
    """
    Loads the JSON schema for the current metadata format.

    UMM-C and UMM-G schemas are downloaded from the schema CDN at the
    version stored on the instance; every other format is read from the
    local file listed in `SCHEMA_PATHS`.

    Returns:
        (dict) The parsed JSON schema
    """
    fmt = self.metadata_format

    # Work out whether this format has a versioned copy on the CDN.
    if fmt == UMM_C:
        cdn_url = f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json"
    elif fmt == UMM_G:
        cdn_url = f"{SCHEMA_CDN_BASE}/granule/{self.umm_g_version}/umm-g-json-schema.json"
    else:
        cdn_url = None

    if cdn_url is not None:
        return read_json_schema_from_url(cdn_url)

    # No CDN copy for this format: use the bundled schema file.
    local_path = SCHEMA_PATHS[f"{self.metadata_format}-json-schema"]
    with open(local_path) as schema_file:
        parsed = json.load(schema_file)
    return parsed
67139

68140
def run_json_validator(self, content_to_validate):
69141
"""
@@ -77,19 +149,28 @@ def run_json_validator(self, content_to_validate):
77149
schema_store = {}
78150

79151
if self.metadata_format == UMM_C:
80-
with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as schema_file:
81-
schema_base = json.load(schema_file)
82152

83-
# workaround to read local referenced schema file (only supports uri)
84-
schema_store = {
85-
schema_base.get("$id", "/umm-cmn-json-schema.json"): schema_base,
86-
schema_base.get("$id", "umm-cmn-json-schema.json"): schema_base,
87-
}
88153

89-
errors = {}
154+
#umm_cmn_schema_url = f"{SCHEMA_CDN_BASE}/collection/{self.umm_c_version}/umm-c-json-schema.json"
155+
# If it's *not* versioned and always the latest or a specific fixed version, adjust this URL
156+
# e.g., f"{SCHEMA_CDN_BASE}/common/umm-cmn-json-schema.json" or from SCHEMA_PATHS
90157

91-
resolver = RefResolver.from_schema(schema, store=schema_store)
158+
try:
159+
with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as common_schema_file:
160+
schema_base = json.load(common_schema_file)
161+
# 1. Add the schema using its $id (most common canonical reference)
162+
if "$id" in schema_base:
163+
schema_store[schema_base["$id"]] = schema_base
164+
165+
# 2. Add the schema using the full URL you fetched it from (if different from $id or for robustness)
166+
schema_store["/umm-cmn-json-schema.json"] = schema_base
167+
schema_store["umm-cmn-json-schema.json"] = schema_base
168+
except Exception as e:
169+
print(f"Error loading UMM Common schema from {SCHEMA_PATHS['umm-cmn-json-schema']}: {e}")
170+
print("Schema validation for UMM-C might proceed without common schema, leading to incomplete validation.")
92171

172+
errors = {}
173+
resolver = RefResolver.from_schema(schema, store=schema_store)
93174
validator = Draft7Validator(
94175
schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver
95176
)

pyQuARC/code/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ def get_date_time(dt_str):
8383
continue
8484
return None
8585

86+
def read_json_schema_from_url(url, timeout=30):
    """
    Downloads and returns a JSON schema from a given URL.

    Args:
        url (str): The URL of the JSON schema
        timeout (float or tuple): Seconds to wait for the server, passed
            straight to `requests.get`. Without it a stalled connection
            would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status
    """
    response = requests.get(url, timeout=timeout)
    # Turn 4xx/5xx answers into exceptions instead of parsing an error page.
    response.raise_for_status()
    return response.json()
8693

8794
def get_concept_type(concept_id):
8895
"""

0 commit comments

Comments
 (0)