Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyQuARC/code/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .schema_validator import SchemaValidator
from .constants import UMM_C # or however you define metadata format

from .constants import ECHO10_C, SCHEMA_PATHS
from .constants import ECHO10_C, SCHEMA_PATHS, ZENODO


class Checker:
Expand Down Expand Up @@ -295,7 +295,7 @@ def _xml_postprocessor(_, key, value):

kwargs = {}
parser = json.loads
if not self.metadata_format.startswith("umm-"):
if not self.metadata_format.startswith("umm-") and self.metadata_format != ZENODO:
parser = parse
kwargs = {"postprocessor": _xml_postprocessor}
json_metadata = parser(metadata_content, **kwargs)
Expand Down
3 changes: 2 additions & 1 deletion pyQuARC/code/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
UMM_C = "umm-c"
UMM_G = "umm-g"
ECHO10_G = "echo-g"
ZENODO = "zenodo"

SUPPORTED_FORMATS = [DIF, ECHO10_C, UMM_C, UMM_G, ECHO10_G]
SUPPORTED_FORMATS = [DIF, ECHO10_C, UMM_C, UMM_G, ECHO10_G, ZENODO]

# Changed to os instead of pathlib
# https://github.com/aio-libs/aiohttp/issues/3977
Expand Down
60 changes: 60 additions & 0 deletions pyQuARC/code/custom_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,66 @@ def count_check(count, values, key):
num_items = len(items)
return {"valid": int(count) == num_items, "value": (count, num_items)}

@staticmethod
def fields_not_equal_check(field_a, field_b):
    """
    Checks that two fields do not have the same value

    Both values are converted to strings, stripped of surrounding
    whitespace and case-folded before being compared, so values that
    differ only in padding or letter case count as equal.

    Args:
        field_a: First field value
        field_b: Second field value

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    result = {"valid": True, "value": (field_a, field_b)}
    # A missing field cannot duplicate the other one, so the check passes.
    if field_a is None or field_b is None:
        return result
    # casefold() rather than lower(): lower() leaves some case-equivalent
    # Unicode spellings distinct (e.g. "Straße" vs "STRASSE").
    normalized_a = str(field_a).strip().casefold()
    normalized_b = str(field_b).strip().casefold()
    result["valid"] = normalized_a != normalized_b
    return result

@staticmethod
def required_if_check(field_value, condition_value, expected_value):
    """
    Checks that `field_value` is present when `condition_value` equals `expected_value`

    The condition is compared as whitespace-stripped strings (case-sensitive).
    When the condition is absent or does not match, the check passes.

    Args:
        field_value: The field that is conditionally required
        condition_value: The field whose value triggers the requirement
        expected_value (str): The value that triggers the requirement

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    passed = {"valid": True, "value": field_value}
    if condition_value is None:
        return passed
    if str(condition_value).strip() != str(expected_value).strip():
        return passed

    # The requirement is triggered: decide whether the field is really there.
    if field_value is None:
        present = False
    elif isinstance(field_value, (list, tuple, set, frozenset, dict)):
        # An empty container stringifies to "[]" / "{}", which is not blank,
        # so it has to be tested by truthiness rather than through str().
        present = bool(field_value)
    else:
        present = str(field_value).strip() != ""
    return {"valid": present, "value": field_value}

@staticmethod
def min_items_check(field_value, min_count):
    """
    Checks that a list field has at least `min_count` items

    Counting rules:
        - `None` or a blank string counts as 0 items
        - a list, tuple or set counts as its length
        - any other value (e.g. a single string or object) counts as 1 item

    Args:
        field_value: The field value (expected to be a list)
        min_count (int): Minimum number of items required

    Returns:
        (dict) An object with the validity of the check and the item count
    """
    if field_value is None:
        count = 0
    elif isinstance(field_value, (list, tuple, set, frozenset)):
        count = len(field_value)
    elif isinstance(field_value, str) and not field_value.strip():
        # A blank string carries no item; counting it as one would let an
        # empty field satisfy a minimum of 1.
        count = 0
    else:
        # A lone scalar/object stands for a single item.
        count = 1
    return {"valid": count >= int(min_count), "value": count}

@staticmethod
def opendap_link_check(related_urls, key, extra=None):
"""
Expand Down
4 changes: 3 additions & 1 deletion pyQuARC/code/schema_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from lxml import etree
from urllib.request import pathname2url
from .utils import read_json_schema_from_url
from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C, UMM_G
from .constants import ECHO10_C, SCHEMA_PATHS, UMM_C, UMM_G, ZENODO


SUPPORTED_UMM_C_VERSIONS = ["v1.18.4"]
Expand Down Expand Up @@ -70,6 +70,8 @@ def __init__(

if metadata_format.startswith("umm-"):
self.validator_func = self.run_json_validator
elif metadata_format == ZENODO:
self.validator_func = lambda _: {} # no structural schema for Zenodo
else:
self.validator_func = self.run_xml_validator
self.check_messages = check_messages
Expand Down
57 changes: 57 additions & 0 deletions pyQuARC/code/string_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,63 @@ def validate_granule_platform_against_collection(
)
return {"valid": validity, "value": platform_shortname}

@staticmethod
@if_arg
def forbidden_values_check(value, forbidden_list):
    """
    Checks that `value` does not contain any of the forbidden strings (case-insensitive)

    Matching is by substring: a forbidden term found anywhere inside the
    value fails the check.

    Args:
        value (str): The field value to check
        forbidden_list (list): List of forbidden strings; a single string
            is treated as a one-element list

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    # NOTE(review): `if_arg` is defined elsewhere in the project; presumably
    # it short-circuits when `value` is missing -- confirm.
    if isinstance(forbidden_list, str):
        # Iterating a bare string would test every character separately.
        forbidden_list = [forbidden_list]
    haystack = str(value).strip().casefold()
    for forbidden in forbidden_list or ():
        needle = str(forbidden).casefold()
        # A blank term is a substring of every string and would flag
        # every value, so it is skipped.
        if needle.strip() and needle in haystack:
            return {"valid": False, "value": value}
    return {"valid": True, "value": value}

@staticmethod
@if_arg
def min_length_check(value, min_length):
    """
    Checks that `value`, rendered as a string, has at least `min_length` characters

    Args:
        value (str): The field value
        min_length (int): Minimum required length

    Returns:
        (dict) An object with the validity of the check and the measured length
    """
    # The length is measured on the string form of the value, whitespace included.
    measured = len(str(value))
    threshold = int(min_length)
    long_enough = measured >= threshold
    return {"valid": long_enough, "value": measured}

@staticmethod
@if_arg
def regex_check(value, pattern):
    """
    Checks that the string form of `value` matches the regex `pattern`

    The pattern is applied with `re.match`, so it must match from the
    start of the string; it need not reach the end unless the pattern
    itself is anchored with `$`.

    Args:
        value (str): The field value
        pattern (str): The regex pattern

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    import re

    text = str(value)
    # re.match returns None on failure; a Match object is always truthy.
    is_match = re.match(pattern, text) is not None
    return {"valid": is_match, "value": value}

@if_arg
def validate_granule_data_format_against_collection(
granule_data_format, collection_shortname=None, version=None, dataset_id=None
Expand Down
4 changes: 2 additions & 2 deletions pyQuARC/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from code.utils import get_concept_type, get_headers
else:
from .code.checker import Checker
from .code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS
from .code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS, ZENODO
from .code.downloader import Downloader
from .code.utils import get_cmr_url, is_valid_cmr_url
from .code.utils import get_concept_type, get_headers
Expand Down Expand Up @@ -389,7 +389,7 @@ def display_results(self):
action="store",
nargs="?",
type=str,
help=f"The metadata format. Choices are: echo-c (echo10 collection), echo-g (echo10 granule), dif10 (dif10 collection), umm-c (umm-json collection), umm-g (umm-json granules)",
help=f"The metadata format. Choices are: echo-c (echo10 collection), echo-g (echo10 granule), dif10 (dif10 collection), umm-c (umm-json collection), umm-g (umm-json granules), zenodo (Zenodo/InvenioRDM record)",
)
parser.add_argument(
"--cmr_host",
Expand Down
144 changes: 144 additions & 0 deletions pyQuARC/schemas/check_messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -1110,5 +1110,149 @@
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs"
},
"remediation": "Recommend providing an OPeNDAP in the granule's Online Resources or Related URLs fields for enhanced data accessibility."
},
"title_min_length_check": {
"failure": "The title is too short (current length: `{}`). A minimum of 10 characters is recommended.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title"
},
"remediation": "Provide a more descriptive title with at least 10 characters."
},
"title_forbidden_values_check": {
"failure": "The title `{}` contains a generic or placeholder term (e.g. 'test', 'sample', 'untitled', 'dataset').",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title"
},
"remediation": "Replace the title with a meaningful, descriptive name for the dataset."
},
"title_description_not_same_check": {
"failure": "The title and description have identical content.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Abstract"
},
"remediation": "Provide a distinct description that elaborates on the dataset beyond what the title conveys."
},
"description_min_length_check": {
"failure": "The description is too short (current length: `{}`). A minimum of 100 characters is recommended.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Abstract"
},
"remediation": "Provide a more comprehensive description of at least 100 characters."
},
"long_description_recommended_check": {
"failure": "The description is less than 250 characters (current length: `{}`). A longer description improves discoverability.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Abstract"
},
"remediation": "Consider expanding the description to at least 250 characters, similar to a journal abstract."
},
"version_presence_check": {
"failure": "No version information is provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Version"
},
"remediation": "Provide a version identifier for the dataset (e.g. '1', '1.0', '001')."
},
"access_constraints_presence_check": {
"failure": "No access constraints information is provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Access+Constraints"
},
"remediation": "Provide an access constraints value such as 'Public', 'Restricted', 'Internal', or 'Embargoed'."
},
"access_constraints_vocab_check": {
"failure": "The access constraints value `{}` is not from the controlled vocabulary.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Access+Constraints"
},
"remediation": "Select an access constraints value from: ['Public', 'Restricted', 'Internal', 'Embargoed']."
},
"use_constraints_conditional_check": {
"failure": "Use constraints are required when access is set to 'Restricted' but none were provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Use+Constraints"
},
"remediation": "Provide a use constraints description explaining the restrictions on this dataset."
},
"collection_platform_presence_check": {
"failure": "No platform information is provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Platform"
},
"remediation": "Provide at least one platform short name. Use a valid GCMD platform keyword where possible."
},
"platform_other_description_check": {
"failure": "Platform is set to 'Other' but no descriptive long name is provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Platform"
},
"remediation": "When platform short name is 'Other', provide a descriptive long name to identify the platform."
},
"collection_instrument_presence_check": {
"failure": "No instrument information is provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Instrument"
},
"remediation": "Provide at least one instrument short name. Use a valid GCMD instrument keyword where possible."
},
"instrument_other_description_check": {
"failure": "Instrument is set to 'Other' but no descriptive long name is provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Instrument"
},
"remediation": "When instrument short name is 'Other', provide a descriptive long name to identify the instrument."
},
"spatial_reference_type_vocab_check": {
"failure": "The spatial reference type `{}` is not from the controlled vocabulary.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent"
},
"remediation": "Select a spatial reference type from: ['Geographic', 'Projected', 'UTM', 'Polar', 'Custom']."
},
"spatial_resolution_presence_check": {
"failure": "No spatial resolution information is provided.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent"
},
"remediation": "Provide spatial resolution or bounding coordinate information to help users understand the spatial coverage."
},
"spatial_resolution_units_check": {
"failure": "The spatial resolution value `{}` does not match the expected format (e.g. '30 m', '1.5 km', '0.25 deg').",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent"
},
"remediation": "Provide the spatial resolution as a number followed by a unit: 'm', 'km', or 'deg' (e.g. '30 m', '1 km')."
},
"temporal_resolution_vocab_check": {
"failure": "The temporal resolution value `{}` is not from the controlled vocabulary.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Temporal+Extent"
},
"remediation": "Select a temporal resolution from: ['Hourly', 'Daily', 'Weekly', 'Monthly', 'Yearly']."
},
"science_keywords_min_count_check": {
"failure": "Only `{}` science keyword(s) provided. At least 3 are recommended.",
"help": {
"message": "",
"url": "https://wiki.earthdata.nasa.gov/display/CMR/Science+Keywords"
},
"remediation": "Provide at least 3 science keywords from the GCMD keyword list to improve discoverability."
}
}
36 changes: 36 additions & 0 deletions pyQuARC/schemas/checks.json
Original file line number Diff line number Diff line change
Expand Up @@ -318,5 +318,41 @@
"data_type": "custom",
"check_function": "opendap_link_check",
"available": true
},
"forbidden_values_check": {
"data_type": "string",
"check_function": "forbidden_values_check",
"description": "Checks that a field value does not contain any forbidden strings.",
"available": true
},
"min_length_check": {
"data_type": "string",
"check_function": "min_length_check",
"description": "Checks that a field value meets a minimum character length.",
"available": true
},
"regex_check": {
"data_type": "string",
"check_function": "regex_check",
"description": "Checks that a field value matches a given regex pattern.",
"available": true
},
"fields_not_equal_check": {
"data_type": "custom",
"check_function": "fields_not_equal_check",
"description": "Checks that two fields do not have the same value.",
"available": true
},
"required_if_check": {
"data_type": "custom",
"check_function": "required_if_check",
"description": "Checks that a field is present when another field equals a specific value.",
"available": true
},
"min_items_check": {
"data_type": "custom",
"check_function": "min_items_check",
"description": "Checks that a list field contains at least a minimum number of items.",
"available": true
}
}
Loading
Loading