@staticmethod
def fields_not_equal_check(field_a, field_b):
    """
    Checks that two fields do not have the same value

    Both values are converted to strings and compared after stripping
    surrounding whitespace and lower-casing.

    Args:
        field_a: First field value
        field_b: Second field value

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    # A missing field cannot duplicate the other one, so the check passes.
    if field_a is None or field_b is None:
        return {"valid": True, "value": (field_a, field_b)}
    normalized_a = str(field_a).strip().lower()
    normalized_b = str(field_b).strip().lower()
    return {"valid": normalized_a != normalized_b, "value": (field_a, field_b)}


@staticmethod
def required_if_check(field_value, condition_value, expected_value):
    """
    Checks that `field_value` is present when `condition_value` equals `expected_value`

    The trigger comparison is done on the stripped string forms and is
    case-sensitive ("Other" does not trigger a rule written for "other").

    Args:
        field_value: The field that is conditionally required
        condition_value: The field whose value triggers the requirement
        expected_value (str): The value that triggers the requirement

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    # No condition value, or a condition that does not match: nothing is required.
    if condition_value is None:
        return {"valid": True, "value": field_value}
    if str(condition_value).strip() != str(expected_value).strip():
        return {"valid": True, "value": field_value}

    # The requirement is triggered; decide whether the field is really there.
    if field_value is None:
        present = False
    elif isinstance(field_value, (list, tuple, set, dict)):
        # str([]) is "[]", so an empty container must be tested by length,
        # otherwise it would wrongly count as a provided value.
        present = len(field_value) > 0
    else:
        present = str(field_value).strip() != ""
    return {"valid": present, "value": field_value}


@staticmethod
def min_items_check(field_value, min_count):
    """
    Checks that a list field has at least `min_count` items

    Args:
        field_value: The field value (expected to be a list; a single
            non-empty scalar is counted as one item)
        min_count (int): Minimum number of items required

    Returns:
        (dict) An object with the validity of the check and the number of
        items that were counted
    """
    if field_value is None:
        count = 0
    elif isinstance(field_value, (list, tuple, set)):
        count = len(field_value)
    elif isinstance(field_value, str) and not field_value.strip():
        # A blank string carries no item at all.
        count = 0
    else:
        # Any other scalar (or a single mapping) is one item.
        count = 1
    return {
        "valid": count >= int(min_count),
        "value": count,
    }
@staticmethod
@if_arg
def forbidden_values_check(value, forbidden_list):
    """
    Checks that `value` does not contain any of the forbidden strings (case-insensitive)

    Matching is by substring, so a forbidden term embedded in a longer word
    also fails the check.
    NOTE(review): with terms such as "test" this flags words like "latest";
    confirm whether whole-word matching is wanted.

    Args:
        value (str): The field value to check
        forbidden_list (list): List of forbidden strings

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    # Coerce like min_length_check does, so a non-string value cannot crash the check.
    haystack = str(value).lower()
    # A bare string would otherwise be iterated character by character.
    if isinstance(forbidden_list, str):
        forbidden_list = [forbidden_list]
    for forbidden in forbidden_list:
        needle = str(forbidden).lower()
        # An empty term is a substring of everything; ignore it.
        if needle and needle in haystack:
            return {"valid": False, "value": value}
    return {"valid": True, "value": value}


@staticmethod
@if_arg
def min_length_check(value, min_length):
    """
    Checks that the length of `value` is at least `min_length`

    Surrounding whitespace is not counted, so padding cannot satisfy the
    minimum.

    Args:
        value (str): The field value
        min_length (int): Minimum required length

    Returns:
        (dict) An object with the validity of the check and the measured length
    """
    length = len(str(value).strip())
    return {
        "valid": length >= int(min_length),
        "value": length,
    }


@staticmethod
@if_arg
def regex_check(value, pattern):
    """
    Checks that `value` matches the given regex `pattern`

    `re.match` anchors only at the start of the string; a pattern that must
    cover the whole value needs its own trailing `$`.

    Args:
        value (str): The field value
        pattern (str): The regex pattern

    Returns:
        (dict) An object with the validity of the check and the instance
    """
    import re

    matched = re.match(pattern, str(value))
    return {
        "valid": bool(matched),
        "value": value,
    }
granule_data_format, collection_shortname=None, version=None, dataset_id=None diff --git a/pyQuARC/main.py b/pyQuARC/main.py index 361a4286..54bb69d4 100644 --- a/pyQuARC/main.py +++ b/pyQuARC/main.py @@ -19,7 +19,7 @@ from code.utils import get_concept_type, get_headers else: from .code.checker import Checker - from .code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS + from .code.constants import COLOR, ECHO10_C, SUPPORTED_FORMATS, ZENODO from .code.downloader import Downloader from .code.utils import get_cmr_url, is_valid_cmr_url from .code.utils import get_concept_type, get_headers @@ -389,7 +389,7 @@ def display_results(self): action="store", nargs="?", type=str, - help=f"The metadata format. Choices are: echo-c (echo10 collection), echo-g (echo10 granule), dif10 (dif10 collection), umm-c (umm-json collection), umm-g (umm-json granules)", + help=f"The metadata format. Choices are: echo-c (echo10 collection), echo-g (echo10 granule), dif10 (dif10 collection), umm-c (umm-json collection), umm-g (umm-json granules), zenodo (Zenodo/InvenioRDM record)", ) parser.add_argument( "--cmr_host", diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index ffe03742..6cb92460 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -1110,5 +1110,149 @@ "url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs" }, "remediation": "Recommend providing an OPeNDAP in the granule's Online Resources or Related URLs fields for enhanced data accessibility." + }, + "title_min_length_check": { + "failure": "The title is too short (current length: `{}`). A minimum of 10 characters is recommended.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" + }, + "remediation": "Provide a more descriptive title with at least 10 characters." + }, + "title_forbidden_values_check": { + "failure": "The title `{}` contains a generic or placeholder term (e.g. 
'test', 'sample', 'untitled', 'dataset').", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Entry+Title" + }, + "remediation": "Replace the title with a meaningful, descriptive name for the dataset." + }, + "title_description_not_same_check": { + "failure": "The title and description have identical content.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Abstract" + }, + "remediation": "Provide a distinct description that elaborates on the dataset beyond what the title conveys." + }, + "description_min_length_check": { + "failure": "The description is too short (current length: `{}`). A minimum of 100 characters is recommended.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Abstract" + }, + "remediation": "Provide a more comprehensive description of at least 100 characters." + }, + "long_description_recommended_check": { + "failure": "The description is less than 250 characters (current length: `{}`). A longer description improves discoverability.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Abstract" + }, + "remediation": "Consider expanding the description to at least 250 characters, similar to a journal abstract." + }, + "version_presence_check": { + "failure": "No version information is provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Version" + }, + "remediation": "Provide a version identifier for the dataset (e.g. '1', '1.0', '001')." + }, + "access_constraints_presence_check": { + "failure": "No access constraints information is provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Access+Constraints" + }, + "remediation": "Provide an access constraints value such as 'Public', 'Restricted', 'Internal', or 'Embargoed'." 
+ }, + "access_constraints_vocab_check": { + "failure": "The access constraints value `{}` is not from the controlled vocabulary.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Access+Constraints" + }, + "remediation": "Select an access constraints value from: ['Public', 'Restricted', 'Internal', 'Embargoed']." + }, + "use_constraints_conditional_check": { + "failure": "Use constraints are required when access is set to 'Restricted' but none were provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Use+Constraints" + }, + "remediation": "Provide a use constraints description explaining the restrictions on this dataset." + }, + "collection_platform_presence_check": { + "failure": "No platform information is provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Platform" + }, + "remediation": "Provide at least one platform short name. Use a valid GCMD platform keyword where possible." + }, + "platform_other_description_check": { + "failure": "Platform is set to 'Other' but no descriptive long name is provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Platform" + }, + "remediation": "When platform short name is 'Other', provide a descriptive long name to identify the platform." + }, + "collection_instrument_presence_check": { + "failure": "No instrument information is provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Instrument" + }, + "remediation": "Provide at least one instrument short name. Use a valid GCMD instrument keyword where possible." 
+ }, + "instrument_other_description_check": { + "failure": "Instrument is set to 'Other' but no descriptive long name is provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Instrument" + }, + "remediation": "When instrument short name is 'Other', provide a descriptive long name to identify the instrument." + }, + "spatial_reference_type_vocab_check": { + "failure": "The spatial reference type `{}` is not from the controlled vocabulary.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" + }, + "remediation": "Select a spatial reference type from: ['Geographic', 'Projected', 'UTM', 'Polar', 'Custom']." + }, + "spatial_resolution_presence_check": { + "failure": "No spatial resolution information is provided.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" + }, + "remediation": "Provide spatial resolution or bounding coordinate information to help users understand the spatial coverage." + }, + "spatial_resolution_units_check": { + "failure": "The spatial resolution value `{}` does not match the expected format (e.g. '30 m', '1.5 km', '0.25 deg').", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Spatial+Extent" + }, + "remediation": "Provide the spatial resolution as a number followed by a unit: 'm', 'km', or 'deg' (e.g. '30 m', '1 km')." + }, + "temporal_resolution_vocab_check": { + "failure": "The temporal resolution value `{}` is not from the controlled vocabulary.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Temporal+Extent" + }, + "remediation": "Select a temporal resolution from: ['Hourly', 'Daily', 'Weekly', 'Monthly', 'Yearly']." + }, + "science_keywords_min_count_check": { + "failure": "Only `{}` science keyword(s) provided. 
At least 3 are recommended.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Science+Keywords" + }, + "remediation": "Provide at least 3 science keywords from the GCMD keyword list to improve discoverability." } } \ No newline at end of file diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 4fa0df4c..0cba2346 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -318,5 +318,41 @@ "data_type": "custom", "check_function": "opendap_link_check", "available": true + }, + "forbidden_values_check": { + "data_type": "string", + "check_function": "forbidden_values_check", + "description": "Checks that a field value does not contain any forbidden strings.", + "available": true + }, + "min_length_check": { + "data_type": "string", + "check_function": "min_length_check", + "description": "Checks that a field value meets a minimum character length.", + "available": true + }, + "regex_check": { + "data_type": "string", + "check_function": "regex_check", + "description": "Checks that a field value matches a given regex pattern.", + "available": true + }, + "fields_not_equal_check": { + "data_type": "custom", + "check_function": "fields_not_equal_check", + "description": "Checks that two fields do not have the same value.", + "available": true + }, + "required_if_check": { + "data_type": "custom", + "check_function": "required_if_check", + "description": "Checks that a field is present when another field equals a specific value.", + "available": true + }, + "min_items_check": { + "data_type": "custom", + "check_function": "min_items_check", + "description": "Checks that a list field contains at least a minimum number of items.", + "available": true } } diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 64214155..26adfd3e 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -5906,5 +5906,575 @@ }, "severity": "info", 
"check_id": "url_update_email_check" + }, + "title_min_length_check": { + "rule_name": "Title Minimum Length Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/DataSetId"], + "data": [10] + } + ], + "dif10": [ + { + "fields": ["DIF/Entry_Title"], + "data": [10] + } + ], + "umm-c": [ + { + "fields": ["EntryTitle"], + "data": [10] + } + ], + "zenodo": [ + { + "fields": ["metadata/title"], + "data": [10] + } + ] + }, + "severity": "warning", + "check_id": "min_length_check" + }, + "title_forbidden_values_check": { + "rule_name": "Title Forbidden Values Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/DataSetId"], + "data": [["test", "sample", "untitled", "dataset"]] + } + ], + "dif10": [ + { + "fields": ["DIF/Entry_Title"], + "data": [["test", "sample", "untitled", "dataset"]] + } + ], + "umm-c": [ + { + "fields": ["EntryTitle"], + "data": [["test", "sample", "untitled", "dataset"]] + } + ], + "zenodo": [ + { + "fields": ["metadata/title"], + "data": [["test", "sample", "untitled", "dataset"]] + } + ] + }, + "severity": "warning", + "check_id": "forbidden_values_check" + }, + "title_description_not_same_check": { + "rule_name": "Title and Description Not Same Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/DataSetId", + "Collection/Description" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Entry_Title", + "DIF/Summary/Abstract" + ] + } + ], + "umm-c": [ + { + "fields": [ + "EntryTitle", + "Abstract" + ] + } + ], + "zenodo": [ + { + "fields": [ + "metadata/title", + "metadata/description" + ] + } + ] + }, + "severity": "warning", + "check_id": "fields_not_equal_check" + }, + "description_min_length_check": { + "rule_name": "Description Minimum Length Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/Description"], + "data": [100] + } + ], + "dif10": [ + { + "fields": ["DIF/Summary/Abstract"], + "data": [100] + } + ], + "umm-c": [ + { + "fields": ["Abstract"], + 
"data": [100] + } + ], + "zenodo": [ + { + "fields": ["metadata/description"], + "data": [100] + } + ] + }, + "severity": "warning", + "check_id": "min_length_check" + }, + "long_description_recommended_check": { + "rule_name": "Long Description Recommended Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/Description"], + "data": [250] + } + ], + "dif10": [ + { + "fields": ["DIF/Summary/Abstract"], + "data": [250] + } + ], + "umm-c": [ + { + "fields": ["Abstract"], + "data": [250] + } + ], + "zenodo": [ + { + "fields": ["metadata/description"], + "data": [250] + } + ] + }, + "severity": "info", + "check_id": "min_length_check" + }, + "version_presence_check": { + "rule_name": "Version Presence Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/VersionId"] + } + ], + "dif10": [ + { + "fields": ["DIF/Version"] + } + ], + "umm-c": [ + { + "fields": ["Version"] + } + ], + "zenodo": [ + { + "fields": ["metadata/version"] + } + ] + }, + "severity": "warning", + "check_id": "one_item_presence_check" + }, + "access_constraints_presence_check": { + "rule_name": "Access Constraints Presence Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/RestrictionFlag"] + } + ], + "dif10": [ + { + "fields": ["DIF/Access_Constraints"] + } + ], + "umm-c": [ + { + "fields": ["AccessConstraints/Value"] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:access_constraints"] + } + ] + }, + "severity": "error", + "check_id": "one_item_presence_check" + }, + "access_constraints_vocab_check": { + "rule_name": "Access Constraints Vocabulary Check", + "fields_to_apply": { + "dif10": [ + { + "fields": ["DIF/Access_Constraints"], + "data": [["Public", "Restricted", "Internal", "Embargoed"]] + } + ], + "umm-c": [ + { + "fields": ["AccessConstraints/Value"], + "data": [["Public", "Restricted", "Internal", "Embargoed"]] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:access_constraints"], + "data": 
[["Public", "Restricted", "Internal", "Embargoed"]] + } + ] + }, + "severity": "error", + "check_id": "controlled_keywords_check" + }, + "use_constraints_conditional_check": { + "rule_name": "Use Constraints Required When Restricted Check", + "fields_to_apply": { + "dif10": [ + { + "fields": [ + "DIF/Use_Constraints/Description", + "DIF/Access_Constraints", + "Restricted" + ] + } + ], + "umm-c": [ + { + "fields": [ + "UseConstraints/Description", + "AccessConstraints/Value", + "Restricted" + ] + } + ], + "zenodo": [ + { + "fields": [ + "metadata/custom/nasa:use_constraints", + "metadata/custom/nasa:access_constraints", + "Restricted" + ] + } + ] + }, + "severity": "error", + "check_id": "required_if_check" + }, + "collection_platform_presence_check": { + "rule_name": "Collection Platform Presence Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/Platforms/Platform/ShortName"] + } + ], + "dif10": [ + { + "fields": ["DIF/Platform/Short_Name"] + } + ], + "umm-c": [ + { + "fields": ["Platforms/ShortName"] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:platform/id"] + } + ] + }, + "severity": "error", + "check_id": "one_item_presence_check" + }, + "platform_other_description_check": { + "rule_name": "Platform Other Description Required Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Platforms/Platform/LongName", + "Collection/Platforms/Platform/ShortName", + "Other" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Platform/Long_Name", + "DIF/Platform/Short_Name", + "Other" + ] + } + ], + "umm-c": [ + { + "fields": [ + "Platforms/LongName", + "Platforms/ShortName", + "Other" + ] + } + ], + "zenodo": [ + { + "fields": [ + "metadata/custom/nasa:platform_other", + "metadata/custom/nasa:platform/id", + "other" + ] + } + ] + }, + "severity": "error", + "check_id": "required_if_check" + }, + "collection_instrument_presence_check": { + "rule_name": "Collection Instrument Presence Check", + "fields_to_apply": 
{ + "echo-c": [ + { + "fields": ["Collection/Platforms/Platform/Instruments/Instrument/ShortName"] + } + ], + "dif10": [ + { + "fields": ["DIF/Platform/Instrument/Short_Name"] + } + ], + "umm-c": [ + { + "fields": ["Platforms/Instruments/ShortName"] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:instrument/id"] + } + ] + }, + "severity": "error", + "check_id": "one_item_presence_check" + }, + "instrument_other_description_check": { + "rule_name": "Instrument Other Description Required Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Platforms/Platform/Instruments/Instrument/LongName", + "Collection/Platforms/Platform/Instruments/Instrument/ShortName", + "Other" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Platform/Instrument/Long_Name", + "DIF/Platform/Instrument/Short_Name", + "Other" + ] + } + ], + "umm-c": [ + { + "fields": [ + "Platforms/Instruments/LongName", + "Platforms/Instruments/ShortName", + "Other" + ] + } + ], + "zenodo": [ + { + "fields": [ + "metadata/custom/nasa:instrument_other", + "metadata/custom/nasa:instrument/id", + "other" + ] + } + ] + }, + "severity": "error", + "check_id": "required_if_check" + }, + "spatial_reference_type_vocab_check": { + "rule_name": "Spatial Reference Type Vocabulary Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/Spatial/SpatialCoverageType"], + "data": [["Geographic", "Projected", "UTM", "Polar", "Custom"]] + } + ], + "dif10": [ + { + "fields": ["DIF/Spatial_Coverage/Spatial_Coverage_Type"], + "data": [["Geographic", "Projected", "UTM", "Polar", "Custom"]] + } + ], + "umm-c": [ + { + "fields": ["SpatialExtent/SpatialCoverageType"], + "data": [["Geographic", "Projected", "UTM", "Polar", "Custom"]] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:spatial_reference_type/id"], + "data": [["geographic", "projected", "utm", "polar", "custom"]] + } + ] + }, + "severity": "warning", + "check_id": "controlled_keywords_check" + }, + 
"spatial_resolution_presence_check": { + "rule_name": "Spatial Resolution Presence Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Spatial/HorizontalSpatialDomain/Geometry/BoundingRectangle/WestBoundingCoordinate", + "Collection/Spatial/HorizontalSpatialDomain/Geometry/BoundingRectangle/EastBoundingCoordinate", + "Collection/Spatial/HorizontalSpatialDomain/Geometry/BoundingRectangle/NorthBoundingCoordinate", + "Collection/Spatial/HorizontalSpatialDomain/Geometry/BoundingRectangle/SouthBoundingCoordinate" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Spatial_Coverage/Geometry/Bounding_Rectangle/Westernmost_Longitude", + "DIF/Spatial_Coverage/Geometry/Bounding_Rectangle/Easternmost_Longitude", + "DIF/Spatial_Coverage/Geometry/Bounding_Rectangle/Northernmost_Latitude", + "DIF/Spatial_Coverage/Geometry/Bounding_Rectangle/Southernmost_Latitude" + ] + } + ], + "umm-c": [ + { + "fields": [ + "SpatialExtent/HorizontalSpatialDomain/ResolutionAndCoordinateSystem/HorizontalDataResolution/GenericResolutions/XDimension" + ] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:spatial_resolution"] + } + ] + }, + "severity": "warning", + "check_id": "one_item_presence_check" + }, + "spatial_resolution_units_check": { + "rule_name": "Spatial Resolution Units Format Check", + "fields_to_apply": { + "umm-c": [ + { + "fields": [ + "SpatialExtent/HorizontalSpatialDomain/ResolutionAndCoordinateSystem/HorizontalDataResolution/GenericResolutions/Unit" + ], + "data": ["^[0-9]+(\\.[0-9]+)?\\s?(m|km|deg)$"] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:spatial_resolution"], + "data": ["^[0-9]+(\\.[0-9]+)?\\s?(m|km|deg)$"] + } + ] + }, + "severity": "warning", + "check_id": "regex_check" + }, + "temporal_resolution_vocab_check": { + "rule_name": "Temporal Resolution Vocabulary Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/Temporal/TemporalRangeType"], + "data": [["Hourly", "Daily", "Weekly", "Monthly", 
"Yearly"]] + } + ], + "dif10": [ + { + "fields": ["DIF/Temporal_Coverage/Temporal_Resolution"], + "data": [["Hourly", "Daily", "Weekly", "Monthly", "Yearly"]] + } + ], + "umm-c": [ + { + "fields": ["TemporalExtents/TemporalResolution/Value"], + "data": [["Hourly", "Daily", "Weekly", "Monthly", "Yearly"]] + } + ], + "zenodo": [ + { + "fields": ["metadata/custom/nasa:temporal_resolution/id"], + "data": [["hourly", "daily", "weekly", "monthly", "yearly"]] + } + ] + }, + "severity": "warning", + "check_id": "controlled_keywords_check" + }, + "science_keywords_min_count_check": { + "rule_name": "Science Keywords Minimum Count Check", + "fields_to_apply": { + "echo-c": [ + { + "fields": ["Collection/ScienceKeywords/ScienceKeyword"], + "data": [3] + } + ], + "dif10": [ + { + "fields": ["DIF/Science_Keywords"], + "data": [3] + } + ], + "umm-c": [ + { + "fields": ["ScienceKeywords"], + "data": [3] + } + ], + "zenodo": [ + { + "fields": ["metadata/keywords"], + "data": [3] + } + ] + }, + "severity": "warning", + "check_id": "min_items_check" + } } -} \ No newline at end of file + diff --git a/rr-extension.md b/rr-extension.md new file mode 100644 index 00000000..79f2bb58 --- /dev/null +++ b/rr-extension.md @@ -0,0 +1,177 @@ +# pyQuARC Extension: New Rules, Zenodo Format Support + +This document records all changes made to pyQuARC to add new metadata quality rules and support for the Zenodo/InvenioRDM metadata format. + +--- + +## 1. 
New Supported Format: `zenodo` + +### `pyQuARC/code/constants.py` +- Added `ZENODO = "zenodo"` +- Added `ZENODO` to `SUPPORTED_FORMATS` + +### `pyQuARC/code/checker.py` +- Imported `ZENODO` from constants +- Updated the `run()` method parser selection so Zenodo is treated as JSON (same as `umm-*`), not XML + +### `pyQuARC/code/schema_validator.py` +- Imported `ZENODO` from constants +- Added a branch in `__init__` to skip structural schema validation for Zenodo (no XSD/JSON schema registered): `self.validator_func = lambda _: {}` + +### `pyQuARC/main.py` +- Imported `ZENODO` from constants +- Added `zenodo` to the `--format` argument help text + +--- + +## 2. New Check Functions + +### `pyQuARC/code/string_validator.py` — class `StringValidator` + +| Function | Description | +|---|---| +| `forbidden_values_check(value, forbidden_list)` | Checks the field value does not contain any string from `forbidden_list` (case-insensitive) | +| `min_length_check(value, min_length)` | Checks the field value is at least `min_length` characters long | +| `regex_check(value, pattern)` | Checks the field value matches a given regex pattern | + +### `pyQuARC/code/custom_validator.py` — class `CustomValidator` + +| Function | Description | +|---|---| +| `fields_not_equal_check(field_a, field_b)` | Checks two fields do not have the same value (case-insensitive) | +| `required_if_check(field_value, condition_value, expected_value)` | Checks `field_value` is present when `condition_value` equals `expected_value` | +| `min_items_check(field_value, min_count)` | Checks a list field contains at least `min_count` items | + +--- + +## 3. 
New Check Definitions + +### `pyQuARC/schemas/checks.json` + +| Check ID | `data_type` | `check_function` | Description | +|---|---|---|---| +| `forbidden_values_check` | `string` | `forbidden_values_check` | Field must not contain placeholder/forbidden words | +| `min_length_check` | `string` | `min_length_check` | Field must meet a minimum character length | +| `regex_check` | `string` | `regex_check` | Field must match a regex pattern | +| `fields_not_equal_check` | `custom` | `fields_not_equal_check` | Two fields must not be identical | +| `required_if_check` | `custom` | `required_if_check` | Field is conditionally required based on another field's value | +| `min_items_check` | `custom` | `min_items_check` | List field must have a minimum number of items | + +--- + +## 4. New Rules + +All rules are defined in `pyQuARC/schemas/rule_mapping.json` and messages in `pyQuARC/schemas/check_messages.json`. + +### Core Metadata + +| Rule ID | Check ID | Severity | Formats | Description | +|---|---|---|---|---| +| `title_min_length_check` | `min_length_check` | warning | echo-c, dif10, umm-c, zenodo | Title must be at least 10 characters | +| `title_forbidden_values_check` | `forbidden_values_check` | warning | echo-c, dif10, umm-c, zenodo | Title must not contain: `test`, `sample`, `untitled`, `dataset` | +| `title_description_not_same_check` | `fields_not_equal_check` | warning | echo-c, dif10, umm-c, zenodo | Title and description must not be identical | +| `description_min_length_check` | `min_length_check` | warning | echo-c, dif10, umm-c, zenodo | Description must be at least 100 characters | +| `long_description_recommended_check` | `min_length_check` | info | echo-c, dif10, umm-c, zenodo | Description of at least 250 characters recommended | +| `version_presence_check` | `one_item_presence_check` | warning | echo-c, dif10, umm-c, zenodo | Version field must be present | + +### Access & Use Constraints + +| Rule ID | Check ID | Severity | Formats | Description | 
+|---|---|---|---|---| +| `access_constraints_presence_check` | `one_item_presence_check` | error | echo-c, dif10, umm-c, zenodo | Access constraints field must be present | +| `access_constraints_vocab_check` | `controlled_keywords_check` | error | dif10, umm-c, zenodo | Access constraints must be one of: `Public`, `Restricted`, `Internal`, `Embargoed` | +| `use_constraints_conditional_check` | `required_if_check` | error | dif10, umm-c, zenodo | Use constraints required when access constraints = `Restricted` | + +### Platform & Instrument + +| Rule ID | Check ID | Severity | Formats | Description | +|---|---|---|---|---| +| `collection_platform_presence_check` | `one_item_presence_check` | error | echo-c, dif10, umm-c, zenodo | At least one platform must be present | +| `platform_other_description_check` | `required_if_check` | error | echo-c, dif10, umm-c, zenodo | Platform long name required when short name = `Other` | +| `collection_instrument_presence_check` | `one_item_presence_check` | error | echo-c, dif10, umm-c, zenodo | At least one instrument must be present | +| `instrument_other_description_check` | `required_if_check` | error | echo-c, dif10, umm-c, zenodo | Instrument long name required when short name = `Other` | + +### Spatial + +| Rule ID | Check ID | Severity | Formats | Description | +|---|---|---|---|---| +| `spatial_reference_type_vocab_check` | `controlled_keywords_check` | warning | echo-c, dif10, umm-c, zenodo | Spatial reference type must be one of: `Geographic`, `Projected`, `UTM`, `Polar`, `Custom` | +| `spatial_resolution_presence_check` | `one_item_presence_check` | warning | echo-c, dif10, umm-c, zenodo | Spatial resolution or bounding coordinates must be present | +| `spatial_resolution_units_check` | `regex_check` | warning | umm-c, zenodo | Spatial resolution must match pattern `^[0-9]+(\.[0-9]+)?\s?(m\|km\|deg)$` (e.g. 
`30 m`, `1 km`, `0.25 deg`) | + +### Temporal + +| Rule ID | Check ID | Severity | Formats | Description | +|---|---|---|---|---| +| `temporal_resolution_vocab_check` | `controlled_keywords_check` | warning | echo-c, dif10, umm-c, zenodo | Temporal resolution must be one of: `Hourly`, `Daily`, `Weekly`, `Monthly`, `Yearly` | + +### Keywords + +| Rule ID | Check ID | Severity | Formats | Description | +|---|---|---|---|---| +| `science_keywords_min_count_check` | `min_items_check` | warning | echo-c, dif10, umm-c, zenodo | At least 3 keywords must be provided | + +--- + +## 5. Zenodo Field Path Mapping + +The Zenodo record structure differs from NASA CMR formats. Custom NASA fields live under `metadata.custom` (not `custom_fields`). The table below shows how each rule maps to Zenodo field paths, written with `/` separators (e.g. `metadata/custom/nasa:platform/id`). Where a `required_if_check` rule lists two paths, the first is the conditionally required field and the second is the field whose value triggers the requirement. + +| Rule | Zenodo Field Path | +|---|---| +| `title_min_length_check` | `metadata/title` | +| `title_forbidden_values_check` | `metadata/title` | +| `title_description_not_same_check` | `metadata/title`, `metadata/description` | +| `description_min_length_check` | `metadata/description` | +| `long_description_recommended_check` | `metadata/description` | +| `version_presence_check` | `metadata/version` | +| `access_constraints_presence_check` | `metadata/custom/nasa:access_constraints` | +| `access_constraints_vocab_check` | `metadata/custom/nasa:access_constraints` | +| `use_constraints_conditional_check` | `metadata/custom/nasa:use_constraints`, `metadata/custom/nasa:access_constraints` | +| `collection_platform_presence_check` | `metadata/custom/nasa:platform/id` | +| `platform_other_description_check` | `metadata/custom/nasa:platform_other`, `metadata/custom/nasa:platform/id` | +| `collection_instrument_presence_check` | `metadata/custom/nasa:instrument/id` | +| `instrument_other_description_check` | `metadata/custom/nasa:instrument_other`, `metadata/custom/nasa:instrument/id` | +| `spatial_reference_type_vocab_check` | `metadata/custom/nasa:spatial_reference_type/id` | 
+| `spatial_resolution_presence_check` | `metadata/custom/nasa:spatial_resolution` | +| `spatial_resolution_units_check` | `metadata/custom/nasa:spatial_resolution` | +| `temporal_resolution_vocab_check` | `metadata/custom/nasa:temporal_resolution/id` | +| `science_keywords_min_count_check` | `metadata/keywords` | + +> **Note:** Zenodo vocabulary values use lowercase IDs (e.g. `"geographic"`, `"daily"`). NASA CMR formats use title-case display strings (e.g. `"Geographic"`, `"Daily"`). + +--- + +## 6. How to Extend + +### Add a value to a controlled vocabulary +Edit the `data` array in the relevant rule in `pyQuARC/schemas/rule_mapping.json`. No code changes needed. For example, to accept `celestial` as an additional Zenodo spatial reference type, append it to the existing list: + +```json +"zenodo": [ + { + "fields": ["metadata/custom/nasa:spatial_reference_type/id"], + "data": [["geographic", "projected", "utm", "polar", "custom", "celestial"]] + } +] +``` + +### Add a new rule using an existing check +1. Add an entry to `pyQuARC/schemas/rule_mapping.json` referencing an existing `check_id` +2. Add a message entry to `pyQuARC/schemas/check_messages.json` + +### Add a new check function +1. Add a `@staticmethod` to the appropriate validator class in `pyQuARC/code/`: + - String checks → `string_validator.py` (`StringValidator`) + - Custom/multi-field checks → `custom_validator.py` (`CustomValidator`) + - Date/time checks → `datetime_validator.py` (`DatetimeValidator`) + - URL checks → `url_validator.py` (`UrlValidator`) +2. Register it in `pyQuARC/schemas/checks.json` +3. Add a rule in `pyQuARC/schemas/rule_mapping.json` +4. Add a message in `pyQuARC/schemas/check_messages.json` + +### Add a new metadata format +1. Add a constant in `pyQuARC/code/constants.py` and include it in `SUPPORTED_FORMATS` +2. Update the parser branch in `pyQuARC/code/checker.py` (`run()` method) +3. Update `pyQuARC/code/schema_validator.py` to handle or skip schema validation +4. 
Add format-specific field paths to each relevant rule in `pyQuARC/schemas/rule_mapping.json` diff --git a/tests/fixtures/test_zenodo_record.json b/tests/fixtures/test_zenodo_record.json new file mode 100644 index 00000000..6be4a0d0 --- /dev/null +++ b/tests/fixtures/test_zenodo_record.json @@ -0,0 +1,284 @@ +{ + "created": "2026-03-27T20:45:13.871821+00:00", + "modified": "2026-04-27T22:00:27.781482+00:00", + "id": 50, + "conceptrecid": "49", + "doi": "10.83615/DASP.50", + "conceptdoi": "10.83615/DASP.49", + "doi_url": "https://handle.test.datacite.org/10.83615/DASP.50", + "metadata": { + "title": "Test 3272026", + "doi": "10.83615/DASP.50", + "publication_date": "2026-03-27", + "description": "

Test NASA

", + "access_right": "open", + "creators": [ + { + "name": "Test, Person", + "affiliation": "Glenn Research Center; Johnson Space Center; Cornell University; Massachusetts Institute of Technology" + } + ], + "contributors": [ + { + "name": "Jane, Doe", + "affiliation": "Goddard Space Flight Center; Kennedy Space Center; Brown University", + "type": "ContactPerson" + } + ], + "keywords": [ + "Test", + "Test 2" + ], + "related_identifiers": [ + { + "identifier": "Test", + "relation": "isRequiredBy", + "resource_type": "dataset", + "scheme": "other" + } + ], + "dates": [ + { + "type": "collected", + "description": "Test" + } + ], + "version": "2.0", + "references": [ + "Test" + ], + "language": "eng", + "custom": { + "code:codeRepository": "https://example.com", + "code:programmingLanguage": [ + { + "id": "python", + "title": { + "en": "Python" + } + } + ], + "code:developmentStatus": { + "id": "concept", + "title": { + "cs": "Koncept", + "en": "Concept", + "es": "Concepto", + "sv": "Koncept" + } + }, + "nasa:instrument": [ + { + "id": "spectrometer", + "title": { + "en": "Spectrometer" + } + } + ], + "nasa:instrument_other": [ + "Test" + ], + "nasa:mission": [ + { + "id": "mission_1", + "title": { + "en": "Mission 1" + } + } + ], + "nasa:mission_other": [ + "Test" + ], + "nasa:platform": { + "id": "water_based", + "title": { + "en": "Water-based" + } + }, + "nasa:processing_level": { + "id": "reanalysis", + "title": { + "en": "Reanalysis" + } + }, + "nasa:science_topic": [ + { + "id": "science_topic_1", + "title": { + "en": "Science Topic 1" + } + } + ], + "nasa:temporal_start": "2025-01-01", + "nasa:temporal_end": "2025-02-01", + "nasa:temporal_resolution": { + "id": "hourly", + "title": { + "en": "Hourly" + } + }, + "nasa:spatial_resolution": "10m", + "nasa:spatial_extents": "1,1,1,1", + "nasa:spatial_reference_type": { + "id": "celestial", + "title": { + "en": "Celestial" + } + }, + "nasa:access_url": "https://example.com", + "nasa:access_constraints": "Test", + 
"nasa:documentation_url": "https://example.com", + "nasa:use_constraints": "Test", + "nasa:quality_flags": true, + "nasa:update_frequency": { + "id": "monthly", + "title": { + "en": "Monthly" + } + }, + "nasa:experiment_description_html": "

Test

", + "nasa:experiment_url": "https://example.com" + }, + "resource_type": { + "title": "Dataset", + "type": "dataset" + }, + "journal": { + "issue": "Test", + "pages": "15-23", + "title": "Test", + "volume": "Test" + }, + "meeting": { + "title": "Test", + "acronym": "Test", + "dates": "2025-01-01", + "place": "Test", + "url": "https://example.com", + "session": "VI", + "session_part": "1" + }, + "imprint": { + "place": "Test", + "isbn": "0-06-251587-X" + }, + "thesis": { + "university": "Test" + }, + "alternate_identifiers": [ + { + "identifier": "Test" + } + ], + "license": { + "id": "cc-by-4.0" + }, + "grants": [ + { + "code": "101129744", + "internal_id": "10.13039/501100000780::101129744", + "funder": { + "name": "European Commission", + "doi": "10.13039/501100000780", + "acronym": "EC" + }, + "title": "European Virtual Institute for Research Software Excellence", + "acronym": "EVERSE", + "program": "HORIZON.1.3", + "url": "https://cordis.europa.eu/project/id/101129744" + } + ], + "communities": [ + { + "id": "planetary-science" + } + ], + "relations": { + "version": [ + { + "index": 0, + "is_last": false, + "parent": { + "pid_type": "recid", + "pid_value": "49" + } + } + ] + } + }, + "title": "Test 3272026", + "links": { + "self": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50", + "self_html": "https://d2bkyuwgbw2w6x.cloudfront.net/records/50", + "preview_html": "https://d2bkyuwgbw2w6x.cloudfront.net/records/50?preview=1", + "doi": "https://handle.test.datacite.org/10.83615/DASP.50", + "self_doi": "https://handle.test.datacite.org/10.83615/DASP.50", + "self_doi_html": "https://d2bkyuwgbw2w6x.cloudfront.net/doi/10.83615/DASP.50", + "reserve_doi": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/draft/pids/doi", + "parent": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/49", + "parent_html": "https://d2bkyuwgbw2w6x.cloudfront.net/records/49", + "parent_doi": "https://handle.test.datacite.org/10.83615/DASP.49", + "parent_doi_html": 
"https://d2bkyuwgbw2w6x.cloudfront.net/doi/10.83615/DASP.49", + "self_iiif_manifest": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50/manifest", + "self_iiif_sequence": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50/sequence/default", + "files": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/files", + "media_files": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/media-files", + "thumbnails": { + "10": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50:This%20is%20a%20dummy%20PDF.pdf/full/%5E10,/0/default.jpg", + "50": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50:This%20is%20a%20dummy%20PDF.pdf/full/%5E50,/0/default.jpg", + "100": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50:This%20is%20a%20dummy%20PDF.pdf/full/%5E100,/0/default.jpg", + "250": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50:This%20is%20a%20dummy%20PDF.pdf/full/%5E250,/0/default.jpg", + "750": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50:This%20is%20a%20dummy%20PDF.pdf/full/%5E750,/0/default.jpg", + "1200": "https://d2bkyuwgbw2w6x.cloudfront.net/api/iiif/record:50:This%20is%20a%20dummy%20PDF.pdf/full/%5E1200,/0/default.jpg" + }, + "archive": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/files-archive", + "archive_media": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/media-files-archive", + "latest": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/versions/latest", + "latest_html": "https://d2bkyuwgbw2w6x.cloudfront.net/records/50/latest", + "versions": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/versions", + "draft": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/draft", + "access_links": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/access/links", + "access_grants": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/access/grants", + "access_users": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/access/users", + "access_request": 
"https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/access/request", + "access": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/access", + "communities": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/communities", + "communities-suggestions": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/communities-suggestions", + "request_deletion": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/request-deletion", + "file_modification": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/file-modification", + "requests": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/requests" + }, + "updated": "2026-04-27T22:00:27.781482+00:00", + "recid": "50", + "revision": 6, + "files": [ + { + "id": "d055e2c5-ba50-4f87-8050-068ede1dab71", + "key": "This is a dummy PDF.pdf", + "size": 18107, + "checksum": "md5:47ea2836f55e2fa813f62879e472ce1f", + "links": { + "self": "https://d2bkyuwgbw2w6x.cloudfront.net/api/records/50/files/This is a dummy PDF.pdf/content" + } + } + ], + "swh": {}, + "owners": [ + { + "id": "1" + } + ], + "status": "published", + "stats": { + "downloads": 7, + "unique_downloads": 6, + "views": 6, + "unique_views": 5, + "version_downloads": 7, + "version_unique_downloads": 6, + "version_unique_views": 5, + "version_views": 6 + }, + "state": "done", + "submitted": true +} \ No newline at end of file