diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 59ef710..8414ab5 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -1,12 +1,11 @@ import getpass +import json import os import pathlib import pickle import shutil import sqlite3 -import tempfile import time -import urllib.parse from concurrent.futures import ThreadPoolExecutor from contextlib import closing from datetime import datetime @@ -16,7 +15,11 @@ from typing import Callable, Dict, List, TYPE_CHECKING, Union from urllib.parse import quote, unquote, urlparse from uuid import uuid4 -from zipfile import ZipFile +import s3fs + +from hsclient.metadata_adapter.adapter import MetadataAdapter +from hsclient.schema.utils import load_json + if TYPE_CHECKING: import fiona @@ -43,20 +46,28 @@ import requests -from hsmodels.schemas import load_rdf, rdf_string from hsmodels.schemas.base_models import BaseMetadata from hsmodels.schemas.enums import AggregationType from hsmodels.schemas.fields import BoxCoverage, PointCoverage +from pydantic import TypeAdapter from requests_oauthlib import OAuth2Session from hsclient.json_models import ResourcePreview, User from hsclient.oauth2_model import Token -from hsclient.utils import attribute_filter, encode_resource_url, is_aggregation, main_file_type +from hsclient.schema.base import MediaType +from hsclient.utils import attribute_filter, encode_resource_url, main_file_type import pkg_resources # part of setuptools -VERSION = pkg_resources.get_distribution(__package__).version + +# Try to get version from pkg_resources, fallback to __init__.__version__ +try: + VERSION = pkg_resources.get_distribution(__package__).version +except Exception: + from hsclient import __version__ + VERSION = __version__ CHECK_TASK_PING_INTERVAL = 10 +MEDIA_ITEMS_ADAPTER = TypeAdapter(List[MediaType]) class File(str): @@ -64,15 +75,16 @@ class File(str): A File path string representing the path to a file within a resource. :param value: the string path value :param file_url: the fully qualified url to the file on hydroshare.org - :param checksum: the md5 checksum of the file + :param checksum: the checksum value provided for the file metadata """ - def __new__(cls, value, file_url, checksum): + def __new__(cls, value, file_url, checksum, size): return super(File, cls).__new__(cls, value) - def __init__(self, value, file_url, checksum): + def __init__(self, value, file_url, checksum, size): self._file_url = file_url self._checksum = checksum + self._size = size @property def path(self) -> str: @@ -104,6 +116,10 @@ def url(self): """The url to the file on HydroShare""" return self._file_url + @property + def size(self): + """The size of the file in bytes""" + return self._size def refresh(f): """ @@ -133,10 +149,18 @@ def wrapper(*args, **kwargs): class Aggregation: """Represents an Aggregation in HydroShare""" - def __init__(self, map_path, hs_session, checksums=None): - self._map_path = map_path + _ADDITIONAL_TYPE_TO_AGGREGATION_TYPE = { + "MultiDimensional": AggregationType.MultidimensionalAggregation, + "GeographicRaster": AggregationType.GeographicRasterAggregation, + "GeographicFeature": AggregationType.GeographicFeatureAggregation, + "Tabular": AggregationType.CSVFileAggregation, + } + + def __init__(self, map_path, hs_session, s3_client, checksums=None): + self._map_path = map_path # TODO: should change it to jsonld_path self._hs_session = hs_session - self._retrieved_map = None + self._s3_client = s3_client + self._retrieved_map = None # TODO: probably not needed anymore self._retrieved_metadata = None self._parsed_files = None self._parsed_aggregations = None @@ -146,6 +170,7 @@ def __init__(self, map_path, hs_session, checksums=None): def __str__(self): return self._map_path + # TODO: This is not needed anymore - delete it @property def _map(self): if not self._retrieved_map: @@ -155,7 +180,7 @@ def _map(self): @property def _metadata(self): if not self._retrieved_metadata: - self._retrieved_metadata = self._retrieve_and_parse(self.metadata_path) + self._retrieved_metadata = self._retrieve_and_parse(self.jsonld_metadata_path) return self._retrieved_metadata @property @@ -164,61 +189,123 @@ def _checksums(self): self._parsed_checksums = self._retrieve_checksums(self._checksums_path) return self._parsed_checksums + def _file_path_from_content_url(self, content_url: str, name: str = None) -> str: + parsed_path = urlparse(str(content_url)).path + if "/data/contents/" in parsed_path: + return unquote(parsed_path.split("/data/contents/", 1)[1].strip("/")) + if name: + return unquote(str(name).strip("/")) + raise ValueError(f"Could not determine file path from associatedMedia contentUrl: {content_url}") + + def _associated_media_items(self): + associated_media = getattr(self.metadata, "associatedMedia", None) + if associated_media is None: + return [] + if isinstance(associated_media, list): + return associated_media + return [associated_media] + + @property + def _aggregation_type(self): + metadata_type = getattr(self.metadata, "type", None) + if isinstance(metadata_type, AggregationType): + return metadata_type + + additional_type = getattr(self.metadata, "additionalType", None) + if additional_type is None: + return None + + return self._ADDITIONAL_TYPE_TO_AGGREGATION_TYPE.get(str(additional_type)) + @property def _files(self): if not self._parsed_files: self._parsed_files = [] - for file in self._map.describes.files: - if not is_aggregation(str(file)): - if not file.path == self.metadata_path: - if not str(file.path).endswith('/'): # checking for folders, shouldn't have to do this - file_checksum_path = file.path.split(self._resource_path, 1)[1].strip("/") - file_path = unquote( - file_checksum_path.split( - "data/contents/", - )[1] - ) - f = File(file_path, unquote(file.path), self._checksums[file_checksum_path]) - self._parsed_files.append(f) + for media in self._associated_media_items(): + content_url = getattr(media, "contentUrl", None) + if not content_url: + continue + + file_path = self._file_path_from_content_url(content_url, getattr(media, "name", None)) + checksum = getattr(media, "sha256", None) + size = getattr(media, "contentSize", None) + self._parsed_files.append(File(file_path, str(content_url), checksum, size)) return self._parsed_files @property def _aggregations(self): + if not self._parsed_aggregations: + self._parsed_aggregations = [] + file_based_aggregations = self._aggregations_file_based + if file_based_aggregations: + self._parsed_aggregations.extend(file_based_aggregations) + self._prefetch_aggregation_metadata(self._parsed_aggregations) + # TODO: Need to uncomment this after content type metadata mapping is implemented + # self._convert_typed_aggregations() + return self._parsed_aggregations + + def _prefetch_aggregation_metadata(self, aggregations): def populate_metadata(_aggr): - _aggr._metadata + try: + return _aggr._metadata + except Exception: + return None + + if not aggregations: + return + with ThreadPoolExecutor() as executor: + list(executor.map(populate_metadata, aggregations)) + + def _convert_typed_aggregations(self) -> None: if not self._parsed_aggregations: - self._parsed_aggregations = [] - for file in self._map.describes.files: - if is_aggregation(str(file)): - self._parsed_aggregations.append(Aggregation(unquote(file.path), self._hs_session, self._checksums)) - - # load metadata for all aggregations (metadata is needed to create any typed aggregation) - with ThreadPoolExecutor() as executor: - executor.map(populate_metadata, self._parsed_aggregations) - - # convert aggregations to aggregation type supporting data object - aggregations_copy = self._parsed_aggregations[:] - typed_aggregation_classes = {AggregationType.MultidimensionalAggregation: NetCDFAggregation, - AggregationType.TimeSeriesAggregation: TimeseriesAggregation, - AggregationType.GeographicRasterAggregation: GeoRasterAggregation, - AggregationType.GeographicFeatureAggregation: GeoFeatureAggregation, - AggregationType.CSVFileAggregation: CSVAggregation - } - for aggr in aggregations_copy: - typed_aggr_cls = typed_aggregation_classes.get(aggr.metadata.type, None) - if typed_aggr_cls: - typed_aggr = typed_aggr_cls.create(base_aggr=aggr) - # swapping the generic aggregation with the typed aggregation in the aggregation list - self._parsed_aggregations.remove(aggr) - self._parsed_aggregations.append(typed_aggr) + return + + typed_aggregation_classes = { + AggregationType.MultidimensionalAggregation: NetCDFAggregation, + AggregationType.TimeSeriesAggregation: TimeseriesAggregation, + AggregationType.GeographicRasterAggregation: GeoRasterAggregation, + AggregationType.GeographicFeatureAggregation: GeoFeatureAggregation, + AggregationType.CSVFileAggregation: CSVAggregation, + } + converted_aggregations = [] + for aggr in self._parsed_aggregations: + try: + typed_aggr_cls = typed_aggregation_classes.get(aggr._aggregation_type) + except Exception: + converted_aggregations.append(aggr) + continue + if typed_aggr_cls: + converted_aggregations.append(typed_aggr_cls.create(base_aggr=aggr)) + else: + converted_aggregations.append(aggr) + self._parsed_aggregations = converted_aggregations - return self._parsed_aggregations + @property + def _aggregations_file_based(self): + jsonld_prefix = f"{self.bucket_path}/.hsjsonld/" + existing_paths = {aggr.jsonld_metadata_path for aggr in self._parsed_aggregations} + file_based_aggregations = [] + resource_jsonld_file_path = self.jsonld_metadata_path + for file_path in self._s3_client.find(jsonld_prefix): + if not file_path.endswith(".json"): + continue + if file_path == resource_jsonld_file_path: + continue + if file_path in existing_paths: + continue + + file_based_aggregations.append( + Aggregation(file_path, self._hs_session, self._s3_client) + ) + existing_paths.add(file_path) + + return file_based_aggregations @property def _checksums_path(self): - path = self.metadata_path.split("/data/", 1)[0] + path = self._resource_path path = urljoin(path, "manifest-md5.txt") return path @@ -230,12 +317,16 @@ def _hsapi_path(self): @property def _resource_path(self): - resource_path = self.metadata_path[: len("/resource/b4ce17c17c654a5c8004af73f2df87ab/")].strip("/") + resource_path = f"/resource/{self.resource_id}" return resource_path - def _retrieve_and_parse(self, path): - file_str = self._hs_session.retrieve_string(path) - instance = load_rdf(file_str) + def _retrieve_and_parse(self, path, as_pydantic: bool = True): + file_str = self._s3_client.cat(path).decode("utf-8") + json_data = json.loads(file_str) + if not as_pydantic: + return json_data + # Gets the appropriate schema model for the file-based json metadata + instance = load_json(json_data, path) return instance def _retrieve_checksums(self, path): @@ -264,6 +355,7 @@ def _download(self, save_path: str = "", unzip_to: str = None) -> str: return unzip_to return downloaded_zip + # TODO: Remove this property @property def metadata_file(self): """The path to the metadata file""" @@ -276,21 +368,45 @@ def metadata(self) -> BaseMetadata: @property def metadata_path(self) -> str: - """The path to the metadata file""" - return urlparse(str(self._map.describes.is_documented_by)).path + """Compatibility alias for code paths still expecting metadata_path.""" + return self.jsonld_metadata_path + + @property + def jsonld_metadata_path(self) -> str: + """The path to the JSON-LD metadata file""" + return self._map_path + + @property + def user_metadata_path(self) -> str: + """The path to the user entered metadata file for the aggregation""" + # Full file path to the user metadata file starting with {bucket_name}/{resource_id}/.hsmetadata/ + base_meta_path = f"{self.bucket_path}/.hsmetadata" + if self._aggregation_type == AggregationType.FileSetAggregation: + return f"{base_meta_path}/{self.main_file_path}/user_metadata.json" + return f"{base_meta_path}/{self.main_file_path}.user_metadata.json" + + @property + def extracted_metadata_path(self) -> Union[str, None]: + """The path to the extracted metadata file for the aggregation""" + base_meta_path = f"{self.bucket_path}/.hsmetadata" + if self._aggregation_type in [AggregationType.FileSetAggregation, AggregationType.SingleFileAggregation]: + return None + return f"{base_meta_path}/{self.main_file_path}.json" @property def main_file_path(self) -> str: """The path to the main file in the aggregation""" + # This path is relative to the data/contents/ directory in the resource if self._main_file_path is not None: return self._main_file_path - mft = main_file_type(self.metadata.type) + aggregation_type = self._aggregation_type + mft = main_file_type(aggregation_type) if mft: for file in self.files(): if str(file).endswith(mft): self._main_file_path = file.path return self._main_file_path - if self.metadata.type == AggregationType.FileSetAggregation: + if aggregation_type == AggregationType.FileSetAggregation: self._main_file_path = self.files()[0].folder return self._main_file_path self._main_file_path = self.files()[0].path @@ -299,13 +415,13 @@ def main_file_path(self) -> str: @refresh def save(self) -> None: """ - Saves the metadata back to HydroShare + Saves the metadata back to HydroShare as user_metadata.json in the .hsmetadata folder :return: None """ - metadata_file = self.metadata_file - metadata_string = rdf_string(self._retrieved_metadata, rdf_format="xml") - url = urljoin(self._hsapi_path, "ingest_metadata") - self._hs_session.upload_file(url, files={'file': (metadata_file, metadata_string)}) + metadata_json = self.metadata.model_dump_json() + # TODO: This 'metadata_json' needs to be updated to use the new schema.org metadata + # for the aggregation before writing to s3 + self._s3_client.write_text(self.user_metadata_path, metadata_json) def files(self, search_aggregations: bool = False, **kwargs) -> List[File]: """ @@ -351,6 +467,8 @@ def aggregations(self, **kwargs) -> List[BaseMetadata]: elif key.startswith('files__'): file_args = {key[len('files__'):]: value} aggregations = [agg for agg in aggregations if agg.files(**file_args)] + elif key == 'type': + aggregations = [agg for agg in aggregations if agg._aggregation_type == value] else: aggregations = filter(lambda agg: attribute_filter(agg.metadata, key, value), aggregations) return list(aggregations) @@ -380,14 +498,20 @@ def refresh(self) -> None: self._parsed_aggregations = None self._parsed_checksums = None self._main_file_path = None + time.sleep(1) # give some time to s3 eventing to regerrate the metadata files + #TODO: This delete method needs to be removed - the Resource class aggregation_delete() + # method implements aggregation delete using s3 protocol def delete(self) -> None: """Deletes this aggregation from HydroShare""" + aggregation_type = self._aggregation_type + if aggregation_type is None: + raise Exception("Aggregation type could not be determined") path = urljoin( self._hsapi_path, "functions", "delete-file-type", - self.metadata.type.value + "LogicalFile", + aggregation_type.value + "LogicalFile", self.main_file_path, ) self._hs_session.delete(path, status_code=200) @@ -400,7 +524,7 @@ class DataObjectSupportingAggregation(Aggregation): @staticmethod def create(aggr_cls, base_aggr): """Creates a type specific aggregation object from an instance of Aggregation""" - aggr = aggr_cls(base_aggr._map_path, base_aggr._hs_session, base_aggr._parsed_checksums) + aggr = aggr_cls(base_aggr._map_path, base_aggr._hs_session, base_aggr._s3_client, base_aggr._parsed_checksums) aggr._retrieved_map = base_aggr._retrieved_map aggr._retrieved_metadata = base_aggr._retrieved_metadata aggr._parsed_files = base_aggr._parsed_files @@ -436,12 +560,12 @@ def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) def _get_data_object(self, agg_path: str, func: Callable, **func_kwargs) -> \ Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset']: - if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: + if self._data_object is not None and self._aggregation_type != AggregationType.TimeSeriesAggregation: return self._data_object file_path = self._validate_aggregation_path(agg_path) data_object = func(file_path, **func_kwargs) - if self.metadata.type == AggregationType.MultidimensionalAggregation: + if self._aggregation_type == AggregationType.MultidimensionalAggregation: data_object.load() data_object.close() @@ -450,7 +574,7 @@ def _get_data_object(self, agg_path: str, func: Callable, **func_kwargs) -> \ return data_object def _validate_aggregation_for_update(self, resource: 'Resource', agg_type: AggregationType) -> None: - if self.metadata.type != agg_type: + if self._aggregation_type != agg_type: raise Exception(f"Not a {agg_type.value} aggregation") if self._data_object is None: @@ -934,45 +1058,123 @@ def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: boo class Resource(Aggregation): """Represents a Resource in HydroShare""" + @property + def _file_manifest_path(self) -> str: + return f"{dirname(self.jsonld_metadata_path)}/file_manifest.json" + + def _associated_media_items(self): + try: + manifest = self._retrieve_and_parse(self._file_manifest_path, as_pydantic=False) + except Exception: + return super()._associated_media_items() + + if isinstance(manifest, dict): + manifest = manifest.get("associatedMedia", manifest) + + if not manifest: + return [] + + try: + if isinstance(manifest, list): + return MEDIA_ITEMS_ADAPTER.validate_python(manifest) + return [MEDIA_ITEMS_ADAPTER.validate_python([manifest])[0]] + except Exception: + return super()._associated_media_items() + + @property + def user_metadata_path(self) -> str: + """The path to the user entered metadata json file for the resource""" + return f"{self.bucket_path}/.hsmetadata/user_metadata.json" + @property def _hsapi_path(self): path = urlparse(str(self.metadata.identifier)).path return '/hsapi' + path + def _build_s3_path(self, path: str = "") -> str: + if self._s3_client is None: + raise ValueError("S3 client is not configured for this resource") + + if not hasattr(self, "bucket_path") or not self.bucket_path: + raise ValueError("Resource S3 path information is not available") + + remote_path_parts = [self.bucket_path.strip("/")] + normalized_path = path.strip("/") if path else "" + if normalized_path: + if normalized_path.startswith(".hsmetadata"): + remote_path_parts.append(normalized_path) + else: + remote_path_parts.append("data") + remote_path_parts.append('contents') + remote_path_parts.append(normalized_path) + return "/".join(remote_path_parts) + def _upload(self, file, destination_path): - path = urljoin(self._hsapi_path, "files", destination_path.strip("/")) - self._hs_session.upload_file(path, files={'file': open(file, 'rb')}, status_code=201) + file_name = os.path.basename(file) + destination = destination_path.strip("/") + remote_path = self._build_s3_path(urljoin(destination, file_name) if destination else file_name) + self._s3_client.put(file, remote_path) def _delete_file(self, path) -> None: - path = urljoin(self._hsapi_path, "files", path) - self._hs_session.delete(path, status_code=200) + remote_path = self._build_s3_path(path) + self._s3_client.rm(remote_path) + + def _move_file(self, src_path: str, dst_path: str) -> None: + src_remote_path = self._build_s3_path(src_path) + dst_remote_path = self._build_s3_path(dst_path) + self._s3_client.mv(src_remote_path, dst_remote_path) + + def _move_folder(self, src_path: str, dst_path: str) -> None: + src_remote_path = self._build_s3_path(src_path) + dst_remote_path = self._build_s3_path(dst_path) + self._s3_client.mv(src_remote_path, dst_remote_path, recursive=True) def _download_file_folder(self, path: str, save_path: str) -> None: + # We don't need to use the S3 client for this as s3 signed + # URLs are used by the rest api endpoint return self._hs_session.retrieve_zip(path, save_path) def _delete_file_folder(self, path: str) -> None: - path = urljoin(self._hsapi_path, "folders", path) - self._hs_session.delete(path, status_code=200) - + if self._file_exists(path): + # path is not an empty folder, so we can delete it from s3 directly + remote_path = self._build_s3_path(path) + self._s3_client.rm(remote_path, recursive=True) + return + # path probably points to an empty folder that would not exist in s3, + # so we need to call the hs api to delete it from database + folder_path = urljoin(self._hsapi_path, "folders", path) + self._hs_session.delete(folder_path, status_code=200) + + def _file_exists(self, path: str) -> bool: + remote_path = self._build_s3_path(path) + return self._s3_client.exists(remote_path) + # system information @property def resource_id(self) -> str: """The resource id (guid) of the HydroShare resource""" - return self._map.identifier + # get the resource id from the jsonld_metadata_path + return self.jsonld_metadata_path.split("/")[1] + # TODO: remove this property @property def metadata_file(self): """The path to the metadata file""" return self.metadata_path.split("/data/", 1)[1] + @property + def bucket_path(self): + """The bucket path for the resource in the S3 bucket""" + return self.jsonld_metadata_path.split("/.hsjsonld/", 1)[0] + def system_metadata(self): """ The system metadata associated with the HydroShare resource returns: JSON object """ - hsapi_path = urljoin(self._hsapi_path, 'sysmeta') - return self._hs_session.get(hsapi_path, status_code=200).json() + system_metadata_path = f"{self.bucket_path}/.hsmetadata/system_metadata.json" + return self._retrieve_and_parse(system_metadata_path, as_pydantic=False) # access operations @@ -997,7 +1199,7 @@ def access_permission(self): # resource operations - def new_version(self): + def new_version(self) -> 'Resource': """ Creates a new version of the resource on HydroShare :return: A Resource object of the newly created resource version @@ -1005,17 +1207,19 @@ def new_version(self): path = urljoin(self._hsapi_path, "version") response = self._hs_session.post(path, status_code=202) resource_id = response.text - return Resource("/resource/{}/data/resourcemap.xml".format(resource_id), self._hs_session) + new_res_jsonld_metadata_path = self.jsonld_metadata_path.replace(self.resource_id, resource_id, 1) + return Resource(new_res_jsonld_metadata_path, self._hs_session, self._s3_client) - def copy(self): + def copy(self) -> 'Resource': """ Copies this Resource into a new resource on HydroShare - returns: A Resource object of the newly copied resource + :return: A Resource object of the newly copied resource """ path = urljoin(self._hsapi_path, "copy") response = self._hs_session.post(path, status_code=202) resource_id = response.text - return Resource("/resource/{}/data/resourcemap.xml".format(resource_id), self._hs_session) + copy_res_jsonld_metadata_path = self.jsonld_metadata_path.replace(self.resource_id, resource_id, 1) + return Resource(copy_res_jsonld_metadata_path, self._hs_session, self._s3_client) def download(self, save_path: str = "") -> str: """ @@ -1037,12 +1241,12 @@ def delete(self) -> None: @refresh def save(self) -> None: """ - Saves the metadata to HydroShare + Saves the user provided metadata to HydroShare as user_metadata.json in the .hsmetadata folder :return: None """ - metadata_string = rdf_string(self._retrieved_metadata, rdf_format="xml") - path = urljoin(self._hsapi_path, "ingest_metadata") - self._hs_session.upload_file(path, files={'file': ('resourcemetadata.xml', metadata_string)}) + metadata = MetadataAdapter.to_resource_metadata(self.metadata) + metadata_json = metadata.model_dump_json(by_alias=True, exclude_none=True) + self._s3_client.write_text(self.user_metadata_path, metadata_json) # referenced content operations @@ -1098,6 +1302,8 @@ def folder_rename(self, path: str, new_path: str) -> None: :param new_path: the new path folder name :return: None """ + # Note: Folder operations are handled through rest endpoints instead of S3 client + # because the folder can be an empty folder and S3 storage doesn't support empty folder concept. self.file_rename(path=path, new_path=new_path) @refresh @@ -1152,8 +1358,7 @@ def file_rename(self, path: str, new_path: str) -> None: :param new_path: the renamed path to the file :return: None """ - rename_path = urljoin(self._hsapi_path, "functions", "move-or-rename") - self._hs_session.post(rename_path, status_code=200, data={"source_path": path, "target_path": new_path}) + self._move_file(src_path=path, dst_path=new_path) @refresh def file_zip(self, path: str, zip_name: str = None, remove_file: bool = True) -> None: @@ -1184,7 +1389,7 @@ def file_unzip(self, path: str, overwrite: bool = True, ingest_metadata=True) -> self._hs_session.post( unzip_path, status_code=200, data={"overwrite": overwrite, "ingest_metadata": ingest_metadata} ) - + # TODO: This function needs to be updated to use s3 protocol def file_aggregate(self, path: str, agg_type: AggregationType, refresh: bool = True): """ Aggregate a file to a HydroShare aggregation type. Aggregating files allows you to specify metadata specific @@ -1195,6 +1400,10 @@ def file_aggregate(self, path: str, agg_type: AggregationType, refresh: bool = T :param refresh: Defaults True, toggles automatic refreshing of the updated resource in HydroShare :return: The newly created Aggregation object if refresh is True """ + # TODO: For creating a singlefile or fileset aggregation, we just need to write a user_metadata.json + # file as {file_path}.user_metadata.json or {folder_path}/user_metadata.json + # For other aggregation types, I think we need to write the {file_path}.user_metadata.json file + # and we need to update the hsextract application code to extract metadata from that file if the {file_path}.json doesn't exist type_value = agg_type.value data = {} if agg_type == AggregationType.SingleFileAggregation: @@ -1221,40 +1430,34 @@ def file_upload(self, *files: str, destination_path: str = "") -> None: :param destination_path: The path on HydroShare to upload the files to, defaults to the root contents directory :return: None """ - if len(files) == 1: - self._upload(files[0], destination_path=destination_path) - else: - with tempfile.TemporaryDirectory() as tmpdir: - zipped_file = os.path.join(tmpdir, 'files.zip') - with ZipFile(zipped_file, 'w') as zipped: - for file in files: - zipped.write(file, os.path.basename(file)) - self._upload(zipped_file, destination_path=destination_path) - unzip_path = urljoin( - self._hsapi_path, "functions", "unzip", "data", "contents", destination_path, 'files.zip' - ) - self._hs_session.post( - unzip_path, status_code=200, data={"overwrite": "true", "ingest_metadata": "true"} - ) - # TODO, return those files? + for _file in files: + if not os.path.isfile(_file): + raise Exception(f"File '{_file}' does not exist or is not a file") + for _file in files: + self._upload(_file, destination_path=destination_path) # aggregation operations @refresh def aggregation_remove(self, aggregation: Aggregation) -> None: """ - Removes an aggregation from HydroShare. This does not remove the files in the aggregation. + Removes an aggregation from HydroShare. This does not remove the data files in the + aggregation - removes only the associated metadata files :param aggregation: The aggregation object to remove :return: None """ - path = urljoin( - aggregation._hsapi_path, - "functions", - "remove-file-type", - aggregation.metadata.type.value + "LogicalFile", - aggregation.main_file_path, - ) - aggregation._hs_session.post(path, status_code=200) + aggregation_type = aggregation._aggregation_type + if aggregation_type is None: + raise Exception("Aggregation type could not be determined") + # delete the user metadata file for the aggregation if it exists + user_meta_file_to_delete = aggregation.user_metadata_path.replace(self.bucket_path + "/", "", 1) + if self._file_exists(user_meta_file_to_delete): + self._delete_file(user_meta_file_to_delete) + + if aggregation_type not in [AggregationType.FileSetAggregation, AggregationType.SingleFileAggregation]: + extract_meta_file_to_delete = aggregation.extracted_metadata_path.replace(self.bucket_path + "/", "", 1) + if self._file_exists(extract_meta_file_to_delete): + self._delete_file(extract_meta_file_to_delete) aggregation.refresh() @refresh @@ -1265,21 +1468,56 @@ def aggregation_move(self, aggregation: Aggregation, dst_path: str = "") -> None :param dst_path: The target file path to move the aggregation to - target folder must exist :return: None """ - path = urljoin( - aggregation._hsapi_path, - aggregation.metadata.type.value + "LogicalFile", - aggregation.main_file_path, - "functions", - "move-file-type", - dst_path, - ) - response = aggregation._hs_session.post(path, status_code=200) - json_response = response.json() - task_id = json_response['id'] - status, _ = self._hs_session.check_task(task_id) - while status != 'true': - status, _ = self._hs_session.check_task(task_id) - time.sleep(CHECK_TASK_PING_INTERVAL) + aggregation_type = aggregation._aggregation_type + if aggregation_type is None: + raise Exception("Aggregation type could not be determined") + aggr_path = aggregation.main_file_path + if aggregation_type == AggregationType.FileSetAggregation: + dst_path = os.path.join(dst_path, os.path.basename(aggr_path)) + self._move_folder(aggr_path, dst_path) + # move the user metadata file if it exists + user_mata_file_path = self.aggregation.user_metadata_path.replace(self.bucket_path + "/", "", 1) + dst_path = os.path.join(dst_path, "user_metadata.json") + dst_path = f".hsmetadata/{dst_path}" + if self._file_exists(user_mata_file_path): + self._move_file(user_mata_file_path, dst_path) + elif aggregation_type == AggregationType.SingleFileAggregation: + data_file_src_path = aggr_path + data_file_name = os.path.basename(data_file_src_path) + data_file_dst_path = os.path.join(dst_path, data_file_name) + self._move_file(data_file_src_path, data_file_dst_path) + # move the user metadata file if it exists + user_meta_dst_path = os.path.join(dst_path, f"{data_file_name}.user_metadata.json") + user_meta_dst_path = f".hsmetadata/{user_meta_dst_path}" + user_meta_src_path = self.aggregation.user_metadata_path.replace(self.bucket_path + "/", "", 1) + if self._file_exists(user_meta_src_path): + self._move_file(user_meta_src_path, user_meta_dst_path) + else: + # move the data files in the aggregation + for associated_media_item in aggregation._associated_media_items(): + media_item_url_path = associated_media_item["contentUrl"] + file_src_path = self._file_path_from_content_url(media_item_url_path) + file_dst_path = os.path.join(dst_path, os.path.basename(file_src_path)) + if self._file_exists(file_src_path): + self._move_file(file_src_path, file_dst_path) + # move the extracted metadata file if it exists + # TODO: We propbly don't need to move the extracted metadata file as it will be generated + # on content file move as part of s3 event processing + extract_meta_src_path = aggregation.extracted_metadata_path.replace(self.bucket_path + "/", "", 1) + extracted_file_name = os.path.basename(aggr_path) + ".json" + extract_meta_dst_path = os.path.join(dst_path, extracted_file_name) + extract_meta_dst_path = f".hsmetadata/{extract_meta_dst_path}" + if self._file_exists(extract_meta_src_path): + self._move_file(extract_meta_src_path, extract_meta_dst_path) + + # move the user metadata file if it exists + data_file_name = os.path.basename(aggr_path) + user_meta_dst_path = os.path.join(dst_path, f"{data_file_name}.user_metadata.json") + user_meta_dst_path = f".hsmetadata/{user_meta_dst_path}" + user_meta_src_path = self.aggregation.user_metadata_path.replace(self.bucket_path + "/", "", 1) + if self._file_exists(user_meta_src_path): + self._move_file(user_meta_src_path, user_meta_dst_path) + aggregation.refresh() @refresh @@ -1289,7 +1527,38 @@ def aggregation_delete(self, aggregation: Aggregation) -> None: :param aggregation: The aggregation object to delete :return: None """ - aggregation.delete() + aggregation_type = aggregation._aggregation_type + if aggregation_type is None: + raise Exception("Aggregation type could not be determined") + + # delete the user metadata file for the aggregation if it exists + # TODO: We probably don't need to delete the user metadata file as it will be deleted + # when the content file is deleted as part of s3 event processing + if self._s3_client.exists(aggregation.user_metadata_path): + self._s3_client.delete(aggregation.user_metadata_path) + + if aggregation_type == AggregationType.SingleFileAggregation: + # for single file aggregations we just need to delete the data file + media_item_url_path = aggregation._associated_media_items()[0]["contentUrl"] + media_item_path = self._file_path_from_content_url(media_item_url_path) + if self._file_exists(media_item_path): + self._delete_file(media_item_path) + elif aggregation_type == AggregationType.FileSetAggregation: + # for file set aggregations we need to delete the entire folder + self._delete_file_folder(aggregation.main_file_path) + else: + # delete the extracted metadata json file for the aggregation + # TODO: We probably don't need to delete the extracted metadata file as it will be deleted + # when the content file is deleted as part of s3 event processing + if self._s3_client.exists(aggregation.extracted_metadata_path): + self._s3_client.delete(aggregation.extracted_metadata_path) + + # delete all data files in the aggregation + for associated_media_item in aggregation._associated_media_items(): + media_item_url_path = associated_media_item["contentUrl"] + media_item_path = self._file_path_from_content_url(media_item_url_path) + if self._file_exists(media_item_path): + self._delete_file(media_item_path) def aggregation_download(self, aggregation: Aggregation, save_path: str = "", unzip_to: str = None) -> str: """ @@ -1337,7 +1606,7 @@ def __init__( def set_auth(self, auth): if self._client_id: - raise NotImplementedError(f"This session is an Oauth2 session and does not provide the set_oauth method") + raise NotImplementedError("This session is an Oauth2 session and does not provide the set_oauth method") self._session.auth = auth def set_oauth(self, client_id: str, token: Union[Token, Dict[str, str]]): @@ -1501,6 +1770,7 @@ class HydroShare: default_host = 'www.hydroshare.org' default_protocol = "https" default_port = 443 + default_s3_endpoint_url = "https://s3.hydroshare.org" def __init__( self, @@ -1510,6 +1780,7 @@ def __init__( protocol: str = default_protocol, port: int = default_port, client_id: str = None, + s3_endpoint_url: str = default_s3_endpoint_url, token: Union[Token, Dict[str, str]] = None, ): if client_id or token: @@ -1528,6 +1799,12 @@ def __init__( self.my_user_info() # validate credentials self._resource_object_cache: Dict[str, Resource] = dict() + self._s3_access_key: str = None + self._s3_secret_key: str = None + self._s3_endpoint_url = s3_endpoint_url + if username and password: + self._set_user_s3_credentials() + self._create_s3_client() def sign_in(self) -> None: """Prompts for username/password. Useful for avoiding saving your HydroShare credentials to a notebook""" @@ -1535,6 +1812,10 @@ def sign_in(self) -> None: password = getpass.getpass("Password for {}: ".format(username)) self._hs_session.set_auth((username, password)) self.my_user_info() # validate credentials + # set user s3 credentials + self._set_user_s3_credentials() + # create s3 client + self._create_s3_client() @classmethod def hs_juptyerhub(cls, hs_auth_path="/home/jovyan/data/.hs_auth"): @@ -1652,7 +1933,25 @@ def resource(self, resource_id: str, validate: bool = True, use_cache: bool = Tr if resource_id in self._resource_object_cache and use_cache: return self._resource_object_cache[resource_id] - res = Resource("/resource/{}/data/resourcemap.xml".format(resource_id), self._hs_session) + # Retrieve the bucket name and prefix for the resource to access its files in the bucket + try: + response = self._hs_session.get(f'/hsapi/resource/s3/{resource_id}/', status_code=200) + response_json = response.json() + + if 'bucket' not in response_json: + raise Exception(f"ERROR: Resource was not found for resource_id: {resource_id}") + + bucket_name = response_json['bucket'] + prefix = response_json['prefix'] + assert resource_id == prefix.split("/")[0] + + # Use JSON-LD metadata file instead of XML + resource_jsonld_metadata_path = f"{bucket_name}/{resource_id}/.hsjsonld/dataset_metadata.json" + except Exception as e: + raise Exception(f"ERROR: Failed to retrieve S3 path for resource_id: {resource_id} - {str(e)}") + + res = Resource(map_path=resource_jsonld_metadata_path, hs_session=self._hs_session, s3_client=self._s3_client) + if validate: res.metadata @@ -1688,3 +1987,37 @@ def my_user_info(self): """ response = self._hs_session.get('/hsapi/userInfo/', status_code=200) return response.json() + + def _set_user_s3_credentials(self) -> None: + """ + Retrieves and stores the user's S3 credentials from HydroShare + """ + try: + response = self._hs_session.post('/hsapi/user/service/accounts/s3/', status_code=201) + response_json = response.json() + + if 'access_key' not in response_json: + print("ERROR: Invalid username/password") + self._s3_access_key = None + self._s3_secret_key = None + else: + self._s3_access_key = response_json['access_key'] + self._s3_secret_key = response_json['secret_key'] + print("User authentication is successful!") + except Exception as e: + print(f"ERROR: Failed to retrieve S3 credentials - {str(e)}") + self._s3_access_key = None + self._s3_secret_key = None + + def _create_s3_client(self) -> None: + """ + Creates an S3 client object using the user's S3 credentials + """ + if self._s3_access_key and self._s3_secret_key: + self._s3_client = s3fs.S3FileSystem( + key=self._s3_access_key, + secret=self._s3_secret_key, + endpoint_url=self._s3_endpoint_url + ) + else: + self._s3_client = None diff --git a/hsclient/metadata_adapter/__init__.py b/hsclient/metadata_adapter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hsclient/metadata_adapter/adapter.py b/hsclient/metadata_adapter/adapter.py new file mode 100644 index 0000000..56992cb --- /dev/null +++ b/hsclient/metadata_adapter/adapter.py @@ -0,0 +1,24 @@ +from typing import Union + +from hsclient.metadata_adapter.legacy_resource_models import LegacyResourceMetadata +from hsclient.metadata_adapter.resource_models import SchemaOrgResourceMetadata +from hsclient.metadata_adapter.legacy_resource_adapter import LegacyResourceMetadataAdapter +from hsclient.metadata_adapter.resource_adapter import ResourceMetadataAdapter + + +class MetadataAdapter: + @staticmethod + def to_resource_metadata(legacy_metadata: Union[LegacyResourceMetadata, dict]) -> SchemaOrgResourceMetadata: + if isinstance(legacy_metadata, LegacyResourceMetadata): + adapter = LegacyResourceMetadataAdapter(**legacy_metadata.model_dump()) + else: + adapter = LegacyResourceMetadataAdapter(**legacy_metadata) + return adapter.to_resource_metadata() + + @staticmethod + def to_legacy_resource_metadata(metadata: Union[SchemaOrgResourceMetadata, dict]) -> LegacyResourceMetadata: + if isinstance(metadata, SchemaOrgResourceMetadata): + adapter = ResourceMetadataAdapter(**metadata.model_dump()) + else: + adapter = ResourceMetadataAdapter(**metadata) + return adapter.to_legacy_resource_metadata() diff --git a/hsclient/metadata_adapter/legacy_resource_adapter.py b/hsclient/metadata_adapter/legacy_resource_adapter.py new file mode 100644 index 0000000..7544bd3 --- /dev/null +++ b/hsclient/metadata_adapter/legacy_resource_adapter.py @@ -0,0 +1,127 @@ +from hsclient.metadata_adapter.resource_models import SchemaOrgResourceMetadata +from hsclient.metadata_adapter.legacy_resource_models import LegacyResourceMetadata +from hsclient.schema import base as schema + + +class LegacyResourceMetadataAdapter(LegacyResourceMetadata): + """This adapter takes legacy resource metadata and converts it to schema.org format. + This adapter is used for saving user edited legacy resource metadata in schema.org format, + (as user resource metadata) to write to s3. + """ + def to_dataset_creators(self): + creators = [] + for creator in self.creators: + creators.append(creator.to_dataset_creator()) + return creators + + def to_dataset_contributors(self): + contributors = [] + for contributor in self.contributors: + contributors.append(contributor.to_dataset_contributor()) + return contributors + + def to_dataset_funding(self): + grants = [] + for award in self.awards: + grants.append(award.to_dataset_grant()) + return grants + + def to_dataset_associated_media(self): + return self.associatedMedia + + def to_dataset_is_part_of(self): + return self._to_dataset_part_relations("IsPartOf") + + def to_dataset_has_part(self): + return self._to_dataset_part_relations("HasPart") + + def to_dataset_relation(self): + relations = [] + for relation in self.relations: + relations.append(relation.to_dataset_relation()) + return relations + + def _to_dataset_part_relations(self, relation_type: str): + part_relations = [] + for relation in self.relations: + part_relation = relation.to_dataset_part_relation(relation_type) + if part_relation: + part_relations.append(part_relation) + return part_relations + + def to_dataset_spatial_coverage(self): + if self.spatial_coverage: + return self.spatial_coverage.to_dataset_spatial_coverage() + return None + + def to_dataset_period_coverage(self): + if self.period_coverage: + return self.period_coverage.to_dataset_temporal_coverage() + return None + + def to_dataset_keywords(self): + if self.subjects: + return self.subjects + return ["HydroShare"] + + def to_dataset_license(self): + if self.rights: + return self.rights.to_dataset_license() + + def to_dataset_creative_work_status(self): + status_defined_terms = { + "public": schema.Public, + "published": schema.Published, + "discoverable": schema.Discoverable, + "private": schema.Private, + } + if self.sharing_status: + return status_defined_terms[self.sharing_status].model_construct() + + def to_dataset_additional_properties(self): + additional_properties = [] + if self.additional_metadata: + for key, value in self.additional_metadata.items(): + property_value = schema.PropertyValue.model_construct() + property_value.name = key + property_value.value = value + additional_properties.append(property_value) + return additional_properties + + @staticmethod + def to_dataset_provider(): + provider = schema.Organization.model_construct() + provider.name = "HydroShare" + provider.url = "https://www.hydroshare.org/" + return provider + + def to_resource_metadata(self): + # Generate resource metadata in schema.org format from the legacy resource metadata + dataset = SchemaOrgResourceMetadata.model_construct(**self.extra_columns) + dataset.additionalType = self.type + dataset.provider = self.to_dataset_provider() + dataset.name = self.title + dataset.description = self.abstract + dataset.url = self.url + dataset.identifier = [str(self.identifier)] if self.identifier else [] + dataset.creator = self.to_dataset_creators() + dataset.contributor = self.to_dataset_contributors() + dataset.dateCreated = self.created + dataset.dateModified = self.modified + dataset.datePublished = self.published + dataset.keywords = self.to_dataset_keywords() + dataset.inLanguage = self.language + dataset.funding = self.to_dataset_funding() + dataset.spatialCoverage = self.to_dataset_spatial_coverage() + dataset.temporalCoverage = self.to_dataset_period_coverage() + dataset.associatedMedia = self.to_dataset_associated_media() + dataset.isPartOf = self.isPartOf + dataset.hasPart = self.hasPart + if self.publisher: + dataset.publisher = self.publisher.to_dataset_publisher() + dataset.license = self.to_dataset_license() + dataset.citation = [self.citation] + dataset.creativeWorkStatus = self.to_dataset_creative_work_status() + dataset.relation = self.to_dataset_relation() + dataset.additionalProperty = self.to_dataset_additional_properties() + return dataset diff --git a/hsclient/metadata_adapter/legacy_resource_models.py b/hsclient/metadata_adapter/legacy_resource_models.py new file mode 100644 index 0000000..64866d3 --- /dev/null +++ b/hsclient/metadata_adapter/legacy_resource_models.py @@ -0,0 +1,271 @@ +from enum import Enum +from datetime import datetime +from typing import Any, List, Optional, Union, Literal + +from pydantic import BaseModel, HttpUrl, model_validator, ValidationError, TypeAdapter, AnyUrl + +import hsclient.schema.base as schema +from hsclient.schema.core import SchemaBaseModel + + +class StringEnum(str, Enum): + pass + + +url_adapter = TypeAdapter(AnyUrl) + +def is_url(value: str) -> bool: + try: + url_adapter.validate_python(value) + return True + except ValidationError: + return False + + +class BasePerson(BaseModel): + name: Optional[str] = None + email: Optional[str] = None + organization: Optional[str] = None + homepage: Optional[HttpUrl] = None + address: Optional[str] = None + identifiers: Optional[dict] = {} + + def to_dataset_person(self, person_type): + if self.name: + person = person_type.model_construct() + person.name = self.name + if self.email: + person.email = self.email + if self.organization: + affiliation = schema.Organization.model_construct() + affiliation.name = self.organization + person.affiliation = affiliation + _ORCID_identifier = self.identifiers.get("ORCID", "") + if _ORCID_identifier: + person.identifier = _ORCID_identifier + else: + person = schema.Organization.model_construct() + person.name = self.organization + if self.homepage: + person.url = self.homepage + if self.address: + person.address = self.address + + return person + + +class Creator(BasePerson): + + def to_dataset_creator(self): + return self.to_dataset_person(schema.Creator) + + +class Contributor(BasePerson): + + def to_dataset_contributor(self): + return self.to_dataset_person(schema.Contributor) + +class Publisher(BaseModel): + name: Optional[str] = None + url: Optional[HttpUrl] = None + + def to_dataset_publisher(self): + publisher = schema.Organization.model_construct() + publisher.name = self.name + publisher.url = self.url + return publisher + +class Award(BaseModel): + funding_agency_name: str + title: Optional[str] = None + number: Optional[str] = None + funding_agency_url: Optional[HttpUrl] = None + + def to_dataset_grant(self): + grant = schema.Grant.model_construct() + if self.title: + grant.name = self.title + else: + grant.name = self.funding_agency_name + if self.number: + grant.identifier = self.number + + funder = schema.Organization.model_construct() + funder.name = self.funding_agency_name + if self.funding_agency_url: + funder.url = self.funding_agency_url + + grant.funder = funder + return grant + + +class TemporalCoverage(BaseModel): + start: datetime + end: datetime + + def to_dataset_temporal_coverage(self): + temp_cov = schema.TemporalCoverage.model_construct() + if self.start: + temp_cov.startDate = self.start + if self.end: + temp_cov.endDate = self.end + return temp_cov + + +class SpatialCoverageBox(BaseModel): + name: Optional[str] = None + northlimit: float + eastlimit: float + southlimit: float + westlimit: float + type: str = "box" + # TODO: These 2 fields are not supported as there are no matching fields in the + # schema.org spatial coverage model, but we may have to add them to the schema.org + # spatial coverage model if we want to preserve the metadata editing api in hsclient + # units: Optional[str] = None + # projection: Optional[str] = None + + + def to_dataset_spatial_coverage(self): + place = schema.Place.model_construct() + if self.name: + place.name = self.name + + place.geo = schema.GeoShape.model_construct() + place.geo.box = f"{self.northlimit} {self.eastlimit} {self.southlimit} {self.westlimit}" + return place + + +class SpatialCoveragePoint(BaseModel): + name: Optional[str] = None + north: float + east: float + type: str = "point" + # TODO: These 2 fields are not supported as there are no matching fields in the schema.org + # spatial coverage model, but we may have to add them to the schema.org + # spatial coverage model if we want to preserve the metadata editing api in hsclient + # units: Optional[str] = None + # projection: Optional[str] = None + + + def to_dataset_spatial_coverage(self): + place = schema.Place.model_construct() + if self.name: + place.name = self.name + place.geo = schema.GeoCoordinates.model_construct() + place.geo.latitude = self.north + place.geo.longitude = self.east + return place + + +class Relation(BaseModel): + type: str + value: str + + def to_dataset_relation(self): + relation = schema.Relation.model_construct() + relation.name = self.type + self.value = self.value.strip() + if is_url(self.value): + relation.url = self.value + relation.description = self.value + return relation + + def to_dataset_part_relation(self, relation_type: str): + relation = None + if relation_type not in ["isPartOf", "hasPart"]: + relation = schema.IsPartOf.model_construct() + if relation_type == "IsPartOf" and self.type.endswith("is part of"): + relation = schema.IsPartOf.model_construct() + elif relation_type == "HasPart" and self.type.endswith("resource includes"): + relation = schema.HasPart.model_construct() + else: + relation = schema.Relation.model_construct() + relation.name = self.type + + if ',' in self.value: + description, url = self.value.rsplit(',', 1) + else: + description, url = self.value, "" + if not description: + description = "" + if not url: + url = "" + relation.description = description.strip() + relation.url = url.strip() + return relation + + +class Rights(BaseModel): + statement: Optional[str] = None + url: Optional[HttpUrl] = None + + def to_dataset_license(self): + _license = schema.CreativeWork.model_construct() + _license.name = self.statement + _license.url = self.url + return _license + +class LegacyResourceMetadata(SchemaBaseModel): + """This represents legacy metadata model for a HydroShare resource. + This is used to convert legacy resource metadata to resource metadata in schema.org format + to write to s3 as user metadata for a resource. + """ + type: Optional[str] = None + title: Optional[str] = None + abstract: Optional[str] = None + url: Optional[HttpUrl] = None + identifier: Optional[HttpUrl] = None + creators: List[Creator] = [] + contributors: List[Contributor] = [] + created: Optional[datetime] = None + modified: Optional[datetime] = None + published: Optional[datetime] = None + publisher: Optional[Publisher] = None + subjects: Optional[List[str]] = None + language: Optional[str] = None + rights: Optional[Rights] = None + awards: List[Award] = [] + spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]] = None + period_coverage: Optional[TemporalCoverage] = None + relations: Optional[List[Relation]] = [] + + # 'isPartOf', 'hasPart', and 'provider' are not part of the legacy resource metadata model + isPartOf: Optional[List[schema.IsPartOf]] = [] + hasPart: Optional[List[schema.HasPart]] = [] + provider: Optional[Union[schema.Organization, schema.Provider]] = None + citation: Optional[str] = None + + # 'associatedMedia' is not part of the legacy resource metadata model, + # but we need it to generate the resource files objects in hsclient Resource model + associatedMedia: Union[List[Any], Any] = None + sharing_status: Optional[Literal["private", "public", "published", "discoverable"]] = None + additional_metadata: Optional[dict] = {} + extra_columns : Optional[dict] = {} + + _frozen_fields: set = set() + + def freeze_field(self, name: str): + # Check that it's a valid model field + if name not in self.__class__.model_fields: + raise ValueError(f"'{name}' is not a valid field of {self.__class__.__name__}") + + self._frozen_fields.add(name) + + def __setattr__(self, name, value): + if hasattr(self, "_frozen_fields") and name in self._frozen_fields: + raise AttributeError(f"Field '{name}' is frozen") + super().__setattr__(name, value) + + @model_validator(mode="before") + @classmethod + def set_extra_columns(cls, data: Any): + if isinstance(data, dict): + extra_fields = data.keys() - cls.model_fields.keys() + if "extra_columns" not in data: + data["extra_columns"] = {} + if extra_fields: + data["extra_columns"].update( + {field_name: data[field_name] for field_name in extra_fields} + ) + return data diff --git a/hsclient/metadata_adapter/resource_adapter.py b/hsclient/metadata_adapter/resource_adapter.py new file mode 100644 index 0000000..3dc9c10 --- /dev/null +++ b/hsclient/metadata_adapter/resource_adapter.py @@ -0,0 +1,243 @@ +from datetime import datetime +from typing import Optional, Union, List + +from pydantic import HttpUrl +from hsclient.metadata_adapter.resource_models import ( + SchemaOrgCreator, + SchemaOrgContributor, + SchemaOrgOrganization, +) +from hsclient.metadata_adapter.legacy_resource_models import ( + Rights as LegacyRights, + Award as LegacyAward, + Relation as LegacyRelation, + TemporalCoverage as LegacyPeriodCoverage, + SpatialCoverageBox as LegacyBoxCoverage, + SpatialCoveragePoint as LegacyPointCoverage, + Publisher as LegacyPublisher, + LegacyResourceMetadata, +) +from hsclient.schema.base import ( + CreativeWork, + Grant, + IsPartOf, + HasPart, + Place, + PublisherOrganization, + PropertyValue, + Relation, + SchemaBaseModel, + TemporalCoverage, + GeoCoordinates, + Organization, + Provider, + Draft, + Private, + Incomplete, + Obsolete, + Published, + Public, + Discoverable, + GeoShape, + MediaType, +) + + +class ResourceMetadataAdapter(SchemaBaseModel): + """A pydantic model representing the Schema.org based CoreMetadata for HydroShare resources, + with methods to convert to legacy resource metadata models used for metadata editing using hsclient.""" + type: Optional[str] = None + additionalType: Optional[str] = None + name: Optional[str] = None + description: Optional[str] = None + url: Optional[HttpUrl] = None + identifier: Optional[List[str]] = None + creator: Optional[List[Union[SchemaOrgCreator, SchemaOrgOrganization]]] = [] + contributor: Optional[List[Union[SchemaOrgContributor, SchemaOrgOrganization]]] = [] + dateCreated: Optional[datetime] = None + dateModified: Optional[datetime] = None + datePublished: Optional[datetime] = None + keywords: Optional[List[str]] = [] + inLanguage: Optional[str] = None + license: Optional[Union[CreativeWork, HttpUrl]] = None + funding: Optional[List[Grant]] = [] + relation: Optional[List[Relation]] = [] + associatedMedia: Optional[Union[MediaType, List[MediaType]]] =[] + isPartOf: Optional[List[IsPartOf]] = [] + hasPart: Optional[List[HasPart]] = [] + + temporalCoverage: Optional[TemporalCoverage] = None + spatialCoverage: Optional[Place] = None + publisher: Optional[PublisherOrganization] = None + additionalProperty: Optional[List[PropertyValue]] = [] + citation: Optional[List[str]] = [] + # No need to convert provider as there is no matching field in the legacy metadata model + # and it is not allowed for editing using hsclient + provider: Union[Organization, Provider] = None + + creativeWorkStatus: Optional[Union[Draft, Private, Incomplete, Obsolete, Published, Public, Discoverable]] = None + + def to_legacy_sharing_status(self) -> Optional[str]: + if self.creativeWorkStatus is None: + return None + print(f"Creative work status: {self.creativeWorkStatus}") + if isinstance(self.creativeWorkStatus, Published): + return "published" + elif isinstance(self.creativeWorkStatus, Public): + return "public" + elif isinstance(self.creativeWorkStatus, Discoverable): + return "discoverable" + return 'private' + + def to_legacy_citation(self) -> Optional[str]: + if not self.citation: + return None + return self.citation[0] + + def to_legacy_additional_metadata(self) -> Optional[dict]: + if not self.additionalProperty: + return {} + + return {prop.name: prop.value for prop in self.additionalProperty} + + def to_legacy_publisher(self) -> Optional[LegacyPublisher]: + if not self.publisher: + return None + publisher = LegacyPublisher.model_construct() + publisher.name = self.publisher.name + if self.publisher.url: + publisher.url = str(self.publisher.url) + return publisher + + def to_legacy_spatial_coverage(self) -> Optional[Union[LegacyPointCoverage, LegacyBoxCoverage]]: + if not self.spatialCoverage or not self.spatialCoverage.geo: + return None + + geo = self.spatialCoverage.geo + if isinstance(geo, GeoCoordinates): + return LegacyPointCoverage.model_construct( + name=self.spatialCoverage.name, + north=geo.latitude, + east=geo.longitude, + type="point", + ) + elif isinstance(geo, GeoShape): + northlimit, eastlimit, southlimit, westlimit = None, None, None, None + try: + northlimit, eastlimit, southlimit, westlimit = map(float, geo.box.split()) + except Exception as e: + print(f"Error parsing geo.box string: {geo.box}, error: {e}") + return None + + return LegacyBoxCoverage.model_construct( + name=self.spatialCoverage.name, + northlimit=northlimit, + eastlimit=eastlimit, + southlimit=southlimit, + westlimit=westlimit, + type="box", + ) + else: + return None + + def to_legacy_temporal_coverage(self) -> Optional[LegacyPeriodCoverage]: + if not self.temporalCoverage: + return None + legacy_period = LegacyPeriodCoverage.model_construct() + legacy_period.start = self.temporalCoverage.startDate + if self.temporalCoverage.endDate: + legacy_period.end = self.temporalCoverage.endDate + return legacy_period + + def to_legacy_relations(self) -> Optional[List[LegacyRelation]]: + if not self.relation: + return [] + legacy_relations = [] + for relation in self.relation: + if type(relation) in (IsPartOf, HasPart): + continue + legacy_relation = LegacyRelation.model_construct() + legacy_relation.type = relation.name + legacy_relation.value = _build_relation_value(relation.description, relation.url) + legacy_relations.append(legacy_relation) + return legacy_relations + + def to_legacy_award(self) -> Optional[List[LegacyAward]]: + if not self.funding: + return [] + awards = [] + for grant in self.funding: + award = LegacyAward.model_construct() + award.title = grant.name + if grant.identifier: + award.number = grant.identifier + if grant.funder and grant.funder.name: + award.funding_agency_name = grant.funder.name + if grant.funder.url: + award.funding_agency_url = str(grant.funder.url) + awards.append(award) + return awards + + def to_legacy_rights(self) -> Optional[LegacyRights]: + if self.license is None: + return None + if not isinstance(self.license, CreativeWork): + return LegacyRights(url=str(self.license)) + return LegacyRights(statement=self.license.name, url=str(self.license.url) if self.license.url else None) + + def to_legacy_resource_metadata(self) -> LegacyResourceMetadata: + legacy_metadata = LegacyResourceMetadata.model_construct() + legacy_metadata.type = self.additionalType + legacy_metadata.title = self.name + legacy_metadata.abstract = self.description + legacy_metadata.url = self.url + if self.identifier: + legacy_metadata.identifier = self.identifier[0] + legacy_metadata.creators = [creator.to_legacy_creator() for creator in self.creator or []] + legacy_metadata.contributors = [contributor.to_legacy_contributor() for contributor in self.contributor or []] + legacy_metadata.created = self.dateCreated + legacy_metadata.modified = self.dateModified + legacy_metadata.published = self.datePublished + legacy_metadata.subjects = self.keywords + legacy_metadata.language = self.inLanguage + legacy_metadata.rights = self.to_legacy_rights() + legacy_metadata.awards = self.to_legacy_award() + legacy_metadata.spatial_coverage = self.to_legacy_spatial_coverage() + legacy_metadata.period_coverage = self.to_legacy_temporal_coverage() + legacy_metadata.relations = self.to_legacy_relations() + # The legacy model originally doesnot have hasPart, isPartOf, and provider fields, + # we are providing them here for completeness so that they can be accessed in hsclient, + # - no conversion is needed from schemaorg to legacy for these fields + legacy_metadata.hasPart = self.hasPart + legacy_metadata.isPartOf = self.isPartOf + legacy_metadata.provider = self.provider + + legacy_metadata.citation = self.to_legacy_citation() + legacy_metadata.additional_metadata = self.to_legacy_additional_metadata() + legacy_metadata.associatedMedia = self.associatedMedia + legacy_metadata.publisher = self.to_legacy_publisher() + legacy_metadata.sharing_status = self.to_legacy_sharing_status() + # set the frozen fields so that these fields can't be edited using hsclient + for field in [ + "type", + "url", + "created", + "modified", + "published", + "publisher", + "identifier", + "sharing_status", + "citation", + 'provider', + "associatedMedia", + ]: + legacy_metadata.freeze_field(field) + + return legacy_metadata + +def _build_relation_value(description: Optional[str], url: Optional[str]) -> str: + description = (description or "").strip() + url = (str(url) if url else "").strip() + if description and url: + return f"{description}, {url}" + return description or url diff --git a/hsclient/metadata_adapter/resource_models.py b/hsclient/metadata_adapter/resource_models.py new file mode 100644 index 0000000..64cd3e9 --- /dev/null +++ b/hsclient/metadata_adapter/resource_models.py @@ -0,0 +1,118 @@ +from datetime import datetime +from typing import Optional, Union, List + +from pydantic import Field + +from hsclient.metadata_adapter.legacy_resource_models import Creator as LegacyCreator +from hsclient.metadata_adapter.legacy_resource_models import Contributor as LegacyContributor +from hsclient.schema.base import ( + Contributor, + Creator, + Organization, + MediaType, +) +from hsclient.schema.core import CoreMetadata + + +class SchemaOrgResourceMetadata(CoreMetadata): + # This class overrides the CoreMetadata frozen fields for the adapter to work + type: str = Field( + alias="@type", # type: ignore + default="CreativeWork", + description="A creative work that may include various forms of content, such as datasets," + " software source code, digital documents, etc.", + json_schema_extra={"readOnly": True}, + ) + additionalType: Optional[str] = Field( + title="Additional type", + description="An additional type for the resource. This can be used to further specify the type of the" + " resource (e.g., Composite Resource).", + json_schema_extra={"readOnly": True}, + ) + dateCreated: datetime = Field( + title="Date created", description="The date on which the resource was created.", + json_schema_extra={"readOnly": True}, + ) + datePublished: Optional[datetime] = Field( + title="Date published", + description="Date of first publication for the resource.", + default=None, + json_schema_extra={"readOnly": True}, + ) + dateModified: Optional[datetime] = Field( + title="Date modified", + description="The date on which the resource was most recently modified or updated.", + default=None, + json_schema_extra={"readOnly": True}, + ) + associatedMedia: Optional[Union[MediaType, List[MediaType]]] = Field( + title="Resource content", + description="A media object that encodes this CreativeWork. This property is a synonym for encoding.", + default=None, + json_schema_extra={"readOnly": True}, + ) + citation: Optional[List[str]] = Field( + title="Citation", + description="A bibliographic citation for the resource.", + default=None, + json_schema_extra={"readOnly": True}, + ) + +class SchemaOrgOrganization(Organization): + + def to_legacy_creator(self) -> LegacyCreator: + creator = LegacyCreator.model_construct() + creator.organization = self.name + if self.url: + creator.homepage = str(self.url) + if self.address: + creator.address = self.address + return creator + + def to_legacy_contributor(self) -> LegacyContributor: + contributor = LegacyContributor.model_construct() + contributor.organization = self.name + if self.url: + contributor.homepage = str(self.url) + if self.address: + contributor.address = self.address + return contributor + + +class SchemaOrgCreator(Creator): + + def to_legacy_creator(self) -> LegacyCreator: + creator = LegacyCreator.model_construct() + creator.name = self.name + creator.email = self.email + if self.identifier: + orcid = _normalize_orcid(str(self.identifier)) + if orcid: + creator.identifiers = {"ORCID": orcid} + if self.affiliation: + creator.organization = self.affiliation.name + + return creator + +class SchemaOrgContributor(Contributor): + + def to_legacy_contributor(self) -> LegacyContributor: + contributor = LegacyContributor.model_construct() + contributor.name = self.name + contributor.email = self.email + if self.identifier: + orcid = _normalize_orcid(str(self.identifier)) + if orcid: + contributor.identifiers = {"ORCID": orcid} + if self.affiliation: + contributor.organization = self.affiliation.name + return contributor + + +def _normalize_orcid(orcid: Optional[str]) -> Optional[str]: + if not orcid: + return None + orcid = str(orcid).strip() + if orcid.startswith("http://orcid.org/") or orcid.startswith("https://orcid.org/"): + return str(orcid) + return None diff --git a/hsclient/schema/__init__.py b/hsclient/schema/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hsclient/schema/base.py b/hsclient/schema/base.py new file mode 100644 index 0000000..1a376be --- /dev/null +++ b/hsclient/schema/base.py @@ -0,0 +1,792 @@ +import re +from datetime import datetime +from enum import Enum +from typing import Any, List, Optional, Union, Literal + + +from pydantic import ( + BaseModel, + ConfigDict, + EmailStr, + Field, + HttpUrl, + AnyUrl, + field_validator, + model_validator, + GetJsonSchemaHandler, + ValidationInfo, +) + + +from pydantic.json_schema import JsonSchemaValue + +orcid_pattern = "\\b\\d{4}-\\d{4}-\\d{4}-\\d{3}[0-9X]\\b" +orcid_pattern_placeholder = "e.g. '0000-0001-2345-6789'" +orcid_pattern_error = "must match the ORCID pattern. e.g. '0000-0001-2345-6789'" + + +def modify_json_schema(schema: dict[str, Any]) -> None: + for prop in schema.get("properties", {}).values(): + if "format" in prop and prop["format"] == "uri": + # Replace "format" with a regex pattern for URL matching + prop.pop("format") + prop["pattern"] = ( + "^(http:\\/\\/www\\.|https:\\/\\/www\\.|http:\\/\\/|https:\\/\\/)?" + "[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}(:[0-9]{1,5})?" + "(\\/.*)?$" + ) + prop["errorMessage"] = {"pattern": 'must match format "url"'} + + +class SchemaBaseModel(BaseModel): + model_config = ConfigDict(json_schema_extra=modify_json_schema, extra="ignore") + model_config.update(arbitrary_types_allowed=True, json_encoders={HttpUrl: str, AnyUrl: str}) + + +class DefinedTerm(SchemaBaseModel): + type: str = Field(alias="@type", default="DefinedTerm") + name: str = Field(description="The name of the term or item being defined.") + description: str = Field(description="The description of the item being defined.") + + +class Published(DefinedTerm): + name: Literal["Published"] = Field(default="Published") + description: str = Field( + default="The resource has been permanently published and should be considered final and complete", + readOnly=True, + description="The description of the item being defined.", + ) + + +class Public(DefinedTerm): + name: Literal["Public"] = Field(default="Public") + description: str = Field( + default="The resource is publicly accessible and can be viewed or downloaded by anyone", + readOnly=True, + description="The description of the item being defined.", + ) + + +class Private(DefinedTerm): + name: Literal["Private"] = Field(default="Private") + description: str = Field( + default="The resource is private and can only be accessed by authorized users", + readOnly=True, + description="The description of the item being defined.", + ) + + +class Discoverable(DefinedTerm): + name: Literal["Discoverable"] = Field(default="Discoverable") + description: str = Field( + default="The resource is discoverable and can be found through search engines or other discovery mechanisms", + readOnly=True, + description="The description of the item being defined.", + ) + + +class CreativeWork(SchemaBaseModel): + type: Literal["CreativeWork"] = Field( + alias="@type", # type: ignore + default="CreativeWork", + description="Submission type can include various forms of content, such as datasets, " + "software source code, digital documents, etc.", + ) + name: Optional[str] = Field(description="Submission's name or title", title="Name or title", default=None) + description: Optional[str] = Field( + description="The description of the creative work.", default=None + ) + url: Optional[HttpUrl] = Field( + title="URL", + description="A URL to the creative work.", + default=None, + ) + + +class Person(SchemaBaseModel): + type: Literal["Person"] = Field( + alias="@type", description="A person.", default="Person" # type: ignore + ) + name: str = Field( + description="A string containing the full name of the person. Personal name format: Family Name, Given Name." + ) + email: Optional[EmailStr] = Field( + description="A string containing an email address for the person.", default=None + ) + identifier: Optional[List[str]] = Field( + description="Unique identifiers for the person. Where identifiers can be encoded as URLs, enter URLs here.", + default=None, + ) + model_config = { + "populate_by_name": True, # Ensures aliases work during model initialization + } + + +class Organization(SchemaBaseModel): + type: Literal["Organization"] = Field( + alias="@type", # type: ignore + default="Organization", + ) + name: str = Field(description="Name of the provider organization or repository.") + url: Optional[HttpUrl] = Field( + title="URL", + description="A URL to the homepage for the organization.", + default=None, + ) + address: Optional[str] = Field( + description="Full address for the organization - e.g., “8200 Old Main Hill, Logan, UT 84322-8200”.", + default=None, + ) # Should address be a string or another constrained type? + model_config = { + "populate_by_name": True, # Ensures aliases work during model initialization + } + + +class Affiliation(Organization): + name: str = Field( + description="Name of the organization the creator is affiliated with." + ) + + +class Provider(Person): + identifier: Optional[str] = Field( + description="ORCID identifier for the person.", + json_schema_extra={ + "pattern": orcid_pattern, + "options": {"placeholder": orcid_pattern_placeholder}, + "errorMessage": {"pattern": orcid_pattern_error}, + }, + default=None, + ) + email: Optional[EmailStr] = Field( + description="A string containing an email address for the provider.", + default=None, + ) + affiliation: Optional[Affiliation] = Field( + description="The affiliation of the creator with the organization.", + default=None, + ) + + +class Creator(Person): + identifier: Optional[str] = Field( + description="ORCID identifier for creator.", + json_schema_extra={ + "pattern": orcid_pattern, + "options": {"placeholder": orcid_pattern_placeholder}, + "errorMessage": {"pattern": orcid_pattern_error}, + }, + default=None, + ) + email: Optional[EmailStr] = Field( + description="A string containing an email address for the creator.", + default=None, + ) + affiliation: Optional[Affiliation] = Field( + description="The affiliation of the creator with the organization.", + default=None, + ) + + +class Contributor(Person): + identifier: Optional[str] = Field( + description="ORCID identifier for contributor.", + json_schema_extra={ + "pattern": orcid_pattern, + "options": {"placeholder": orcid_pattern_placeholder}, + "errorMessage": {"pattern": orcid_pattern_error}, + }, + default=None, + ) + email: Optional[EmailStr] = Field( + description="A string containing an email address for the creator.", + default=None, + ) + affiliation: Optional[Affiliation] = Field( + description="The affiliation of the creator with the organization.", + default=None, + ) + + +class FunderOrganization(Organization): + @classmethod + def __get_pydantic_json_schema__( + cls, schema: JsonSchemaValue, handler: GetJsonSchemaHandler + ) -> JsonSchemaValue: + schema.update(schema, title="Funding Organization") + return schema + + name: str = Field(description="Name of the organization.") + + +class PublisherOrganization(Organization): + name: str = Field(description="Name of the publishing organization.") + url: Optional[HttpUrl] = Field( + title="URL", + description="A URL to the homepage for the publisher organization or repository.", + default=None, + ) + + +class SourceOrganization(Organization): + name: str = Field(description="Name of the organization that created the data.") + + +class DefinedTerm(SchemaBaseModel): + type: Literal["DefinedTerm"] = Field(alias="@type", default="DefinedTerm") # type: ignore + name: str = Field(description="The name of the term or item being defined.") + description: str = Field(description="The description of the item being defined.") + + +class Draft(DefinedTerm): + name: Literal["Draft"] = Field(default="Draft") + description: str = Field( + default="The resource is in draft state and should not be considered final. Content and metadata may change", + description="The description of the item being defined.", + json_schema_extra={"readOnly": True}, + ) + + +class Incomplete(DefinedTerm): + name: Literal["Incomplete"] = Field(default="Incomplete") + description: str = Field( + default="Data collection is ongoing or the resource is not completed", + description="The description of the item being defined.", + json_schema_extra={"readOnly": True}, + ) + + +class Obsolete(DefinedTerm): + name: Literal["Obsolete"] = Field(default="Obsolete") + description: str = Field( + default="The resource has been replaced by a newer version, or the resource is no longer considered applicable", + description="The description of the item being defined.", + json_schema_extra={"readOnly": True}, + ) + + +class HasPart(CreativeWork): + url: Optional[HttpUrl] = Field( + title="URL", description="The URL address to the data resource.", default=None + ) + description: Optional[str] = Field( + description="Information about a related resource that is part of this resource.", + default=None, + ) + + +class IsPartOf(CreativeWork): + url: Optional[HttpUrl] = Field( + title="URL", description="The URL address to the data resource.", default=None + ) + description: Optional[str] = Field( + description="Information about a related resource that this resource is a " + "part of - e.g., a related collection.", + default=None, + ) + + +class Relation(CreativeWork): + url: Optional[HttpUrl] = Field( + title="URL", description="The URL address to the data resource.", default=None + ) + description: Optional[str] = Field( + description="Holds all relations other than 'hasPart' and 'isPartOf'.", + default=None, + ) + + +class MediaObjectPartOf(CreativeWork): + url: Optional[HttpUrl] = Field( + title="URL", + description="The URL address to the related metadata document.", + default=None, + ) + description: Optional[str] = Field( + description="Information about a related metadata document.", default=None + ) + + +class SubjectOf(CreativeWork): + url: Optional[HttpUrl] = Field( + title="URL", + description="The URL address that serves as a reference to access additional details related to the record. " + "It is important to note that this type of metadata solely pertains to the record itself and " + "may not necessarily be an integral component of the record, unlike the HasPart metadata.", + default=None, + ) + description: Optional[str] = Field( + description="Information about a related resource that is about or describes this " + "resource - e.g., a related metadata document describing the resource.", + default=None, + ) + + +class LanguageEnum(str, Enum): + @classmethod + def __get_pydantic_json_schema__( + cls, schema: JsonSchemaValue, handler: GetJsonSchemaHandler + ) -> JsonSchemaValue: + schema.update(type="string", title="Language", description="") + return schema + + eng = "eng" + esp = "esp" + + +class InLanguageStr(str): + @classmethod + def __get_pydantic_json_schema__( + cls, schema: JsonSchemaValue, handler: GetJsonSchemaHandler + ) -> JsonSchemaValue: + schema.update( + type="string", title="Other", description="Please specify another language." + ) + return schema + + +# TODO: should we allow a list of identifiers? Does this align with SchemaOrg? +# Doing so means that we are actually storing this as a comma separated string +# of identifiers which seems strange. +class IdentifierStr(str): + @classmethod + def __get_pydantic_json_schema__( + cls, schema: JsonSchemaValue, handler: GetJsonSchemaHandler + ) -> JsonSchemaValue: + schema.update( + {"type": "array", "items": {"type": "string", "title": "Identifier"}} + ) + return schema + + @classmethod + def __get_validators__(cls): + yield cls.validate + + @classmethod + def validate( + cls, value: Union[str, List[str]], info: ValidationInfo + ) -> "IdentifierStr": + if isinstance(value, str): + value = [value] # Convert single string to list + if not isinstance(value, list) or not all( + isinstance(item, str) for item in value + ): + raise TypeError("Identifier must be a string or a list of strings") + return cls(", ".join(value)) # Join the list into a single string for storage + + +# TODO: start here. +class SpatialReference(SchemaBaseModel): + type: Literal["SpatialReference"] = Field( + alias="@type", # type: ignore + default="SpatialReference", + description="The spatial reference system associated with the Place's geographic representation.", + ) + name: str = Field( + title="Name", + description="Name of the spatial reference system.", + ) + srsType: str = Field( + title="SRS Type", + description="Type of the spatial reference system, either Geographic or Projected.", + ) + code: Optional[str] = Field( + title="Code", + description="Code of the spatial reference system.", + default=None, + ) + wktString: Optional[str] = Field( + title="SRS WKT String", + description="The string representation of the spatial reference system in Well-Known-Text format.", + default=None, + ) + + @field_validator("srsType") + def validate_content_size(cls, v): + v = v.strip().lower() + if not v: + raise ValueError("empty string") + if v not in ["geographic", "projected"]: + raise ValueError("SRS Type must be either 'geographic' or 'projected'") + return v + + +class Grant(SchemaBaseModel): + type: Literal["Grant"] = Field( + alias="@type", # type: ignore + default="Grant", + description="This metadata represents details about a grant or financial assistance provided to an " + "individual(s) or organization(s) for supporting the work related to the record.", + ) + name: str = Field( + title="Name or title", + description="A text string indicating the name or title of the grant or financial assistance.", + ) + description: Optional[str] = Field( + description="A text string describing the grant or financial assistance.", + default=None, + ) + identifier: Optional[str] = Field( + title="Funding identifier", + description="Grant award number or other identifier.", + default=None, + ) + funder: Optional[FunderOrganization] = Field( + description="The organization that provided the funding or sponsorship.", + default=None, + ) + + +class TemporalCoverage(SchemaBaseModel): + startDate: datetime = Field( + title="Start date", + description="A date/time object containing the instant corresponding to the commencement of the time " + "interval (ISO8601 formatted date - YYYY-MM-DDTHH:MM).", + json_schema_extra={ + "formatMaximum": {"$data": "1/endDate"}, + "errorMessage": { + "formatMaximum": "must be lesser than or equal to End date" + }, + }, + ) + endDate: Optional[datetime] = Field( + title="End date", + description="A date/time object containing the instant corresponding to the termination of the time " + "interval (ISO8601 formatted date - YYYY-MM-DDTHH:MM). If the ending date is left off, " + "that means the temporal coverage is ongoing.", + json_schema_extra={ + "formatMinimum": {"$data": "1/startDate"}, + "errorMessage": { + "formatMinimum": "must be greater than or equal to Start date" + }, + }, + default=None, + ) + + +class GeoCoordinates(SchemaBaseModel): + type: Literal["GeoCoordinates"] = Field( + alias="@type", # type: ignore + default="GeoCoordinates", + description="Geographic coordinates that represent a specific location on the Earth's surface. " + "GeoCoordinates typically consists of two components: latitude and longitude.", + ) + latitude: float = Field( + description="Represents the angular distance of a location north or south of the equator, " + "measured in degrees and ranges from -90 to +90 degrees." + ) + longitude: float = Field( + description="Represents the angular distance of a location east or west of the Prime Meridian, " + "measured in degrees and ranges from -180 to +180 degrees." + ) + + @field_validator("latitude") + def validate_latitude(cls, v): + if not -90 <= v <= 90: + raise ValueError("Latitude must be between -90 and 90") + return v + + @field_validator("longitude") + def validate_longitude(cls, v): + if not -180 <= v <= 180: + raise ValueError("Longitude must be between -180 and 180") + return v + + +class GeoShape(SchemaBaseModel): + type: Literal["GeoShape"] = Field( + alias="@type", # type: ignore + default="GeoShape", + description="A structured representation that describes the coordinates of a geographic feature.", + ) + validate_bbox: bool = Field( + default=True, + exclude=True, + description="Flag to turn on/off bounding box validation", + ) + box: str = Field( + description="A box is a rectangular region defined by a pair of coordinates representing the " + "southwest and northeast corners of the box." + ) + + @field_validator("box") + def validate_box(cls, v, info): + # ignoring validation for now + if not isinstance(v, str): + raise TypeError("string required") + v = v.strip() + if not v: + raise ValueError("empty string") + + # exit if validation is turned off + if not info.data.get("validate_bbox", 'Could not find "validate_bbox"'): + return v + + v_parts = v.split(" ") + if len(v_parts) != 4: + raise ValueError("Bounding box must have 4 coordinate points") + for index, item in enumerate(v_parts, start=1): + try: + item = float(item) + except ValueError: + raise ValueError("Bounding box coordinate value is not a number") + item = abs(item) + if index % 2 == 0: + if item >= 180: + raise ValueError( + f"Bounding box coordinate east/west must be between -180 and 180, got {item}" + ) + elif item >= 90: + raise ValueError( + f"Bounding box coordinate north/south must be between -90 and 90, got {item}" + ) + + return v + + +class PropertyValue(SchemaBaseModel): + type: Literal["PropertyValue"] = Field( + alias="@type", # type: ignore + default="PropertyValue", + description="A property-value pair.", + ) + name: str = Field(description="The name of the property.") + + value: Union[str, float, bool] = ( + Field( # this also could be a StructuredValue for more complex cases (if we want) + description="The value of the property." + ) + ) + + propertyID: Optional[str] = Field( + title="Property ID", description="The ID of the property.", default=None + ) + unitCode: Optional[str] = Field( + title="Measurement unit", + description="The unit of measurement for the value.", + default=None, + ) + description: Optional[str] = Field( + description="A description of the property.", default=None + ) + minValue: Optional[float] = Field( + title="Minimum value", + description="The minimum allowed value for the property.", + default=None, + ) + maxValue: Optional[float] = Field( + title="Maximum value", + description="The maximum allowed value for the property.", + default=None, + ) + measurementTechnique: Optional[str] = Field( + title="Measurement technique", + description="A technique or technology used in a measurement.", + default=None, + ) + model_config = { + "populate_by_name": True, # Ensures aliases work during model initialization + "title": "PropertyValue", + } + + +class Place(SchemaBaseModel): + type: Literal["Place"] = Field( + alias="@type", # type: ignore + default="Place", + description="Represents the focus area of the record's content.", + ) + name: Optional[str] = Field(description="Name of the place.", default=None) + geo: Optional[Union[GeoCoordinates, GeoShape]] = Field( + description="Specifies the geographic coordinates of the place in the form of a point location, line, " + "or area coverage extent.", + default=None, + ) + + additionalProperty: Optional[List[PropertyValue]] = Field( + title="Additional properties", + default=None, + description="Additional properties of the place.", + ) + + srs: Optional[SpatialReference] = Field( + description="The spatial reference system associated with the Place's geographic representation", + default=None, + ) + + @model_validator(mode="after") + def validate_geo_or_name_required(self): + if not self.name and not self.geo: + raise ValueError( + "Either place name or geo location of the place must be provided" + ) + return self + + +class MediaObject(SchemaBaseModel): + type: Literal["MediaObject"] = Field( + alias="@type", # type: ignore + default="MediaObject", + description="An item that encodes the record.", + ) + contentUrl: Union[str, AnyUrl] = Field( + title="Content URL", + description="The direct URL link to access or download the actual content of the media object.", + ) + encodingFormat: Optional[str] = Field( + title="Encoding format", + description="Represents the specific file format in which the media is encoded.", + default=None, + ) # TODO enum for encoding formats + contentSize: str = Field( + title="Content size", + description="Represents the file size, expressed in bytes, kilobytes, megabytes, or another " + "unit of measurement.", + ) + name: str = Field(description="The name of the media object (file).") + sha256: Optional[str] = Field( + title="SHA-256", + description="The SHA-256 hash of the media object.", + default=None, + ) + isPartOf: Optional[List[MediaObjectPartOf]] = Field( + title="Is part of", + description="Link to or citation for a related metadata document that this media object is a part of", + default=None, + ) + + @field_validator("contentSize") + def validate_content_size(cls, v): + v = v.strip() + if not v: + raise ValueError("empty string") + + match = re.match(r"([0-9.]+)([a-zA-Z]+$)", v.replace(" ", "")) + if not match: + raise ValueError("invalid format") + + size_unit = match.group(2) + if size_unit.upper() not in [ + "KB", + "MB", + "GB", + "TB", + "PB", + "KILOBYTES", + "MEGABYTES", + "GIGABYTES", + "TERABYTES", + "PETABYTES", + ]: + raise ValueError("invalid unit") + + return v + + # TODO: not validating the SHA-256 hash for now as the hydroshare content file hash is in md5 format + # @validator('sha256') + # def validate_sha256_string_format(cls, v): + # if v: + # v = v.strip() + # if v and not re.match(r"^[a-fA-F0-9]{64}$", v): + # raise ValueError('invalid SHA-2 + + +class DataDownload(MediaObject): + type: Literal["DataDownload"] = Field( + alias="@type", # type: ignore + default="DataDownload", + description="All or part of a Dataset in downloadable form.", + ) + measurementMethod: Optional[Union[DefinedTerm, HttpUrl, str]] = Field( + title="Measurement method", + description="A subproperty of measurementTechnique that can be used for specifying specific methods, in particular via MeasurementMethodEnum.", + default=None, + ) + measurementTechnique: Optional[Union[DefinedTerm, HttpUrl, str]] = Field( + title="Measurement technique", + description="A technique, method or technology used in an Observation, StatisticalVariable or Dataset (or DataDownload, DataCatalog), corresponding to the method used for measuring the corresponding variable(s) (for datasets, described using variableMeasured; for Observation, a StatisticalVariable). Often but not necessarily each variableMeasured will have an explicit representation as (or mapping to) an property such as those defined in Schema.org, or other RDF vocabularies and 'knowledge graphs'. In that case the subproperty of variableMeasured called measuredProperty is applicable.", + default=None, + ) + + +class VideoObject(MediaObject): + type: Literal["VideoObject"] = Field( + alias="@type", # type: ignore + default="VideoObject", + description="A video file.", + ) + # there are many fields that we could implement here, but I don't think they'll be + # used. I'm adding VideoObject because it's referenced in out unit tests. Consider + # removing in the future. + + +# combine the media objects together to make referencing easier +MediaType = Union[MediaObject, DataDownload, VideoObject] + + +class Dataset(CreativeWork): + measurementMethod: Optional[ + Union[ + HttpUrl, + List[HttpUrl], + DefinedTerm, + List[DefinedTerm], + str, + List[str], + ] + ] = None + issn: Optional[Union[str, List[str]]] = None + measurementTechnique: Optional[ + Union[ + str, + List[str], + HttpUrl, + List[HttpUrl], + DefinedTerm, + List[DefinedTerm], + ] + ] = None + catalog: Optional[Union["DataCatalog", List["DataCatalog"]]] = None + variablesMeasured: Optional[ + Union[str, List[str], "PropertyValue", List["PropertyValue"]] + ] = None + variableMeasured: Optional[ + Union[ + str, + List[str], + "PropertyValue", + List["PropertyValue"], + ] + ] = None + includedDataCatalog: Optional[Union["DataCatalog", List["DataCatalog"]]] = None + includedInDataCatalog: Optional[Union["DataCatalog", List["DataCatalog"]]] = None + datasetTimeInterval: Optional[Union[datetime, List[datetime]]] = None + distribution: Optional[Union["DataDownload", List["DataDownload"]]] = None + + +class DataCatalog(CreativeWork): + """ + A collection of datasets. + """ + + measurementMethod: Optional[ + Union[ + HttpUrl, + List[HttpUrl], + DefinedTerm, + List[DefinedTerm], + str, + List[str], + ] + ] = None + dataset: Optional[Union[Dataset, List[Dataset]]] = None + measurementTechnique: Optional[ + Union[ + str, + List[str], + HttpUrl, + List[HttpUrl], + DefinedTerm, + List[DefinedTerm], + ] + ] = None diff --git a/hsclient/schema/core.py b/hsclient/schema/core.py new file mode 100644 index 0000000..d9acd7d --- /dev/null +++ b/hsclient/schema/core.py @@ -0,0 +1,222 @@ +from datetime import datetime +from typing import List, Optional, Union + + +from pydantic import ( + Field, + HttpUrl, +) + +from .base import ( + CreativeWork, + SchemaBaseModel, + Creator, + Contributor, + Organization, + Provider, + PublisherOrganization, + SubjectOf, + LanguageEnum, + InLanguageStr, + Draft, + Incomplete, + Obsolete, + Published, + Public, + Discoverable, + Grant, + TemporalCoverage, + Place, + HasPart, + IsPartOf, + Relation, + MediaType, + PropertyValue, +) + + +class CoreMetadata(SchemaBaseModel): + + ################### + # REQUIRED FIELDS # + ################### + context: HttpUrl = Field( + alias="@context", # type: ignore + default=HttpUrl( + "https://hydroshare.org/schema" + ), # TODO: This is a placeholder for now. + description="Specifies the vocabulary employed for understanding the structured data markup.", + ) + type: str = Field( + alias="@type", # type: ignore + default="CreativeWork", + description="A creative work that may include various forms of content, such as datasets," + " software source code, digital documents, etc.", + frozen=True, + json_schema_extra={"readOnly": True}, + # json_schema_extra={ + # "enum": ["Dataset", "Notebook", "Software Source Code"], + # }, + ) + additionalType: Optional[str] = Field( + title="Additional type", + description="An additional type for the resource. This can be used to further specify the type of the" + " resource (e.g., Composite Resource).", + frozen=True, + json_schema_extra={"readOnly": True}, + ) + name: str = Field( + title="Name or title", + description="A text string with a descriptive name or title for the resource.", + ) + description: Optional[str] = Field( + default=None, + title="Description or abstract", + description="A text string containing a description/abstract for the resource.", + ) + url: HttpUrl = Field( + title="URL", + description="A URL for the landing page that describes the resource and where the content " + "of the resource can be accessed. If there is no landing page," + " provide the URL of the content.", + ) + identifier: List[str] = Field( + title="Identifiers", + description="Any kind of identifier for the resource. Identifiers may be DOIs or unique strings " + "assigned by a repository. Multiple identifiers can be entered. Where identifiers can be " + "encoded as URLs, enter URLs here.", + ) + + creator: List[Union[Creator, Organization]] = Field( + description="Person or Organization that created the resource." + ) + dateCreated: datetime = Field( + title="Date created", description="The date on which the resource was created.", + frozen=True, + json_schema_extra={"readOnly": True}, + ) + keywords: List[str] = Field( + min_length=1, + description="Keywords or tags used to describe the dataset, delimited by commas.", + ) + license: Union[CreativeWork, HttpUrl] = Field( + description="A license document that applies to the resource." + ) + provider: Union[Organization, Provider] = Field( + description="The repository, service provider, organization, person, or service performer that provides" + " access to the resource." + ) + + ################### + # OPTIONAL FIELDS # + ################### + contributor: Optional[List[Union[Contributor, Organization]]] = Field( + description="Person or Organization that contributed to the resource.", + default=None + ) + publisher: Optional[PublisherOrganization] = Field( + title="Publisher", + description="Where the resource is permanently published, indicated the repository, service provider," + " or organization that published the resource - e.g., CUAHSI HydroShare." + " This may be the same as Provider.", + default=None, + ) + datePublished: Optional[datetime] = Field( + title="Date published", + description="Date of first publication for the resource.", + default=None, + frozen=True, + json_schema_extra={"readOnly": True}, + ) + subjectOf: Optional[List[SubjectOf]] = Field( + title="Subject of", + description="Link to or citation for a related resource that is about or describes this resource" + " - e.g., a journal paper that describes this resource or a related metadata document " + "describing the resource.", + default=None, + ) + version: Optional[str] = Field( + description="A text string indicating the version of the resource.", + default=None, + ) # TODO find something better than float for number + inLanguage: Optional[Union[LanguageEnum, InLanguageStr]] = Field( + title="Language", + description="The language of the content of the resource.", + default=None, + ) + creativeWorkStatus: Optional[Union[Draft, Incomplete, Obsolete, Published, Public, Discoverable]] = Field( + title="Resource status", + description="The status of this resource in terms of its stage in a lifecycle. " + "Example terms include Incomplete, Draft, Published, and Obsolete.", + default=None, + ) + dateModified: Optional[datetime] = Field( + title="Date modified", + description="The date on which the resource was most recently modified or updated.", + default=None, + frozen=True, + json_schema_extra={"readOnly": True}, + ) + funding: Optional[List[Grant]] = Field( + description="A Grant or monetary assistance that directly or indirectly provided funding or sponsorship " + "for creation of the resource.", + default=None, + ) + temporalCoverage: Optional[TemporalCoverage] = Field( + title="Temporal coverage", + description="The time period that applies to all of the content within the resource.", + default=None, + ) + spatialCoverage: Optional[Place] = Field( + description="The spatialCoverage of a CreativeWork indicates the place(s) which are the focus of the content. " + "It is a sub property of contentLocation intended primarily for more technical and " + "detailed materials. For example with a Dataset, it indicates areas that the dataset " + "describes: a dataset of New York weather would have spatialCoverage which was the " + "place: the state of New York.", + default=None, + ) + hasPart: Optional[List[HasPart]] = Field( + title="Has part", + description="Link to or citation for a related resource that is part of this resource.", + default=None, + ) + isPartOf: Optional[List[IsPartOf]] = Field( + title="Is part of", + description="Link to or citation for a related resource that this resource is a " + "part of - e.g., a related collection.", + default=None, + ) + # 'relation' is not a standard schema.org property, but we include it here to + # capture any other types of relations that don't fit into the above properties + relation: Optional[List[Relation]] = Field( + title="Relation", + description="All other types of relations", + default=None, + ) + additionalProperty: Optional[List[PropertyValue]] = Field( + title="Additional properties", + default=None, + description="Additional properties of the place.", + ) + + # using MediaType here to allow for MediaObject and its subclasses (e.g., DataDownload, VideoObject) + associatedMedia: Optional[Union[MediaType, List[MediaType]]] = Field( + title="Resource content", + description="A media object that encodes this CreativeWork. This property is a synonym for encoding.", + default=None, + frozen=True, + json_schema_extra={"readOnly": True}, + ) + citation: Optional[List[str]] = Field( + title="Citation", + description="A bibliographic citation for the resource.", + default=None, + frozen=True, + json_schema_extra={"readOnly": True}, + ) + model_config = { + "arbitrary_types_allowed": True, + "json_encoders": { + HttpUrl: str, # Convert HttpUrl to a string during serialization + }, + } diff --git a/hsclient/schema/dataset.py b/hsclient/schema/dataset.py new file mode 100644 index 0000000..b766edb --- /dev/null +++ b/hsclient/schema/dataset.py @@ -0,0 +1,155 @@ +from .core import CoreMetadata +from enum import Enum +from typing import Optional, List, Union, Literal +from pydantic import Field, HttpUrl +from datetime import datetime + +from .base import ( + Public, + Published, + Discoverable, + Private, + PropertyValue, + Organization, + DataCatalog, + Creator, + CreativeWork, + Provider, + MediaType, +) + +from .datavariable import Dimension, DataVariable + + +class AdditionalType(str, Enum): + GEOGRAPHIC_FEATURE = 'GeographicFeature' + GEOGRAPHIC_RASTER = 'GeographicRaster' + MULTIDIMENSIONAL = 'MultiDimensional' + TABULAR = 'Tabular' + +class ScientificDataset(CoreMetadata): + """ + A generic dataset extends the CoreMetadata class with a few additional fields and is designed to capture + scientific file-level metadata.. It also overrides many of the required CoreMetadata fields to make them + optional. It generally follows the design of the Schema.org Dataset class. + """ + + context: HttpUrl = Field( + alias="@context", # type: ignore + default=HttpUrl( + "https://hydroshare.org/schema" + ), # TODO: This is a placeholder for now. + description="Specifies the vocabulary employed for understanding the structured data markup.", + ) + type: Literal["ScientificDataset"] = Field( + alias="@type", # type: ignore + default="ScientificDataset", + description="A body of structured information describing some topic(s) of interest.", + frozen=True, + json_schema_extra={"readOnly": True}, + ) + + variableMeasured: List[Union[str, PropertyValue, DataVariable]] = Field( + title="Variables measured", description="Measured variables." + ) + + dimensions: List[Dimension] = Field( + title="Dimensions", + description="Dimensions defined in the multi-dimensional dataset.", + ) + + # redefine associatedMedia from "Core" as a required field + associatedMedia: Union[MediaType, List[MediaType]] = Field( + title="Resource content", + description="A media object that encodes this CreativeWork. This property is a synonym for encoding.", + default=[] + ) + + coordinates: Optional[List[DataVariable]] = Field( + default=None, + title="Coordinates", + description="Coordinate variables that provide values along a dimension", + ) + + includedInDataCatalog: Optional[DataCatalog] = Field( + default=None, + title="DataCatalog", + description="A data catalog which contains this dataset.", + ) + + additionalProperty: Optional[ + Union[str, List[str], PropertyValue, List[PropertyValue]] + ] = Field( + title="Additional properties", + default=None, + description="Additional properties of the dataset that don't fit into schema org.", + ) + sourceOrganization: Optional[Organization] = Field( + default=None, + title="Source organization", + description="The organization that provided the data for this dataset.", + ) + additionalType: Optional[AdditionalType] = Field( + default=None, + title="Additional Type", + description = "Additional descriptive types associated with the ScientificDataset. This is typically used by applications to provide specialized funcationality for categories for content.", + frozen=True, + json_schema_extra={"readOnly": True}, + ) + + # --------------------------------------------- + # make required CoreMetadata fields "Optional", + # but preserve the metadata defined in the + # parent class + # --------------------------------------------- + name: Optional[str] = Field( + default=None, + title="Name or title", + description="A text string with a descriptive name or title for the resource.", + ) + description: Optional[str] = Field( + default=None, + title="Description or abstract", + description="A text string containing a description/abstract for the resource.", + ) + url: Optional[HttpUrl] = Field( + default=None, + title="URL", + description="A URL for the landing page that describes the resource and where the content " + "of the resource can be accessed. If there is no landing page," + " provide the URL of the content.", + ) + identifier: Optional[List[str]] = Field( + default=None, + title="Identifiers", + description="Any kind of identifier for the resource. Identifiers may be DOIs or unique strings " + "assigned by a repository. Multiple identifiers can be entered. Where identifiers can be " + "encoded as URLs, enter URLs here.", + ) + + creator: Optional[List[Union[Creator, Organization]]] = Field( + default=None, description="Person or Organization that created the resource." + ) + dateCreated: Optional[datetime] = Field( + default=None, + title="Date created", + description="The date on which the resource was created.", + frozen=True, + json_schema_extra={"readOnly": True}, + ) + keywords: Optional[List[str]] = Field( + default=None, + description="Keywords or tags used to describe the dataset, delimited by commas.", + ) + license: Optional[Union[CreativeWork, HttpUrl]] = Field( + default=None, description="A license document that applies to the resource." + ) + provider: Optional[Union[Organization, Provider]] = Field( + default=None, + description="The repository, service provider, organization, person, or service performer that provides" + " access to the resource.", + ) + sharing_status: Optional[Union[Public, Published, Private, Discoverable]] = Field( + default=None, + description="The Sharing status of the resource. This is a controlled vocabulary term from CUAHSI HydroShare", + ) diff --git a/hsclient/schema/datavariable.py b/hsclient/schema/datavariable.py new file mode 100644 index 0000000..80b9871 --- /dev/null +++ b/hsclient/schema/datavariable.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +""" +CUAHSI's extension to the SchemaOrg vocabulary to better encapsulate +scientific data variable metadata. +""" + +from typing import Optional, Literal, Union +from pydantic import Field, BaseModel, HttpUrl + + +class Dimension(BaseModel): + """ + A variable dimension defines an axes of a variable; it provides the shape of the variable. + + """ + + context: HttpUrl = Field( + alias="@context", # type: ignore + default=HttpUrl( + "https://hydroshare.org/schema" + ), # TODO: This is a placeholder for now. + description="Specifies the vocabulary employed for understanding the structured data markup.", + ) + type: Literal["Dimension"] = Field( + alias="@type", # type: ignore + default="Dimension", + description="A body of structured information describing variable dimensions.", + ) + name: str = Field( + title="Dimension Name", + description="The name of the dimension", + ) + shape: int = Field( + title="Variable Shape", + description="The shape of the variable", + ) + description: Optional[str] = Field( + default=None, + title="Variable Description", + description="The description of the variable measured", + ) + +class DataVariable(BaseModel): + + context: HttpUrl = Field( + alias="@context", # type: ignore + default=HttpUrl( + "https://hydroshare.org/schema" + ), # TODO: This is a placeholder for now. + description="Specifies the vocabulary employed for understanding the structured data markup.", + ) + type: Literal["DataVariable"] = Field( + alias="@type", # type: ignore + default="DataVariable", + description="A body of structured information describing core metadata shared by all data variables.", + ) + + name: str = Field( + title="Variable Name", + description="The name of the variable measured", + ) + dimensions: Union[str, list[str]] = Field( + title="Variable Dimensions", + description="The dimension names corresponding to the variable being measured", + ) + description: Optional[str] = Field( + default=None, + title="Variable Description", + description="The description of the variable measured", + ) + dataType: Optional[str] = Field( + default=None, + title="The data type of the variable", + description="The data type of the variable measured", + ) + unit: Optional[str] = Field( + default=None, + title="Variable Unit", + description="The unit of the variable measured", + ) + minValue: Optional[Union[float,str]] = Field( + title="Minimum Value", + description="The minimum value in the raster grid", + default=None, + ) + maxValue: Optional[Union[float,str]] = Field( + title="Maximum Value", + description="The maximum value in the raster grid", + default=None, + ) + noDataValue: Optional[Union[float,str]] = Field( + title="No Data Value", + description="The numerical value used to represent null data in the raster grid", + default=None, + ) \ No newline at end of file diff --git a/hsclient/schema/utils.py b/hsclient/schema/utils.py new file mode 100644 index 0000000..aac9f60 --- /dev/null +++ b/hsclient/schema/utils.py @@ -0,0 +1,35 @@ +from typing import Any, Dict, Union + +from ..metadata_adapter.legacy_resource_models import LegacyResourceMetadata +from ..metadata_adapter.adapter import MetadataAdapter +from .base import Place, GeoShape +from .dataset import ScientificDataset +from pydantic import ValidationError + +def load_json(json_data: Dict[str, Any], data_path: str) -> Union[LegacyResourceMetadata, ScientificDataset]: + """Loads JSON metadata into the appropriate schema model based on file path.""" + # TODO: PK: hydroshare needs to be fixed so that relation has the valid data. + # if 'relation' in json_data: + # json_data['relation'] = [] + + if data_path.endswith("dataset_metadata.json"): + return MetadataAdapter.to_legacy_resource_metadata(json_data) + + try: + # Content type/aggregation metadata + return ScientificDataset.model_validate(json_data) + except ValidationError: + spatial_coverage = json_data.get('spatialCoverage') + spatial_coverage_model = None + if spatial_coverage and spatial_coverage['type'] == 'Place': + geo = spatial_coverage['geo'] + if geo and geo['type'] == 'GeoShape': + geo_model = GeoShape(box=geo['box'], validate_bbox=False) + place = Place(name=spatial_coverage.get('name'), srs=spatial_coverage.get('srs'), geo=geo_model) + spatial_coverage_model = place + json_data['spatialCoverage'] = None + content_type_metadata= ScientificDataset.model_validate(json_data) + content_type_metadata.spatialCoverage = spatial_coverage_model + # TODO: PK: Need to convert an instance of ScientificDataset to corresponding + # content type legacy metadata model + return content_type_metadata diff --git a/requirements.txt b/requirements.txt index 3f0cf55..ad6c6d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ hsmodels>=1.0.4 pytest requests==2.24.0 email-validator +s3fs pandas netCDF4 xarray diff --git a/tests/conftest.py b/tests/conftest.py index a9faa84..3b105d5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,11 @@ def change_test_dir(request): @pytest.fixture() def hydroshare(change_test_dir): - hs = HydroShare(os.getenv("HYDRO_USERNAME"), os.getenv("HYDRO_PASSWORD"), os.getenv("HYDRO_HOST", "beta.hydroshare.org")) + hs = HydroShare( + os.getenv("HYDRO_USERNAME"), + os.getenv("HYDRO_PASSWORD"), + os.getenv("HYDRO_HOST", "beta.hydroshare.org"), + s3_endpoint_url=os.getenv("HYDRO_S3_ENDPOINT_URL", "https://s3.beta.hydroshare.org")) return hs diff --git a/tests/test_resource_metadata_adapter.py b/tests/test_resource_metadata_adapter.py new file mode 100644 index 0000000..5c664b9 --- /dev/null +++ b/tests/test_resource_metadata_adapter.py @@ -0,0 +1,271 @@ +from hsclient.metadata_adapter.resource_models import SchemaOrgResourceMetadata +from hsclient.metadata_adapter.legacy_resource_models import LegacyResourceMetadata +from hsclient.metadata_adapter.adapter import MetadataAdapter +from hsclient.schema.utils import load_json + + +def test_adapter_to_legacy_resource_metadata() -> None: + metadata = { + "@context": "https://hydroshare.org/schema", + "@type": "CreativeWork", + "additionalType": "CompositeResource", + "name": "Schema Title", + "description": "Schema Abstract", + "url": "https://example.com/resource", + "identifier": ["https://example.com/id", "doi:10.1/example"], + "creator": [ + { + "@type": "Person", + "name": "Doe, Jane", + "email": "jane@example.com", + "identifier": "https://orcid.org/0000-0001-2345-6789", + "affiliation": {"@type": "Organization", "name": "CUAHSI"}, + } + ], + "contributor": [{"@type": "Organization", "name": "Utah State University", "url": "https://usu.edu"}], + "dateCreated": "2024-01-01T00:00:00", + "dateModified": "2024-01-02T00:00:00", + "datePublished": "2024-01-03T00:00:00", + "keywords": ["hydrology", "water"], + "inLanguage": "eng", + "license": {"@type": "CreativeWork", "name": "CC-BY-4.0", "url": "https://example.com/license"}, + "provider": {"@type": "Organization", "name": "HydroShare", "url": "https://www.hydroshare.org/"}, + "publisher": {"@type": "Organization", "name": "CUAHSI HydroShare", "url": "https://www.hydroshare.org/"}, + "funding": [ + { + "@type": "Grant", + "name": "Grant Title", + "identifier": "NSF-123", + "funder": {"@type": "Organization", "name": "NSF", "url": "https://nsf.gov"}, + } + ], + "spatialCoverage": { + "@type": "Place", + "name": "Logan", + "geo": {"@type": "GeoShape", "box": "42.0 -111.0 41.5 -111.5"}, + }, + "temporalCoverage": {"startDate": "2024-01-01T00:00:00", "endDate": "2024-01-31T00:00:00"}, + "hasPart": [{"@type": "CreativeWork", "name": "Child resource", "url": "https://example.com/child"}], + "relation": [{"name": "References", "description": "Journal article", "url": "https://example.com/paper"}], + # "relations": [{"type": "The content of this resource references", "value": "Journal article https://example.com/paper"}], + "citation": ["Citation text"], + "creativeWorkStatus": {"name": "Public"}, + "additionalProperty": [{"name": "custom-key", "value": "custom-value"}], + } + + result = MetadataAdapter.to_legacy_resource_metadata(metadata) + + assert isinstance(result, LegacyResourceMetadata) + assert result.type == "CompositeResource" + assert result.title == "Schema Title" + assert result.abstract == "Schema Abstract" + assert str(result.identifier) == "https://example.com/id" + assert result.additional_metadata == {"custom-key": "custom-value"} + assert result.publisher.name == "CUAHSI HydroShare" + assert str(result.publisher.url) == "https://www.hydroshare.org/" + assert result.period_coverage.start.isoformat() == "2024-01-01T00:00:00" + assert result.period_coverage.end.isoformat() == "2024-01-31T00:00:00" + assert result.spatial_coverage.name == "Logan" + assert result.spatial_coverage.northlimit == 42.0 + assert result.spatial_coverage.eastlimit == -111.0 + assert result.spatial_coverage.southlimit == 41.5 + assert result.spatial_coverage.westlimit == -111.5 + assert result.citation == "Citation text" + assert len(result.relations) == 1 + assert result.relations[0].type == "References" + assert result.relations[0].value == "Journal article, https://example.com/paper" + assert result.contributors[0].organization == "Utah State University" + assert str(result.contributors[0].homepage) == "https://usu.edu/" + assert result.created.isoformat() == "2024-01-01T00:00:00" + assert result.modified.isoformat() == "2024-01-02T00:00:00" + assert result.published.isoformat() == "2024-01-03T00:00:00" + assert result.subjects == ["hydrology", "water"] + assert result.sharing_status == "public" + assert result.isPartOf == [] + assert len(result.hasPart) == 1 + assert str(result.hasPart[0].url) == "https://example.com/child" + assert result.hasPart[0].name == "Child resource" + assert result.rights.statement == "CC-BY-4.0" + assert str(result.rights.url) == "https://example.com/license" + assert result.awards[0].title == "Grant Title" + assert result.awards[0].number == "NSF-123" + assert result.awards[0].funding_agency_name == "NSF" + assert str(result.awards[0].funding_agency_url) == "https://nsf.gov/" + assert result.creators[0].name == "Doe, Jane" + assert result.creators[0].email == "jane@example.com" + assert result.creators[0].organization == "CUAHSI" + assert result.creators[0].identifiers == {"ORCID": "https://orcid.org/0000-0001-2345-6789"} + assert result.provider.name == "HydroShare" + assert str(result.provider.url) == "https://www.hydroshare.org/" + + +def test_adapter_to_schema_org_metadata() -> None: + metadata = { + "type": "CompositeResource", + "title": "Legacy Title", + "abstract": "Legacy Abstract", + "url": "https://example.com/resource", + "identifier": "https://example.com/id", + "creators": [ + { + "name": "Doe, Jane", + "email": "jane@example.com", + "organization": "CUAHSI", + "identifiers": {"ORCID": "https://orcid.org/0000-0001-2345-6789"}, + } + ], + "contributors": [{"organization": "Utah State University", "homepage": "https://usu.edu"}], + "created": "2024-01-01T00:00:00", + "modified": "2024-01-02T00:00:00", + "published": "2024-01-03T00:00:00", + "subjects": ["hydrology", "water"], + "language": "eng", + "rights": {"statement": "CC-BY-4.0", "url": "https://example.com/license"}, + "awards": [ + { + "funding_agency_name": "NSF", + "title": "Grant Title", + "number": "NSF-123", + "funding_agency_url": "https://nsf.gov", + } + ], + "spatial_coverage": { + "type": "box", + "name": "Logan", + "northlimit": 42.0, + "eastlimit": -111.0, + "southlimit": 41.5, + "westlimit": -111.5, + "units": "Decimal degrees", + "projection": "WGS 84 EPSG:4326", + }, + "period_coverage": {"start": "2024-01-01T00:00:00", "end": "2024-01-31T00:00:00"}, + "relations": [ + { + "type": "The content of this resource references", + "value": "Journal article, https://example.com/paper", + }, + ], + "citation": "Citation text", + "additional_metadata": {"custom-key": "custom-value"}, + "publisher": {"name": "CUAHSI HydroShare", "url": "https://www.hydroshare.org/"}, + } + + result = MetadataAdapter.to_resource_metadata(metadata) + + assert isinstance(result, SchemaOrgResourceMetadata) + assert result.type == "CreativeWork" + assert result.additionalType == "CompositeResource" + assert result.name == "Legacy Title" + assert result.description == "Legacy Abstract" + assert str(result.url) == "https://example.com/resource" + assert result.identifier == ["https://example.com/id"] + assert result.citation == ["Citation text"] + assert result.creator[0].name == "Doe, Jane" + assert result.contributor[0].name == "Utah State University" + assert result.dateCreated.isoformat() == "2024-01-01T00:00:00" + assert result.dateModified.isoformat() == "2024-01-02T00:00:00" + assert result.datePublished.isoformat() == "2024-01-03T00:00:00" + assert result.keywords == ["hydrology", "water"] + assert result.additionalProperty[0].name == "custom-key" + assert result.additionalProperty[0].value == "custom-value" + assert result.publisher.name == "CUAHSI HydroShare" + assert str(result.publisher.url) == "https://www.hydroshare.org/" + assert len(result.relation) == 1 + assert result.relation[0].name == "The content of this resource references" + assert result.inLanguage == "eng" + assert result.license.name == "CC-BY-4.0" + assert str(result.license.url) == "https://example.com/license" + assert result.funding[0].name == "Grant Title" + assert result.funding[0].identifier == "NSF-123" + assert result.funding[0].funder.name == "NSF" + assert str(result.funding[0].funder.url) == "https://nsf.gov/" + assert result.spatialCoverage.geo.box == "42.0 -111.0 41.5 -111.5" + assert result.temporalCoverage.startDate.isoformat() == "2024-01-01T00:00:00" + assert result.temporalCoverage.endDate.isoformat() == "2024-01-31T00:00:00" + assert result.provider.name == "HydroShare" + assert str(result.provider.url) == "https://www.hydroshare.org/" + + +def test_load_json_returns_legacy_resource_metadata_for_resource_metadata_json_file() -> None: + metadata = { + "@context": "https://hydroshare.org/schema", + "@type": "CreativeWork", + "additionalType": "CompositeResource", + "name": "Schema Title", + "description": "Schema Abstract", + "url": "https://example.com/resource", + "identifier": ["https://example.com/id"], + "creator": [{"@type": "Person", "name": "Doe, Jane"}], + "contributor": [{"@type": "Organization", "name": "Utah State University", "url": "https://usu.edu"}], + "dateCreated": "2024-01-01T00:00:00", + "dateModified": "2024-01-02T00:00:00", + "datePublished": "2024-01-03T00:00:00", + "publisher": {"@type": "Organization", "name": "HydroShare", "url": "https://www.hydroshare.org/"}, + "keywords": ["hydrology"], + "inLanguage": "eng", + "license": {"@type": "CreativeWork", "name": "CC-BY-4.0", "url": "https://example.com/license"}, + "funding": [ + { + "@type": "Grant", + "name": "Grant Title", + "identifier": "NSF-123", + "funder": {"@type": "Organization", "name": "NSF", "url": "https://nsf.gov"}, + } + ], + "spatialCoverage": { + "@type": "Place", + "name": "Logan", + "geo": {"@type": "GeoShape", "box": "42.0 -111.0 41.5 -111.5"}, + }, + "temporalCoverage": {"startDate": "2024-01-01T00:00:00", "endDate": "2024-01-31T00:00:00"}, + "relation": [{"name": "References", "description": "Journal article", "url": "https://example.com/paper"}], + "hasPart": [{"@type": "CreativeWork", "name": "Child resource", "url": "https://example.com/child"}], + "creativeWorkStatus": {"name": "Public"}, + "provider": {"@type": "Organization", "name": "HydroShare", "url": "https://www.hydroshare.org/"}, + "citation": ["Citation text"], + "additionalProperty": [{"name": "custom-key", "value": "custom-value"}], + } + + result = load_json(metadata, "123/.hsjsonld/dataset_metadata.json") + + assert isinstance(result, LegacyResourceMetadata) + assert result.title == "Schema Title" + assert result.abstract == "Schema Abstract" + assert str(result.url) == "https://example.com/resource" + assert str(result.identifier) == "https://example.com/id" + assert result.creators[0].name == "Doe, Jane" + assert result.contributors[0].organization == "Utah State University" + assert str(result.contributors[0].homepage) == "https://usu.edu/" + assert result.created.isoformat() == "2024-01-01T00:00:00" + assert result.modified.isoformat() == "2024-01-02T00:00:00" + assert result.published.isoformat() == "2024-01-03T00:00:00" + assert result.subjects == ["hydrology"] + assert result.language == "eng" + assert result.rights.statement == "CC-BY-4.0" + assert str(result.rights.url) == "https://example.com/license" + assert result.awards[0].funding_agency_name == "NSF" + assert result.awards[0].title == "Grant Title" + assert result.awards[0].number == "NSF-123" + assert str(result.awards[0].funding_agency_url) == "https://nsf.gov/" + assert result.spatial_coverage.name == "Logan" + assert result.spatial_coverage.northlimit == 42.0 + assert result.spatial_coverage.eastlimit == -111.0 + assert result.spatial_coverage.southlimit == 41.5 + assert result.spatial_coverage.westlimit == -111.5 + assert result.period_coverage.start.isoformat() == "2024-01-01T00:00:00" + assert result.period_coverage.end.isoformat() == "2024-01-31T00:00:00" + assert result.sharing_status == "public" + assert result.additional_metadata == {"custom-key": "custom-value"} + assert result.publisher.name == "HydroShare" + assert str(result.publisher.url) == "https://www.hydroshare.org/" + assert len(result.relations) == 1 + assert result.relations[0].type == "References" + assert result.relations[0].value == "Journal article, https://example.com/paper" + assert result.isPartOf == [] + assert len(result.hasPart) == 1 + assert str(result.hasPart[0].url) == "https://example.com/child" + assert result.hasPart[0].name == "Child resource" + assert result.provider.name == "HydroShare" + assert str(result.provider.url) == "https://www.hydroshare.org/" + assert result.citation == "Citation text" diff --git a/tests/test_resource_metadata_editing.py b/tests/test_resource_metadata_editing.py new file mode 100644 index 0000000..c6b739e --- /dev/null +++ b/tests/test_resource_metadata_editing.py @@ -0,0 +1,182 @@ +from datetime import datetime + +from hsmodels.schemas.fields import ( + AwardInfo, + BoxCoverage, + PointCoverage, + Contributor, + Creator, + PeriodCoverage, + Relation, + Rights, +) +from hsmodels.schemas.enums import RelationType + +def test_update_resource_metadata(hydroshare) -> None: + """ + Test updating resource metadata for a HydroShare resource using hsclient. + This is to ensure that the resource metadata editing API has not changed in hsclient due to source metadata + being in schema.org format. + """ + hs = hydroshare + new_res = None + try: + new_res = hs.create() + assert new_res.metadata.title == "Untitled resource" + assert len(new_res.resource_id) == 32 + assert len(new_res.metadata.creators) == 1 + assert len(new_res.metadata.contributors) == 0 + assert len(new_res.metadata.awards) == 0 + assert len(new_res.metadata.relations) == 0 + assert new_res.metadata.spatial_coverage is None + assert new_res.metadata.period_coverage is None + assert new_res.metadata.rights is not None + assert new_res.metadata.relations == [] + assert new_res.metadata.citation is not None + assert new_res.metadata.additional_metadata == {} + assert len(new_res.metadata.associatedMedia) == 1 + assert new_res.metadata.associatedMedia[0].contentUrl.endswith( + f"/{new_res.resource_id}/.hsjsonld/file_manifest.json" + ) + assert new_res.metadata.publisher is None + assert new_res.metadata.sharing_status == "private" + assert new_res.metadata.created is not None + assert new_res.metadata.modified is not None + assert new_res.metadata.published is None + # update the resource metadata + new_res.metadata.title = "Resource Metadata Editing Example" + new_res.metadata.abstract = ( + "This resource demonstrates how to update metadata for a HydroShare resource using hsclient." + ) + new_res.metadata.subjects = ["hsclient", "HydroShare", "metadata"] + + new_res.metadata.spatial_coverage = BoxCoverage( + name="Logan, Utah", + northlimit=41.7910, + eastlimit=-111.7664, + southlimit=41.6732, + westlimit=-111.9079, + projection="WGS 84 EPSG:4326", + type="box", + units="Decimal degrees", + ) + new_res.metadata.period_coverage = PeriodCoverage( + start=datetime.strptime("2024-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S"), + end=datetime.strptime("2024-01-31T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ) + + new_res.metadata.additional_metadata = { + "Observed Variable": "Temperature", + "Site Location": "Logan, Utah", + } + + new_res.metadata.relations.append( + Relation( + type=RelationType.isReferencedBy, + value="Example article, https://example.com/article", + ) + ) + + new_res.metadata.awards.append( + AwardInfo( + funding_agency_name="National Science Foundation", + title="Example Funding Award", + number="NSF-123456", + funding_agency_url="https://www.nsf.gov", + ) + ) + + new_res.metadata.creators.append( + Creator( + name="Doe, Jane", + organization="Example University", + email="jane.doe@example.com", + ) + ) + + new_res.metadata.contributors.append( + Contributor( + name="Smith, Alex", + organization="Example Lab", + email="alex.smith@example.com", + ) + ) + + new_res.metadata.rights = Rights( + statement="Creative Commons Attribution 4.0 International", + url="https://creativecommons.org/licenses/by/4.0/", + ) + + new_res.save() + # sleep(1) # wait for the resource to be saved and the metadata to be updated in HydroShare + + assert new_res.metadata.title == "Resource Metadata Editing Example" + assert ( + new_res.metadata.abstract + == "This resource demonstrates how to update metadata for a HydroShare resource using hsclient." + ) + assert new_res.metadata.subjects == ["hsclient", "HydroShare", "metadata"] + assert new_res.metadata.additional_metadata == { + "Observed Variable": "Temperature", + "Site Location": "Logan, Utah", + } + assert len(new_res.metadata.relations) == 1 + assert new_res.metadata.relations[0].type == RelationType.isReferencedBy + assert new_res.metadata.relations[0].value == "Example article, https://example.com/article" + assert len(new_res.metadata.awards) == 1 + assert new_res.metadata.awards[0].funding_agency_name == "National Science Foundation" + assert new_res.metadata.awards[0].title == "Example Funding Award" + assert new_res.metadata.awards[0].number == "NSF-123456" + assert str(new_res.metadata.awards[0].funding_agency_url) == "https://www.nsf.gov/" + assert len(new_res.metadata.creators) == 2 + assert new_res.metadata.creators[-1].name == "Doe, Jane" + assert new_res.metadata.creators[-1].organization == "Example University" + assert new_res.metadata.creators[-1].email == "jane.doe@example.com" + assert len(new_res.metadata.contributors) == 1 + assert new_res.metadata.contributors[0].name == "Smith, Alex" + assert new_res.metadata.contributors[0].organization == "Example Lab" + assert new_res.metadata.contributors[0].email == "alex.smith@example.com" + assert new_res.metadata.rights.statement == "Creative Commons Attribution 4.0 International" + assert str(new_res.metadata.rights.url) == "https://creativecommons.org/licenses/by/4.0/" + assert new_res.metadata.spatial_coverage.name == "Logan, Utah" + assert new_res.metadata.spatial_coverage.northlimit == 41.7910 + assert new_res.metadata.spatial_coverage.eastlimit == -111.7664 + assert new_res.metadata.spatial_coverage.southlimit == 41.6732 + assert new_res.metadata.spatial_coverage.westlimit == -111.9079 + # check projection attribute does exist as the schema.org spatial coverage model has no matching fields + # if we add the 'projection' field to the schema.org spatial coverage model, + # we can update this assertion to check the value of the projection field. + assert not hasattr(new_res.metadata.spatial_coverage, "projection") + # assert new_res.metadata.spatial_coverage.projection == "WGS 84 EPSG:4326" + + assert hasattr(new_res.metadata.spatial_coverage, "type") + assert new_res.metadata.spatial_coverage.type == "box" + + # check units attribute does exist as the schema.org spatial coverage model has no matching fields + # if we add the 'units' field to the schema.org spatial coverage model, + # we can update this assertion to check the value of the units field. + assert not hasattr(new_res.metadata.spatial_coverage, "units") + # assert new_res.metadata.spatial_coverage.units == "Decimal degrees" + assert new_res.metadata.period_coverage.start.isoformat() == "2024-01-01T00:00:00" + assert new_res.metadata.period_coverage.end.isoformat() == "2024-01-31T00:00:00" + assert new_res.metadata.publisher is None + + # update using point spatial coverage + new_res.metadata.spatial_coverage = PointCoverage( + name="Logan, Utah", + north=41.7910, + east=-111.7664, + projection="WGS 84 EPSG:4326", + units="Decimal degrees", + ) + new_res.save() + assert new_res.metadata.spatial_coverage.name == "Logan, Utah" + assert new_res.metadata.spatial_coverage.north == 41.7910 + assert new_res.metadata.spatial_coverage.east == -111.7664 + assert new_res.metadata.spatial_coverage.type == "point" + finally: + if new_res is not None: + try: + new_res.delete() + except Exception: + pass