Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 0 additions & 78 deletions data_collections_api/base_schema.py

This file was deleted.

10 changes: 9 additions & 1 deletion data_collections_api/cli/data_collections_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from data_collections_api.cli.record_upload import get_arg_parser as get_upload_parser
from data_collections_api.cli.record_upload import main as upload_main
from data_collections_api.metadata import dump_example, validate_cli
from data_collections_api.schemas import SCHEMAS


def get_arg_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -47,6 +48,13 @@ def get_arg_parser() -> argparse.ArgumentParser:
help="Parse FILE as this type (default: determine from suffix).",
default=None,
)
sp.add_argument(
"-S",
"--schema",
choices=SCHEMAS.keys(),
help="Validate against given schema (default: default).",
default="default",
)
sp.set_defaults(func=validate_cli)

# Dump
Expand All @@ -61,7 +69,7 @@ def get_arg_parser() -> argparse.ArgumentParser:
"-f",
"--format",
choices=("json", "yaml"),
help="Parse FILE as this type (default: determine from suffix).",
help="Dump FILE as this type (default: determine from suffix).",
default=None,
)
sp.set_defaults(func=dump_example)
Expand Down
18 changes: 17 additions & 1 deletion data_collections_api/dumpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from contextlib import suppress
import json
from pathlib import Path
from typing import Any, Literal, NamedTuple, TextIO
from typing import Any, Literal, NamedTuple, TextIO, overload

_YAML_TYPE = None

Expand Down Expand Up @@ -313,6 +313,22 @@ def get_str_loader(fmt: Formats):
return get_load_dump(fmt, loader=True, string=True)


@overload
def guess_format(path: Path) -> Formats: ... # numpydoc ignore=GL08


@overload
def guess_format(
path: Path, *, raise_on_invalid: Literal[True]
) -> Formats: ... # numpydoc ignore=GL08


@overload
def guess_format(
path: Path, *, raise_on_invalid: Literal[False]
) -> Formats | None: ... # numpydoc ignore=GL08


def guess_format(path: Path, *, raise_on_invalid: bool = True) -> Formats | None:
"""
Guess format from path suffix.
Expand Down
18 changes: 9 additions & 9 deletions data_collections_api/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
from functools import singledispatch
from pathlib import Path

from data_collections_api.base_schema import base_schema as schema
from data_collections_api.dumpers import (
Formats,
get_dumper,
get_loader,
get_str_loader,
guess_format,
)
from data_collections_api.schemas import Schema, get_schema

EXAMPLES_FOLDER = Path(__file__).parent / "examples"

Expand Down Expand Up @@ -64,31 +64,31 @@ def validate_metadata(_val, fmt: Formats | None = None):


@validate_metadata.register(dict)
def _(data: dict, schema: Schema | str) -> dict:
    """Validate an already-loaded metadata mapping against ``schema``."""
    # ``schema`` may be a Schema object or a registered name; get_schema
    # resolves either form before validation.
    return get_schema(schema).validate(data)


@validate_metadata.register(str)
def _(data: Path | str, schema: Schema | str, fmt: Formats) -> dict:
    """
    Validate metadata supplied as a string.

    The string is first parsed as ``fmt``-formatted text. If parsing fails for
    any reason, the string is treated as a file path and validation is
    re-dispatched on the resulting ``Path``.
    """
    try:
        data = get_str_loader(fmt)(data)
    except Exception:
        # Deliberate best-effort fallback: anything the string loader cannot
        # parse is assumed to be a path. ``schema`` must be forwarded because
        # the Path handler takes it as a required positional argument; ``fmt``
        # is not forwarded, so the format is still determined from the suffix.
        return validate_metadata(Path(data), schema)

    return get_schema(schema).validate(data)


@validate_metadata.register(Path)
def _(path: Path, schema: Schema | str, fmt: Formats | None = None) -> dict:
    """Load the metadata file at ``path`` and validate it against ``schema``."""
    # Without an explicit format, fall back to guessing from the path.
    fmt = fmt or guess_format(path)
    data = get_loader(fmt)(path)
    return get_schema(schema).validate(data)


@validate_metadata.register(argparse.Namespace)
def _(inp: argparse.Namespace) -> dict:
    """Validate using the ``file``, ``schema`` and ``format`` parsed arguments."""
    # Re-dispatches on the type of ``inp.file``.
    return validate_metadata(inp.file, inp.schema, inp.format)


def validate_cli(inp: argparse.Namespace) -> dict:
Expand Down
52 changes: 52 additions & 0 deletions data_collections_api/schemas/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Module defining different schemas available for use."""

from __future__ import annotations

from functools import singledispatch

from schema import Schema as Schema

from .base import base_schema

# Schemas available by name; "default" is an alias for the base schema.
SCHEMAS = {
    "base": base_schema,
    "default": base_schema,
}


@singledispatch
def get_schema(schema) -> Schema:
    """
    Get schema.

    Dispatches on the type of `schema`. This fallback implementation is
    reached only for types with no registered handler and always raises.

    Parameters
    ----------
    schema : Schema | str
        Schema to get.

    Returns
    -------
    Schema
        Desired schema.

    Raises
    ------
    NotImplementedError
        Passed an invalid type.

    Examples
    --------
    >>> get_schema(base_schema) is base_schema
    True
    >>> get_schema("default") is base_schema
    True
    """
    raise NotImplementedError(f"Cannot find schema with {type(schema).__name__}")


@get_schema.register
def _(schema: Schema) -> Schema:
    """Return `schema` unchanged: it is already a ``Schema`` instance."""
    return schema


@get_schema.register
def _(schema: str) -> Schema:
    """
    Look up a registered schema by name.

    Raises
    ------
    KeyError
        `schema` is not a key of ``SCHEMAS``.
    """
    try:
        return SCHEMAS[schema]
    except KeyError:
        # Same exception type as the plain lookup, but with a message that
        # tells the caller which names are actually registered.
        available = ", ".join(sorted(SCHEMAS))
        raise KeyError(f"Unknown schema {schema!r} (available: {available})") from None
108 changes: 108 additions & 0 deletions data_collections_api/schemas/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Parsing schema for metadata."""

from __future__ import annotations

from datetime import date
from urllib.parse import urlparse, urlunparse

from schema import And, Literal, Optional, Or, Regex, Schema, Use

# ORCID iD: four hyphen-separated groups of four characters. The final
# character is a checksum and may be the letter "X" instead of a digit.
ORCID_ID_RE = r"(\d{4}-){3}\d{3}[\dX]"
# UUID in 8-4-4-4-12 textual form. The groups are hexadecimal, so a
# digits-only pattern rejects most real UUIDs.
# NOTE(review): both patterns are unanchored -- confirm whether the consuming
# validator searches or full-matches before relying on them to reject
# surrounding junk.
UUID_RE = r"[0-9a-fA-F]{8}-([0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}"

# Identifier tagged explicitly with the "orcid" scheme.
_orcid_identifier = {
    Literal("scheme", description="ID scheme."): "orcid",
    Literal("identifier", description="An [ORCID](https://orcid.org)."): Regex(ORCID_ID_RE),
}

# DOI identifier: "scheme" may be omitted (defaults to "doi"). The identifier
# is parsed as a URL, must have both a scheme and a network location, and is
# then reassembled with urlunparse.
_doi_identifier = {
    Optional(Literal("scheme", description="ID scheme."), default="doi"): "doi",
    Literal("identifier", description="A [DOI](https://www.doi.org)"): And(
        Use(urlparse),
        lambda x: x.scheme and x.netloc,
        Use(urlunparse),
    ),
}

# An identifier entry must satisfy one of the two shapes above.
id_schema = Or(_orcid_identifier, _doi_identifier)

# One affiliation of a creator.
_affiliation_entry = {
    Literal("name", description="Name of institution."): str,
}

# Description of the creator. The name-like fields must be non-empty strings
# (``len`` rejects ""), and "personal" is the only accepted ``type``.
_person_or_org_entry = {
    Or(
        Literal("name", description="Full set of given names."),
        Literal("family_name", description="Family name(s)."),
    ): And(str, len),
    Optional(Literal("given_name", description="Given name(s).")): And(str, len),
    Optional(Literal("identifiers", description="ORCIDs or other IDs")): [id_schema],
    Literal("type", description="Personal or organisation."): Or("personal"),
}

# Schema for a single creator; keys not listed here are ignored.
creator_schema = Schema(
    {
        Optional(Literal("affiliations", description="Member affiliations.")): [
            _affiliation_entry,
        ],
        Literal("person_or_org", description="Person or organisation."): _person_or_org_entry,
    },
    ignore_extra_keys=True,
)

# Only the "cc-by-4.0" rights ID is accepted.
_rights_entry = {
    Literal("id", description="ID of rights or license."): Or("cc-by-4.0"),
}

# Only the "model" resource class is accepted.
_resource_type_entry = {
    Literal("id", description="Resource class."): Or("model"),
}

_subject_entry = {Literal("subject", description="Subject keyword."): str}

# Schema for the "metadata" block of a record.
metadata_schema = Schema(
    {
        Literal("title", description="Title of resource."): And(str, len),
        Literal("description", description="Summary of resource."): And(str, len),
        Literal("creators", description="List of creators."): [creator_schema],
        Literal("rights", description="Rights or license."): [_rights_entry],
        Literal("resource_type", description="Type of resource."): _resource_type_entry,
        Optional(
            Literal("subjects", description="List of keywords defining subjects resource covers."),
            default=[],
        ): [_subject_entry],
        # The pattern is anchored at the start only.
        Literal("version", description="Current version of resource."): Regex(r"^v\d+(\.\d+)*"),
        Optional(Literal("publisher", description="Publisher of resource.")): str,
        # NOTE(review): presumably the value passes when either constructor
        # accepts it -- confirm against the validation library's callable rules.
        Optional(Literal("publication_date", description="Date of publication of resource.")): Or(
            date.fromisoformat,
            date.fromtimestamp,
        ),
        Optional(
            Literal("identifiers", description="Resource identifiers such as ORCID or DOI.")
        ): [id_schema],
    },
)

# Top-level record schema. "custom_fields" and "metadata" are required; the
# remaining top-level keys are optional.
base_schema = Schema(
    {
        # When "access" is omitted entirely, files and record default to public.
        Optional(
            Literal("access", description="Accessibility of data outside of owners."),
            default={"files": "public", "record": "public"},
        ): {
            Optional(Literal("embargo", description="Details of resource embargo.")): {
                Literal("active", description="Whether resource is under embargo."): bool,
                Literal("reason", description="Cause for embargo."): Or(str, None),
            },
            Optional(
                Literal("files", description="Accessibility to individual files."), default="public"
            ): Or("public", "private"),
            Optional(
                Literal("record", description="Accessibility to record as a whole."),
                default="public",
            ): Or("public", "private"),
            # Description previously read "status or resource" (typo).
            Optional(Literal("status", description="Current status of resource.")): Or(
                "open", "closed"
            ),
        },
        Optional(Literal("files", description="Details of files.")): {
            Literal("enabled", description="Whether file is enabled."): bool
        },
        Literal("custom_fields", description="Block for custom data."): {
            Literal("dsmd", description="Domain specific metadata (dsmd)."): [dict]
        },
        Literal("metadata", description="Resource metadata."): metadata_schema,
        Optional(
            Literal("community", description="UUID of community associated with resource.")
        ): Regex(UUID_RE),
    },
    description="Base schema from which community specific schemas are built.",
    name="base",
)
29 changes: 29 additions & 0 deletions docs/source/api/data_collections_api.cli.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
data\_collections\_api.cli package
==================================

Submodules
----------

data\_collections\_api.cli.data\_collections\_main module
---------------------------------------------------------

.. automodule:: data_collections_api.cli.data_collections_main
:members:
:show-inheritance:
:undoc-members:

data\_collections\_api.cli.record\_upload module
------------------------------------------------

.. automodule:: data_collections_api.cli.record_upload
:members:
:show-inheritance:
:undoc-members:

Module contents
---------------

.. automodule:: data_collections_api.cli
:members:
:show-inheritance:
:undoc-members:
Loading