diff --git a/data_collections_api/base_schema.py b/data_collections_api/base_schema.py deleted file mode 100644 index 7c65c55..0000000 --- a/data_collections_api/base_schema.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Parsing schema for metadata.""" - -from __future__ import annotations - -from datetime import date -from urllib.parse import urlparse, urlunparse -from uuid import UUID - -from schema import And, Optional, Or, Regex, Schema, Use - -ORCID_ID_RE = r"(\d{4}-){3}\d{4}" - -id_schema = Or( - { - "scheme": "orcid", - "identifier": Regex(ORCID_ID_RE), - }, - { - "identifier": And(Use(urlparse), lambda x: x.scheme and x.netloc, Use(urlunparse)), - Optional("scheme", default="doi"): "doi", - }, -) - -creator_schema = Schema( - { - Optional("affiliations"): [ - { - "name": str, - }, - ], - "person_or_org": { - Or("name", "family_name"): And(str, len), - Optional("given_name"): And(str, len), - Optional("identifiers"): [id_schema], - "type": Or("personal"), - }, - }, - ignore_extra_keys=True, -) - -metadata_schema = Schema( - { - "title": And(str, len), - "description": And(str, len), - "creators": [creator_schema], - "rights": [ - { - "id": Or("cc-by-4.0"), - }, - ], - "resource_type": { - "id": Or("model"), - }, - Optional("subjects", default=[]): [{"subject": str}], - "version": Regex(r"^v\d+(\.\d+)*"), - Optional("publisher"): str, - Optional("publication_date"): Or(date.fromisoformat, date.fromtimestamp), - Optional("identifiers"): [id_schema], - }, -) - -base_schema = Schema( - { - Optional("access", default={"files": "public", "record": "public"}): { - Optional("embargo"): { - "active": bool, - "reason": Or(str, None), - }, - Optional("files", default="public"): Or("public", "private"), - Optional("record", default="public"): Or("public", "private"), - Optional("status"): Or("open", "closed"), - }, - Optional("files"): {"enabled": bool}, - "custom_fields": {"dsmd": [dict]}, - "metadata": metadata_schema, - Optional("community"): UUID, - }, -) diff --git 
a/data_collections_api/cli/data_collections_main.py b/data_collections_api/cli/data_collections_main.py index 288c2aa..61bf82b 100644 --- a/data_collections_api/cli/data_collections_main.py +++ b/data_collections_api/cli/data_collections_main.py @@ -9,6 +9,7 @@ from data_collections_api.cli.record_upload import get_arg_parser as get_upload_parser from data_collections_api.cli.record_upload import main as upload_main from data_collections_api.metadata import dump_example, validate_cli +from data_collections_api.schemas import SCHEMAS def get_arg_parser() -> argparse.ArgumentParser: @@ -47,6 +48,13 @@ def get_arg_parser() -> argparse.ArgumentParser: help="Parse FILE as this type (default: determine from suffix).", default=None, ) + sp.add_argument( + "-S", + "--schema", + choices=SCHEMAS.keys(), + help="Validate against given schema (default: default).", + default="default", + ) sp.set_defaults(func=validate_cli) # Dump @@ -61,7 +69,7 @@ def get_arg_parser() -> argparse.ArgumentParser: "-f", "--format", choices=("json", "yaml"), - help="Parse FILE as this type (default: determine from suffix).", + help="Dump FILE as this type (default: determine from suffix).", default=None, ) sp.set_defaults(func=dump_example) diff --git a/data_collections_api/dumpers.py b/data_collections_api/dumpers.py index 34dd1f6..0475650 100644 --- a/data_collections_api/dumpers.py +++ b/data_collections_api/dumpers.py @@ -6,7 +6,7 @@ from contextlib import suppress import json from pathlib import Path -from typing import Any, Literal, NamedTuple, TextIO +from typing import Any, Literal, NamedTuple, TextIO, overload _YAML_TYPE = None @@ -313,6 +313,22 @@ def get_str_loader(fmt: Formats): return get_load_dump(fmt, loader=True, string=True) +@overload +def guess_format(path: Path) -> Formats: ... # numpydoc ignore=GL08 + + +@overload +def guess_format( + path: Path, *, raise_on_invalid: Literal[True] +) -> Formats: ... 
# numpydoc ignore=GL08 + + +@overload +def guess_format( + path: Path, *, raise_on_invalid: Literal[False] +) -> Formats | None: ... # numpydoc ignore=GL08 + + def guess_format(path: Path, *, raise_on_invalid: bool = True) -> Formats | None: """ Guess format from path suffix. diff --git a/data_collections_api/metadata.py b/data_collections_api/metadata.py index 944b394..de5b0e3 100644 --- a/data_collections_api/metadata.py +++ b/data_collections_api/metadata.py @@ -6,7 +6,6 @@ from functools import singledispatch from pathlib import Path -from data_collections_api.base_schema import base_schema as schema from data_collections_api.dumpers import ( Formats, get_dumper, @@ -14,6 +13,7 @@ get_str_loader, guess_format, ) +from data_collections_api.schemas import Schema, get_schema EXAMPLES_FOLDER = Path(__file__).parent / "examples" @@ -64,31 +64,31 @@ def validate_metadata(_val, fmt: Formats | None = None): @validate_metadata.register(dict) -def _(data: dict) -> dict: - return schema.validate(data) +def _(data: dict, schema: Schema | str) -> dict: + return get_schema(schema).validate(data) @validate_metadata.register(str) -def _(data: Path | str, fmt: Formats) -> dict: +def _(data: Path | str, schema: Schema | str, fmt: Formats) -> dict: try: data = get_str_loader(fmt)(data) except Exception: data = Path(data) - return validate_metadata(data) - else: - return schema.validate(data) + return validate_metadata(data, schema) + + return get_schema(schema).validate(data) @validate_metadata.register(Path) -def _(path: Path, fmt: Formats | None = None) -> dict: +def _(path: Path, schema: Schema | str, fmt: Formats | None = None) -> dict: fmt = fmt or guess_format(path) data = get_loader(fmt)(path) - return schema.validate(data) + return get_schema(schema).validate(data) @validate_metadata.register(argparse.Namespace) def _(inp: argparse.Namespace) -> dict: - return validate_metadata(inp.file, inp.format) + return validate_metadata(inp.file, inp.schema, inp.format) def validate_cli(inp: argparse.Namespace) -> dict: diff
--git a/data_collections_api/schemas/__init__.py b/data_collections_api/schemas/__init__.py new file mode 100644 index 0000000..fe54fed --- /dev/null +++ b/data_collections_api/schemas/__init__.py @@ -0,0 +1,52 @@ +"""Module defining different schemas available for use.""" + +from __future__ import annotations + +from functools import singledispatch + +from schema import Schema as Schema + +from .base import base_schema + +SCHEMAS = { + "base": base_schema, + "default": base_schema, +} + + +@singledispatch +def get_schema(schema) -> Schema: + """ + Get schema. + + Parameters + ---------- + schema : Schema | str + Schema to get. + + Returns + ------- + Schema + Desired schema. + + Raises + ------ + NotImplementedError + Passed an invalid type. + + Examples + -------- + >>> get_schema(base_schema) + >>> get_schema("default") + """ + raise NotImplementedError(f"Cannot find schema with {type(schema).__name__}") + + +@get_schema.register +def _(schema: Schema) -> Schema: + return schema + + +@get_schema.register +def _(schema: str) -> Schema: + return SCHEMAS[schema] diff --git a/data_collections_api/schemas/base.py b/data_collections_api/schemas/base.py new file mode 100644 index 0000000..8556092 --- /dev/null +++ b/data_collections_api/schemas/base.py @@ -0,0 +1,108 @@ +"""Parsing schema for metadata.""" + +from __future__ import annotations + +from datetime import date +from urllib.parse import urlparse, urlunparse + +from schema import And, Literal, Optional, Or, Regex, Schema, Use + +ORCID_ID_RE = r"(\d{4}-){3}\d{3}[\dX]" +UUID_RE = r"[0-9a-fA-F]{8}-([0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}" + +id_schema = Or( + { + Literal("scheme", description="ID scheme."): "orcid", + Literal("identifier", description="An [ORCID](https://orcid.org)."): Regex(ORCID_ID_RE), + }, + { + Optional(Literal("scheme", description="ID scheme."), default="doi"): "doi", + Literal("identifier", description="A [DOI](https://www.doi.org)"): And( + Use(urlparse), lambda x: x.scheme and x.netloc, Use(urlunparse) + ), + }, +) + 
+creator_schema = Schema( + { + Optional(Literal("affiliations", description="Member affiliations.")): [ + { + Literal("name", description="Name of institution."): str, + }, + ], + Literal("person_or_org", description="Person or organisation."): { + Or( + Literal("name", description="Full set of given names."), + Literal("family_name", description="Family name(s)."), + ): And(str, len), + Optional(Literal("given_name", description="Given name(s).")): And(str, len), + Optional(Literal("identifiers", description="ORCIDs or other IDs")): [id_schema], + Literal("type", description="Personal or organisation."): Or("personal"), + }, + }, + ignore_extra_keys=True, +) + +metadata_schema = Schema( + { + Literal("title", description="Title of resource."): And(str, len), + Literal("description", description="Summary of resource."): And(str, len), + Literal("creators", description="List of creators."): [creator_schema], + Literal("rights", description="Rights or license."): [ + { + Literal("id", description="ID of rights or license."): Or("cc-by-4.0"), + }, + ], + Literal("resource_type", description="Type of resource."): { + Literal("id", description="Resource class."): Or("model"), + }, + Optional( + Literal("subjects", description="List of keywords defining subjects resource covers."), + default=[], + ): [{Literal("subject", description="Subject keyword."): str}], + Literal("version", description="Current version of resource."): Regex(r"^v\d+(\.\d+)*"), + Optional(Literal("publisher", description="Publisher of resource.")): str, + Optional(Literal("publication_date", description="Date of publication of resource.")): Or( + date.fromisoformat, date.fromtimestamp + ), + Optional( + Literal("identifiers", description="Resource identifiers such as ORCID or DOI.") + ): [id_schema], + }, +) + +base_schema = Schema( + { + Optional( + Literal("access", description="Accessibility of data outside of owners."), + default={"files": "public", "record": "public"}, + ): { + 
Optional(Literal("embargo", description="Details of resource embargo.")): { + Literal("active", description="Whether resource is under embargo."): bool, + Literal("reason", description="Cause for embargo."): Or(str, None), + }, + Optional( + Literal("files", description="Accessibility to individual files."), default="public" + ): Or("public", "private"), + Optional( + Literal("record", description="Accessibility to record as a whole."), + default="public", + ): Or("public", "private"), + Optional(Literal("status", description="Current status of resource.")): Or( + "open", "closed" + ), + }, + Optional(Literal("files", description="Details of files.")): { + Literal("enabled", description="Whether file is enabled."): bool + }, + Literal("custom_fields", description="Block for custom data."): { + Literal("dsmd", description="Domain specific metadata (dsmd)."): [dict] + }, + Literal("metadata", description="Resource metadata."): metadata_schema, + Optional( + Literal("community", description="UUID of community associated with resource.") + ): Regex(UUID_RE), + }, + description="Base schema from which community specific schemas are built.", + name="base", +) diff --git a/docs/source/api/data_collections_api.cli.rst b/docs/source/api/data_collections_api.cli.rst new file mode 100644 index 0000000..55eb18d --- /dev/null +++ b/docs/source/api/data_collections_api.cli.rst @@ -0,0 +1,29 @@ +data\_collections\_api.cli package +================================== + +Submodules +---------- + +data\_collections\_api.cli.data\_collections\_main module +--------------------------------------------------------- + +.. automodule:: data_collections_api.cli.data_collections_main + :members: + :show-inheritance: + :undoc-members: + +data\_collections\_api.cli.record\_upload module +------------------------------------------------ + +.. automodule:: data_collections_api.cli.record_upload + :members: + :show-inheritance: + :undoc-members: + +Module contents +--------------- + +.. 
automodule:: data_collections_api.cli + :members: + :show-inheritance: + :undoc-members: diff --git a/docs/source/api/data_collections_api.rst b/docs/source/api/data_collections_api.rst index 1572033..eae62b2 100644 --- a/docs/source/api/data_collections_api.rst +++ b/docs/source/api/data_collections_api.rst @@ -8,6 +8,7 @@ Subpackages :maxdepth: 4 data_collections_api.cli + data_collections_api.schemas Submodules ---------- @@ -36,14 +37,6 @@ data\_collections\_api.metadata module :show-inheritance: :undoc-members: -data\_collections\_api.schema module ------------------------------------- - -.. automodule:: data_collections_api.schema - :members: - :show-inheritance: - :undoc-members: - Module contents --------------- diff --git a/docs/source/api/data_collections_api.schemas.rst b/docs/source/api/data_collections_api.schemas.rst new file mode 100644 index 0000000..05185db --- /dev/null +++ b/docs/source/api/data_collections_api.schemas.rst @@ -0,0 +1,21 @@ +data\_collections\_api.schemas package +====================================== + +Submodules +---------- + +data\_collections\_api.schemas.base module +------------------------------------------ + +.. automodule:: data_collections_api.schemas.base + :members: + :show-inheritance: + :undoc-members: + +Module contents +--------------- + +.. automodule:: data_collections_api.schemas + :members: + :show-inheritance: + :undoc-members: diff --git a/docs/source/api/modules.rst b/docs/source/api/modules.rst index 19f1e46..4ce85db 100644 --- a/docs/source/api/modules.rst +++ b/docs/source/api/modules.rst @@ -1,5 +1,5 @@ -API Documentation -================= +data_collections_api +==================== .. toctree:: :maxdepth: 4 diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 4f77bc0..f25f5a1 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -1,32 +1,228 @@ CLI Usage ========= -``data_collections_api`` provides a few commandline tools for -simplifying the process of uploading or verifying data. 
+``data_collections_api`` provides a few command-line tools for simplifying the process of uploading +or verifying data and metadata. data_collections ---------------- -``data_collections`` is the general top-level interface to the -tools. These tools are implemented as sub-parsers within the main -module. +.. program:: data_collections +.. describe:: data_collections + + .. option:: operation {validate,template,dump,upload} + + .. option:: validate + + Validate metadata + + .. option:: template + .. option:: dump + + Dump a template file. + + .. option:: `upload` + + Upload a dataset to an invenio repository. + + .. option:: -V, --version + + Show program's version number and exit. + +``data_collections`` is the general top-level interface to the tools. These tools are implemented as +sub-parsers within the main module. + +.. admonition:: Running ``data_collections`` + + By default, if the ``data_collections_api`` package is installed, ``data_collections`` is + installed as an executable script on your main ``PATH``. In general, this is the main entry + point. + + If that is not desired, it is possible to run ``data_collections`` through the python module + system:: + + python -m data_collections_api + + where the ``data_collections_api`` **module** (folder) is on the current ``sys.path`` (by being + installed, in the current ``PYTHONPATH`` or being in the current working directory.):: + + PYTHONPATH=/path/containing/data_collections_api python -m data_collections_api + + Throughout the rest of this page, we will assume ``data_collections`` is used as the main + entrypoint. + + +.. _upload: upload ****** -Construct a set of data and upload a set of files along with the metadata to an -Invenio repository. +.. program:: data_collections upload +.. describe:: data_collections upload + + .. option:: --api-url URL + + URL for the API associated with the Invenio repository, e.g. + https://data-collections-staging.psdi.ac.uk/api + + .. 
option:: --api-key str + + Your API key/token for accessing the Invenio repository instance. + + .. option:: --metadata-path file + + File path to the yaml file containing the metadata to upload a record to an Invenio + repository, e.g. path/to/files/record.yaml + + .. option:: -f {json,yaml}, --metadata-format {json,yaml} + + Parse metadata file as this type (default: yaml). + + .. option:: --files FILES [FILES ...] + + List of file paths associated with the record to be uploaded, e.g. path/to/files/data.* + + .. option:: --community str + + Name of an Invenio repository community to upload the record to, e.g. biosimdb, + data-to-knowledge, etc. + + +``data_collections_api`` can take your data and metadata and automatically upload it to the Invenio +repository. To do so, you need to have some information at hand: + +- The URL of the repository you wish to upload the data to. In the case of PSDI data, this will + often be https://data-collections.psdi.ac.uk. +- Your API key (also called a Personal Access Token or PAT) for the repository to give permissions + to write and upload data. +- A metadata file detailing the data relating to the files (see :doc:`schemas/index`). +- The files ready to upload. + +With all this prepared, uploading the data is as simple as: + +.. code-block:: console + + data_collections upload --api-url https://data-collections.psdi.ac.uk --api-key 1234567890abcdef --metadata-path /path/to/metadata_file.yaml --files FILE1 FILE2 --community my_community + +.. note:: + + Since this is a common operation, it is also available as the standalone :option:`upload_record`. + +.. _validate: validate ******** +.. program:: data_collections validate + +.. describe:: data_collections validate + + .. option:: FILE + + File to validate. + + .. option:: -f {json,yaml}, --format {json,yaml} + + Parse :option:`FILE` as this type (default: determine from suffix). + + .. 
option:: -S SCHEMA, --schema SCHEMA + + Validate against the given schema (default: :doc:`base`) + Validate the metadata file for a dataset before uploading. +``data_collections_api`` can validate your metadata file against the schema to verify the contents +of the file match what is required to make a valid upload. + +.. note:: + + The validator does not verify most data itself, you must ensure that all entries are spelled and + written correctly. + +To validate a data file simply run: + +.. code-block:: console + + data_collections validate [file] + +e.g. + +.. code-block:: console + + data_collections validate examples/biosim_record.yaml + +The file can be either in ``json`` or ``yaml`` formats (see: :doc:`schema`). :option:`data_collections validate` will attempt to determine the +appropriate format from the file extension, but this can be specified explicitly with the ``-f`` +flag. + +.. code-block:: console + + data_collections validate -f json examples/biosim_record.yaml + +.. note:: + + The above will raise an error since the file is not in ``json`` format. + dump **** -Dump a template metadata file ready for modification to upload. +.. program:: data_collections template +.. describe:: data_collections template +.. describe:: data_collections dump + + .. option:: FILE + + File to dump. + + .. option:: -f {json,yaml}, --format {json,yaml} + + Dump :option:`FILE` as this type (default: determine from suffix). + +``data_collections_api`` provides a method to quick-start building metadata, ``template`` will dump +an example metadata file for a particular community and data-type (though currently only a basic +example is available). To do so, simply run + +.. code-block:: console + + data_collections dump my_metadata.yaml + +You can then edit and modify this template to fill in the data needed. upload_record ------------- + +.. program:: upload_record +.. describe:: upload_record + + .. 
option:: --api-url URL + + URL for the API associated with the Invenio repository, e.g. + https://data-collections-staging.psdi.ac.uk/api + + .. option:: --api-key str + + Your API key/token for accessing the Invenio repository instance. + + .. option:: --metadata-path file + + File path to the yaml file containing the metadata to upload a record to an Invenio + repository, e.g. path/to/files/record.yaml + + .. option:: -f {json,yaml}, --metadata-format {json,yaml} + + Parse metadata file as this type (default: yaml). + + .. option:: --files FILES [FILES ...] + + List of file paths associated with the record to be uploaded, e.g. ``path/to/files/data.*`` + + .. option:: --community str + + Name of an Invenio repository community to upload the record to, e.g. biosimdb, + data-to-knowledge, etc. + + +One-stop tool to upload a record to the repository, see `upload`_. + +.. _pat_guide: ... diff --git a/docs/source/conf.py b/docs/source/conf.py index 829180d..e2c0d2f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -8,8 +8,16 @@ from __future__ import annotations import time +import sys +from pathlib import Path + +DOCS_SRC = Path(__file__).parent.resolve() +sys.path.append(str(DOCS_SRC.parents[1])) +sys.path.append(str(DOCS_SRC)) import data_collections_api +from scripts.schema_gen import main as gen_schema + project = "Data Collections API" copyright_first_year = "2024" @@ -24,13 +32,27 @@ extensions = [ "numpydoc", "sphinx.ext.autodoc", + # "sphinx.ext.apidoc", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.mathjax", "sphinx.ext.viewcode", "sphinxcontrib.contentui", + "myst_parser", +] + +source_suffix = { + ".rst": "restructuredtext", + ".txt": "markdown", + ".md": "markdown", +} + +apidoc_modules = [ + {"path": "../../data_collections_api", "destination": "api/"}, ] +gen_schema(["-Fv", f"-o={DOCS_SRC / 'schemas'}", "-O=%s.md", "all"]) + always_use_bars_union = True napoleon_include_special_with_doc = True napoleon_use_param = True @@ 
-45,7 +67,6 @@ "python": ("https://docs.python.org/3", None), } - templates_path = ["_templates"] exclude_patterns = [] diff --git a/docs/source/index.rst b/docs/source/index.rst index 766a609..78bccf3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -14,4 +14,6 @@ Project to allow simplified editing and construction of Invenio data for the PSD :caption: Contents: cli - api/modules + schema + schemas/index + API Documentation <api/modules> diff --git a/docs/source/schema.rst b/docs/source/schema.rst new file mode 100644 index 0000000..2afa57e --- /dev/null +++ b/docs/source/schema.rst @@ -0,0 +1,6 @@ +Metadata Format +=============== + +The metadata file may be in either `json <https://www.json.org/>`__ or `yaml <https://yaml.org/>`__ format. + +Each community may have their own metadata requirements, which are all encompassed in the ``custom_fields.dsmd`` field. The full list of supported schemas is available in the :doc:`schemas/index` section. diff --git a/docs/source/schemas/base.md b/docs/source/schemas/base.md new file mode 100644 index 0000000..1183cdc --- /dev/null +++ b/docs/source/schemas/base.md @@ -0,0 +1,44 @@ +# base + +Base schema from which community specific schemas are built. + +### Type: `object` + +> ⚠️ Additional properties are not allowed. + +| Property | Type | Required | Possible values | Default | Description | +| -------- | ---- | -------- | --------------- | ------- | ----------- | +| custom_fields | `object` | ✅ | object | | Block for custom data. | +| custom_fields.dsmd | `array` | ✅ | object | | Domain specific metadata (dsmd). | +| metadata | `object` | ✅ | object | | Resource metadata. | +| metadata.title | `string` | ✅ | string | | Title of resource. | +| metadata.description | `string` | ✅ | string | | Summary of resource. | +| metadata.creators | `array` | ✅ | object | | List of creators. | +| metadata.creators[].affiliations | `array` | | object | | Member affiliations. | +| metadata.creators[].affiliations[].name | `string` | ✅ | string | | Name of institution. 
| +| metadata.creators[].person_or_org | `object` | ✅ | object | | Person or organisation. | +| metadata.creators[].person_or_org.name | `string` | | string | | Full set of given names. | +| metadata.creators[].person_or_org.family_name | `string` | | string | | Family name(s). | +| metadata.creators[].person_or_org.given_name | `string` | | string | | Given name(s). | +| metadata.creators[].person_or_org.identifiers | `array` | | object and/or object | | ORCIDs or other IDs | +| metadata.creators[].person_or_org.type | `const` | ✅ | `personal` | | Personal or organisation. | +| metadata.rights | `array` | ✅ | object | | Rights or license. | +| metadata.rights[].id | `const` | ✅ | `cc-by-4.0` | | ID of rights or license. | +| metadata.resource_type | `object` | ✅ | object | | Type of resource. | +| metadata.resource_type.id | `const` | ✅ | `model` | | Resource class. | +| metadata.subjects | `array` | | object | `[]` | List of keywords defining subjects resource covers. | +| metadata.subjects[].subject | `string` | ✅ | string | | Subject keyword. | +| metadata.version | `string` | ✅ | [`^v\d+(\.\d+)*`](https://regex101.com/?regex=%5Ev%5Cd%2B%28%5C.%5Cd%2B%29%2A) | | Current version of resource. | +| metadata.publisher | `string` | | string | | Publisher of resource. | +| metadata.publication_date | `None` | | None | | Date of publication of resource. | +| metadata.identifiers | `array` | | object and/or object | | Resource identifiers such as ORCID or DOI. | +| access | `object` | | object | `{"files": "public", "record": "public"}` | Accessibility of data outside of owners. | +| access.embargo | `object` | | object | | Details of resource embargo. | +| access.embargo.active | `boolean` | ✅ | boolean | | Whether resource is under embargo. | +| access.embargo.reason | `string` or `null` | ✅ | string | | Cause for embargo. | +| access.files | `None` | | `public` `private` | `"public"` | Accessibility to individual files. 
| +| access.record | `None` | | `public` `private` | `"public"` | Accessibility to record as a whole. | +| access.status | `None` | | `open` `closed` | | Current status or resource. | +| files | `object` | | object | | Details of files. | +| files.enabled | `boolean` | ✅ | boolean | | Whether file is enabled. | +| community | `string` | | [`\d{8}-(\d{4}-){3}\d{12}`](https://regex101.com/?regex=%5Cd%7B8%7D-%28%5Cd%7B4%7D-%29%7B3%7D%5Cd%7B12%7D) | | UUID of community associated with resource. | diff --git a/docs/source/schemas/index.rst b/docs/source/schemas/index.rst new file mode 100644 index 0000000..abff4eb --- /dev/null +++ b/docs/source/schemas/index.rst @@ -0,0 +1,10 @@ +Schemas +======= + +This page documents the available schemas. + +.. toctree:: + :maxdepth: 1 + :caption: Schemas: + + base diff --git a/docs/source/scripts/__init__.py b/docs/source/scripts/__init__.py new file mode 100644 index 0000000..14e5b9b --- /dev/null +++ b/docs/source/scripts/__init__.py @@ -0,0 +1 @@ +"""Set of scripts for building docs.""" diff --git a/docs/source/scripts/schema_gen.py b/docs/source/scripts/schema_gen.py new file mode 100644 index 0000000..00bceed --- /dev/null +++ b/docs/source/scripts/schema_gen.py @@ -0,0 +1,265 @@ +"""Generate schema documentation.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from shutil import rmtree +import sys +from textwrap import indent +from typing import TYPE_CHECKING + +import jsonschema_markdown + +from data_collections_api.schemas import SCHEMAS, Schema, get_schema + +if TYPE_CHECKING: + from collections.abc import Sequence + +__author__ = "Jacob Wilkins" +__version__ = "0.1" + +INDEX_MD = """\ +{filename} +{underline} + +This page documents the available schemas. + +.. toctree:: + :maxdepth: 1 + :caption: Schemas: + +{schemas} + +""" + + +def get_arg_parser() -> argparse.ArgumentParser: + """Get parser for CLI. + + Returns + ------- + argparse.ArgumentParser + Arg parser. 
+ """ + parser = argparse.ArgumentParser( + description="Convert a schema to a markdown document.", + ) + + parser.add_argument("-V", "--version", action="version", version=f"%(prog)s v{__version__}") + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Print while generating schemas", + ) + parser.add_argument( + "-F", + "--force", + action="store_true", + help="Force removal of output directory (if not CWD). (default: %(default)s)", + ) + parser.add_argument( + "schemas", + nargs="*", + choices=SCHEMAS.keys() | {"all"}, + help="Schemas to convert or 'all' if all are to be done. (default: %(default)r)", + default="all", + ) + + parser.add_argument( + "--clear", + action=argparse.BooleanOptionalAction, + help="Clear folder before writing. (default: %(default)s)", + default=True, + ) + parser.add_argument( + "--index", + action=argparse.BooleanOptionalAction, + help="Write index file with toctree to folder. (default: %(default)s)", + default=True, + ) + + parser.add_argument( + "--header", + help="Title of index file. (default: %(default)r)", + default="Schemas", + ) + + parser.add_argument( + "-O", + "--out-name", + help=( + "Format to use for naming output, " + "substituting '%%s' for schema key. (default: %(default)r)" + ), + default="%s.md", + ) + parser.add_argument( + "-o", + "--out-folder", + help="Folder to write formatted docs in. (default: %(default)r)", + default="schemas", + type=Path, + ) + + return parser + + +def process_schema( + schema_key: Schema | str, + *, + name: str | None = None, +) -> str: + """Process a schema into markdown. + + Parameters + ---------- + schema_key : Schema or str + Key for schemas. + name : str, optional + Override for name (mandatory if passing :class:`schema` directly). + + Returns + ------- + str + Markdown rendered documentation. + + Raises + ------ + ValueError + Name not passed with Schema. 
+ """ + match (schema_key, name): + case (_, str() as inp): + name = inp + case (str() as inp, _): + name = inp + case _: + raise ValueError(f"Cannot reliably determine name from {type(schema_key).__name__}") + + schema = get_schema(schema_key) + json_schema = schema.json_schema(name) + + return jsonschema_markdown.generate( + json_schema, + title=name, + footer=False, + hide_empty_columns=True, + ) + + +def get_filename(fmt: str, key: str) -> str: + """Format filename from CLI. + + Parameters + ---------- + fmt : str + CLI format. + key : str + Schema key. + + Returns + ------- + str + Formatted filename. + + Examples + -------- + >>> get_filename("%s.md", "base") + 'base.md' + """ + return fmt % key + + +def clear_folder(folder: Path, *, force: bool = False, verbose: bool = False) -> None: + """Delete folder and create new (empty) one. + + Parameters + ---------- + folder : Path + Folder to clear. + force : bool + Do not ask whether to remove folder. + verbose : bool + Print status. + """ + if not folder.exists(): + return + + if folder.samefile(Path.cwd()): + print("Cannot clear folder as this is current working directory.") + return + + if ( + not force + and input(f"Running this will clear {folder}, are you sure you want to continue? [y/N] ") + .strip() + .lower() + != "y" + ): + print("Cancelling.") + sys.exit() + + if verbose: + print(f"Deleting {folder}...") + + rmtree(folder, ignore_errors=True) + folder.mkdir() + + +def main(args_in: Sequence[str] | None = None, /) -> None: + """Parse schemas and dump to file. + + Parameters + ---------- + args_in : Sequence[str], optional + Pass CLI params directly. 
+ """ + parser = get_arg_parser() + args = parser.parse_args(args_in) + + # Get unique (by schema), but ordered keys matching reqs + schemas = { + schema: key + for key, schema in reversed(SCHEMAS.items()) + if "all" in args.schemas or key in args.schemas + } + out_names = [get_filename(args.out_name, key) for key in schemas.values()] + + if args.verbose: + print(f"Generating schemas for keys {', '.join(map(repr, schemas.values()))}...") + + if args.clear: + clear_folder(args.out_folder, force=args.force, verbose=args.verbose) + + for key, out_name in zip(schemas.values(), out_names, strict=True): + out_path = args.out_folder / out_name + + if args.verbose: + print(f"Generating schema for {key!r} to {out_path}...") + + markdown = process_schema(key) + + with out_path.open("w", encoding="utf-8") as out: + out.write(markdown) + + if args.index: + if args.verbose: + print(f"Writing index to {args.out_folder / 'index.rst'}...") + + with (args.out_folder / "index.rst").open("w", encoding="utf-8") as out: + out.write( + INDEX_MD.format( + filename=args.header, + underline="=" * len(args.header), + schemas=indent("\n".join(Path(key).stem for key in out_names), " " * 3), + ) + ) + + if args.verbose: + print("Done with schemas") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 8326679..6553353 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,8 @@ docs = [ "sphinxcontrib-contentui<1.0.0,>=0.2.5", "furo==2025.9.25", "numpydoc>=1.9.0", + "myst-parser", + "jsonschema-markdown", ] lint = ["pre-commit<5.0.0,>=4.2.0", "ruff==0.13.3", "numpydoc>=0.19.0"] test = ["pytest==8.3.4", "pytest-cov==5.0.0"]