From bd2e891928347a346c69be65049a72eb3317004b Mon Sep 17 00:00:00 2001
From: Jacob Wilkins
Date: Tue, 20 Jan 2026 09:00:16 +0000
Subject: [PATCH 1/3] Add initial docs

---
 data_collections_api/schemas/base.py          | 102 +++++---
 docs/source/api/data_collections_api.cli.rst  |  29 +++
 docs/source/api/data_collections_api.rst      |   9 +-
 .../api/data_collections_api.schemas.rst      |  21 ++
 docs/source/api/modules.rst                   |   4 +-
 docs/source/cli.rst                           |  79 +++++-
 docs/source/conf.py                           |  23 +-
 docs/source/index.rst                         |   2 +
 docs/source/schema.rst                        |   6 +
 docs/source/schemas/base.md                   |  44 ++++
 docs/source/schemas/index.rst                 |  11 +
 docs/source/scripts/__init__.py               |   0
 docs/source/scripts/schema_gen.py             | 245 ++++++++++++++++++
 pyproject.toml                                |   6 +-
 14 files changed, 530 insertions(+), 51 deletions(-)
 create mode 100644 docs/source/api/data_collections_api.cli.rst
 create mode 100644 docs/source/api/data_collections_api.schemas.rst
 create mode 100644 docs/source/schema.rst
 create mode 100644 docs/source/schemas/base.md
 create mode 100644 docs/source/schemas/index.rst
 create mode 100644 docs/source/scripts/__init__.py
 create mode 100644 docs/source/scripts/schema_gen.py

diff --git a/data_collections_api/schemas/base.py b/data_collections_api/schemas/base.py
index 7c65c55..8556092 100644
--- a/data_collections_api/schemas/base.py
+++ b/data_collections_api/schemas/base.py
@@ -4,35 +4,40 @@
 from datetime import date
 from urllib.parse import urlparse, urlunparse
-from uuid import UUID
 
-from schema import And, Optional, Or, Regex, Schema, Use
+from schema import And, Literal, Optional, Or, Regex, Schema, Use
 
 ORCID_ID_RE = r"(\d{4}-){3}\d{4}"
+UUID_RE = r"[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}"
 
 id_schema = Or(
     {
-        "scheme": "orcid",
-        "identifier": Regex(ORCID_ID_RE),
+        Literal("scheme", description="ID scheme."): "orcid",
+        Literal("identifier", description="An [ORCID](https://orcid.org)."): Regex(ORCID_ID_RE),
     },
     {
-        "identifier": And(Use(urlparse), lambda x: x.scheme and x.netloc, Use(urlunparse)),
-        Optional("scheme", default="doi"): "doi",
+        Optional(Literal("scheme", description="ID scheme."), default="doi"): "doi",
+        Literal("identifier", description="A [DOI](https://www.doi.org)."): And(
+            Use(urlparse), lambda x: x.scheme and x.netloc, Use(urlunparse)
+        ),
     },
 )
 
 creator_schema = Schema(
     {
-        Optional("affiliations"): [
+        Optional(Literal("affiliations", description="Member affiliations.")): [
             {
-                "name": str,
+                Literal("name", description="Name of institution."): str,
             },
         ],
-        "person_or_org": {
-            Or("name", "family_name"): And(str, len),
-            Optional("given_name"): And(str, len),
-            Optional("identifiers"): [id_schema],
-            "type": Or("personal"),
+        Literal("person_or_org", description="Person or organisation."): {
+            Or(
+                Literal("name", description="Full set of given names."),
+                Literal("family_name", description="Family name(s)."),
+            ): And(str, len),
+            Optional(Literal("given_name", description="Given name(s).")): And(str, len),
+            Optional(Literal("identifiers", description="ORCIDs or other IDs.")): [id_schema],
+            Literal("type", description="Personal or organisation."): Or("personal"),
         },
     },
     ignore_extra_keys=True,
@@ -40,39 +45,64 @@
 
 metadata_schema = Schema(
     {
-        "title": And(str, len),
-        "description": And(str, len),
-        "creators": [creator_schema],
-        "rights": [
+        Literal("title", description="Title of resource."): And(str, len),
+        Literal("description", description="Summary of resource."): And(str, len),
+        Literal("creators", description="List of creators."): [creator_schema],
+        Literal("rights", description="Rights or license."): [
             {
-                "id": Or("cc-by-4.0"),
+                Literal("id", description="ID of rights or license."): Or("cc-by-4.0"),
             },
         ],
-        "resource_type": {
-            "id": Or("model"),
+        Literal("resource_type", description="Type of resource."): {
+            Literal("id", description="Resource class."): Or("model"),
         },
-        Optional("subjects", default=[]): [{"subject": str}],
-        "version": Regex(r"^v\d+(\.\d+)*"),
-        Optional("publisher"): str,
-        Optional("publication_date"): Or(date.fromisoformat, date.fromtimestamp),
-        Optional("identifiers"): [id_schema],
+        Optional(
+            Literal("subjects", description="List of keywords defining subjects resource covers."),
+            default=[],
+        ): [{Literal("subject", description="Subject keyword."): str}],
+        Literal("version", description="Current version of resource."): Regex(r"^v\d+(\.\d+)*"),
+        Optional(Literal("publisher", description="Publisher of resource.")): str,
+        Optional(Literal("publication_date", description="Date of publication of resource.")): Or(
+            date.fromisoformat, date.fromtimestamp
+        ),
+        Optional(
+            Literal("identifiers", description="Resource identifiers such as ORCID or DOI.")
+        ): [id_schema],
     },
 )
 
 base_schema = Schema(
     {
-        Optional("access", default={"files": "public", "record": "public"}): {
-            Optional("embargo"): {
-                "active": bool,
-                "reason": Or(str, None),
+        Optional(
+            Literal("access", description="Accessibility of data outside of owners."),
+            default={"files": "public", "record": "public"},
+        ): {
+            Optional(Literal("embargo", description="Details of resource embargo.")): {
+                Literal("active", description="Whether resource is under embargo."): bool,
+                Literal("reason", description="Cause for embargo."): Or(str, None),
             },
-            Optional("files", default="public"): Or("public", "private"),
-            Optional("record", default="public"): Or("public", "private"),
-            Optional("status"): Or("open", "closed"),
+            Optional(
+                Literal("files", description="Accessibility to individual files."), default="public"
+            ): Or("public", "private"),
+            Optional(
+                Literal("record", description="Accessibility to record as a whole."),
+                default="public",
+            ): Or("public", "private"),
+            Optional(Literal("status", description="Current status of resource.")): Or(
+                "open", "closed"
+            ),
        },
-        Optional("files"): {"enabled": bool},
-        "custom_fields": {"dsmd": [dict]},
-        "metadata": metadata_schema,
-        Optional("community"): UUID,
+        Optional(Literal("files", description="Details of files.")): {
+            Literal("enabled", description="Whether file is enabled."): bool
+        },
+        Literal("custom_fields", description="Block for custom data."): {
+            Literal("dsmd", description="Domain specific metadata (dsmd)."): [dict]
+        },
+        Literal("metadata", description="Resource metadata."): metadata_schema,
+        Optional(
+            Literal("community", description="UUID of community associated with resource.")
+        ): Regex(UUID_RE),
     },
+    description="Base schema from which community specific schemas are built.",
+    name="base",
 )
diff --git a/docs/source/api/data_collections_api.cli.rst b/docs/source/api/data_collections_api.cli.rst
new file mode 100644
index 0000000..55eb18d
--- /dev/null
+++ b/docs/source/api/data_collections_api.cli.rst
@@ -0,0 +1,29 @@
+data\_collections\_api.cli package
+==================================
+
+Submodules
+----------
+
+data\_collections\_api.cli.data\_collections\_main module
+---------------------------------------------------------
+
+.. automodule:: data_collections_api.cli.data_collections_main
+   :members:
+   :show-inheritance:
+   :undoc-members:
+
+data\_collections\_api.cli.record\_upload module
+------------------------------------------------
+
+.. automodule:: data_collections_api.cli.record_upload
+   :members:
+   :show-inheritance:
+   :undoc-members:
+
+Module contents
+---------------
+
+.. automodule:: data_collections_api.cli
+   :members:
+   :show-inheritance:
+   :undoc-members:
diff --git a/docs/source/api/data_collections_api.rst b/docs/source/api/data_collections_api.rst
index 1572033..eae62b2 100644
--- a/docs/source/api/data_collections_api.rst
+++ b/docs/source/api/data_collections_api.rst
@@ -8,6 +8,7 @@ Subpackages
    :maxdepth: 4
 
    data_collections_api.cli
+   data_collections_api.schemas
 
 Submodules
 ----------
@@ -36,14 +37,6 @@ data\_collections\_api.metadata module
    :show-inheritance:
    :undoc-members:
 
-data\_collections\_api.schema module
-------------------------------------
-
-.. automodule:: data_collections_api.schema
-   :members:
-   :show-inheritance:
-   :undoc-members:
-
 Module contents
 ---------------
 
diff --git a/docs/source/api/data_collections_api.schemas.rst b/docs/source/api/data_collections_api.schemas.rst
new file mode 100644
index 0000000..05185db
--- /dev/null
+++ b/docs/source/api/data_collections_api.schemas.rst
@@ -0,0 +1,21 @@
+data\_collections\_api.schemas package
+======================================
+
+Submodules
+----------
+
+data\_collections\_api.schemas.base module
+------------------------------------------
+
+.. automodule:: data_collections_api.schemas.base
+   :members:
+   :show-inheritance:
+   :undoc-members:
+
+Module contents
+---------------
+
+.. automodule:: data_collections_api.schemas
+   :members:
+   :show-inheritance:
+   :undoc-members:
diff --git a/docs/source/api/modules.rst b/docs/source/api/modules.rst
index 19f1e46..4ce85db 100644
--- a/docs/source/api/modules.rst
+++ b/docs/source/api/modules.rst
@@ -1,5 +1,5 @@
-API Documentation
-=================
+data_collections_api
+====================
 
 .. toctree::
    :maxdepth: 4
diff --git a/docs/source/cli.rst b/docs/source/cli.rst
index 4f77bc0..01a8fbe 100644
--- a/docs/source/cli.rst
+++ b/docs/source/cli.rst
@@ -7,26 +7,101 @@ simplifying the process of uploading or verifying data.
 
 data_collections
 ----------------
 
+.. code-block:: text
+
+   usage: data_collections [-h] [-V] {validate,template,dump,upload} ...
+
+   Single-utility API for data handling with remote depositories.
+
+   positional arguments:
+     {validate,template,dump,upload}
+       validate            Validate metadata
+       template (dump)     Dump a template file.
+       upload              Upload a dataset to an invenio repository.
+
+   options:
+     -h, --help            show this help message and exit
+     -V, --version         show program's version number and exit
+
 ``data_collections`` is the general top-level interface to the
 tools. These tools are implemented as sub-parsers within the main
 module.
 
+
 upload
 ******
 
 Construct a set of data and upload a set of files along with the metadata to an
-Invenio repository.
+Invenio repository. This is an alias for ``upload_record``.
 
 validate
 ********
 
-Validate the metadata file for a dataset before uploading.
+.. code-block:: text
+
+   usage: data_collections validate [-h] [-f {json,yaml}] file
+
+   Validate a metadata file or string.
+
+   positional arguments:
+     file                  File to validate
+
+   options:
+     -h, --help            show this help message and exit
+     -f, --format {json,yaml}
+                        Parse FILE as this type (default: determine from
+                        suffix).
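+
+For example, to check a metadata file before uploading (the file name here is
+illustrative):
+
+.. code-block:: console
+
+   data_collections validate my_record.yaml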
+
+Validate that the metadata file for a dataset complies with the schema before uploading. See :doc:`schema` for details on a valid metadata file.
 
 dump
 ****
 
+.. code-block:: text
+
+   usage: data_collections template [-h] [-f {json,yaml}] file
+
+   Dump a file template to file.
+
+   positional arguments:
+     file                  File to write
+
+   options:
+     -h, --help            show this help message and exit
+     -f, --format {json,yaml}
+                        Parse FILE as this type (default: determine from
+                        suffix).
+
 
 Dump a template metadata file ready for modification to upload.
 
 upload_record
 -------------
+
+.. code-block:: text
+
+   usage: upload_record [-h] --api-url URL --api-key str --metadata-path file
+                        [-f {json,yaml}] [--files FILES [FILES ...]] [--community str]
+
+   Upload records to Invenio repository
+
+   options:
+     -h, --help            show this help message and exit
+     --api-url URL         URL for the API associated with the Invenio repository, e.g.
+                           https://data-collections-staging.psdi.ac.uk/api
+     --api-key str         Your API key/token for accessing the Invenio repository
+                           instance.
+     --metadata-path file  File path to the yaml file containing the metadata to upload
+                           a record to an Invenio repository, e.g.
+                           path/to/files/record.yaml
+     -f {json,yaml}, --metadata-format {json,yaml}
+                           Parse metadata file as this type (default: yaml).
+     --files FILES [FILES ...]
+                           List of file paths associated with the record to be
+                           uploaded, e.g. path/to/files/data.*
+     --community str       Name of an Invenio repository community to upload the record
+                           to, e.g. biosimdb, data-to-knowledge, etc.
+
+
+One-stop tool to upload a record to the repository. This requires that you have already defined your metadata file (see ``dump`` and ``validate``) and obtained an API key (see the PSDI Invenio docs for how to get one).
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 829180d..85566c9 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -8,8 +8,16 @@
 from __future__ import annotations
 
 import time
+import sys
+from pathlib import Path
+
+DOCS_SRC = Path(__file__).parent.resolve()
+sys.path.append(str(DOCS_SRC.parents[1]))
+sys.path.append(str(DOCS_SRC))
 
 import data_collections_api
+from scripts.schema_gen import main as gen_schema
+
 
 project = "Data Collections API"
 copyright_first_year = "2024"
@@ -24,13 +32,27 @@
 extensions = [
     "numpydoc",
     "sphinx.ext.autodoc",
+    "sphinx.ext.apidoc",
     "sphinx.ext.autosummary",
     "sphinx.ext.intersphinx",
     "sphinx.ext.mathjax",
     "sphinx.ext.viewcode",
     "sphinxcontrib.contentui",
+    "myst_parser",
+]
+
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".txt": "markdown",
+    ".md": "markdown",
+}
+
+apidoc_modules = [
+    {"path": "../../data_collections_api", "destination": "api/"},
 ]
+
+gen_schema(["-Fv", f"-o={DOCS_SRC / 'schemas'}", "-O=%s.md", "all"])
 
 always_use_bars_union = True
 napoleon_include_special_with_doc = True
 napoleon_use_param = True
@@ -45,7 +67,6 @@
     "python": ("https://docs.python.org/3", None),
 }
 
-
 templates_path = ["_templates"]
 exclude_patterns = []
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 766a609..efaa7aa 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -14,4 +14,6 @@ Project to allow simplified editing and construction of Invenio data for the PSD
    :caption: Contents:
 
    cli
+   schema
+   schemas/index
    api/modules
diff --git a/docs/source/schema.rst b/docs/source/schema.rst
new file mode 100644
index 0000000..2afa57e
--- /dev/null
+++ b/docs/source/schema.rst
@@ -0,0 +1,6 @@
+Metadata Format
+===============
+
+The metadata file may be in either `JSON <https://www.json.org>`__ or `YAML <https://yaml.org>`__ format.
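+
+As a quick sketch of what validation involves under the hood, the snippet below
+loads a metadata file and checks it against the base schema shipped with the
+package (the file name ``record.yaml`` is illustrative, and the ``yaml``
+optional dependency is assumed to be installed):
+
+.. code-block:: python
+
+   import yaml
+
+   from data_collections_api.schemas.base import base_schema
+
+   with open("record.yaml", encoding="utf-8") as file:
+       record = yaml.safe_load(file)
+
+   # validate() raises schema.SchemaError if the metadata does not match,
+   # and returns the validated data with any defaults filled in.
+   validated = base_schema.validate(record)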
+
+Each community may have its own metadata requirements, which are all encompassed in the
+``custom_fields.dsmd`` field. The full list of supported schemas is available in the
+:doc:`schemas/index` section.
diff --git a/docs/source/schemas/base.md b/docs/source/schemas/base.md
new file mode 100644
index 0000000..1183cdc
--- /dev/null
+++ b/docs/source/schemas/base.md
@@ -0,0 +1,44 @@
+# base
+
+Base schema from which community specific schemas are built.
+
+### Type: `object`
+
+> ⚠️ Additional properties are not allowed.
+
+| Property | Type | Required | Possible values | Default | Description |
+| -------- | ---- | -------- | --------------- | ------- | ----------- |
+| custom_fields | `object` | ✅ | object | | Block for custom data. |
+| custom_fields.dsmd | `array` | ✅ | object | | Domain specific metadata (dsmd). |
+| metadata | `object` | ✅ | object | | Resource metadata. |
+| metadata.title | `string` | ✅ | string | | Title of resource. |
+| metadata.description | `string` | ✅ | string | | Summary of resource. |
+| metadata.creators | `array` | ✅ | object | | List of creators. |
+| metadata.creators[].affiliations | `array` | | object | | Member affiliations. |
+| metadata.creators[].affiliations[].name | `string` | ✅ | string | | Name of institution. |
+| metadata.creators[].person_or_org | `object` | ✅ | object | | Person or organisation. |
+| metadata.creators[].person_or_org.name | `string` | | string | | Full set of given names. |
+| metadata.creators[].person_or_org.family_name | `string` | | string | | Family name(s). |
+| metadata.creators[].person_or_org.given_name | `string` | | string | | Given name(s). |
+| metadata.creators[].person_or_org.identifiers | `array` | | object and/or object | | ORCIDs or other IDs. |
+| metadata.creators[].person_or_org.type | `const` | ✅ | `personal` | | Personal or organisation. |
+| metadata.rights | `array` | ✅ | object | | Rights or license. |
+| metadata.rights[].id | `const` | ✅ | `cc-by-4.0` | | ID of rights or license. |
+| metadata.resource_type | `object` | ✅ | object | | Type of resource. |
+| metadata.resource_type.id | `const` | ✅ | `model` | | Resource class. |
+| metadata.subjects | `array` | | object | `[]` | List of keywords defining subjects resource covers. |
+| metadata.subjects[].subject | `string` | ✅ | string | | Subject keyword. |
+| metadata.version | `string` | ✅ | [`^v\d+(\.\d+)*`](https://regex101.com/?regex=%5Ev%5Cd%2B%28%5C.%5Cd%2B%29%2A) | | Current version of resource. |
+| metadata.publisher | `string` | | string | | Publisher of resource. |
+| metadata.publication_date | `None` | | None | | Date of publication of resource. |
+| metadata.identifiers | `array` | | object and/or object | | Resource identifiers such as ORCID or DOI. |
+| access | `object` | | object | `{"files": "public", "record": "public"}` | Accessibility of data outside of owners. |
+| access.embargo | `object` | | object | | Details of resource embargo. |
+| access.embargo.active | `boolean` | ✅ | boolean | | Whether resource is under embargo. |
+| access.embargo.reason | `string` or `null` | ✅ | string | | Cause for embargo. |
+| access.files | `None` | | `public` `private` | `"public"` | Accessibility to individual files. |
+| access.record | `None` | | `public` `private` | `"public"` | Accessibility to record as a whole. |
+| access.status | `None` | | `open` `closed` | | Current status of resource. |
+| files | `object` | | object | | Details of files. |
+| files.enabled | `boolean` | ✅ | boolean | | Whether file is enabled. |
+| community | `string` | | [`[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}`](https://regex101.com/?regex=%5B0-9a-f%5D%7B8%7D-%28%5B0-9a-f%5D%7B4%7D-%29%7B3%7D%5B0-9a-f%5D%7B12%7D) | | UUID of community associated with resource. |
diff --git a/docs/source/schemas/index.rst b/docs/source/schemas/index.rst
new file mode 100644
index 0000000..4e27baf
--- /dev/null
+++ b/docs/source/schemas/index.rst
@@ -0,0 +1,11 @@
+Schemas
+=======
+
+This page documents the available schemas.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Schemas:
+
+   base
+
diff --git a/docs/source/scripts/__init__.py b/docs/source/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docs/source/scripts/schema_gen.py b/docs/source/scripts/schema_gen.py
new file mode 100644
index 0000000..747ef90
--- /dev/null
+++ b/docs/source/scripts/schema_gen.py
@@ -0,0 +1,245 @@
+"""Generate schema documentation."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from shutil import rmtree
+from textwrap import indent
+from typing import TYPE_CHECKING
+
+import jsonschema_markdown
+
+from data_collections_api.schemas import SCHEMAS, Schema, get_schema
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+__author__ = "Jacob Wilkins"
+__version__ = "0.1"
+
+INDEX_MD = """\
+{filename}
+{underline}
+
+This page documents the available schemas.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Schemas:
+
+{schemas}
+
+"""
+
+
+def get_arg_parser() -> argparse.ArgumentParser:
+    """Get parser for CLI.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        Arg parser.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert a schema to a markdown document.",
+    )
+
+    parser.add_argument("-V", "--version", action="version", version=f"%(prog)s v{__version__}")
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Print while generating schemas",
+    )
+    parser.add_argument(
+        "-F",
+        "--force",
+        action="store_true",
+        help="Force removal of output directory (if not CWD). (default: %(default)s)",
+    )
+    parser.add_argument(
+        "schemas",
+        nargs="*",
+        choices=SCHEMAS.keys() | {"all"},
+        help="Schemas to convert or 'all' if all are to be done. (default: %(default)r)",
+        default="all",
+    )
+
+    parser.add_argument(
+        "--clear",
+        action=argparse.BooleanOptionalAction,
+        help="Clear folder before writing. (default: %(default)s)",
+        default=True,
+    )
+    parser.add_argument(
+        "--index",
+        action=argparse.BooleanOptionalAction,
+        help="Write index file with toctree to folder. (default: %(default)s)",
+        default=True,
+    )
+
+    parser.add_argument(
+        "--header",
+        help="Title of index file. (default: %(default)r)",
+        default="Schemas",
+    )
+
+    parser.add_argument(
+        "-O",
+        "--out-name",
+        help=(
+            "Format to use for naming output, "
+            "substituting '%%s' for schema key. (default: %(default)r)"
+        ),
+        default="%s.md",
+    )
+    parser.add_argument(
+        "-o",
+        "--out-folder",
+        help="Folder to write formatted docs in. (default: %(default)r)",
+        default="schemas",
+        type=Path,
+    )
+
+    return parser
+
+
+def process_schema(
+    schema_key: Schema | str,
+    *,
+    name: str | None = None,
+) -> str:
+    """Process a schema into markdown.
+
+    Parameters
+    ----------
+    schema_key : Schema or str
+        Key for schemas.
+    name : str, optional
+        Override for name (mandatory if passing a :class:`Schema` directly).
+
+    Returns
+    -------
+    str
+        Markdown rendered documentation.
+
+    Raises
+    ------
+    ValueError
+        Name not passed with Schema.
+    """
+    match (schema_key, name):
+        case (_, str() as inp):
+            name = inp
+        case (str() as inp, _):
+            name = inp
+        case _:
+            raise ValueError(f"Cannot reliably determine name from {type(schema_key).__name__}")
+
+    schema = get_schema(schema_key)
+    json_schema = schema.json_schema(name)
+
+    return jsonschema_markdown.generate(
+        json_schema,
+        title=name,
+        footer=False,
+        hide_empty_columns=True,
+    )
+
+
+def get_filename(fmt: str, key: str) -> str:
+    """Format filename from CLI.
+
+    Parameters
+    ----------
+    fmt : str
+        CLI format.
+    key : str
+        Schema key.
+
+    Returns
+    -------
+    str
+        Formatted filename.
+
+    Examples
+    --------
+    >>> get_filename("%s.md", "base")
+    'base.md'
+    """
+    return fmt % key
+
+
+def main(args_in: Sequence[str] | None = None, /) -> None:
+    """Parse schemas and dump to file.
+
+    Parameters
+    ----------
+    args_in : Sequence[str], optional
+        Pass CLI params directly.
+    """
+    parser = get_arg_parser()
+    args = parser.parse_args(args_in)
+
+    # Get unique (by schema), but ordered keys matching reqs
+    schemas = {
+        schema: key
+        for key, schema in reversed(SCHEMAS.items())
+        if "all" in args.schemas or key in args.schemas
+    }
+    out_names = [get_filename(args.out_name, key) for key in schemas.values()]
+
+    if args.verbose:
+        print(f"Generating schemas for keys {', '.join(map(repr, schemas.values()))}...")
+
+    if args.clear and args.out_folder.exists() and not args.out_folder.samefile(Path.cwd()):
+        if (
+            not args.force
+            and input(
+                f"Running this will clear {args.out_folder},"
+                " are you sure you want to continue? [y/N] "
+            )
+            .strip()
+            .lower()
+            != "y"
+        ):
+            print("Cancelling.")
+            return
+
+        if args.verbose:
+            print(f"Deleting {args.out_folder}...")
+
+        rmtree(args.out_folder, ignore_errors=True)
+        args.out_folder.mkdir()
+
+    for key, out_name in zip(schemas.values(), out_names, strict=True):
+        out_path = args.out_folder / out_name
+
+        if args.verbose:
+            print(f"Generating schema for {key!r} to {out_path}...")
+
+        markdown = process_schema(key)
+
+        with out_path.open("w", encoding="utf-8") as out:
+            out.write(markdown)
+
+    if args.index:
+        if args.verbose:
+            print(f"Writing index to {args.out_folder / 'index.rst'}...")
+
+        with (args.out_folder / "index.rst").open("w", encoding="utf-8") as out:
+            out.write(
+                INDEX_MD.format(
+                    filename=args.header,
+                    underline="=" * len(args.header),
+                    schemas=indent("\n".join(Path(key).stem for key in out_names), " " * 3),
+                )
+            )
+
+    if args.verbose:
+        print("Done with schemas")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 8326679..7c1d863 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ authors = [
     { name = "Jas Kalayan" },
     { name = "Alin M. Elena" },
Elena" }, ] -requires-python = ">=3.10" +requires-python = ">=3.11" classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.10", @@ -37,10 +37,12 @@ dependencies = [ ruamel = ["ruamel.yaml>=0.17.22"] yaml = ["pyYAML>=3.13"] docs = [ - "sphinx>=0.13.1", + "sphinx>=8.2", "sphinxcontrib-contentui<1.0.0,>=0.2.5", "furo==2025.9.25", "numpydoc>=1.9.0", + "myst-parser", + "jsonschema-markdown", ] lint = ["pre-commit<5.0.0,>=4.2.0", "ruff==0.13.3", "numpydoc>=0.19.0"] test = ["pytest==8.3.4", "pytest-cov==5.0.0"] From ecc5844bb17e41d938f196a2edf19646b0911783 Mon Sep 17 00:00:00 2001 From: Jacob Wilkins Date: Wed, 21 Jan 2026 14:27:18 +0000 Subject: [PATCH 2/3] Merge in documentation written in improve-docs --- .../cli/data_collections_main.py | 2 +- docs/source/cli.rst | 239 +++++++++++++----- docs/source/index.rst | 2 +- docs/source/scripts/__init__.py | 1 + 4 files changed, 181 insertions(+), 63 deletions(-) diff --git a/data_collections_api/cli/data_collections_main.py b/data_collections_api/cli/data_collections_main.py index eb6b063..61bf82b 100644 --- a/data_collections_api/cli/data_collections_main.py +++ b/data_collections_api/cli/data_collections_main.py @@ -69,7 +69,7 @@ def get_arg_parser() -> argparse.ArgumentParser: "-f", "--format", choices=("json", "yaml"), - help="Parse FILE as this type (default: determine from suffix).", + help="Dump FILE as this type (default: determine from suffix).", default=None, ) sp.set_defaults(func=dump_example) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 01a8fbe..b04f3b8 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -1,107 +1,224 @@ CLI Usage ========= -``data_collections_api`` provides a few commandline tools for -simplifying the process of uploading or verifying data. +``data_collections_api`` provides a few command-line tools for simplifying the process of uploading +or verifying data and metadata. data_collections ---------------- -.. code-block:: text +.. program:: data_collections +.. describe:: data_collections - usage: data_collections [-h] [-V] {validate,template,dump,upload} ... + .. option:: operation {validate,template,dump,upload} - Single-utility API for data handling with remote depositories. + .. option:: validate - positional arguments: - {validate,template,dump,upload} - validate Validate metadata - template (dump) Dump a template file. - upload Upload a dataset to an invenio repository. + Validate metadata - options: - -h, --help show this help message and exit - -V, --version show program's version number and exit + .. option:: template + .. option:: dump -``data_collections`` is the general top-level interface to the -tools. These tools are implemented as sub-parsers within the main -module. + Dump a template file. + .. option:: `upload` + + Upload a dataset to an invenio repository. + + .. option:: -V, --version + + Show program's version number and exit. + +``data_collections`` is the general top-level interface to the tools. These tools are implemented as +sub-parsers within the main module. + +.. admonition:: Running ``data_collections`` + + By default, if the ``data_collections_api`` package is installed, ``data_collections`` is + installed as an executable script on your main ``PATH``. In general, this is the main entry + point. 
+
+   If that is not desired, it is possible to run ``data_collections`` through the Python module
+   system::
+
+      python -m data_collections_api
+
+   where the ``data_collections_api`` **module** (folder) is on the current ``sys.path`` (by being
+   installed, on the current ``PYTHONPATH``, or in the current working directory)::
+
+      PYTHONPATH=/path/containing/data_collections_api python -m data_collections_api
+
+   Throughout the rest of this page, we will assume ``data_collections`` is used as the main
+   entry point.
+
+
+.. _upload:
 
 upload
 ******
 
-Construct a set of data and upload a set of files along with the metadata to an
-Invenio repository. This is an alias for ``upload_record``.
+.. program:: data_collections upload
+.. describe:: data_collections upload
+
+   .. option:: --api-url URL
+
+      URL for the API associated with the Invenio repository, e.g.
+      https://data-collections-staging.psdi.ac.uk/api
+
+   .. option:: --api-key str
+
+      Your API key/token for accessing the Invenio repository instance.
+
+   .. option:: --metadata-path file
+
+      File path to the yaml file containing the metadata to upload a record to an Invenio
+      repository, e.g. path/to/files/record.yaml
+
+   .. option:: -f {json,yaml}, --metadata-format {json,yaml}
+
+      Parse metadata file as this type (default: yaml).
+
+   .. option:: --files FILES [FILES ...]
+
+      List of file paths associated with the record to be uploaded, e.g. path/to/files/data.*
+
+   .. option:: --community str
+
+      Name of an Invenio repository community to upload the record to, e.g. biosimdb,
+      data-to-knowledge, etc.
+
+``data_collections_api`` can take your data and metadata and automatically upload it to the Invenio
+repository. To do so, you need to have some information at hand:
+
+- The URL of the repository you wish to upload the data to. In the case of PSDI data, this will
+  often be https://data-collections.psdi.ac.uk.
+- Your API key (also called a Personal Access Token or PAT) for the repository to give permissions
+  to write and upload data.
+- A metadata file detailing the data relating to the files (see :doc:`schemas/index`).
+- The files ready to upload.
+
+With all this prepared, uploading the data is as simple as:
+
+.. code-block:: console
+
+   data_collections upload --api-url https://data-collections.psdi.ac.uk --api-key 1234567890abcdef --metadata-path /path/to/metadata_file.yaml --files FILE1 FILE2 --community my_community
+
+.. note::
+
+   Since this is a common operation, it is also available as the standalone :program:`upload_record`
+   tool.
+
+.. _validate:
 
 validate
 ********
 
-.. code-block:: text
+.. program:: data_collections validate
+
+.. describe:: data_collections validate
+
+   .. option:: FILE
+
+      File to validate.
+
+   .. option:: -f {json,yaml}, --format {json,yaml}
+
+      Parse :option:`FILE` as this type (default: determine from suffix).
+
+Validate the metadata file for a dataset before uploading.
+
+``data_collections_api`` can validate your metadata file against the schema to verify the contents
+of the file match what is required to make a valid upload.
+
+.. note::
+
+   The validator does not verify most data itself; you must ensure that all entries are spelled and
+   written correctly.
+
+To validate a data file, simply run:
 
-   usage: data_collections validate [-h] [-f {json,yaml}] file
+.. code-block:: console
 
-   Validate a metadata file or string.
+   data_collections validate [file]
 
-   positional arguments:
-     file                  File to validate
+For example:
 
-   options:
-     -h, --help            show this help message and exit
-     -f, --format {json,yaml}
-                        Parse FILE as this type (default: determine from
-                        suffix).
+.. code-block:: console
+
+   data_collections validate examples/biosim_record.yaml
 
-Validate that the metadata file for a dataset complies with the schema before uploading. See :doc:`schema` for details on a valid metadata file.
+The file can be in either ``json`` or ``yaml`` format (see :doc:`schema`).
+``data_collections validate`` will attempt to determine the appropriate format from the file
+extension, but this can be specified explicitly with the ``-f`` flag.
+
+.. code-block:: console
+
+   data_collections validate -f json examples/biosim_record.yaml
+
+.. note::
+
+   The above will raise an error since the file is not in ``json`` format.
 
 dump
 ****
 
-.. code-block:: text
+.. program:: data_collections template
+.. describe:: data_collections template
+.. describe:: data_collections dump
 
-   usage: data_collections template [-h] [-f {json,yaml}] file
+   .. option:: FILE
 
-   Dump a file template to file.
+      File to dump.
 
-   positional arguments:
-     file                  File to write
+   .. option:: -f {json,yaml}, --format {json,yaml}
 
-   options:
-     -h, --help            show this help message and exit
-     -f, --format {json,yaml}
-                        Parse FILE as this type (default: determine from
-                        suffix).
+      Dump :option:`FILE` as this type (default: determine from suffix).
 
+``data_collections_api`` provides a method to quick-start building metadata: ``template`` will dump
+an example metadata file for a particular community and data-type (though currently only a basic
+example is available). To do so, simply run:
 
-Dump a template metadata file ready for modification to upload.
+.. code-block:: console
+
+   data_collections dump my_metadata.yaml
+
+You can then edit and modify this template to fill in the data needed.
 
 upload_record
 -------------
 
-.. code-block:: text
+.. program:: upload_record
+.. describe:: upload_record
+
+   .. option:: --api-url URL
+
+      URL for the API associated with the Invenio repository, e.g.
+      https://data-collections-staging.psdi.ac.uk/api
+
+   .. option:: --api-key str
+
+      Your API key/token for accessing the Invenio repository instance.
+
+   .. option:: --metadata-path file
+
+      File path to the yaml file containing the metadata to upload a record to an Invenio
+      repository, e.g. path/to/files/record.yaml
+
+   .. option:: -f {json,yaml}, --metadata-format {json,yaml}
+
+      Parse metadata file as this type (default: yaml).
+
+   .. option:: --files FILES [FILES ...]
- List of file paths associated with the record to be - uploaded, e.g. path/to/files/data.* - --community str Name of a Invenio repository community to upload the record - to, e.g. biosimdb, data-to-knowledge, etc. +One-stop tool to upload a record to the repository, see `upload`_. -One-stop tool to upload a record to the repository. This requries that you have already defined your metadata file (see ``dump`` and ``validate``) and got an API key (see: PSDI Invenio docs on how to get this) +.. _pat_guide: ... diff --git a/docs/source/index.rst b/docs/source/index.rst index efaa7aa..78bccf3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -16,4 +16,4 @@ Project to allow simplified editing and construction of Invenio data for the PSD cli schema schemas/index - api/modules + API Documentation diff --git a/docs/source/scripts/__init__.py b/docs/source/scripts/__init__.py index e69de29..14e5b9b 100644 --- a/docs/source/scripts/__init__.py +++ b/docs/source/scripts/__init__.py @@ -0,0 +1 @@ +"""Set of scripts for building docs.""" From ab10f7292abe340c67941a75857fbc86f8e8b87d Mon Sep 17 00:00:00 2001 From: Jacob Wilkins Date: Wed, 21 Jan 2026 14:48:53 +0000 Subject: [PATCH 3/3] Respond to review --- docs/source/cli.rst | 4 +++ docs/source/conf.py | 2 +- docs/source/schemas/index.rst | 1 - docs/source/scripts/schema_gen.py | 58 +++++++++++++++++++++---------- pyproject.toml | 4 +-- 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index b04f3b8..f25f5a1 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -125,6 +125,10 @@ validate Parse :option:`FILE` as this type (default: determine from suffix). + .. option:: -S SCHEMA, --schema SCHEMA + + Validate against the given schema (default: :doc:`base`) + Validate the metadata file for a dataset before uploading. ``data_collections_api`` can validate your metadata file against the schema to verify the contents diff --git a/docs/source/conf.py b/docs/source/conf.py index 85566c9..e2c0d2f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -32,7 +32,7 @@ extensions = [ "numpydoc", "sphinx.ext.autodoc", - "sphinx.ext.apidoc", + # "sphinx.ext.apidoc", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.mathjax", diff --git a/docs/source/schemas/index.rst b/docs/source/schemas/index.rst index 4e27baf..abff4eb 100644 --- a/docs/source/schemas/index.rst +++ b/docs/source/schemas/index.rst @@ -8,4 +8,3 @@ This page documents the available schemas. :caption: Schemas: base - diff --git a/docs/source/scripts/schema_gen.py b/docs/source/scripts/schema_gen.py index 747ef90..00bceed 100644 --- a/docs/source/scripts/schema_gen.py +++ b/docs/source/scripts/schema_gen.py @@ -5,6 +5,7 @@ import argparse from pathlib import Path from shutil import rmtree +import sys from textwrap import indent from typing import TYPE_CHECKING @@ -171,6 +172,42 @@ def get_filename(fmt: str, key: str) -> str: return fmt % key +def clear_folder(folder: Path, *, force: bool = False, verbose: bool = False) -> None: + """Delete folder and create new (empty) one. + + Parameters + ---------- + folder : Path + Folder to clear. + force : bool + Do not ask whether to remove folder. + verbose : bool + Print status. + """ + if not folder.exists(): + return + + if folder.samefile(Path.cwd()): + print("Cannot clear folder as this is current working directory.") + return + + if ( + not force + and input(f"Running this will clear {folder}, are you sure you want to continue? 
[y/N] ") + .strip() + .lower() + != "y" + ): + print("Cancelling.") + sys.exit() + + if verbose: + print(f"Deleting {folder}...") + + rmtree(folder, ignore_errors=True) + folder.mkdir() + + def main(args_in: Sequence[str] | None = None, /) -> None: """Parse schemas and dump to file. @@ -193,25 +230,8 @@ def main(args_in: Sequence[str] | None = None, /) -> None: if args.verbose: print(f"Generating schemas for keys {', '.join(map(repr, schemas.values()))}...") - if args.clear and args.out_folder.exists() and not args.out_folder.samefile(Path.cwd()): - if ( - not args.force - and input( - f"Running this will clear {args.out_folder}," - " are you sure you want to continue? [y/N] " - ) - .strip() - .lower() - != "y" - ): - print("Cancelling.") - return - - if args.verbose: - print(f"Deleting {args.out_folder}...") - - rmtree(args.out_folder, ignore_errors=True) - args.out_folder.mkdir() + if args.clear: + clear_folder(args.out_folder, force=args.force, verbose=args.verbose) for key, out_name in zip(schemas.values(), out_names, strict=True): out_path = args.out_folder / out_name diff --git a/pyproject.toml b/pyproject.toml index 7c1d863..6553353 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ { name = "Jas Kalayan" }, { name = "Alin M. Elena" }, ] -requires-python = ">=3.11" +requires-python = ">=3.10" classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.10", @@ -37,7 +37,7 @@ dependencies = [ ruamel = ["ruamel.yaml>=0.17.22"] yaml = ["pyYAML>=3.13"] docs = [ - "sphinx>=8.2", + "sphinx>=0.13.1", "sphinxcontrib-contentui<1.0.0,>=0.2.5", "furo==2025.9.25", "numpydoc>=1.9.0",