From bd2e891928347a346c69be65049a72eb3317004b Mon Sep 17 00:00:00 2001
From: Jacob Wilkins
Date: Tue, 20 Jan 2026 09:00:16 +0000
Subject: [PATCH 1/3] Add initial docs

---
 data_collections_api/schemas/base.py          | 102 +++++---
 docs/source/api/data_collections_api.cli.rst  |  29 +++
 docs/source/api/data_collections_api.rst      |   9 +-
 .../api/data_collections_api.schemas.rst      |  21 ++
 docs/source/api/modules.rst                   |   4 +-
 docs/source/cli.rst                           |  79 +++++-
 docs/source/conf.py                           |  23 +-
 docs/source/index.rst                         |   2 +
 docs/source/schema.rst                        |   6 +
 docs/source/schemas/base.md                   |  44 ++++
 docs/source/schemas/index.rst                 |  11 +
 docs/source/scripts/__init__.py               |   0
 docs/source/scripts/schema_gen.py             | 245 ++++++++++++++++++
 pyproject.toml                                |   6 +-
 14 files changed, 530 insertions(+), 51 deletions(-)
 create mode 100644 docs/source/api/data_collections_api.cli.rst
 create mode 100644 docs/source/api/data_collections_api.schemas.rst
 create mode 100644 docs/source/schema.rst
 create mode 100644 docs/source/schemas/base.md
 create mode 100644 docs/source/schemas/index.rst
 create mode 100644 docs/source/scripts/__init__.py
 create mode 100644 docs/source/scripts/schema_gen.py

diff --git a/data_collections_api/schemas/base.py b/data_collections_api/schemas/base.py
index 7c65c55..8556092 100644
--- a/data_collections_api/schemas/base.py
+++ b/data_collections_api/schemas/base.py
@@ -4,35 +4,40 @@
 from datetime import date
 from urllib.parse import urlparse, urlunparse
-from uuid import UUID
 
-from schema import And, Optional, Or, Regex, Schema, Use
+from schema import And, Literal, Optional, Or, Regex, Schema, Use
 
 ORCID_ID_RE = r"(\d{4}-){3}\d{4}"
+UUID_RE = r"[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}"
 
 id_schema = Or(
     {
-        "scheme": "orcid",
-        "identifier": Regex(ORCID_ID_RE),
+        Literal("scheme", description="ID scheme."): "orcid",
+        Literal("identifier", description="An [ORCID](https://orcid.org)."): Regex(ORCID_ID_RE),
     },
     {
-        "identifier": And(Use(urlparse), lambda x: x.scheme and x.netloc, Use(urlunparse)),
-        Optional("scheme", default="doi"): "doi",
+        Optional(Literal("scheme", description="ID scheme."), default="doi"): "doi",
+        Literal("identifier", description="A [DOI](https://www.doi.org)."): And(
+            Use(urlparse), lambda x: x.scheme and x.netloc, Use(urlunparse)
+        ),
     },
 )
 
 creator_schema = Schema(
     {
-        Optional("affiliations"): [
+        Optional(Literal("affiliations", description="Member affiliations.")): [
             {
-                "name": str,
+                Literal("name", description="Name of institution."): str,
             },
         ],
-        "person_or_org": {
-            Or("name", "family_name"): And(str, len),
-            Optional("given_name"): And(str, len),
-            Optional("identifiers"): [id_schema],
-            "type": Or("personal"),
+        Literal("person_or_org", description="Person or organisation."): {
+            Or(
+                Literal("name", description="Full set of given names."),
+                Literal("family_name", description="Family name(s)."),
+            ): And(str, len),
+            Optional(Literal("given_name", description="Given name(s).")): And(str, len),
+            Optional(Literal("identifiers", description="ORCIDs or other IDs.")): [id_schema],
+            Literal("type", description="Personal or organisation."): Or("personal"),
         },
     },
     ignore_extra_keys=True,
@@ -40,39 +45,64 @@
 
 metadata_schema = Schema(
     {
-        "title": And(str, len),
-        "description": And(str, len),
-        "creators": [creator_schema],
-        "rights": [
+        Literal("title", description="Title of resource."): And(str, len),
+        Literal("description", description="Summary of resource."): And(str, len),
+        Literal("creators", description="List of creators."): [creator_schema],
+        Literal("rights", description="Rights or license."): [
             {
-                "id": Or("cc-by-4.0"),
+                Literal("id", description="ID of rights or license."): Or("cc-by-4.0"),
             },
         ],
-        "resource_type": {
-            "id": Or("model"),
+        Literal("resource_type", description="Type of resource."): {
+            Literal("id", description="Resource class."): Or("model"),
         },
-        Optional("subjects", default=[]): [{"subject": str}],
-        "version": Regex(r"^v\d+(\.\d+)*"),
-        Optional("publisher"): str,
-        Optional("publication_date"): Or(date.fromisoformat, date.fromtimestamp),
-        Optional("identifiers"): [id_schema],
+        Optional(
+            Literal("subjects", description="List of keywords defining subjects resource covers."),
+            default=[],
+        ): [{Literal("subject", description="Subject keyword."): str}],
+        Literal("version", description="Current version of resource."): Regex(r"^v\d+(\.\d+)*"),
+        Optional(Literal("publisher", description="Publisher of resource.")): str,
+        Optional(Literal("publication_date", description="Date of publication of resource.")): Or(
+            date.fromisoformat, date.fromtimestamp
+        ),
+        Optional(
+            Literal("identifiers", description="Resource identifiers such as ORCID or DOI.")
+        ): [id_schema],
     },
 )
 
 base_schema = Schema(
     {
-        Optional("access", default={"files": "public", "record": "public"}): {
-            Optional("embargo"): {
-                "active": bool,
-                "reason": Or(str, None),
+        Optional(
+            Literal("access", description="Accessibility of data outside of owners."),
+            default={"files": "public", "record": "public"},
+        ): {
+            Optional(Literal("embargo", description="Details of resource embargo.")): {
+                Literal("active", description="Whether resource is under embargo."): bool,
+                Literal("reason", description="Cause for embargo."): Or(str, None),
             },
-            Optional("files", default="public"): Or("public", "private"),
-            Optional("record", default="public"): Or("public", "private"),
-            Optional("status"): Or("open", "closed"),
+            Optional(
+                Literal("files", description="Accessibility to individual files."), default="public"
+            ): Or("public", "private"),
+            Optional(
+                Literal("record", description="Accessibility to record as a whole."),
+                default="public",
+            ): Or("public", "private"),
+            Optional(Literal("status", description="Current status of resource.")): Or(
+                "open", "closed"
+            ),
        },
-        Optional("files"): {"enabled": bool},
-        "custom_fields": {"dsmd": [dict]},
-        "metadata": metadata_schema,
-        Optional("community"): UUID,
+        Optional(Literal("files", description="Details of files.")): {
+            Literal("enabled", description="Whether file is enabled."): bool
+        },
+        Literal("custom_fields", description="Block for custom data."): {
+            Literal("dsmd", description="Domain specific metadata (dsmd)."): [dict]
+        },
+        Literal("metadata", description="Resource metadata."): metadata_schema,
+        Optional(
+            Literal("community", description="UUID of community associated with resource.")
+        ): Regex(UUID_RE),
     },
+    description="Base schema from which community specific schemas are built.",
+    name="base",
 )
diff --git a/docs/source/api/data_collections_api.cli.rst b/docs/source/api/data_collections_api.cli.rst
new file mode 100644
index 0000000..55eb18d
--- /dev/null
+++ b/docs/source/api/data_collections_api.cli.rst
@@ -0,0 +1,29 @@
+data\_collections\_api.cli package
+==================================
+
+Submodules
+----------
+
+data\_collections\_api.cli.data\_collections\_main module
+---------------------------------------------------------
+
+.. automodule:: data_collections_api.cli.data_collections_main
+   :members:
+   :show-inheritance:
+   :undoc-members:
+
+data\_collections\_api.cli.record\_upload module
+------------------------------------------------
+
+.. automodule:: data_collections_api.cli.record_upload
+   :members:
+   :show-inheritance:
+   :undoc-members:
+
+Module contents
+---------------
+
+.. automodule:: data_collections_api.cli
+   :members:
+   :show-inheritance:
+   :undoc-members:
diff --git a/docs/source/api/data_collections_api.rst b/docs/source/api/data_collections_api.rst
index 1572033..eae62b2 100644
--- a/docs/source/api/data_collections_api.rst
+++ b/docs/source/api/data_collections_api.rst
@@ -8,6 +8,7 @@ Subpackages
    :maxdepth: 4
 
    data_collections_api.cli
+   data_collections_api.schemas
 
 Submodules
 ----------
@@ -36,14 +37,6 @@ data\_collections\_api.metadata module
    :show-inheritance:
    :undoc-members:
 
-data\_collections\_api.schema module
-------------------------------------
-
-.. automodule:: data_collections_api.schema
-   :members:
-   :show-inheritance:
-   :undoc-members:
-
 Module contents
 ---------------
 
diff --git a/docs/source/api/data_collections_api.schemas.rst b/docs/source/api/data_collections_api.schemas.rst
new file mode 100644
index 0000000..05185db
--- /dev/null
+++ b/docs/source/api/data_collections_api.schemas.rst
@@ -0,0 +1,21 @@
+data\_collections\_api.schemas package
+======================================
+
+Submodules
+----------
+
+data\_collections\_api.schemas.base module
+------------------------------------------
+
+.. automodule:: data_collections_api.schemas.base
+   :members:
+   :show-inheritance:
+   :undoc-members:
+
+Module contents
+---------------
+
+.. automodule:: data_collections_api.schemas
+   :members:
+   :show-inheritance:
+   :undoc-members:
diff --git a/docs/source/api/modules.rst b/docs/source/api/modules.rst
index 19f1e46..4ce85db 100644
--- a/docs/source/api/modules.rst
+++ b/docs/source/api/modules.rst
@@ -1,5 +1,5 @@
-API Documentation
-=================
+data_collections_api
+====================
 
 .. toctree::
    :maxdepth: 4
diff --git a/docs/source/cli.rst b/docs/source/cli.rst
index 4f77bc0..01a8fbe 100644
--- a/docs/source/cli.rst
+++ b/docs/source/cli.rst
@@ -7,26 +7,101 @@ simplifying the process of uploading or verifying data.
 
 data_collections
 ----------------
 
+.. code-block:: text
+
+   usage: data_collections [-h] [-V] {validate,template,dump,upload} ...
+
+   Single-utility API for data handling with remote depositories.
+
+   positional arguments:
+     {validate,template,dump,upload}
+       validate            Validate metadata
+       template (dump)     Dump a template file.
+       upload              Upload a dataset to an invenio repository.
+
+   options:
+     -h, --help            show this help message and exit
+     -V, --version         show program's version number and exit
+
 ``data_collections`` is the general top-level interface to the
 tools. These tools are implemented as sub-parsers within the main
 module.
 
+
 upload
 ******
 
 Construct a set of data and upload a set of files along with the metadata to an
-Invenio repository.
+Invenio repository. This is an alias for ``upload_record``.
 
 validate
 ********
 
-Validate the metadata file for a dataset before uploading.
+.. code-block:: text
+
+   usage: data_collections validate [-h] [-f {json,yaml}] file
+
+   Validate a metadata file or string.
+
+   positional arguments:
+     file                  File to validate
+
+   options:
+     -h, --help            show this help message and exit
+     -f, --format {json,yaml}
+                        Parse FILE as this type (default: determine from
+                        suffix).
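+
+For example, to check a metadata file before uploading (the file name here is
+illustrative):
+
+.. code-block:: console
+
+   data_collections validate my_record.yaml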
+
+Validate that the metadata file for a dataset complies with the schema before uploading. See :doc:`schema` for details on a valid metadata file.
 
 dump
 ****
 
+.. code-block:: text
+
+   usage: data_collections template [-h] [-f {json,yaml}] file
+
+   Dump a file template to file.
+
+   positional arguments:
+     file                  File to write
+
+   options:
+     -h, --help            show this help message and exit
+     -f, --format {json,yaml}
+                        Parse FILE as this type (default: determine from
+                        suffix).
+
 
 Dump a template metadata file ready for modification to upload.
 
 upload_record
 -------------
+
+.. code-block:: text
+
+   usage: upload_record [-h] --api-url URL --api-key str --metadata-path file
+                        [-f {json,yaml}] [--files FILES [FILES ...]] [--community str]
+
+   Upload records to Invenio repository
+
+   options:
+     -h, --help            show this help message and exit
+     --api-url URL         URL for the API associated with the Invenio repository, e.g.
+                           https://data-collections-staging.psdi.ac.uk/api
+     --api-key str         Your API key/token for accessing the Invenio repository
+                           instance.
+     --metadata-path file  File path to the yaml file containing the metadata to upload
+                           a record to an Invenio repository, e.g.
+                           path/to/files/record.yaml
+     -f {json,yaml}, --metadata-format {json,yaml}
+                           Parse metadata file as this type (default: yaml).
+     --files FILES [FILES ...]
+                           List of file paths associated with the record to be
+                           uploaded, e.g. path/to/files/data.*
+     --community str       Name of an Invenio repository community to upload the record
+                           to, e.g. biosimdb, data-to-knowledge, etc.
+
+
+One-stop tool to upload a record to the repository. This requires that you have already defined your metadata file (see ``dump`` and ``validate``) and obtained an API key (see the PSDI Invenio docs for how to get one).
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 829180d..85566c9 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -8,8 +8,16 @@
 from __future__ import annotations
 
 import time
+import sys
+from pathlib import Path
+
+DOCS_SRC = Path(__file__).parent.resolve()
+sys.path.append(str(DOCS_SRC.parents[1]))
+sys.path.append(str(DOCS_SRC))
 
 import data_collections_api
+from scripts.schema_gen import main as gen_schema
+
 
 project = "Data Collections API"
 copyright_first_year = "2024"
@@ -24,13 +32,27 @@
 extensions = [
     "numpydoc",
     "sphinx.ext.autodoc",
+    "sphinx.ext.apidoc",
     "sphinx.ext.autosummary",
     "sphinx.ext.intersphinx",
     "sphinx.ext.mathjax",
     "sphinx.ext.viewcode",
     "sphinxcontrib.contentui",
+    "myst_parser",
+]
+
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".txt": "markdown",
+    ".md": "markdown",
+}
+
+apidoc_modules = [
+    {"path": "../../data_collections_api", "destination": "api/"},
 ]
+
+gen_schema(["-Fv", f"-o={DOCS_SRC / 'schemas'}", "-O=%s.md", "all"])
 
 always_use_bars_union = True
 napoleon_include_special_with_doc = True
 napoleon_use_param = True
@@ -45,7 +67,6 @@
     "python": ("https://docs.python.org/3", None),
 }
 
-
 templates_path = ["_templates"]
 exclude_patterns = []
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 766a609..efaa7aa 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -14,4 +14,6 @@ Project to allow simplified editing and construction of Invenio data for the PSD
    :caption: Contents:
 
    cli
+   schema
+   schemas/index
    api/modules
diff --git a/docs/source/schema.rst b/docs/source/schema.rst
new file mode 100644
index 0000000..2afa57e
--- /dev/null
+++ b/docs/source/schema.rst
@@ -0,0 +1,6 @@
+Metadata Format
+===============
+
+The metadata file may be in either `JSON <https://www.json.org>`__ or `YAML <https://yaml.org>`__ format.
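+
+As a quick sketch of what validation involves under the hood, the snippet below
+loads a metadata file and checks it against the base schema shipped with the
+package (the file name ``record.yaml`` is illustrative, and the ``yaml``
+optional dependency is assumed to be installed):
+
+.. code-block:: python
+
+   import yaml
+
+   from data_collections_api.schemas.base import base_schema
+
+   with open("record.yaml", encoding="utf-8") as file:
+       record = yaml.safe_load(file)
+
+   # validate() raises schema.SchemaError if the metadata does not match,
+   # and returns the validated data with any defaults filled in.
+   validated = base_schema.validate(record)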
+
+Each community may have its own metadata requirements, which are all encompassed in the
+``custom_fields.dsmd`` field. The full list of supported schemas is available in the
+:doc:`schemas/index` section.
diff --git a/docs/source/schemas/base.md b/docs/source/schemas/base.md
new file mode 100644
index 0000000..1183cdc
--- /dev/null
+++ b/docs/source/schemas/base.md
@@ -0,0 +1,44 @@
+# base
+
+Base schema from which community specific schemas are built.
+
+### Type: `object`
+
+> ⚠️ Additional properties are not allowed.
+
+| Property | Type | Required | Possible values | Default | Description |
+| -------- | ---- | -------- | --------------- | ------- | ----------- |
+| custom_fields | `object` | ✅ | object | | Block for custom data. |
+| custom_fields.dsmd | `array` | ✅ | object | | Domain specific metadata (dsmd). |
+| metadata | `object` | ✅ | object | | Resource metadata. |
+| metadata.title | `string` | ✅ | string | | Title of resource. |
+| metadata.description | `string` | ✅ | string | | Summary of resource. |
+| metadata.creators | `array` | ✅ | object | | List of creators. |
+| metadata.creators[].affiliations | `array` | | object | | Member affiliations. |
+| metadata.creators[].affiliations[].name | `string` | ✅ | string | | Name of institution. |
+| metadata.creators[].person_or_org | `object` | ✅ | object | | Person or organisation. |
+| metadata.creators[].person_or_org.name | `string` | | string | | Full set of given names. |
+| metadata.creators[].person_or_org.family_name | `string` | | string | | Family name(s). |
+| metadata.creators[].person_or_org.given_name | `string` | | string | | Given name(s). |
+| metadata.creators[].person_or_org.identifiers | `array` | | object and/or object | | ORCIDs or other IDs. |
+| metadata.creators[].person_or_org.type | `const` | ✅ | `personal` | | Personal or organisation. |
+| metadata.rights | `array` | ✅ | object | | Rights or license. |
+| metadata.rights[].id | `const` | ✅ | `cc-by-4.0` | | ID of rights or license. |
+| metadata.resource_type | `object` | ✅ | object | | Type of resource. |
+| metadata.resource_type.id | `const` | ✅ | `model` | | Resource class. |
+| metadata.subjects | `array` | | object | `[]` | List of keywords defining subjects resource covers. |
+| metadata.subjects[].subject | `string` | ✅ | string | | Subject keyword. |
+| metadata.version | `string` | ✅ | [`^v\d+(\.\d+)*`](https://regex101.com/?regex=%5Ev%5Cd%2B%28%5C.%5Cd%2B%29%2A) | | Current version of resource. |
+| metadata.publisher | `string` | | string | | Publisher of resource. |
+| metadata.publication_date | `None` | | None | | Date of publication of resource. |
+| metadata.identifiers | `array` | | object and/or object | | Resource identifiers such as ORCID or DOI. |
+| access | `object` | | object | `{"files": "public", "record": "public"}` | Accessibility of data outside of owners. |
+| access.embargo | `object` | | object | | Details of resource embargo. |
+| access.embargo.active | `boolean` | ✅ | boolean | | Whether resource is under embargo. |
+| access.embargo.reason | `string` or `null` | ✅ | string | | Cause for embargo. |
+| access.files | `None` | | `public` `private` | `"public"` | Accessibility to individual files. |
+| access.record | `None` | | `public` `private` | `"public"` | Accessibility to record as a whole. |
+| access.status | `None` | | `open` `closed` | | Current status of resource. |
+| files | `object` | | object | | Details of files. |
+| files.enabled | `boolean` | ✅ | boolean | | Whether file is enabled. |
+| community | `string` | | [`[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}`](https://regex101.com/?regex=%5B0-9a-f%5D%7B8%7D-%28%5B0-9a-f%5D%7B4%7D-%29%7B3%7D%5B0-9a-f%5D%7B12%7D) | | UUID of community associated with resource. |
diff --git a/docs/source/schemas/index.rst b/docs/source/schemas/index.rst
new file mode 100644
index 0000000..4e27baf
--- /dev/null
+++ b/docs/source/schemas/index.rst
@@ -0,0 +1,11 @@
+Schemas
+=======
+
+This page documents the available schemas.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Schemas:
+
+   base
+
diff --git a/docs/source/scripts/__init__.py b/docs/source/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docs/source/scripts/schema_gen.py b/docs/source/scripts/schema_gen.py
new file mode 100644
index 0000000..747ef90
--- /dev/null
+++ b/docs/source/scripts/schema_gen.py
@@ -0,0 +1,245 @@
+"""Generate schema documentation."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from shutil import rmtree
+from textwrap import indent
+from typing import TYPE_CHECKING
+
+import jsonschema_markdown
+
+from data_collections_api.schemas import SCHEMAS, Schema, get_schema
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+__author__ = "Jacob Wilkins"
+__version__ = "0.1"
+
+INDEX_MD = """\
+{filename}
+{underline}
+
+This page documents the available schemas.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Schemas:
+
+{schemas}
+
+"""
+
+
+def get_arg_parser() -> argparse.ArgumentParser:
+    """Get parser for CLI.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        Arg parser.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert a schema to a markdown document.",
+    )
+
+    parser.add_argument("-V", "--version", action="version", version=f"%(prog)s v{__version__}")
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Print while generating schemas",
+    )
+    parser.add_argument(
+        "-F",
+        "--force",
+        action="store_true",
+        help="Force removal of output directory (if not CWD). (default: %(default)s)",
+    )
+    parser.add_argument(
+        "schemas",
+        nargs="*",
+        choices=SCHEMAS.keys() | {"all"},
+        help="Schemas to convert or 'all' if all are to be done. (default: %(default)r)",
+        default="all",
+    )
+
+    parser.add_argument(
+        "--clear",
+        action=argparse.BooleanOptionalAction,
+        help="Clear folder before writing. (default: %(default)s)",
+        default=True,
+    )
+    parser.add_argument(
+        "--index",
+        action=argparse.BooleanOptionalAction,
+        help="Write index file with toctree to folder. (default: %(default)s)",
+        default=True,
+    )
+
+    parser.add_argument(
+        "--header",
+        help="Title of index file. (default: %(default)r)",
+        default="Schemas",
+    )
+
+    parser.add_argument(
+        "-O",
+        "--out-name",
+        help=(
+            "Format to use for naming output, "
+            "substituting '%%s' for schema key. (default: %(default)r)"
+        ),
+        default="%s.md",
+    )
+    parser.add_argument(
+        "-o",
+        "--out-folder",
+        help="Folder to write formatted docs in. (default: %(default)r)",
+        default="schemas",
+        type=Path,
+    )
+
+    return parser
+
+
+def process_schema(
+    schema_key: Schema | str,
+    *,
+    name: str | None = None,
+) -> str:
+    """Process a schema into markdown.
+
+    Parameters
+    ----------
+    schema_key : Schema or str
+        Key for schemas.
+    name : str, optional
+        Override for name (mandatory if passing a :class:`Schema` directly).
+
+    Returns
+    -------
+    str
+        Markdown rendered documentation.
+
+    Raises
+    ------
+    ValueError
+        Name not passed with Schema.
+    """
+    match (schema_key, name):
+        case (_, str() as inp):
+            name = inp
+        case (str() as inp, _):
+            name = inp
+        case _:
+            raise ValueError(f"Cannot reliably determine name from {type(schema_key).__name__}")
+
+    schema = get_schema(schema_key)
+    json_schema = schema.json_schema(name)
+
+    return jsonschema_markdown.generate(
+        json_schema,
+        title=name,
+        footer=False,
+        hide_empty_columns=True,
+    )
+
+
+def get_filename(fmt: str, key: str) -> str:
+    """Format filename from CLI.
+
+    Parameters
+    ----------
+    fmt : str
+        CLI format.
+    key : str
+        Schema key.
+
+    Returns
+    -------
+    str
+        Formatted filename.
+
+    Examples
+    --------
+    >>> get_filename("%s.md", "base")
+    'base.md'
+    """
+    return fmt % key
+
+
+def main(args_in: Sequence[str] | None = None, /) -> None:
+    """Parse schemas and dump to file.
+
+    Parameters
+    ----------
+    args_in : Sequence[str], optional
+        Pass CLI params directly.
+    """
+    parser = get_arg_parser()
+    args = parser.parse_args(args_in)
+
+    # Get unique (by schema), but ordered keys matching reqs
+    schemas = {
+        schema: key
+        for key, schema in reversed(SCHEMAS.items())
+        if "all" in args.schemas or key in args.schemas
+    }
+    out_names = [get_filename(args.out_name, key) for key in schemas.values()]
+
+    if args.verbose:
+        print(f"Generating schemas for keys {', '.join(map(repr, schemas.values()))}...")
+
+    if args.clear and args.out_folder.exists() and not args.out_folder.samefile(Path.cwd()):
+        if (
+            not args.force
+            and input(
+                f"Running this will clear {args.out_folder},"
+                " are you sure you want to continue? [y/N] "
+            )
+            .strip()
+            .lower()
+            != "y"
+        ):
+            print("Cancelling.")
+            return
+
+        if args.verbose:
+            print(f"Deleting {args.out_folder}...")
+
+        rmtree(args.out_folder, ignore_errors=True)
+        args.out_folder.mkdir()
+
+    for key, out_name in zip(schemas.values(), out_names, strict=True):
+        out_path = args.out_folder / out_name
+
+        if args.verbose:
+            print(f"Generating schema for {key!r} to {out_path}...")
+
+        markdown = process_schema(key)
+
+        with out_path.open("w", encoding="utf-8") as out:
+            out.write(markdown)
+
+    if args.index:
+        if args.verbose:
+            print(f"Writing index to {args.out_folder / 'index.rst'}...")
+
+        with (args.out_folder / "index.rst").open("w", encoding="utf-8") as out:
+            out.write(
+                INDEX_MD.format(
+                    filename=args.header,
+                    underline="=" * len(args.header),
+                    schemas=indent("\n".join(Path(key).stem for key in out_names), " " * 3),
+                )
+            )
+
+    if args.verbose:
+        print("Done with schemas")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 8326679..7c1d863 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ authors = [
     { name = "Jas Kalayan" },
     { name = "Alin M. Elena" },
Elena" }, ] -requires-python = ">=3.10" +requires-python = ">=3.11" classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.10", @@ -37,10 +37,12 @@ dependencies = [ ruamel = ["ruamel.yaml>=0.17.22"] yaml = ["pyYAML>=3.13"] docs = [ - "sphinx>=0.13.1", + "sphinx>=8.2", "sphinxcontrib-contentui<1.0.0,>=0.2.5", "furo==2025.9.25", "numpydoc>=1.9.0", + "myst-parser", + "jsonschema-markdown", ] lint = ["pre-commit<5.0.0,>=4.2.0", "ruff==0.13.3", "numpydoc>=0.19.0"] test = ["pytest==8.3.4", "pytest-cov==5.0.0"] From ecc5844bb17e41d938f196a2edf19646b0911783 Mon Sep 17 00:00:00 2001 From: Jacob Wilkins Date: Wed, 21 Jan 2026 14:27:18 +0000 Subject: [PATCH 2/3] Merge in documentation written in improve-docs --- .../cli/data_collections_main.py | 2 +- docs/source/cli.rst | 239 +++++++++++++----- docs/source/index.rst | 2 +- docs/source/scripts/__init__.py | 1 + 4 files changed, 181 insertions(+), 63 deletions(-) diff --git a/data_collections_api/cli/data_collections_main.py b/data_collections_api/cli/data_collections_main.py index eb6b063..61bf82b 100644 --- a/data_collections_api/cli/data_collections_main.py +++ b/data_collections_api/cli/data_collections_main.py @@ -69,7 +69,7 @@ def get_arg_parser() -> argparse.ArgumentParser: "-f", "--format", choices=("json", "yaml"), - help="Parse FILE as this type (default: determine from suffix).", + help="Dump FILE as this type (default: determine from suffix).", default=None, ) sp.set_defaults(func=dump_example) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 01a8fbe..b04f3b8 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -1,107 +1,224 @@ CLI Usage ========= -``data_collections_api`` provides a few commandline tools for -simplifying the process of uploading or verifying data. +``data_collections_api`` provides a few command-line tools for simplifying the process of uploading +or verifying data and metadata. data_collections ---------------- -.. code-block:: text +.. program:: data_collections +.. describe:: data_collections - usage: data_collections [-h] [-V] {validate,template,dump,upload} ... + .. option:: operation {validate,template,dump,upload} - Single-utility API for data handling with remote depositories. + .. option:: validate - positional arguments: - {validate,template,dump,upload} - validate Validate metadata - template (dump) Dump a template file. - upload Upload a dataset to an invenio repository. + Validate metadata - options: - -h, --help show this help message and exit - -V, --version show program's version number and exit + .. option:: template + .. option:: dump -``data_collections`` is the general top-level interface to the -tools. These tools are implemented as sub-parsers within the main -module. + Dump a template file. + .. option:: `upload` + + Upload a dataset to an invenio repository. + + .. option:: -V, --version + + Show program's version number and exit. + +``data_collections`` is the general top-level interface to the tools. These tools are implemented as +sub-parsers within the main module. + +.. admonition:: Running ``data_collections`` + + By default, if the ``data_collections_api`` package is installed, ``data_collections`` is + installed as an executable script on your main ``PATH``. In general, this is the main entry + point. 
+
+   If that is not desired, it is possible to run ``data_collections`` through the Python module
+   system::
+
+      python -m data_collections_api
+
+   where the ``data_collections_api`` **module** (folder) is on the current ``sys.path`` (by being
+   installed, on the current ``PYTHONPATH``, or in the current working directory)::
+
+      PYTHONPATH=/path/containing/data_collections_api python -m data_collections_api
+
+   Throughout the rest of this page, we will assume ``data_collections`` is used as the main
+   entry point.
+
+
+.. _upload:
 
 upload
 ******
 
-Construct a set of data and upload a set of files along with the metadata to an
-Invenio repository. This is an alias for ``upload_record``.
+.. program:: data_collections upload
+.. describe:: data_collections upload
+
+   .. option:: --api-url URL
+
+      URL for the API associated with the Invenio repository, e.g.
+      https://data-collections-staging.psdi.ac.uk/api
+
+   .. option:: --api-key str
+
+      Your API key/token for accessing the Invenio repository instance.
+
+   .. option:: --metadata-path file
+
+      File path to the yaml file containing the metadata to upload a record to an Invenio
+      repository, e.g. path/to/files/record.yaml
+
+   .. option:: -f {json,yaml}, --metadata-format {json,yaml}
+
+      Parse metadata file as this type (default: yaml).
+
+   .. option:: --files FILES [FILES ...]
+
+      List of file paths associated with the record to be uploaded, e.g. path/to/files/data.*
+
+   .. option:: --community str
+
+      Name of an Invenio repository community to upload the record to, e.g. biosimdb,
+      data-to-knowledge, etc.
+
+``data_collections_api`` can take your data and metadata and automatically upload it to the Invenio
+repository. To do so, you need to have some information at hand:
+
+- The URL of the repository you wish to upload the data to. In the case of PSDI data, this will
+  often be https://data-collections.psdi.ac.uk.
+- Your API key (also called a Personal Access Token or PAT) for the repository to give permissions
+  to write and upload data.
+- A metadata file detailing the data relating to the files (see :doc:`schemas/index`).
+- The files ready to upload.
+
+With all this prepared, uploading the data is as simple as:
+
+.. code-block:: console
+
+   data_collections upload --api-url https://data-collections.psdi.ac.uk --api-key 1234567890abcdef --metadata-path /path/to/metadata_file.yaml --files FILE1 FILE2 --community my_community
+
+.. note::
+
+   Since this is a common operation, it is also available as the standalone :program:`upload_record`
+   tool.
+
+.. _validate:
 
 validate
 ********
 
-.. code-block:: text
+.. program:: data_collections validate
+
+.. describe:: data_collections validate
+
+   .. option:: FILE
+
+      File to validate.
+
+   .. option:: -f {json,yaml}, --format {json,yaml}
+
+      Parse :option:`FILE` as this type (default: determine from suffix).
+
+Validate the metadata file for a dataset before uploading.
+
+``data_collections_api`` can validate your metadata file against the schema to verify the contents
+of the file match what is required to make a valid upload.
+
+.. note::
+
+   The validator does not verify most data itself; you must ensure that all entries are spelled and
+   written correctly.
+
+To validate a data file, simply run:
 
-   usage: data_collections validate [-h] [-f {json,yaml}] file
+.. code-block:: console
 
-   Validate a metadata file or string.
+   data_collections validate [file]
 
-   positional arguments:
-     file                  File to validate
+For example:
 
-   options:
-     -h, --help            show this help message and exit
-     -f, --format {json,yaml}
-                        Parse FILE as this type (default: determine from
-                        suffix).
+.. code-block:: console
+
+   data_collections validate examples/biosim_record.yaml
 
-Validate that the metadata file for a dataset complies with the schema before uploading. See :doc:`schema` for details on a valid metadata file.
+The file can be in either ``json`` or ``yaml`` format (see :doc:`schema`).
+``data_collections validate`` will attempt to determine the appropriate format from the file
+extension, but this can be specified explicitly with the ``-f`` flag.
+
+.. code-block:: console
+
+   data_collections validate -f json examples/biosim_record.yaml
+
+.. note::
+
+   The above will raise an error since the file is not in ``json`` format.
 
 dump
 ****
 
-.. code-block:: text
+.. program:: data_collections template
+.. describe:: data_collections template
+.. describe:: data_collections dump
 
-   usage: data_collections template [-h] [-f {json,yaml}] file
+   .. option:: FILE
 
-   Dump a file template to file.
+      File to dump.
 
-   positional arguments:
-     file                  File to write
+   .. option:: -f {json,yaml}, --format {json,yaml}
 
-   options:
-     -h, --help            show this help message and exit
-     -f, --format {json,yaml}
-                        Parse FILE as this type (default: determine from
-                        suffix).
+      Dump :option:`FILE` as this type (default: determine from suffix).
 
+``data_collections_api`` provides a method to quick-start building metadata: ``template`` will dump
+an example metadata file for a particular community and data-type (though currently only a basic
+example is available). To do so, simply run:
 
-Dump a template metadata file ready for modification to upload.
+.. code-block:: console
+
+   data_collections dump my_metadata.yaml
+
+You can then edit and modify this template to fill in the data needed.
 
 upload_record
 -------------
 
-.. code-block:: text
+.. program:: upload_record
+.. describe:: upload_record
+
+   .. option:: --api-url URL
+
+      URL for the API associated with the Invenio repository, e.g.
+      https://data-collections-staging.psdi.ac.uk/api
+
+   .. option:: --api-key str
+
+      Your API key/token for accessing the Invenio repository instance.
+
+   .. option:: --metadata-path file
+
+      File path to the yaml file containing the metadata to upload a record to an Invenio
+      repository, e.g. path/to/files/record.yaml
+
+   .. option:: -f {json,yaml}, --metadata-format {json,yaml}
+
+      Parse metadata file as this type (default: yaml).
+
+   .. option:: --files FILES [FILES ...]
- List of file paths associated with the record to be - uploaded, e.g. path/to/files/data.* - --community str Name of a Invenio repository community to upload the record - to, e.g. biosimdb, data-to-knowledge, etc. +One-stop tool to upload a record to the repository, see `upload`_. -One-stop tool to upload a record to the repository. This requries that you have already defined your metadata file (see ``dump`` and ``validate``) and got an API key (see: PSDI Invenio docs on how to get this) +.. _pat_guide: ... diff --git a/docs/source/index.rst b/docs/source/index.rst index efaa7aa..78bccf3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -16,4 +16,4 @@ Project to allow simplified editing and construction of Invenio data for the PSD cli schema schemas/index - api/modules + API Documentation diff --git a/docs/source/scripts/__init__.py b/docs/source/scripts/__init__.py index e69de29..14e5b9b 100644 --- a/docs/source/scripts/__init__.py +++ b/docs/source/scripts/__init__.py @@ -0,0 +1 @@ +"""Set of scripts for building docs.""" From ab10f7292abe340c67941a75857fbc86f8e8b87d Mon Sep 17 00:00:00 2001 From: Jacob Wilkins Date: Wed, 21 Jan 2026 14:48:53 +0000 Subject: [PATCH 3/3] Respond to review --- docs/source/cli.rst | 4 +++ docs/source/conf.py | 2 +- docs/source/schemas/index.rst | 1 - docs/source/scripts/schema_gen.py | 58 +++++++++++++++++++++---------- pyproject.toml | 4 +-- 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index b04f3b8..f25f5a1 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -125,6 +125,10 @@ validate Parse :option:`FILE` as this type (default: determine from suffix). + .. option:: -S SCHEMA, --schema SCHEMA + + Validate against the given schema (default: :doc:`base`) + Validate the metadata file for a dataset before uploading. ``data_collections_api`` can validate your metadata file against the schema to verify the contents diff --git a/docs/source/conf.py b/docs/source/conf.py index 85566c9..e2c0d2f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -32,7 +32,7 @@ extensions = [ "numpydoc", "sphinx.ext.autodoc", - "sphinx.ext.apidoc", + # "sphinx.ext.apidoc", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.mathjax", diff --git a/docs/source/schemas/index.rst b/docs/source/schemas/index.rst index 4e27baf..abff4eb 100644 --- a/docs/source/schemas/index.rst +++ b/docs/source/schemas/index.rst @@ -8,4 +8,3 @@ This page documents the available schemas. :caption: Schemas: base - diff --git a/docs/source/scripts/schema_gen.py b/docs/source/scripts/schema_gen.py index 747ef90..00bceed 100644 --- a/docs/source/scripts/schema_gen.py +++ b/docs/source/scripts/schema_gen.py @@ -5,6 +5,7 @@ import argparse from pathlib import Path from shutil import rmtree +import sys from textwrap import indent from typing import TYPE_CHECKING @@ -171,6 +172,42 @@ def get_filename(fmt: str, key: str) -> str: return fmt % key +def clear_folder(folder: Path, *, force: bool = False, verbose: bool = False) -> None: + """Delete folder and create new (empty) one. + + Parameters + ---------- + folder : Path + Folder to clear. + force : bool + Do not ask whether to remove folder. + verbose : bool + Print status. + """ + if not folder.exists(): + return + + if folder.samefile(Path.cwd()): + print("Cannot clear folder as this is current working directory.") + return + + if ( + not force + and input(f"Running this will clear {folder}, are you sure you want to continue? 
[y/N] ") + .strip() + .lower() + != "y" + ): + print("Cancelling.") + sys.exit() + + if verbose: + print(f"Deleting {folder}...") + + rmtree(folder, ignore_errors=True) + folder.mkdir() + + def main(args_in: Sequence[str] | None = None, /) -> None: """Parse schemas and dump to file. @@ -193,25 +230,8 @@ def main(args_in: Sequence[str] | None = None, /) -> None: if args.verbose: print(f"Generating schemas for keys {', '.join(map(repr, schemas.values()))}...") - if args.clear and args.out_folder.exists() and not args.out_folder.samefile(Path.cwd()): - if ( - not args.force - and input( - f"Running this will clear {args.out_folder}," - " are you sure you want to continue? [y/N] " - ) - .strip() - .lower() - != "y" - ): - print("Cancelling.") - return - - if args.verbose: - print(f"Deleting {args.out_folder}...") - - rmtree(args.out_folder, ignore_errors=True) - args.out_folder.mkdir() + if args.clear: + clear_folder(args.out_folder, force=args.force, verbose=args.verbose) for key, out_name in zip(schemas.values(), out_names, strict=True): out_path = args.out_folder / out_name diff --git a/pyproject.toml b/pyproject.toml index 7c1d863..6553353 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ { name = "Jas Kalayan" }, { name = "Alin M. Elena" }, ] -requires-python = ">=3.11" +requires-python = ">=3.10" classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.10", @@ -37,7 +37,7 @@ dependencies = [ ruamel = ["ruamel.yaml>=0.17.22"] yaml = ["pyYAML>=3.13"] docs = [ - "sphinx>=8.2", + "sphinx>=0.13.1", "sphinxcontrib-contentui<1.0.0,>=0.2.5", "furo==2025.9.25", "numpydoc>=1.9.0",